Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1449,21 +1449,15 @@
   // \p VF is the vectorization factor that will be used to vectorize \p I.
   // Superset of instructions that return true for isScalarWithPredication.
   bool isPredicatedInst(Instruction *I, ElementCount VF) {
-    // When we know the load's address is loop invariant and the instruction
-    // in the original scalar loop was unconditionally executed then we
-    // don't need to mark it as a predicated instruction. Tail folding may
-    // introduce additional predication, but we're guaranteed to always have
-    // at least one active lane. We call Legal->blockNeedsPredication here
-    // because it doesn't query tail-folding.
-    if (Legal->isUniformMemOp(*I) && isa<LoadInst>(I) &&
-        !Legal->blockNeedsPredication(I->getParent()))
-      return false;
     if (!blockNeedsPredicationForAnyReason(I->getParent()))
       return false;
     // Loads and stores that need some form of masked operation are predicated
-    // instructions.
+    // instructions. However, for uniform memory ops we know the address isn't
+    // changing. Since tail folding guarantees at least one active lane, we
+    // can discount predication inserted solely for that purpose.
     if (isa<LoadInst>(I) || isa<StoreInst>(I))
-      return Legal->isMaskRequired(I);
+      return Legal->isUniformMemOp(*I) ? Legal->blockNeedsPredication(I->getParent()) :
+             Legal->isMaskRequired(I);
     return isScalarWithPredication(I, VF);
   }
 
@@ -4617,17 +4611,26 @@
   if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
     addToWorklistIfAllowed(Cmp);
 
+  // Return true if all lanes perform the same memory operation, and we can
+  // thus choose to execute only one.
+  auto isUniformMemOpUse = [&](Instruction *I) {
+    if (!Legal->isUniformMemOp(*I))
+      return false;
+    if (isa<LoadInst>(I))
+      // Loading the same address always produces the same result - at least
+      // assuming aliasing and ordering which have already been checked.
+      return true;
+    // Storing the same value on every iteration.
+    return TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand());
+  };
+
   auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
     InstWidening WideningDecision = getWideningDecision(I, VF);
     assert(WideningDecision != CM_Unknown &&
            "Widening decision should be ready at this moment");
 
-    // A uniform memory op is itself uniform. We exclude uniform stores
-    // here as they demand the last lane, not the first one.
-    if (isa<StoreInst>(I) && Legal->isUniformMemOp(*I)) {
-      assert(WideningDecision == CM_Scalarize);
+    if (isUniformMemOpUse(I))
       return true;
-    }
 
     return (WideningDecision == CM_Widen ||
             WideningDecision == CM_Widen_Reverse ||
@@ -4681,9 +4684,7 @@
       if (!Ptr)
         continue;
 
-      // A uniform memory op is itself uniform. We exclude uniform stores
-      // here as they demand the last lane, not the first one.
-      if (isa<StoreInst>(I) && Legal->isUniformMemOp(I))
+      if (isUniformMemOpUse(&I))
         addToWorklistIfAllowed(&I);
 
       if (isUniformDecision(&I, VF)) {
Index: llvm/test/Transforms/LoopVectorize/RISCV/scalable-tailfold.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/RISCV/scalable-tailfold.ll
+++ llvm/test/Transforms/LoopVectorize/RISCV/scalable-tailfold.ll
@@ -202,15 +202,44 @@
 define void @uniform_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i64 %v, i64 %n) {
 ; CHECK-LABEL: @uniform_store(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i64 -1025, [[TMP0]]
+; CHECK-NEXT:    br i1 [[TMP1]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP4:%.*]] = sub i64 [[TMP3]], 1
+; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 1024, [[TMP4]]
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP2]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[V:%.*]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 1 x i1> @llvm.get.active.lane.mask.nxv1i1.i64(i64 [[TMP5]], i64 1024)
+; CHECK-NEXT:    store i64 [[V]], ptr [[B:%.*]], align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 0
+; CHECK-NEXT:    call void @llvm.masked.store.nxv1i64.p0(<vscale x 1 x i64> [[BROADCAST_SPLAT]], ptr [[TMP7]], i32 8, <vscale x 1 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    store i64 [[V:%.*]], ptr [[B:%.*]], align 8
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[IV]]
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    store i64 [[V]], ptr [[B]], align 8
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
 ; CHECK-NEXT:    store i64 [[V]], ptr [[ARRAYIDX]], align 8
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret void
 ;
Index: llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
=================================================================== --- llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll +++ llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll @@ -649,15 +649,41 @@ define void @uniform_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i64 %v, i64 %n) { ; SCALABLE-LABEL: @uniform_store( ; SCALABLE-NEXT: entry: +; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] +; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; SCALABLE: vector.ph: +; SCALABLE-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i32 0 +; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] +; SCALABLE: vector.body: +; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; SCALABLE-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 +; SCALABLE-NEXT: store i64 [[V]], ptr [[B:%.*]], align 8 +; SCALABLE-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP2]] +; SCALABLE-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0 +; SCALABLE-NEXT: store [[BROADCAST_SPLAT]], ptr [[TMP4]], align 8 +; SCALABLE-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; SCALABLE-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; SCALABLE-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; SCALABLE: middle.block: +; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] +; SCALABLE-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; SCALABLE: scalar.ph: +; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] ; SCALABLE-NEXT: br label [[FOR_BODY:%.*]] ; SCALABLE: for.body: -; SCALABLE-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; SCALABLE-NEXT: store i64 [[V:%.*]], ptr [[B:%.*]], align 8 -; SCALABLE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[IV]] +; SCALABLE-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; SCALABLE-NEXT: store i64 [[V]], ptr [[B]], align 8 +; SCALABLE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] ; SCALABLE-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8 ; SCALABLE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; SCALABLE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 -; SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]] +; SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; SCALABLE: for.end: ; SCALABLE-NEXT: ret void ; @@ -676,8 +702,6 @@ ; FIXEDLEN-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2 ; FIXEDLEN-NEXT: store i64 [[V]], ptr [[B:%.*]], align 8 ; FIXEDLEN-NEXT: store i64 [[V]], ptr [[B]], align 8 -; FIXEDLEN-NEXT: store i64 [[V]], ptr [[B]], align 8 -; FIXEDLEN-NEXT: store i64 [[V]], ptr [[B]], align 8 ; FIXEDLEN-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] ; FIXEDLEN-NEXT: [[TMP3:%.*]] = 
getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] ; FIXEDLEN-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0 @@ -706,15 +730,44 @@ ; ; TF-SCALABLE-LABEL: @uniform_store( ; TF-SCALABLE-NEXT: entry: +; TF-SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; TF-SCALABLE-NEXT: [[TMP1:%.*]] = icmp ult i64 -1025, [[TMP0]] +; TF-SCALABLE-NEXT: br i1 [[TMP1]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; TF-SCALABLE: vector.ph: +; TF-SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; TF-SCALABLE-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; TF-SCALABLE-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1 +; TF-SCALABLE-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP4]] +; TF-SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP2]] +; TF-SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i32 0 +; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; TF-SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] +; TF-SCALABLE: vector.body: +; TF-SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; TF-SCALABLE-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0 +; TF-SCALABLE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv1i1.i64(i64 [[TMP5]], i64 1024) +; TF-SCALABLE-NEXT: store i64 [[V]], ptr [[B:%.*]], align 8 +; TF-SCALABLE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP5]] +; TF-SCALABLE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 0 +; TF-SCALABLE-NEXT: call void @llvm.masked.store.nxv1i64.p0( [[BROADCAST_SPLAT]], ptr [[TMP7]], i32 8, [[ACTIVE_LANE_MASK]]) +; TF-SCALABLE-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] +; TF-SCALABLE-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; TF-SCALABLE-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; TF-SCALABLE: middle.block: +; TF-SCALABLE-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; TF-SCALABLE: scalar.ph: +; TF-SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] ; TF-SCALABLE-NEXT: br label [[FOR_BODY:%.*]] ; TF-SCALABLE: for.body: -; TF-SCALABLE-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; TF-SCALABLE-NEXT: store i64 [[V:%.*]], ptr [[B:%.*]], align 8 -; TF-SCALABLE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[IV]] +; TF-SCALABLE-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; TF-SCALABLE-NEXT: store i64 [[V]], ptr [[B]], align 8 +; TF-SCALABLE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] ; TF-SCALABLE-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8 ; TF-SCALABLE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; TF-SCALABLE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 -; TF-SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]] +; TF-SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; TF-SCALABLE: for.end: ; TF-SCALABLE-NEXT: ret void ; @@ -729,7 +782,6 @@ ; TF-FIXEDLEN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; TF-FIXEDLEN-NEXT: [[TMP0:%.*]] = add i64 
[[INDEX]], 0 ; TF-FIXEDLEN-NEXT: store i64 [[V]], ptr [[B:%.*]], align 8 -; TF-FIXEDLEN-NEXT: store i64 [[V]], ptr [[B]], align 8 ; TF-FIXEDLEN-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] ; TF-FIXEDLEN-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0 ; TF-FIXEDLEN-NEXT: store <2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP2]], align 8 @@ -1058,15 +1110,41 @@ define void @uniform_store_unaligned(ptr noalias nocapture %a, ptr noalias nocapture %b, i64 %v, i64 %n) { ; SCALABLE-LABEL: @uniform_store_unaligned( ; SCALABLE-NEXT: entry: +; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] +; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; SCALABLE: vector.ph: +; SCALABLE-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i32 0 +; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] +; SCALABLE: vector.body: +; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; SCALABLE-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 +; SCALABLE-NEXT: store i64 [[V]], ptr [[B:%.*]], align 1 +; SCALABLE-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP2]] +; SCALABLE-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0 +; SCALABLE-NEXT: store [[BROADCAST_SPLAT]], ptr [[TMP4]], align 8 +; SCALABLE-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; SCALABLE-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; SCALABLE-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; SCALABLE: middle.block: +; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] +; SCALABLE-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; SCALABLE: scalar.ph: +; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] ; SCALABLE-NEXT: br label [[FOR_BODY:%.*]] ; SCALABLE: for.body: -; SCALABLE-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; SCALABLE-NEXT: store i64 [[V:%.*]], ptr [[B:%.*]], align 1 -; SCALABLE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[IV]] +; SCALABLE-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; SCALABLE-NEXT: store i64 [[V]], ptr [[B]], align 1 +; SCALABLE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] ; SCALABLE-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8 ; SCALABLE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; SCALABLE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 -; SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]] +; SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; SCALABLE: for.end: ; SCALABLE-NEXT: ret void ; @@ -1085,8 +1163,6 @@ ; FIXEDLEN-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2 ; FIXEDLEN-NEXT: store i64 [[V]], ptr [[B:%.*]], align 1 ; FIXEDLEN-NEXT: store i64 [[V]], ptr [[B]], align 1 -; FIXEDLEN-NEXT: 
store i64 [[V]], ptr [[B]], align 1 -; FIXEDLEN-NEXT: store i64 [[V]], ptr [[B]], align 1 ; FIXEDLEN-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] ; FIXEDLEN-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] ; FIXEDLEN-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0 @@ -1138,7 +1214,6 @@ ; TF-FIXEDLEN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; TF-FIXEDLEN-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; TF-FIXEDLEN-NEXT: store i64 [[V]], ptr [[B:%.*]], align 1 -; TF-FIXEDLEN-NEXT: store i64 [[V]], ptr [[B]], align 1 ; TF-FIXEDLEN-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] ; TF-FIXEDLEN-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0 ; TF-FIXEDLEN-NEXT: store <2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP2]], align 8 Index: llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll +++ llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll @@ -90,12 +90,14 @@ ; FORCE: vector.body: ; FORCE-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE4:%.*]] ] ; FORCE-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE4]] ] +; FORCE-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 +; FORCE-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 1 ; FORCE-NEXT: [[TMP2:%.*]] = icmp ule <2 x i32> [[VEC_IND]], +; FORCE-NEXT: store i32 [[TMP0]], i32* @b, align 1 +; FORCE-NEXT: store i32 [[TMP1]], i32* @b, align 1 ; FORCE-NEXT: [[TMP3:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0 ; FORCE-NEXT: br i1 [[TMP3]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] ; FORCE: pred.load.if: -; FORCE-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 -; FORCE-NEXT: store i32 [[TMP0]], i32* @b, align 1 ; FORCE-NEXT: [[TMP6:%.*]] = getelementptr inbounds [3 x i32], [3 x i32]* @a, i32 0, i32 [[TMP0]] ; FORCE-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 1 ; FORCE-NEXT: br label [[PRED_LOAD_CONTINUE]] @@ -104,8 +106,6 @@ ; FORCE-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1 ; FORCE-NEXT: br i1 [[TMP10]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4]] ; FORCE: pred.load.if1: -; FORCE-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 1 -; FORCE-NEXT: store i32 [[TMP1]], i32* @b, align 1 ; FORCE-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x i32], [3 x i32]* @a, i32 0, i32 [[TMP1]] ; FORCE-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 1 ; FORCE-NEXT: br label [[PRED_LOAD_CONTINUE4]] Index: llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll +++ llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll @@ -194,18 +194,6 @@ ; CHECK-NEXT: store i32 0, i32* [[ADDR]], align 4 ; CHECK-NEXT: store i32 0, i32* [[ADDR]], align 4 ; CHECK-NEXT: store i32 0, i32* [[ADDR]], align 4 -; CHECK-NEXT: store i32 0, i32* [[ADDR]], align 4 -; CHECK-NEXT: store i32 0, i32* [[ADDR]], align 4 -; CHECK-NEXT: store i32 0, i32* [[ADDR]], align 4 -; CHECK-NEXT: store i32 0, i32* [[ADDR]], align 4 -; CHECK-NEXT: store i32 0, i32* [[ADDR]], align 4 -; CHECK-NEXT: store i32 0, i32* [[ADDR]], align 4 -; CHECK-NEXT: store i32 0, i32* [[ADDR]], align 4 -; CHECK-NEXT: store i32 0, i32* [[ADDR]], 
align 4 -; CHECK-NEXT: store i32 0, i32* [[ADDR]], align 4 -; CHECK-NEXT: store i32 0, i32* [[ADDR]], align 4 -; CHECK-NEXT: store i32 0, i32* [[ADDR]], align 4 -; CHECK-NEXT: store i32 0, i32* [[ADDR]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 ; CHECK-NEXT: br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] Index: llvm/test/Transforms/LoopVectorize/pr45679-fold-tail-by-masking.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/pr45679-fold-tail-by-masking.ll +++ llvm/test/Transforms/LoopVectorize/pr45679-fold-tail-by-masking.ll @@ -216,49 +216,49 @@ ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE6:%.*]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE6]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ] ; CHECK-NEXT: [[TMP0:%.*]] = icmp ule <4 x i64> [[VEC_IND]], ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i32 0 -; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] -; CHECK: pred.store.if: +; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] +; CHECK: pred.load.if: ; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, i64* [[A:%.*]], i64 [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP3]], align 8 -; CHECK-NEXT: store i64 [[TMP4]], i64* [[B:%.*]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]] -; CHECK: pred.store.continue: -; CHECK-NEXT: [[TMP5:%.*]] = phi i64 [ poison, [[VECTOR_BODY]] ], [ [[TMP4]], [[PRED_STORE_IF]] ] +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE]] +; CHECK: pred.load.continue: +; CHECK-NEXT: [[TMP5:%.*]] = phi i64 [ poison, [[VECTOR_BODY]] ], [ [[TMP4]], [[PRED_LOAD_IF]] ] ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i1> [[TMP0]], i32 1 -; CHECK-NEXT: br i1 [[TMP6]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2:%.*]] -; CHECK: pred.store.if1: +; CHECK-NEXT: br i1 [[TMP6]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]] +; CHECK: pred.load.if1: ; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[TMP7]] ; CHECK-NEXT: [[TMP9:%.*]] = load i64, i64* [[TMP8]], align 8 -; CHECK-NEXT: store i64 [[TMP9]], i64* [[B]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE2]] -; CHECK: pred.store.continue2: -; CHECK-NEXT: [[TMP10:%.*]] = phi i64 [ poison, [[PRED_STORE_CONTINUE]] ], [ [[TMP9]], [[PRED_STORE_IF1]] ] +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE2]] +; CHECK: pred.load.continue2: +; CHECK-NEXT: [[TMP10:%.*]] = phi i64 [ poison, [[PRED_LOAD_CONTINUE]] ], [ [[TMP9]], [[PRED_LOAD_IF1]] ] ; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i1> [[TMP0]], i32 2 -; CHECK-NEXT: br i1 [[TMP11]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]] -; CHECK: pred.store.if3: +; CHECK-NEXT: br i1 [[TMP11]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]] +; CHECK: pred.load.if3: ; CHECK-NEXT: [[TMP12:%.*]] = add 
i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[TMP12]] ; CHECK-NEXT: [[TMP14:%.*]] = load i64, i64* [[TMP13]], align 8 -; CHECK-NEXT: store i64 [[TMP14]], i64* [[B]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE4]] -; CHECK: pred.store.continue4: -; CHECK-NEXT: [[TMP15:%.*]] = phi i64 [ poison, [[PRED_STORE_CONTINUE2]] ], [ [[TMP14]], [[PRED_STORE_IF3]] ] +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE4]] +; CHECK: pred.load.continue4: +; CHECK-NEXT: [[TMP15:%.*]] = phi i64 [ poison, [[PRED_LOAD_CONTINUE2]] ], [ [[TMP14]], [[PRED_LOAD_IF3]] ] ; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i1> [[TMP0]], i32 3 -; CHECK-NEXT: br i1 [[TMP16]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6]] -; CHECK: pred.store.if5: +; CHECK-NEXT: br i1 [[TMP16]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6]] +; CHECK: pred.load.if5: ; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 3 ; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[TMP17]] ; CHECK-NEXT: [[TMP19:%.*]] = load i64, i64* [[TMP18]], align 8 -; CHECK-NEXT: store i64 [[TMP19]], i64* [[B]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE6]] -; CHECK: pred.store.continue6: -; CHECK-NEXT: [[TMP20:%.*]] = phi i64 [ poison, [[PRED_STORE_CONTINUE4]] ], [ [[TMP19]], [[PRED_STORE_IF5]] ] +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE6]] +; CHECK: pred.load.continue6: +; CHECK-NEXT: [[TMP20:%.*]] = phi i64 [ poison, [[PRED_LOAD_CONTINUE4]] ], [ [[TMP19]], [[PRED_LOAD_IF5]] ] +; CHECK-NEXT: store i64 [[TMP5]], i64* [[B:%.*]], align 8 +; CHECK-NEXT: store i64 [[TMP10]], i64* [[B]], align 8 +; CHECK-NEXT: store i64 [[TMP15]], i64* [[B]], align 8 +; CHECK-NEXT: store i64 [[TMP20]], i64* [[B]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], ; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 @@ -285,51 +285,51 @@ ; VF2UF2: vector.ph: ; VF2UF2-NEXT: br label [[VECTOR_BODY:%.*]] ; VF2UF2: vector.body: -; VF2UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE7:%.*]] ] -; VF2UF2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE7]] ] +; VF2UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE7:%.*]] ] +; VF2UF2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE7]] ] ; VF2UF2-NEXT: [[STEP_ADD:%.*]] = add <2 x i64> [[VEC_IND]], ; VF2UF2-NEXT: [[TMP0:%.*]] = icmp ule <2 x i64> [[VEC_IND]], ; VF2UF2-NEXT: [[TMP1:%.*]] = icmp ule <2 x i64> [[STEP_ADD]], ; VF2UF2-NEXT: [[TMP2:%.*]] = extractelement <2 x i1> [[TMP0]], i32 0 -; VF2UF2-NEXT: br i1 [[TMP2]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] -; VF2UF2: pred.store.if: +; VF2UF2-NEXT: br i1 [[TMP2]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] +; VF2UF2: pred.load.if: ; VF2UF2-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0 ; VF2UF2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, i64* [[A:%.*]], i64 [[TMP3]] ; VF2UF2-NEXT: [[TMP5:%.*]] = load i64, i64* [[TMP4]], align 8 -; VF2UF2-NEXT: store i64 [[TMP5]], i64* [[B:%.*]], align 8 -; VF2UF2-NEXT: br label [[PRED_STORE_CONTINUE]] -; VF2UF2: pred.store.continue: -; VF2UF2-NEXT: [[TMP6:%.*]] = phi i64 [ poison, [[VECTOR_BODY]] ], [ [[TMP5]], [[PRED_STORE_IF]] ] +; VF2UF2-NEXT: br label [[PRED_LOAD_CONTINUE]] +; VF2UF2: pred.load.continue: +; VF2UF2-NEXT: 
[[TMP6:%.*]] = phi i64 [ poison, [[VECTOR_BODY]] ], [ [[TMP5]], [[PRED_LOAD_IF]] ] ; VF2UF2-NEXT: [[TMP7:%.*]] = extractelement <2 x i1> [[TMP0]], i32 1 -; VF2UF2-NEXT: br i1 [[TMP7]], label [[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE3:%.*]] -; VF2UF2: pred.store.if2: +; VF2UF2-NEXT: br i1 [[TMP7]], label [[PRED_LOAD_IF2:%.*]], label [[PRED_LOAD_CONTINUE3:%.*]] +; VF2UF2: pred.load.if2: ; VF2UF2-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 1 ; VF2UF2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[TMP8]] ; VF2UF2-NEXT: [[TMP10:%.*]] = load i64, i64* [[TMP9]], align 8 -; VF2UF2-NEXT: store i64 [[TMP10]], i64* [[B]], align 8 -; VF2UF2-NEXT: br label [[PRED_STORE_CONTINUE3]] -; VF2UF2: pred.store.continue3: -; VF2UF2-NEXT: [[TMP11:%.*]] = phi i64 [ poison, [[PRED_STORE_CONTINUE]] ], [ [[TMP10]], [[PRED_STORE_IF2]] ] +; VF2UF2-NEXT: br label [[PRED_LOAD_CONTINUE3]] +; VF2UF2: pred.load.continue3: +; VF2UF2-NEXT: [[TMP11:%.*]] = phi i64 [ poison, [[PRED_LOAD_CONTINUE]] ], [ [[TMP10]], [[PRED_LOAD_IF2]] ] ; VF2UF2-NEXT: [[TMP12:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0 -; VF2UF2-NEXT: br i1 [[TMP12]], label [[PRED_STORE_IF4:%.*]], label [[PRED_STORE_CONTINUE5:%.*]] -; VF2UF2: pred.store.if4: +; VF2UF2-NEXT: br i1 [[TMP12]], label [[PRED_LOAD_IF4:%.*]], label [[PRED_LOAD_CONTINUE5:%.*]] +; VF2UF2: pred.load.if4: ; VF2UF2-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 2 ; VF2UF2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[TMP13]] ; VF2UF2-NEXT: [[TMP15:%.*]] = load i64, i64* [[TMP14]], align 8 -; VF2UF2-NEXT: store i64 [[TMP15]], i64* [[B]], align 8 -; VF2UF2-NEXT: br label [[PRED_STORE_CONTINUE5]] -; VF2UF2: pred.store.continue5: -; VF2UF2-NEXT: [[TMP16:%.*]] = phi i64 [ poison, [[PRED_STORE_CONTINUE3]] ], [ [[TMP15]], [[PRED_STORE_IF4]] ] +; VF2UF2-NEXT: br label [[PRED_LOAD_CONTINUE5]] +; VF2UF2: pred.load.continue5: +; VF2UF2-NEXT: [[TMP16:%.*]] = phi i64 [ poison, [[PRED_LOAD_CONTINUE3]] ], [ [[TMP15]], [[PRED_LOAD_IF4]] ] ; VF2UF2-NEXT: [[TMP17:%.*]] = extractelement <2 x i1> [[TMP1]], i32 1 -; VF2UF2-NEXT: br i1 [[TMP17]], label [[PRED_STORE_IF6:%.*]], label [[PRED_STORE_CONTINUE7]] -; VF2UF2: pred.store.if6: +; VF2UF2-NEXT: br i1 [[TMP17]], label [[PRED_LOAD_IF6:%.*]], label [[PRED_LOAD_CONTINUE7]] +; VF2UF2: pred.load.if6: ; VF2UF2-NEXT: [[TMP18:%.*]] = add i64 [[INDEX]], 3 ; VF2UF2-NEXT: [[TMP19:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[TMP18]] ; VF2UF2-NEXT: [[TMP20:%.*]] = load i64, i64* [[TMP19]], align 8 -; VF2UF2-NEXT: store i64 [[TMP20]], i64* [[B]], align 8 -; VF2UF2-NEXT: br label [[PRED_STORE_CONTINUE7]] -; VF2UF2: pred.store.continue7: -; VF2UF2-NEXT: [[TMP21:%.*]] = phi i64 [ poison, [[PRED_STORE_CONTINUE5]] ], [ [[TMP20]], [[PRED_STORE_IF6]] ] +; VF2UF2-NEXT: br label [[PRED_LOAD_CONTINUE7]] +; VF2UF2: pred.load.continue7: +; VF2UF2-NEXT: [[TMP21:%.*]] = phi i64 [ poison, [[PRED_LOAD_CONTINUE5]] ], [ [[TMP20]], [[PRED_LOAD_IF6]] ] +; VF2UF2-NEXT: store i64 [[TMP6]], i64* [[B:%.*]], align 8 +; VF2UF2-NEXT: store i64 [[TMP11]], i64* [[B]], align 8 +; VF2UF2-NEXT: store i64 [[TMP16]], i64* [[B]], align 8 +; VF2UF2-NEXT: store i64 [[TMP21]], i64* [[B]], align 8 ; VF2UF2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 ; VF2UF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[STEP_ADD]], ; VF2UF2-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 @@ -356,7 +356,7 @@ ; VF1UF4: vector.ph: ; VF1UF4-NEXT: br label [[VECTOR_BODY:%.*]] ; VF1UF4: vector.body: -; VF1UF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], 
[[PRED_STORE_CONTINUE12:%.*]] ] +; VF1UF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE12:%.*]] ] ; VF1UF4-NEXT: [[VEC_IV:%.*]] = add i64 [[INDEX]], 0 ; VF1UF4-NEXT: [[VEC_IV4:%.*]] = add i64 [[INDEX]], 1 ; VF1UF4-NEXT: [[VEC_IV5:%.*]] = add i64 [[INDEX]], 2 @@ -365,42 +365,42 @@ ; VF1UF4-NEXT: [[TMP1:%.*]] = icmp ule i64 [[VEC_IV4]], 13 ; VF1UF4-NEXT: [[TMP2:%.*]] = icmp ule i64 [[VEC_IV5]], 13 ; VF1UF4-NEXT: [[TMP3:%.*]] = icmp ule i64 [[VEC_IV6]], 13 -; VF1UF4-NEXT: br i1 [[TMP0]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] -; VF1UF4: pred.store.if: +; VF1UF4-NEXT: br i1 [[TMP0]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] +; VF1UF4: pred.load.if: ; VF1UF4-NEXT: [[INDUCTION:%.*]] = add i64 [[INDEX]], 0 ; VF1UF4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, i64* [[A:%.*]], i64 [[INDUCTION]] ; VF1UF4-NEXT: [[TMP5:%.*]] = load i64, i64* [[TMP4]], align 8 -; VF1UF4-NEXT: store i64 [[TMP5]], i64* [[B:%.*]], align 8 -; VF1UF4-NEXT: br label [[PRED_STORE_CONTINUE]] -; VF1UF4: pred.store.continue: -; VF1UF4-NEXT: [[TMP6:%.*]] = phi i64 [ poison, [[VECTOR_BODY]] ], [ [[TMP5]], [[PRED_STORE_IF]] ] -; VF1UF4-NEXT: br i1 [[TMP1]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8:%.*]] -; VF1UF4: pred.store.if7: +; VF1UF4-NEXT: br label [[PRED_LOAD_CONTINUE]] +; VF1UF4: pred.load.continue: +; VF1UF4-NEXT: [[TMP6:%.*]] = phi i64 [ poison, [[VECTOR_BODY]] ], [ [[TMP5]], [[PRED_LOAD_IF]] ] +; VF1UF4-NEXT: br i1 [[TMP1]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8:%.*]] +; VF1UF4: pred.load.if7: ; VF1UF4-NEXT: [[INDUCTION1:%.*]] = add i64 [[INDEX]], 1 ; VF1UF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[INDUCTION1]] ; VF1UF4-NEXT: [[TMP8:%.*]] = load i64, i64* [[TMP7]], align 8 -; VF1UF4-NEXT: store i64 [[TMP8]], i64* [[B]], align 8 -; VF1UF4-NEXT: br label [[PRED_STORE_CONTINUE8]] -; VF1UF4: pred.store.continue8: -; VF1UF4-NEXT: [[TMP9:%.*]] = phi i64 [ poison, [[PRED_STORE_CONTINUE]] ], [ [[TMP8]], [[PRED_STORE_IF7]] ] -; VF1UF4-NEXT: br i1 [[TMP2]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10:%.*]] -; VF1UF4: pred.store.if9: +; VF1UF4-NEXT: br label [[PRED_LOAD_CONTINUE8]] +; VF1UF4: pred.load.continue8: +; VF1UF4-NEXT: [[TMP9:%.*]] = phi i64 [ poison, [[PRED_LOAD_CONTINUE]] ], [ [[TMP8]], [[PRED_LOAD_IF7]] ] +; VF1UF4-NEXT: br i1 [[TMP2]], label [[PRED_LOAD_IF9:%.*]], label [[PRED_LOAD_CONTINUE10:%.*]] +; VF1UF4: pred.load.if9: ; VF1UF4-NEXT: [[INDUCTION2:%.*]] = add i64 [[INDEX]], 2 ; VF1UF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[INDUCTION2]] ; VF1UF4-NEXT: [[TMP11:%.*]] = load i64, i64* [[TMP10]], align 8 -; VF1UF4-NEXT: store i64 [[TMP11]], i64* [[B]], align 8 -; VF1UF4-NEXT: br label [[PRED_STORE_CONTINUE10]] -; VF1UF4: pred.store.continue10: -; VF1UF4-NEXT: [[TMP12:%.*]] = phi i64 [ poison, [[PRED_STORE_CONTINUE8]] ], [ [[TMP11]], [[PRED_STORE_IF9]] ] -; VF1UF4-NEXT: br i1 [[TMP3]], label [[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12]] -; VF1UF4: pred.store.if11: +; VF1UF4-NEXT: br label [[PRED_LOAD_CONTINUE10]] +; VF1UF4: pred.load.continue10: +; VF1UF4-NEXT: [[TMP12:%.*]] = phi i64 [ poison, [[PRED_LOAD_CONTINUE8]] ], [ [[TMP11]], [[PRED_LOAD_IF9]] ] +; VF1UF4-NEXT: br i1 [[TMP3]], label [[PRED_LOAD_IF11:%.*]], label [[PRED_LOAD_CONTINUE12]] +; VF1UF4: pred.load.if11: ; VF1UF4-NEXT: [[INDUCTION3:%.*]] = add i64 [[INDEX]], 3 ; VF1UF4-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 
[[INDUCTION3]] ; VF1UF4-NEXT: [[TMP14:%.*]] = load i64, i64* [[TMP13]], align 8 -; VF1UF4-NEXT: store i64 [[TMP14]], i64* [[B]], align 8 -; VF1UF4-NEXT: br label [[PRED_STORE_CONTINUE12]] -; VF1UF4: pred.store.continue12: -; VF1UF4-NEXT: [[TMP15:%.*]] = phi i64 [ poison, [[PRED_STORE_CONTINUE10]] ], [ [[TMP14]], [[PRED_STORE_IF11]] ] +; VF1UF4-NEXT: br label [[PRED_LOAD_CONTINUE12]] +; VF1UF4: pred.load.continue12: +; VF1UF4-NEXT: [[TMP15:%.*]] = phi i64 [ poison, [[PRED_LOAD_CONTINUE10]] ], [ [[TMP14]], [[PRED_LOAD_IF11]] ] +; VF1UF4-NEXT: store i64 [[TMP6]], i64* [[B:%.*]], align 8 +; VF1UF4-NEXT: store i64 [[TMP9]], i64* [[B]], align 8 +; VF1UF4-NEXT: store i64 [[TMP12]], i64* [[B]], align 8 +; VF1UF4-NEXT: store i64 [[TMP15]], i64* [[B]], align 8 ; VF1UF4-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 ; VF1UF4-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 ; VF1UF4-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] Index: llvm/test/Transforms/LoopVectorize/pr46525-expander-insertpoint.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/pr46525-expander-insertpoint.ll +++ llvm/test/Transforms/LoopVectorize/pr46525-expander-insertpoint.ll @@ -24,31 +24,14 @@ ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 2 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[IND_END:%.*]] = mul i64 [[N_VEC]], [[INC]] -; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TMP2]], 1 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE4:%.*]] ] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i64> poison, i64 [[INDEX]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT1]], <2 x i64> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[VEC_IV:%.*]] = add <2 x i64> [[BROADCAST_SPLAT2]], -; CHECK-NEXT: [[TMP3:%.*]] = icmp ule <2 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> [[TMP3]], i32 0 -; CHECK-NEXT: br i1 [[TMP4]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] -; CHECK: pred.store.if: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: store i32 0, i32* [[PTR:%.*]], align 4 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]] -; CHECK: pred.store.continue: -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP3]], i32 1 -; CHECK-NEXT: br i1 [[TMP5]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4]] -; CHECK: pred.store.if3: ; CHECK-NEXT: store i32 0, i32* [[PTR]], align 4 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE4]] -; CHECK: pred.store.continue4: ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, 
label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: Index: llvm/test/Transforms/LoopVectorize/pr47343-expander-lcssa-after-cfg-update.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/pr47343-expander-lcssa-after-cfg-update.ll +++ llvm/test/Transforms/LoopVectorize/pr47343-expander-lcssa-after-cfg-update.ll @@ -49,7 +49,6 @@ ; CHECK-NEXT: store i32 0, i32* @f.e, align 1, !alias.scope !0, !noalias !3 ; CHECK-NEXT: store i32 0, i32* @f.e, align 1, !alias.scope !0, !noalias !3 ; CHECK-NEXT: store i8 10, i8* [[TMP0]], align 1 -; CHECK-NEXT: store i8 10, i8* [[TMP0]], align 1 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 ; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], 500 ; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] Index: llvm/test/Transforms/LoopVectorize/tail-folding-vectorization-factor-1.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/tail-folding-vectorization-factor-1.ll +++ llvm/test/Transforms/LoopVectorize/tail-folding-vectorization-factor-1.ll @@ -16,42 +16,42 @@ ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE9:%.*]] ] -; CHECK-NEXT: [[INDUCTION:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[INDUCTION1:%.*]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[INDUCTION2:%.*]] = add i64 [[INDEX]], 2 -; CHECK-NEXT: [[INDUCTION3:%.*]] = add i64 [[INDEX]], 3 -; CHECK-NEXT: [[TMP0:%.*]] = icmp ule i64 [[INDUCTION]], 14 -; CHECK-NEXT: [[TMP1:%.*]] = icmp ule i64 [[INDUCTION1]], 14 -; CHECK-NEXT: [[TMP2:%.*]] = icmp ule i64 [[INDUCTION2]], 14 -; CHECK-NEXT: [[TMP3:%.*]] = icmp ule i64 [[INDUCTION3]], 14 +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE12:%.*]] ] +; CHECK-NEXT: [[VEC_IV:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[VEC_IV4:%.*]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[VEC_IV5:%.*]] = add i64 [[INDEX]], 2 +; CHECK-NEXT: [[VEC_IV6:%.*]] = add i64 [[INDEX]], 3 +; CHECK-NEXT: [[TMP0:%.*]] = icmp ule i64 [[VEC_IV]], 14 +; CHECK-NEXT: [[TMP1:%.*]] = icmp ule i64 [[VEC_IV4]], 14 +; CHECK-NEXT: [[TMP2:%.*]] = icmp ule i64 [[VEC_IV5]], 14 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ule i64 [[VEC_IV6]], 14 ; CHECK-NEXT: br i1 [[TMP0]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; CHECK: pred.store.if: -; CHECK-NEXT: [[SUNK_IND0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 [[SUNK_IND0]] +; CHECK-NEXT: [[INDUCTION:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 [[INDUCTION]] ; CHECK-NEXT: store i32 0, i32* [[TMP4]], align 4 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]] ; CHECK: pred.store.continue: -; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_STORE_IF4:%.*]], label [[PRED_STORE_CONTINUE5:%.*]] +; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8:%.*]] ; CHECK: pred.store.if7: -; CHECK-NEXT: [[SUNK_IND1:%.*]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 [[SUNK_IND1]] +; CHECK-NEXT: [[INDUCTION1:%.*]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 [[INDUCTION1]] ; CHECK-NEXT: store i32 0, i32* [[TMP5]], align 4 -; 
CHECK-NEXT: br label [[PRED_STORE_CONTINUE5]] +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE8]] ; CHECK: pred.store.continue8: -; CHECK-NEXT: br i1 [[TMP2]], label [[PRED_STORE_IF6:%.*]], label [[PRED_STORE_CONTINUE7:%.*]] +; CHECK-NEXT: br i1 [[TMP2]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10:%.*]] ; CHECK: pred.store.if9: -; CHECK-NEXT: [[SUNK_IND2:%.*]] = add i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 [[SUNK_IND2]] +; CHECK-NEXT: [[INDUCTION2:%.*]] = add i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 [[INDUCTION2]] ; CHECK-NEXT: store i32 0, i32* [[TMP6]], align 4 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE7]] +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE10]] ; CHECK: pred.store.continue10: -; CHECK-NEXT: br i1 [[TMP3]], label [[PRED_STORE_IF8:%.*]], label [[PRED_STORE_CONTINUE9]] +; CHECK-NEXT: br i1 [[TMP3]], label [[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12]] ; CHECK: pred.store.if11: -; CHECK-NEXT: [[SUNK_IND3:%.*]] = add i64 [[INDEX]], 3 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 [[SUNK_IND3]] +; CHECK-NEXT: [[INDUCTION3:%.*]] = add i64 [[INDEX]], 3 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 [[INDUCTION3]] ; CHECK-NEXT: store i32 0, i32* [[TMP7]], align 4 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE9]] +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE12]] ; CHECK: pred.store.continue12: ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
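
For reference, the scalar loop shape exercised by the @uniform_store tests above looks roughly like the following. This is a minimal sketch reconstructed from the scalar-loop CHECK lines, not copied from the test source; the function name, parameters, and 1024 trip count follow the existing checks, and any other details are assumptions. Both the address %b and the stored value %v are loop-invariant, so the store to %b is a "uniform memory op" in the sense used by isUniformMemOpUse above.

define void @uniform_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i64 %v, i64 %n) {
entry:
  br label %for.body

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  ; uniform store: invariant address and invariant value on every iteration
  store i64 %v, ptr %b, align 8
  ; ordinary consecutive store, widened as usual
  %arrayidx = getelementptr inbounds i64, ptr %a, i64 %iv
  store i64 %v, ptr %arrayidx, align 8
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond.not = icmp eq i64 %iv.next, 1024
  br i1 %exitcond.not, label %for.end, label %for.body

for.end:
  ret void
}

With the change, the tail-folded vector body in the checks keeps a single unconditional "store i64 %v, ptr %b" per vector iteration (tail folding guarantees at least one active lane), instead of replicating one predicated store per lane.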