diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1435,16 +1435,16 @@ })); } - /// Returns true if \p I is an instruction that will be scalarized with - /// predication when vectorizing \p I with vectorization factor \p VF. Such - /// instructions include conditional stores and instructions that may divide - /// by zero. + /// Returns true if \p I is an instruction which requires predication and + /// for which our chosen predication strategy is scalarization (i.e. we + /// don't have an alternate strategy such as masking available). + /// \p VF is the vectorization factor that will be used to vectorize \p I. bool isScalarWithPredication(Instruction *I, ElementCount VF) const; - // Returns true if \p I is an instruction that will be predicated either - // through scalar predication or masked load/store or masked gather/scatter. - // \p VF is the vectorization factor that will be used to vectorize \p I. - // Superset of instructions that return true for isScalarWithPredication. + /// Returns true if \p I is an instruction that needs to be predicated + /// at runtime. The result is independent of the predication mechanism. + /// \p VF is the vectorization factor that will be used to vectorize \p I. + /// Superset of instructions that return true for isScalarWithPredication. bool isPredicatedInst(Instruction *I, ElementCount VF) const; /// Returns true if \p I is a memory instruction with consecutive memory @@ -4412,15 +4412,16 @@ bool LoopVectorizationCostModel::isScalarWithPredication( Instruction *I, ElementCount VF) const { - if (!blockNeedsPredicationForAnyReason(I->getParent())) + if (!isPredicatedInst(I, VF)) return false; + + // Do we have a non-scalar lowering for this predicated + // instruction? No - it is scalar with predication. 
switch(I->getOpcode()) { default: - break; + return true; case Instruction::Load: case Instruction::Store: { - if (!Legal->isMaskRequired(I)) - return false; auto *Ptr = getLoadStorePointerOperand(I); auto *Ty = getLoadStoreType(I); Type *VTy = Ty; @@ -4432,41 +4433,48 @@ : !(isLegalMaskedStore(Ty, Ptr, Alignment) || TTI.isLegalMaskedScatter(VTy, Alignment)); } - case Instruction::UDiv: - case Instruction::SDiv: - case Instruction::SRem: - case Instruction::URem: - // TODO: We can use the loop-preheader as context point here and get - // context sensitive reasoning - return !isSafeToSpeculativelyExecute(I); } - return false; } bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I, ElementCount VF) const { - // When we know the load's address is loop invariant and the instruction - // in the original scalar loop was unconditionally executed then we - // don't need to mark it as a predicated instruction. Tail folding may - // introduce additional predication, but we're guaranteed to always have - // at least one active lane. We call Legal->blockNeedsPredication here - // because it doesn't query tail-folding. For stores, we need to prove - // both speculation safety (which follows from the same argument as loads), - // but also must prove the value being stored is correct. The easiest - // form of the later is to require that all values stored are the same. - if (Legal->isUniformMemOp(*I) && + if (!blockNeedsPredicationForAnyReason(I->getParent())) + return false; + + // Can we prove this instruction is safe to unconditionally execute? + // If not, we must use some form of predication. + switch(I->getOpcode()) { + default: + return false; + case Instruction::Load: + case Instruction::Store: { + if (!Legal->isMaskRequired(I)) + return false; + // When we know the load's address is loop invariant and the instruction + // in the original scalar loop was unconditionally executed then we + // don't need to mark it as a predicated instruction. 
Tail folding may + // introduce additional predication, but we're guaranteed to always have + // at least one active lane. We call Legal->blockNeedsPredication here + // because it doesn't query tail-folding. For stores, we need to prove + // both speculation safety (which follows from the same argument as loads), + // but also must prove the value being stored is correct. The easiest + // form of the later is to require that all values stored are the same. + if (Legal->isUniformMemOp(*I) && (isa<LoadInst>(I) || (isa<StoreInst>(I) && TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()))) && - !Legal->blockNeedsPredication(I->getParent())) - return false; - if (!blockNeedsPredicationForAnyReason(I->getParent())) - return false; - // Loads and stores that need some form of masked operation are predicated - // instructions. - if (isa<LoadInst>(I) || isa<StoreInst>(I)) - return Legal->isMaskRequired(I); - return isScalarWithPredication(I, VF); + !Legal->blockNeedsPredication(I->getParent())) + return false; + return true; + } + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::SRem: + case Instruction::URem: + // TODO: We can use the loop-preheader as context point here and get + // context sensitive reasoning + return !isSafeToSpeculativelyExecute(I); + } } bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( @@ -6070,7 +6078,7 @@ // from moving "masked load/store" check from legality to cost model. // Masked Load/Gather emulation was previously never allowed. // Limited number of Masked Store/Scatter emulation was allowed. 
- assert((isPredicatedInst(I, VF) || Legal->isUniformMemOp(*I)) && + assert((isPredicatedInst(I, VF)) && "Expecting a scalar emulated instruction"); return isa<LoadInst>(I) || (isa<StoreInst>(I) && diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll --- a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll @@ -584,15 +584,44 @@ ; ; TF-SCALABLE-LABEL: @uniform_load_unaligned( ; TF-SCALABLE-NEXT: entry: +; TF-SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; TF-SCALABLE-NEXT: [[TMP1:%.*]] = icmp ult i64 -1025, [[TMP0]] +; TF-SCALABLE-NEXT: br i1 [[TMP1]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; TF-SCALABLE: vector.ph: +; TF-SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; TF-SCALABLE-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; TF-SCALABLE-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1 +; TF-SCALABLE-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP4]] +; TF-SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP2]] +; TF-SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; TF-SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] +; TF-SCALABLE: vector.body: +; TF-SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; TF-SCALABLE-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0 +; TF-SCALABLE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 1 x i1> @llvm.get.active.lane.mask.nxv1i1.i64(i64 [[TMP5]], i64 1024) +; TF-SCALABLE-NEXT: [[TMP6:%.*]] = load i64, ptr [[B:%.*]], align 1 +; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP6]], i32 0 +; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer +; TF-SCALABLE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP5]] +; TF-SCALABLE-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i32 0 +; 
TF-SCALABLE-NEXT: call void @llvm.masked.store.nxv1i64.p0(<vscale x 1 x i64> [[BROADCAST_SPLAT]], ptr [[TMP8]], i32 8, <vscale x 1 x i1> [[ACTIVE_LANE_MASK]]) +; TF-SCALABLE-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() +; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]] +; TF-SCALABLE-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; TF-SCALABLE-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; TF-SCALABLE: middle.block: +; TF-SCALABLE-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; TF-SCALABLE: scalar.ph: +; TF-SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] ; TF-SCALABLE-NEXT: br label [[FOR_BODY:%.*]] ; TF-SCALABLE: for.body: -; TF-SCALABLE-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; TF-SCALABLE-NEXT: [[V:%.*]] = load i64, ptr [[B:%.*]], align 1 -; TF-SCALABLE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[IV]] +; TF-SCALABLE-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; TF-SCALABLE-NEXT: [[V:%.*]] = load i64, ptr [[B]], align 1 +; TF-SCALABLE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] ; TF-SCALABLE-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8 ; TF-SCALABLE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; TF-SCALABLE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 -; TF-SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]] +; TF-SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; TF-SCALABLE: for.end: ; TF-SCALABLE-NEXT: ret void ; @@ -754,7 +783,7 @@ ; TF-SCALABLE-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() ; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] ; TF-SCALABLE-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; TF-SCALABLE-NEXT: br i1 [[TMP9]], label 
[[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; TF-SCALABLE-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; TF-SCALABLE: middle.block: ; TF-SCALABLE-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; TF-SCALABLE: scalar.ph: @@ -767,7 +796,7 @@ ; TF-SCALABLE-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8 ; TF-SCALABLE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; TF-SCALABLE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 -; TF-SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; TF-SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; TF-SCALABLE: for.end: ; TF-SCALABLE-NEXT: ret void ; @@ -1191,15 +1220,44 @@ ; ; TF-SCALABLE-LABEL: @uniform_store_unaligned( ; TF-SCALABLE-NEXT: entry: +; TF-SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; TF-SCALABLE-NEXT: [[TMP1:%.*]] = icmp ult i64 -1025, [[TMP0]] +; TF-SCALABLE-NEXT: br i1 [[TMP1]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; TF-SCALABLE: vector.ph: +; TF-SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; TF-SCALABLE-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; TF-SCALABLE-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1 +; TF-SCALABLE-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP4]] +; TF-SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP2]] +; TF-SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[V:%.*]], i32 0 +; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer +; TF-SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] +; TF-SCALABLE: vector.body: +; TF-SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; TF-SCALABLE-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 
0 +; TF-SCALABLE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 1 x i1> @llvm.get.active.lane.mask.nxv1i1.i64(i64 [[TMP5]], i64 1024) +; TF-SCALABLE-NEXT: store i64 [[V]], ptr [[B:%.*]], align 1 +; TF-SCALABLE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP5]] +; TF-SCALABLE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 0 +; TF-SCALABLE-NEXT: call void @llvm.masked.store.nxv1i64.p0(<vscale x 1 x i64> [[BROADCAST_SPLAT]], ptr [[TMP7]], i32 8, <vscale x 1 x i1> [[ACTIVE_LANE_MASK]]) +; TF-SCALABLE-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] +; TF-SCALABLE-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; TF-SCALABLE-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; TF-SCALABLE: middle.block: +; TF-SCALABLE-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; TF-SCALABLE: scalar.ph: +; TF-SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] ; TF-SCALABLE-NEXT: br label [[FOR_BODY:%.*]] ; TF-SCALABLE: for.body: -; TF-SCALABLE-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; TF-SCALABLE-NEXT: store i64 [[V:%.*]], ptr [[B:%.*]], align 1 -; TF-SCALABLE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[IV]] +; TF-SCALABLE-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; TF-SCALABLE-NEXT: store i64 [[V]], ptr [[B]], align 1 +; TF-SCALABLE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] ; TF-SCALABLE-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8 ; TF-SCALABLE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; TF-SCALABLE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 -; TF-SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]] +; TF-SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], 
!llvm.loop [[LOOP9:![0-9]+]] ; TF-SCALABLE: for.end: ; TF-SCALABLE-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll @@ -473,7 +473,7 @@ ; CHECK-NEXT: Successor(s): loop.0 ; CHECK-EMPTY: ; CHECK-NEXT: loop.0: -; CHECK-NEXT: REPLICATE ir<[[L]]> = load ir<%src> +; CHECK-NEXT: CLONE ir<[[L]]> = load ir<%src> ; CHECK-NEXT: EMIT vp<[[SPLICE:%.+]]> = first-order splice ir<%.pn> ir<[[L]]> ; CHECK-NEXT: Successor(s): loop.0.split ; CHECK-EMPTY: diff --git a/llvm/test/Transforms/LoopVectorize/pr46525-expander-insertpoint.ll b/llvm/test/Transforms/LoopVectorize/pr46525-expander-insertpoint.ll --- a/llvm/test/Transforms/LoopVectorize/pr46525-expander-insertpoint.ll +++ b/llvm/test/Transforms/LoopVectorize/pr46525-expander-insertpoint.ll @@ -28,7 +28,6 @@ ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: store i32 0, i32* [[PTR:%.*]], align 4 -; CHECK-NEXT: store i32 0, i32* [[PTR]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll --- a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll @@ -253,7 +253,7 @@ ; CHECK-NEXT: EMIT vp<[[WIDE_CAN_IV:%.+]]> = WIDEN-CANONICAL-INDUCTION vp<[[CAN_IV]]> ; CHECK-NEXT: EMIT vp<[[MASK:%.+]]> = icmp ule 
vp<[[WIDE_CAN_IV]]> vp<[[BTC]]> ; CHECK-NEXT: CLONE ir<%gep.A.uniform> = getelementptr ir<%A>, ir<0> -; CHECK-NEXT: REPLICATE ir<%lv> = load ir<%gep.A.uniform> +; CHECK-NEXT: CLONE ir<%lv> = load ir<%gep.A.uniform> ; CHECK-NEXT: WIDEN ir<%cmp> = icmp ir<%iv>, ir<%k> ; CHECK-NEXT: Successor(s): loop.then ; CHECK-EMPTY: