Index: llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
===================================================================
--- llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -289,6 +289,19 @@
   /// Returns true if the value V is uniform within the loop.
   bool isUniform(Value *V);
 
+  /// A uniform memory op is a load or store which accesses the same memory
+  /// location on all lanes.
+  bool isUniformMemOp(Instruction &I) {
+    Value *Ptr = getLoadStorePointerOperand(&I);
+    if (!Ptr)
+      return false;
+    // Note: There's nothing inherent which prevents predicated loads and
+    // stores from being uniform.  The current lowering simply doesn't handle
+    // it; in particular, the cost model distinguishes scatter/gather from
+    // scalar w/predication, and we currently rely on the scalar path.
+    return isUniform(Ptr) && !blockNeedsPredication(I.getParent());
+  }
+
   /// Returns the information that we collected about runtime memory check.
   const RuntimePointerChecking *getRuntimePointerChecking() const {
     return LAI->getRuntimePointerChecking();
Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5030,6 +5030,8 @@
   // replicating region where only a single instance out of VF should be formed.
   // TODO: optimize such seldom cases if found important, see PR40816.
   auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
+    if (isOutOfScope(I))
+      return;
     if (isScalarWithPredication(I, VF)) {
       LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
                         << *I << "\n");
@@ -5060,6 +5062,15 @@
     assert(WideningDecision != CM_Unknown &&
            "Widening decision should be ready at this moment");
 
+    // The address of a uniform mem op is itself uniform.  We exclude stores
+    // here as there's an assumption in the current code that all uses of
+    // uniform instructions are uniform and, as noted below, uniform stores are
+    // still handled via replication (i.e. aren't uniform after vectorization).
+    if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) {
+      assert(WideningDecision == CM_Scalarize);
+      return true;
+    }
+
     return (WideningDecision == CM_Widen ||
             WideningDecision == CM_Widen_Reverse ||
             WideningDecision == CM_Interleave);
@@ -5079,6 +5090,12 @@
       if (!Ptr)
         continue;
 
+      // A uniform memory op is itself uniform.  We exclude stores here as we
+      // haven't yet added dedicated logic in the CLONE path and rely on
+      // REPLICATE + DSE for correctness.
+      if (isa<LoadInst>(I) && Legal->isUniformMemOp(I))
+        addToWorklistIfAllowed(&I);
+
       // True if all users of Ptr are memory accesses that have Ptr as their
       // pointer operand.
       auto UsersAreMemAccesses =
@@ -6231,6 +6248,8 @@
 
 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
                                                          ElementCount VF) {
+  assert(Legal->isUniformMemOp(*I));
+
   Type *ValTy = getMemInstValueType(I);
   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
   const Align Alignment = getLoadStoreAlignment(I);
@@ -6408,11 +6427,7 @@
       if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
        NumPredStores++;
 
-      if (Legal->isUniform(Ptr) &&
-          // Conditional loads and stores should be scalarized and predicated.
-          // isScalarWithPredication cannot be used here since masked
-          // gather/scatters are not considered scalar with predication.
-          !Legal->blockNeedsPredication(I.getParent())) {
+      if (Legal->isUniformMemOp(I)) {
         // TODO: Avoid replicating loads and stores instead of
         // relying on instcombine to remove them.
         // Load: Scalar load + broadcast
Index: llvm/test/Transforms/LoopVectorize/multiple-strides-vectorization.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/multiple-strides-vectorization.ll
+++ llvm/test/Transforms/LoopVectorize/multiple-strides-vectorization.ll
@@ -68,24 +68,19 @@
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>*
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP5]], align 4, !alias.scope !0
 ; CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32* [[TMP1]], align 4, !alias.scope !3
-; CHECK-NEXT:    [[TMP7:%.*]] = load i32, i32* [[TMP1]], align 4, !alias.scope !3
-; CHECK-NEXT:    [[TMP8:%.*]] = load i32, i32* [[TMP1]], align 4, !alias.scope !3
-; CHECK-NEXT:    [[TMP9:%.*]] = load i32, i32* [[TMP1]], align 4, !alias.scope !3
-; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> undef, i32 [[TMP6]], i32 0
-; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP7]], i32 1
-; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i32 2
-; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i32 3
-; CHECK-NEXT:    [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.s* [[OBJ]], i64 0, i32 2, i64 [[I]], i64 [[TMP2]]
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP15]], i32 0
-; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP17]], align 4, !alias.scope !5, !noalias !7
-; CHECK-NEXT:    [[TMP18:%.*]] = add nsw <4 x i32> [[TMP14]], [[WIDE_LOAD12]]
-; CHECK-NEXT:    [[TMP19:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[TMP18]], <4 x i32>* [[TMP19]], align 4, !alias.scope !5, !noalias !7
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[TMP6]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = add nsw <4 x i32> [[BROADCAST_SPLAT]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.s* [[OBJ]], i64 0, i32 2, i64 [[I]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 0
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>*
+; CHECK-NEXT:    [[WIDE_LOAD12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP10]], align 4, !alias.scope !5, !noalias !7
+; CHECK-NEXT:    [[TMP11:%.*]] = add nsw <4 x i32> [[TMP7]], [[WIDE_LOAD12]]
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP11]], <4 x i32>* [[TMP12]], align 4, !alias.scope !5, !noalias !7
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !8
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP8:!llvm.loop !.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[Z]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[DOTOUTER]], label [[SCALAR_PH]]
@@ -100,17 +95,17 @@
 ; CHECK-NEXT:    br i1 [[EXITCOND_OUTER]], label [[DOTEXIT:%.*]], label [[DOTOUTER_PREHEADER]]
 ; CHECK:       .inner:
 ; CHECK-NEXT:    [[J:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[J_NEXT:%.*]], [[DOTINNER]] ]
-; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.s* [[OBJ]], i64 0, i32 0, i64 [[J]]
-; CHECK-NEXT:    [[TMP22:%.*]] = load i32, i32* [[TMP21]], align 4
-; CHECK-NEXT:    [[TMP23:%.*]] = load i32, i32* [[TMP1]], align 4
-; CHECK-NEXT:    [[TMP24:%.*]] = add nsw i32 [[TMP23]], [[TMP22]]
-; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.s* [[OBJ]], i64 0, i32 2, i64 [[I]], i64 [[J]]
-; CHECK-NEXT:    [[TMP26:%.*]] = load i32, i32* [[TMP25]], align 4
-; CHECK-NEXT:    [[TMP27:%.*]] = add nsw i32 [[TMP24]], [[TMP26]]
-; CHECK-NEXT:    store i32 [[TMP27]], i32* [[TMP25]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.s* [[OBJ]], i64 0, i32 0, i64 [[J]]
+; CHECK-NEXT:    [[TMP15:%.*]] = load i32, i32* [[TMP14]], align 4
+; CHECK-NEXT:    [[TMP16:%.*]] = load i32, i32* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP17:%.*]] = add nsw i32 [[TMP16]], [[TMP15]]
+; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.s* [[OBJ]], i64 0, i32 2, i64 [[I]], i64 [[J]]
+; CHECK-NEXT:    [[TMP19:%.*]] = load i32, i32* [[TMP18]], align 4
+; CHECK-NEXT:    [[TMP20:%.*]] = add nsw i32 [[TMP17]], [[TMP19]]
+; CHECK-NEXT:    store i32 [[TMP20]], i32* [[TMP18]], align 4
 ; CHECK-NEXT:    [[J_NEXT]] = add nuw nsw i64 [[J]], 1
 ; CHECK-NEXT:    [[EXITCOND_INNER:%.*]] = icmp eq i64 [[J_NEXT]], [[Z]]
-; CHECK-NEXT:    br i1 [[EXITCOND_INNER]], label [[DOTOUTER]], label [[DOTINNER]], !llvm.loop !10
+; CHECK-NEXT:    br i1 [[EXITCOND_INNER]], label [[DOTOUTER]], label [[DOTINNER]], [[LOOP10:!llvm.loop !.*]]
 ;
   br label %.outer.preheader
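
Note for readers: below is a small, hypothetical IR sketch (the function names @uniform_load_example and @predicated_uniform_load_example are illustrative and not part of this patch or its tests; it uses the same typed-pointer style as the test above) of the two cases isUniformMemOp() distinguishes. In the first function the load address is loop-invariant and unpredicated, so the access is a uniform memory op; per the updated CHECK lines, the vector body is expected to contain one scalar load plus an insertelement/shufflevector splat (the BROADCAST_SPLAT pattern) instead of VF scalar loads feeding VF insertelements. In the second function the same invariant address sits in a conditionally executed block, so isUniformMemOp() returns false and the access stays on the scalar-with-predication path, matching the "Note:" comment in the new header helper.

; Hypothetical: invariant, unpredicated address => uniform memory op.
define void @uniform_load_example(i32* %dst, i32* %src, i64 %n) {
entry:
  br label %loop

loop:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
  ; The address %src does not vary with %iv, so every lane reads the same
  ; location; expected lowering is one scalar load + broadcast.
  %v = load i32, i32* %src, align 4
  %gep = getelementptr inbounds i32, i32* %dst, i64 %iv
  store i32 %v, i32* %gep, align 4
  %iv.next = add nuw nsw i64 %iv, 1
  %cmp = icmp eq i64 %iv.next, %n
  br i1 %cmp, label %exit, label %loop

exit:
  ret void
}

; Hypothetical: same invariant address, but inside a block that needs
; predication, so isUniformMemOp() deliberately returns false.
define void @predicated_uniform_load_example(i32* %dst, i32* %src, i32* %cond, i64 %n) {
entry:
  br label %loop

loop:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %latch ]
  %c.gep = getelementptr inbounds i32, i32* %cond, i64 %iv
  %c = load i32, i32* %c.gep, align 4
  %tobool = icmp ne i32 %c, 0
  br i1 %tobool, label %if.then, label %latch

if.then:
  ; Invariant address, but predicated: stays on the scalar-with-predication path.
  %v = load i32, i32* %src, align 4
  %d.gep = getelementptr inbounds i32, i32* %dst, i64 %iv
  store i32 %v, i32* %d.gep, align 4
  br label %latch

latch:
  %iv.next = add nuw nsw i64 %iv, 1
  %cmp = icmp eq i64 %iv.next, %n
  br i1 %cmp, label %exit, label %loop

exit:
  ret void
}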