Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -6823,6 +6823,13 @@
             // Scalarization of fixed length vectors "just works".
             return true;
 
+          // We have dedicated lowering for unpredicated uniform loads and
+          // stores. Note that even with tail folding we know that at least
+          // one lane is active (i.e. generalized predication is not possible
+          // here), and the logic below depends on this fact.
+          if (!foldTailByMasking())
+            return true;
+
           // For scalable vectors, a uniform memop load is always
           // uniform-by-parts and we know how to scalarize that.
           if (isa<LoadInst>(I))
@@ -6840,12 +6847,11 @@
 
         // Load: Scalar load + broadcast
         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
-        // TODO: Avoid replicating loads and stores instead of relying on
-        // instcombine to remove them.
+        // FIXME: This cost is a significant under-estimate for tail folded
+        // memory ops.
         const InstructionCost ScalarizationCost =
            isLegalToScalarize() ? getUniformMemOpCost(&I, VF)
                                 : InstructionCost::getInvalid();
-
         // Choose better solution for the current VF, Note that Invalid
        // costs compare as maximumal large. If both are invalid, we get
        // scalable invalid which signals a failure and a vectorization abort.
Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-inv-store.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/AArch64/sve-inv-store.ll
+++ llvm/test/Transforms/LoopVectorize/AArch64/sve-inv-store.ll
@@ -15,8 +15,6 @@
 ; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i16*> poison, i16* [[DST:%.*]], i32 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i16*> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i16*> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -25,12 +23,16 @@
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[TMP5]], i32 0
 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i16* [[TMP6]] to <vscale x 4 x i16>*
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i16>, <vscale x 4 x i16>* [[TMP7]], align 2
-; CHECK-NEXT:    call void @llvm.masked.scatter.nxv4i16.nxv4p0i16(<vscale x 4 x i16> [[WIDE_LOAD]], <vscale x 4 x i16*> [[BROADCAST_SPLAT]], i32 2, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
-; CHECK-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
-; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[TMP9:%.*]] = mul i32 [[TMP8]], 4
+; CHECK-NEXT:    [[TMP10:%.*]] = sub i32 [[TMP9]], 1
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <vscale x 4 x i16> [[WIDE_LOAD]], i32 [[TMP10]]
+; CHECK-NEXT:    store i16 [[TMP11]], i16* [[DST:%.*]], align 2
+; CHECK-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP13]]
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_INC24:%.*]], label [[SCALAR_PH]]
Index: llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
+++ llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
@@ -934,32 +934,29 @@
 ; SCALABLE-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
 ; SCALABLE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]]
 ; SCALABLE-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
-; SCALABLE-NEXT:    [[TMP2:%.*]] = call <vscale x 1 x i64> @llvm.experimental.stepvector.nxv1i64()
-; SCALABLE-NEXT:    [[TMP3:%.*]] = add <vscale x 1 x i64> [[TMP2]], zeroinitializer
-; SCALABLE-NEXT:    [[TMP4:%.*]] = mul <vscale x 1 x i64> [[TMP3]], shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 1, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer)
-; SCALABLE-NEXT:    [[INDUCTION:%.*]] = add <vscale x 1 x i64> zeroinitializer, [[TMP4]]
-; SCALABLE-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT:    [[TMP6:%.*]] = mul i64 1, [[TMP5]]
-; SCALABLE-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP6]], i32 0
-; SCALABLE-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
-; SCALABLE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 1 x ptr> poison, ptr [[B:%.*]], i32 0
-; SCALABLE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 1 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 1 x ptr> poison, <vscale x 1 x i32> zeroinitializer
-; SCALABLE-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[V:%.*]], i32 0
-; SCALABLE-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 1 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; SCALABLE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[V:%.*]], i32 0
+; SCALABLE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
 ; SCALABLE-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; SCALABLE:       vector.body:
 ; SCALABLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SCALABLE-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 1 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SCALABLE-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], 0
-; SCALABLE-NEXT:    call void @llvm.masked.scatter.nxv1i64.nxv1p0(<vscale x 1 x i64> [[VEC_IND]], <vscale x 1 x ptr> [[BROADCAST_SPLAT]], i32 8, <vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i32 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer))
-; SCALABLE-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP7]]
-; SCALABLE-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0
-; SCALABLE-NEXT:    store <vscale x 1 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP9]], align 8
-; SCALABLE-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
-; SCALABLE-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 1 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; SCALABLE-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; SCALABLE-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; SCALABLE-NEXT:    [[TMP2:%.*]] = call <vscale x 1 x i64> @llvm.experimental.stepvector.nxv1i64()
+; SCALABLE-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[INDEX]], i32 0
+; SCALABLE-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; SCALABLE-NEXT:    [[TMP3:%.*]] = add <vscale x 1 x i64> zeroinitializer, [[TMP2]]
+; SCALABLE-NEXT:    [[TMP4:%.*]] = mul <vscale x 1 x i64> [[TMP3]], shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 1, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer)
+; SCALABLE-NEXT:    [[TMP5:%.*]] = add <vscale x 1 x i64> [[DOTSPLAT]], [[TMP4]]
+; SCALABLE-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; SCALABLE-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vscale.i32()
+; SCALABLE-NEXT:    [[TMP8:%.*]] = sub i32 [[TMP7]], 1
+; SCALABLE-NEXT:    [[TMP9:%.*]] = extractelement <vscale x 1 x i64> [[TMP5]], i32 [[TMP8]]
+; SCALABLE-NEXT:    store i64 [[TMP9]], ptr [[B:%.*]], align 8
+; SCALABLE-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP6]]
+; SCALABLE-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 0
+; SCALABLE-NEXT:    store <vscale x 1 x i64> [[BROADCAST_SPLAT]], ptr [[TMP11]], align 8
+; SCALABLE-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP12]]
+; SCALABLE-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; SCALABLE-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; SCALABLE:       middle.block:
 ; SCALABLE-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
 ; SCALABLE-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]