diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4765,6 +4765,14 @@
             Builder, ConstantInt::get(PtrInd->getType(), Part), VF);
 
         if (NeedsVectorIndex) {
+          // Here we cache the whole vector, which means we can support the
+          // extraction of any lane. However, in some cases the extractelement
+          // instruction that is generated for scalar uses of this vector (e.g.
+          // a load instruction) is not folded away. Therefore we still
+          // calculate values for the first n lanes to avoid redundant moves
+          // (when extracting the 0th element) and to produce scalar code (i.e.
+          // additional add/gep instructions instead of expensive extractelement
+          // instructions) when extracting higher-order elements.
           Value *PartStartSplat = Builder.CreateVectorSplat(VF, PartStart);
           Value *Indices = Builder.CreateAdd(PartStartSplat, UnitStepVec);
           Value *GlobalIndices = Builder.CreateAdd(PtrIndSplat, Indices);
@@ -4772,9 +4780,6 @@
               emitTransformedIndex(Builder, GlobalIndices, PSE.getSE(), DL, II);
           SclrGep->setName("next.gep");
          State.set(PhiR, SclrGep, Part);
-          // We've cached the whole vector, which means we can support the
-          // extraction of any lane.
-          continue;
         }
 
         for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll
@@ -50,22 +50,25 @@
 ; CHECK-NEXT:    [[TMP7:%.*]] = add <vscale x 2 x i64> shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 0, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer), [[TMP6]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = add <vscale x 2 x i64> [[DOTSPLAT]], [[TMP7]]
 ; CHECK-NEXT:    [[NEXT_GEP4:%.*]] = getelementptr i8, i8* [[START_2]], <vscale x 2 x i64> [[TMP8]]
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, <vscale x 2 x i8*> [[NEXT_GEP4]], i64 1
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i8*, i8** [[NEXT_GEP]], i32 0
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i8** [[TMP10]] to <vscale x 2 x i8*>*
-; CHECK-NEXT:    store <vscale x 2 x i8*> [[TMP9]], <vscale x 2 x i8*>* [[TMP11]], align 8
-; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <vscale x 2 x i8*> [[NEXT_GEP4]], i32 0
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr i8, i8* [[TMP12]], i32 0
-; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i8* [[TMP13]] to <vscale x 2 x i8>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x i8>, <vscale x 2 x i8>* [[TMP14]], align 1
-; CHECK-NEXT:    [[TMP15:%.*]] = add <vscale x 2 x i8> [[WIDE_LOAD]], shufflevector (<vscale x 2 x i8> insertelement (<vscale x 2 x i8> poison, i8 1, i32 0), <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP16:%.*]] = bitcast i8* [[TMP13]] to <vscale x 2 x i8>*
-; CHECK-NEXT:    store <vscale x 2 x i8> [[TMP15]], <vscale x 2 x i8>* [[TMP16]], align 1
-; CHECK-NEXT:    [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP17]], 2
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP18]]
-; CHECK-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[NEXT_GEP5:%.*]] = getelementptr i8, i8* [[START_2]], i64 [[TMP9]]
+; CHECK-NEXT:    [[TMP10:%.*]] = add i64 [[INDEX]], 1
+; CHECK-NEXT:    [[NEXT_GEP6:%.*]] = getelementptr i8, i8* [[START_2]], i64 [[TMP10]]
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, <vscale x 2 x i8*> [[NEXT_GEP4]], i64 1
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr i8*, i8** [[NEXT_GEP]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i8** [[TMP12]] to <vscale x 2 x i8*>*
+; CHECK-NEXT:    store <vscale x 2 x i8*> [[TMP11]], <vscale x 2 x i8*>* [[TMP13]], align 8
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i8, i8* [[NEXT_GEP5]], i32 0
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast i8* [[TMP14]] to <vscale x 2 x i8>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x i8>, <vscale x 2 x i8>* [[TMP15]], align 1
+; CHECK-NEXT:    [[TMP16:%.*]] = add <vscale x 2 x i8> [[WIDE_LOAD]], shufflevector (<vscale x 2 x i8> insertelement (<vscale x 2 x i8> poison, i8 1, i32 0), <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i8* [[TMP14]] to <vscale x 2 x i8>*
+; CHECK-NEXT:    store <vscale x 2 x i8> [[TMP16]], <vscale x 2 x i8>* [[TMP17]], align 1
+; CHECK-NEXT:    [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP19:%.*]] = mul i64 [[TMP18]], 2
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP19]]
+; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll
@@ -131,6 +131,7 @@
 ; CHECK-NEXT:    %[[TMP2:.*]] = shufflevector <vscale x 2 x i64> %[[TMP1]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
 ; CHECK-NEXT:    %[[VECIND1:.*]] = add <vscale x 2 x i64> %[[TMP2]], %[[STEPVEC]]
 ; CHECK-NEXT:    %[[APTRS1:.*]] = getelementptr i32, i32* %a, <vscale x 2 x i64> %[[VECIND1]]
+; CHECK-NEXT:    %[[GEPA1:.*]] = getelementptr i32, i32* %a, i64 %[[IDX]]
 ; CHECK-NEXT:    %[[VSCALE64:.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    %[[VSCALE64X2:.*]] = shl i64 %[[VSCALE64]], 1
 ; CHECK-NEXT:    %[[TMP3:.*]] = insertelement <vscale x 2 x i64> poison, i64 %[[VSCALE64X2]], i32 0
@@ -139,6 +140,10 @@
 ; CHECK-NEXT:    %[[VECIND2:.*]] = add <vscale x 2 x i64> %[[TMP2]], %[[TMP5]]
 ; CHECK-NEXT:    %[[APTRS2:.*]] = getelementptr i32, i32* %a, <vscale x 2 x i64> %[[VECIND2]]
 ; CHECK-NEXT:    %[[GEPB1:.*]] = getelementptr i32*, i32** %b, i64 %[[IDX]]
+; The following checks that there is no extractelement after
+; vectorization when the stepvector has multiple uses, which demonstrates
+; the removal of a redundant fmov instruction in the generated asm code.
+; CHECK-NOT: %[[EXTRACT:.*]] = extractelement [[APTRS1]], i32 0
 ; CHECK:         %[[BPTR1:.*]] = bitcast i32** %[[GEPB1]] to <vscale x 2 x i32*>*
 ; CHECK-NEXT:    store <vscale x 2 x i32*> %[[APTRS1]], <vscale x 2 x i32*>* %[[BPTR1]], align 8
 ; CHECK:         %[[VSCALE32:.*]] = call i32 @llvm.vscale.i32()