Index: llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -6288,6 +6288,18 @@ UserCost += TTI->getVectorInstrCost( Instruction::InsertElement, IE->getType(), CI->getZExtValue()); } + + // Double UserCost in case vector is used by StoreInst: + // the first cost to compensate ExtractCost (see above) + // and the second one to consider that vector is already + // prepared for vector store eliminating its redundant building + // with insert instructions. + for (User *U: InsertUses[I + OpsWidth - 1]->users()) + if (isa<StoreInst>(U)) { + UserCost *= 2; + break; + } + LLVM_DEBUG(dbgs() << "SLP: Compensate cost of users by: " << UserCost << ".\n"); Cost -= UserCost; Index: llvm/test/Transforms/SLPVectorizer/X86/pr40522.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/pr40522.ll +++ llvm/test/Transforms/SLPVectorizer/X86/pr40522.ll @@ -84,15 +84,12 @@ define void @test2_vec(i32 %0, i32 %1, i32 %2, i32 %3, <4 x i32>* nocapture %4) { ; CHECK-LABEL: @test2_vec( -; CHECK-NEXT: [[TMP6:%.*]] = add nsw i32 [[TMP0:%.*]], 1 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> undef, i32 [[TMP6]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = add nsw i32 [[TMP1:%.*]], 1 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP8]], i32 1 -; CHECK-NEXT: [[TMP10:%.*]] = add nsw i32 [[TMP2:%.*]], 1 -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[TMP10]], i32 2 -; CHECK-NEXT: [[TMP12:%.*]] = add nsw i32 [[TMP3:%.*]], 1 -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP12]], i32 3 -; CHECK-NEXT: store <4 x i32> [[TMP13]], <4 x i32>* [[TMP4:%.*]], align 16, [[TBAA0]] +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0:%.*]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], 
i32 [[TMP1:%.*]], i32 1 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP2:%.*]], i32 2 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP3:%.*]], i32 3 +; CHECK-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[TMP9]], <i32 1, i32 1, i32 1, i32 1> +; CHECK-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP4:%.*]], align 16, [[TBAA0]] ; CHECK-NEXT: ret void ; %6 = add nsw i32 %0, 1