diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -3654,7 +3654,6 @@ else if (auto *IE = dyn_cast(VL[0])) ScalarTy = IE->getOperand(1)->getType(); auto *VecTy = FixedVectorType::get(ScalarTy, VL.size()); - auto *FinalVecTy = VecTy; TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; // If we have computed a smaller type for the expression, update VecTy so @@ -3662,6 +3661,7 @@ if (MinBWs.count(VL[0])) VecTy = FixedVectorType::get( IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size()); + auto *FinalVecTy = VecTy; unsigned ReuseShuffleNumbers = E->ReuseShuffleIndices.size(); bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); @@ -3838,7 +3838,6 @@ case Instruction::ExtractElement: { // The common cost of removal ExtractElement/ExtractValue instructions + // the cost of shuffles, if required to resuffle the original vector. - InstructionCost CommonCost = 0; if (NeedToShuffleReuses) { unsigned Idx = 0; for (unsigned I : E->ReuseShuffleIndices) { @@ -4471,7 +4470,6 @@ SmallPtrSet ExtractCostCalculated; InstructionCost ExtractCost = 0; - SmallBitVector IsIdentity; SmallVector VF; SmallVector> ShuffleMask; SmallVector FirstUsers; @@ -4528,15 +4526,12 @@ ShuffleMask.emplace_back(VF.back(), UndefMaskElem); FirstUsers.push_back(EU.User); DemandedElts.push_back(APInt::getNullValue(VF.back())); - IsIdentity.push_back(true); VecId = FirstUsers.size() - 1; } else { VecId = std::distance(FirstUsers.begin(), It); } int Idx = *InsertIdx; ShuffleMask[VecId][Idx] = EU.Lane; - IsIdentity.set(IsIdentity.test(VecId) & - (EU.Lane == Idx || EU.Lane == UndefMaskElem)); DemandedElts[VecId].setBit(Idx); } } @@ -4562,7 +4557,8 @@ InstructionCost SpillCost = getSpillCost(); Cost += SpillCost + ExtractCost; for (int I = 0, E = FirstUsers.size(); I < E; ++I) { - if (!IsIdentity.test(I)) { + // For the very first element - simple shuffle of the source vector. + if (I == 0 && !ShuffleVectorInst::isIdentityMask(ShuffleMask[I])) { InstructionCost C = TTI->getShuffleCost( TTI::SK_PermuteSingleSrc, cast(FirstUsers[I]->getType()), ShuffleMask[I]); @@ -4571,10 +4567,15 @@ << *VectorizableTree.front()->Scalars.front() << ".\n" << "SLP: Current total cost = " << Cost << "\n"); Cost += C; + continue; } + // Other elements - permutation of 2 vectors (the initial one and the next + // Ith incoming vector). unsigned VF = ShuffleMask[I].size(); - for (int &Mask : ShuffleMask[I]) - Mask = (Mask == UndefMaskElem ? 0 : VF) + Mask; + for (unsigned Idx = 0; Idx < VF; ++Idx) { + int &Mask = ShuffleMask[I][Idx]; + Mask = Mask == UndefMaskElem ? Idx : VF + Mask; + } InstructionCost C = TTI->getShuffleCost( TTI::SK_PermuteTwoSrc, cast(FirstUsers[I]->getType()), ShuffleMask[I]); diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/PR38339.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/PR38339.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/PR38339.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/PR38339.ll @@ -3,15 +3,17 @@ define void @f1(<2 x i16> %x, i16* %a) { ; CHECK-LABEL: @f1( -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i16> [[X:%.*]], <2 x i16> poison, <4 x i32> +; CHECK-NEXT: [[T2:%.*]] = extractelement <2 x i16> [[X:%.*]], i32 0 +; CHECK-NEXT: [[T3:%.*]] = extractelement <2 x i16> [[X]], i32 1 ; CHECK-NEXT: [[PTR0:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 0 ; CHECK-NEXT: [[PTR1:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 1 ; CHECK-NEXT: [[PTR2:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 2 ; CHECK-NEXT: [[PTR3:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 3 -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i16> [[X]], i32 0 -; CHECK-NEXT: store i16 [[TMP1]], i16* [[A:%.*]], align 2 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[PTR0]] to <4 x i16>* -; CHECK-NEXT: store <4 x i16> [[SHUFFLE]], <4 x i16>* [[TMP2]], align 2 +; CHECK-NEXT: store i16 [[T2]], i16* [[A:%.*]], align 2 +; CHECK-NEXT: store i16 [[T2]], i16* [[PTR0]], align 2 +; CHECK-NEXT: store i16 [[T3]], i16* [[PTR1]], align 2 +; CHECK-NEXT: store i16 [[T3]], i16* [[PTR2]], align 2 +; CHECK-NEXT: store i16 [[T2]], i16* [[PTR3]], align 2 ; CHECK-NEXT: ret void ; %t2 = extractelement <2 x i16> %x, i32 0 @@ -35,15 +37,17 @@ ; CHECK: cont: ; CHECK-NEXT: [[XX:%.*]] = phi <2 x i16> [ [[X:%.*]], [[ENTRY:%.*]] ], [ undef, [[CONT]] ] ; CHECK-NEXT: [[AA:%.*]] = phi i16* [ [[A:%.*]], [[ENTRY]] ], [ undef, [[CONT]] ] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i16> [[XX]], <2 x i16> poison, <4 x i32> +; CHECK-NEXT: [[T2:%.*]] = extractelement <2 x i16> [[XX]], i32 0 +; CHECK-NEXT: [[T3:%.*]] = extractelement <2 x i16> [[XX]], i32 1 ; CHECK-NEXT: [[PTR0:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 0 ; CHECK-NEXT: [[PTR1:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 1 ; CHECK-NEXT: [[PTR2:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 2 ; CHECK-NEXT: [[PTR3:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 3 -; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i16> [[XX]], i32 0 -; CHECK-NEXT: store i16 [[TMP0]], i16* [[A]], align 2 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[PTR0]] to <4 x i16>* -; CHECK-NEXT: store <4 x i16> [[SHUFFLE]], <4 x i16>* [[TMP1]], align 2 +; CHECK-NEXT: store i16 [[T2]], i16* [[A]], align 2 +; CHECK-NEXT: store i16 [[T2]], i16* [[PTR0]], align 2 +; CHECK-NEXT: store i16 [[T3]], i16* [[PTR1]], align 2 +; CHECK-NEXT: store i16 [[T3]], i16* [[PTR2]], align 2 +; CHECK-NEXT: store i16 [[T2]], i16* [[PTR3]], align 2 ; CHECK-NEXT: [[A_VAL:%.*]] = load i16, i16* [[A]], align 2 ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i16 [[A_VAL]], 0 ; CHECK-NEXT: br i1 [[CMP]], label [[CONT]], label [[EXIT:%.*]] @@ -82,15 +86,17 @@ ; CHECK: cont: ; CHECK-NEXT: [[XX:%.*]] = phi <2 x i16> [ [[X:%.*]], [[ENTRY:%.*]] ], [ undef, [[CONT]] ] ; CHECK-NEXT: [[AA:%.*]] = phi i16* [ [[A:%.*]], [[ENTRY]] ], [ undef, [[CONT]] ] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i16> [[XX]], <2 x i16> poison, <4 x i32> +; CHECK-NEXT: [[T2:%.*]] = extractelement <2 x i16> [[XX]], i32 0 +; CHECK-NEXT: [[T3:%.*]] = extractelement <2 x i16> [[XX]], i32 1 ; CHECK-NEXT: [[PTR0:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 0 ; CHECK-NEXT: [[PTR1:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 1 ; CHECK-NEXT: [[PTR2:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 2 ; CHECK-NEXT: [[PTR3:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 3 -; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i16> [[XX]], i32 1 -; CHECK-NEXT: store i16 [[TMP0]], i16* [[A]], align 2 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[PTR0]] to <4 x i16>* -; CHECK-NEXT: store <4 x i16> [[SHUFFLE]], <4 x i16>* [[TMP1]], align 2 +; CHECK-NEXT: store i16 [[T3]], i16* [[A]], align 2 +; CHECK-NEXT: store i16 [[T3]], i16* [[PTR0]], align 2 +; CHECK-NEXT: store i16 [[T2]], i16* [[PTR1]], align 2 +; CHECK-NEXT: store i16 [[T2]], i16* [[PTR2]], align 2 +; CHECK-NEXT: store i16 [[T3]], i16* [[PTR3]], align 2 ; CHECK-NEXT: [[A_VAL:%.*]] = load i16, i16* [[A]], align 2 ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i16 [[A_VAL]], 0 ; CHECK-NEXT: br i1 [[CMP]], label [[CONT]], label [[EXIT:%.*]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias-inseltpoison.ll @@ -40,22 +40,22 @@ ; CHECK-NEXT: [[T42:%.*]] = mul nsw i32 [[T17]], 16819 ; CHECK-NEXT: [[T47:%.*]] = mul nsw i32 [[T37]], -16069 ; CHECK-NEXT: [[T48:%.*]] = mul nsw i32 [[T38]], -3196 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[T40]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[T15]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[T47]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[T9]], i32 1 +; CHECK-NEXT: [[T49:%.*]] = add nsw i32 [[T40]], [[T47]] +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[T15]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[T40]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[T9]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[T48]], i32 1 ; CHECK-NEXT: [[TMP5:%.*]] = add nsw <2 x i32> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[T50:%.*]] = add nsw i32 [[T40]], [[T48]] -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP5]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP5]], i32 0 ; CHECK-NEXT: [[T65:%.*]] = insertelement <8 x i32> poison, i32 [[TMP6]], i32 0 -; CHECK-NEXT: [[T66:%.*]] = insertelement <8 x i32> [[T65]], i32 [[T50]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i32> [[TMP5]], i32 1 +; CHECK-NEXT: [[T66:%.*]] = insertelement <8 x i32> [[T65]], i32 [[TMP7]], i32 1 ; CHECK-NEXT: [[T67:%.*]] = insertelement <8 x i32> [[T66]], i32 [[T32]], i32 2 -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <8 x i32> -; CHECK-NEXT: [[T691:%.*]] = shufflevector <8 x i32> [[T67]], <8 x i32> [[TMP7]], <8 x i32> -; CHECK-NEXT: [[T70:%.*]] = insertelement <8 x i32> [[T691]], i32 [[T50]], i32 5 -; CHECK-NEXT: [[T71:%.*]] = insertelement <8 x i32> [[T70]], i32 [[T34]], i32 6 -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i32> [[TMP5]], i32 0 -; CHECK-NEXT: [[T72:%.*]] = insertelement <8 x i32> [[T71]], i32 [[TMP8]], i32 7 +; CHECK-NEXT: [[T68:%.*]] = insertelement <8 x i32> [[T67]], i32 [[T49]], i32 3 +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <8 x i32> +; CHECK-NEXT: [[T701:%.*]] = shufflevector <8 x i32> [[T68]], <8 x i32> [[TMP8]], <8 x i32> +; CHECK-NEXT: [[T71:%.*]] = insertelement <8 x i32> [[T701]], i32 [[T34]], i32 6 +; CHECK-NEXT: [[T72:%.*]] = insertelement <8 x i32> [[T71]], i32 [[T49]], i32 7 ; CHECK-NEXT: [[T76:%.*]] = shl <8 x i32> [[T72]], ; CHECK-NEXT: [[T79:%.*]] = bitcast i32* [[T2]] to <8 x i32>* ; CHECK-NEXT: store <8 x i32> [[T76]], <8 x i32>* [[T79]], align 4 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias.ll @@ -40,22 +40,22 @@ ; CHECK-NEXT: [[T42:%.*]] = mul nsw i32 [[T17]], 16819 ; CHECK-NEXT: [[T47:%.*]] = mul nsw i32 [[T37]], -16069 ; CHECK-NEXT: [[T48:%.*]] = mul nsw i32 [[T38]], -3196 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[T40]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[T15]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[T47]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[T9]], i32 1 +; CHECK-NEXT: [[T49:%.*]] = add nsw i32 [[T40]], [[T47]] +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[T15]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[T40]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[T9]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[T48]], i32 1 ; CHECK-NEXT: [[TMP5:%.*]] = add nsw <2 x i32> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[T50:%.*]] = add nsw i32 [[T40]], [[T48]] -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP5]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP5]], i32 0 ; CHECK-NEXT: [[T65:%.*]] = insertelement <8 x i32> undef, i32 [[TMP6]], i32 0 -; CHECK-NEXT: [[T66:%.*]] = insertelement <8 x i32> [[T65]], i32 [[T50]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i32> [[TMP5]], i32 1 +; CHECK-NEXT: [[T66:%.*]] = insertelement <8 x i32> [[T65]], i32 [[TMP7]], i32 1 ; CHECK-NEXT: [[T67:%.*]] = insertelement <8 x i32> [[T66]], i32 [[T32]], i32 2 -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <8 x i32> -; CHECK-NEXT: [[T691:%.*]] = shufflevector <8 x i32> [[T67]], <8 x i32> [[TMP7]], <8 x i32> -; CHECK-NEXT: [[T70:%.*]] = insertelement <8 x i32> [[T691]], i32 [[T50]], i32 5 -; CHECK-NEXT: [[T71:%.*]] = insertelement <8 x i32> [[T70]], i32 [[T34]], i32 6 -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i32> [[TMP5]], i32 0 -; CHECK-NEXT: [[T72:%.*]] = insertelement <8 x i32> [[T71]], i32 [[TMP8]], i32 7 +; CHECK-NEXT: [[T68:%.*]] = insertelement <8 x i32> [[T67]], i32 [[T49]], i32 3 +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <8 x i32> +; CHECK-NEXT: [[T701:%.*]] = shufflevector <8 x i32> [[T68]], <8 x i32> [[TMP8]], <8 x i32> +; CHECK-NEXT: [[T71:%.*]] = insertelement <8 x i32> [[T701]], i32 [[T34]], i32 6 +; CHECK-NEXT: [[T72:%.*]] = insertelement <8 x i32> [[T71]], i32 [[T49]], i32 7 ; CHECK-NEXT: [[T76:%.*]] = shl <8 x i32> [[T72]], ; CHECK-NEXT: [[T79:%.*]] = bitcast i32* [[T2]] to <8 x i32>* ; CHECK-NEXT: store <8 x i32> [[T76]], <8 x i32>* [[T79]], align 4