diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -6795,6 +6795,131 @@
   BoUpSLP &R;
   constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+  InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) {
+    if ((!Root && allConstant(VL)) || all_of(VL, UndefValue::classof))
+      return TTI::TCC_Free;
+    auto *VecTy = FixedVectorType::get(VL.front()->getType(), VL.size());
+    InstructionCost GatherCost = 0;
+    SmallVector<Value *> Gathers(VL.begin(), VL.end());
+    // Improve gather cost for gather of loads, if we can group some of the
+    // loads into vector loads.
+    InstructionsState S = getSameOpcode(VL, *R.TLI);
+    if (VL.size() > 2 && S.getOpcode() == Instruction::Load &&
+        !S.isAltShuffle() &&
+        !all_of(Gathers, [&](Value *V) { return R.getTreeEntry(V); }) &&
+        !isSplat(Gathers)) {
+      BoUpSLP::ValueSet VectorizedLoads;
+      unsigned StartIdx = 0;
+      unsigned VF = VL.size() / 2;
+      unsigned VectorizedCnt = 0;
+      unsigned ScatterVectorizeCnt = 0;
+      const unsigned Sz = R.DL->getTypeSizeInBits(S.MainOp->getType());
+      for (unsigned MinVF = R.getMinVF(2 * Sz); VF >= MinVF; VF /= 2) {
+        for (unsigned Cnt = StartIdx, End = VL.size(); Cnt + VF <= End;
+             Cnt += VF) {
+          ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
+          if (!VectorizedLoads.count(Slice.front()) &&
+              !VectorizedLoads.count(Slice.back()) && allSameBlock(Slice)) {
+            SmallVector<Value *> PointerOps;
+            OrdersType CurrentOrder;
+            LoadsState LS =
+                canVectorizeLoads(Slice, Slice.front(), TTI, *R.DL, *R.SE,
+                                  *R.LI, *R.TLI, CurrentOrder, PointerOps);
+            switch (LS) {
+            case LoadsState::Vectorize:
+            case LoadsState::ScatterVectorize:
+              // Mark the vectorized loads so that we don't vectorize them
+              // again.
+              if (LS == LoadsState::Vectorize)
+                ++VectorizedCnt;
+              else
+                ++ScatterVectorizeCnt;
+              VectorizedLoads.insert(Slice.begin(), Slice.end());
+              // If we vectorized initial block, no need to try to vectorize
+              // it again.
+              if (Cnt == StartIdx)
+                StartIdx += VF;
+              break;
+            case LoadsState::Gather:
+              break;
+            }
+          }
+        }
+        // Check if the whole array was vectorized already - exit.
+        if (StartIdx >= VL.size())
+          break;
+        // Found vectorizable parts - exit.
+        if (!VectorizedLoads.empty())
+          break;
+      }
+      if (!VectorizedLoads.empty()) {
+        unsigned NumParts = TTI.getNumberOfParts(VecTy);
+        bool NeedInsertSubvectorAnalysis =
+            !NumParts || (VL.size() / VF) > NumParts;
+        // Get the cost for gathered loads.
+        for (unsigned I = 0, End = VL.size(); I < End; I += VF) {
+          if (VectorizedLoads.contains(VL[I]))
+            continue;
+          GatherCost += getBuildVectorCost(VL.slice(I, VF), Root);
+        }
+        // Exclude potentially vectorized loads from list of gathered
+        // scalars.
+        auto *LI = cast<LoadInst>(S.MainOp);
+        Gathers.assign(Gathers.size(), PoisonValue::get(LI->getType()));
+        // The cost for vectorized loads.
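+        // Charge one vector load per consecutive group and one masked gather
+        // per scatter-vectorized group, then subtract the cost of the
+        // original scalar loads (ScalarsCost below), which are no longer
+        // emitted individually.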
+        InstructionCost ScalarsCost = 0;
+        for (Value *V : VectorizedLoads) {
+          auto *LI = cast<LoadInst>(V);
+          ScalarsCost +=
+              TTI.getMemoryOpCost(Instruction::Load, LI->getType(),
+                                  LI->getAlign(), LI->getPointerAddressSpace(),
+                                  CostKind, TTI::OperandValueInfo(), LI);
+        }
+        auto *LoadTy = FixedVectorType::get(LI->getType(), VF);
+        Align Alignment = LI->getAlign();
+        GatherCost +=
+            VectorizedCnt *
+            TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment,
+                                LI->getPointerAddressSpace(), CostKind,
+                                TTI::OperandValueInfo(), LI);
+        GatherCost += ScatterVectorizeCnt *
+                      TTI.getGatherScatterOpCost(
+                          Instruction::Load, LoadTy, LI->getPointerOperand(),
+                          /*VariableMask=*/false, Alignment, CostKind, LI);
+        if (NeedInsertSubvectorAnalysis) {
+          // Add the cost for the subvectors insert.
+          for (int I = VF, E = VL.size(); I < E; I += VF)
+            GatherCost += TTI.getShuffleCost(TTI::SK_InsertSubvector, VecTy,
+                                             std::nullopt, CostKind, I, LoadTy);
+        }
+        GatherCost -= ScalarsCost;
+      }
+    } else if (!Root && isSplat(VL)) {
+      // Found the broadcasting of the single scalar, calculate the cost as
+      // the broadcast.
+      const auto *It =
+          find_if(VL, [](Value *V) { return !isa<UndefValue>(V); });
+      assert(It != VL.end() && "Expected at least one non-undef value.");
+      // Add broadcast for non-identity shuffle only.
+      bool NeedShuffle =
+          count(VL, *It) > 1 &&
+          (VL.front() != *It || !all_of(VL.drop_front(), UndefValue::classof));
+      InstructionCost InsertCost = TTI.getVectorInstrCost(
+          Instruction::InsertElement, VecTy, CostKind,
+          NeedShuffle ? 0 : std::distance(VL.begin(), It),
+          PoisonValue::get(VecTy), *It);
+      return InsertCost +
+             (NeedShuffle ? TTI.getShuffleCost(
+                                TargetTransformInfo::SK_Broadcast, VecTy,
+                                /*Mask=*/std::nullopt, CostKind, /*Index=*/0,
+                                /*SubTp=*/nullptr, /*Args=*/*It)
+                          : TTI::TCC_Free);
+    }
+    return GatherCost + (all_of(Gathers, UndefValue::classof)
+                             ? TTI::TCC_Free
+                             : R.getGatherCost(Gathers));
+  };
+
 public:
   ShuffleCostEstimator(TargetTransformInfo &TTI,
                        ArrayRef<Value *> VectorizedVals, BoUpSLP &R)
@@ -6887,6 +7012,9 @@
     }
     return VecBase;
   }
+  void gather(ArrayRef<Value *> VL, Value *Root = nullptr) {
+    Cost += getBuildVectorCost(VL, Root);
+  }
   /// Finalize emission of the shuffles.
   InstructionCost finalize() {
     IsFinalized = true;
@@ -6957,7 +7085,6 @@
       GatheredScalars.append(VF - GatheredScalars.size(),
                              PoisonValue::get(ScalarTy));
     }
-    InstructionCost ExtractCost = Estimator.finalize();
     // Do not try to look for reshuffled loads for gathered loads (they will be
     // handled later), for vectorized scalars, and cases, which are definitely
@@ -7007,9 +7134,11 @@
         ::addMask(Mask, E->ReuseShuffleIndices);
       GatherCost = TTI->getShuffleCost(*GatherShuffle, FinalVecTy, Mask);
     }
-    if (!all_of(GatheredScalars, UndefValue::classof))
-      GatherCost += getGatherCost(GatheredScalars);
-    return GatherCost;
+    Estimator.gather(
+        GatheredScalars,
+        Constant::getNullValue(FixedVectorType::get(
+            GatheredScalars.front()->getType(), GatheredScalars.size())));
+    return GatherCost + Estimator.finalize();
   }
   if (ExtractShuffle && all_of(GatheredScalars, PoisonValue::classof)) {
     // Check that gather of extractelements can be represented as just a
     // shuffle of a single/two vectors the scalars are extracted from.
@@ -7022,129 +7151,14 @@
     if (NeedToShuffleReuses)
       Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
                                   FinalVecTy, E->ReuseShuffleIndices);
-    return Cost + ExtractCost;
-  }
-  if (isSplat(VL)) {
-    // Found the broadcasting of the single scalar, calculate the cost as the
-    // broadcast.
-    assert(VecTy == FinalVecTy &&
-           "No reused scalars expected for broadcast.");
-    const auto *It =
-        find_if(VL, [](Value *V) { return !isa<UndefValue>(V); });
-    // If all values are undefs - consider cost free.
-    if (It == VL.end())
-      return TTI::TCC_Free;
-    // Add broadcast for non-identity shuffle only.
-    bool NeedShuffle =
-        count(VL, *It) > 1 &&
-        (VL.front() != *It || !all_of(VL.drop_front(), UndefValue::classof));
-    InstructionCost InsertCost = TTI->getVectorInstrCost(
-        Instruction::InsertElement, VecTy, CostKind,
-        NeedShuffle ? 0 : std::distance(VL.begin(), It),
-        PoisonValue::get(VecTy), *It);
-    return InsertCost + (NeedShuffle
-                             ? TTI->getShuffleCost(
-                                   TargetTransformInfo::SK_Broadcast, VecTy,
-                                   /*Mask=*/std::nullopt, CostKind,
-                                   /*Index=*/0,
-                                   /*SubTp=*/nullptr, /*Args=*/*It)
-                             : TTI::TCC_Free);
+    return Cost + Estimator.finalize();
   }
   InstructionCost ReuseShuffleCost = 0;
   if (NeedToShuffleReuses)
     ReuseShuffleCost = TTI->getShuffleCost(
         TTI::SK_PermuteSingleSrc, FinalVecTy, E->ReuseShuffleIndices);
-  // Improve gather cost for gather of loads, if we can group some of the
-  // loads into vector loads.
-  if (VL.size() > 2 && E->getOpcode() == Instruction::Load &&
-      !E->isAltShuffle()) {
-    BoUpSLP::ValueSet VectorizedLoads;
-    unsigned StartIdx = 0;
-    unsigned VF = VL.size() / 2;
-    unsigned VectorizedCnt = 0;
-    unsigned ScatterVectorizeCnt = 0;
-    const unsigned Sz = DL->getTypeSizeInBits(E->getMainOp()->getType());
-    for (unsigned MinVF = getMinVF(2 * Sz); VF >= MinVF; VF /= 2) {
-      for (unsigned Cnt = StartIdx, End = VL.size(); Cnt + VF <= End;
-           Cnt += VF) {
-        ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
-        if (!VectorizedLoads.count(Slice.front()) &&
-            !VectorizedLoads.count(Slice.back()) && allSameBlock(Slice)) {
-          SmallVector<Value *> PointerOps;
-          OrdersType CurrentOrder;
-          LoadsState LS =
-              canVectorizeLoads(Slice, Slice.front(), *TTI, *DL, *SE, *LI,
-                                *TLI, CurrentOrder, PointerOps);
-          switch (LS) {
-          case LoadsState::Vectorize:
-          case LoadsState::ScatterVectorize:
-            // Mark the vectorized loads so that we don't vectorize them
-            // again.
-            if (LS == LoadsState::Vectorize)
-              ++VectorizedCnt;
-            else
-              ++ScatterVectorizeCnt;
-            VectorizedLoads.insert(Slice.begin(), Slice.end());
-            // If we vectorized initial block, no need to try to vectorize it
-            // again.
-            if (Cnt == StartIdx)
-              StartIdx += VF;
-            break;
-          case LoadsState::Gather:
-            break;
-          }
-        }
-      }
-      // Check if the whole array was vectorized already - exit.
-      if (StartIdx >= VL.size())
-        break;
-      // Found vectorizable parts - exit.
-      if (!VectorizedLoads.empty())
-        break;
-    }
-    if (!VectorizedLoads.empty()) {
-      InstructionCost GatherCost = 0;
-      unsigned NumParts = TTI->getNumberOfParts(VecTy);
-      bool NeedInsertSubvectorAnalysis =
-          !NumParts || (VL.size() / VF) > NumParts;
-      // Get the cost for gathered loads.
-      for (unsigned I = 0, End = VL.size(); I < End; I += VF) {
-        if (VectorizedLoads.contains(VL[I]))
-          continue;
-        GatherCost += getGatherCost(VL.slice(I, VF));
-      }
-      // The cost for vectorized loads.
-      InstructionCost ScalarsCost = 0;
-      for (Value *V : VectorizedLoads) {
-        auto *LI = cast<LoadInst>(V);
-        ScalarsCost +=
-            TTI->getMemoryOpCost(Instruction::Load, LI->getType(),
-                                 LI->getAlign(), LI->getPointerAddressSpace(),
-                                 CostKind, TTI::OperandValueInfo(), LI);
-      }
-      auto *LI = cast<LoadInst>(E->getMainOp());
-      auto *LoadTy = FixedVectorType::get(LI->getType(), VF);
-      Align Alignment = LI->getAlign();
-      GatherCost +=
-          VectorizedCnt *
-          TTI->getMemoryOpCost(Instruction::Load, LoadTy, Alignment,
-                               LI->getPointerAddressSpace(), CostKind,
-                               TTI::OperandValueInfo(), LI);
-      GatherCost += ScatterVectorizeCnt *
-                    TTI->getGatherScatterOpCost(
-                        Instruction::Load, LoadTy, LI->getPointerOperand(),
-                        /*VariableMask=*/false, Alignment, CostKind, LI);
-      if (NeedInsertSubvectorAnalysis) {
-        // Add the cost for the subvectors insert.
-        for (int I = VF, E = VL.size(); I < E; I += VF)
-          GatherCost +=
-              TTI->getShuffleCost(TTI::SK_InsertSubvector, VecTy,
-                                  std::nullopt, CostKind, I, LoadTy);
-      }
-      return ReuseShuffleCost + GatherCost - ScalarsCost;
-    }
-  }
-  return ReuseShuffleCost + getGatherCost(VL);
+  Estimator.gather(GatheredScalars);
+  return ReuseShuffleCost + Estimator.finalize();
 }
 InstructionCost CommonCost = 0;
 SmallVector<int> Mask;
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll
@@ -3,27 +3,21 @@
 
 define void @test(<2 x i64> %0, <2 x i64> %1, <2 x i64> %2) {
 ; CHECK-LABEL: @test(
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP1:%.*]], i64 0
-; CHECK-NEXT:    [[TMP5:%.*]] = or i64 [[TMP4]], 0
-; CHECK-NEXT:    [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i64> [[TMP0:%.*]], i64 0
-; CHECK-NEXT:    [[TMP8:%.*]] = or i64 [[TMP7]], 0
-; CHECK-NEXT:    [[TMP9:%.*]] = trunc i64 [[TMP8]] to i32
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x i64> [[TMP2:%.*]], i64 0
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x i64> [[TMP2]], i64 1
-; CHECK-NEXT:    [[TMP12:%.*]] = or i64 [[TMP10]], [[TMP11]]
-; CHECK-NEXT:    [[TMP13:%.*]] = trunc i64 [[TMP12]] to i32
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x i64> [[TMP0]], i64 0
-; CHECK-NEXT:    [[TMP15:%.*]] = or i64 [[TMP14]], 0
-; CHECK-NEXT:    [[TMP16:%.*]] = trunc i64 [[TMP15]] to i32
-; CHECK-NEXT:    br label [[TMP17:%.*]]
-; CHECK:       17:
-; CHECK-NEXT:    [[TMP18:%.*]] = phi i32 [ [[TMP22:%.*]], [[TMP17]] ], [ [[TMP6]], [[TMP3:%.*]] ]
-; CHECK-NEXT:    [[TMP19:%.*]] = phi i32 [ 0, [[TMP17]] ], [ [[TMP9]], [[TMP3]] ]
-; CHECK-NEXT:    [[TMP20:%.*]] = phi i32 [ 0, [[TMP17]] ], [ [[TMP13]], [[TMP3]] ]
-; CHECK-NEXT:    [[TMP21:%.*]] = phi i32 [ 0, [[TMP17]] ], [ [[TMP16]], [[TMP3]] ]
-; CHECK-NEXT:    [[TMP22]] = or i32 [[TMP18]], 0
-; CHECK-NEXT:    br label [[TMP17]]
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[TMP2:%.*]], i64 0
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP1:%.*]], <2 x i64> [[TMP0:%.*]], <4 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i64> [[TMP5]], i64 [[TMP4]], i32 2
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> poison, <4 x i32>
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> , <4 x i32>
+; CHECK-NEXT:    [[TMP9:%.*]] = or <4 x i64> [[TMP6]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = trunc <4 x i64> [[TMP9]] to <4 x i32>
+; CHECK-NEXT:    br label [[TMP11:%.*]]
+; CHECK:       11:
+; CHECK-NEXT:    [[TMP12:%.*]] = phi <4 x i32> [ [[TMP16:%.*]], [[TMP11]] ], [ [[TMP10]], [[TMP3:%.*]] ]
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP12]], <4 x i32> , <4 x i32>
+; CHECK-NEXT:    [[TMP14:%.*]] = or <4 x i32> zeroinitializer, [[TMP13]]
+; CHECK-NEXT:    [[TMP15:%.*]] = add <4 x i32> zeroinitializer, [[TMP13]]
+; CHECK-NEXT:    [[TMP16]] = shufflevector <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], <4 x i32>
+; CHECK-NEXT:    br label [[TMP11]]
 ;
   %4 = extractelement <2 x i64> %1, i64 0
   %5 = or i64 %4, 0
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll b/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll
@@ -524,36 +524,22 @@
 
 define i1 @foo(float %a, float %b, float %c, <4 x float> %vec, i64 %idx2) {
-; SSE-LABEL: @foo(
-; SSE-NEXT:    [[VECEXT_I291_I166:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 0
-; SSE-NEXT:    [[SUB14_I167:%.*]] = fsub float undef, [[VECEXT_I291_I166]]
-; SSE-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> poison, float [[A:%.*]], i32 0
-; SSE-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[C:%.*]], i32 1
-; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[VEC]], <4 x float> poison, <2 x i32>
-; SSE-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[SUB14_I167]], i32 0
-; SSE-NEXT:    [[TMP5:%.*]] = fmul <2 x float> [[TMP2]], [[TMP4]]
-; SSE-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> , float [[B:%.*]], i32 0
-; SSE-NEXT:    [[TMP7:%.*]] = fsub <2 x float> [[TMP5]], [[TMP6]]
-; SSE-NEXT:    [[TMP8:%.*]] = fadd <2 x float> [[TMP7]],
-; SSE-NEXT:    [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0
-; SSE-NEXT:    [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1
-; SSE-NEXT:    [[MUL123_I184:%.*]] = fmul float [[TMP9]], [[TMP10]]
-; SSE-NEXT:    [[CMP_I185:%.*]] = fcmp ogt float [[MUL123_I184]], 0.000000e+00
-; SSE-NEXT:    ret i1 [[CMP_I185]]
-;
-; AVX-LABEL: @foo(
-; AVX-NEXT:    [[VECEXT_I291_I166:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 0
-; AVX-NEXT:    [[SUB14_I167:%.*]] = fsub float undef, [[VECEXT_I291_I166]]
-; AVX-NEXT:    [[FM:%.*]] = fmul float [[A:%.*]], [[SUB14_I167]]
-; AVX-NEXT:    [[SUB25_I168:%.*]] = fsub float [[FM]], [[B:%.*]]
-; AVX-NEXT:    [[VECEXT_I276_I169:%.*]] = extractelement <4 x float> [[VEC]], i64 1
-; AVX-NEXT:    [[ADD36_I173:%.*]] = fadd float [[SUB25_I168]], 1.000000e+01
-; AVX-NEXT:    [[MUL72_I179:%.*]] = fmul float [[C:%.*]], [[VECEXT_I276_I169]]
-; AVX-NEXT:    [[ADD78_I180:%.*]] = fsub float [[MUL72_I179]], 3.000000e+01
-; AVX-NEXT:    [[ADD79_I181:%.*]] = fadd float 2.000000e+00, [[ADD78_I180]]
-; AVX-NEXT:    [[MUL123_I184:%.*]] = fmul float [[ADD36_I173]], [[ADD79_I181]]
-; AVX-NEXT:    [[CMP_I185:%.*]] = fcmp ogt float [[MUL123_I184]], 0.000000e+00
-; AVX-NEXT:    ret i1 [[CMP_I185]]
+; CHECK-LABEL: @foo(
+; CHECK-NEXT:    [[VECEXT_I291_I166:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 0
+; CHECK-NEXT:    [[SUB14_I167:%.*]] = fsub float undef, [[VECEXT_I291_I166]]
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> poison, float [[A:%.*]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[C:%.*]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[VEC]], <4 x float> poison, <2 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[SUB14_I167]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = fmul <2 x float> [[TMP2]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> , float [[B:%.*]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = fsub <2 x float> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = fadd <2 x float> [[TMP7]],
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1
+; CHECK-NEXT:    [[MUL123_I184:%.*]] = fmul float [[TMP9]], [[TMP10]]
+; CHECK-NEXT:    [[CMP_I185:%.*]] = fcmp ogt float [[MUL123_I184]], 0.000000e+00
+; CHECK-NEXT:    ret i1 [[CMP_I185]]
 ;
   %vecext.i291.i166 = extractelement <4 x float> %vec, i64 0
   %sub14.i167 = fsub float undef, %vecext.i291.i166
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduced-gathered-vectorized.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduced-gathered-vectorized.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduced-gathered-vectorized.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduced-gathered-vectorized.ll
@@ -6,21 +6,23 @@
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[A:%.*]] = getelementptr [1000 x i64], ptr null, i64 0, i64 5
 ; CHECK-NEXT:    [[A1:%.*]] = getelementptr [1000 x i64], ptr null, i64 0, i64 6
+; CHECK-NEXT:    [[A2:%.*]] = getelementptr [1000 x i64], ptr null, i64 0, i64 7
 ; CHECK-NEXT:    br label [[WHILE:%.*]]
 ; CHECK:       while:
-; CHECK-NEXT:    [[PH:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[OP_RDX25:%.*]], [[WHILE]] ]
+; CHECK-NEXT:    [[PH:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[OP_RDX5:%.*]], [[WHILE]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr null, align 8
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr null, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[A]], align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr [[A1]], align 16
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> poison, <4 x i32>
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> poison, <4 x i32>
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP5]], <4 x i32>
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP6]], <4 x i64> zeroinitializer, <4 x i32>
-; CHECK-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> [[TMP7]])
-; CHECK-NEXT:    [[OP_RDX23:%.*]] = xor i64 0, [[TMP1]]
-; CHECK-NEXT:    [[OP_RDX24:%.*]] = xor i64 [[TMP0]], [[TMP8]]
-; CHECK-NEXT:    [[OP_RDX25]] = xor i64 [[OP_RDX23]], [[OP_RDX24]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[A1]], align 16
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr [[A2]], align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i64>, ptr [[A]], align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> poison, <8 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <8 x i64> [[TMP5]], i64 [[TMP1]], i32 2
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <8 x i64> [[TMP6]], i64 [[TMP2]], i32 3
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> poison, <8 x i32>
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <8 x i64> [[TMP7]], <8 x i64> [[TMP8]], <8 x i32>
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <8 x i64> [[TMP9]], <8 x i64> zeroinitializer, <8 x i32>
+; CHECK-NEXT:    [[TMP11:%.*]] = call i64 @llvm.vector.reduce.xor.v8i64(<8 x i64> [[TMP10]])
+; CHECK-NEXT:    [[OP_RDX5]] = xor i64 [[TMP0]], [[TMP11]]
 ; CHECK-NEXT:    br label [[WHILE]]
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll
@@ -93,33 +93,20 @@
 }
 
 define i1 @logical_and_icmp_diff_preds(<4 x i32> %x) {
-; SSE-LABEL: @logical_and_icmp_diff_preds(
-; SSE-NEXT:    [[X0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0
-; SSE-NEXT:    [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2
-; SSE-NEXT:    [[C0:%.*]] = icmp ult i32 [[X0]], 0
-; SSE-NEXT:    [[C2:%.*]] = icmp sgt i32 [[X2]], 0
-; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> poison, <2 x i32>
-; SSE-NEXT:    [[TMP2:%.*]] = icmp slt <2 x i32> [[TMP1]], zeroinitializer
-; SSE-NEXT:    [[TMP3:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1
-; SSE-NEXT:    [[S1:%.*]] = select i1 [[C0]], i1 [[TMP3]], i1 false
-; SSE-NEXT:    [[S2:%.*]] = select i1 [[S1]], i1 [[C2]], i1 false
-; SSE-NEXT:    [[TMP4:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0
-; SSE-NEXT:    [[S3:%.*]] = select i1 [[S2]], i1 [[TMP4]], i1 false
-; SSE-NEXT:    ret i1 [[S3]]
-;
-; AVX-LABEL: @logical_and_icmp_diff_preds(
-; AVX-NEXT:    [[X0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0
-; AVX-NEXT:    [[X1:%.*]] = extractelement <4 x i32> [[X]], i32 1
-; AVX-NEXT:    [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2
-; AVX-NEXT:    [[X3:%.*]] = extractelement <4 x i32> [[X]], i32 3
-; AVX-NEXT:    [[C0:%.*]] = icmp ult i32 [[X0]], 0
-; AVX-NEXT:    [[C1:%.*]] = icmp slt i32 [[X1]], 0
-; AVX-NEXT:    [[C2:%.*]] = icmp sgt i32 [[X2]], 0
-; AVX-NEXT:    [[C3:%.*]] = icmp slt i32 [[X3]], 0
-; AVX-NEXT:    [[S1:%.*]] = select i1 [[C0]], i1 [[C1]], i1 false
-; AVX-NEXT:    [[S2:%.*]] = select i1 [[S1]], i1 [[C2]], i1 false
-; AVX-NEXT:    [[S3:%.*]] = select i1 [[S2]], i1 [[C3]], i1 false
-; AVX-NEXT:    ret i1 [[S3]]
+; CHECK-LABEL: @logical_and_icmp_diff_preds(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> , <4 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> , <4 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ult <4 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp slt <4 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i1> [[TMP3]], <4 x i1> [[TMP4]], <4 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i1> [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i1> [[TMP5]], i32 2
+; CHECK-NEXT:    [[S1:%.*]] = select i1 [[TMP6]], i1 [[TMP7]], i1 false
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x i1> [[TMP5]], i32 3
+; CHECK-NEXT:    [[S2:%.*]] = select i1 [[S1]], i1 [[TMP8]], i1 false
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x i1> [[TMP5]], i32 1
+; CHECK-NEXT:    [[S3:%.*]] = select i1 [[S2]], i1 [[TMP9]], i1 false
+; CHECK-NEXT:    ret i1 [[S3]]
 ;
   %x0 = extractelement <4 x i32> %x, i32 0
   %x1 = extractelement <4 x i32> %x, i32 1