diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -3621,6 +3621,27 @@
   return Cost;
 }
 
+/// Shuffles \p Mask in accordance with the given \p SubMask.
+static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask) {
+  if (SubMask.empty())
+    return;
+  if (Mask.empty()) {
+    Mask.append(SubMask.begin(), SubMask.end());
+    return;
+  }
+  SmallVector<int> NewMask(SubMask.size(), SubMask.size());
+  int TermValue = std::min(Mask.size(), SubMask.size());
+  for (int I = 0, E = SubMask.size(); I < E; ++I) {
+    if (SubMask[I] >= TermValue || SubMask[I] == UndefMaskElem ||
+        Mask[SubMask[I]] >= TermValue) {
+      NewMask[I] = UndefMaskElem;
+      continue;
+    }
+    NewMask[I] = Mask[SubMask[I]];
+  }
+  Mask.swap(NewMask);
+}
+
 InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
                                       ArrayRef<Value *> VectorizedVals) {
   ArrayRef<Value *> VL = E->Scalars;
@@ -3633,6 +3654,7 @@
   else if (auto *IE = dyn_cast<InsertElementInst>(VL[0]))
     ScalarTy = IE->getOperand(1)->getType();
   auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
+  auto *FinalVecTy = VecTy;
   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
 
   // If we have computed a smaller type for the expression, update VecTy so
@@ -3643,12 +3665,9 @@
 
   unsigned ReuseShuffleNumbers = E->ReuseShuffleIndices.size();
   bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
-  InstructionCost ReuseShuffleCost = 0;
-  if (NeedToShuffleReuses) {
-    ReuseShuffleCost =
-        TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy,
-                            E->ReuseShuffleIndices);
-  }
+  if (NeedToShuffleReuses)
+    FinalVecTy =
+        FixedVectorType::get(VecTy->getElementType(), ReuseShuffleNumbers);
   // FIXME: it tries to fix a problem with MSVC buildbots.
   TargetTransformInfo &TTIRef = *TTI;
   auto &&AdjustExtractsCost = [this, &TTIRef, CostKind, VL, VecTy,
@@ -3737,6 +3756,10 @@
             dbgs()
             << "SLP: perfect diamond match for gather bundle that starts with "
             << *VL.front() << ".\n");
+        if (NeedToShuffleReuses)
+          GatherCost =
+              TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
+                                  FinalVecTy, E->ReuseShuffleIndices);
       } else {
         LLVM_DEBUG(dbgs() << "SLP: shuffled " << Entries.size()
                           << " entries for bundle that starts with "
@@ -3744,16 +3767,15 @@
         // Detected that instead of gather we can emit a shuffle of single/two
        // previously vectorized nodes. Add the cost of the permutation rather
        // than gather.
-        GatherCost = TTI->getShuffleCost(*Shuffle, VecTy, Mask);
+        ::addMask(Mask, E->ReuseShuffleIndices);
+        GatherCost = TTI->getShuffleCost(*Shuffle, FinalVecTy, Mask);
       }
-      return ReuseShuffleCost + GatherCost;
+      return GatherCost;
     }
     if (isSplat(VL)) {
       // Found the broadcasting of the single scalar, calculate the cost as the
       // broadcast.
-      return ReuseShuffleCost +
-             TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, None,
-                                 0);
+      return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy);
     }
     if (E->getOpcode() == Instruction::ExtractElement && allSameType(VL) &&
         allSameBlock(VL) &&
@@ -3771,11 +3793,36 @@
         InstructionCost Cost =
             computeExtractCost(VL, VecTy, *ShuffleKind, Mask, *TTI);
         AdjustExtractsCost(Cost, /*IsGather=*/true);
-        return ReuseShuffleCost + Cost;
+        if (NeedToShuffleReuses)
+          Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
+                                      FinalVecTy, E->ReuseShuffleIndices);
+        return Cost;
       }
     }
+    InstructionCost ReuseShuffleCost = 0;
+    if (NeedToShuffleReuses)
+      ReuseShuffleCost = TTI->getShuffleCost(
+          TTI::SK_PermuteSingleSrc, FinalVecTy, E->ReuseShuffleIndices);
     return ReuseShuffleCost + getGatherCost(VL);
   }
+  InstructionCost CommonCost = 0;
+  SmallVector<int> Mask;
+  if (!E->ReorderIndices.empty()) {
+    SmallVector<int> NewMask;
+    if (E->getOpcode() == Instruction::Store) {
+      // For stores the order is actually a mask.
+      NewMask.resize(E->ReorderIndices.size());
+      copy(E->ReorderIndices, NewMask.begin());
+    } else {
+      inversePermutation(E->ReorderIndices, NewMask);
+    }
+    ::addMask(Mask, NewMask);
+  }
+  if (NeedToShuffleReuses)
+    ::addMask(Mask, E->ReuseShuffleIndices);
+  if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask))
+    CommonCost =
+        TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, FinalVecTy, Mask);
   assert((E->State == TreeEntry::Vectorize ||
           E->State == TreeEntry::ScatterVectorize) &&
          "Unhandled state");
@@ -3797,12 +3844,12 @@
         for (unsigned I : E->ReuseShuffleIndices) {
           if (ShuffleOrOp == Instruction::ExtractElement) {
             auto *EE = cast<ExtractElementInst>(VL[I]);
-            ReuseShuffleCost -= TTI->getVectorInstrCost(
-                Instruction::ExtractElement, EE->getVectorOperandType(),
-                *getExtractIndex(EE));
+            CommonCost -= TTI->getVectorInstrCost(Instruction::ExtractElement,
+                                                  EE->getVectorOperandType(),
+                                                  *getExtractIndex(EE));
           } else {
-            ReuseShuffleCost -= TTI->getVectorInstrCost(
-                Instruction::ExtractElement, VecTy, Idx);
+            CommonCost -= TTI->getVectorInstrCost(Instruction::ExtractElement,
+                                                  VecTy, Idx);
             ++Idx;
           }
         }
@@ -3810,21 +3857,15 @@
         for (Value *V : VL) {
           if (ShuffleOrOp == Instruction::ExtractElement) {
             auto *EE = cast<ExtractElementInst>(V);
-            ReuseShuffleCost += TTI->getVectorInstrCost(
-                Instruction::ExtractElement, EE->getVectorOperandType(),
-                *getExtractIndex(EE));
+            CommonCost += TTI->getVectorInstrCost(Instruction::ExtractElement,
+                                                  EE->getVectorOperandType(),
+                                                  *getExtractIndex(EE));
           } else {
             --Idx;
-            ReuseShuffleCost += TTI->getVectorInstrCost(
-                Instruction::ExtractElement, VecTy, Idx);
+            CommonCost += TTI->getVectorInstrCost(Instruction::ExtractElement,
+                                                  VecTy, Idx);
           }
         }
-        CommonCost = ReuseShuffleCost;
-      } else if (!E->ReorderIndices.empty()) {
-        SmallVector<int> NewMask;
-        inversePermutation(E->ReorderIndices, NewMask);
-        CommonCost = TTI->getShuffleCost(
-            TargetTransformInfo::SK_PermuteSingleSrc, VecTy, NewMask);
       }
       if (ShuffleOrOp == Instruction::ExtractValue) {
         for (unsigned I = 0, E = VL.size(); I < E; ++I) {
@@ -3915,7 +3956,7 @@
           TTI->getCastInstrCost(E->getOpcode(), ScalarTy, SrcTy,
                                 TTI::getCastContextHint(VL0), CostKind, VL0);
       if (NeedToShuffleReuses) {
-        ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+        CommonCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
       }
 
       // Calculate the cost of this instruction.
@@ -3925,12 +3966,11 @@
       InstructionCost VecCost = 0;
       // Check if the values are candidates to demote.
       if (!MinBWs.count(VL0) || VecTy != SrcVecTy) {
-        VecCost =
-            ReuseShuffleCost +
-            TTI->getCastInstrCost(E->getOpcode(), VecTy, SrcVecTy,
-                                  TTI::getCastContextHint(VL0), CostKind, VL0);
+        VecCost = CommonCost + TTI->getCastInstrCost(
+                                   E->getOpcode(), VecTy, SrcVecTy,
+                                   TTI::getCastContextHint(VL0), CostKind, VL0);
       }
-      LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost));
+      LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost));
       return VecCost - ScalarCost;
     }
     case Instruction::FCmp:
@@ -3941,7 +3981,7 @@
           TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy, Builder.getInt1Ty(),
                                   CmpInst::BAD_ICMP_PREDICATE, CostKind, VL0);
       if (NeedToShuffleReuses) {
-        ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+        CommonCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
       }
       auto *MaskTy = FixedVectorType::get(Builder.getInt1Ty(), VL.size());
       InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost;
@@ -3982,8 +4022,8 @@
                                     CmpInst::BAD_ICMP_PREDICATE, CostKind);
         VecCost = std::min(VecCost, IntrinsicCost);
       }
-      LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost));
-      return ReuseShuffleCost + VecCost - ScalarCost;
+      LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost));
+      return CommonCost + VecCost - ScalarCost;
     }
     case Instruction::FNeg:
     case Instruction::Add:
@@ -4046,14 +4086,14 @@
           TTI->getArithmeticInstrCost(E->getOpcode(), ScalarTy, CostKind, Op1VK,
                                       Op2VK, Op1VP, Op2VP, Operands, VL0);
       if (NeedToShuffleReuses) {
-        ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+        CommonCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
       }
       InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost;
       InstructionCost VecCost =
           TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind, Op1VK,
                                       Op2VK, Op1VP, Op2VP, Operands, VL0);
-      LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost));
-      return ReuseShuffleCost + VecCost - ScalarCost;
+      LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost));
+      return CommonCost + VecCost - ScalarCost;
     }
     case Instruction::GetElementPtr: {
       TargetTransformInfo::OperandValueKind Op1VK =
@@ -4064,13 +4104,13 @@
       InstructionCost ScalarEltCost = TTI->getArithmeticInstrCost(
           Instruction::Add, ScalarTy, CostKind, Op1VK, Op2VK);
       if (NeedToShuffleReuses) {
-        ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+        CommonCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
       }
       InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost;
       InstructionCost VecCost = TTI->getArithmeticInstrCost(
           Instruction::Add, VecTy, CostKind, Op1VK, Op2VK);
-      LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost));
-      return ReuseShuffleCost + VecCost - ScalarCost;
+      LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost));
+      return CommonCost + VecCost - ScalarCost;
     }
     case Instruction::Load: {
       // Cost of wide load - cost of scalar loads.
@@ -4078,7 +4118,7 @@
       InstructionCost ScalarEltCost = TTI->getMemoryOpCost(
           Instruction::Load, ScalarTy, Alignment, 0, CostKind, VL0);
       if (NeedToShuffleReuses) {
-        ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+        CommonCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
       }
       InstructionCost ScalarLdCost = VecTy->getNumElements() * ScalarEltCost;
       InstructionCost VecLdCost;
@@ -4095,14 +4135,8 @@
             Instruction::Load, VecTy, cast<LoadInst>(VL0)->getPointerOperand(),
             /*VariableMask=*/false, Alignment, CostKind, VL0);
       }
-      if (!NeedToShuffleReuses && !E->ReorderIndices.empty()) {
-        SmallVector<int> NewMask;
-        inversePermutation(E->ReorderIndices, NewMask);
-        VecLdCost += TTI->getShuffleCost(
-            TargetTransformInfo::SK_PermuteSingleSrc, VecTy, NewMask);
-      }
-      LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecLdCost, ScalarLdCost));
-      return ReuseShuffleCost + VecLdCost - ScalarLdCost;
+      LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecLdCost, ScalarLdCost));
+      return CommonCost + VecLdCost - ScalarLdCost;
     }
     case Instruction::Store: {
       // We know that we can merge the stores. Calculate the cost.
@@ -4115,14 +4149,8 @@
       InstructionCost ScalarStCost = VecTy->getNumElements() * ScalarEltCost;
       InstructionCost VecStCost = TTI->getMemoryOpCost(
           Instruction::Store, VecTy, Alignment, 0, CostKind, VL0);
-      if (IsReorder) {
-        SmallVector<int> NewMask;
-        inversePermutation(E->ReorderIndices, NewMask);
-        VecStCost += TTI->getShuffleCost(
-            TargetTransformInfo::SK_PermuteSingleSrc, VecTy, NewMask);
-      }
-      LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecStCost, ScalarStCost));
-      return VecStCost - ScalarStCost;
+      LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecStCost, ScalarStCost));
+      return CommonCost + VecStCost - ScalarStCost;
     }
     case Instruction::Call: {
       CallInst *CI = cast<CallInst>(VL0);
@@ -4133,7 +4161,7 @@
       InstructionCost ScalarEltCost =
           TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
       if (NeedToShuffleReuses) {
-        ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+        CommonCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
       }
       InstructionCost ScalarCallCost = VecTy->getNumElements() * ScalarEltCost;
@@ -4145,7 +4173,7 @@
                         << " (" << VecCallCost << "-" << ScalarCallCost << ")"
                         << " for " << *CI << "\n");
-      return ReuseShuffleCost + VecCallCost - ScalarCallCost;
+      return CommonCost + VecCallCost - ScalarCallCost;
     }
     case Instruction::ShuffleVector: {
       assert(E->isAltShuffle() &&
@@ -4158,11 +4186,11 @@
       if (NeedToShuffleReuses) {
         for (unsigned Idx : E->ReuseShuffleIndices) {
           Instruction *I = cast<Instruction>(VL[Idx]);
-          ReuseShuffleCost -= TTI->getInstructionCost(I, CostKind);
+          CommonCost -= TTI->getInstructionCost(I, CostKind);
         }
         for (Value *V : VL) {
           Instruction *I = cast<Instruction>(V);
-          ReuseShuffleCost += TTI->getInstructionCost(I, CostKind);
+          CommonCost += TTI->getInstructionCost(I, CostKind);
         }
       }
       for (Value *V : VL) {
@@ -4196,8 +4224,8 @@
       }
       VecCost += TTI->getShuffleCost(TargetTransformInfo::SK_Select, VecTy,
                                      Mask, 0);
-      LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost));
-      return ReuseShuffleCost + VecCost - ScalarCost;
+      LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost));
+      return CommonCost + VecCost - ScalarCost;
     }
     default:
       llvm_unreachable("Unknown instruction");
     }
@@ -4929,25 +4957,7 @@
     addMask(NewMask);
   }
 
-  void addMask(ArrayRef<int> SubMask) {
-    if (SubMask.empty())
-      return;
-    if (Mask.empty()) {
-      Mask.append(SubMask.begin(), SubMask.end());
-      return;
-    }
-    SmallVector<int> NewMask(SubMask.size(), SubMask.size());
-    int TermValue = std::min(Mask.size(), SubMask.size());
-    for (int I = 0, E = SubMask.size(); I < E; ++I) {
-      if (SubMask[I] >= TermValue || SubMask[I] == UndefMaskElem ||
-          Mask[SubMask[I]] >= TermValue) {
-        NewMask[I] = UndefMaskElem;
-        continue;
-      }
-      NewMask[I] = Mask[SubMask[I]];
-    }
-    Mask.swap(NewMask);
-  }
+  void addMask(ArrayRef<int> SubMask) { ::addMask(Mask, SubMask); }
 
   Value *finalize(Value *V) {
     IsFinalized = true;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -slp-vectorizer -mtriple=x86_64-- -S | FileCheck %s --check-prefixes=CHECK,SSE
-; RUN: opt < %s -slp-vectorizer -mtriple=x86_64-- -mattr=avx512vl -S | FileCheck %s --check-prefixes=CHECK,AVX512
+; RUN: opt < %s -slp-vectorizer -mtriple=x86_64-- -S | FileCheck %s
+; RUN: opt < %s -slp-vectorizer -mtriple=x86_64-- -mattr=avx512vl -S | FileCheck %s
 
 declare void @use1(i1)
 
@@ -334,54 +334,31 @@
 }
 
 define i1 @logical_and_icmp_clamp_v8i32(<8 x i32> %x, <8 x i32> %y) {
-; SSE-LABEL: @logical_and_icmp_clamp_v8i32(
-; SSE-NEXT:    [[X0:%.*]] = extractelement <8 x i32> [[X:%.*]], i32 0
-; SSE-NEXT:    [[X1:%.*]] = extractelement <8 x i32> [[X]], i32 1
-; SSE-NEXT:    [[X2:%.*]] = extractelement <8 x i32> [[X]], i32 2
-; SSE-NEXT:    [[X3:%.*]] = extractelement <8 x i32> [[X]], i32 3
-; SSE-NEXT:    [[Y0:%.*]] = extractelement <8 x i32> [[Y:%.*]], i32 0
-; SSE-NEXT:    [[Y1:%.*]] = extractelement <8 x i32> [[Y]], i32 1
-; SSE-NEXT:    [[Y2:%.*]] = extractelement <8 x i32> [[Y]], i32 2
-; SSE-NEXT:    [[Y3:%.*]] = extractelement <8 x i32> [[Y]], i32 3
-; SSE-NEXT:    [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[X0]], i32 0
-; SSE-NEXT:    [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[X1]], i32 1
-; SSE-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[X2]], i32 2
-; SSE-NEXT:    [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[X3]], i32 3
-; SSE-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <8 x i32>
-; SSE-NEXT:    [[TMP5:%.*]] = insertelement <8 x i32> , i32 [[Y0]], i32 4
-; SSE-NEXT:    [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[Y1]], i32 5
-; SSE-NEXT:    [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[Y2]], i32 6
-; SSE-NEXT:    [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[Y3]], i32 7
-; SSE-NEXT:    [[TMP9:%.*]] = icmp slt <8 x i32> [[SHUFFLE]], [[TMP8]]
-; SSE-NEXT:    [[TMP10:%.*]] = freeze <8 x i1> [[TMP9]]
-; SSE-NEXT:    [[TMP11:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP10]])
-; SSE-NEXT:    ret i1 [[TMP11]]
-;
-; AVX512-LABEL: @logical_and_icmp_clamp_v8i32(
-; AVX512-NEXT:    [[X0:%.*]] = extractelement <8 x i32> [[X:%.*]], i32 0
-; AVX512-NEXT:    [[X1:%.*]] = extractelement <8 x i32> [[X]], i32 1
-; AVX512-NEXT:    [[X2:%.*]] = extractelement <8 x i32> [[X]], i32 2
-; AVX512-NEXT:    [[X3:%.*]] = extractelement <8 x i32> [[X]], i32 3
-; AVX512-NEXT:    [[Y0:%.*]] = extractelement <8 x i32> [[Y:%.*]], i32 0
-; AVX512-NEXT:    [[Y1:%.*]] = extractelement <8 x i32> [[Y]], i32 1
-; AVX512-NEXT:    [[Y2:%.*]] = extractelement <8 x i32> [[Y]], i32 2
-; AVX512-NEXT:    [[Y3:%.*]] = extractelement <8 x i32> [[Y]], i32 3
-; AVX512-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[X0]], i32 0
-; AVX512-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[X1]], i32 1
-; AVX512-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[X2]], i32 2
-; AVX512-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[X3]], i32 3
-; AVX512-NEXT:    [[TMP5:%.*]] = icmp slt <4 x i32> [[TMP4]],
-; AVX512-NEXT:    [[D0:%.*]] = icmp slt i32 [[X0]], [[Y0]]
-; AVX512-NEXT:    [[D1:%.*]] = icmp slt i32 [[X1]], [[Y1]]
-; AVX512-NEXT:    [[D2:%.*]] = icmp slt i32 [[X2]], [[Y2]]
-; AVX512-NEXT:    [[D3:%.*]] = icmp slt i32 [[X3]], [[Y3]]
-; AVX512-NEXT:    [[TMP6:%.*]] = freeze <4 x i1> [[TMP5]]
-; AVX512-NEXT:    [[TMP7:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP6]])
-; AVX512-NEXT:    [[S4:%.*]] = select i1 [[TMP7]], i1 [[D0]], i1 false
-; AVX512-NEXT:    [[S5:%.*]] = select i1 [[S4]], i1 [[D1]], i1 false
-; AVX512-NEXT:    [[S6:%.*]] = select i1 [[S5]], i1 [[D2]], i1 false
-; AVX512-NEXT:    [[S7:%.*]] = select i1 [[S6]], i1 [[D3]], i1 false
-; AVX512-NEXT:    ret i1 [[S7]]
+; CHECK-LABEL: @logical_and_icmp_clamp_v8i32(
+; CHECK-NEXT:    [[X0:%.*]] = extractelement <8 x i32> [[X:%.*]], i32 0
+; CHECK-NEXT:    [[X1:%.*]] = extractelement <8 x i32> [[X]], i32 1
+; CHECK-NEXT:    [[X2:%.*]] = extractelement <8 x i32> [[X]], i32 2
+; CHECK-NEXT:    [[X3:%.*]] = extractelement <8 x i32> [[X]], i32 3
+; CHECK-NEXT:    [[Y0:%.*]] = extractelement <8 x i32> [[Y:%.*]], i32 0
+; CHECK-NEXT:    [[Y1:%.*]] = extractelement <8 x i32> [[Y]], i32 1
+; CHECK-NEXT:    [[Y2:%.*]] = extractelement <8 x i32> [[Y]], i32 2
+; CHECK-NEXT:    [[Y3:%.*]] = extractelement <8 x i32> [[Y]], i32 3
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[X0]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[X1]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[X2]], i32 2
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[X3]], i32 3
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp slt <4 x i32> [[TMP4]],
+; CHECK-NEXT:    [[D0:%.*]] = icmp slt i32 [[X0]], [[Y0]]
+; CHECK-NEXT:    [[D1:%.*]] = icmp slt i32 [[X1]], [[Y1]]
+; CHECK-NEXT:    [[D2:%.*]] = icmp slt i32 [[X2]], [[Y2]]
+; CHECK-NEXT:    [[D3:%.*]] = icmp slt i32 [[X3]], [[Y3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = freeze <4 x i1> [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP6]])
+; CHECK-NEXT:    [[S4:%.*]] = select i1 [[TMP7]], i1 [[D0]], i1 false
+; CHECK-NEXT:    [[S5:%.*]] = select i1 [[S4]], i1 [[D1]], i1 false
+; CHECK-NEXT:    [[S6:%.*]] = select i1 [[S5]], i1 [[D2]], i1 false
+; CHECK-NEXT:    [[S7:%.*]] = select i1 [[S6]], i1 [[D3]], i1 false
+; CHECK-NEXT:    ret i1 [[S7]]
 ;
   %x0 = extractelement <8 x i32> %x, i32 0
   %x1 = extractelement <8 x i32> %x, i32 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/tiny-tree.ll b/llvm/test/Transforms/SLPVectorizer/X86/tiny-tree.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/tiny-tree.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/tiny-tree.ll
@@ -272,18 +272,21 @@
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[V1:%.*]], align 4
 ; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[V2:%.*]], align 4
 ; CHECK-NEXT:    [[PTR0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 0
+; CHECK-NEXT:    store i32 [[TMP1]], i32* [[PTR0]], align 16
 ; CHECK-NEXT:    [[PTR1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 1
+; CHECK-NEXT:    store i32 [[TMP2]], i32* [[PTR1]], align 4
 ; CHECK-NEXT:    [[PTR2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2
+; CHECK-NEXT:    store i32 [[TMP1]], i32* [[PTR2]], align 8
 ; CHECK-NEXT:    [[PTR3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3
+; CHECK-NEXT:    store i32 [[TMP2]], i32* [[PTR3]], align 4
 ; CHECK-NEXT:    [[PTR4:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4
+; CHECK-NEXT:    store i32 [[TMP1]], i32* [[PTR4]], align 16
 ; CHECK-NEXT:    [[PTR5:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 5
+; CHECK-NEXT:    store i32 [[TMP2]], i32* [[PTR5]], align 4
 ; CHECK-NEXT:    [[PTR6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 6
+; CHECK-NEXT:    store i32 [[TMP1]], i32* [[PTR6]], align 8
 ; CHECK-NEXT:    [[PTR7:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 7
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> poison, i32 [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[TMP2]], i32 1
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <8 x i32>
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32* [[PTR0]] to <8 x i32>*
-; CHECK-NEXT:    store <8 x i32> [[SHUFFLE]], <8 x i32>* [[TMP5]], align 16
+; CHECK-NEXT:    store i32 [[TMP2]], i32* [[PTR7]], align 4
 ; CHECK-NEXT:    ret void
 ;
   %1 = load i32, i32* %v1, align 4