Index: lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -406,6 +406,10 @@
   /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
   unsigned canMapToVector(Type *T, const DataLayout &DL) const;
 
+  /// Try to convert instructions with extractelement operands into a vector
+  /// form with a single extractelement instruction.
+  bool tryToWidenExtractElementInsts(ArrayRef<Value *> ExtractInsts);
+
   /// \returns True if the VectorizableTree is both tiny and not fully
   /// vectorizable. We do not vectorize such trees.
   bool isTreeTinyAndNotFullyVectorizable();
@@ -413,6 +417,11 @@
 private:
   struct TreeEntry;
 
+  /// Calculates the cost of transforming the \p VL instructions from scalar
+  /// to vector form.
+  Optional<int> getCost(unsigned Opcode, ArrayRef<Value *> VL, Type *ScalarTy,
+                        Type *VecTy) const;
+
   /// \returns the cost of the vectorizable entry.
   int getEntryCost(TreeEntry *E);
@@ -1584,6 +1593,108 @@
   return N;
 }
 
+bool BoUpSLP::tryToWidenExtractElementInsts(ArrayRef<Value *> ExtractInsts) {
+  bool Changed = false;
+  // Store the extractelement instruction + its tree height.
+  SmallVector<std::pair<Value *, unsigned>, 4> Insts;
+  Insts.reserve(ExtractInsts.size());
+  for (auto &V : ExtractInsts)
+    Insts.emplace_back(V, 0);
+  for (unsigned Idx = 0, E = Insts.size(); Idx < E; ++Idx) {
+    auto *EE = dyn_cast<ExtractElementInst>(Insts[Idx].first);
+    // Skip analysis of already deleted extractelements or instruction trees
+    // with height >= RecursionMaxDepth.
+    if (!EE || Insts[Idx].second == RecursionMaxDepth)
+      continue;
+    unsigned NE = EE->getVectorOperandType()->getNumElements();
+    auto *EIdx = EE->getIndexOperand();
+    for (auto *U : EE->users()) {
+      auto *I = dyn_cast<Instruction>(U);
+      // Check if the user instruction is vectorizable.
+      if (!I || !isValidElementType(I->getType()) || I->mayHaveSideEffects() ||
+          EphValues.count(I) > 0)
+        continue;
+      DEBUG(dbgs() << "SLP: trying to widen instruction " << *I << "\n");
+      Optional<int> Cost = getCost(I->getOpcode(), I, I->getType(),
+                                   VectorType::get(I->getType(), NE));
+      if (!Cost)
+        continue;
+      // Check that all operands of the user instruction are extractelements
+      // from vectors of the same size and from the same lane.
+      if (!std::all_of(I->op_begin(), I->op_end(), [NE, EIdx](const Value *V) {
+            auto *EEI = dyn_cast<ExtractElementInst>(V);
+            return EEI &&
+                   EEI->getVectorOperandType()->getNumElements() == NE &&
+                   EEI->getIndexOperand() == EIdx;
+          }))
+        continue;
+      int EIdxVal = -1;
+      if (auto *EIdxC = dyn_cast<ConstantInt>(EIdx))
+        if (EIdxC->getValue().isNonNegative())
+          EIdxVal = EIdxC->getZExtValue();
+      // Estimate the scalar cost of the instructions to be transformed into
+      // vector form.
+      int ScalarCost = 0;
+      DenseSet<ExtractElementInst *> EEWithCost;
+      EEWithCost.reserve(I->getNumOperands());
+      for (auto *Op : I->operand_values()) {
+        auto *EEOp = cast<ExtractElementInst>(Op);
+        const Instruction *UserLast = EEOp->user_back();
+        // If the only user of the extractelement instruction is the
+        // to-be-vectorized user instruction, include the cost of this
+        // extractelement in the scalar cost (it can safely be removed during
+        // vectorization).
+        // EEWithCost is used to count the cost of the extractelement
+        // instruction only once.
+        if (EEWithCost.insert(EEOp).second &&
+            (EEOp->hasOneUse() ||
+             std::all_of(EEOp->user_begin(), EEOp->user_end(),
+                         [UserLast](User *U) { return U == UserLast; }))) {
+          ScalarCost +=
+              TTI->getVectorInstrCost(Instruction::ExtractElement,
+                                      EEOp->getVectorOperandType(), EIdxVal);
+        }
+      }
+      // Get the vector cost of the new vectorized code: vectorized user
+      // instruction + extractelement <NE x Ty> %widen.vect, i32 EIdx.
+      int VecCost =
+          TTI->getVectorInstrCost(Instruction::ExtractElement,
+                                  VectorType::get(I->getType(), NE), EIdxVal);
+      int ResCost = Cost.getValue() + VecCost - ScalarCost;
+      if (ResCost >= -SLPCostThreshold)
+        continue;
+      DEBUG(dbgs() << "SLP: Decided to widen cost=" << ResCost << "\n");
+      // Generate vector code instead of the scalar one.
+      Builder.SetInsertPoint(I->getParent(), ++I->getIterator());
+      Builder.SetCurrentDebugLocation(I->getDebugLoc());
+      // Create a vectorized version of the user instruction.
+      Instruction *NewI = I->clone();
+      NewI->mutateType(VectorType::get(I->getType(), NE));
+      for (unsigned OpIdx = 0, NumOps = NewI->getNumOperands(); OpIdx < NumOps;
+           ++OpIdx) {
+        auto *OpEE = cast<ExtractElementInst>(NewI->getOperand(OpIdx));
+        NewI->setOperand(OpIdx, OpEE->getVectorOperand());
+        // Remove the extractelement instruction only if this is its last use.
+        if (OpEE->hasOneUse()) {
+          OpEE->replaceAllUsesWith(UndefValue::get(OpEE->getType()));
+          eraseInstruction(OpEE);
+        }
+      }
+      Builder.Insert(NewI, "widen.vect");
+      // %widen.extract = extractelement <NE x Ty> %widen.vect, i32 EIdx
+      Value *NewEE = Builder.CreateExtractElement(NewI, EIdx, "widen.extract");
+      // Replace uses of the scalar instruction with the %widen.extract
+      // instruction.
+      I->replaceAllUsesWith(NewEE);
+      eraseInstruction(I);
+      // Add %widen.extract to the list of extractelement instructions for
+      // future analysis of a possibly vectorizable tree.
+      Insts.emplace_back(NewEE, Insts[Idx].second + 1);
+      E = Insts.size();
+      Changed = true;
+    }
+  }
+  return Changed;
+}
+
 bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, unsigned Opcode) const {
   assert(Opcode == Instruction::ExtractElement ||
          Opcode == Instruction::ExtractValue);
@@ -1627,8 +1738,175 @@
   return true;
 }
 
+Optional<int> BoUpSLP::getCost(unsigned Opcode, ArrayRef<Value *> VL,
+                               Type *ScalarTy, Type *VecTy) const {
+  assert(ScalarTy && VecTy &&
+         "both ScalarTy/VectorTy parameters must be specified.");
+  assert(Opcode && "Expected non-null opcode.");
+  auto *VL0 = cast<Instruction>(VL[0]);
+  int VecCost;
+  int ScalarCost;
+  switch (Opcode) {
+  case Instruction::ZExt:
+  case Instruction::SExt:
+  case Instruction::FPToUI:
+  case Instruction::FPToSI:
+  case Instruction::FPExt:
+  case Instruction::PtrToInt:
+  case Instruction::IntToPtr:
+  case Instruction::SIToFP:
+  case Instruction::UIToFP:
+  case Instruction::Trunc:
+  case Instruction::FPTrunc:
+  case Instruction::BitCast: {
+    Type *SrcTy = VL0->getOperand(0)->getType();
+    VecCost = TTI->getCastInstrCost(
+        Opcode, VecTy, VectorType::get(SrcTy, VecTy->getVectorNumElements()));
+
+    // Calculate the cost of this instruction.
+    ScalarCost = VL.size() * TTI->getCastInstrCost(Opcode, ScalarTy, SrcTy);
+    break;
+  }
+  case Instruction::FCmp:
+  case Instruction::ICmp:
+  case Instruction::Select: {
+    // Calculate the cost of this instruction.
+    VecCost = TTI->getCmpSelInstrCost(
+        Opcode, VecTy,
+        VectorType::get(Type::getInt1Ty(VL0->getContext()),
+                        VecTy->getVectorNumElements()));
+    ScalarCost =
+        VL.size() *
+        TTI->getCmpSelInstrCost(Opcode, ScalarTy,
+                                Type::getInt1Ty(VL0->getContext()));
+    break;
+  }
+  case Instruction::Add:
+  case Instruction::FAdd:
+  case Instruction::Sub:
+  case Instruction::FSub:
+  case Instruction::Mul:
+  case Instruction::FMul:
+  case Instruction::UDiv:
+  case Instruction::SDiv:
+  case Instruction::FDiv:
+  case Instruction::URem:
+  case Instruction::SRem:
+  case Instruction::FRem:
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor: {
+    // Certain instructions can be cheaper to vectorize if they have a
+    // constant second vector operand.
+    TargetTransformInfo::OperandValueKind Op1VK =
+        TargetTransformInfo::OK_AnyValue;
+    TargetTransformInfo::OperandValueKind Op2VK =
+        TargetTransformInfo::OK_UniformConstantValue;
+    TargetTransformInfo::OperandValueProperties Op1VP =
+        TargetTransformInfo::OP_None;
+    TargetTransformInfo::OperandValueProperties Op2VP =
+        TargetTransformInfo::OP_None;
+
+    // If all operands are exactly the same ConstantInt then set the
+    // operand kind to OK_UniformConstantValue.
+    // If instead not all operands are constants, then set the operand kind
+    // to OK_AnyValue. If all operands are constants but not the same,
+    // then set the operand kind to OK_NonUniformConstantValue.
+    ConstantInt *CInt = nullptr;
+    for (unsigned i = 0; i < VL.size(); ++i) {
+      const Instruction *I = cast<Instruction>(VL[i]);
+      if (!isa<ConstantInt>(I->getOperand(1))) {
+        Op2VK = TargetTransformInfo::OK_AnyValue;
+        break;
+      }
+      if (i == 0) {
+        CInt = cast<ConstantInt>(I->getOperand(1));
+        continue;
+      }
+      if (Op2VK == TargetTransformInfo::OK_UniformConstantValue &&
+          CInt != cast<ConstantInt>(I->getOperand(1)))
+        Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
+    }
+    // FIXME: Currently the cost model modification for division by a power
+    // of 2 is handled only for X86 and AArch64. Add support for other
+    // targets.
+    if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && CInt &&
+        CInt->getValue().isPowerOf2())
+      Op2VP = TargetTransformInfo::OP_PowerOf2;
+
+    VecCost =
+        TTI->getArithmeticInstrCost(Opcode, VecTy, Op1VK, Op2VK, Op1VP, Op2VP);
+    ScalarCost = VL.size() * TTI->getArithmeticInstrCost(
+                                 Opcode, ScalarTy, Op1VK, Op2VK, Op1VP, Op2VP);
+    break;
+  }
+  case Instruction::GetElementPtr: {
+    TargetTransformInfo::OperandValueKind Op1VK =
+        TargetTransformInfo::OK_AnyValue;
+    TargetTransformInfo::OperandValueKind Op2VK =
+        TargetTransformInfo::OK_UniformConstantValue;
+
+    VecCost =
+        TTI->getArithmeticInstrCost(Instruction::Add, VecTy, Op1VK, Op2VK);
+    ScalarCost = VL.size() * TTI->getArithmeticInstrCost(
+                                 Instruction::Add, ScalarTy, Op1VK, Op2VK);
+    break;
+  }
+  case Instruction::Load: {
+    // Cost of wide load - cost of scalar loads.
+    unsigned Alignment = cast<LoadInst>(VL0)->getAlignment();
+    VecCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, Alignment,
+                                   /*AddressSpace=*/0);
+    ScalarCost =
+        VL.size() * TTI->getMemoryOpCost(Instruction::Load, ScalarTy,
+                                         Alignment, /*AddressSpace=*/0);
+    break;
+  }
+  case Instruction::Store: {
+    // We know that we can merge the stores. Calculate the cost.
+    auto *SI = cast<StoreInst>(VL0);
+    unsigned Alignment = SI->getAlignment();
+    VecCost = TTI->getMemoryOpCost(Instruction::Store, VecTy, Alignment,
+                                   /*AddressSpace=*/0);
+    ScalarCost =
+        VL.size() * TTI->getMemoryOpCost(Instruction::Store, ScalarTy,
+                                         Alignment, /*AddressSpace=*/0);
+    break;
+  }
+  case Instruction::Call: {
+    CallInst *CI = cast<CallInst>(VL0);
+    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+
+    FastMathFlags FMF;
+    if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
+      FMF = FPMO->getFastMathFlags();
+
+    // Calculate the cost of the scalar and vector calls.
+    SmallVector<Type *, 4> VecTys;
+    for (unsigned op = 0, opc = CI->getNumArgOperands(); op != opc; ++op) {
+      VecTys.push_back(VectorType::get(CI->getArgOperand(op)->getType(),
+                                       VecTy->getVectorNumElements()));
+    }
+
+    VecCost = TTI->getIntrinsicInstrCost(ID, VecTy, VecTys, FMF);
+    ScalarCost =
+        VL.size() * TTI->getIntrinsicInstrCost(
+                        ID, ScalarTy, CI->getFunctionType()->params(), FMF);
+    DEBUG(dbgs() << "SLP: Call cost " << VecCost - ScalarCost << " ("
+                 << VecCost << "-" << ScalarCost << ")"
+                 << " for " << *CI << "\n");
+
+    break;
+  }
+  default:
+    return None;
+  }
+  return VecCost - ScalarCost;
+}
+
 int BoUpSLP::getEntryCost(TreeEntry *E) {
-  ArrayRef<Value*> VL = E->Scalars;
+  ArrayRef<Value *> VL = E->Scalars;
   Type *ScalarTy = VL[0]->getType();
 
   if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
@@ -1651,218 +1929,67 @@
   }
   unsigned Opcode = getSameOpcode(VL);
   assert(Opcode && allSameType(VL) && allSameBlock(VL) && "Invalid VL");
-  Instruction *VL0 = cast<Instruction>(VL[0]);
   switch (Opcode) {
-  case Instruction::PHI: {
-    return 0;
-  }
-  case Instruction::ExtractValue:
-  case Instruction::ExtractElement: {
-    if (canReuseExtract(VL, Opcode)) {
-      int DeadCost = 0;
-      for (unsigned i = 0, e = VL.size(); i < e; ++i) {
-        Instruction *E = cast<Instruction>(VL[i]);
-        // If all users are going to be vectorized, instruction can be
-        // considered as dead.
-        // The same, if have only one user, it will be vectorized for sure.
-        if (E->hasOneUse() ||
-            std::all_of(E->user_begin(), E->user_end(), [this](User *U) {
-              return ScalarToTreeEntry.count(U) > 0;
-            }))
-          // Take credit for instruction that will become dead.
-          DeadCost +=
-              TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i);
-      }
-      return -DeadCost;
-    }
-    return getGatherCost(VecTy);
-  }
-  case Instruction::ZExt:
-  case Instruction::SExt:
-  case Instruction::FPToUI:
-  case Instruction::FPToSI:
-  case Instruction::FPExt:
-  case Instruction::PtrToInt:
-  case Instruction::IntToPtr:
-  case Instruction::SIToFP:
-  case Instruction::UIToFP:
-  case Instruction::Trunc:
-  case Instruction::FPTrunc:
-  case Instruction::BitCast: {
-    Type *SrcTy = VL0->getOperand(0)->getType();
-
-    // Calculate the cost of this instruction.
-    int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(),
-                                                       VL0->getType(), SrcTy);
-
-    VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size());
-    int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy);
-    return VecCost - ScalarCost;
-  }
-  case Instruction::FCmp:
-  case Instruction::ICmp:
-  case Instruction::Select: {
-    // Calculate the cost of this instruction.
-    VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size());
-    int ScalarCost = VecTy->getNumElements() *
-        TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty());
-    int VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy);
-    return VecCost - ScalarCost;
-  }
-  case Instruction::Add:
-  case Instruction::FAdd:
-  case Instruction::Sub:
-  case Instruction::FSub:
-  case Instruction::Mul:
-  case Instruction::FMul:
-  case Instruction::UDiv:
-  case Instruction::SDiv:
-  case Instruction::FDiv:
-  case Instruction::URem:
-  case Instruction::SRem:
-  case Instruction::FRem:
-  case Instruction::Shl:
-  case Instruction::LShr:
-  case Instruction::AShr:
-  case Instruction::And:
-  case Instruction::Or:
-  case Instruction::Xor: {
-    // Certain instructions can be cheaper to vectorize if they have a
-    // constant second vector operand.
-    TargetTransformInfo::OperandValueKind Op1VK =
-        TargetTransformInfo::OK_AnyValue;
-    TargetTransformInfo::OperandValueKind Op2VK =
-        TargetTransformInfo::OK_UniformConstantValue;
-    TargetTransformInfo::OperandValueProperties Op1VP =
-        TargetTransformInfo::OP_None;
-    TargetTransformInfo::OperandValueProperties Op2VP =
-        TargetTransformInfo::OP_None;
-
-    // If all operands are exactly the same ConstantInt then set the
-    // operand kind to OK_UniformConstantValue.
-    // If instead not all operands are constants, then set the operand kind
-    // to OK_AnyValue. If all operands are constants but not the same,
-    // then set the operand kind to OK_NonUniformConstantValue.
-    ConstantInt *CInt = nullptr;
-    for (unsigned i = 0; i < VL.size(); ++i) {
-      const Instruction *I = cast<Instruction>(VL[i]);
-      if (!isa<ConstantInt>(I->getOperand(1))) {
-        Op2VK = TargetTransformInfo::OK_AnyValue;
-        break;
-      }
-      if (i == 0) {
-        CInt = cast<ConstantInt>(I->getOperand(1));
-        continue;
-      }
-      if (Op2VK == TargetTransformInfo::OK_UniformConstantValue &&
-          CInt != cast<ConstantInt>(I->getOperand(1)))
-        Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
-    }
-    // FIXME: Currently cost of model modification for division by power of
-    // 2 is handled for X86 and AArch64. Add support for other targets.
-    if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && CInt &&
-        CInt->getValue().isPowerOf2())
-      Op2VP = TargetTransformInfo::OP_PowerOf2;
-
-    int ScalarCost = VecTy->getNumElements() *
-                     TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK,
-                                                 Op2VK, Op1VP, Op2VP);
-    int VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy, Op1VK, Op2VK,
-                                              Op1VP, Op2VP);
-    return VecCost - ScalarCost;
-  }
-  case Instruction::GetElementPtr: {
-    TargetTransformInfo::OperandValueKind Op1VK =
-        TargetTransformInfo::OK_AnyValue;
-    TargetTransformInfo::OperandValueKind Op2VK =
-        TargetTransformInfo::OK_UniformConstantValue;
-
-    int ScalarCost =
-        VecTy->getNumElements() *
-        TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, Op1VK, Op2VK);
-    int VecCost =
-        TTI->getArithmeticInstrCost(Instruction::Add, VecTy, Op1VK, Op2VK);
-
-    return VecCost - ScalarCost;
-  }
-  case Instruction::Load: {
-    // Cost of wide load - cost of scalar loads.
-    unsigned alignment = dyn_cast<LoadInst>(VL0)->getAlignment();
-    int ScalarLdCost = VecTy->getNumElements() *
-          TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0);
-    int VecLdCost = TTI->getMemoryOpCost(Instruction::Load,
-                                         VecTy, alignment, 0);
-    if (E->NeedToShuffle) {
-      VecLdCost += TTI->getShuffleCost(
-          TargetTransformInfo::SK_PermuteSingleSrc, VecTy, 0);
+  case Instruction::PHI:
+    return 0;
+  case Instruction::ExtractValue:
+  case Instruction::ExtractElement:
+    if (canReuseExtract(VL, Opcode)) {
+      int DeadCost = 0;
+      for (unsigned i = 0, e = VL.size(); i < e; ++i) {
+        Instruction *E = cast<Instruction>(VL[i]);
+        // If all users are going to be vectorized, instruction can be
+        // considered as dead.
+        // The same, if have only one user, it will be vectorized for sure.
+        if (E->hasOneUse() ||
+            std::all_of(E->user_begin(), E->user_end(), [this](User *U) {
+              return ScalarToTreeEntry.count(U) > 0;
+            }))
+          // Take credit for instruction that will become dead.
+          DeadCost +=
+              TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i);
       }
-      return VecLdCost - ScalarLdCost;
+      return -DeadCost;
     }
-  case Instruction::Store: {
-    // We know that we can merge the stores. Calculate the cost.
-    unsigned alignment = dyn_cast<StoreInst>(VL0)->getAlignment();
-    int ScalarStCost = VecTy->getNumElements() *
-          TTI->getMemoryOpCost(Instruction::Store, ScalarTy, alignment, 0);
-    int VecStCost = TTI->getMemoryOpCost(Instruction::Store,
-                                         VecTy, alignment, 0);
-    return VecStCost - ScalarStCost;
-  }
-  case Instruction::Call: {
-    CallInst *CI = cast<CallInst>(VL0);
-    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
-
-    // Calculate the cost of the scalar and vector calls.
-    SmallVector<Type*, 4> ScalarTys, VecTys;
-    for (unsigned op = 0, opc = CI->getNumArgOperands(); op!= opc; ++op) {
-      ScalarTys.push_back(CI->getArgOperand(op)->getType());
-      VecTys.push_back(VectorType::get(CI->getArgOperand(op)->getType(),
-                                       VecTy->getNumElements()));
-    }
-
-    FastMathFlags FMF;
-    if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
-      FMF = FPMO->getFastMathFlags();
-
-    int ScalarCallCost = VecTy->getNumElements() *
-        TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys, FMF);
-
-    int VecCallCost = TTI->getIntrinsicInstrCost(ID, VecTy, VecTys, FMF);
-
-    DEBUG(dbgs() << "SLP: Call cost "<< VecCallCost - ScalarCallCost
-          << " (" << VecCallCost << "-" << ScalarCallCost << ")"
-          << " for " << *CI << "\n");
-
-    return VecCallCost - ScalarCallCost;
+    return getGatherCost(VecTy);
+  case Instruction::ShuffleVector: {
+    TargetTransformInfo::OperandValueKind Op1VK =
+        TargetTransformInfo::OK_AnyValue;
+    TargetTransformInfo::OperandValueKind Op2VK =
+        TargetTransformInfo::OK_AnyValue;
+    int ScalarCost = 0;
+    int VecCost = 0;
+    for (Value *i : VL) {
+      Instruction *I = cast<Instruction>(i);
+      if (!I)
+        break;
+      ScalarCost +=
+          TTI->getArithmeticInstrCost(I->getOpcode(), ScalarTy, Op1VK, Op2VK);
     }
-  case Instruction::ShuffleVector: {
-    TargetTransformInfo::OperandValueKind Op1VK =
-        TargetTransformInfo::OK_AnyValue;
-    TargetTransformInfo::OperandValueKind Op2VK =
-        TargetTransformInfo::OK_AnyValue;
-    int ScalarCost = 0;
-    int VecCost = 0;
-    for (Value *i : VL) {
-      Instruction *I = cast<Instruction>(i);
-      if (!I)
-        break;
-      ScalarCost +=
-          TTI->getArithmeticInstrCost(I->getOpcode(), ScalarTy, Op1VK, Op2VK);
-    }
-    // VecCost is equal to sum of the cost of creating 2 vectors
-    // and the cost of creating shuffle.
-    Instruction *I0 = cast<Instruction>(VL[0]);
-    VecCost =
-        TTI->getArithmeticInstrCost(I0->getOpcode(), VecTy, Op1VK, Op2VK);
-    Instruction *I1 = cast<Instruction>(VL[1]);
-    VecCost +=
-        TTI->getArithmeticInstrCost(I1->getOpcode(), VecTy, Op1VK, Op2VK);
-    VecCost +=
-        TTI->getShuffleCost(TargetTransformInfo::SK_Alternate, VecTy, 0);
-    return VecCost - ScalarCost;
+    // VecCost is equal to sum of the cost of creating 2 vectors
+    // and the cost of creating shuffle.
+    Instruction *I0 = cast<Instruction>(VL[0]);
+    VecCost = TTI->getArithmeticInstrCost(I0->getOpcode(), VecTy, Op1VK, Op2VK);
+    Instruction *I1 = cast<Instruction>(VL[1]);
+    VecCost +=
+        TTI->getArithmeticInstrCost(I1->getOpcode(), VecTy, Op1VK, Op2VK);
+    VecCost += TTI->getShuffleCost(TargetTransformInfo::SK_Alternate, VecTy, 0);
+    return VecCost - ScalarCost;
+  }
+  case Instruction::Load: {
+    int Cost = getCost(Opcode, VL, ScalarTy, VecTy).getValue();
+    if (E->NeedToShuffle) {
+      Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
+                                  VecTy, 0);
     }
-  default:
-    llvm_unreachable("Unknown instruction");
+    return Cost;
+  }
+  default:
+    if (Optional<int> Cost = getCost(Opcode, VL, ScalarTy, VecTy))
+      return Cost.getValue();
+    break;
   }
+  llvm_unreachable("Unknown instruction");
 }
 
 bool BoUpSLP::isFullyVectorizableTinyTree() {
@@ -4871,6 +4998,7 @@
 
   VisitedInstrs.clear();
 
+  SmallVector<Value *, 4> ExtractInsts;
   for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; it++) {
     // We may go through BB multiple times so skip the one we have checked.
     if (!VisitedInstrs.insert(&*it).second)
@@ -4986,8 +5114,15 @@
       }
     }
+
+    if (auto *EE = dyn_cast<ExtractElementInst>(&*it)) {
+      ExtractInsts.push_back(EE);
+      continue;
+    }
   }
 
+  Changed |= R.tryToWidenExtractElementInsts(ExtractInsts);
+
   return Changed;
 }
Index: test/Transforms/SLPVectorizer/X86/vector.ll
===================================================================
--- test/Transforms/SLPVectorizer/X86/vector.ll
+++ test/Transforms/SLPVectorizer/X86/vector.ll
@@ -17,17 +17,15 @@
 define i1 @cmpv2f32(<2 x i32> %x, <2 x i32> %y) {
 ; CHECK-LABEL: @cmpv2f32(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[X0:%.*]] = extractelement <2 x i32> [[X:%.*]], i32 0
-; CHECK-NEXT:    [[Y0:%.*]] = extractelement <2 x i32> [[Y:%.*]], i32 0
-; CHECK-NEXT:    [[CMP0:%.*]] = icmp eq i32 [[X0]], [[Y0]]
-; CHECK-NEXT:    br i1 [[CMP0]], label [[IF:%.*]], label [[ENDIF:%.*]]
+; CHECK-NEXT:    [[WIDEN_VECT1:%.*]] = icmp eq <2 x i32> [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[WIDEN_EXTRACT2:%.*]] = extractelement <2 x i1> [[WIDEN_VECT1]], i32 0
+; CHECK-NEXT:    br i1 [[WIDEN_EXTRACT2]], label [[IF:%.*]], label [[ENDIF:%.*]]
 ; CHECK:       if:
-; CHECK-NEXT:    [[X1:%.*]] = extractelement <2 x i32> [[X]], i32 1
-; CHECK-NEXT:    [[Y1:%.*]] = extractelement <2 x i32> [[Y]], i32 1
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i32 [[X1]], [[Y1]]
+; CHECK-NEXT:    [[WIDEN_VECT:%.*]] = icmp eq <2 x i32> [[X]], [[Y]]
+; CHECK-NEXT:    [[WIDEN_EXTRACT:%.*]] = extractelement <2 x i1> [[WIDEN_VECT]], i32 1
 ; CHECK-NEXT:    br label [[ENDIF]]
 ; CHECK:       endif:
-; CHECK-NEXT:    [[AND_OF_CMPS:%.*]] = phi i1 [ false, [[ENTRY:%.*]] ], [ [[CMP1]], [[IF]] ]
+; CHECK-NEXT:    [[AND_OF_CMPS:%.*]] = phi i1 [ false, [[ENTRY:%.*]] ], [ [[WIDEN_EXTRACT]], [[IF]] ]
 ; CHECK-NEXT:    ret i1 [[AND_OF_CMPS]]
 ;
 entry:
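
Note: as a reader aid (not part of the patch), the intended rewrite can be
sketched on plain IR. The function names below are hypothetical, and whether
the widening actually fires depends on the target cost model and the
-slp-threshold setting; the vector.ll test above shows the checked-in form.

; Before widening: a scalar compare built from two same-lane extracts.
define i1 @widen_sketch(<2 x i32> %x, <2 x i32> %y) {
  %x0 = extractelement <2 x i32> %x, i32 0
  %y0 = extractelement <2 x i32> %y, i32 0
  %cmp0 = icmp eq i32 %x0, %y0
  ret i1 %cmp0
}

; After widening: one vector compare plus a single extract of the same lane.
define i1 @widen_sketch_expected(<2 x i32> %x, <2 x i32> %y) {
  %widen.vect = icmp eq <2 x i32> %x, %y
  %widen.extract = extractelement <2 x i1> %widen.vect, i32 0
  ret i1 %widen.extract
}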