diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -706,6 +706,14 @@ unsigned AddrSpace = 0, Instruction *I = nullptr) const; + /// Checks if the specified operation with the given vector type is not going + /// to be scalarized. + bool isLegalVectorOp(unsigned, VectorType *) const; + + /// Checks if the specified operation(intrinsic) with the given vector type is + /// not going to be scalarized. + bool isLegalVectorIntrinsic(Intrinsic::ID, VectorType *) const; + /// Return true if LSR cost of C1 is lower than C2. bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2) const; @@ -1757,6 +1765,10 @@ int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace, Instruction *I) = 0; + virtual bool isLegalVectorOp(unsigned, VectorType *) const = 0; + + virtual bool isLegalVectorIntrinsic(Intrinsic::ID, VectorType *) const = 0; + virtual bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2) = 0; virtual bool isNumRegsMajorCostOfLSR() = 0; @@ -2198,6 +2210,15 @@ return Impl.isLegalAddressingMode(Ty, BaseGV, BaseOffset, HasBaseReg, Scale, AddrSpace, I); } + bool isLegalVectorOp(unsigned Opcode, VectorType *VecTy) const override { + return Impl.isLegalVectorOp(Opcode, VecTy); + } + + bool isLegalVectorIntrinsic(Intrinsic::ID Id, + VectorType *VecTy) const override { + return Impl.isLegalVectorIntrinsic(Id, VecTy); + } + bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2) override { return Impl.isLSRCostLess(C1, C2); diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -298,6 +298,12 @@ bool isLegalMaskedExpandLoad(Type *DataType) const { return false; } + bool isLegalVectorOp(unsigned, VectorType *) const { return true; } + + bool isLegalVectorIntrinsic(Intrinsic::ID, VectorType *) const { + return true; + } + bool enableOrderedReductions() const { return false; } bool hasDivRemOp(Type *DataType, bool IsSigned) const { return false; } diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -342,6 +342,108 @@ return getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace, I); } + bool isLegalVectorOp(unsigned Opcode, VectorType *VecTy) const { + int ISD = getTLI()->InstructionOpcodeToISD(Opcode); + EVT VT = getTLI()->getValueType(DL, VecTy); + TargetLoweringBase::LegalizeKind LK = + getTLI()->getTypeConversion(VecTy->getContext(), VT); + return LK.first != TargetLoweringBase::TypeScalarizeVector && + getTLI()->getOperationAction(ISD, LK.second) != + TargetLowering::Expand; + } + + static unsigned intrinsicIdToISD(Intrinsic::ID IID) { + switch (IID) { + default: + break; + case Intrinsic::sqrt: + return ISD::FSQRT; + case Intrinsic::sin: + return ISD::FSIN; + case Intrinsic::cos: + return ISD::FCOS; + case Intrinsic::exp: + return ISD::FEXP; + case Intrinsic::exp2: + return ISD::FEXP2; + case Intrinsic::log: + return ISD::FLOG; + case Intrinsic::log10: + return ISD::FLOG10; + case Intrinsic::log2: + return ISD::FLOG2; + case Intrinsic::fabs: + return ISD::FABS; + case Intrinsic::canonicalize: + return ISD::FCANONICALIZE; + case Intrinsic::minnum: + return ISD::FMINNUM; + case Intrinsic::maxnum: + return ISD::FMAXNUM; + case Intrinsic::minimum: + return ISD::FMINIMUM; + case Intrinsic::maximum: + return ISD::FMAXIMUM; + case Intrinsic::copysign: + return ISD::FCOPYSIGN; + case Intrinsic::floor: + return ISD::FFLOOR; + case Intrinsic::ceil: + return ISD::FCEIL; + case Intrinsic::trunc: + return ISD::FTRUNC; + case Intrinsic::nearbyint: + return ISD::FNEARBYINT; + case Intrinsic::rint: + return ISD::FRINT; + case Intrinsic::round: + return ISD::FROUND; + case Intrinsic::roundeven: + return ISD::FROUNDEVEN; + case Intrinsic::pow: + return ISD::FPOW; + case Intrinsic::fma: + return ISD::FMA; + case Intrinsic::fmuladd: + return ISD::FMA; + case Intrinsic::experimental_constrained_fmuladd: + return ISD::STRICT_FMA; + case Intrinsic::ctpop: + return ISD::CTPOP; + case Intrinsic::ctlz: + return ISD::CTLZ; + case Intrinsic::cttz: + return ISD::CTTZ; + case Intrinsic::bswap: + return ISD::BSWAP; + case Intrinsic::bitreverse: + return ISD::BITREVERSE; + } + return ISD::DELETED_NODE; + } + + bool isLegalVectorIntrinsic(Intrinsic::ID Id, VectorType *VecTy) const { + unsigned ISD = intrinsicIdToISD(Id); + switch (intrinsicIdToISD(Id)) { + default: + return true; + case ISD::FEXP: + case ISD::FEXP2: + case ISD::FLOG: + case ISD::FLOG2: + case ISD::FLOG10: + case ISD::FSIN: + case ISD::FCOS: + case ISD::FSQRT: + break; + } + + EVT VT = getTLI()->getValueType(DL, VecTy); + return getTLI()->getTypeAction(VecTy->getContext(), VT) != + TargetLoweringBase::TypeScalarizeVector && + getTLI()->getOperationAction(ISD, VT) != TargetLowering::Expand; + } + unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, Type *ScalarValTy) const { auto &&IsSupportedByTarget = [this, ScalarMemTy, ScalarValTy](unsigned VF) { @@ -1727,404 +1829,320 @@ // Library call cost - other than size, make it expensive. unsigned SingleCallCost = CostKind == TTI::TCK_CodeSize ? 1 : 10; - unsigned ISD = 0; - switch (IID) { - default: { - // Scalable vectors cannot be scalarized, so return Invalid. - if (isa(RetTy) || any_of(Tys, [](const Type *Ty) { - return isa(Ty); - })) - return InstructionCost::getInvalid(); - - // Assume that we need to scalarize this intrinsic. - InstructionCost ScalarizationCost = - SkipScalarizationCost ? ScalarizationCostPassed : 0; - unsigned ScalarCalls = 1; - Type *ScalarRetTy = RetTy; - if (auto *RetVTy = dyn_cast(RetTy)) { - if (!SkipScalarizationCost) - ScalarizationCost = getScalarizationOverhead( - RetVTy, /*Insert*/ true, /*Extract*/ false, CostKind); - ScalarCalls = std::max(ScalarCalls, - cast(RetVTy)->getNumElements()); - ScalarRetTy = RetTy->getScalarType(); - } - SmallVector ScalarTys; - for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) { - Type *Ty = Tys[i]; - if (auto *VTy = dyn_cast(Ty)) { + // Look for intrinsics that can be lowered directly or turned into a + // scalar intrinsic call. + unsigned ISD = intrinsicIdToISD(IID); + if (ISD == ISD::DELETED_NODE) { + switch (IID) { + default: { + // Scalable vectors cannot be scalarized, so return Invalid. + if (isa(RetTy) || any_of(Tys, [](const Type *Ty) { + return isa(Ty); + })) + return InstructionCost::getInvalid(); + + // Assume that we need to scalarize this intrinsic. + InstructionCost ScalarizationCost = + SkipScalarizationCost ? ScalarizationCostPassed : 0; + unsigned ScalarCalls = 1; + Type *ScalarRetTy = RetTy; + if (auto *RetVTy = dyn_cast(RetTy)) { if (!SkipScalarizationCost) - ScalarizationCost += getScalarizationOverhead( - VTy, /*Insert*/ false, /*Extract*/ true, CostKind); - ScalarCalls = std::max(ScalarCalls, - cast(VTy)->getNumElements()); - Ty = Ty->getScalarType(); + ScalarizationCost = getScalarizationOverhead( + RetVTy, /*Insert*/ true, /*Extract*/ false, CostKind); + ScalarCalls = std::max( + ScalarCalls, cast(RetVTy)->getNumElements()); + ScalarRetTy = RetTy->getScalarType(); } - ScalarTys.push_back(Ty); - } - if (ScalarCalls == 1) - return 1; // Return cost of a scalar intrinsic. Assume it to be cheap. - - IntrinsicCostAttributes ScalarAttrs(IID, ScalarRetTy, ScalarTys, FMF); - InstructionCost ScalarCost = - thisT()->getIntrinsicInstrCost(ScalarAttrs, CostKind); - - return ScalarCalls * ScalarCost + ScalarizationCost; - } - // Look for intrinsics that can be lowered directly or turned into a scalar - // intrinsic call. - case Intrinsic::sqrt: - ISD = ISD::FSQRT; - break; - case Intrinsic::sin: - ISD = ISD::FSIN; - break; - case Intrinsic::cos: - ISD = ISD::FCOS; - break; - case Intrinsic::exp: - ISD = ISD::FEXP; - break; - case Intrinsic::exp2: - ISD = ISD::FEXP2; - break; - case Intrinsic::log: - ISD = ISD::FLOG; - break; - case Intrinsic::log10: - ISD = ISD::FLOG10; - break; - case Intrinsic::log2: - ISD = ISD::FLOG2; - break; - case Intrinsic::fabs: - ISD = ISD::FABS; - break; - case Intrinsic::canonicalize: - ISD = ISD::FCANONICALIZE; - break; - case Intrinsic::minnum: - ISD = ISD::FMINNUM; - break; - case Intrinsic::maxnum: - ISD = ISD::FMAXNUM; - break; - case Intrinsic::minimum: - ISD = ISD::FMINIMUM; - break; - case Intrinsic::maximum: - ISD = ISD::FMAXIMUM; - break; - case Intrinsic::copysign: - ISD = ISD::FCOPYSIGN; - break; - case Intrinsic::floor: - ISD = ISD::FFLOOR; - break; - case Intrinsic::ceil: - ISD = ISD::FCEIL; - break; - case Intrinsic::trunc: - ISD = ISD::FTRUNC; - break; - case Intrinsic::nearbyint: - ISD = ISD::FNEARBYINT; - break; - case Intrinsic::rint: - ISD = ISD::FRINT; - break; - case Intrinsic::round: - ISD = ISD::FROUND; - break; - case Intrinsic::roundeven: - ISD = ISD::FROUNDEVEN; - break; - case Intrinsic::pow: - ISD = ISD::FPOW; - break; - case Intrinsic::fma: - ISD = ISD::FMA; - break; - case Intrinsic::fmuladd: - ISD = ISD::FMA; - break; - case Intrinsic::experimental_constrained_fmuladd: - ISD = ISD::STRICT_FMA; - break; - // FIXME: We should return 0 whenever getIntrinsicCost == TCC_Free. - case Intrinsic::lifetime_start: - case Intrinsic::lifetime_end: - case Intrinsic::sideeffect: - case Intrinsic::pseudoprobe: - case Intrinsic::arithmetic_fence: - return 0; - case Intrinsic::masked_store: { - Type *Ty = Tys[0]; - Align TyAlign = thisT()->DL.getABITypeAlign(Ty); - return thisT()->getMaskedMemoryOpCost(Instruction::Store, Ty, TyAlign, 0, - CostKind); - } - case Intrinsic::masked_load: { - Type *Ty = RetTy; - Align TyAlign = thisT()->DL.getABITypeAlign(Ty); - return thisT()->getMaskedMemoryOpCost(Instruction::Load, Ty, TyAlign, 0, - CostKind); - } - case Intrinsic::vector_reduce_add: - return thisT()->getArithmeticReductionCost(Instruction::Add, VecOpTy, - std::nullopt, CostKind); - case Intrinsic::vector_reduce_mul: - return thisT()->getArithmeticReductionCost(Instruction::Mul, VecOpTy, - std::nullopt, CostKind); - case Intrinsic::vector_reduce_and: - return thisT()->getArithmeticReductionCost(Instruction::And, VecOpTy, - std::nullopt, CostKind); - case Intrinsic::vector_reduce_or: - return thisT()->getArithmeticReductionCost(Instruction::Or, VecOpTy, - std::nullopt, CostKind); - case Intrinsic::vector_reduce_xor: - return thisT()->getArithmeticReductionCost(Instruction::Xor, VecOpTy, - std::nullopt, CostKind); - case Intrinsic::vector_reduce_fadd: - return thisT()->getArithmeticReductionCost(Instruction::FAdd, VecOpTy, - FMF, CostKind); - case Intrinsic::vector_reduce_fmul: - return thisT()->getArithmeticReductionCost(Instruction::FMul, VecOpTy, - FMF, CostKind); - case Intrinsic::vector_reduce_smax: - return thisT()->getMinMaxReductionCost(Intrinsic::smax, VecOpTy, - ICA.getFlags(), CostKind); - case Intrinsic::vector_reduce_smin: - return thisT()->getMinMaxReductionCost(Intrinsic::smin, VecOpTy, - ICA.getFlags(), CostKind); - case Intrinsic::vector_reduce_umax: - return thisT()->getMinMaxReductionCost(Intrinsic::umax, VecOpTy, - ICA.getFlags(), CostKind); - case Intrinsic::vector_reduce_umin: - return thisT()->getMinMaxReductionCost(Intrinsic::umin, VecOpTy, - ICA.getFlags(), CostKind); - case Intrinsic::vector_reduce_fmax: - return thisT()->getMinMaxReductionCost(Intrinsic::maxnum, VecOpTy, - ICA.getFlags(), CostKind); - case Intrinsic::vector_reduce_fmin: - return thisT()->getMinMaxReductionCost(Intrinsic::minnum, VecOpTy, - ICA.getFlags(), CostKind); - case Intrinsic::vector_reduce_fmaximum: - return thisT()->getMinMaxReductionCost(Intrinsic::maximum, VecOpTy, - ICA.getFlags(), CostKind); - case Intrinsic::vector_reduce_fminimum: - return thisT()->getMinMaxReductionCost(Intrinsic::minimum, VecOpTy, - ICA.getFlags(), CostKind); - case Intrinsic::abs: { - // abs(X) = select(icmp(X,0),X,sub(0,X)) - Type *CondTy = RetTy->getWithNewBitWidth(1); - CmpInst::Predicate Pred = CmpInst::ICMP_SGT; - InstructionCost Cost = 0; - Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy, - Pred, CostKind); - Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy, - Pred, CostKind); - // TODO: Should we add an OperandValueProperties::OP_Zero property? - Cost += thisT()->getArithmeticInstrCost( - BinaryOperator::Sub, RetTy, CostKind, {TTI::OK_UniformConstantValue, TTI::OP_None}); - return Cost; - } - case Intrinsic::smax: - case Intrinsic::smin: - case Intrinsic::umax: - case Intrinsic::umin: { - // minmax(X,Y) = select(icmp(X,Y),X,Y) - Type *CondTy = RetTy->getWithNewBitWidth(1); - bool IsUnsigned = IID == Intrinsic::umax || IID == Intrinsic::umin; - CmpInst::Predicate Pred = - IsUnsigned ? CmpInst::ICMP_UGT : CmpInst::ICMP_SGT; - InstructionCost Cost = 0; - Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy, - Pred, CostKind); - Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy, - Pred, CostKind); - return Cost; - } - case Intrinsic::sadd_sat: - case Intrinsic::ssub_sat: { - Type *CondTy = RetTy->getWithNewBitWidth(1); - - Type *OpTy = StructType::create({RetTy, CondTy}); - Intrinsic::ID OverflowOp = IID == Intrinsic::sadd_sat - ? Intrinsic::sadd_with_overflow - : Intrinsic::ssub_with_overflow; - CmpInst::Predicate Pred = CmpInst::ICMP_SGT; - - // SatMax -> Overflow && SumDiff < 0 - // SatMin -> Overflow && SumDiff >= 0 - InstructionCost Cost = 0; - IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF, - nullptr, ScalarizationCostPassed); - Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind); - Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy, - Pred, CostKind); - Cost += 2 * thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, - CondTy, Pred, CostKind); - return Cost; - } - case Intrinsic::uadd_sat: - case Intrinsic::usub_sat: { - Type *CondTy = RetTy->getWithNewBitWidth(1); + SmallVector ScalarTys; + for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) { + Type *Ty = Tys[i]; + if (auto *VTy = dyn_cast(Ty)) { + if (!SkipScalarizationCost) + ScalarizationCost += getScalarizationOverhead( + VTy, /*Insert*/ false, /*Extract*/ true, CostKind); + ScalarCalls = std::max( + ScalarCalls, cast(VTy)->getNumElements()); + Ty = Ty->getScalarType(); + } + ScalarTys.push_back(Ty); + } + if (ScalarCalls == 1) + return 1; // Return cost of a scalar intrinsic. Assume it to be cheap. - Type *OpTy = StructType::create({RetTy, CondTy}); - Intrinsic::ID OverflowOp = IID == Intrinsic::uadd_sat - ? Intrinsic::uadd_with_overflow - : Intrinsic::usub_with_overflow; + IntrinsicCostAttributes ScalarAttrs(IID, ScalarRetTy, ScalarTys, FMF); + InstructionCost ScalarCost = + thisT()->getIntrinsicInstrCost(ScalarAttrs, CostKind); - InstructionCost Cost = 0; - IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF, - nullptr, ScalarizationCostPassed); - Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind); - Cost += - thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy, - CmpInst::BAD_ICMP_PREDICATE, CostKind); - return Cost; - } - case Intrinsic::smul_fix: - case Intrinsic::umul_fix: { - unsigned ExtSize = RetTy->getScalarSizeInBits() * 2; - Type *ExtTy = RetTy->getWithNewBitWidth(ExtSize); + return ScalarCalls * ScalarCost + ScalarizationCost; + } + // FIXME: We should return 0 whenever getIntrinsicCost == TCC_Free. + case Intrinsic::lifetime_start: + case Intrinsic::lifetime_end: + case Intrinsic::sideeffect: + case Intrinsic::pseudoprobe: + case Intrinsic::arithmetic_fence: + return 0; + case Intrinsic::masked_store: { + Type *Ty = Tys[0]; + Align TyAlign = thisT()->DL.getABITypeAlign(Ty); + return thisT()->getMaskedMemoryOpCost(Instruction::Store, Ty, TyAlign, + 0, CostKind); + } + case Intrinsic::masked_load: { + Type *Ty = RetTy; + Align TyAlign = thisT()->DL.getABITypeAlign(Ty); + return thisT()->getMaskedMemoryOpCost(Instruction::Load, Ty, TyAlign, 0, + CostKind); + } + case Intrinsic::vector_reduce_add: + return thisT()->getArithmeticReductionCost(Instruction::Add, VecOpTy, + std::nullopt, CostKind); + case Intrinsic::vector_reduce_mul: + return thisT()->getArithmeticReductionCost(Instruction::Mul, VecOpTy, + std::nullopt, CostKind); + case Intrinsic::vector_reduce_and: + return thisT()->getArithmeticReductionCost(Instruction::And, VecOpTy, + std::nullopt, CostKind); + case Intrinsic::vector_reduce_or: + return thisT()->getArithmeticReductionCost(Instruction::Or, VecOpTy, + std::nullopt, CostKind); + case Intrinsic::vector_reduce_xor: + return thisT()->getArithmeticReductionCost(Instruction::Xor, VecOpTy, + std::nullopt, CostKind); + case Intrinsic::vector_reduce_fadd: + return thisT()->getArithmeticReductionCost(Instruction::FAdd, VecOpTy, + FMF, CostKind); + case Intrinsic::vector_reduce_fmul: + return thisT()->getArithmeticReductionCost(Instruction::FMul, VecOpTy, + FMF, CostKind); + case Intrinsic::vector_reduce_smax: + return thisT()->getMinMaxReductionCost(Intrinsic::smax, VecOpTy, + ICA.getFlags(), CostKind); + case Intrinsic::vector_reduce_smin: + return thisT()->getMinMaxReductionCost(Intrinsic::smin, VecOpTy, + ICA.getFlags(), CostKind); + case Intrinsic::vector_reduce_umax: + return thisT()->getMinMaxReductionCost(Intrinsic::umax, VecOpTy, + ICA.getFlags(), CostKind); + case Intrinsic::vector_reduce_umin: + return thisT()->getMinMaxReductionCost(Intrinsic::umin, VecOpTy, + ICA.getFlags(), CostKind); + case Intrinsic::vector_reduce_fmax: + return thisT()->getMinMaxReductionCost(Intrinsic::maxnum, VecOpTy, + ICA.getFlags(), CostKind); + case Intrinsic::vector_reduce_fmin: + return thisT()->getMinMaxReductionCost(Intrinsic::minnum, VecOpTy, + ICA.getFlags(), CostKind); + case Intrinsic::vector_reduce_fmaximum: + return thisT()->getMinMaxReductionCost(Intrinsic::maximum, VecOpTy, + ICA.getFlags(), CostKind); + case Intrinsic::vector_reduce_fminimum: + return thisT()->getMinMaxReductionCost(Intrinsic::minimum, VecOpTy, + ICA.getFlags(), CostKind); + case Intrinsic::abs: { + // abs(X) = select(icmp(X,0),X,sub(0,X)) + Type *CondTy = RetTy->getWithNewBitWidth(1); + CmpInst::Predicate Pred = CmpInst::ICMP_SGT; + InstructionCost Cost = 0; + Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy, + Pred, CostKind); + Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, + CondTy, Pred, CostKind); + // TODO: Should we add an OperandValueProperties::OP_Zero property? + Cost += thisT()->getArithmeticInstrCost( + BinaryOperator::Sub, RetTy, CostKind, + {TTI::OK_UniformConstantValue, TTI::OP_None}); + return Cost; + } + case Intrinsic::smax: + case Intrinsic::smin: + case Intrinsic::umax: + case Intrinsic::umin: { + // minmax(X,Y) = select(icmp(X,Y),X,Y) + Type *CondTy = RetTy->getWithNewBitWidth(1); + bool IsUnsigned = IID == Intrinsic::umax || IID == Intrinsic::umin; + CmpInst::Predicate Pred = + IsUnsigned ? CmpInst::ICMP_UGT : CmpInst::ICMP_SGT; + InstructionCost Cost = 0; + Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy, + Pred, CostKind); + Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, + CondTy, Pred, CostKind); + return Cost; + } + case Intrinsic::sadd_sat: + case Intrinsic::ssub_sat: { + Type *CondTy = RetTy->getWithNewBitWidth(1); - unsigned ExtOp = - IID == Intrinsic::smul_fix ? Instruction::SExt : Instruction::ZExt; - TTI::CastContextHint CCH = TTI::CastContextHint::None; + Type *OpTy = StructType::create({RetTy, CondTy}); + Intrinsic::ID OverflowOp = IID == Intrinsic::sadd_sat + ? Intrinsic::sadd_with_overflow + : Intrinsic::ssub_with_overflow; + CmpInst::Predicate Pred = CmpInst::ICMP_SGT; + + // SatMax -> Overflow && SumDiff < 0 + // SatMin -> Overflow && SumDiff >= 0 + InstructionCost Cost = 0; + IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF, + nullptr, ScalarizationCostPassed); + Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind); + Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy, + Pred, CostKind); + Cost += 2 * thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, + CondTy, Pred, CostKind); + return Cost; + } + case Intrinsic::uadd_sat: + case Intrinsic::usub_sat: { + Type *CondTy = RetTy->getWithNewBitWidth(1); - InstructionCost Cost = 0; - Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, RetTy, CCH, CostKind); - Cost += - thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind); - Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, RetTy, ExtTy, - CCH, CostKind); - Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, RetTy, - CostKind, - {TTI::OK_AnyValue, TTI::OP_None}, - {TTI::OK_UniformConstantValue, TTI::OP_None}); - Cost += thisT()->getArithmeticInstrCost(Instruction::Shl, RetTy, CostKind, - {TTI::OK_AnyValue, TTI::OP_None}, - {TTI::OK_UniformConstantValue, TTI::OP_None}); - Cost += thisT()->getArithmeticInstrCost(Instruction::Or, RetTy, CostKind); - return Cost; - } - case Intrinsic::sadd_with_overflow: - case Intrinsic::ssub_with_overflow: { - Type *SumTy = RetTy->getContainedType(0); - Type *OverflowTy = RetTy->getContainedType(1); - unsigned Opcode = IID == Intrinsic::sadd_with_overflow - ? BinaryOperator::Add - : BinaryOperator::Sub; - - // Add: - // Overflow -> (Result < LHS) ^ (RHS < 0) - // Sub: - // Overflow -> (Result < LHS) ^ (RHS > 0) - InstructionCost Cost = 0; - Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind); - Cost += 2 * thisT()->getCmpSelInstrCost( - Instruction::ICmp, SumTy, OverflowTy, - CmpInst::ICMP_SGT, CostKind); - Cost += thisT()->getArithmeticInstrCost(BinaryOperator::Xor, OverflowTy, - CostKind); - return Cost; - } - case Intrinsic::uadd_with_overflow: - case Intrinsic::usub_with_overflow: { - Type *SumTy = RetTy->getContainedType(0); - Type *OverflowTy = RetTy->getContainedType(1); - unsigned Opcode = IID == Intrinsic::uadd_with_overflow - ? BinaryOperator::Add - : BinaryOperator::Sub; - CmpInst::Predicate Pred = IID == Intrinsic::uadd_with_overflow - ? CmpInst::ICMP_ULT - : CmpInst::ICMP_UGT; + Type *OpTy = StructType::create({RetTy, CondTy}); + Intrinsic::ID OverflowOp = IID == Intrinsic::uadd_sat + ? Intrinsic::uadd_with_overflow + : Intrinsic::usub_with_overflow; - InstructionCost Cost = 0; - Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind); - Cost += - thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, SumTy, OverflowTy, - Pred, CostKind); - return Cost; - } - case Intrinsic::smul_with_overflow: - case Intrinsic::umul_with_overflow: { - Type *MulTy = RetTy->getContainedType(0); - Type *OverflowTy = RetTy->getContainedType(1); - unsigned ExtSize = MulTy->getScalarSizeInBits() * 2; - Type *ExtTy = MulTy->getWithNewBitWidth(ExtSize); - bool IsSigned = IID == Intrinsic::smul_with_overflow; + InstructionCost Cost = 0; + IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF, + nullptr, ScalarizationCostPassed); + Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind); + Cost += + thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy, + CmpInst::BAD_ICMP_PREDICATE, CostKind); + return Cost; + } + case Intrinsic::smul_fix: + case Intrinsic::umul_fix: { + unsigned ExtSize = RetTy->getScalarSizeInBits() * 2; + Type *ExtTy = RetTy->getWithNewBitWidth(ExtSize); - unsigned ExtOp = IsSigned ? Instruction::SExt : Instruction::ZExt; - TTI::CastContextHint CCH = TTI::CastContextHint::None; + unsigned ExtOp = + IID == Intrinsic::smul_fix ? Instruction::SExt : Instruction::ZExt; + TTI::CastContextHint CCH = TTI::CastContextHint::None; - InstructionCost Cost = 0; - Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, MulTy, CCH, CostKind); - Cost += - thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind); - Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, MulTy, ExtTy, - CCH, CostKind); - Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, ExtTy, - CostKind, - {TTI::OK_AnyValue, TTI::OP_None}, - {TTI::OK_UniformConstantValue, TTI::OP_None}); - - if (IsSigned) - Cost += thisT()->getArithmeticInstrCost(Instruction::AShr, MulTy, - CostKind, - {TTI::OK_AnyValue, TTI::OP_None}, - {TTI::OK_UniformConstantValue, TTI::OP_None}); - - Cost += thisT()->getCmpSelInstrCost( - BinaryOperator::ICmp, MulTy, OverflowTy, CmpInst::ICMP_NE, CostKind); - return Cost; - } - case Intrinsic::fptosi_sat: - case Intrinsic::fptoui_sat: { - if (Tys.empty()) - break; - Type *FromTy = Tys[0]; - bool IsSigned = IID == Intrinsic::fptosi_sat; + InstructionCost Cost = 0; + Cost += + 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, RetTy, CCH, CostKind); + Cost += + thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind); + Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, RetTy, ExtTy, + CCH, CostKind); + Cost += thisT()->getArithmeticInstrCost( + Instruction::LShr, RetTy, CostKind, + {TTI::OK_AnyValue, TTI::OP_None}, + {TTI::OK_UniformConstantValue, TTI::OP_None}); + Cost += thisT()->getArithmeticInstrCost( + Instruction::Shl, RetTy, CostKind, {TTI::OK_AnyValue, TTI::OP_None}, + {TTI::OK_UniformConstantValue, TTI::OP_None}); + Cost += + thisT()->getArithmeticInstrCost(Instruction::Or, RetTy, CostKind); + return Cost; + } + case Intrinsic::sadd_with_overflow: + case Intrinsic::ssub_with_overflow: { + Type *SumTy = RetTy->getContainedType(0); + Type *OverflowTy = RetTy->getContainedType(1); + unsigned Opcode = IID == Intrinsic::sadd_with_overflow + ? BinaryOperator::Add + : BinaryOperator::Sub; + + // Add: + // Overflow -> (Result < LHS) ^ (RHS < 0) + // Sub: + // Overflow -> (Result < LHS) ^ (RHS > 0) + InstructionCost Cost = 0; + Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind); + Cost += 2 * thisT()->getCmpSelInstrCost(Instruction::ICmp, SumTy, + OverflowTy, CmpInst::ICMP_SGT, + CostKind); + Cost += thisT()->getArithmeticInstrCost(BinaryOperator::Xor, OverflowTy, + CostKind); + return Cost; + } + case Intrinsic::uadd_with_overflow: + case Intrinsic::usub_with_overflow: { + Type *SumTy = RetTy->getContainedType(0); + Type *OverflowTy = RetTy->getContainedType(1); + unsigned Opcode = IID == Intrinsic::uadd_with_overflow + ? BinaryOperator::Add + : BinaryOperator::Sub; + CmpInst::Predicate Pred = IID == Intrinsic::uadd_with_overflow + ? CmpInst::ICMP_ULT + : CmpInst::ICMP_UGT; + + InstructionCost Cost = 0; + Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind); + Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, SumTy, + OverflowTy, Pred, CostKind); + return Cost; + } + case Intrinsic::smul_with_overflow: + case Intrinsic::umul_with_overflow: { + Type *MulTy = RetTy->getContainedType(0); + Type *OverflowTy = RetTy->getContainedType(1); + unsigned ExtSize = MulTy->getScalarSizeInBits() * 2; + Type *ExtTy = MulTy->getWithNewBitWidth(ExtSize); + bool IsSigned = IID == Intrinsic::smul_with_overflow; + + unsigned ExtOp = IsSigned ? Instruction::SExt : Instruction::ZExt; + TTI::CastContextHint CCH = TTI::CastContextHint::None; + + InstructionCost Cost = 0; + Cost += + 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, MulTy, CCH, CostKind); + Cost += + thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind); + Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, MulTy, ExtTy, + CCH, CostKind); + Cost += thisT()->getArithmeticInstrCost( + Instruction::LShr, ExtTy, CostKind, + {TTI::OK_AnyValue, TTI::OP_None}, + {TTI::OK_UniformConstantValue, TTI::OP_None}); + + if (IsSigned) + Cost += thisT()->getArithmeticInstrCost( + Instruction::AShr, MulTy, CostKind, + {TTI::OK_AnyValue, TTI::OP_None}, + {TTI::OK_UniformConstantValue, TTI::OP_None}); - InstructionCost Cost = 0; - IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FromTy, - {FromTy, FromTy}); - Cost += thisT()->getIntrinsicInstrCost(Attrs1, CostKind); - IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FromTy, - {FromTy, FromTy}); - Cost += thisT()->getIntrinsicInstrCost(Attrs2, CostKind); - Cost += thisT()->getCastInstrCost( - IsSigned ? Instruction::FPToSI : Instruction::FPToUI, RetTy, FromTy, - TTI::CastContextHint::None, CostKind); - if (IsSigned) { - Type *CondTy = RetTy->getWithNewBitWidth(1); - Cost += thisT()->getCmpSelInstrCost( - BinaryOperator::FCmp, FromTy, CondTy, CmpInst::FCMP_UNO, CostKind); - Cost += thisT()->getCmpSelInstrCost( - BinaryOperator::Select, RetTy, CondTy, CmpInst::FCMP_UNO, CostKind); + Cost += + thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, MulTy, OverflowTy, + CmpInst::ICMP_NE, CostKind); + return Cost; } - return Cost; - } - case Intrinsic::ctpop: - ISD = ISD::CTPOP; + case Intrinsic::fptosi_sat: + case Intrinsic::fptoui_sat: { + if (Tys.empty()) + break; + Type *FromTy = Tys[0]; + bool IsSigned = IID == Intrinsic::fptosi_sat; + + InstructionCost Cost = 0; + IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FromTy, + {FromTy, FromTy}); + Cost += thisT()->getIntrinsicInstrCost(Attrs1, CostKind); + IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FromTy, + {FromTy, FromTy}); + Cost += thisT()->getIntrinsicInstrCost(Attrs2, CostKind); + Cost += thisT()->getCastInstrCost( + IsSigned ? Instruction::FPToSI : Instruction::FPToUI, RetTy, FromTy, + TTI::CastContextHint::None, CostKind); + if (IsSigned) { + Type *CondTy = RetTy->getWithNewBitWidth(1); + Cost += + thisT()->getCmpSelInstrCost(BinaryOperator::FCmp, FromTy, CondTy, + CmpInst::FCMP_UNO, CostKind); + Cost += + thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy, + CmpInst::FCMP_UNO, CostKind); + } + return Cost; + } + } + } else if (ISD == ISD::CTPOP) { // In case of legalization use TCC_Expensive. This is cheaper than a // library call but still not a cheap instruction. SingleCallCost = TargetTransformInfo::TCC_Expensive; - break; - case Intrinsic::ctlz: - ISD = ISD::CTLZ; - break; - case Intrinsic::cttz: - ISD = ISD::CTTZ; - break; - case Intrinsic::bswap: - ISD = ISD::BSWAP; - break; - case Intrinsic::bitreverse: - ISD = ISD::BITREVERSE; - break; } const TargetLoweringBase *TLI = getTLI(); diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -393,6 +393,16 @@ Scale, AddrSpace, I); } +bool TargetTransformInfo::isLegalVectorOp(unsigned Opcode, + VectorType *VecTy) const { + return TTIImpl->isLegalVectorOp(Opcode, VecTy); +} + +bool TargetTransformInfo::isLegalVectorIntrinsic(Intrinsic::ID Id, + VectorType *VecTy) const { + return TTIImpl->isLegalVectorIntrinsic(Id, VecTy); +} + bool TargetTransformInfo::isLSRCostLess(const LSRCost &C1, const LSRCost &C2) const { return TTIImpl->isLSRCostLess(C1, C2); diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -1193,6 +1193,13 @@ UserIgnoreList = nullptr; PostponedGathers.clear(); ValueToGatherNodes.clear(); + OperandsToVectorize.clear(); + } + + /// Returns the list of the operands to try to vectorize later, if the user + /// node was not vectorized. + ArrayRef> operandsToVectorize() const { + return OperandsToVectorize; } unsigned getTreeSize() const { return VectorizableTree.size(); } @@ -2431,6 +2438,10 @@ bool areAllUsersVectorized(Instruction *I, ArrayRef VectorizedVals) const; + /// Checks if the list of the values worth to be vectorized and not going to + /// be scalarized later. + bool isLegalVectorOp(ArrayRef VL); + /// Return information about the vector formed for the specified index /// of a vector of (the same) instruction. TargetTransformInfo::OperandValueInfo getOperandInfo(ArrayRef VL, @@ -2950,6 +2961,10 @@ /// A list of scalars that we found that we need to keep as scalars. ValueSet MustGather; + /// A list of the operands of the nodes, which are not vectorized. These + /// operands are the candidates for the vectorization later. + SmallVector> OperandsToVectorize; + /// A map between the vectorized entries and the last instructions in the /// bundles. The bundles are built in use order, not in the def order of the /// instructions. So, we cannot rely directly on the last instruction in the @@ -5452,6 +5467,21 @@ if (!TryToFindDuplicates(S)) return; + // Check if the generated vector instruction won't be scalarized later. + if (!isLegalVectorOp(VL)) { + LLVM_DEBUG(dbgs() << "SLP: scalarized bundle starting " << *S.OpValue + << ".\n"); + newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); + // Gather operands to try to vectorize them later. + for (unsigned I = 0, End = S.MainOp->getNumOperands(); I < End; ++I) { + auto &Operands = OperandsToVectorize.emplace_back(); + for (Value *V : VL) + Operands.push_back(cast(V)->getOperand(I)); + } + return; + } + auto &BSRef = BlocksSchedules[BB]; if (!BSRef) BSRef = std::make_unique(BB); @@ -6370,6 +6400,75 @@ return I->getOpcode() == AltOp->getOpcode(); } +bool BoUpSLP::isLegalVectorOp(ArrayRef VL) { + InstructionsState S = getSameOpcode(VL, *TLI); + const unsigned Sz = VL.size(); + Value *V0 = VL.front(); + Type *ScalarTy = V0->getType(); + if (isa(V0)) + return true; + if (auto *CI = dyn_cast(V0)) + ScalarTy = CI->getOperand(0)->getType(); + else if (auto *CI = dyn_cast(V0)) + if (!isa(CI)) + ScalarTy = CI->getSrcTy(); + if (!isValidElementType(ScalarTy)) + return false; + auto *VecTy = FixedVectorType::get(ScalarTy, Sz); + + // If we have computed a smaller type for the expression, update VecTy so + // that the costs will be accurate. + const auto It = MinBWs.find(VL[0]); + if (It != MinBWs.end()) + VecTy = FixedVectorType::get( + IntegerType::get(F->getContext(), It->second.first), VL.size()); + + unsigned ShuffleOrOp = + S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode(); + switch (ShuffleOrOp) { + case Instruction::URem: + case Instruction::SRem: + case Instruction::UDiv: + case Instruction::SDiv: { + // Check if it can be represented as shift + TTI::OperandValueInfo OVI = getOperandInfo(VL, 1); + if (OVI.isConstant()) + return true; + return TTI->isLegalVectorOp(ShuffleOrOp, VecTy); + } + case Instruction::Mul: { + // Check if it can be represented as shift + TTI::OperandValueInfo OVI = getOperandInfo(VL, 1); + if (OVI.isConstant()) + return true; + return TTI->isLegalVectorOp(ShuffleOrOp, VecTy); + } + case Instruction::FNeg: + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::FMul: + case Instruction::FDiv: + case Instruction::FRem: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: + return TTI->isLegalVectorOp(ShuffleOrOp, VecTy); + case Instruction::Call: { + auto *CI = cast(V0); + auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI); + return (VecCallCosts.first > VecCallCosts.second || + TTI->isLegalVectorIntrinsic(CI->getIntrinsicID(), VecTy)); + } + default: + return true; + } +} + TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef VL, unsigned OpIdx) { assert(!VL.empty()); @@ -12223,6 +12322,51 @@ return Changed; } +static bool vectorizeOperands(BoUpSLP &R) { + SmallVector> Operands(R.operandsToVectorize().begin(), + R.operandsToVectorize().end()); + DenseSet VisitedOperands; + bool Changed = false; + while (!Operands.empty()) { + SmallVector Chain = Operands.pop_back_val(); + if (!VisitedOperands.insert(hash_value(ArrayRef(Chain))).second) + continue; + unsigned VF = Chain.size(); + R.buildTree(Chain); + if (R.isTreeTinyAndNotFullyVectorizable()) + return false; + if (R.isLoadCombineCandidate()) + return false; + R.reorderTopToBottom(); + R.reorderBottomToTop(); + R.buildExternalUses(); + + R.computeMinimumValueSizes(); + + InstructionCost Cost = R.getTreeCost(); + + LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF + << "\n"); + if (Cost < -SLPCostThreshold) { + LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n"); + + using namespace ore; + + R.getORE()->emit(OptimizationRemark(SV_NAME, "OperandsVectorized", + cast(Chain[0])) + << "Operands SLP vectorized with cost " + << NV("Cost", Cost) << " and with tree size " + << NV("TreeSize", R.getTreeSize())); + + R.vectorizeTree(); + Changed = true; + } + Operands.append(R.operandsToVectorize().begin(), + R.operandsToVectorize().end()); + } + return Changed; +} + bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef Chain, BoUpSLP &R, unsigned Idx, unsigned MinVF) { LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size() @@ -12262,10 +12406,11 @@ << NV("TreeSize", R.getTreeSize())); R.vectorizeTree(); + (void)vectorizeOperands(R); return true; } - return false; + return vectorizeOperands(R); } bool SLPVectorizerPass::vectorizeStores(ArrayRef Stores, @@ -12560,6 +12705,7 @@ NextInst = I + 1; Changed = true; } + Changed |= vectorizeOperands(R); } } @@ -13573,6 +13719,7 @@ // Vectorize a tree. Value *VectorizedRoot = V.vectorizeTree(LocalExternallyUsedValues, ReplacedExternals, InsertPt); + (void)vectorizeOperands(V); Builder.SetInsertPoint(InsertPt); diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll @@ -24,11 +24,13 @@ ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]]) ; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> -; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP3]]) -; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> -; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> -; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]] +; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 +; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_2]]) +; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 +; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_3]]) +; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] ; entry: %0 = load <4 x float>, ptr %a, align 16 @@ -997,11 +999,13 @@ ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_1]]) ; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> -; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP3]]) -; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> -; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> -; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]] +; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 +; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_2]]) +; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 +; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_3]]) +; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] ; entry: %0 = load <4 x float>, ptr %a, align 16 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll @@ -24,11 +24,13 @@ ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]]) ; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> -; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP3]]) -; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> -; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> -; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]] +; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 +; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_2]]) +; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 +; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_3]]) +; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] ; entry: %0 = load <4 x float>, ptr %a, align 16 @@ -997,11 +999,13 @@ ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 ; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_1]]) ; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> -; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP3]]) -; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> -; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> -; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]] +; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 +; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_2]]) +; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 +; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_3]]) +; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] ; entry: %0 = load <4 x float>, ptr %a, align 16 diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/packed-math.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/packed-math.ll --- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/packed-math.ll +++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/packed-math.ll @@ -6,12 +6,26 @@ ; Simple 3-pair chain with loads and stores define amdgpu_kernel void @test1_as_3_3_3_v2f16(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c) { -; GCN-LABEL: @test1_as_3_3_3_v2f16( -; GCN-NEXT: [[TMP2:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2 -; GCN-NEXT: [[TMP4:%.*]] = load <2 x half>, ptr addrspace(3) [[B:%.*]], align 2 -; GCN-NEXT: [[TMP5:%.*]] = fmul <2 x half> [[TMP2]], [[TMP4]] -; GCN-NEXT: store <2 x half> [[TMP5]], ptr addrspace(3) [[C:%.*]], align 2 -; GCN-NEXT: ret void +; GFX9-LABEL: @test1_as_3_3_3_v2f16( +; GFX9-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2 +; GFX9-NEXT: [[TMP2:%.*]] = load <2 x half>, ptr addrspace(3) [[B:%.*]], align 2 +; GFX9-NEXT: [[TMP3:%.*]] = fmul <2 x half> [[TMP1]], [[TMP2]] +; GFX9-NEXT: store <2 x half> [[TMP3]], ptr addrspace(3) [[C:%.*]], align 2 +; GFX9-NEXT: ret void +; +; VI-LABEL: @test1_as_3_3_3_v2f16( +; VI-NEXT: [[I0:%.*]] = load half, ptr addrspace(3) [[A:%.*]], align 2 +; VI-NEXT: [[I1:%.*]] = load half, ptr addrspace(3) [[B:%.*]], align 2 +; VI-NEXT: [[MUL:%.*]] = fmul half [[I0]], [[I1]] +; VI-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds half, ptr addrspace(3) [[A]], i64 1 +; VI-NEXT: [[I3:%.*]] = load half, ptr addrspace(3) [[ARRAYIDX3]], align 2 +; VI-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds half, ptr addrspace(3) [[B]], i64 1 +; VI-NEXT: [[I4:%.*]] = load half, ptr addrspace(3) [[ARRAYIDX4]], align 2 +; VI-NEXT: [[MUL5:%.*]] = fmul half [[I3]], [[I4]] +; VI-NEXT: store half [[MUL]], ptr addrspace(3) [[C:%.*]], align 2 +; VI-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds half, ptr addrspace(3) [[C]], i64 1 +; VI-NEXT: store half [[MUL5]], ptr addrspace(3) [[ARRAYIDX5]], align 2 +; VI-NEXT: ret void ; %i0 = load half, ptr addrspace(3) %a, align 2 %i1 = load half, ptr addrspace(3) %b, align 2 @@ -28,12 +42,26 @@ } define amdgpu_kernel void @test1_as_3_0_0(ptr addrspace(3) %a, ptr %b, ptr %c) { -; GCN-LABEL: @test1_as_3_0_0( -; GCN-NEXT: [[TMP2:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2 -; GCN-NEXT: [[TMP4:%.*]] = load <2 x half>, ptr [[B:%.*]], align 2 -; GCN-NEXT: [[TMP5:%.*]] = fmul <2 x half> [[TMP2]], [[TMP4]] -; GCN-NEXT: store <2 x half> [[TMP5]], ptr [[C:%.*]], align 2 -; GCN-NEXT: ret void +; GFX9-LABEL: @test1_as_3_0_0( +; GFX9-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2 +; GFX9-NEXT: [[TMP2:%.*]] = load <2 x half>, ptr [[B:%.*]], align 2 +; GFX9-NEXT: [[TMP3:%.*]] = fmul <2 x half> [[TMP1]], [[TMP2]] +; GFX9-NEXT: store <2 x half> [[TMP3]], ptr [[C:%.*]], align 2 +; GFX9-NEXT: ret void +; +; VI-LABEL: @test1_as_3_0_0( +; VI-NEXT: [[I0:%.*]] = load half, ptr addrspace(3) [[A:%.*]], align 2 +; VI-NEXT: [[I1:%.*]] = load half, ptr [[B:%.*]], align 2 +; VI-NEXT: [[MUL:%.*]] = fmul half [[I0]], [[I1]] +; VI-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds half, ptr addrspace(3) [[A]], i64 1 +; VI-NEXT: [[I3:%.*]] = load half, ptr addrspace(3) [[ARRAYIDX3]], align 2 +; VI-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds half, ptr [[B]], i64 1 +; VI-NEXT: [[I4:%.*]] = load half, ptr [[ARRAYIDX4]], align 2 +; VI-NEXT: [[MUL5:%.*]] = fmul half [[I3]], [[I4]] +; VI-NEXT: store half [[MUL]], ptr [[C:%.*]], align 2 +; VI-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds half, ptr [[C]], i64 1 +; VI-NEXT: store half [[MUL5]], ptr [[ARRAYIDX5]], align 2 +; VI-NEXT: ret void ; %i0 = load half, ptr addrspace(3) %a, align 2 %i1 = load half, ptr %b, align 2 @@ -50,12 +78,26 @@ } define amdgpu_kernel void @test1_as_0_0_3_v2f16(ptr %a, ptr %b, ptr addrspace(3) %c) { -; GCN-LABEL: @test1_as_0_0_3_v2f16( -; GCN-NEXT: [[TMP2:%.*]] = load <2 x half>, ptr [[A:%.*]], align 2 -; GCN-NEXT: [[TMP4:%.*]] = load <2 x half>, ptr [[B:%.*]], align 2 -; GCN-NEXT: [[TMP5:%.*]] = fmul <2 x half> [[TMP2]], [[TMP4]] -; GCN-NEXT: store <2 x half> [[TMP5]], ptr addrspace(3) [[C:%.*]], align 2 -; GCN-NEXT: ret void +; GFX9-LABEL: @test1_as_0_0_3_v2f16( +; GFX9-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[A:%.*]], align 2 +; GFX9-NEXT: [[TMP2:%.*]] = load <2 x half>, ptr [[B:%.*]], align 2 +; GFX9-NEXT: [[TMP3:%.*]] = fmul <2 x half> [[TMP1]], [[TMP2]] +; GFX9-NEXT: store <2 x half> [[TMP3]], ptr addrspace(3) [[C:%.*]], align 2 +; GFX9-NEXT: ret void +; +; VI-LABEL: @test1_as_0_0_3_v2f16( +; VI-NEXT: [[I0:%.*]] = load half, ptr [[A:%.*]], align 2 +; VI-NEXT: [[I1:%.*]] = load half, ptr [[B:%.*]], align 2 +; VI-NEXT: [[MUL:%.*]] = fmul half [[I0]], [[I1]] +; VI-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds half, ptr [[A]], i64 1 +; VI-NEXT: [[I3:%.*]] = load half, ptr [[ARRAYIDX3]], align 2 +; VI-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds half, ptr [[B]], i64 1 +; VI-NEXT: [[I4:%.*]] = load half, ptr [[ARRAYIDX4]], align 2 +; VI-NEXT: [[MUL5:%.*]] = fmul half [[I3]], [[I4]] +; VI-NEXT: store half [[MUL]], ptr addrspace(3) [[C:%.*]], align 2 +; VI-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds half, ptr addrspace(3) [[C]], i64 1 +; VI-NEXT: store half [[MUL5]], ptr addrspace(3) [[ARRAYIDX5]], align 2 +; VI-NEXT: ret void ; %i0 = load half, ptr %a, align 2 %i1 = load half, ptr %b, align 2 @@ -73,11 +115,11 @@ define amdgpu_kernel void @test1_fma_v2f16(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d) { ; GCN-LABEL: @test1_fma_v2f16( -; GCN-NEXT: [[TMP2:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2 -; GCN-NEXT: [[TMP4:%.*]] = load <2 x half>, ptr addrspace(3) [[B:%.*]], align 2 -; GCN-NEXT: [[TMP6:%.*]] = load <2 x half>, ptr addrspace(3) [[C:%.*]], align 2 -; GCN-NEXT: [[TMP7:%.*]] = call <2 x half> @llvm.fma.v2f16(<2 x half> [[TMP2]], <2 x half> [[TMP4]], <2 x half> [[TMP6]]) -; GCN-NEXT: store <2 x half> [[TMP7]], ptr addrspace(3) [[D:%.*]], align 2 +; GCN-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2 +; GCN-NEXT: [[TMP2:%.*]] = load <2 x half>, ptr addrspace(3) [[B:%.*]], align 2 +; GCN-NEXT: [[TMP3:%.*]] = load <2 x half>, ptr addrspace(3) [[C:%.*]], align 2 +; GCN-NEXT: [[TMP4:%.*]] = call <2 x half> @llvm.fma.v2f16(<2 x half> [[TMP1]], <2 x half> [[TMP2]], <2 x half> [[TMP3]]) +; GCN-NEXT: store <2 x half> [[TMP4]], ptr addrspace(3) [[D:%.*]], align 2 ; GCN-NEXT: ret void ; %i0 = load half, ptr addrspace(3) %a, align 2 @@ -98,13 +140,24 @@ } define amdgpu_kernel void @mul_scalar_v2f16(ptr addrspace(3) %a, half %scalar, ptr addrspace(3) %c) { -; GCN-LABEL: @mul_scalar_v2f16( -; GCN-NEXT: [[TMP2:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2 -; GCN-NEXT: [[TMP3:%.*]] = insertelement <2 x half> poison, half [[SCALAR:%.*]], i32 0 -; GCN-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x half> [[TMP3]], <2 x half> poison, <2 x i32> zeroinitializer -; GCN-NEXT: [[TMP4:%.*]] = fmul <2 x half> [[TMP2]], [[SHUFFLE]] -; GCN-NEXT: store <2 x half> [[TMP4]], ptr addrspace(3) [[C:%.*]], align 2 -; GCN-NEXT: ret void +; GFX9-LABEL: @mul_scalar_v2f16( +; GFX9-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2 +; GFX9-NEXT: [[TMP2:%.*]] = insertelement <2 x half> poison, half [[SCALAR:%.*]], i32 0 +; GFX9-NEXT: [[TMP3:%.*]] = shufflevector <2 x half> [[TMP2]], <2 x half> poison, <2 x i32> zeroinitializer +; GFX9-NEXT: [[TMP4:%.*]] = fmul <2 x half> [[TMP1]], [[TMP3]] +; GFX9-NEXT: store <2 x half> [[TMP4]], ptr addrspace(3) [[C:%.*]], align 2 +; GFX9-NEXT: ret void +; +; VI-LABEL: @mul_scalar_v2f16( +; VI-NEXT: [[I0:%.*]] = load half, ptr addrspace(3) [[A:%.*]], align 2 +; VI-NEXT: [[MUL:%.*]] = fmul half [[I0]], [[SCALAR:%.*]] +; VI-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds half, ptr addrspace(3) [[A]], i64 1 +; VI-NEXT: [[I3:%.*]] = load half, ptr addrspace(3) [[ARRAYIDX3]], align 2 +; VI-NEXT: [[MUL5:%.*]] = fmul half [[I3]], [[SCALAR]] +; VI-NEXT: store half [[MUL]], ptr addrspace(3) [[C:%.*]], align 2 +; VI-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds half, ptr addrspace(3) [[C]], i64 1 +; VI-NEXT: store half [[MUL5]], ptr addrspace(3) [[ARRAYIDX5]], align 2 +; VI-NEXT: ret void ; %i0 = load half, ptr addrspace(3) %a, align 2 %mul = fmul half %i0, %scalar @@ -119,9 +172,9 @@ define amdgpu_kernel void @fabs_v2f16(ptr addrspace(3) %a, ptr addrspace(3) %c) { ; GCN-LABEL: @fabs_v2f16( -; GCN-NEXT: [[TMP2:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2 -; GCN-NEXT: [[TMP3:%.*]] = call <2 x half> @llvm.fabs.v2f16(<2 x half> [[TMP2]]) -; GCN-NEXT: store <2 x half> [[TMP3]], ptr addrspace(3) [[C:%.*]], align 2 +; GCN-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2 +; GCN-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.fabs.v2f16(<2 x half> [[TMP1]]) +; GCN-NEXT: store <2 x half> [[TMP2]], ptr addrspace(3) [[C:%.*]], align 2 ; GCN-NEXT: ret void ; %i0 = load half, ptr addrspace(3) %a, align 2 @@ -137,12 +190,12 @@ define amdgpu_kernel void @test1_fabs_fma_v2f16(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d) { ; GCN-LABEL: @test1_fabs_fma_v2f16( -; GCN-NEXT: [[TMP2:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2 -; GCN-NEXT: [[TMP4:%.*]] = load <2 x half>, ptr addrspace(3) [[B:%.*]], align 2 -; GCN-NEXT: [[TMP6:%.*]] = load <2 x half>, ptr addrspace(3) [[C:%.*]], align 2 -; GCN-NEXT: [[TMP7:%.*]] = call <2 x half> @llvm.fabs.v2f16(<2 x half> [[TMP2]]) -; GCN-NEXT: [[TMP8:%.*]] = call <2 x half> @llvm.fma.v2f16(<2 x half> [[TMP7]], <2 x half> [[TMP4]], <2 x half> [[TMP6]]) -; GCN-NEXT: store <2 x half> [[TMP8]], ptr addrspace(3) [[D:%.*]], align 2 +; GCN-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2 +; GCN-NEXT: [[TMP2:%.*]] = load <2 x half>, ptr addrspace(3) [[B:%.*]], align 2 +; GCN-NEXT: [[TMP3:%.*]] = load <2 x half>, ptr addrspace(3) [[C:%.*]], align 2 +; GCN-NEXT: [[TMP4:%.*]] = call <2 x half> @llvm.fabs.v2f16(<2 x half> [[TMP1]]) +; GCN-NEXT: [[TMP5:%.*]] = call <2 x half> @llvm.fma.v2f16(<2 x half> [[TMP4]], <2 x half> [[TMP2]], <2 x half> [[TMP3]]) +; GCN-NEXT: store <2 x half> [[TMP5]], ptr addrspace(3) [[D:%.*]], align 2 ; GCN-NEXT: ret void ; %i0 = load half, ptr addrspace(3) %a, align 2 @@ -172,12 +225,12 @@ ; GCN-NEXT: [[I1_FABS:%.*]] = call half @llvm.fabs.f16(half [[I1]]) ; GCN-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds half, ptr addrspace(3) [[B]], i64 1 ; GCN-NEXT: [[I4:%.*]] = load half, ptr addrspace(3) [[ARRAYIDX4]], align 2 -; GCN-NEXT: [[TMP2:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2 -; GCN-NEXT: [[TMP4:%.*]] = load <2 x half>, ptr addrspace(3) [[C:%.*]], align 2 -; GCN-NEXT: [[TMP5:%.*]] = insertelement <2 x half> poison, half [[I1_FABS]], i32 0 -; GCN-NEXT: [[TMP6:%.*]] = insertelement <2 x half> [[TMP5]], half [[I4]], i32 1 -; GCN-NEXT: [[TMP7:%.*]] = call <2 x half> @llvm.fma.v2f16(<2 x half> [[TMP2]], <2 x half> [[TMP6]], <2 x half> [[TMP4]]) -; GCN-NEXT: store <2 x half> [[TMP7]], ptr addrspace(3) [[D:%.*]], align 2 +; GCN-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2 +; GCN-NEXT: [[TMP2:%.*]] = load <2 x half>, ptr addrspace(3) [[C:%.*]], align 2 +; GCN-NEXT: [[TMP3:%.*]] = insertelement <2 x half> poison, half [[I1_FABS]], i32 0 +; GCN-NEXT: [[TMP4:%.*]] = insertelement <2 x half> [[TMP3]], half [[I4]], i32 1 +; GCN-NEXT: [[TMP5:%.*]] = call <2 x half> @llvm.fma.v2f16(<2 x half> [[TMP1]], <2 x half> [[TMP4]], <2 x half> [[TMP2]]) +; GCN-NEXT: store <2 x half> [[TMP5]], ptr addrspace(3) [[D:%.*]], align 2 ; GCN-NEXT: ret void ; %i0 = load half, ptr addrspace(3) %a, align 2 @@ -201,9 +254,9 @@ define amdgpu_kernel void @canonicalize_v2f16(ptr addrspace(3) %a, ptr addrspace(3) %c) { ; GFX9-LABEL: @canonicalize_v2f16( -; GFX9-NEXT: [[TMP2:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2 -; GFX9-NEXT: [[TMP3:%.*]] = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> [[TMP2]]) -; GFX9-NEXT: store <2 x half> [[TMP3]], ptr addrspace(3) [[C:%.*]], align 2 +; GFX9-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2 +; GFX9-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> [[TMP1]]) +; GFX9-NEXT: store <2 x half> [[TMP2]], ptr addrspace(3) [[C:%.*]], align 2 ; GFX9-NEXT: ret void ; ; VI-LABEL: @canonicalize_v2f16( diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll --- a/llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll @@ -208,11 +208,13 @@ ; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 ; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.exp.f32(float [[VECEXT_1]]) ; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.exp.v2f32(<2 x float> [[TMP3]]) -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> -; CHECK-NEXT: ret <4 x float> [[VECINS_31]] +; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 +; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.exp.f32(float [[VECEXT_2]]) +; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 +; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.exp.f32(float [[VECEXT_3]]) +; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; CHECK-NEXT: ret <4 x float> [[VECINS_3]] ; ; DEFAULT-LABEL: define <4 x float> @int_exp_4x ; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] { @@ -224,11 +226,13 @@ ; DEFAULT-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 ; DEFAULT-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.exp.f32(float [[VECEXT_1]]) ; DEFAULT-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 -; DEFAULT-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> -; DEFAULT-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.exp.v2f32(<2 x float> [[TMP3]]) -; DEFAULT-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> -; DEFAULT-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> -; DEFAULT-NEXT: ret <4 x float> [[VECINS_31]] +; DEFAULT-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 +; DEFAULT-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.exp.f32(float [[VECEXT_2]]) +; DEFAULT-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; DEFAULT-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 +; DEFAULT-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.exp.f32(float [[VECEXT_3]]) +; DEFAULT-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; DEFAULT-NEXT: ret <4 x float> [[VECINS_3]] ; entry: %0 = load <4 x float>, ptr %a, align 16 @@ -314,11 +318,13 @@ ; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 ; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.log.f32(float [[VECEXT_1]]) ; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.log.v2f32(<2 x float> [[TMP3]]) -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> -; CHECK-NEXT: ret <4 x float> [[VECINS_31]] +; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 +; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.log.f32(float [[VECEXT_2]]) +; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 +; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.log.f32(float [[VECEXT_3]]) +; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; CHECK-NEXT: ret <4 x float> [[VECINS_3]] ; ; DEFAULT-LABEL: define <4 x float> @int_log_4x ; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] { @@ -330,11 +336,13 @@ ; DEFAULT-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 ; DEFAULT-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.log.f32(float [[VECEXT_1]]) ; DEFAULT-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 -; DEFAULT-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> -; DEFAULT-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.log.v2f32(<2 x float> [[TMP3]]) -; DEFAULT-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> -; DEFAULT-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> -; DEFAULT-NEXT: ret <4 x float> [[VECINS_31]] +; DEFAULT-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 +; DEFAULT-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.log.f32(float [[VECEXT_2]]) +; DEFAULT-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; DEFAULT-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 +; DEFAULT-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.log.f32(float [[VECEXT_3]]) +; DEFAULT-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; DEFAULT-NEXT: ret <4 x float> [[VECINS_3]] ; entry: %0 = load <4 x float>, ptr %a, align 16 @@ -420,11 +428,13 @@ ; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 ; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]]) ; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP3]]) -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> -; CHECK-NEXT: ret <4 x float> [[VECINS_31]] +; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 +; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_2]]) +; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 +; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_3]]) +; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; CHECK-NEXT: ret <4 x float> [[VECINS_3]] ; ; DEFAULT-LABEL: define <4 x float> @int_sin_4x ; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] { @@ -436,11 +446,13 @@ ; DEFAULT-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 ; DEFAULT-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]]) ; DEFAULT-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 -; DEFAULT-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> -; DEFAULT-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP3]]) -; DEFAULT-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> -; DEFAULT-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> -; DEFAULT-NEXT: ret <4 x float> [[VECINS_31]] +; DEFAULT-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 +; DEFAULT-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_2]]) +; DEFAULT-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2 +; DEFAULT-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 +; DEFAULT-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_3]]) +; DEFAULT-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3 +; DEFAULT-NEXT: ret <4 x float> [[VECINS_3]] ; entry: %0 = load <4 x float>, ptr %a, align 16 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-div-undef.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-div-undef.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-div-undef.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-div-undef.ll @@ -3,7 +3,21 @@ define <8 x i32> @sdiv_v8i32_undefs(<8 x i32> %a) { ; CHECK-LABEL: @sdiv_v8i32_undefs( -; CHECK-NEXT: ret <8 x i32> poison +; CHECK-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A:%.*]], i64 1 +; CHECK-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i64 5 +; CHECK-NEXT: [[AB1:%.*]] = sdiv i32 [[A1]], 4 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = sdiv <2 x i32> [[TMP1]], +; CHECK-NEXT: [[AB5:%.*]] = sdiv i32 [[A5]], 4 +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = sdiv <2 x i32> [[TMP3]], +; CHECK-NEXT: [[R1:%.*]] = insertelement <8 x i32> poison, i32 [[AB1]], i64 1 +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <8 x i32> +; CHECK-NEXT: [[R32:%.*]] = shufflevector <8 x i32> [[R1]], <8 x i32> [[TMP5]], <8 x i32> +; CHECK-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R32]], i32 [[AB5]], i64 5 +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> poison, <8 x i32> +; CHECK-NEXT: [[R71:%.*]] = shufflevector <8 x i32> [[R5]], <8 x i32> [[TMP6]], <8 x i32> +; CHECK-NEXT: ret <8 x i32> [[R71]] ; %a0 = extractelement <8 x i32> %a, i32 0 %a1 = extractelement <8 x i32> %a, i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/control-dependence.ll b/llvm/test/Transforms/SLPVectorizer/X86/control-dependence.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/control-dependence.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/control-dependence.ll @@ -8,10 +8,10 @@ ; Base case with no interesting control dependencies define void @test_no_control(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: @test_no_control( -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]] -; CHECK-NEXT: store <2 x i64> [[TMP5]], ptr [[B:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: store <2 x i64> [[TMP3]], ptr [[B:%.*]], align 4 ; CHECK-NEXT: ret void ; %v1 = load i64, ptr %a @@ -34,11 +34,11 @@ ; CHECK-LABEL: @test1( ; CHECK-NEXT: [[C1:%.*]] = load i64, ptr [[C:%.*]], align 4 ; CHECK-NEXT: [[C2:%.*]] = call i64 @may_inf_loop_ro() -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[C1]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[C2]], i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]] -; CHECK-NEXT: store <2 x i64> [[TMP5]], ptr [[B:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> poison, i64 [[C1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[C2]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = add <2 x i64> [[TMP1]], [[TMP3]] +; CHECK-NEXT: store <2 x i64> [[TMP4]], ptr [[B:%.*]], align 4 ; CHECK-NEXT: ret void ; %v1 = load i64, ptr %a @@ -60,11 +60,11 @@ ; CHECK-LABEL: @test2( ; CHECK-NEXT: [[C1:%.*]] = load i64, ptr [[C:%.*]], align 4 ; CHECK-NEXT: [[C2:%.*]] = call i64 @may_inf_loop_ro() -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[C1]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[C2]], i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]] -; CHECK-NEXT: store <2 x i64> [[TMP5]], ptr [[B:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> poison, i64 [[C1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[C2]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = add <2 x i64> [[TMP1]], [[TMP3]] +; CHECK-NEXT: store <2 x i64> [[TMP4]], ptr [[B:%.*]], align 4 ; CHECK-NEXT: ret void ; %c1 = load i64, ptr %c @@ -87,11 +87,11 @@ ; CHECK-LABEL: @test3( ; CHECK-NEXT: [[C1:%.*]] = load i64, ptr [[C:%.*]], align 4 ; CHECK-NEXT: [[C2:%.*]] = call i64 @may_inf_loop_ro() -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[C1]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[C2]], i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]] -; CHECK-NEXT: store <2 x i64> [[TMP5]], ptr [[B:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> poison, i64 [[C1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[C2]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = add <2 x i64> [[TMP1]], [[TMP3]] +; CHECK-NEXT: store <2 x i64> [[TMP4]], ptr [[B:%.*]], align 4 ; CHECK-NEXT: ret void ; %v1 = load i64, ptr %a @@ -113,11 +113,11 @@ ; CHECK-LABEL: @test4( ; CHECK-NEXT: [[C1:%.*]] = load i64, ptr [[C:%.*]], align 4 ; CHECK-NEXT: [[C2:%.*]] = call i64 @may_inf_loop_ro() -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[C1]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[C2]], i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]] -; CHECK-NEXT: store <2 x i64> [[TMP5]], ptr [[B:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> poison, i64 [[C1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[C2]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = add <2 x i64> [[TMP1]], [[TMP3]] +; CHECK-NEXT: store <2 x i64> [[TMP4]], ptr [[B:%.*]], align 4 ; CHECK-NEXT: ret void ; %v1 = load i64, ptr %a @@ -139,11 +139,11 @@ ; CHECK-LABEL: @test5( ; CHECK-NEXT: [[C2:%.*]] = call i64 @may_inf_loop_ro() ; CHECK-NEXT: [[C1:%.*]] = load i64, ptr [[C:%.*]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[C1]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[C2]], i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]] -; CHECK-NEXT: store <2 x i64> [[TMP5]], ptr [[B:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> poison, i64 [[C1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[C2]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = add <2 x i64> [[TMP1]], [[TMP3]] +; CHECK-NEXT: store <2 x i64> [[TMP4]], ptr [[B:%.*]], align 4 ; CHECK-NEXT: ret void ; %a2 = getelementptr i64, ptr %a, i32 1 @@ -164,10 +164,10 @@ define void @test6(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: @test6( ; CHECK-NEXT: [[TMP1:%.*]] = call i64 @may_inf_loop_ro() -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 4 -; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i64> [[TMP3]], [[TMP5]] -; CHECK-NEXT: store <2 x i64> [[TMP6]], ptr [[B:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = add <2 x i64> [[TMP2]], [[TMP3]] +; CHECK-NEXT: store <2 x i64> [[TMP4]], ptr [[B:%.*]], align 4 ; CHECK-NEXT: ret void ; %v1 = load i64, ptr %a @@ -200,11 +200,11 @@ ; CHECK-NEXT: store i64 0, ptr [[A]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = call i64 @may_inf_loop_ro() ; CHECK-NEXT: [[V2:%.*]] = load i64, ptr [[A2]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> poison, i64 [[V1]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[V2]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i64> [[TMP5]], [[TMP3]] -; CHECK-NEXT: store <2 x i64> [[TMP6]], ptr [[B:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[V1]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[V2]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP4]], [[TMP2]] +; CHECK-NEXT: store <2 x i64> [[TMP5]], ptr [[B:%.*]], align 4 ; CHECK-NEXT: ret void ; %v1 = load i64, ptr %a @@ -233,11 +233,11 @@ ; CHECK-NEXT: store i64 0, ptr [[A]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = call i64 @may_throw() #[[ATTR4:[0-9]+]] ; CHECK-NEXT: [[V2:%.*]] = load i64, ptr [[A2]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> poison, i64 [[V1]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[V2]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i64> [[TMP5]], [[TMP3]] -; CHECK-NEXT: store <2 x i64> [[TMP6]], ptr [[B:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[V1]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[V2]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP4]], [[TMP2]] +; CHECK-NEXT: store <2 x i64> [[TMP5]], ptr [[B:%.*]], align 4 ; CHECK-NEXT: ret void ; %v1 = load i64, ptr %a @@ -266,11 +266,11 @@ ; CHECK-NEXT: store i64 0, ptr [[A]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = call i64 @may_throw() ; CHECK-NEXT: [[V2:%.*]] = load i64, ptr [[A2]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> poison, i64 [[V1]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[V2]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i64> [[TMP5]], [[TMP3]] -; CHECK-NEXT: store <2 x i64> [[TMP6]], ptr [[B:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[V1]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[V2]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP4]], [[TMP2]] +; CHECK-NEXT: store <2 x i64> [[TMP5]], ptr [[B:%.*]], align 4 ; CHECK-NEXT: ret void ; %v1 = load i64, ptr %a @@ -294,18 +294,18 @@ ; A variant of test7 which shows the same problem with a non-load instruction define void @test10(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: @test10( -; CHECK-NEXT: [[V1:%.*]] = load i64, ptr [[A:%.*]], align 4 -; CHECK-NEXT: [[A2:%.*]] = getelementptr i64, ptr [[A]], i32 1 -; CHECK-NEXT: [[V2:%.*]] = load i64, ptr [[A2]], align 4 -; CHECK-NEXT: [[U1:%.*]] = udiv i64 200, [[V1]] +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0 +; CHECK-NEXT: [[U1:%.*]] = udiv i64 200, [[TMP2]] ; CHECK-NEXT: store i64 [[U1]], ptr [[A]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = call i64 @may_inf_loop_ro() -; CHECK-NEXT: [[U2:%.*]] = udiv i64 200, [[V2]] -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> poison, i64 [[U1]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[U2]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i64> [[TMP5]], [[TMP3]] -; CHECK-NEXT: store <2 x i64> [[TMP6]], ptr [[B:%.*]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = call i64 @may_inf_loop_ro() +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1 +; CHECK-NEXT: [[U2:%.*]] = udiv i64 200, [[TMP4]] +; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> poison, i64 [[U1]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i64> [[TMP6]], i64 [[U2]], i32 1 +; CHECK-NEXT: [[TMP8:%.*]] = add <2 x i64> [[TMP7]], [[TMP5]] +; CHECK-NEXT: store <2 x i64> [[TMP8]], ptr [[B:%.*]], align 4 ; CHECK-NEXT: ret void ; %v1 = load i64, ptr %a @@ -337,11 +337,11 @@ ; CHECK-NEXT: store i64 [[U1]], ptr [[B:%.*]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = call i64 @may_inf_loop_ro() ; CHECK-NEXT: [[U2:%.*]] = udiv i64 200, [[Y:%.*]] -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> poison, i64 [[U1]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[U2]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i64> [[TMP5]], [[TMP3]] -; CHECK-NEXT: store <2 x i64> [[TMP6]], ptr [[B]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[U1]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[U2]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP4]], [[TMP2]] +; CHECK-NEXT: store <2 x i64> [[TMP5]], ptr [[B]], align 4 ; CHECK-NEXT: ret void ; %u1 = udiv i64 200, %x diff --git a/llvm/test/Transforms/SLPVectorizer/X86/multi-nodes-to-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/multi-nodes-to-shuffle.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/multi-nodes-to-shuffle.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/multi-nodes-to-shuffle.ll @@ -10,19 +10,26 @@ ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[P3:%.*]], i32 3 ; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i64> [[TMP3]], [[TMP3]] ; CHECK-NEXT: [[TMP5:%.*]] = mul <4 x i64> [[TMP3]], [[TMP3]] -; CHECK-NEXT: [[TMP6:%.*]] = sdiv <4 x i64> [[TMP3]], [[TMP3]] -; CHECK-NEXT: [[TMP7:%.*]] = sub <4 x i64> [[TMP5]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = shl <4 x i64> [[TMP4]], [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP5]], <4 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> [[TMP6]], <4 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP5]], <4 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP11]], <4 x i64> [[TMP6]], <4 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = or <4 x i64> [[TMP10]], [[TMP12]] -; CHECK-NEXT: [[TMP14:%.*]] = trunc <4 x i64> [[TMP13]] to <4 x i32> +; CHECK-NEXT: [[D0:%.*]] = sdiv i64 [[P0]], [[P0]] +; CHECK-NEXT: [[D1:%.*]] = sdiv i64 [[P1]], [[P1]] +; CHECK-NEXT: [[D2:%.*]] = sdiv i64 [[P2]], [[P2]] +; CHECK-NEXT: [[D3:%.*]] = sdiv i64 [[P3]], [[P3]] +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i64> poison, i64 [[D0]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i64> [[TMP6]], i64 [[D1]], i32 1 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i64> [[TMP7]], i64 [[D2]], i32 2 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i64> [[TMP8]], i64 [[D3]], i32 3 +; CHECK-NEXT: [[TMP10:%.*]] = sub <4 x i64> [[TMP5]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = shl <4 x i64> [[TMP4]], [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP5]], <4 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i64> [[TMP12]], i64 [[D0]], i32 2 +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP5]], <4 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <4 x i64> [[TMP14]], i64 [[D1]], i32 2 +; CHECK-NEXT: [[TMP16:%.*]] = or <4 x i64> [[TMP13]], [[TMP15]] +; CHECK-NEXT: [[TMP17:%.*]] = trunc <4 x i64> [[TMP16]] to <4 x i32> ; CHECK-NEXT: br label [[BB:%.*]] ; CHECK: bb: -; CHECK-NEXT: [[TMP15:%.*]] = phi <4 x i32> [ [[TMP16:%.*]], [[BB]] ], [ [[TMP14]], [[ENTRY:%.*]] ] -; CHECK-NEXT: [[TMP16]] = trunc <4 x i64> [[TMP8]] to <4 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = phi <4 x i32> [ [[TMP19:%.*]], [[BB]] ], [ [[TMP17]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[TMP19]] = trunc <4 x i64> [[TMP11]] to <4 x i32> ; CHECK-NEXT: br label [[BB]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/sin-sqrt.ll b/llvm/test/Transforms/SLPVectorizer/X86/sin-sqrt.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/sin-sqrt.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/sin-sqrt.ll @@ -9,30 +9,32 @@ define void @test() { ; CHECK-LABEL: @test( -; CHECK-NEXT: [[A0:%.*]] = load double, ptr @src, align 8 -; CHECK-NEXT: [[A1:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 1), align 8 ; CHECK-NEXT: [[A2:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 2), align 8 ; CHECK-NEXT: [[A3:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 3), align 8 -; CHECK-NEXT: [[A4:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 4), align 8 -; CHECK-NEXT: [[A5:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 5), align 8 ; CHECK-NEXT: [[A6:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 6), align 8 ; CHECK-NEXT: [[A7:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 7), align 8 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A2]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[A6]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = call fast <2 x double> @llvm.sin.v2f64(<2 x double> [[TMP2]]) -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[A3]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[A7]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = call fast <2 x double> @llvm.sin.v2f64(<2 x double> [[TMP5]]) -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> poison, double [[A0]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[A4]], i32 1 -; CHECK-NEXT: [[TMP9:%.*]] = call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP8]]) -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x double> poison, double [[A1]], i32 0 -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x double> [[TMP10]], double [[A5]], i32 1 -; CHECK-NEXT: [[TMP12:%.*]] = call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP11]]) -; CHECK-NEXT: [[TMP13:%.*]] = fadd fast <2 x double> [[TMP9]], [[TMP6]] -; CHECK-NEXT: [[TMP14:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP12]] -; CHECK-NEXT: [[TMP15:%.*]] = fadd fast <2 x double> [[TMP13]], [[TMP14]] -; CHECK-NEXT: store <2 x double> [[TMP15]], ptr @dst, align 8 +; CHECK-NEXT: [[SIN0:%.*]] = call fast double @llvm.sin.f64(double [[A2]]) +; CHECK-NEXT: [[SIN1:%.*]] = call fast double @llvm.sin.f64(double [[A3]]) +; CHECK-NEXT: [[SIN2:%.*]] = call fast double @llvm.sin.f64(double [[A6]]) +; CHECK-NEXT: [[SIN3:%.*]] = call fast double @llvm.sin.f64(double [[A7]]) +; CHECK-NEXT: [[A0:%.*]] = load double, ptr @src, align 8 +; CHECK-NEXT: [[A1:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 1), align 8 +; CHECK-NEXT: [[A4:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 4), align 8 +; CHECK-NEXT: [[A5:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 5), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A0]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[A4]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP2]]) +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[A1]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[A5]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP5]]) +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> poison, double [[SIN1]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[SIN3]], i32 1 +; CHECK-NEXT: [[TMP9:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x double> poison, double [[SIN0]], i32 0 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x double> [[TMP10]], double [[SIN2]], i32 1 +; CHECK-NEXT: [[TMP12:%.*]] = fadd fast <2 x double> [[TMP11]], [[TMP6]] +; CHECK-NEXT: [[TMP13:%.*]] = fadd fast <2 x double> [[TMP9]], [[TMP12]] +; CHECK-NEXT: store <2 x double> [[TMP13]], ptr @dst, align 8 ; CHECK-NEXT: ret void ; %a0 = load double, ptr @src, align 8