Index: lib/Target/AArch64/AArch64Subtarget.h =================================================================== --- lib/Target/AArch64/AArch64Subtarget.h +++ lib/Target/AArch64/AArch64Subtarget.h @@ -106,6 +106,7 @@ unsigned PrefFunctionAlignment = 0; unsigned PrefLoopAlignment = 0; unsigned MaxJumpTableSize = 0; + unsigned WideningBaseCost = 0; // ReserveX18 - X18 is not available as a general purpose register. bool ReserveX18; @@ -228,6 +229,8 @@ unsigned getMaximumJumpTableSize() const { return MaxJumpTableSize; } + unsigned getWideningBaseCost() const { return WideningBaseCost; } + /// CPU has TBI (top byte of addresses is ignored during HW address /// translation) and OS enables it. bool supportsAddressTopByteIgnored() const; Index: lib/Target/AArch64/AArch64TargetTransformInfo.h =================================================================== --- lib/Target/AArch64/AArch64TargetTransformInfo.h +++ lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -43,6 +43,9 @@ VECTOR_LDST_FOUR_ELEMENTS }; + bool isWideningInstruction(Type *Ty, unsigned Opcode, + ArrayRef<const Value *> Args); + public: explicit AArch64TTIImpl(const AArch64TargetMachine *TM, const Function &F) : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)), Index: lib/Target/AArch64/AArch64TargetTransformInfo.cpp =================================================================== --- lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -176,11 +176,95 @@ return TTI::PSK_Software; } +bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode, + ArrayRef<const Value *> Args) { + + // A helper that returns a vector type from the given type. The number of + // elements in DstTy determines the vector width. + auto toVectorTy = [&](Type *ArgTy) { + return VectorType::get(ArgTy->getScalarType(), + DstTy->getVectorNumElements()); + }; + + // Exit early if DstTy is not a vector type whose elements are at least + // 16-bits wide. 
+ if (!DstTy->isVectorTy() || DstTy->getScalarSizeInBits() < 16) + return false; + + // Determine if the operation has a widening variant. We consider both the + // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the + // instructions. + // + // TODO: Add additional widening operations (e.g., mul, shl, etc.) once we + // verify that their extending operands are eliminated during code + // generation. + switch (Opcode) { + case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2). + case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2). + break; + default: + return false; + } + + // To be a widening instruction (either the "wide" or "long" versions), the + // second operand must be a sign- or zero-extend having a single user. We + // only consider extends having a single user because they may otherwise not + // be eliminated. + if (Args.size() != 2 || + (!isa<SExtInst>(Args[1]) && !isa<ZExtInst>(Args[1])) || + !Args[1]->hasOneUse()) + return false; + auto *Extend = cast<CastInst>(Args[1]); + + // Legalize the destination type and ensure it can be used in a widening + // operation. + auto DstTyL = TLI->getTypeLegalizationCost(DL, DstTy); + unsigned DstElTySize = DstTyL.second.getScalarSizeInBits(); + if (!DstTyL.second.isVector() || DstElTySize != DstTy->getScalarSizeInBits()) + return false; + + // Legalize the source type and ensure it can be used in a widening + // operation. + Type *SrcTy = toVectorTy(Extend->getSrcTy()); + auto SrcTyL = TLI->getTypeLegalizationCost(DL, SrcTy); + unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits(); + if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits()) + return false; + + // Get the total number of vector elements in the legalized types. 
+ unsigned NumDstEls = DstTyL.first * DstTyL.second.getVectorNumElements(); + unsigned NumSrcEls = SrcTyL.first * SrcTyL.second.getVectorNumElements(); + + // Return true if the legalized types have the same number of vector elements + // and the destination element type size is twice that of the source type. + return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstElTySize; +} + int AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, const Instruction *I) { int ISD = TLI->InstructionOpcodeToISD(Opcode); assert(ISD && "Invalid opcode"); + // If the cast is observable, and it is used by a widening instruction (e.g., + // uaddl, saddw, etc.), it may be free. + if (I && I->hasOneUse()) { + auto *SingleUser = cast<Instruction>(*I->user_begin()); + SmallVector<const Value *, 4> Operands(SingleUser->operand_values()); + if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands)) { + // If the cast is the second operand, it is free. We will generate either + // a "wide" or "long" version of the widening instruction. + if (I == SingleUser->getOperand(1)) + return 0; + // If the cast is not the second operand, it will be free if it looks the + // same as the second operand. In this case, we will generate a "long" + // version of the widening instruction. + if (auto *Cast = dyn_cast<CastInst>(SingleUser->getOperand(1))) + if (I->getOpcode() == Cast->getOpcode() && + cast<CastInst>(I)->getSrcTy() == Cast->getSrcTy()) + return 0; + } + } + EVT SrcTy = TLI->getValueType(DL, Src); EVT DstTy = TLI->getValueType(DL, Dst); @@ -379,6 +463,16 @@ // Legalize the type. std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); + // If the instruction is a widening instruction (e.g., uaddl, saddw, etc.), + // add in the widening overhead specified by the sub-target. Since the + // extends feeding widening instructions are performed automatically, they + // aren't present in the generated code and have a zero cost. 
By adding a + // widening overhead here, we attach the total cost of the combined operation + // to the widening instruction. + int Cost = 0; + if (isWideningInstruction(Ty, Opcode, Args)) + Cost += ST->getWideningBaseCost(); + int ISD = TLI->InstructionOpcodeToISD(Opcode); if (ISD == ISD::SDIV && @@ -388,9 +482,9 @@ // normally expanded to the sequence ADD + CMP + SELECT + SRA. // The OperandValue properties many not be same as that of previous // operation; conservatively assume OP_None. - int Cost = getArithmeticInstrCost(Instruction::Add, Ty, Opd1Info, Opd2Info, - TargetTransformInfo::OP_None, - TargetTransformInfo::OP_None); + Cost += getArithmeticInstrCost(Instruction::Add, Ty, Opd1Info, Opd2Info, + TargetTransformInfo::OP_None, + TargetTransformInfo::OP_None); Cost += getArithmeticInstrCost(Instruction::Sub, Ty, Opd1Info, Opd2Info, TargetTransformInfo::OP_None, TargetTransformInfo::OP_None); @@ -405,8 +499,8 @@ switch (ISD) { default: - return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info, - Opd1PropInfo, Opd2PropInfo); + return Cost + BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info, + Opd1PropInfo, Opd2PropInfo); case ISD::ADD: case ISD::MUL: case ISD::XOR: @@ -414,7 +508,7 @@ case ISD::AND: // These nodes are marked as 'custom' for combining purposes only. // We know that they are legal. See LowerAdd in ISelLowering. 
- return 1 * LT.first; + return (Cost + 1) * LT.first; } } Index: lib/Transforms/Vectorize/SLPVectorizer.cpp =================================================================== --- lib/Transforms/Vectorize/SLPVectorizer.cpp +++ lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -1835,11 +1835,13 @@ CInt->getValue().isPowerOf2()) Op2VP = TargetTransformInfo::OP_PowerOf2; - int ScalarCost = VecTy->getNumElements() * - TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK, - Op2VK, Op1VP, Op2VP); + SmallVector<const Value *, 4> Operands(VL0->operand_values()); + int ScalarCost = + VecTy->getNumElements() * + TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK, Op2VK, Op1VP, + Op2VP, Operands); int VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy, Op1VK, Op2VK, - Op1VP, Op2VP); + Op1VP, Op2VP, Operands); return VecCost - ScalarCost; } case Instruction::GetElementPtr: { Index: test/Analysis/CostModel/AArch64/free-widening-casts.ll =================================================================== --- /dev/null +++ test/Analysis/CostModel/AArch64/free-widening-casts.ll @@ -0,0 +1,622 @@ +; RUN: opt < %s -mtriple=aarch64--linux-gnu -cost-model -analyze | FileCheck %s --check-prefix=COST +; RUN: llc < %s -mtriple=aarch64--linux-gnu | FileCheck %s --check-prefix=CODE + +; COST-LABEL: uaddl_8h +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <8 x i8> %a to <8 x i16> +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = zext <8 x i8> %b to <8 x i16> +; CODE-LABEL: uaddl_8h +; CODE: uaddl v0.8h, v0.8b, v1.8b +define <8 x i16> @uaddl_8h(<8 x i8> %a, <8 x i8> %b) { + %tmp0 = zext <8 x i8> %a to <8 x i16> + %tmp1 = zext <8 x i8> %b to <8 x i16> + %tmp2 = add <8 x i16> %tmp0, %tmp1 + ret <8 x i16> %tmp2 +} + +; COST-LABEL: uaddl_4s +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <4 x i16> %a to <4 x i32> +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = zext <4 x i16> %b to 
<4 x i32> +; CODE-LABEL: uaddl_4s +; CODE: uaddl v0.4s, v0.4h, v1.4h +define <4 x i32> @uaddl_4s(<4 x i16> %a, <4 x i16> %b) { + %tmp0 = zext <4 x i16> %a to <4 x i32> + %tmp1 = zext <4 x i16> %b to <4 x i32> + %tmp2 = add <4 x i32> %tmp0, %tmp1 + ret <4 x i32> %tmp2 +} + +; COST-LABEL: uaddl_2d +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <2 x i32> %a to <2 x i64> +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = zext <2 x i32> %b to <2 x i64> +; CODE-LABEL: uaddl_2d +; CODE: uaddl v0.2d, v0.2s, v1.2s +define <2 x i64> @uaddl_2d(<2 x i32> %a, <2 x i32> %b) { + %tmp0 = zext <2 x i32> %a to <2 x i64> + %tmp1 = zext <2 x i32> %b to <2 x i64> + %tmp2 = add <2 x i64> %tmp0, %tmp1 + ret <2 x i64> %tmp2 +} + +; COST-LABEL: uaddl2_8h +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <16 x i8> %a to <16 x i16> +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = zext <16 x i8> %b to <16 x i16> +; CODE-LABEL: uaddl2_8h +; CODE: uaddl2 v2.8h, v0.16b, v1.16b +; CODE-NEXT: uaddl v0.8h, v0.8b, v1.8b +define <16 x i16> @uaddl2_8h(<16 x i8> %a, <16 x i8> %b) { + %tmp0 = zext <16 x i8> %a to <16 x i16> + %tmp1 = zext <16 x i8> %b to <16 x i16> + %tmp2 = add <16 x i16> %tmp0, %tmp1 + ret <16 x i16> %tmp2 +} + +; COST-LABEL: uaddl2_4s +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <8 x i16> %a to <8 x i32> +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = zext <8 x i16> %b to <8 x i32> +; CODE-LABEL: uaddl2_4s +; CODE: uaddl2 v2.4s, v0.8h, v1.8h +; CODE-NEXT: uaddl v0.4s, v0.4h, v1.4h +define <8 x i32> @uaddl2_4s(<8 x i16> %a, <8 x i16> %b) { + %tmp0 = zext <8 x i16> %a to <8 x i32> + %tmp1 = zext <8 x i16> %b to <8 x i32> + %tmp2 = add <8 x i32> %tmp0, %tmp1 + ret <8 x i32> %tmp2 +} + +; COST-LABEL: uaddl2_2d +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 
= zext <4 x i32> %a to <4 x i64> +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = zext <4 x i32> %b to <4 x i64> +; CODE-LABEL: uaddl2_2d +; CODE: uaddl2 v2.2d, v0.4s, v1.4s +; CODE-NEXT: uaddl v0.2d, v0.2s, v1.2s +define <4 x i64> @uaddl2_2d(<4 x i32> %a, <4 x i32> %b) { + %tmp0 = zext <4 x i32> %a to <4 x i64> + %tmp1 = zext <4 x i32> %b to <4 x i64> + %tmp2 = add <4 x i64> %tmp0, %tmp1 + ret <4 x i64> %tmp2 +} + +; COST-LABEL: saddl_8h +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <8 x i8> %a to <8 x i16> +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = sext <8 x i8> %b to <8 x i16> +; CODE-LABEL: saddl_8h +; CODE: saddl v0.8h, v0.8b, v1.8b +define <8 x i16> @saddl_8h(<8 x i8> %a, <8 x i8> %b) { + %tmp0 = sext <8 x i8> %a to <8 x i16> + %tmp1 = sext <8 x i8> %b to <8 x i16> + %tmp2 = add <8 x i16> %tmp0, %tmp1 + ret <8 x i16> %tmp2 +} + +; COST-LABEL: saddl_4s +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <4 x i16> %a to <4 x i32> +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = sext <4 x i16> %b to <4 x i32> +; CODE-LABEL: saddl_4s +; CODE: saddl v0.4s, v0.4h, v1.4h +define <4 x i32> @saddl_4s(<4 x i16> %a, <4 x i16> %b) { + %tmp0 = sext <4 x i16> %a to <4 x i32> + %tmp1 = sext <4 x i16> %b to <4 x i32> + %tmp2 = add <4 x i32> %tmp0, %tmp1 + ret <4 x i32> %tmp2 +} + +; COST-LABEL: saddl_2d +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <2 x i32> %a to <2 x i64> +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = sext <2 x i32> %b to <2 x i64> +; CODE-LABEL: saddl_2d +; CODE: saddl v0.2d, v0.2s, v1.2s +define <2 x i64> @saddl_2d(<2 x i32> %a, <2 x i32> %b) { + %tmp0 = sext <2 x i32> %a to <2 x i64> + %tmp1 = sext <2 x i32> %b to <2 x i64> + %tmp2 = add <2 x i64> %tmp0, %tmp1 + ret <2 x i64> %tmp2 +} + +; COST-LABEL: saddl2_8h +; 
COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <16 x i8> %a to <16 x i16> +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = sext <16 x i8> %b to <16 x i16> +; CODE-LABEL: saddl2_8h +; CODE: saddl2 v2.8h, v0.16b, v1.16b +; CODE-NEXT: saddl v0.8h, v0.8b, v1.8b +define <16 x i16> @saddl2_8h(<16 x i8> %a, <16 x i8> %b) { + %tmp0 = sext <16 x i8> %a to <16 x i16> + %tmp1 = sext <16 x i8> %b to <16 x i16> + %tmp2 = add <16 x i16> %tmp0, %tmp1 + ret <16 x i16> %tmp2 +} + +; COST-LABEL: saddl2_4s +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <8 x i16> %a to <8 x i32> +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = sext <8 x i16> %b to <8 x i32> +; CODE-LABEL: saddl2_4s +; CODE: saddl2 v2.4s, v0.8h, v1.8h +; CODE-NEXT: saddl v0.4s, v0.4h, v1.4h +define <8 x i32> @saddl2_4s(<8 x i16> %a, <8 x i16> %b) { + %tmp0 = sext <8 x i16> %a to <8 x i32> + %tmp1 = sext <8 x i16> %b to <8 x i32> + %tmp2 = add <8 x i32> %tmp0, %tmp1 + ret <8 x i32> %tmp2 +} + +; COST-LABEL: saddl2_2d +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <4 x i32> %a to <4 x i64> +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = sext <4 x i32> %b to <4 x i64> +; CODE-LABEL: saddl2_2d +; CODE: saddl2 v2.2d, v0.4s, v1.4s +; CODE-NEXT: saddl v0.2d, v0.2s, v1.2s +define <4 x i64> @saddl2_2d(<4 x i32> %a, <4 x i32> %b) { + %tmp0 = sext <4 x i32> %a to <4 x i64> + %tmp1 = sext <4 x i32> %b to <4 x i64> + %tmp2 = add <4 x i64> %tmp0, %tmp1 + ret <4 x i64> %tmp2 +} + +; COST-LABEL: usubl_8h +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <8 x i8> %a to <8 x i16> +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = zext <8 x i8> %b to <8 x i16> +; CODE-LABEL: usubl_8h +; CODE: usubl v0.8h, v0.8b, v1.8b +define <8 x i16> @usubl_8h(<8 x i8> %a, <8 x i8> %b) { 
+ %tmp0 = zext <8 x i8> %a to <8 x i16> + %tmp1 = zext <8 x i8> %b to <8 x i16> + %tmp2 = sub <8 x i16> %tmp0, %tmp1 + ret <8 x i16> %tmp2 +} + +; COST-LABEL: usubl_4s +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <4 x i16> %a to <4 x i32> +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = zext <4 x i16> %b to <4 x i32> +; CODE-LABEL: usubl_4s +; CODE: usubl v0.4s, v0.4h, v1.4h +define <4 x i32> @usubl_4s(<4 x i16> %a, <4 x i16> %b) { + %tmp0 = zext <4 x i16> %a to <4 x i32> + %tmp1 = zext <4 x i16> %b to <4 x i32> + %tmp2 = sub <4 x i32> %tmp0, %tmp1 + ret <4 x i32> %tmp2 +} + +; COST-LABEL: usubl_2d +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <2 x i32> %a to <2 x i64> +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = zext <2 x i32> %b to <2 x i64> +; CODE-LABEL: usubl_2d +; CODE: usubl v0.2d, v0.2s, v1.2s +define <2 x i64> @usubl_2d(<2 x i32> %a, <2 x i32> %b) { + %tmp0 = zext <2 x i32> %a to <2 x i64> + %tmp1 = zext <2 x i32> %b to <2 x i64> + %tmp2 = sub <2 x i64> %tmp0, %tmp1 + ret <2 x i64> %tmp2 +} + +; COST-LABEL: usubl2_8h +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <16 x i8> %a to <16 x i16> +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = zext <16 x i8> %b to <16 x i16> +; CODE-LABEL: usubl2_8h +; CODE: usubl2 v2.8h, v0.16b, v1.16b +; CODE-NEXT: usubl v0.8h, v0.8b, v1.8b +define <16 x i16> @usubl2_8h(<16 x i8> %a, <16 x i8> %b) { + %tmp0 = zext <16 x i8> %a to <16 x i16> + %tmp1 = zext <16 x i8> %b to <16 x i16> + %tmp2 = sub <16 x i16> %tmp0, %tmp1 + ret <16 x i16> %tmp2 +} + +; COST-LABEL: usubl2_4s +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <8 x i16> %a to <8 x i32> +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = zext <8 x i16> %b to <8 x i32> +; CODE-LABEL: usubl2_4s +; 
CODE: usubl2 v2.4s, v0.8h, v1.8h +; CODE-NEXT: usubl v0.4s, v0.4h, v1.4h +define <8 x i32> @usubl2_4s(<8 x i16> %a, <8 x i16> %b) { + %tmp0 = zext <8 x i16> %a to <8 x i32> + %tmp1 = zext <8 x i16> %b to <8 x i32> + %tmp2 = sub <8 x i32> %tmp0, %tmp1 + ret <8 x i32> %tmp2 +} + +; COST-LABEL: usubl2_2d +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <4 x i32> %a to <4 x i64> +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = zext <4 x i32> %b to <4 x i64> +; CODE-LABEL: usubl2_2d +; CODE: usubl2 v2.2d, v0.4s, v1.4s +; CODE-NEXT: usubl v0.2d, v0.2s, v1.2s +define <4 x i64> @usubl2_2d(<4 x i32> %a, <4 x i32> %b) { + %tmp0 = zext <4 x i32> %a to <4 x i64> + %tmp1 = zext <4 x i32> %b to <4 x i64> + %tmp2 = sub <4 x i64> %tmp0, %tmp1 + ret <4 x i64> %tmp2 +} + +; COST-LABEL: ssubl_8h +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <8 x i8> %a to <8 x i16> +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = sext <8 x i8> %b to <8 x i16> +; CODE-LABEL: ssubl_8h +; CODE: ssubl v0.8h, v0.8b, v1.8b +define <8 x i16> @ssubl_8h(<8 x i8> %a, <8 x i8> %b) { + %tmp0 = sext <8 x i8> %a to <8 x i16> + %tmp1 = sext <8 x i8> %b to <8 x i16> + %tmp2 = sub <8 x i16> %tmp0, %tmp1 + ret <8 x i16> %tmp2 +} + +; COST-LABEL: ssubl_4s +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <4 x i16> %a to <4 x i32> +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = sext <4 x i16> %b to <4 x i32> +; CODE-LABEL: ssubl_4s +; CODE: ssubl v0.4s, v0.4h, v1.4h +define <4 x i32> @ssubl_4s(<4 x i16> %a, <4 x i16> %b) { + %tmp0 = sext <4 x i16> %a to <4 x i32> + %tmp1 = sext <4 x i16> %b to <4 x i32> + %tmp2 = sub <4 x i32> %tmp0, %tmp1 + ret <4 x i32> %tmp2 +} + +; COST-LABEL: ssubl_2d +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <2 x i32> %a to <2 x i64> +; COST-NEXT: Cost 
Model: Found an estimated cost of 0 for instruction: %tmp1 = sext <2 x i32> %b to <2 x i64> +; CODE-LABEL: ssubl_2d +; CODE: ssubl v0.2d, v0.2s, v1.2s +define <2 x i64> @ssubl_2d(<2 x i32> %a, <2 x i32> %b) { + %tmp0 = sext <2 x i32> %a to <2 x i64> + %tmp1 = sext <2 x i32> %b to <2 x i64> + %tmp2 = sub <2 x i64> %tmp0, %tmp1 + ret <2 x i64> %tmp2 +} + +; COST-LABEL: ssubl2_8h +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <16 x i8> %a to <16 x i16> +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = sext <16 x i8> %b to <16 x i16> +; CODE-LABEL: ssubl2_8h +; CODE: ssubl2 v2.8h, v0.16b, v1.16b +; CODE-NEXT: ssubl v0.8h, v0.8b, v1.8b +define <16 x i16> @ssubl2_8h(<16 x i8> %a, <16 x i8> %b) { + %tmp0 = sext <16 x i8> %a to <16 x i16> + %tmp1 = sext <16 x i8> %b to <16 x i16> + %tmp2 = sub <16 x i16> %tmp0, %tmp1 + ret <16 x i16> %tmp2 +} + +; COST-LABEL: ssubl2_4s +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <8 x i16> %a to <8 x i32> +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = sext <8 x i16> %b to <8 x i32> +; CODE-LABEL: ssubl2_4s +; CODE: ssubl2 v2.4s, v0.8h, v1.8h +; CODE-NEXT: ssubl v0.4s, v0.4h, v1.4h +define <8 x i32> @ssubl2_4s(<8 x i16> %a, <8 x i16> %b) { + %tmp0 = sext <8 x i16> %a to <8 x i32> + %tmp1 = sext <8 x i16> %b to <8 x i32> + %tmp2 = sub <8 x i32> %tmp0, %tmp1 + ret <8 x i32> %tmp2 +} + +; COST-LABEL: ssubl2_2d +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <4 x i32> %a to <4 x i64> +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = sext <4 x i32> %b to <4 x i64> +; CODE-LABEL: ssubl2_2d +; CODE: ssubl2 v2.2d, v0.4s, v1.4s +; CODE-NEXT: ssubl v0.2d, v0.2s, v1.2s +define <4 x i64> @ssubl2_2d(<4 x i32> %a, <4 x i32> %b) { + %tmp0 = sext <4 x i32> %a to <4 x i64> + %tmp1 = sext <4 x i32> %b to <4 x i64> + %tmp2 = sub <4 x i64> %tmp0, %tmp1 + 
ret <4 x i64> %tmp2 +} + +; COST-LABEL: uaddw_8h +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <8 x i8> %a to <8 x i16> +; CODE-LABEL: uaddw_8h +; CODE: uaddw v0.8h, v1.8h, v0.8b +define <8 x i16> @uaddw_8h(<8 x i8> %a, <8 x i16> %b) { + %tmp0 = zext <8 x i8> %a to <8 x i16> + %tmp1 = add <8 x i16> %b, %tmp0 + ret <8 x i16> %tmp1 +} + +; COST-LABEL: uaddw_4s +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <4 x i16> %a to <4 x i32> +; CODE-LABEL: uaddw_4s +; CODE: uaddw v0.4s, v1.4s, v0.4h +define <4 x i32> @uaddw_4s(<4 x i16> %a, <4 x i32> %b) { + %tmp0 = zext <4 x i16> %a to <4 x i32> + %tmp1 = add <4 x i32> %b, %tmp0 + ret <4 x i32> %tmp1 +} + +; COST-LABEL: uaddw_2d +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <2 x i32> %a to <2 x i64> +; CODE-LABEL: uaddw_2d +; CODE: uaddw v0.2d, v1.2d, v0.2s +define <2 x i64> @uaddw_2d(<2 x i32> %a, <2 x i64> %b) { + %tmp0 = zext <2 x i32> %a to <2 x i64> + %tmp1 = add <2 x i64> %b, %tmp0 + ret <2 x i64> %tmp1 +} + +; COST-LABEL: uaddw2_8h +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <16 x i8> %a to <16 x i16> +; CODE-LABEL: uaddw2_8h +; CODE: uaddw2 v2.8h, v2.8h, v0.16b +; CODE-NEXT: uaddw v0.8h, v1.8h, v0.8b +define <16 x i16> @uaddw2_8h(<16 x i8> %a, <16 x i16> %b) { + %tmp0 = zext <16 x i8> %a to <16 x i16> + %tmp1 = add <16 x i16> %b, %tmp0 + ret <16 x i16> %tmp1 +} + +; COST-LABEL: uaddw2_4s +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <8 x i16> %a to <8 x i32> +; CODE-LABEL: uaddw2_4s +; CODE: uaddw2 v2.4s, v2.4s, v0.8h +; CODE-NEXT: uaddw v0.4s, v1.4s, v0.4h +define <8 x i32> @uaddw2_4s(<8 x i16> %a, <8 x i32> %b) { + %tmp0 = zext <8 x i16> %a to <8 x i32> + %tmp1 = add <8 x i32> %b, %tmp0 + ret <8 x i32> %tmp1 +} + +; COST-LABEL: uaddw2_2d +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <4 
x i32> %a to <4 x i64> +; CODE-LABEL: uaddw2_2d +; CODE: uaddw2 v2.2d, v2.2d, v0.4s +; CODE-NEXT: uaddw v0.2d, v1.2d, v0.2s +define <4 x i64> @uaddw2_2d(<4 x i32> %a, <4 x i64> %b) { + %tmp0 = zext <4 x i32> %a to <4 x i64> + %tmp1 = add <4 x i64> %b, %tmp0 + ret <4 x i64> %tmp1 +} + +; COST-LABEL: saddw_8h +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <8 x i8> %a to <8 x i16> +; CODE-LABEL: saddw_8h +; CODE: saddw v0.8h, v1.8h, v0.8b +define <8 x i16> @saddw_8h(<8 x i8> %a, <8 x i16> %b) { + %tmp0 = sext <8 x i8> %a to <8 x i16> + %tmp1 = add <8 x i16> %b, %tmp0 + ret <8 x i16> %tmp1 +} + +; COST-LABEL: saddw_4s +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <4 x i16> %a to <4 x i32> +; CODE-LABEL: saddw_4s +; CODE: saddw v0.4s, v1.4s, v0.4h +define <4 x i32> @saddw_4s(<4 x i16> %a, <4 x i32> %b) { + %tmp0 = sext <4 x i16> %a to <4 x i32> + %tmp1 = add <4 x i32> %b, %tmp0 + ret <4 x i32> %tmp1 +} + +; COST-LABEL: saddw_2d +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <2 x i32> %a to <2 x i64> +; CODE-LABEL: saddw_2d +; CODE: saddw v0.2d, v1.2d, v0.2s +define <2 x i64> @saddw_2d(<2 x i32> %a, <2 x i64> %b) { + %tmp0 = sext <2 x i32> %a to <2 x i64> + %tmp1 = add <2 x i64> %b, %tmp0 + ret <2 x i64> %tmp1 +} + +; COST-LABEL: saddw2_8h +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <16 x i8> %a to <16 x i16> +; CODE-LABEL: saddw2_8h +; CODE: saddw2 v2.8h, v2.8h, v0.16b +; CODE-NEXT: saddw v0.8h, v1.8h, v0.8b +define <16 x i16> @saddw2_8h(<16 x i8> %a, <16 x i16> %b) { + %tmp0 = sext <16 x i8> %a to <16 x i16> + %tmp1 = add <16 x i16> %b, %tmp0 + ret <16 x i16> %tmp1 +} + +; COST-LABEL: saddw2_4s +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <8 x i16> %a to <8 x i32> +; CODE-LABEL: saddw2_4s +; CODE: saddw2 v2.4s, v2.4s, v0.8h +; CODE-NEXT: saddw v0.4s, v1.4s, v0.4h +define <8 x 
i32> @saddw2_4s(<8 x i16> %a, <8 x i32> %b) { + %tmp0 = sext <8 x i16> %a to <8 x i32> + %tmp1 = add <8 x i32> %b, %tmp0 + ret <8 x i32> %tmp1 +} + +; COST-LABEL: saddw2_2d +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <4 x i32> %a to <4 x i64> +; CODE-LABEL: saddw2_2d +; CODE: saddw2 v2.2d, v2.2d, v0.4s +; CODE-NEXT: saddw v0.2d, v1.2d, v0.2s +define <4 x i64> @saddw2_2d(<4 x i32> %a, <4 x i64> %b) { + %tmp0 = sext <4 x i32> %a to <4 x i64> + %tmp1 = add <4 x i64> %b, %tmp0 + ret <4 x i64> %tmp1 +} + +; COST-LABEL: usubw_8h +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <8 x i8> %a to <8 x i16> +; CODE-LABEL: usubw_8h +; CODE: usubw v0.8h, v1.8h, v0.8b +define <8 x i16> @usubw_8h(<8 x i8> %a, <8 x i16> %b) { + %tmp0 = zext <8 x i8> %a to <8 x i16> + %tmp1 = sub <8 x i16> %b, %tmp0 + ret <8 x i16> %tmp1 +} + +; COST-LABEL: usubw_4s +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <4 x i16> %a to <4 x i32> +; CODE-LABEL: usubw_4s +; CODE: usubw v0.4s, v1.4s, v0.4h +define <4 x i32> @usubw_4s(<4 x i16> %a, <4 x i32> %b) { + %tmp0 = zext <4 x i16> %a to <4 x i32> + %tmp1 = sub <4 x i32> %b, %tmp0 + ret <4 x i32> %tmp1 +} + +; COST-LABEL: usubw_2d +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <2 x i32> %a to <2 x i64> +; CODE-LABEL: usubw_2d +; CODE: usubw v0.2d, v1.2d, v0.2s +define <2 x i64> @usubw_2d(<2 x i32> %a, <2 x i64> %b) { + %tmp0 = zext <2 x i32> %a to <2 x i64> + %tmp1 = sub <2 x i64> %b, %tmp0 + ret <2 x i64> %tmp1 +} + +; COST-LABEL: usubw2_8h +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <16 x i8> %a to <16 x i16> +; CODE-LABEL: usubw2_8h +; CODE: usubw2 v2.8h, v2.8h, v0.16b +; CODE-NEXT: usubw v0.8h, v1.8h, v0.8b +define <16 x i16> @usubw2_8h(<16 x i8> %a, <16 x i16> %b) { + %tmp0 = zext <16 x i8> %a to <16 x i16> + %tmp1 = sub <16 x i16> %b, %tmp0 + ret <16 x i16> 
%tmp1 +} + +; COST-LABEL: usubw2_4s +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <8 x i16> %a to <8 x i32> +; CODE-LABEL: usubw2_4s +; CODE: usubw2 v2.4s, v2.4s, v0.8h +; CODE-NEXT: usubw v0.4s, v1.4s, v0.4h +define <8 x i32> @usubw2_4s(<8 x i16> %a, <8 x i32> %b) { + %tmp0 = zext <8 x i16> %a to <8 x i32> + %tmp1 = sub <8 x i32> %b, %tmp0 + ret <8 x i32> %tmp1 +} + +; COST-LABEL: usubw2_2d +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <4 x i32> %a to <4 x i64> +; CODE-LABEL: usubw2_2d +; CODE: usubw2 v2.2d, v2.2d, v0.4s +; CODE-NEXT: usubw v0.2d, v1.2d, v0.2s +define <4 x i64> @usubw2_2d(<4 x i32> %a, <4 x i64> %b) { + %tmp0 = zext <4 x i32> %a to <4 x i64> + %tmp1 = sub <4 x i64> %b, %tmp0 + ret <4 x i64> %tmp1 +} + +; COST-LABEL: ssubw_8h +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <8 x i8> %a to <8 x i16> +; CODE-LABEL: ssubw_8h +; CODE: ssubw v0.8h, v1.8h, v0.8b +define <8 x i16> @ssubw_8h(<8 x i8> %a, <8 x i16> %b) { + %tmp0 = sext <8 x i8> %a to <8 x i16> + %tmp1 = sub <8 x i16> %b, %tmp0 + ret <8 x i16> %tmp1 +} + +; COST-LABEL: ssubw_4s +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <4 x i16> %a to <4 x i32> +; CODE-LABEL: ssubw_4s +; CODE: ssubw v0.4s, v1.4s, v0.4h +define <4 x i32> @ssubw_4s(<4 x i16> %a, <4 x i32> %b) { + %tmp0 = sext <4 x i16> %a to <4 x i32> + %tmp1 = sub <4 x i32> %b, %tmp0 + ret <4 x i32> %tmp1 +} + +; COST-LABEL: ssubw_2d +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <2 x i32> %a to <2 x i64> +; CODE-LABEL: ssubw_2d +; CODE: ssubw v0.2d, v1.2d, v0.2s +define <2 x i64> @ssubw_2d(<2 x i32> %a, <2 x i64> %b) { + %tmp0 = sext <2 x i32> %a to <2 x i64> + %tmp1 = sub <2 x i64> %b, %tmp0 + ret <2 x i64> %tmp1 +} + +; COST-LABEL: ssubw2_8h +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <16 x i8> %a to <16 x 
i16> +; CODE-LABEL: ssubw2_8h +; CODE: ssubw2 v2.8h, v2.8h, v0.16b +; CODE-NEXT: ssubw v0.8h, v1.8h, v0.8b +define <16 x i16> @ssubw2_8h(<16 x i8> %a, <16 x i16> %b) { + %tmp0 = sext <16 x i8> %a to <16 x i16> + %tmp1 = sub <16 x i16> %b, %tmp0 + ret <16 x i16> %tmp1 +} + +; COST-LABEL: ssubw2_4s +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <8 x i16> %a to <8 x i32> +; CODE-LABEL: ssubw2_4s +; CODE: ssubw2 v2.4s, v2.4s, v0.8h +; CODE-NEXT: ssubw v0.4s, v1.4s, v0.4h +define <8 x i32> @ssubw2_4s(<8 x i16> %a, <8 x i32> %b) { + %tmp0 = sext <8 x i16> %a to <8 x i32> + %tmp1 = sub <8 x i32> %b, %tmp0 + ret <8 x i32> %tmp1 +} + +; COST-LABEL: ssubw2_2d +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <4 x i32> %a to <4 x i64> +; CODE-LABEL: ssubw2_2d +; CODE: ssubw2 v2.2d, v2.2d, v0.4s +; CODE-NEXT: ssubw v0.2d, v1.2d, v0.2s +define <4 x i64> @ssubw2_2d(<4 x i32> %a, <4 x i64> %b) { + %tmp0 = sext <4 x i32> %a to <4 x i64> + %tmp1 = sub <4 x i64> %b, %tmp0 + ret <4 x i64> %tmp1 +} + +; COST-LABEL: neg_wrong_operand_order +; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %tmp0 = zext <8 x i8> %a to <8 x i16> +define <8 x i16> @neg_wrong_operand_order(<8 x i8> %a, <8 x i16> %b) { + %tmp0 = zext <8 x i8> %a to <8 x i16> + %tmp1 = sub <8 x i16> %tmp0, %b + ret <8 x i16> %tmp1 +} + +; COST-LABEL: neg_non_widening_op +; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %tmp0 = zext <8 x i8> %a to <8 x i16> +define <8 x i16> @neg_non_widening_op(<8 x i8> %a, <8 x i16> %b) { + %tmp0 = zext <8 x i8> %a to <8 x i16> + %tmp1 = udiv <8 x i16> %b, %tmp0 + ret <8 x i16> %tmp1 +} + +; COST-LABEL: neg_dissimilar_operand_kind_0 +; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %tmp0 = sext <8 x i8> %a to <8 x i16> +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = zext <8 x i8> %b to <8 x i16> +define <8 x i16> 
@neg_dissimilar_operand_kind_0(<8 x i8> %a, <8 x i8> %b) { + %tmp0 = sext <8 x i8> %a to <8 x i16> + %tmp1 = zext <8 x i8> %b to <8 x i16> + %tmp2 = add <8 x i16> %tmp0, %tmp1 + ret <8 x i16> %tmp2 +} + +; COST-LABEL: neg_dissimilar_operand_kind_1 +; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %tmp0 = zext <4 x i8> %a to <4 x i32> +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = zext <4 x i16> %b to <4 x i32> +define <4 x i32> @neg_dissimilar_operand_kind_1(<4 x i8> %a, <4 x i16> %b) { + %tmp0 = zext <4 x i8> %a to <4 x i32> + %tmp1 = zext <4 x i16> %b to <4 x i32> + %tmp2 = add <4 x i32> %tmp0, %tmp1 + ret <4 x i32> %tmp2 +} + +; COST-LABEL: neg_illegal_vector_type_0 +; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %tmp0 = zext <16 x i4> %a to <16 x i8> +define <16 x i8> @neg_illegal_vector_type_0(<16 x i4> %a, <16 x i8> %b) { + %tmp0 = zext <16 x i4> %a to <16 x i8> + %tmp1 = sub <16 x i8> %b, %tmp0 + ret <16 x i8> %tmp1 +} + +; COST-LABEL: neg_illegal_vector_type_1 +; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %tmp0 = zext <1 x i16> %a to <1 x i32> +define <1 x i32> @neg_illegal_vector_type_1(<1 x i16> %a, <1 x i32> %b) { + %tmp0 = zext <1 x i16> %a to <1 x i32> + %tmp1 = add <1 x i32> %b, %tmp0 + ret <1 x i32> %tmp1 +} + +; COST-LABEL: neg_illegal_vector_type_2 +; COST-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %tmp0 = zext <4 x i16> %a to <4 x i64> +define <4 x i64> @neg_illegal_vector_type_2(<4 x i16> %a, <4 x i64> %b) { + %tmp0 = zext <4 x i16> %a to <4 x i64> + %tmp1 = add <4 x i64> %b, %tmp0 + ret <4 x i64> %tmp1 +} + +; COST-LABEL: neg_illegal_vector_type_3 +; COST-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %tmp0 = zext <3 x i34> %a to <3 x i68> +define <3 x i68> @neg_illegal_vector_type_3(<3 x i34> %a, <3 x i68> %b) { + %tmp0 = zext <3 x i34> %a to <3 x i68> + %tmp1 = add <3 x i68> %b, %tmp0 + ret <3 x i68> %tmp1 +}