diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -163,6 +163,9 @@
                                      TTI::TargetCostKind CostKind,
                                      const Instruction *I = nullptr);
 
+  bool isExtPartOfAvgExpr(const Instruction *ExtUser, const CastInst *Ext,
+                          Type *Dst, Type *Src);
+
   InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
                                    TTI::CastContextHint CCH,
                                    TTI::TargetCostKind CostKind,
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -2044,6 +2044,56 @@
   return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstEltSize;
 }
 
+// s/urhadd instructions implement the following pattern, making the
+// extends free:
+//   %x = add ((zext i8 -> i16), 1)
+//   %y = (zext i8 -> i16)
+//   trunc i16 (lshr (add %x, %y), 1) -> i8
+//
+bool AArch64TTIImpl::isExtPartOfAvgExpr(const Instruction *ExtUser,
+                                        const CastInst *Ext, Type *Dst,
+                                        Type *Src) {
+  // The source should be a legal vector type.
+  if (!Src->isVectorTy() || !TLI->isTypeLegal(TLI->getValueType(DL, Src)) ||
+      (Src->isScalableTy() && !ST->hasSVE2()))
+    return false;
+
+  if (ExtUser->getOpcode() != Instruction::Add || !ExtUser->hasOneUse())
+    return false;
+
+  // Look for trunc/lshr/add before trying to match the pattern.
+  const Instruction *Add = ExtUser;
+  auto *AddUser =
+      dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
+  if (AddUser && AddUser->getOpcode() == Instruction::Add)
+    Add = AddUser;
+
+  auto *Shr = dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
+  if (!Shr || Shr->getOpcode() != Instruction::LShr)
+    return false;
+
+  auto *Trunc = dyn_cast_or_null<Instruction>(Shr->getUniqueUndroppableUser());
+  if (!Trunc || Trunc->getOpcode() != Instruction::Trunc ||
+      Src->getScalarSizeInBits() !=
+          cast<CastInst>(Trunc)->getDestTy()->getScalarSizeInBits())
+    return false;
+
+  // Try to match the whole pattern. Ext could be either the first or second
+  // extend matched by m_Instruction below.
+  Instruction *Ex1, *Ex2;
+  if (!(match(Add, m_c_Add(m_Instruction(Ex1),
+                           m_c_Add(m_Instruction(Ex2), m_SpecificInt(1))))))
+    return false;
+
+  // Ensure both extends are of the same kind (both sext or both zext).
+  if (match(Ex1, m_ZExtOrSExt(m_Value())) &&
+      Ex1->getOpcode() == Ex2->getOpcode())
+    return true;
+
+  return false;
+}
+
 InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
                                                  Type *Src,
                                                  TTI::CastContextHint CCH,
@@ -2068,6 +2118,11 @@
     } else // Others are free so long as isWideningInstruction returned true.
       return 0;
   }
+
+  // The cast will be free for the s/urhadd instructions.
+  if ((isa<SExtInst>(I) || isa<ZExtInst>(I)) &&
+      isExtPartOfAvgExpr(SingleUser, cast<CastInst>(I), Dst, Src))
+    return 0;
 }
 
 // TODO: Allow non-throughput costs that aren't binary.
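For context, the chain matched above is what the vectorizer typically emits for a rounding-average idiom. A minimal sketch in C (function name and types are illustrative, not part of the patch): after vectorization and type shrinking, the loop body below becomes the zext/add 1/add/lshr 1/trunc sequence that isExtPartOfAvgExpr recognizes, which on AArch64 can lower to urhadd (srhadd for the sext form), folding the extends away:

void rounding_avg_u8(unsigned char *dst, const unsigned char *a,
                     const unsigned char *b, int n) {
  /* Each element computes (a[i] + b[i] + 1) >> 1. The operands are
     widened before the adds and narrowed after the shift, so the sum
     cannot overflow -- exactly the semantics of urhadd. */
  for (int i = 0; i != n; ++i)
    dst[i] = (unsigned char)((a[i] + b[i] + 1) >> 1);
}
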
diff --git a/llvm/test/Analysis/CostModel/AArch64/ext-rhadd.ll b/llvm/test/Analysis/CostModel/AArch64/ext-rhadd.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/AArch64/ext-rhadd.ll
@@ -0,0 +1,201 @@
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s --check-prefix=SVE
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple aarch64-linux-gnu -mattr=+sve2 < %s | FileCheck %s --check-prefix=SVE2
+
+; SRHADD
+
+define void @srhadd_i8_sext_i16_fixed(ptr %a, ptr %b, ptr %dst) {
+; SVE-LABEL: 'srhadd_i8_sext_i16_fixed'
+; SVE: Cost Model: Found an estimated cost of 0 for instruction: %ext1 = sext <16 x i8> %ld1 to <16 x i16>
+; SVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %ext2 = sext <16 x i8> %ld2 to <16 x i16>
+;
+; SVE2-LABEL: 'srhadd_i8_sext_i16_fixed'
+; SVE2: Cost Model: Found an estimated cost of 0 for instruction: %ext1 = sext <16 x i8> %ld1 to <16 x i16>
+; SVE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %ext2 = sext <16 x i8> %ld2 to <16 x i16>
+;
+  %ld1 = load <16 x i8>, ptr %a
+  %ld2 = load <16 x i8>, ptr %b
+  %ext1 = sext <16 x i8> %ld1 to <16 x i16>
+  %ext2 = sext <16 x i8> %ld2 to <16 x i16>
+  %add1 = add nuw nsw <16 x i16> %ext1, shufflevector (<16 x i16> insertelement (<16 x i16> poison, i16 1, i64 0), <16 x i16> poison, <16 x i32> zeroinitializer)
+  %add2 = add nuw nsw <16 x i16> %add1, %ext2
+  %shr = lshr <16 x i16> %add2, shufflevector (<16 x i16> insertelement (<16 x i16> poison, i16 1, i64 0), <16 x i16> poison, <16 x i32> zeroinitializer)
+  %trunc = trunc <16 x i16> %shr to <16 x i8>
+  store <16 x i8> %trunc, ptr %a
+  ret void
+}
+
+define void @srhadd_i8_sext_i16_scalable(ptr %a, ptr %b, ptr %dst) {
+; SVE-LABEL: 'srhadd_i8_sext_i16_scalable'
+; SVE: Cost Model: Found an estimated cost of 2 for instruction: %ext1 = sext <vscale x 16 x i8> %ld1 to <vscale x 16 x i16>
+; SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ext2 = sext <vscale x 16 x i8> %ld2 to <vscale x 16 x i16>
+;
+; SVE2-LABEL: 'srhadd_i8_sext_i16_scalable'
+; SVE2: Cost Model: Found an estimated cost of 0 for instruction: %ext1 = sext <vscale x 16 x i8> %ld1 to <vscale x 16 x i16>
+; SVE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %ext2 = sext <vscale x 16 x i8> %ld2 to <vscale x 16 x i16>
+;
+  %ld1 = load <vscale x 16 x i8>, ptr %a
+  %ld2 = load <vscale x 16 x i8>, ptr %b
+  %ext1 = sext <vscale x 16 x i8> %ld1 to <vscale x 16 x i16>
+  %ext2 = sext <vscale x 16 x i8> %ld2 to <vscale x 16 x i16>
+  %add1 = add nuw nsw <vscale x 16 x i16> %ext1, shufflevector (<vscale x 16 x i16> insertelement (<vscale x 16 x i16> poison, i16 1, i64 0), <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer)
+  %add2 = add nuw nsw <vscale x 16 x i16> %add1, %ext2
+  %shr = lshr <vscale x 16 x i16> %add2, shufflevector (<vscale x 16 x i16> insertelement (<vscale x 16 x i16> poison, i16 1, i64 0), <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer)
+  %trunc = trunc <vscale x 16 x i16> %shr to <vscale x 16 x i8>
+  store <vscale x 16 x i8> %trunc, ptr %a
+  ret void
+}
+
+define void @srhadd_i16_sext_i64_scalable(ptr %a, ptr %b, ptr %dst) {
+; SVE-LABEL: 'srhadd_i16_sext_i64_scalable'
+; SVE: Cost Model: Found an estimated cost of 6 for instruction: %ext1 = sext <vscale x 8 x i16> %ld1 to <vscale x 8 x i64>
+; SVE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %ext2 = sext <vscale x 8 x i16> %ld2 to <vscale x 8 x i64>
+;
+; SVE2-LABEL: 'srhadd_i16_sext_i64_scalable'
+; SVE2: Cost Model: Found an estimated cost of 0 for instruction: %ext1 = sext <vscale x 8 x i16> %ld1 to <vscale x 8 x i64>
+; SVE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %ext2 = sext <vscale x 8 x i16> %ld2 to <vscale x 8 x i64>
+;
+  %ld1 = load <vscale x 8 x i16>, ptr %a
+  %ld2 = load <vscale x 8 x i16>, ptr %b
+  %ext1 = sext <vscale x 8 x i16> %ld1 to <vscale x 8 x i64>
+  %ext2 = sext <vscale x 8 x i16> %ld2 to <vscale x 8 x i64>
+  %add1 = add nuw nsw <vscale x 8 x i64> %ext1, shufflevector (<vscale x 8 x i64> insertelement (<vscale x 8 x i64> poison, i64 1, i64 0), <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer)
+  %add2 = add nuw nsw <vscale x 8 x i64> %add1, %ext2
+  %shr = lshr <vscale x 8 x i64> %add2, shufflevector (<vscale x 8 x i64> insertelement (<vscale x 8 x i64> poison, i64 1, i64 0), <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer)
+  %trunc = trunc <vscale x 8 x i64> %shr to <vscale x 8 x i16>
+  store <vscale x 8 x i16> %trunc, ptr %a
+  ret void
+}
+
+; URHADD
+
+define void @urhadd_i32_zext_i64_fixed(ptr %a, ptr %b, ptr %dst) {
+; SVE-LABEL: 'urhadd_i32_zext_i64_fixed'
+; SVE: Cost Model: Found an estimated cost of 0 for instruction: %ext1 = zext <4 x i32> %ld1 to <4 x i64>
+; SVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %ext2 = zext <4 x i32> %ld2 to <4 x i64>
+;
+; SVE2-LABEL: 'urhadd_i32_zext_i64_fixed'
+; SVE2: Cost Model: Found an estimated cost of 0 for instruction: %ext1 = zext <4 x i32> %ld1 to <4 x i64>
+; SVE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %ext2 = zext <4 x i32> %ld2 to <4 x i64>
+;
+  %ld1 = load <4 x i32>, ptr %a
+  %ld2 = load <4 x i32>, ptr %b
+  %ext1 = zext <4 x i32> %ld1 to <4 x i64>
+  %ext2 = zext <4 x i32> %ld2 to <4 x i64>
+  %add1 = add nuw nsw <4 x i64> %ext1, shufflevector (<4 x i64> insertelement (<4 x i64> poison, i64 1, i64 0), <4 x i64> poison, <4 x i32> zeroinitializer)
+  %add2 = add nuw nsw <4 x i64> %add1, %ext2
+  %shr = lshr <4 x i64> %add2, shufflevector (<4 x i64> insertelement (<4 x i64> poison, i64 1, i64 0), <4 x i64> poison, <4 x i32> zeroinitializer)
+  %trunc = trunc <4 x i64> %shr to <4 x i32>
+  store <4 x i32> %trunc, ptr %a
+  ret void
+}
+
+define void @urhadd_i8_zext_i64(ptr %a, ptr %b, ptr %dst) {
+; SVE-LABEL: 'urhadd_i8_zext_i64'
+; SVE: Cost Model: Found an estimated cost of 14 for instruction: %ext1 = zext <vscale x 16 x i8> %ld1 to <vscale x 16 x i64>
+; SVE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %ext2 = zext <vscale x 16 x i8> %ld2 to <vscale x 16 x i64>
+;
+; SVE2-LABEL: 'urhadd_i8_zext_i64'
+; SVE2: Cost Model: Found an estimated cost of 0 for instruction: %ext1 = zext <vscale x 16 x i8> %ld1 to <vscale x 16 x i64>
+; SVE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %ext2 = zext <vscale x 16 x i8> %ld2 to <vscale x 16 x i64>
+;
+  %ld1 = load <vscale x 16 x i8>, ptr %a
+  %ld2 = load <vscale x 16 x i8>, ptr %b
+  %ext1 = zext <vscale x 16 x i8> %ld1 to <vscale x 16 x i64>
+  %ext2 = zext <vscale x 16 x i8> %ld2 to <vscale x 16 x i64>
+  %add1 = add nuw nsw <vscale x 16 x i64> %ext1, shufflevector (<vscale x 16 x i64> insertelement (<vscale x 16 x i64> poison, i64 1, i64 0), <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer)
+  %add2 = add nuw nsw <vscale x 16 x i64> %add1, %ext2
+  %shr = lshr <vscale x 16 x i64> %add2, shufflevector (<vscale x 16 x i64> insertelement (<vscale x 16 x i64> poison, i64 1, i64 0), <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer)
+  %trunc = trunc <vscale x 16 x i64> %shr to <vscale x 16 x i8>
+  store <vscale x 16 x i8> %trunc, ptr %a
+  ret void
+}
+
+define void @urhadd_i16_zext_i32(ptr %a, ptr %b, ptr %dst) {
+; SVE-LABEL: 'urhadd_i16_zext_i32'
+; SVE: Cost Model: Found an estimated cost of 2 for instruction: %ext1 = zext <vscale x 8 x i16> %ld1 to <vscale x 8 x i32>
+; SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ext2 = zext <vscale x 8 x i16> %ld2 to <vscale x 8 x i32>
+;
+; SVE2-LABEL: 'urhadd_i16_zext_i32'
+; SVE2: Cost Model: Found an estimated cost of 0 for instruction: %ext1 = zext <vscale x 8 x i16> %ld1 to <vscale x 8 x i32>
+; SVE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %ext2 = zext <vscale x 8 x i16> %ld2 to <vscale x 8 x i32>
+;
+  %ld1 = load <vscale x 8 x i16>, ptr %a
+  %ld2 = load <vscale x 8 x i16>, ptr %b
+  %ext1 = zext <vscale x 8 x i16> %ld1 to <vscale x 8 x i32>
+  %ext2 = zext <vscale x 8 x i16> %ld2 to <vscale x 8 x i32>
+  %add1 = add nuw nsw <vscale x 8 x i32> %ext1, shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 1, i64 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer)
+  %add2 = add nuw nsw <vscale x 8 x i32> %add1, %ext2
+  %shr = lshr <vscale x 8 x i32> %add2, shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 1, i64 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer)
+  %trunc = trunc <vscale x 8 x i32> %shr to <vscale x 8 x i16>
+  store <vscale x 8 x i16> %trunc, ptr %a
+  ret void
+}
+
+; NEGATIVE TESTS
+
+define void @ext_operand_mismatch(ptr %a, ptr %b, ptr %dst) {
+; SVE-LABEL: 'ext_operand_mismatch'
+; SVE: Cost Model: Found an estimated cost of 2 for instruction: %ext1 = sext <vscale x 16 x i8> %ld1 to <vscale x 16 x i16>
+; SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ext2 = zext <vscale x 16 x i8> %ld2 to <vscale x 16 x i16>
+;
+; SVE2-LABEL: 'ext_operand_mismatch'
+; SVE2: Cost Model: Found an estimated cost of 2 for instruction: %ext1 = sext <vscale x 16 x i8> %ld1 to <vscale x 16 x i16>
+; SVE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ext2 = zext <vscale x 16 x i8> %ld2 to <vscale x 16 x i16>
+;
+  %ld1 = load <vscale x 16 x i8>, ptr %a
+  %ld2 = load <vscale x 16 x i8>, ptr %b
+  %ext1 = sext <vscale x 16 x i8> %ld1 to <vscale x 16 x i16>
+  %ext2 = zext <vscale x 16 x i8> %ld2 to <vscale x 16 x i16>
+  %add1 = add nuw nsw <vscale x 16 x i16> %ext1, shufflevector (<vscale x 16 x i16> insertelement (<vscale x 16 x i16> poison, i16 1, i64 0), <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer)
+  %add2 = add nuw nsw <vscale x 16 x i16> %add1, %ext2
+  %shr = lshr <vscale x 16 x i16> %add2, shufflevector (<vscale x 16 x i16> insertelement (<vscale x 16 x i16> poison, i16 1, i64 0), <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer)
+  %trunc = trunc <vscale x 16 x i16> %shr to <vscale x 16 x i8>
+  store <vscale x 16 x i8> %trunc, ptr %a
+  ret void
+}
+
+define void @add_multiple_uses(ptr %a, ptr %b, ptr %dst) {
+; SVE-LABEL: 'add_multiple_uses'
+; SVE: Cost Model: Found an estimated cost of 2 for instruction: %ext1 = sext <vscale x 8 x i16> %ld1 to <vscale x 8 x i32>
+; SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ext2 = sext <vscale x 8 x i16> %ld2 to <vscale x 8 x i32>
+;
+; SVE2-LABEL: 'add_multiple_uses'
+; SVE2: Cost Model: Found an estimated cost of 2 for instruction: %ext1 = sext <vscale x 8 x i16> %ld1 to <vscale x 8 x i32>
+; SVE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ext2 = sext <vscale x 8 x i16> %ld2 to <vscale x 8 x i32>
+;
+  %ld1 = load <vscale x 8 x i16>, ptr %a
+  %ld2 = load <vscale x 8 x i16>, ptr %b
+  %ext1 = sext <vscale x 8 x i16> %ld1 to <vscale x 8 x i32>
+  %ext2 = sext <vscale x 8 x i16> %ld2 to <vscale x 8 x i32>
+  %add1 = add nuw nsw <vscale x 8 x i32> %ext1, shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 1, i64 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer)
+  %add2 = add nuw nsw <vscale x 8 x i32> %add1, %ext2
+  %shr = lshr <vscale x 8 x i32> %add2, shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 1, i64 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer)
+  %trunc = trunc <vscale x 8 x i32> %shr to <vscale x 8 x i16>
+  %add.res = add nuw nsw <vscale x 8 x i32> %add1, %add2
+  %res = trunc <vscale x 8 x i32> %add.res to <vscale x 8 x i16>
+  store <vscale x 8 x i16> %res, ptr %a
+  ret void
+}
+
+define void @shift_multiple_uses(ptr %a, ptr %b, ptr %dst) {
+; SVE-LABEL: 'shift_multiple_uses'
+; SVE: Cost Model: Found an estimated cost of 2 for instruction: %ext1 = zext <vscale x 16 x i8> %ld1 to <vscale x 16 x i16>
+; SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ext2 = zext <vscale x 16 x i8> %ld2 to <vscale x 16 x i16>
+;
+; SVE2-LABEL: 'shift_multiple_uses'
+; SVE2: Cost Model: Found an estimated cost of 2 for instruction: %ext1 = zext <vscale x 16 x i8> %ld1 to <vscale x 16 x i16>
+; SVE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ext2 = zext <vscale x 16 x i8> %ld2 to <vscale x 16 x i16>
+;
+  %ld1 = load <vscale x 16 x i8>, ptr %a
+  %ld2 = load <vscale x 16 x i8>, ptr %b
+  %ext1 = zext <vscale x 16 x i8> %ld1 to <vscale x 16 x i16>
+  %ext2 = zext <vscale x 16 x i8> %ld2 to <vscale x 16 x i16>
+  %add1 = add nuw nsw <vscale x 16 x i16> %ext1, shufflevector (<vscale x 16 x i16> insertelement (<vscale x 16 x i16> poison, i16 1, i64 0), <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer)
+  %add2 = add nuw nsw <vscale x 16 x i16> %add1, %ext2
+  %shr = lshr <vscale x 16 x i16> %add2, shufflevector (<vscale x 16 x i16> insertelement (<vscale x 16 x i16> poison, i16 1, i64 0), <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer)
+  %trunc = trunc <vscale x 16 x i16> %shr to <vscale x 16 x i8>
+  %add3 = add nuw nsw <vscale x 16 x i16> %shr, %add2
+  %res = trunc <vscale x 16 x i16> %add3 to <vscale x 16 x i8>
+  store <vscale x 16 x i8> %res, ptr %a
+  ret void
+}
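
To exercise just the new costs, this test file can be run through lit in the usual way (this assumes a local build directory containing opt and FileCheck; the path to llvm-lit varies by setup):

  ./build/bin/llvm-lit -v llvm/test/Analysis/CostModel/AArch64/ext-rhadd.ll

The SVE prefix checks that on plain SVE the scalable-vector extends keep their normal unpack-based costs, while SVE2 (which provides s/urhadd for scalable vectors) reports them as free; the negative tests confirm the zero cost is not applied when the extend kinds differ or when the intermediate add/shift values have extra uses.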