Index: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -2044,6 +2044,69 @@
   return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstEltSize;
 }
 
+// Where SVE2 is enabled, we can combine an add of 1, add & shift right by 1
+// to a single s/urhadd instruction. Some extends can be folded into the
+// instruction and will be 'free', e.g.
+//   %zext1 = zext i8 %a to i16
+//   %zext2 = zext i8 %b to i16
+//   %add1 = add nuw nsw i16 %zext1, 1
+//   %add2 = add nuw nsw i16 %add1, %zext2
+//   %shr = lshr i16 %add2, 1
+//   %trunc = trunc i16 %shr to i8
+//
+// \p I is the single user of the extend \p Ext, and \p Dst / \p Src are the
+// destination and source types of the extend. Returns true when the chain
+// of users rooted at \p I matches the s/urhadd pattern above.
+static bool isExtShiftRightAdd(const Instruction *I, const Instruction *Ext,
+                               Type *Dst, Type *Src) {
+  // Check that the cast is doubling the source type.
+  if ((Src->getScalarSizeInBits() != Dst->getScalarSizeInBits() / 2) ||
+      I->getOpcode() != Instruction::Add || !I->hasOneUser())
+    return false;
+
+  // Check for the add/shift/trunc pattern if I is an add of a constant.
+  auto *Op1 = dyn_cast<ConstantInt>(I->getOperand(1));
+  if (!Op1) {
+    // Otherwise, get the other operand and look for the same pattern
+    // if this is an add.
+    auto *Op = I->getOperand(0) == Ext ? I->getOperand(1) : I->getOperand(0);
+
+    I = dyn_cast<Instruction>(Op);
+    if (!I || I->getOpcode() != Instruction::Add || !I->hasOneUser())
+      return false;
+
+    Op1 = dyn_cast<ConstantInt>(I->getOperand(1));
+  }
+
+  // The rounding constant added to the extended value must be 1. isOne()
+  // covers both the sext and zext cases and is safe for wide constants.
+  if (!Op1 || !Op1->isOne())
+    return false;
+
+  // The add of 1 should only have one user: the second add.
+  auto *Add = dyn_cast<Instruction>(*I->user_begin());
+  if (!Add || Add->getOpcode() != Instruction::Add || !Add->hasOneUser())
+    return false;
+
+  // The second add's single user must be a logical shift right ...
+  auto *LShr = dyn_cast<Instruction>(*Add->user_begin());
+  if (!LShr || LShr->getOpcode() != Instruction::LShr || !LShr->hasOneUser())
+    return false;
+
+  // ... by a constant amount of 1. Check the operand for null *before*
+  // dereferencing it, since the shift amount need not be a constant.
+  auto *LShrOp1 = dyn_cast<ConstantInt>(LShr->getOperand(1));
+  if (!LShrOp1 || !LShrOp1->isOne())
+    return false;
+
+  // Ensure the only user of the shift is a trunc which is casting
+  // back to the original element type.
+  auto *Trunc = dyn_cast<CastInst>(*LShr->user_begin());
+  if (!Trunc || Trunc->getOpcode() != Instruction::Trunc ||
+      Src->getScalarSizeInBits() != Trunc->getDestTy()->getScalarSizeInBits())
+    return false;
+
+  return true;
+}
+
 InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
                                                  Type *Src,
                                                  TTI::CastContextHint CCH,
@@ -2068,6 +2131,11 @@
     } else // Others are free so long as isWideningInstruction returned true.
       return 0;
   }
+
+  // The cast will be free for the SVE2 s/urhadd instructions.
+  if (ST->hasSVE2() && (isa<SExtInst>(I) || isa<ZExtInst>(I)) &&
+      isExtShiftRightAdd(SingleUser, I, Dst, Src))
+    return 0;
 }
 
 // TODO: Allow non-throughput costs that aren't binary.
Index: llvm/test/Analysis/CostModel/AArch64/sve2-ext-rhadd.ll
===================================================================
--- /dev/null
+++ llvm/test/Analysis/CostModel/AArch64/sve2-ext-rhadd.ll
@@ -0,0 +1,82 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 2
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple aarch64-linux-gnu -mattr=+sve2 < %s | FileCheck %s
+
+; SRHADD
+
+define i8 @srhadd_i8_sext_i16(i8 %a, i8 %b, ptr %dst) {
+; CHECK-LABEL: 'srhadd_i8_sext_i16'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sext1 = sext i8 %a to i16
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sext2 = sext i8 %b to i16
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %add1 = add i16 %sext1, 1
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %add2 = add i16 %add1, %sext2
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %lsr = lshr i16 %add2, 1
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %trunc = trunc i16 %lsr to i8
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i8 %trunc
+;
+  %sext1 = sext i8 %a to i16
+  %sext2 = sext i8 %b to i16
+  %add1 = add i16 %sext1, 1
+  %add2 = add i16 %add1, %sext2
+  %lsr = lshr i16 %add2, 1
+  %trunc = trunc i16 %lsr to i8
+  ret i8 %trunc
+}
+
+define i16 @srhadd_i16_sext_i32(i16 %a, i16 %b, ptr %dst) {
+; CHECK-LABEL: 'srhadd_i16_sext_i32'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sext1 = sext i16 %a to i32
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sext2 = sext i16 %b to i32
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %add1 = add nuw nsw i32 %sext1, 1
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %add2 = add nuw nsw i32 %add1, %sext2
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %lsr = lshr i32 %add2, 1
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %trunc = trunc i32 %lsr to i16
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %trunc
+;
+  %sext1 = sext i16 %a to i32
+  %sext2 = sext i16 %b to i32
+  %add1 = add nuw nsw i32 %sext1, 1
+  %add2 = add nuw nsw i32 %add1, %sext2
+  %lsr = lshr i32 %add2, 1
+  %trunc = trunc i32 %lsr to i16
+  ret i16 %trunc
+}
+
+; URHADD
+
+define i8 @urhadd_i8_zext_i16(i8 %a, i8 %b, ptr %dst) {
+; CHECK-LABEL: 'urhadd_i8_zext_i16'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %zext1 = zext i8 %a to i16
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %zext2 = zext i8 %b to i16
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %add1 = add i16 %zext1, 1
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %add2 = add i16 %add1, %zext2
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %lsr = lshr i16 %add2, 1
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %trunc = trunc i16 %lsr to i8
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i8 %trunc
+;
+  %zext1 = zext i8 %a to i16
+  %zext2 = zext i8 %b to i16
+  %add1 = add i16 %zext1, 1
+  %add2 = add i16 %add1, %zext2
+  %lsr = lshr i16 %add2, 1
+  %trunc = trunc i16 %lsr to i8
+  ret i8 %trunc
+}
+
+define i16 @urhadd_i16_zext_i32(i16 %a, i16 %b, ptr %dst) {
+; CHECK-LABEL: 'urhadd_i16_zext_i32'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %zext1 = zext i16 %a to i32
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %zext2 = zext i16 %b to i32
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %add1 = add nuw nsw i32 %zext1, 1
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %add2 = add nuw nsw i32 %add1, %zext2
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %lsr = lshr i32 %add2, 1
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %trunc = trunc i32 %lsr to i16
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %trunc
+;
+  %zext1 = zext i16 %a to i32
+  %zext2 = zext i16 %b to i32
+  %add1 = add nuw nsw i32 %zext1, 1
+  %add2 = add nuw nsw i32 %add1, %zext2
+  %lsr = lshr i32 %add2, 1
+  %trunc = trunc i32 %lsr to i16
+  ret i16 %trunc
+}
Index: llvm/test/Transforms/LoopVectorize/AArch64/sve2-ext-rhadd.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/AArch64/sve2-ext-rhadd.ll
@@ -0,0 +1,129 @@
+; RUN: opt -passes=loop-vectorize -mtriple aarch64-linux-gnu -mattr=+sve2 -sve-tail-folding=simple -S < %s | FileCheck %s
+
+; SRHADD
+
+define void @srhadd_i8_sext_i16(ptr noalias nocapture %a, ptr noalias nocapture readonly %b, ptr noalias nocapture readonly %dst, i64 %n) {
+; CHECK-LABEL: @srhadd_i8_sext_i16(
+; CHECK: trunc <vscale x {{[0-9]+}} x i16> {{.*}} to <vscale x {{[0-9]+}} x i8>
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx1 = getelementptr inbounds i8, ptr %a, i64 %indvars.iv
+  %ld1 = load i8, ptr %arrayidx1
+  %sext1 = sext i8 %ld1 to i16
+  %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %indvars.iv
+  %ld2 = load i8, ptr %arrayidx2
+  %sext2 = sext i8 %ld2 to i16
+  %add1 = add nuw nsw i16 %sext1, 1
+  %add2 = add nuw nsw i16 %add1, %sext2
+  %shr = lshr i16 %add2, 1
+  %trunc = trunc i16 %shr to i8
+  %arrayidx3 = getelementptr inbounds i8, ptr %dst, i64 %indvars.iv
+  store i8 %trunc, ptr %arrayidx3
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %n
+  br i1 %exitcond.not, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+  br label %for.end
+
+for.end:
+  ret void
+}
+
+define void @srhadd_i16_sext_i32(ptr noalias nocapture %a, ptr noalias nocapture readonly %b, ptr noalias nocapture readonly %dst, i64 %n) {
+; CHECK-LABEL: @srhadd_i16_sext_i32(
+; CHECK: trunc <vscale x {{[0-9]+}} x i32> {{.*}} to <vscale x {{[0-9]+}} x i16>
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx1 = getelementptr inbounds i16, ptr %a, i64 %indvars.iv
+  %ld1 = load i16, ptr %arrayidx1
+  %sext1 = sext i16 %ld1 to i32
+  %arrayidx2 = getelementptr inbounds i16, ptr %b, i64 %indvars.iv
+  %ld2 = load i16, ptr %arrayidx2
+  %sext2 = sext i16 %ld2 to i32
+  %add1 = add nuw nsw i32 %sext1, 1
+  %add2 = add nuw nsw i32 %add1, %sext2
+  %shr = lshr i32 %add2, 1
+  %trunc = trunc i32 %shr to i16
+  %arrayidx3 = getelementptr inbounds i16, ptr %dst, i64 %indvars.iv
+  store i16 %trunc, ptr %arrayidx3
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %n
+  br i1 %exitcond.not, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+  br label %for.end
+
+for.end:
+  ret void
+}
+
+; URHADD
+
+define void @urhadd_i8_zext_i16(ptr noalias nocapture %a, ptr noalias nocapture readonly %b, ptr noalias nocapture readonly %dst, i64 %n) {
+; CHECK-LABEL: @urhadd_i8_zext_i16(
+; CHECK: trunc <vscale x {{[0-9]+}} x i16> {{.*}} to <vscale x {{[0-9]+}} x i8>
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx1 = getelementptr inbounds i8, ptr %a, i64 %indvars.iv
+  %ld1 = load i8, ptr %arrayidx1
+  %zext1 = zext i8 %ld1 to i16
+  %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %indvars.iv
+  %ld2 = load i8, ptr %arrayidx2
+  %zext2 = zext i8 %ld2 to i16
+  %add1 = add nuw nsw i16 %zext1, 1
+  %add2 = add nuw nsw i16 %add1, %zext2
+  %shr = lshr i16 %add2, 1
+  %trunc = trunc i16 %shr to i8
+  %arrayidx3 = getelementptr inbounds i8, ptr %dst, i64 %indvars.iv
+  store i8 %trunc, ptr %arrayidx3
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %n
+  br i1 %exitcond.not, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+  br label %for.end
+
+for.end:
+  ret void
+}
+
+define void @urhadd_i16_zext_i32(ptr noalias nocapture %a, ptr noalias nocapture readonly %b, ptr noalias nocapture readonly %dst, i64 %n) {
+; CHECK-LABEL: @urhadd_i16_zext_i32(
+; CHECK: trunc <vscale x {{[0-9]+}} x i32> {{.*}} to <vscale x {{[0-9]+}} x i16>
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx1 = getelementptr inbounds i16, ptr %a, i64 %indvars.iv
+  %ld1 = load i16, ptr %arrayidx1
+  %zext1 = zext i16 %ld1 to i32
+  %arrayidx2 = getelementptr inbounds i16, ptr %b, i64 %indvars.iv
+  %ld2 = load i16, ptr %arrayidx2
+  %zext2 = zext i16 %ld2 to i32
+  %add1 = add nuw nsw i32 %zext1, 1
+  %add2 = add nuw nsw i32 %add1, %zext2
+  %shr = lshr i32 %add2, 1
+  %trunc = trunc i32 %shr to i16
+  %arrayidx3 = getelementptr inbounds i16, ptr %dst, i64 %indvars.iv
+  store i16 %trunc, ptr %arrayidx3
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %n
+  br i1 %exitcond.not, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+  br label %for.end
+
+for.end:
+  ret void
+}