Index: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -2044,6 +2044,69 @@
   return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstEltSize;
 }
 
+// Where SVE2 is enabled, we can combine an add of 1, add & shift right by 1
+// to a single s/urhadd instruction. Some extends can be folded into the
+// instruction and will be 'free', e.g.
+//   %zext1 = zext i8 %a to i16
+//   %zext2 = zext i8 %b to i16
+//   %add1 = add nuw nsw i16 %zext1, 1
+//   %add2 = add nuw nsw i16 %add1, %zext2
+//   %shr = lshr i16 %add2, 1
+//   %trunc = trunc i16 %shr to i8
+//
+// \p I is the single user of the extend \p Ext, and \p Dst / \p Src are the
+// destination and source types of the extend. Returns true when the chain
+// of users rooted at \p I matches the s/urhadd pattern above.
+static bool isExtShiftRightAdd(const Instruction *I, const Instruction *Ext,
+                               Type *Dst, Type *Src) {
+  // Check that the cast is doubling the source type.
+  if ((Src->getScalarSizeInBits() != Dst->getScalarSizeInBits() / 2) ||
+      I->getOpcode() != Instruction::Add || !I->hasOneUser())
+    return false;
+
+  // Check for the add/shift/trunc pattern if I is an add of a constant.
+  auto *Op1 = dyn_cast<ConstantInt>(I->getOperand(1));
+  if (!Op1) {
+    // Otherwise, get the other operand and look for the same pattern
+    // if this is an add.
+    auto *Op = I->getOperand(0) == Ext ? I->getOperand(1) : I->getOperand(0);
+
+    I = dyn_cast<Instruction>(Op);
+    if (!I || I->getOpcode() != Instruction::Add || !I->hasOneUser())
+      return false;
+
+    Op1 = dyn_cast<ConstantInt>(I->getOperand(1));
+  }
+
+  // The rounding constant added to the extended value must be 1. isOne()
+  // covers both the sext and zext cases and is safe for wide constants.
+  if (!Op1 || !Op1->isOne())
+    return false;
+
+  // The add of 1 should only have one user: the second add.
+  auto *Add = dyn_cast<Instruction>(*I->user_begin());
+  if (!Add || Add->getOpcode() != Instruction::Add || !Add->hasOneUser())
+    return false;
+
+  // The second add's single user must be a logical shift right ...
+  auto *LShr = dyn_cast<Instruction>(*Add->user_begin());
+  if (!LShr || LShr->getOpcode() != Instruction::LShr || !LShr->hasOneUser())
+    return false;
+
+  // ... by a constant amount of 1. Check the operand for null *before*
+  // dereferencing it, since the shift amount need not be a constant.
+  auto *LShrOp1 = dyn_cast<ConstantInt>(LShr->getOperand(1));
+  if (!LShrOp1 || !LShrOp1->isOne())
+    return false;
+
+  // Ensure the only user of the shift is a trunc which is casting
+  // back to the original element type.
+  auto *Trunc = dyn_cast<CastInst>(*LShr->user_begin());
+  if (!Trunc || Trunc->getOpcode() != Instruction::Trunc ||
+      Src->getScalarSizeInBits() != Trunc->getDestTy()->getScalarSizeInBits())
+    return false;
+
+  return true;
+}
+
 InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
                                                  Type *Src,
                                                  TTI::CastContextHint CCH,
@@ -2068,6 +2131,11 @@
     } else // Others are free so long as isWideningInstruction returned true.
       return 0;
   }
+
+  // The cast will be free for the SVE2 s/urhadd instructions.
+  if (ST->hasSVE2() && (isa<SExtInst>(I) || isa<ZExtInst>(I)) &&
+      isExtShiftRightAdd(SingleUser, I, Dst, Src))
+    return 0;
 }
 
 // TODO: Allow non-throughput costs that aren't binary.
Index: llvm/test/Analysis/CostModel/AArch64/sve2-ext-rhadd.ll
===================================================================
--- /dev/null
+++ llvm/test/Analysis/CostModel/AArch64/sve2-ext-rhadd.ll
@@ -0,0 +1,82 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 2
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple aarch64-linux-gnu -mattr=+sve2 < %s | FileCheck %s
+
+; SRHADD
+
+define i8 @srhadd_i8_sext_i16(i8 %a, i8 %b, ptr %dst) {
+; CHECK-LABEL: 'srhadd_i8_sext_i16'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sext1 = sext i8 %a to i16
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sext2 = sext i8 %b to i16
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %add1 = add i16 %sext1, 1
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %add2 = add i16 %add1, %sext2
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %lsr = lshr i16 %add2, 1
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %trunc = trunc i16 %lsr to i8
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i8 %trunc
+;
+  %sext1 = sext i8 %a to i16
+  %sext2 = sext i8 %b to i16
+  %add1 = add i16 %sext1, 1
+  %add2 = add i16 %add1, %sext2
+  %lsr = lshr i16 %add2, 1
+  %trunc = trunc i16 %lsr to i8
+  ret i8 %trunc
+}
+
+define i16 @srhadd_i16_sext_i32(i16 %a, i16 %b, ptr %dst) {
+; CHECK-LABEL: 'srhadd_i16_sext_i32'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sext1 = sext i16 %a to i32
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sext2 = sext i16 %b to i32
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %add1 = add nuw nsw i32 %sext1, 1
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %add2 = add nuw nsw i32 %add1, %sext2
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %lsr = lshr i32 %add2, 1
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %trunc = trunc i32 %lsr to i16
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %trunc
+;
+  %sext1 = sext i16 %a to i32
+  %sext2 = sext i16 %b to i32
+  %add1 = add nuw nsw i32 %sext1, 1
+  %add2 = add nuw nsw i32 %add1, %sext2
+  %lsr = lshr i32 %add2, 1
+  %trunc = trunc i32 %lsr to i16
+  ret i16 %trunc
+}
+
+; URHADD
+
+define i8 @urhadd_i8_zext_i16(i8 %a, i8 %b, ptr %dst) {
+; CHECK-LABEL: 'urhadd_i8_zext_i16'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %zext1 = zext i8 %a to i16
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %zext2 = zext i8 %b to i16
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %add1 = add i16 %zext1, 1
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %add2 = add i16 %add1, %zext2
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %lsr = lshr i16 %add2, 1
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %trunc = trunc i16 %lsr to i8
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i8 %trunc
+;
+  %zext1 = zext i8 %a to i16
+  %zext2 = zext i8 %b to i16
+  %add1 = add i16 %zext1, 1
+  %add2 = add i16 %add1, %zext2
+  %lsr = lshr i16 %add2, 1
+  %trunc = trunc i16 %lsr to i8
+  ret i8 %trunc
+}
+
+define i16 @urhadd_i16_zext_i32(i16 %a, i16 %b, ptr %dst) {
+; CHECK-LABEL: 'urhadd_i16_zext_i32'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %zext1 = zext i16 %a to i32
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %zext2 = zext i16 %b to i32
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %add1 = add nuw nsw i32 %zext1, 1
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %add2 = add nuw nsw i32 %add1, %zext2
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %lsr = lshr i32 %add2, 1
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %trunc = trunc i32 %lsr to i16
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %trunc
+;
+  %zext1 = zext i16 %a to i32
+  %zext2 = zext i16 %b to i32
+  %add1 = add nuw nsw i32 %zext1, 1
+  %add2 = add nuw nsw i32 %add1, %zext2
+  %lsr = lshr i32 %add2, 1
+  %trunc = trunc i32 %lsr to i16
+  ret i16 %trunc
+}
Index: llvm/test/Transforms/LoopVectorize/AArch64/sve2-ext-rhadd.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/AArch64/sve2-ext-rhadd.ll
@@ -0,0 +1,129 @@
+; RUN: opt -passes=loop-vectorize -mtriple aarch64-linux-gnu -mattr=+sve2 -sve-tail-folding=simple -S < %s | FileCheck %s
+
+; SRHADD
+
+define void @srhadd_i8_sext_i16(ptr noalias nocapture %a, ptr noalias nocapture readonly %b, ptr noalias nocapture readonly %dst, i64 %n) {
+; CHECK-LABEL: @srhadd_i8_sext_i16(
+; CHECK: trunc <vscale x {{[0-9]+}} x i16> {{.*}} to <vscale x {{[0-9]+}} x i8>
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx1 = getelementptr inbounds i8, ptr %a, i64 %indvars.iv
+  %ld1 = load i8, ptr %arrayidx1
+  %sext1 = sext i8 %ld1 to i16
+  %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %indvars.iv
+  %ld2 = load i8, ptr %arrayidx2
+  %sext2 = sext i8 %ld2 to i16
+  %add1 = add nuw nsw i16 %sext1, 1
+  %add2 = add nuw nsw i16 %add1, %sext2
+  %shr = lshr i16 %add2, 1
+  %trunc = trunc i16 %shr to i8
+  %arrayidx3 = getelementptr inbounds i8, ptr %dst, i64 %indvars.iv
+  store i8 %trunc, ptr %arrayidx3
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %n
+  br i1 %exitcond.not, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+  br label %for.end
+
+for.end:
+  ret void
+}
+
+define void @srhadd_i16_sext_i32(ptr noalias nocapture %a, ptr noalias nocapture readonly %b, ptr noalias nocapture readonly %dst, i64 %n) {
+; CHECK-LABEL: @srhadd_i16_sext_i32(
+; CHECK: trunc <vscale x {{[0-9]+}} x i32> {{.*}} to <vscale x {{[0-9]+}} x i16>
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx1 = getelementptr inbounds i16, ptr %a, i64 %indvars.iv
+  %ld1 = load i16, ptr %arrayidx1
+  %sext1 = sext i16 %ld1 to i32
+  %arrayidx2 = getelementptr inbounds i16, ptr %b, i64 %indvars.iv
+  %ld2 = load i16, ptr %arrayidx2
+  %sext2 = sext i16 %ld2 to i32
+  %add1 = add nuw nsw i32 %sext1, 1
+  %add2 = add nuw nsw i32 %add1, %sext2
+  %shr = lshr i32 %add2, 1
+  %trunc = trunc i32 %shr to i16
+  %arrayidx3 = getelementptr inbounds i16, ptr %dst, i64 %indvars.iv
+  store i16 %trunc, ptr %arrayidx3
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %n
+  br i1 %exitcond.not, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+  br label %for.end
+
+for.end:
+  ret void
+}
+
+; URHADD
+
+define void @urhadd_i8_zext_i16(ptr noalias nocapture %a, ptr noalias nocapture readonly %b, ptr noalias nocapture readonly %dst, i64 %n) {
+; CHECK-LABEL: @urhadd_i8_zext_i16(
+; CHECK: trunc <vscale x {{[0-9]+}} x i16> {{.*}} to <vscale x {{[0-9]+}} x i8>
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx1 = getelementptr inbounds i8, ptr %a, i64 %indvars.iv
+  %ld1 = load i8, ptr %arrayidx1
+  %zext1 = zext i8 %ld1 to i16
+  %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %indvars.iv
+  %ld2 = load i8, ptr %arrayidx2
+  %zext2 = zext i8 %ld2 to i16
+  %add1 = add nuw nsw i16 %zext1, 1
+  %add2 = add nuw nsw i16 %add1, %zext2
+  %shr = lshr i16 %add2, 1
+  %trunc = trunc i16 %shr to i8
+  %arrayidx3 = getelementptr inbounds i8, ptr %dst, i64 %indvars.iv
+  store i8 %trunc, ptr %arrayidx3
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %n
+  br i1 %exitcond.not, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+  br label %for.end
+
+for.end:
+  ret void
+}
+
+define void @urhadd_i16_zext_i32(ptr noalias nocapture %a, ptr noalias nocapture readonly %b, ptr noalias nocapture readonly %dst, i64 %n) {
+; CHECK-LABEL: @urhadd_i16_zext_i32(
+; CHECK: trunc <vscale x {{[0-9]+}} x i32> {{.*}} to <vscale x {{[0-9]+}} x i16>
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx1 = getelementptr inbounds i16, ptr %a, i64 %indvars.iv
+  %ld1 = load i16, ptr %arrayidx1
+  %zext1 = zext i16 %ld1 to i32
+  %arrayidx2 = getelementptr inbounds i16, ptr %b, i64 %indvars.iv
+  %ld2 = load i16, ptr %arrayidx2
+  %zext2 = zext i16 %ld2 to i32
+  %add1 = add nuw nsw i32 %zext1, 1
+  %add2 = add nuw nsw i32 %add1, %zext2
+  %shr = lshr i32 %add2, 1
+  %trunc = trunc i32 %shr to i16
+  %arrayidx3 = getelementptr inbounds i16, ptr %dst, i64 %indvars.iv
+  store i16 %trunc, ptr %arrayidx3
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %n
+  br i1 %exitcond.not, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+  br label %for.end
+
+for.end:
+  ret void
+}