Index: lib/Target/AArch64/AArch64TargetTransformInfo.h
===================================================================
--- lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -43,6 +43,8 @@
     VECTOR_LDST_FOUR_ELEMENTS
   };
 
+  bool isLengthening(Type *Dst, Type *Src, const Instruction *I);
+
 public:
   explicit AArch64TTIImpl(const AArch64TargetMachine *TM, const Function &F)
       : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),
Index: lib/Target/AArch64/AArch64TargetTransformInfo.cpp
===================================================================
--- lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -176,11 +176,85 @@
   return TTI::PSK_Software;
 }
 
+bool AArch64TTIImpl::isLengthening(Type *Dst, Type *Src, const Instruction *I) {
+
+  // A helper that returns true if the given instruction is a sign- or
+  // zero-extend.
+  auto isExtend = [](const Instruction *I) {
+    return isa<SExtInst>(I) || isa<ZExtInst>(I);
+  };
+
+  // Exit early if the cast is not an extension, has more that one use, or is
+  // not a vector cast.
+  if (!isExtend(I) || !I->hasOneUse() || !Dst->isVectorTy())
+    return false;
+
+  // Determine if the single user of the cast is an operation having a
+  // lengthening variant.
+  //
+  // TODO: Add additional lengthening operations (e.g., mul, shl, etc.) once we
+  //       verify that the extends are eliminated during code generation.
+  auto *SingleUser = cast<Instruction>(*I->user_begin());
+  switch (SingleUser->getOpcode()) {
+  case Instruction::Add: // UADDL(2), SADDL(2)
+  case Instruction::Sub: // USUBL(2), SSUBL(2)
+    break;
+  default:
+    return false;
+  }
+
+  // Get the source scalar type. "SrcIRTy" can differ from "Src" since it's
+  // associated with an observable IR instruction, whereas "Src" may not be.
+  // For example, we may be computing the cost of vectorizing the instruction.
+  // In this case, "SrcIRTy" would be the original scalar type and "Src" would
+  // be the vector type for which we're computing the cost.
+  auto *SrcIRTy = cast<CastInst>(I)->getSrcTy();
+
+  // Lengthening instructions operate on doubleword operands and produce
+  // quadword results. Thus, both operands must be an extension of the same
+  // kind.
+  for (const Use &U : SingleUser->operands()) {
+    auto *Cast = dyn_cast<CastInst>(U.get());
+    if (!Cast || !isExtend(Cast) || (!I && Cast->getSrcTy() != SrcIRTy))
+      return false;
+  }
+
+  // Legalize the destination type and ensure it can be used in a lengthening
+  // operation (i.e., it must be 8h, 4s or 2d).
+  auto DstTy = TLI->getTypeLegalizationCost(DL, Dst);
+  unsigned DstElTySize = DstTy.second.getScalarSizeInBits();
+  if (!DstTy.second.is128BitVector() || DstElTySize < 16)
+    return false;
+
+  // Get the total number of vector elements in the legalized destination type.
+  unsigned NumDstEls = DstTy.first * DstTy.second.getVectorNumElements();
+
+  // Legalize the source type and ensure it is a vector.
+  auto SrcTy = TLI->getTypeLegalizationCost(DL, Src);
+  if (!SrcTy.second.isVector())
+    return false;
+
+  // Get the source vector element size and total number of elements in the
+  // legalized source type.
+  unsigned SrcElTySize = SrcTy.second.getScalarSizeInBits();
+  unsigned NumSrcEls = SrcTy.first * SrcTy.second.getVectorNumElements();
+
+  // Return true if the legalized source and destination types have the same
+  // number of vector elements and the destination element type size is twice
+  // that of the source type.
+  return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstElTySize;
+}
+
 int AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
                                      const Instruction *I) {
   int ISD = TLI->InstructionOpcodeToISD(Opcode);
   assert(ISD && "Invalid opcode");
 
+  // If the cast is observable, and it's used by an instruction having a
+  // lengthening variant (e.g., uaddl, saddl, etc.), it may be free.
+  if (I && isLengthening(Dst, Src, I))
+    return 0;
+
   EVT SrcTy = TLI->getValueType(DL, Src);
   EVT DstTy = TLI->getValueType(DL, Dst);
 
Index: test/Analysis/CostModel/AArch64/lengthening-casts.ll
===================================================================
--- /dev/null
+++ test/Analysis/CostModel/AArch64/lengthening-casts.ll
@@ -0,0 +1,244 @@
+; RUN: opt < %s -cost-model -analyze | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+; CHECK-LABEL: uaddl_8h
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <8 x i8> %a to <8 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = zext <8 x i8> %b to <8 x i16>
+define <8 x i16> @uaddl_8h(<8 x i8> %a, <8 x i8> %b) {
+  %tmp0 = zext <8 x i8> %a to <8 x i16>
+  %tmp1 = zext <8 x i8> %b to <8 x i16>
+  %tmp2 = add <8 x i16> %tmp0, %tmp1
+  ret <8 x i16> %tmp2
+}
+
+; CHECK-LABEL: uaddl_4s
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <4 x i16> %a to <4 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = zext <4 x i16> %b to <4 x i32>
+define <4 x i32> @uaddl_4s(<4 x i16> %a, <4 x i16> %b) {
+  %tmp0 = zext <4 x i16> %a to <4 x i32>
+  %tmp1 = zext <4 x i16> %b to <4 x i32>
+  %tmp2 = add <4 x i32> %tmp0, %tmp1
+  ret <4 x i32> %tmp2
+}
+
+; CHECK-LABEL: uaddl_2d
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <2 x i32> %a to <2 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = zext <2 x i32> %b to <2 x i64>
+define <2 x i64> @uaddl_2d(<2 x i32> %a, <2 x i32> %b) {
+  %tmp0 = zext <2 x i32> %a to <2 x i64>
+  %tmp1 = zext <2 x i32> %b to <2 x i64>
+  %tmp2 = add <2 x i64> %tmp0, %tmp1
+  ret <2 x i64> %tmp2
+}
+
+; CHECK-LABEL: uaddl2_8h
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <16 x i8> %a to <16 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = zext <16 x i8> %b to <16 x i16>
+define <16 x i16> @uaddl2_8h(<16 x i8> %a, <16 x i8> %b) {
+  %tmp0 = zext <16 x i8> %a to <16 x i16>
+  %tmp1 = zext <16 x i8> %b to <16 x i16>
+  %tmp2 = add <16 x i16> %tmp0, %tmp1
+  ret <16 x i16> %tmp2
+}
+
+; CHECK-LABEL: uaddl2_4s
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <8 x i16> %a to <8 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = zext <8 x i16> %b to <8 x i32>
+define <8 x i32> @uaddl2_4s(<8 x i16> %a, <8 x i16> %b) {
+  %tmp0 = zext <8 x i16> %a to <8 x i32>
+  %tmp1 = zext <8 x i16> %b to <8 x i32>
+  %tmp2 = add <8 x i32> %tmp0, %tmp1
+  ret <8 x i32> %tmp2
+}
+
+; CHECK-LABEL: uaddl2_2d
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <4 x i32> %a to <4 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = zext <4 x i32> %b to <4 x i64>
+define <4 x i64> @uaddl2_2d(<4 x i32> %a, <4 x i32> %b) {
+  %tmp0 = zext <4 x i32> %a to <4 x i64>
+  %tmp1 = zext <4 x i32> %b to <4 x i64>
+  %tmp2 = add <4 x i64> %tmp0, %tmp1
+  ret <4 x i64> %tmp2
+}
+
+; CHECK-LABEL: saddl_8h
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <8 x i8> %a to <8 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = sext <8 x i8> %b to <8 x i16>
+define <8 x i16> @saddl_8h(<8 x i8> %a, <8 x i8> %b) {
+  %tmp0 = sext <8 x i8> %a to <8 x i16>
+  %tmp1 = sext <8 x i8> %b to <8 x i16>
+  %tmp2 = add <8 x i16> %tmp0, %tmp1
+  ret <8 x i16> %tmp2
+}
+
+; CHECK-LABEL: saddl_4s
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <4 x i16> %a to <4 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = sext <4 x i16> %b to <4 x i32>
+define <4 x i32> @saddl_4s(<4 x i16> %a, <4 x i16> %b) {
+  %tmp0 = sext <4 x i16> %a to <4 x i32>
+  %tmp1 = sext <4 x i16> %b to <4 x i32>
+  %tmp2 = add <4 x i32> %tmp0, %tmp1
+  ret <4 x i32> %tmp2
+}
+
+; CHECK-LABEL: saddl_2d
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <2 x i32> %a to <2 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = sext <2 x i32> %b to <2 x i64>
+define <2 x i64> @saddl_2d(<2 x i32> %a, <2 x i32> %b) {
+  %tmp0 = sext <2 x i32> %a to <2 x i64>
+  %tmp1 = sext <2 x i32> %b to <2 x i64>
+  %tmp2 = add <2 x i64> %tmp0, %tmp1
+  ret <2 x i64> %tmp2
+}
+
+; CHECK-LABEL: saddl2_8h
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <16 x i8> %a to <16 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = sext <16 x i8> %b to <16 x i16>
+define <16 x i16> @saddl2_8h(<16 x i8> %a, <16 x i8> %b) {
+  %tmp0 = sext <16 x i8> %a to <16 x i16>
+  %tmp1 = sext <16 x i8> %b to <16 x i16>
+  %tmp2 = add <16 x i16> %tmp0, %tmp1
+  ret <16 x i16> %tmp2
+}
+
+; CHECK-LABEL: saddl2_4s
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <8 x i16> %a to <8 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = sext <8 x i16> %b to <8 x i32>
+define <8 x i32> @saddl2_4s(<8 x i16> %a, <8 x i16> %b) {
+  %tmp0 = sext <8 x i16> %a to <8 x i32>
+  %tmp1 = sext <8 x i16> %b to <8 x i32>
+  %tmp2 = add <8 x i32> %tmp0, %tmp1
+  ret <8 x i32> %tmp2
+}
+
+; CHECK-LABEL: saddl2_2d
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <4 x i32> %a to <4 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = sext <4 x i32> %b to <4 x i64>
+define <4 x i64> @saddl2_2d(<4 x i32> %a, <4 x i32> %b) {
+  %tmp0 = sext <4 x i32> %a to <4 x i64>
+  %tmp1 = sext <4 x i32> %b to <4 x i64>
+  %tmp2 = add <4 x i64> %tmp0, %tmp1
+  ret <4 x i64> %tmp2
+}
+
+; CHECK-LABEL: usubl_8h
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <8 x i8> %a to <8 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = zext <8 x i8> %b to <8 x i16>
+define <8 x i16> @usubl_8h(<8 x i8> %a, <8 x i8> %b) {
+  %tmp0 = zext <8 x i8> %a to <8 x i16>
+  %tmp1 = zext <8 x i8> %b to <8 x i16>
+  %tmp2 = sub <8 x i16> %tmp0, %tmp1
+  ret <8 x i16> %tmp2
+}
+
+; CHECK-LABEL: usubl_4s
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <4 x i16> %a to <4 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = zext <4 x i16> %b to <4 x i32>
+define <4 x i32> @usubl_4s(<4 x i16> %a, <4 x i16> %b) {
+  %tmp0 = zext <4 x i16> %a to <4 x i32>
+  %tmp1 = zext <4 x i16> %b to <4 x i32>
+  %tmp2 = sub <4 x i32> %tmp0, %tmp1
+  ret <4 x i32> %tmp2
+}
+
+; CHECK-LABEL: usubl_2d
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <2 x i32> %a to <2 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = zext <2 x i32> %b to <2 x i64>
+define <2 x i64> @usubl_2d(<2 x i32> %a, <2 x i32> %b) {
+  %tmp0 = zext <2 x i32> %a to <2 x i64>
+  %tmp1 = zext <2 x i32> %b to <2 x i64>
+  %tmp2 = sub <2 x i64> %tmp0, %tmp1
+  ret <2 x i64> %tmp2
+}
+
+; CHECK-LABEL: usubl2_8h
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <16 x i8> %a to <16 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = zext <16 x i8> %b to <16 x i16>
+define <16 x i16> @usubl2_8h(<16 x i8> %a, <16 x i8> %b) {
+  %tmp0 = zext <16 x i8> %a to <16 x i16>
+  %tmp1 = zext <16 x i8> %b to <16 x i16>
+  %tmp2 = sub <16 x i16> %tmp0, %tmp1
+  ret <16 x i16> %tmp2
+}
+
+; CHECK-LABEL: usubl2_4s
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <8 x i16> %a to <8 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = zext <8 x i16> %b to <8 x i32>
+define <8 x i32> @usubl2_4s(<8 x i16> %a, <8 x i16> %b) {
+  %tmp0 = zext <8 x i16> %a to <8 x i32>
+  %tmp1 = zext <8 x i16> %b to <8 x i32>
+  %tmp2 = sub <8 x i32> %tmp0, %tmp1
+  ret <8 x i32> %tmp2
+}
+
+; CHECK-LABEL: usubl2_2d
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <4 x i32> %a to <4 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = zext <4 x i32> %b to <4 x i64>
+define <4 x i64> @usubl2_2d(<4 x i32> %a, <4 x i32> %b) {
+  %tmp0 = zext <4 x i32> %a to <4 x i64>
+  %tmp1 = zext <4 x i32> %b to <4 x i64>
+  %tmp2 = sub <4 x i64> %tmp0, %tmp1
+  ret <4 x i64> %tmp2
+}
+
+; CHECK-LABEL: ssubl_8h
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <8 x i8> %a to <8 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = sext <8 x i8> %b to <8 x i16>
+define <8 x i16> @ssubl_8h(<8 x i8> %a, <8 x i8> %b) {
+  %tmp0 = sext <8 x i8> %a to <8 x i16>
+  %tmp1 = sext <8 x i8> %b to <8 x i16>
+  %tmp2 = sub <8 x i16> %tmp0, %tmp1
+  ret <8 x i16> %tmp2
+}
+
+; CHECK-LABEL: ssubl_4s
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <4 x i16> %a to <4 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = sext <4 x i16> %b to <4 x i32>
+define <4 x i32> @ssubl_4s(<4 x i16> %a, <4 x i16> %b) {
+  %tmp0 = sext <4 x i16> %a to <4 x i32>
+  %tmp1 = sext <4 x i16> %b to <4 x i32>
+  %tmp2 = sub <4 x i32> %tmp0, %tmp1
+  ret <4 x i32> %tmp2
+}
+
+; CHECK-LABEL: ssubl_2d
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <2 x i32> %a to <2 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = sext <2 x i32> %b to <2 x i64>
+define <2 x i64> @ssubl_2d(<2 x i32> %a, <2 x i32> %b) {
+  %tmp0 = sext <2 x i32> %a to <2 x i64>
+  %tmp1 = sext <2 x i32> %b to <2 x i64>
+  %tmp2 = sub <2 x i64> %tmp0, %tmp1
+  ret <2 x i64> %tmp2
+}
+
+; CHECK-LABEL: ssubl2_8h
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <16 x i8> %a to <16 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = sext <16 x i8> %b to <16 x i16>
+define <16 x i16> @ssubl2_8h(<16 x i8> %a, <16 x i8> %b) {
+  %tmp0 = sext <16 x i8> %a to <16 x i16>
+  %tmp1 = sext <16 x i8> %b to <16 x i16>
+  %tmp2 = sub <16 x i16> %tmp0, %tmp1
+  ret <16 x i16> %tmp2
+}
+
+; CHECK-LABEL: ssubl2_4s
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <8 x i16> %a to <8 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = sext <8 x i16> %b to <8 x i32>
+define <8 x i32> @ssubl2_4s(<8 x i16> %a, <8 x i16> %b) {
+  %tmp0 = sext <8 x i16> %a to <8 x i32>
+  %tmp1 = sext <8 x i16> %b to <8 x i32>
+  %tmp2 = sub <8 x i32> %tmp0, %tmp1
+  ret <8 x i32> %tmp2
+}
+
+; CHECK-LABEL: ssubl2_2d
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <4 x i32> %a to <4 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = sext <4 x i32> %b to <4 x i64>
+define <4 x i64> @ssubl2_2d(<4 x i32> %a, <4 x i32> %b) {
+  %tmp0 = sext <4 x i32> %a to <4 x i64>
+  %tmp1 = sext <4 x i32> %b to <4 x i64>
+  %tmp2 = sub <4 x i64> %tmp0, %tmp1
+  ret <4 x i64> %tmp2
+}