Index: lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -39,6 +39,61 @@ Module *Mod; bool HasUnsafeFPMath; + /// \brief Copies exact/nsw/nuw flags (if any) from binary operator \p I to + /// binary operator \p V. + /// + /// \returns Binary operator \p V. + Value *copyFlags(const BinaryOperator &I, Value *V) const; + + /// \returns Equivalent 16 bit integer type for given 32 bit integer type + /// \p T. + Type *getI16Ty(IRBuilder<> &B, const Type *T) const; + + /// \returns Equivalent 32 bit integer type for given 16 bit integer type + /// \p T. + Type *getI32Ty(IRBuilder<> &B, const Type *T) const; + + /// \returns True if the base element of type \p T is 16 bit integer, false + /// otherwise. + bool isI16Ty(const Type *T) const; + + /// \returns True if the base element of type \p T is 32 bit integer, false + /// otherwise. + bool isI32Ty(const Type *T) const; + + /// \returns True if binary operation \p I is a signed binary operation, false + /// otherwise. + bool isSigned(const BinaryOperator &I) const; + + /// \returns True if the condition of 'select' operation \p I comes from a + /// signed 'icmp' operation, false otherwise. + bool isSigned(const SelectInst &I) const; + + /// \brief Promotes uniform 16 bit binary operation \p I to equivalent 32 bit + /// binary operation by sign or zero extending operands to 32 bits, replacing + /// 16 bit operation with equivalent 32 bit operation, and truncating the + /// result of 32 bit operation back to 16 bits. 16 bit division operation is + /// not promoted. + /// + /// \returns True if 16 bit binary operation is promoted to equivalent 32 bit + /// binary operation, false otherwise. 
+  bool promoteUniformI16OpToI32Op(BinaryOperator &I) const;
+
+  /// \brief Promotes uniform 16 bit 'icmp' operation \p I to 32 bit 'icmp'
+  /// operation by sign or zero extending operands to 32 bits, and replacing 16
+  /// bit operation with 32 bit operation.
+  ///
+  /// \returns True.
+  bool promoteUniformI16OpToI32Op(ICmpInst &I) const;
+
+  /// \brief Promotes uniform 16 bit 'select' operation \p I to 32 bit 'select'
+  /// operation by sign or zero extending operands to 32 bits, replacing 16 bit
+  /// operation with 32 bit operation, and truncating the result of 32 bit
+  /// operation back to 16 bits.
+  ///
+  /// \returns True.
+  bool promoteUniformI16OpToI32Op(SelectInst &I) const;
+
 public:
   static char ID;
   AMDGPUCodeGenPrepare(const TargetMachine *TM = nullptr) :
@@ -51,9 +106,10 @@
   bool visitFDiv(BinaryOperator &I);
 
-  bool visitInstruction(Instruction &I) {
-    return false;
-  }
+  bool visitInstruction(Instruction &I) { return false; }
+  bool visitBinaryOperator(BinaryOperator &I);
+  bool visitICmpInst(ICmpInst &I);
+  bool visitSelectInst(SelectInst &I);
 
   bool doInitialization(Module &M) override;
   bool runOnFunction(Function &F) override;
@@ -70,6 +126,150 @@
 } // End anonymous namespace
 
+Value *AMDGPUCodeGenPrepare::copyFlags(
+    const BinaryOperator &I, Value *V) const {
+  assert(isa<BinaryOperator>(V) && "V must be binary operator");
+
+  BinaryOperator *BinOp = cast<BinaryOperator>(V);
+  // nsw/nuw only exist on overflowing operators, exact only on (possibly)
+  // exact operators; copy whichever family the promoted opcode supports.
+  if (isa<OverflowingBinaryOperator>(BinOp)) {
+    BinOp->setHasNoSignedWrap(I.hasNoSignedWrap());
+    BinOp->setHasNoUnsignedWrap(I.hasNoUnsignedWrap());
+  } else if (isa<PossiblyExactOperator>(BinOp)) {
+    BinOp->setIsExact(I.isExact());
+  }
+
+  return V;
+}
+
+Type *AMDGPUCodeGenPrepare::getI16Ty(IRBuilder<> &B, const Type *T) const {
+  assert(isI32Ty(T) && "T must be 32 bits");
+
+  if (T->isIntegerTy())
+    return B.getInt16Ty();
+  return VectorType::get(B.getInt16Ty(), cast<VectorType>(T)->getNumElements());
+}
+
+Type *AMDGPUCodeGenPrepare::getI32Ty(IRBuilder<> &B, const Type *T) const {
+  assert(isI16Ty(T) && "T must be 16 bits");
+
+  if (T->isIntegerTy())
+    return B.getInt32Ty();
+  return VectorType::get(B.getInt32Ty(), cast<VectorType>(T)->getNumElements());
+}
+
+bool AMDGPUCodeGenPrepare::isI16Ty(const Type *T) const {
+  if (T->isIntegerTy(16))
+    return true;
+  if (!T->isVectorTy())
+    return false;
+  return cast<VectorType>(T)->getElementType()->isIntegerTy(16);
+}
+
+bool AMDGPUCodeGenPrepare::isI32Ty(const Type *T) const {
+  if (T->isIntegerTy(32))
+    return true;
+  if (!T->isVectorTy())
+    return false;
+  return cast<VectorType>(T)->getElementType()->isIntegerTy(32);
+}
+
+bool AMDGPUCodeGenPrepare::isSigned(const BinaryOperator &I) const {
+  return I.getOpcode() == Instruction::SDiv ||
+         I.getOpcode() == Instruction::SRem;
+}
+
+bool AMDGPUCodeGenPrepare::isSigned(const SelectInst &I) const {
+  // A select is treated as signed iff its condition is a signed icmp.
+  return isa<ICmpInst>(I.getOperand(0)) ?
+      cast<ICmpInst>(I.getOperand(0))->isSigned() : false;
+}
+
+bool AMDGPUCodeGenPrepare::promoteUniformI16OpToI32Op(BinaryOperator &I) const {
+  assert(isI16Ty(I.getType()) && "Op must be 16 bits");
+
+  // 16 bit division is not promoted (see the declaration's documentation).
+  if (I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::UDiv)
+    return false;
+
+  IRBuilder<> Builder(&I);
+  Builder.SetCurrentDebugLocation(I.getDebugLoc());
+
+  Type *I32Ty = getI32Ty(Builder, I.getType());
+  Value *ExtOp0 = nullptr;
+  Value *ExtOp1 = nullptr;
+  Value *ExtRes = nullptr;
+  Value *TruncRes = nullptr;
+
+  if (isSigned(I)) {
+    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
+    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
+  } else {
+    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
+    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
+  }
+  ExtRes = copyFlags(I, Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1));
+  TruncRes = Builder.CreateTrunc(ExtRes, getI16Ty(Builder, ExtRes->getType()));
+
+  I.replaceAllUsesWith(TruncRes);
+  I.eraseFromParent();
+
+  return true;
+}
+
+bool AMDGPUCodeGenPrepare::promoteUniformI16OpToI32Op(ICmpInst &I) const {
+  assert(isI16Ty(I.getOperand(0)->getType()) && "Op0 must be 16 bits");
+  assert(isI16Ty(I.getOperand(1)->getType()) && "Op1 must be 16 bits");
+
+  IRBuilder<> Builder(&I);
+  Builder.SetCurrentDebugLocation(I.getDebugLoc());
+
+  Type *I32TyOp0 = getI32Ty(Builder, I.getOperand(0)->getType());
+  Type *I32TyOp1 = getI32Ty(Builder, I.getOperand(1)->getType());
+  Value *ExtOp0 = nullptr;
+  Value *ExtOp1 = nullptr;
+  Value *NewICmp = nullptr;
+
+  if (I.isSigned()) {
+    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32TyOp0);
+    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32TyOp1);
+  } else {
+    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32TyOp0);
+    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32TyOp1);
+  }
+  NewICmp = Builder.CreateICmp(I.getPredicate(), ExtOp0, ExtOp1);
+
+  I.replaceAllUsesWith(NewICmp);
+  I.eraseFromParent();
+
+  return true;
+}
+
+bool AMDGPUCodeGenPrepare::promoteUniformI16OpToI32Op(SelectInst &I) const {
+  assert(isI16Ty(I.getType()) && "Op must be 16 bits");
+
+  IRBuilder<> Builder(&I);
+  Builder.SetCurrentDebugLocation(I.getDebugLoc());
+
+  Type *I32Ty = getI32Ty(Builder, I.getType());
+  Value *ExtOp1 = nullptr;
+  Value *ExtOp2 = nullptr;
+  Value *ExtRes = nullptr;
+  Value *TruncRes = nullptr;
+
+  // Operands 1 and 2 are the select's value operands; the i1 condition
+  // (operand 0) is reused unchanged.
+  if (isSigned(I)) {
+    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
+    ExtOp2 = Builder.CreateSExt(I.getOperand(2), I32Ty);
+  } else {
+    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
+    ExtOp2 = Builder.CreateZExt(I.getOperand(2), I32Ty);
+  }
+  ExtRes = Builder.CreateSelect(I.getOperand(0), ExtOp1, ExtOp2);
+  TruncRes = Builder.CreateTrunc(ExtRes, getI16Ty(Builder, ExtRes->getType()));
+
+  I.replaceAllUsesWith(TruncRes);
+  I.eraseFromParent();
+
+  return true;
+}
+
 static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv) {
   const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);
   if (!CNum)
@@ -154,6 +354,37 @@
   return Attr.getValueAsString() == "true";
 }
 
+bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
+  bool Changed = false;
+
+  // TODO: Should we promote smaller types that will be legalized to i16?
+  if (!ST->has16BitInsts() && isI16Ty(I.getType()) && DA->isUniform(&I))
+    Changed |= promoteUniformI16OpToI32Op(I);
+
+  return Changed;
+}
+
+bool AMDGPUCodeGenPrepare::visitICmpInst(ICmpInst &I) {
+  bool Changed = false;
+
+  // TODO: Should we promote smaller types that will be legalized to i16?
+  if (!ST->has16BitInsts() && isI16Ty(I.getOperand(0)->getType()) &&
+      isI16Ty(I.getOperand(1)->getType()) && DA->isUniform(&I))
+    Changed |= promoteUniformI16OpToI32Op(I);
+
+  return Changed;
+}
+
+bool AMDGPUCodeGenPrepare::visitSelectInst(SelectInst &I) {
+  bool Changed = false;
+
+  // TODO: Should we promote smaller types that will be legalized to i16?
+  if (!ST->has16BitInsts() && isI16Ty(I.getType()) && DA->isUniform(&I))
+    Changed |= promoteUniformI16OpToI32Op(I);
+
+  return Changed;
+}
+
 bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
   Mod = &M;
   return false;
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -540,6 +540,10 @@
 bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {
+  // i16 is not desirable unless it is a load or a store.
+  if (VT == MVT::i16 && Op != ISD::LOAD && Op != ISD::STORE)
+    return false;
+
   // SimplifySetCC uses this function to determine whether or not it should
   // create setcc with i1 operands. We don't have instructions for i1 setcc.
if (VT == MVT::i1 && Op == ISD::SETCC) Index: test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll @@ -0,0 +1,246 @@ +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-codegenprepare %s | FileCheck %s +; RUN: opt -S -amdgpu-codegenprepare %s | FileCheck -check-prefix=NOOP %s +; Make sure this doesn't crash with no triple + +; NOOP-LABEL: @noop_fdiv_fpmath( +; NOOP: %md.25ulp = fdiv float %a, %b, !fpmath !0 +define void @noop_fdiv_fpmath(float addrspace(1)* %out, float %a, float %b) #3 { + %md.25ulp = fdiv float %a, %b, !fpmath !0 + store volatile float %md.25ulp, float addrspace(1)* %out + ret void +} + +; CHECK-LABEL: @fdiv_fpmath( +; CHECK: %no.md = fdiv float %a, %b{{$}} +; CHECK: %md.half.ulp = fdiv float %a, %b, !fpmath !1 +; CHECK: %md.1ulp = fdiv float %a, %b, !fpmath !2 +; CHECK: %md.25ulp = call float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0 +; CHECK: %md.3ulp = call float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !3 +; CHECK: %fast.md.25ulp = call fast float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0 +; CHECK: arcp.md.25ulp = call arcp float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0 +define void @fdiv_fpmath(float addrspace(1)* %out, float %a, float %b) #1 { + %no.md = fdiv float %a, %b + store volatile float %no.md, float addrspace(1)* %out + + %md.half.ulp = fdiv float %a, %b, !fpmath !1 + store volatile float %md.half.ulp, float addrspace(1)* %out + + %md.1ulp = fdiv float %a, %b, !fpmath !2 + store volatile float %md.1ulp, float addrspace(1)* %out + + %md.25ulp = fdiv float %a, %b, !fpmath !0 + store volatile float %md.25ulp, float addrspace(1)* %out + + %md.3ulp = fdiv float %a, %b, !fpmath !3 + store volatile float %md.3ulp, float addrspace(1)* %out + + %fast.md.25ulp = fdiv fast float %a, %b, !fpmath !0 + store volatile float %fast.md.25ulp, float addrspace(1)* %out + + 
%arcp.md.25ulp = fdiv arcp float %a, %b, !fpmath !0 + store volatile float %arcp.md.25ulp, float addrspace(1)* %out + + ret void +} + +; CHECK-LABEL: @rcp_fdiv_fpmath( +; CHECK: %no.md = fdiv float 1.000000e+00, %x{{$}} +; CHECK: %md.25ulp = fdiv float 1.000000e+00, %x, !fpmath !0 +; CHECK: %md.half.ulp = fdiv float 1.000000e+00, %x, !fpmath !1 +; CHECK: %arcp.no.md = fdiv arcp float 1.000000e+00, %x{{$}} +; CHECK: %arcp.25ulp = fdiv arcp float 1.000000e+00, %x, !fpmath !0 +; CHECK: %fast.no.md = fdiv fast float 1.000000e+00, %x{{$}} +; CHECK: %fast.25ulp = fdiv fast float 1.000000e+00, %x, !fpmath !0 +define void @rcp_fdiv_fpmath(float addrspace(1)* %out, float %x) #1 { + %no.md = fdiv float 1.0, %x + store volatile float %no.md, float addrspace(1)* %out + + %md.25ulp = fdiv float 1.0, %x, !fpmath !0 + store volatile float %md.25ulp, float addrspace(1)* %out + + %md.half.ulp = fdiv float 1.0, %x, !fpmath !1 + store volatile float %md.half.ulp, float addrspace(1)* %out + + %arcp.no.md = fdiv arcp float 1.0, %x + store volatile float %arcp.no.md, float addrspace(1)* %out + + %arcp.25ulp = fdiv arcp float 1.0, %x, !fpmath !0 + store volatile float %arcp.25ulp, float addrspace(1)* %out + + %fast.no.md = fdiv fast float 1.0, %x + store volatile float %fast.no.md, float addrspace(1)* %out + + %fast.25ulp = fdiv fast float 1.0, %x, !fpmath !0 + store volatile float %fast.25ulp, float addrspace(1)* %out + + ret void +} + +; CHECK-LABEL: @fdiv_fpmath_vector( +; CHECK: %no.md = fdiv <2 x float> %a, %b{{$}} +; CHECK: %md.half.ulp = fdiv <2 x float> %a, %b, !fpmath !1 +; CHECK: %md.1ulp = fdiv <2 x float> %a, %b, !fpmath !2 + +; CHECK: %[[A0:[0-9]+]] = extractelement <2 x float> %a, i64 0 +; CHECK: %[[B0:[0-9]+]] = extractelement <2 x float> %b, i64 0 +; CHECK: %[[FDIV0:[0-9]+]] = call float @llvm.amdgcn.fdiv.fast(float %[[A0]], float %[[B0]]), !fpmath !0 +; CHECK: %[[INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[FDIV0]], i64 0 +; CHECK: %[[A1:[0-9]+]] = 
extractelement <2 x float> %a, i64 1
+; CHECK: %[[B1:[0-9]+]] = extractelement <2 x float> %b, i64 1
+; CHECK: %[[FDIV1:[0-9]+]] = call float @llvm.amdgcn.fdiv.fast(float %[[A1]], float %[[B1]]), !fpmath !0
+; CHECK: %md.25ulp = insertelement <2 x float> %[[INS0]], float %[[FDIV1]], i64 1
+define void @fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #1 {
+  %no.md = fdiv <2 x float> %a, %b
+  store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out
+
+  %md.half.ulp = fdiv <2 x float> %a, %b, !fpmath !1
+  store volatile <2 x float> %md.half.ulp, <2 x float> addrspace(1)* %out
+
+  %md.1ulp = fdiv <2 x float> %a, %b, !fpmath !2
+  store volatile <2 x float> %md.1ulp, <2 x float> addrspace(1)* %out
+
+  %md.25ulp = fdiv <2 x float> %a, %b, !fpmath !0
+  store volatile <2 x float> %md.25ulp, <2 x float> addrspace(1)* %out
+
+  ret void
+}
+
+; CHECK-LABEL: @rcp_fdiv_fpmath_vector(
+; CHECK: %no.md = fdiv <2 x float> <float 1.000000e+00, float 1.000000e+00>, %x{{$}}
+; CHECK: %md.half.ulp = fdiv <2 x float> <float 1.000000e+00, float 1.000000e+00>, %x, !fpmath !1
+; CHECK: %arcp.no.md = fdiv arcp <2 x float> <float 1.000000e+00, float 1.000000e+00>, %x{{$}}
+; CHECK: %fast.no.md = fdiv fast <2 x float> <float 1.000000e+00, float 1.000000e+00>, %x{{$}}
+
+; CHECK: extractelement <2 x float> %x
+; CHECK: fdiv arcp float 1.000000e+00, %{{[0-9]+}}, !fpmath !0
+; CHECK: extractelement <2 x float> %x
+; CHECK: fdiv arcp float 1.000000e+00, %{{[0-9]+}}, !fpmath !0
+; CHECK: store volatile <2 x float> %arcp.25ulp
+
+; CHECK: fdiv fast float 1.000000e+00, %{{[0-9]+}}, !fpmath !0
+; CHECK: fdiv fast float 1.000000e+00, %{{[0-9]+}}, !fpmath !0
+; CHECK: store volatile <2 x float> %fast.25ulp, <2 x float> addrspace(1)* %out
+define void @rcp_fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2 x float> %x) #1 {
+  %no.md = fdiv <2 x float> <float 1.0, float 1.0>, %x
+  store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out
+
+  %md.half.ulp = fdiv <2 x float> <float 1.0, float 1.0>, %x, !fpmath !1
+  store volatile <2 x float> %md.half.ulp, <2 x float> addrspace(1)* %out
+
+  %arcp.no.md = fdiv arcp <2 x float> <float 1.0, float 1.0>, %x
+  store volatile <2 x float> %arcp.no.md, <2 x float> addrspace(1)* %out
+
+  %fast.no.md = fdiv fast <2 x float> <float 1.0, float 1.0>, %x
+  store volatile <2 x float> %fast.no.md, <2 x float> addrspace(1)* %out
+
+  %arcp.25ulp = fdiv arcp <2 x float> <float 1.0, float 1.0>, %x, !fpmath !0
+  store volatile <2 x float> %arcp.25ulp, <2 x float> addrspace(1)* %out
+
+  %fast.25ulp = fdiv fast <2 x float> <float 1.0, float 1.0>, %x, !fpmath !0
+  store volatile <2 x float> %fast.25ulp, <2 x float> addrspace(1)* %out
+
+  ret void
+}
+
+; CHECK-LABEL: @rcp_fdiv_fpmath_vector_nonsplat(
+; CHECK: %no.md = fdiv <2 x float> <float 1.000000e+00, float 2.000000e+00>, %x
+; CHECK: %arcp.no.md = fdiv arcp <2 x float> <float 1.000000e+00, float 2.000000e+00>, %x
+; CHECK: %fast.no.md = fdiv fast <2 x float> <float 1.000000e+00, float 2.000000e+00>, %x{{$}}
+
+; CHECK: %[[X0:[0-9]+]] = extractelement <2 x float> %x, i64 0
+; CHECK: fdiv arcp float 1.000000e+00, %[[X0]], !fpmath !0
+; CHECK: %[[X1:[0-9]+]] = extractelement <2 x float> %x, i64 1
+; CHECK: fdiv arcp float 2.000000e+00, %[[X1]], !fpmath !0
+; CHECK: store volatile <2 x float> %arcp.25ulp
+
+; CHECK: %[[X0:[0-9]+]] = extractelement <2 x float> %x, i64 0
+; CHECK: fdiv fast float 1.000000e+00, %[[X0]], !fpmath !0
+; CHECK: %[[X1:[0-9]+]] = extractelement <2 x float> %x, i64 1
+; CHECK: fdiv fast float 2.000000e+00, %[[X1]], !fpmath !0
+; CHECK: store volatile <2 x float> %fast.25ulp
+define void @rcp_fdiv_fpmath_vector_nonsplat(<2 x float> addrspace(1)* %out, <2 x float> %x) #1 {
+  %no.md = fdiv <2 x float> <float 1.0, float 2.0>, %x
+  store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out
+
+  %arcp.no.md = fdiv arcp <2 x float> <float 1.0, float 2.0>, %x
+  store volatile <2 x float> %arcp.no.md, <2 x float> addrspace(1)* %out
+
+  %fast.no.md = fdiv fast <2 x float> <float 1.0, float 2.0>, %x
+  store volatile <2 x float> %fast.no.md, <2 x float> addrspace(1)* %out
+
+  %arcp.25ulp = fdiv arcp <2 x float> <float 1.0, float 2.0>, %x, !fpmath !0
+  store volatile <2 x float> %arcp.25ulp, <2 x float> addrspace(1)* %out
+
+  %fast.25ulp = fdiv fast <2 x float> <float 1.0, float 2.0>, %x, !fpmath !0
+  store volatile <2 x float> %fast.25ulp, <2 x float> addrspace(1)* %out
+
+  ret void
+}
+
+; FIXME: Should be able to get fdiv for
1.0 component +; CHECK-LABEL: @rcp_fdiv_fpmath_vector_partial_constant( +; CHECK: call arcp float @llvm.amdgcn.fdiv.fast(float %{{[0-9]+}}, float %{{[0-9]+}}), !fpmath !0 +; CHECK: call arcp float @llvm.amdgcn.fdiv.fast(float %{{[0-9]+}}, float %{{[0-9]+}}), !fpmath !0 +; CHECK: store volatile <2 x float> %arcp.25ulp + +; CHECK: call fast float @llvm.amdgcn.fdiv.fast(float %{{[0-9]+}}, float %{{[0-9]+}}), !fpmath !0 +; CHECK: call fast float @llvm.amdgcn.fdiv.fast(float %{{[0-9]+}}, float %{{[0-9]+}}), !fpmath !0 +; CHECK: store volatile <2 x float> %fast.25ulp +define void @rcp_fdiv_fpmath_vector_partial_constant(<2 x float> addrspace(1)* %out, <2 x float> %x, <2 x float> %y) #1 { + %x.insert = insertelement <2 x float> %x, float 1.0, i32 0 + + %arcp.25ulp = fdiv arcp <2 x float> %x.insert, %y, !fpmath !0 + store volatile <2 x float> %arcp.25ulp, <2 x float> addrspace(1)* %out + + %fast.25ulp = fdiv fast <2 x float> %x.insert, %y, !fpmath !0 + store volatile <2 x float> %fast.25ulp, <2 x float> addrspace(1)* %out + + ret void +} + +; CHECK-LABEL: @fdiv_fpmath_f32_denormals( +; CHECK: %no.md = fdiv float %a, %b{{$}} +; CHECK: %md.half.ulp = fdiv float %a, %b, !fpmath !1 +; CHECK: %md.1ulp = fdiv float %a, %b, !fpmath !2 +; CHECK: %md.25ulp = fdiv float %a, %b, !fpmath !0 +; CHECK: %md.3ulp = fdiv float %a, %b, !fpmath !3 +; CHECK: call fast float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0 +; CHECK: call arcp float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0 +define void @fdiv_fpmath_f32_denormals(float addrspace(1)* %out, float %a, float %b) #2 { + %no.md = fdiv float %a, %b + store volatile float %no.md, float addrspace(1)* %out + + %md.half.ulp = fdiv float %a, %b, !fpmath !1 + store volatile float %md.half.ulp, float addrspace(1)* %out + + %md.1ulp = fdiv float %a, %b, !fpmath !2 + store volatile float %md.1ulp, float addrspace(1)* %out + + %md.25ulp = fdiv float %a, %b, !fpmath !0 + store volatile float %md.25ulp, float addrspace(1)* %out 
+ + %md.3ulp = fdiv float %a, %b, !fpmath !3 + store volatile float %md.3ulp, float addrspace(1)* %out + + %fast.md.25ulp = fdiv fast float %a, %b, !fpmath !0 + store volatile float %fast.md.25ulp, float addrspace(1)* %out + + %arcp.md.25ulp = fdiv arcp float %a, %b, !fpmath !0 + store volatile float %arcp.md.25ulp, float addrspace(1)* %out + + ret void +} + +attributes #0 = { nounwind optnone noinline } +attributes #1 = { nounwind } +attributes #2 = { nounwind "target-features"="+fp32-denormals" } + +; CHECK: !0 = !{float 2.500000e+00} +; CHECK: !1 = !{float 5.000000e-01} +; CHECK: !2 = !{float 1.000000e+00} +; CHECK: !3 = !{float 3.000000e+00} + +!0 = !{float 2.500000e+00} +!1 = !{float 5.000000e-01} +!2 = !{float 1.000000e+00} +!3 = !{float 3.000000e+00} Index: test/CodeGen/AMDGPU/amdgpu-codegenprepare-i16-to-i32.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/amdgpu-codegenprepare-i16-to-i32.ll @@ -0,0 +1,856 @@ +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-codegenprepare %s | FileCheck -check-prefix=SI %s +; RUN: opt -S -mtriple=amdgcn-- -mcpu=tonga -amdgpu-codegenprepare %s | FileCheck -check-prefix=VI %s + +; VI-NOT: zext +; VI-NOT: sext +; VI-NOT: trunc + +; SI-LABEL: @add_i16( +; SI: %[[A_32:[0-9]+]] = zext i16 %a to i32 +; SI: %[[B_32:[0-9]+]] = zext i16 %b to i32 +; SI: %[[R_32:[0-9]+]] = add i32 %[[A_32]], %[[B_32]] +; SI: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16 +; SI: ret i16 %[[R_16]] +define i16 @add_i16(i16 %a, i16 %b) { + %r = add i16 %a, %b + ret i16 %r +} + +; SI-LABEL: @add_nsw_i16( +; SI: %[[A_32:[0-9]+]] = zext i16 %a to i32 +; SI: %[[B_32:[0-9]+]] = zext i16 %b to i32 +; SI: %[[R_32:[0-9]+]] = add nsw i32 %[[A_32]], %[[B_32]] +; SI: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16 +; SI: ret i16 %[[R_16]] +define i16 @add_nsw_i16(i16 %a, i16 %b) { + %r = add nsw i16 %a, %b + ret i16 %r +} + +; SI-LABEL: @add_nuw_i16( +; SI: %[[A_32:[0-9]+]] = zext i16 %a to i32 +; SI: %[[B_32:[0-9]+]] 
= zext i16 %b to i32 +; SI: %[[R_32:[0-9]+]] = add nuw i32 %[[A_32]], %[[B_32]] +; SI: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16 +; SI: ret i16 %[[R_16]] +define i16 @add_nuw_i16(i16 %a, i16 %b) { + %r = add nuw i16 %a, %b + ret i16 %r +} + +; SI-LABEL: @add_nuw_nsw_i16( +; SI: %[[A_32:[0-9]+]] = zext i16 %a to i32 +; SI: %[[B_32:[0-9]+]] = zext i16 %b to i32 +; SI: %[[R_32:[0-9]+]] = add nuw nsw i32 %[[A_32]], %[[B_32]] +; SI: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16 +; SI: ret i16 %[[R_16]] +define i16 @add_nuw_nsw_i16(i16 %a, i16 %b) { + %r = add nuw nsw i16 %a, %b + ret i16 %r +} + +; SI-LABEL: @sub_i16( +; SI: %[[A_32:[0-9]+]] = zext i16 %a to i32 +; SI: %[[B_32:[0-9]+]] = zext i16 %b to i32 +; SI: %[[R_32:[0-9]+]] = sub i32 %[[A_32]], %[[B_32]] +; SI: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16 +; SI: ret i16 %[[R_16]] +define i16 @sub_i16(i16 %a, i16 %b) { + %r = sub i16 %a, %b + ret i16 %r +} + +; SI-LABEL: @sub_nsw_i16( +; SI: %[[A_32:[0-9]+]] = zext i16 %a to i32 +; SI: %[[B_32:[0-9]+]] = zext i16 %b to i32 +; SI: %[[R_32:[0-9]+]] = sub nsw i32 %[[A_32]], %[[B_32]] +; SI: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16 +; SI: ret i16 %[[R_16]] +define i16 @sub_nsw_i16(i16 %a, i16 %b) { + %r = sub nsw i16 %a, %b + ret i16 %r +} + +; SI-LABEL: @sub_nuw_i16( +; SI: %[[A_32:[0-9]+]] = zext i16 %a to i32 +; SI: %[[B_32:[0-9]+]] = zext i16 %b to i32 +; SI: %[[R_32:[0-9]+]] = sub nuw i32 %[[A_32]], %[[B_32]] +; SI: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16 +; SI: ret i16 %[[R_16]] +define i16 @sub_nuw_i16(i16 %a, i16 %b) { + %r = sub nuw i16 %a, %b + ret i16 %r +} + +; SI-LABEL: @sub_nuw_nsw_i16( +; SI: %[[A_32:[0-9]+]] = zext i16 %a to i32 +; SI: %[[B_32:[0-9]+]] = zext i16 %b to i32 +; SI: %[[R_32:[0-9]+]] = sub nuw nsw i32 %[[A_32]], %[[B_32]] +; SI: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16 +; SI: ret i16 %[[R_16]] +define i16 @sub_nuw_nsw_i16(i16 %a, i16 %b) { + %r = sub nuw nsw i16 %a, %b + ret i16 %r +} + +; SI-LABEL: @mul_i16( +; SI: 
%[[A_32:[0-9]+]] = zext i16 %a to i32 +; SI: %[[B_32:[0-9]+]] = zext i16 %b to i32 +; SI: %[[R_32:[0-9]+]] = mul i32 %[[A_32]], %[[B_32]] +; SI: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16 +; SI: ret i16 %[[R_16]] +define i16 @mul_i16(i16 %a, i16 %b) { + %r = mul i16 %a, %b + ret i16 %r +} + +; SI-LABEL: @mul_nsw_i16( +; SI: %[[A_32:[0-9]+]] = zext i16 %a to i32 +; SI: %[[B_32:[0-9]+]] = zext i16 %b to i32 +; SI: %[[R_32:[0-9]+]] = mul nsw i32 %[[A_32]], %[[B_32]] +; SI: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16 +; SI: ret i16 %[[R_16]] +define i16 @mul_nsw_i16(i16 %a, i16 %b) { + %r = mul nsw i16 %a, %b + ret i16 %r +} + +; SI-LABEL: @mul_nuw_i16( +; SI: %[[A_32:[0-9]+]] = zext i16 %a to i32 +; SI: %[[B_32:[0-9]+]] = zext i16 %b to i32 +; SI: %[[R_32:[0-9]+]] = mul nuw i32 %[[A_32]], %[[B_32]] +; SI: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16 +; SI: ret i16 %[[R_16]] +define i16 @mul_nuw_i16(i16 %a, i16 %b) { + %r = mul nuw i16 %a, %b + ret i16 %r +} + +; SI-LABEL: @mul_nuw_nsw_i16( +; SI: %[[A_32:[0-9]+]] = zext i16 %a to i32 +; SI: %[[B_32:[0-9]+]] = zext i16 %b to i32 +; SI: %[[R_32:[0-9]+]] = mul nuw nsw i32 %[[A_32]], %[[B_32]] +; SI: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16 +; SI: ret i16 %[[R_16]] +define i16 @mul_nuw_nsw_i16(i16 %a, i16 %b) { + %r = mul nuw nsw i16 %a, %b + ret i16 %r +} + +; SI-LABEL: @urem_i16( +; SI: %[[A_32:[0-9]+]] = zext i16 %a to i32 +; SI: %[[B_32:[0-9]+]] = zext i16 %b to i32 +; SI: %[[R_32:[0-9]+]] = urem i32 %[[A_32]], %[[B_32]] +; SI: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16 +; SI: ret i16 %[[R_16]] +define i16 @urem_i16(i16 %a, i16 %b) { + %r = urem i16 %a, %b + ret i16 %r +} + +; SI-LABEL: @srem_i16( +; SI: %[[A_32:[0-9]+]] = sext i16 %a to i32 +; SI: %[[B_32:[0-9]+]] = sext i16 %b to i32 +; SI: %[[R_32:[0-9]+]] = srem i32 %[[A_32]], %[[B_32]] +; SI: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16 +; SI: ret i16 %[[R_16]] +define i16 @srem_i16(i16 %a, i16 %b) { + %r = srem i16 %a, %b + ret i16 %r +} + +; 
SI-LABEL: @shl_i16( +; SI: %[[A_32:[0-9]+]] = zext i16 %a to i32 +; SI: %[[B_32:[0-9]+]] = zext i16 %b to i32 +; SI: %[[R_32:[0-9]+]] = shl i32 %[[A_32]], %[[B_32]] +; SI: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16 +; SI: ret i16 %[[R_16]] +define i16 @shl_i16(i16 %a, i16 %b) { + %r = shl i16 %a, %b + ret i16 %r +} + +; SI-LABEL: @shl_nsw_i16( +; SI: %[[A_32:[0-9]+]] = zext i16 %a to i32 +; SI: %[[B_32:[0-9]+]] = zext i16 %b to i32 +; SI: %[[R_32:[0-9]+]] = shl nsw i32 %[[A_32]], %[[B_32]] +; SI: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16 +; SI: ret i16 %[[R_16]] +define i16 @shl_nsw_i16(i16 %a, i16 %b) { + %r = shl nsw i16 %a, %b + ret i16 %r +} + +; SI-LABEL: @shl_nuw_i16( +; SI: %[[A_32:[0-9]+]] = zext i16 %a to i32 +; SI: %[[B_32:[0-9]+]] = zext i16 %b to i32 +; SI: %[[R_32:[0-9]+]] = shl nuw i32 %[[A_32]], %[[B_32]] +; SI: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16 +; SI: ret i16 %[[R_16]] +define i16 @shl_nuw_i16(i16 %a, i16 %b) { + %r = shl nuw i16 %a, %b + ret i16 %r +} + +; SI-LABEL: @shl_nuw_nsw_i16( +; SI: %[[A_32:[0-9]+]] = zext i16 %a to i32 +; SI: %[[B_32:[0-9]+]] = zext i16 %b to i32 +; SI: %[[R_32:[0-9]+]] = shl nuw nsw i32 %[[A_32]], %[[B_32]] +; SI: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16 +; SI: ret i16 %[[R_16]] +define i16 @shl_nuw_nsw_i16(i16 %a, i16 %b) { + %r = shl nuw nsw i16 %a, %b + ret i16 %r +} + +; SI-LABEL: @lshr_i16( +; SI: %[[A_32:[0-9]+]] = zext i16 %a to i32 +; SI: %[[B_32:[0-9]+]] = zext i16 %b to i32 +; SI: %[[R_32:[0-9]+]] = lshr i32 %[[A_32]], %[[B_32]] +; SI: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16 +; SI: ret i16 %[[R_16]] +define i16 @lshr_i16(i16 %a, i16 %b) { + %r = lshr i16 %a, %b + ret i16 %r +} + +; SI-LABEL: @lshr_exact_i16( +; SI: %[[A_32:[0-9]+]] = zext i16 %a to i32 +; SI: %[[B_32:[0-9]+]] = zext i16 %b to i32 +; SI: %[[R_32:[0-9]+]] = lshr exact i32 %[[A_32]], %[[B_32]] +; SI: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16 +; SI: ret i16 %[[R_16]] +define i16 @lshr_exact_i16(i16 %a, i16 %b) { + 
%r = lshr exact i16 %a, %b + ret i16 %r +} + +; SI-LABEL: @ashr_i16( +; SI: %[[A_32:[0-9]+]] = zext i16 %a to i32 +; SI: %[[B_32:[0-9]+]] = zext i16 %b to i32 +; SI: %[[R_32:[0-9]+]] = ashr i32 %[[A_32]], %[[B_32]] +; SI: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16 +; SI: ret i16 %[[R_16]] +define i16 @ashr_i16(i16 %a, i16 %b) { + %r = ashr i16 %a, %b + ret i16 %r +} + +; SI-LABEL: @ashr_exact_i16( +; SI: %[[A_32:[0-9]+]] = zext i16 %a to i32 +; SI: %[[B_32:[0-9]+]] = zext i16 %b to i32 +; SI: %[[R_32:[0-9]+]] = ashr exact i32 %[[A_32]], %[[B_32]] +; SI: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16 +; SI: ret i16 %[[R_16]] +define i16 @ashr_exact_i16(i16 %a, i16 %b) { + %r = ashr exact i16 %a, %b + ret i16 %r +} + +; SI-LABEL: @and_i16( +; SI: %[[A_32:[0-9]+]] = zext i16 %a to i32 +; SI: %[[B_32:[0-9]+]] = zext i16 %b to i32 +; SI: %[[R_32:[0-9]+]] = and i32 %[[A_32]], %[[B_32]] +; SI: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16 +; SI: ret i16 %[[R_16]] +define i16 @and_i16(i16 %a, i16 %b) { + %r = and i16 %a, %b + ret i16 %r +} + +; SI-LABEL: @or_i16( +; SI: %[[A_32:[0-9]+]] = zext i16 %a to i32 +; SI: %[[B_32:[0-9]+]] = zext i16 %b to i32 +; SI: %[[R_32:[0-9]+]] = or i32 %[[A_32]], %[[B_32]] +; SI: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16 +; SI: ret i16 %[[R_16]] +define i16 @or_i16(i16 %a, i16 %b) { + %r = or i16 %a, %b + ret i16 %r +} + +; SI-LABEL: @xor_i16( +; SI: %[[A_32:[0-9]+]] = zext i16 %a to i32 +; SI: %[[B_32:[0-9]+]] = zext i16 %b to i32 +; SI: %[[R_32:[0-9]+]] = xor i32 %[[A_32]], %[[B_32]] +; SI: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16 +; SI: ret i16 %[[R_16]] +define i16 @xor_i16(i16 %a, i16 %b) { + %r = xor i16 %a, %b + ret i16 %r +} + +; SI-LABEL: @select_eq_i16( +; SI: %[[A_32_0:[0-9]+]] = zext i16 %a to i32 +; SI: %[[B_32_0:[0-9]+]] = zext i16 %b to i32 +; SI: %[[CMP:[0-9]+]] = icmp eq i32 %[[A_32_0]], %[[B_32_0]] +; SI: %[[A_32_1:[0-9]+]] = zext i16 %a to i32 +; SI: %[[B_32_1:[0-9]+]] = zext i16 %b to i32 +; SI: 
%[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]] +; SI: %[[SEL_16:[0-9]+]] = trunc i32 %[[SEL_32]] to i16 +; SI: ret i16 %[[SEL_16]] +define i16 @select_eq_i16(i16 %a, i16 %b) { + %cmp = icmp eq i16 %a, %b + %sel = select i1 %cmp, i16 %a, i16 %b + ret i16 %sel +} + +; SI-LABEL: @select_ne_i16( +; SI: %[[A_32_0:[0-9]+]] = zext i16 %a to i32 +; SI: %[[B_32_0:[0-9]+]] = zext i16 %b to i32 +; SI: %[[CMP:[0-9]+]] = icmp ne i32 %[[A_32_0]], %[[B_32_0]] +; SI: %[[A_32_1:[0-9]+]] = zext i16 %a to i32 +; SI: %[[B_32_1:[0-9]+]] = zext i16 %b to i32 +; SI: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]] +; SI: %[[SEL_16:[0-9]+]] = trunc i32 %[[SEL_32]] to i16 +; SI: ret i16 %[[SEL_16]] +define i16 @select_ne_i16(i16 %a, i16 %b) { + %cmp = icmp ne i16 %a, %b + %sel = select i1 %cmp, i16 %a, i16 %b + ret i16 %sel +} + +; SI-LABEL: @select_ugt_i16( +; SI: %[[A_32_0:[0-9]+]] = zext i16 %a to i32 +; SI: %[[B_32_0:[0-9]+]] = zext i16 %b to i32 +; SI: %[[CMP:[0-9]+]] = icmp ugt i32 %[[A_32_0]], %[[B_32_0]] +; SI: %[[A_32_1:[0-9]+]] = zext i16 %a to i32 +; SI: %[[B_32_1:[0-9]+]] = zext i16 %b to i32 +; SI: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]] +; SI: %[[SEL_16:[0-9]+]] = trunc i32 %[[SEL_32]] to i16 +; SI: ret i16 %[[SEL_16]] +define i16 @select_ugt_i16(i16 %a, i16 %b) { + %cmp = icmp ugt i16 %a, %b + %sel = select i1 %cmp, i16 %a, i16 %b + ret i16 %sel +} + +; SI-LABEL: @select_uge_i16( +; SI: %[[A_32_0:[0-9]+]] = zext i16 %a to i32 +; SI: %[[B_32_0:[0-9]+]] = zext i16 %b to i32 +; SI: %[[CMP:[0-9]+]] = icmp uge i32 %[[A_32_0]], %[[B_32_0]] +; SI: %[[A_32_1:[0-9]+]] = zext i16 %a to i32 +; SI: %[[B_32_1:[0-9]+]] = zext i16 %b to i32 +; SI: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]] +; SI: %[[SEL_16:[0-9]+]] = trunc i32 %[[SEL_32]] to i16 +; SI: ret i16 %[[SEL_16]] +define i16 @select_uge_i16(i16 %a, i16 %b) { + %cmp = icmp uge i16 %a, %b + %sel = select i1 %cmp, i16 
%a, i16 %b + ret i16 %sel +} + +; SI-LABEL: @select_ult_i16( +; SI: %[[A_32_0:[0-9]+]] = zext i16 %a to i32 +; SI: %[[B_32_0:[0-9]+]] = zext i16 %b to i32 +; SI: %[[CMP:[0-9]+]] = icmp ult i32 %[[A_32_0]], %[[B_32_0]] +; SI: %[[A_32_1:[0-9]+]] = zext i16 %a to i32 +; SI: %[[B_32_1:[0-9]+]] = zext i16 %b to i32 +; SI: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]] +; SI: %[[SEL_16:[0-9]+]] = trunc i32 %[[SEL_32]] to i16 +; SI: ret i16 %[[SEL_16]] +define i16 @select_ult_i16(i16 %a, i16 %b) { + %cmp = icmp ult i16 %a, %b + %sel = select i1 %cmp, i16 %a, i16 %b + ret i16 %sel +} + +; SI-LABEL: @select_ule_i16( +; SI: %[[A_32_0:[0-9]+]] = zext i16 %a to i32 +; SI: %[[B_32_0:[0-9]+]] = zext i16 %b to i32 +; SI: %[[CMP:[0-9]+]] = icmp ule i32 %[[A_32_0]], %[[B_32_0]] +; SI: %[[A_32_1:[0-9]+]] = zext i16 %a to i32 +; SI: %[[B_32_1:[0-9]+]] = zext i16 %b to i32 +; SI: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]] +; SI: %[[SEL_16:[0-9]+]] = trunc i32 %[[SEL_32]] to i16 +; SI: ret i16 %[[SEL_16]] +define i16 @select_ule_i16(i16 %a, i16 %b) { + %cmp = icmp ule i16 %a, %b + %sel = select i1 %cmp, i16 %a, i16 %b + ret i16 %sel +} + +; SI-LABEL: @select_sgt_i16( +; SI: %[[A_32_0:[0-9]+]] = sext i16 %a to i32 +; SI: %[[B_32_0:[0-9]+]] = sext i16 %b to i32 +; SI: %[[CMP:[0-9]+]] = icmp sgt i32 %[[A_32_0]], %[[B_32_0]] +; SI: %[[A_32_1:[0-9]+]] = sext i16 %a to i32 +; SI: %[[B_32_1:[0-9]+]] = sext i16 %b to i32 +; SI: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]] +; SI: %[[SEL_16:[0-9]+]] = trunc i32 %[[SEL_32]] to i16 +; SI: ret i16 %[[SEL_16]] +define i16 @select_sgt_i16(i16 %a, i16 %b) { + %cmp = icmp sgt i16 %a, %b + %sel = select i1 %cmp, i16 %a, i16 %b + ret i16 %sel +} + +; SI-LABEL: @select_sge_i16( +; SI: %[[A_32_0:[0-9]+]] = sext i16 %a to i32 +; SI: %[[B_32_0:[0-9]+]] = sext i16 %b to i32 +; SI: %[[CMP:[0-9]+]] = icmp sge i32 %[[A_32_0]], %[[B_32_0]] +; SI: %[[A_32_1:[0-9]+]] = sext i16 
%a to i32 +; SI: %[[B_32_1:[0-9]+]] = sext i16 %b to i32 +; SI: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]] +; SI: %[[SEL_16:[0-9]+]] = trunc i32 %[[SEL_32]] to i16 +; SI: ret i16 %[[SEL_16]] +define i16 @select_sge_i16(i16 %a, i16 %b) { + %cmp = icmp sge i16 %a, %b + %sel = select i1 %cmp, i16 %a, i16 %b + ret i16 %sel +} + +; SI-LABEL: @select_slt_i16( +; SI: %[[A_32_0:[0-9]+]] = sext i16 %a to i32 +; SI: %[[B_32_0:[0-9]+]] = sext i16 %b to i32 +; SI: %[[CMP:[0-9]+]] = icmp slt i32 %[[A_32_0]], %[[B_32_0]] +; SI: %[[A_32_1:[0-9]+]] = sext i16 %a to i32 +; SI: %[[B_32_1:[0-9]+]] = sext i16 %b to i32 +; SI: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]] +; SI: %[[SEL_16:[0-9]+]] = trunc i32 %[[SEL_32]] to i16 +; SI: ret i16 %[[SEL_16]] +define i16 @select_slt_i16(i16 %a, i16 %b) { + %cmp = icmp slt i16 %a, %b + %sel = select i1 %cmp, i16 %a, i16 %b + ret i16 %sel +} + +; SI-LABEL: @select_sle_i16( +; SI: %[[A_32_0:[0-9]+]] = sext i16 %a to i32 +; SI: %[[B_32_0:[0-9]+]] = sext i16 %b to i32 +; SI: %[[CMP:[0-9]+]] = icmp sle i32 %[[A_32_0]], %[[B_32_0]] +; SI: %[[A_32_1:[0-9]+]] = sext i16 %a to i32 +; SI: %[[B_32_1:[0-9]+]] = sext i16 %b to i32 +; SI: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]] +; SI: %[[SEL_16:[0-9]+]] = trunc i32 %[[SEL_32]] to i16 +; SI: ret i16 %[[SEL_16]] +define i16 @select_sle_i16(i16 %a, i16 %b) { + %cmp = icmp sle i16 %a, %b + %sel = select i1 %cmp, i16 %a, i16 %b + ret i16 %sel +} + +; SI-LABEL: @add_3xi16( +; SI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32> +; SI: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32> +; SI: %[[R_32:[0-9]+]] = add <3 x i32> %[[A_32]], %[[B_32]] +; SI: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16> +; SI: ret <3 x i16> %[[R_16]] +define <3 x i16> @add_3xi16(<3 x i16> %a, <3 x i16> %b) { + %r = add <3 x i16> %a, %b + ret <3 x i16> %r +} + +; SI-LABEL: @add_nsw_3xi16( +; SI: %[[A_32:[0-9]+]] = zext <3 x i16> %a 
to <3 x i32> +; SI: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32> +; SI: %[[R_32:[0-9]+]] = add nsw <3 x i32> %[[A_32]], %[[B_32]] +; SI: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16> +; SI: ret <3 x i16> %[[R_16]] +define <3 x i16> @add_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) { + %r = add nsw <3 x i16> %a, %b + ret <3 x i16> %r +} + +; SI-LABEL: @add_nuw_3xi16( +; SI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32> +; SI: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32> +; SI: %[[R_32:[0-9]+]] = add nuw <3 x i32> %[[A_32]], %[[B_32]] +; SI: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16> +; SI: ret <3 x i16> %[[R_16]] +define <3 x i16> @add_nuw_3xi16(<3 x i16> %a, <3 x i16> %b) { + %r = add nuw <3 x i16> %a, %b + ret <3 x i16> %r +} + +; SI-LABEL: @add_nuw_nsw_3xi16( +; SI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32> +; SI: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32> +; SI: %[[R_32:[0-9]+]] = add nuw nsw <3 x i32> %[[A_32]], %[[B_32]] +; SI: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16> +; SI: ret <3 x i16> %[[R_16]] +define <3 x i16> @add_nuw_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) { + %r = add nuw nsw <3 x i16> %a, %b + ret <3 x i16> %r +} + +; SI-LABEL: @sub_3xi16( +; SI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32> +; SI: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32> +; SI: %[[R_32:[0-9]+]] = sub <3 x i32> %[[A_32]], %[[B_32]] +; SI: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16> +; SI: ret <3 x i16> %[[R_16]] +define <3 x i16> @sub_3xi16(<3 x i16> %a, <3 x i16> %b) { + %r = sub <3 x i16> %a, %b + ret <3 x i16> %r +} + +; SI-LABEL: @sub_nsw_3xi16( +; SI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32> +; SI: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32> +; SI: %[[R_32:[0-9]+]] = sub nsw <3 x i32> %[[A_32]], %[[B_32]] +; SI: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16> +; SI: ret <3 x i16> %[[R_16]] +define <3 x i16> @sub_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) { + %r = sub 
nsw <3 x i16> %a, %b + ret <3 x i16> %r +} + +; SI-LABEL: @sub_nuw_3xi16( +; SI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32> +; SI: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32> +; SI: %[[R_32:[0-9]+]] = sub nuw <3 x i32> %[[A_32]], %[[B_32]] +; SI: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16> +; SI: ret <3 x i16> %[[R_16]] +define <3 x i16> @sub_nuw_3xi16(<3 x i16> %a, <3 x i16> %b) { + %r = sub nuw <3 x i16> %a, %b + ret <3 x i16> %r +} + +; SI-LABEL: @sub_nuw_nsw_3xi16( +; SI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32> +; SI: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32> +; SI: %[[R_32:[0-9]+]] = sub nuw nsw <3 x i32> %[[A_32]], %[[B_32]] +; SI: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16> +; SI: ret <3 x i16> %[[R_16]] +define <3 x i16> @sub_nuw_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) { + %r = sub nuw nsw <3 x i16> %a, %b + ret <3 x i16> %r +} + +; SI-LABEL: @mul_3xi16( +; SI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32> +; SI: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32> +; SI: %[[R_32:[0-9]+]] = mul <3 x i32> %[[A_32]], %[[B_32]] +; SI: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16> +; SI: ret <3 x i16> %[[R_16]] +define <3 x i16> @mul_3xi16(<3 x i16> %a, <3 x i16> %b) { + %r = mul <3 x i16> %a, %b + ret <3 x i16> %r +} + +; SI-LABEL: @mul_nsw_3xi16( +; SI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32> +; SI: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32> +; SI: %[[R_32:[0-9]+]] = mul nsw <3 x i32> %[[A_32]], %[[B_32]] +; SI: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16> +; SI: ret <3 x i16> %[[R_16]] +define <3 x i16> @mul_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) { + %r = mul nsw <3 x i16> %a, %b + ret <3 x i16> %r +} + +; SI-LABEL: @mul_nuw_3xi16( +; SI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32> +; SI: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32> +; SI: %[[R_32:[0-9]+]] = mul nuw <3 x i32> %[[A_32]], %[[B_32]] +; SI: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to 
<3 x i16> +; SI: ret <3 x i16> %[[R_16]] +define <3 x i16> @mul_nuw_3xi16(<3 x i16> %a, <3 x i16> %b) { + %r = mul nuw <3 x i16> %a, %b + ret <3 x i16> %r +} + +; SI-LABEL: @mul_nuw_nsw_3xi16( +; SI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32> +; SI: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32> +; SI: %[[R_32:[0-9]+]] = mul nuw nsw <3 x i32> %[[A_32]], %[[B_32]] +; SI: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16> +; SI: ret <3 x i16> %[[R_16]] +define <3 x i16> @mul_nuw_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) { + %r = mul nuw nsw <3 x i16> %a, %b + ret <3 x i16> %r +} + +; SI-LABEL: @urem_3xi16( +; SI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32> +; SI: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32> +; SI: %[[R_32:[0-9]+]] = urem <3 x i32> %[[A_32]], %[[B_32]] +; SI: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16> +; SI: ret <3 x i16> %[[R_16]] +define <3 x i16> @urem_3xi16(<3 x i16> %a, <3 x i16> %b) { + %r = urem <3 x i16> %a, %b + ret <3 x i16> %r +} + +; SI-LABEL: @srem_3xi16( +; SI: %[[A_32:[0-9]+]] = sext <3 x i16> %a to <3 x i32> +; SI: %[[B_32:[0-9]+]] = sext <3 x i16> %b to <3 x i32> +; SI: %[[R_32:[0-9]+]] = srem <3 x i32> %[[A_32]], %[[B_32]] +; SI: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16> +; SI: ret <3 x i16> %[[R_16]] +define <3 x i16> @srem_3xi16(<3 x i16> %a, <3 x i16> %b) { + %r = srem <3 x i16> %a, %b + ret <3 x i16> %r +} + +; SI-LABEL: @shl_3xi16( +; SI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32> +; SI: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32> +; SI: %[[R_32:[0-9]+]] = shl <3 x i32> %[[A_32]], %[[B_32]] +; SI: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16> +; SI: ret <3 x i16> %[[R_16]] +define <3 x i16> @shl_3xi16(<3 x i16> %a, <3 x i16> %b) { + %r = shl <3 x i16> %a, %b + ret <3 x i16> %r +} + +; SI-LABEL: @shl_nsw_3xi16( +; SI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32> +; SI: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32> +; SI: %[[R_32:[0-9]+]] = 
shl nsw <3 x i32> %[[A_32]], %[[B_32]] +; SI: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16> +; SI: ret <3 x i16> %[[R_16]] +define <3 x i16> @shl_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) { + %r = shl nsw <3 x i16> %a, %b + ret <3 x i16> %r +} + +; SI-LABEL: @shl_nuw_3xi16( +; SI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32> +; SI: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32> +; SI: %[[R_32:[0-9]+]] = shl nuw <3 x i32> %[[A_32]], %[[B_32]] +; SI: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16> +; SI: ret <3 x i16> %[[R_16]] +define <3 x i16> @shl_nuw_3xi16(<3 x i16> %a, <3 x i16> %b) { + %r = shl nuw <3 x i16> %a, %b + ret <3 x i16> %r +} + +; SI-LABEL: @shl_nuw_nsw_3xi16( +; SI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32> +; SI: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32> +; SI: %[[R_32:[0-9]+]] = shl nuw nsw <3 x i32> %[[A_32]], %[[B_32]] +; SI: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16> +; SI: ret <3 x i16> %[[R_16]] +define <3 x i16> @shl_nuw_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) { + %r = shl nuw nsw <3 x i16> %a, %b + ret <3 x i16> %r +} + +; SI-LABEL: @lshr_3xi16( +; SI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32> +; SI: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32> +; SI: %[[R_32:[0-9]+]] = lshr <3 x i32> %[[A_32]], %[[B_32]] +; SI: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16> +; SI: ret <3 x i16> %[[R_16]] +define <3 x i16> @lshr_3xi16(<3 x i16> %a, <3 x i16> %b) { + %r = lshr <3 x i16> %a, %b + ret <3 x i16> %r +} + +; SI-LABEL: @lshr_exact_3xi16( +; SI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32> +; SI: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32> +; SI: %[[R_32:[0-9]+]] = lshr exact <3 x i32> %[[A_32]], %[[B_32]] +; SI: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16> +; SI: ret <3 x i16> %[[R_16]] +define <3 x i16> @lshr_exact_3xi16(<3 x i16> %a, <3 x i16> %b) { + %r = lshr exact <3 x i16> %a, %b + ret <3 x i16> %r +} + +; SI-LABEL: @ashr_3xi16( +; SI: 
%[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32> +; SI: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32> +; SI: %[[R_32:[0-9]+]] = ashr <3 x i32> %[[A_32]], %[[B_32]] +; SI: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16> +; SI: ret <3 x i16> %[[R_16]] +define <3 x i16> @ashr_3xi16(<3 x i16> %a, <3 x i16> %b) { + %r = ashr <3 x i16> %a, %b + ret <3 x i16> %r +} + +; SI-LABEL: @ashr_exact_3xi16( +; SI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32> +; SI: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32> +; SI: %[[R_32:[0-9]+]] = ashr exact <3 x i32> %[[A_32]], %[[B_32]] +; SI: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16> +; SI: ret <3 x i16> %[[R_16]] +define <3 x i16> @ashr_exact_3xi16(<3 x i16> %a, <3 x i16> %b) { + %r = ashr exact <3 x i16> %a, %b + ret <3 x i16> %r +} + +; SI-LABEL: @and_3xi16( +; SI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32> +; SI: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32> +; SI: %[[R_32:[0-9]+]] = and <3 x i32> %[[A_32]], %[[B_32]] +; SI: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16> +; SI: ret <3 x i16> %[[R_16]] +define <3 x i16> @and_3xi16(<3 x i16> %a, <3 x i16> %b) { + %r = and <3 x i16> %a, %b + ret <3 x i16> %r +} + +; SI-LABEL: @or_3xi16( +; SI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32> +; SI: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32> +; SI: %[[R_32:[0-9]+]] = or <3 x i32> %[[A_32]], %[[B_32]] +; SI: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16> +; SI: ret <3 x i16> %[[R_16]] +define <3 x i16> @or_3xi16(<3 x i16> %a, <3 x i16> %b) { + %r = or <3 x i16> %a, %b + ret <3 x i16> %r +} + +; SI-LABEL: @xor_3xi16( +; SI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32> +; SI: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32> +; SI: %[[R_32:[0-9]+]] = xor <3 x i32> %[[A_32]], %[[B_32]] +; SI: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16> +; SI: ret <3 x i16> %[[R_16]] +define <3 x i16> @xor_3xi16(<3 x i16> %a, <3 x i16> %b) { + %r = xor <3 x i16> 
%a, %b + ret <3 x i16> %r +} + +; SI-LABEL: @select_eq_3xi16( +; SI: %[[A_32_0:[0-9]+]] = zext <3 x i16> %a to <3 x i32> +; SI: %[[B_32_0:[0-9]+]] = zext <3 x i16> %b to <3 x i32> +; SI: %[[CMP:[0-9]+]] = icmp eq <3 x i32> %[[A_32_0]], %[[B_32_0]] +; SI: %[[A_32_1:[0-9]+]] = zext <3 x i16> %a to <3 x i32> +; SI: %[[B_32_1:[0-9]+]] = zext <3 x i16> %b to <3 x i32> +; SI: %[[SEL_32:[0-9]+]] = select <3 x i1> %[[CMP]], <3 x i32> %[[A_32_1]], <3 x i32> %[[B_32_1]] +; SI: %[[SEL_16:[0-9]+]] = trunc <3 x i32> %[[SEL_32]] to <3 x i16> +; SI: ret <3 x i16> %[[SEL_16]] +define <3 x i16> @select_eq_3xi16(<3 x i16> %a, <3 x i16> %b) { + %cmp = icmp eq <3 x i16> %a, %b + %sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b + ret <3 x i16> %sel +} + +; SI-LABEL: @select_ne_3xi16( +; SI: %[[A_32_0:[0-9]+]] = zext <3 x i16> %a to <3 x i32> +; SI: %[[B_32_0:[0-9]+]] = zext <3 x i16> %b to <3 x i32> +; SI: %[[CMP:[0-9]+]] = icmp ne <3 x i32> %[[A_32_0]], %[[B_32_0]] +; SI: %[[A_32_1:[0-9]+]] = zext <3 x i16> %a to <3 x i32> +; SI: %[[B_32_1:[0-9]+]] = zext <3 x i16> %b to <3 x i32> +; SI: %[[SEL_32:[0-9]+]] = select <3 x i1> %[[CMP]], <3 x i32> %[[A_32_1]], <3 x i32> %[[B_32_1]] +; SI: %[[SEL_16:[0-9]+]] = trunc <3 x i32> %[[SEL_32]] to <3 x i16> +; SI: ret <3 x i16> %[[SEL_16]] +define <3 x i16> @select_ne_3xi16(<3 x i16> %a, <3 x i16> %b) { + %cmp = icmp ne <3 x i16> %a, %b + %sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b + ret <3 x i16> %sel +} + +; SI-LABEL: @select_ugt_3xi16( +; SI: %[[A_32_0:[0-9]+]] = zext <3 x i16> %a to <3 x i32> +; SI: %[[B_32_0:[0-9]+]] = zext <3 x i16> %b to <3 x i32> +; SI: %[[CMP:[0-9]+]] = icmp ugt <3 x i32> %[[A_32_0]], %[[B_32_0]] +; SI: %[[A_32_1:[0-9]+]] = zext <3 x i16> %a to <3 x i32> +; SI: %[[B_32_1:[0-9]+]] = zext <3 x i16> %b to <3 x i32> +; SI: %[[SEL_32:[0-9]+]] = select <3 x i1> %[[CMP]], <3 x i32> %[[A_32_1]], <3 x i32> %[[B_32_1]] +; SI: %[[SEL_16:[0-9]+]] = trunc <3 x i32> %[[SEL_32]] to <3 x i16> +; SI: ret <3 x i16> 
%[[SEL_16]] +define <3 x i16> @select_ugt_3xi16(<3 x i16> %a, <3 x i16> %b) { + %cmp = icmp ugt <3 x i16> %a, %b + %sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b + ret <3 x i16> %sel +} + +; SI-LABEL: @select_uge_3xi16( +; SI: %[[A_32_0:[0-9]+]] = zext <3 x i16> %a to <3 x i32> +; SI: %[[B_32_0:[0-9]+]] = zext <3 x i16> %b to <3 x i32> +; SI: %[[CMP:[0-9]+]] = icmp uge <3 x i32> %[[A_32_0]], %[[B_32_0]] +; SI: %[[A_32_1:[0-9]+]] = zext <3 x i16> %a to <3 x i32> +; SI: %[[B_32_1:[0-9]+]] = zext <3 x i16> %b to <3 x i32> +; SI: %[[SEL_32:[0-9]+]] = select <3 x i1> %[[CMP]], <3 x i32> %[[A_32_1]], <3 x i32> %[[B_32_1]] +; SI: %[[SEL_16:[0-9]+]] = trunc <3 x i32> %[[SEL_32]] to <3 x i16> +; SI: ret <3 x i16> %[[SEL_16]] +define <3 x i16> @select_uge_3xi16(<3 x i16> %a, <3 x i16> %b) { + %cmp = icmp uge <3 x i16> %a, %b + %sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b + ret <3 x i16> %sel +} + +; SI-LABEL: @select_ult_3xi16( +; SI: %[[A_32_0:[0-9]+]] = zext <3 x i16> %a to <3 x i32> +; SI: %[[B_32_0:[0-9]+]] = zext <3 x i16> %b to <3 x i32> +; SI: %[[CMP:[0-9]+]] = icmp ult <3 x i32> %[[A_32_0]], %[[B_32_0]] +; SI: %[[A_32_1:[0-9]+]] = zext <3 x i16> %a to <3 x i32> +; SI: %[[B_32_1:[0-9]+]] = zext <3 x i16> %b to <3 x i32> +; SI: %[[SEL_32:[0-9]+]] = select <3 x i1> %[[CMP]], <3 x i32> %[[A_32_1]], <3 x i32> %[[B_32_1]] +; SI: %[[SEL_16:[0-9]+]] = trunc <3 x i32> %[[SEL_32]] to <3 x i16> +; SI: ret <3 x i16> %[[SEL_16]] +define <3 x i16> @select_ult_3xi16(<3 x i16> %a, <3 x i16> %b) { + %cmp = icmp ult <3 x i16> %a, %b + %sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b + ret <3 x i16> %sel +} + +; SI-LABEL: @select_ule_3xi16( +; SI: %[[A_32_0:[0-9]+]] = zext <3 x i16> %a to <3 x i32> +; SI: %[[B_32_0:[0-9]+]] = zext <3 x i16> %b to <3 x i32> +; SI: %[[CMP:[0-9]+]] = icmp ule <3 x i32> %[[A_32_0]], %[[B_32_0]] +; SI: %[[A_32_1:[0-9]+]] = zext <3 x i16> %a to <3 x i32> +; SI: %[[B_32_1:[0-9]+]] = zext <3 x i16> %b to <3 x i32> +; SI: 
%[[SEL_32:[0-9]+]] = select <3 x i1> %[[CMP]], <3 x i32> %[[A_32_1]], <3 x i32> %[[B_32_1]] +; SI: %[[SEL_16:[0-9]+]] = trunc <3 x i32> %[[SEL_32]] to <3 x i16> +; SI: ret <3 x i16> %[[SEL_16]] +define <3 x i16> @select_ule_3xi16(<3 x i16> %a, <3 x i16> %b) { + %cmp = icmp ule <3 x i16> %a, %b + %sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b + ret <3 x i16> %sel +} + +; SI-LABEL: @select_sgt_3xi16( +; SI: %[[A_32_0:[0-9]+]] = sext <3 x i16> %a to <3 x i32> +; SI: %[[B_32_0:[0-9]+]] = sext <3 x i16> %b to <3 x i32> +; SI: %[[CMP:[0-9]+]] = icmp sgt <3 x i32> %[[A_32_0]], %[[B_32_0]] +; SI: %[[A_32_1:[0-9]+]] = sext <3 x i16> %a to <3 x i32> +; SI: %[[B_32_1:[0-9]+]] = sext <3 x i16> %b to <3 x i32> +; SI: %[[SEL_32:[0-9]+]] = select <3 x i1> %[[CMP]], <3 x i32> %[[A_32_1]], <3 x i32> %[[B_32_1]] +; SI: %[[SEL_16:[0-9]+]] = trunc <3 x i32> %[[SEL_32]] to <3 x i16> +; SI: ret <3 x i16> %[[SEL_16]] +define <3 x i16> @select_sgt_3xi16(<3 x i16> %a, <3 x i16> %b) { + %cmp = icmp sgt <3 x i16> %a, %b + %sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b + ret <3 x i16> %sel +} + +; SI-LABEL: @select_sge_3xi16( +; SI: %[[A_32_0:[0-9]+]] = sext <3 x i16> %a to <3 x i32> +; SI: %[[B_32_0:[0-9]+]] = sext <3 x i16> %b to <3 x i32> +; SI: %[[CMP:[0-9]+]] = icmp sge <3 x i32> %[[A_32_0]], %[[B_32_0]] +; SI: %[[A_32_1:[0-9]+]] = sext <3 x i16> %a to <3 x i32> +; SI: %[[B_32_1:[0-9]+]] = sext <3 x i16> %b to <3 x i32> +; SI: %[[SEL_32:[0-9]+]] = select <3 x i1> %[[CMP]], <3 x i32> %[[A_32_1]], <3 x i32> %[[B_32_1]] +; SI: %[[SEL_16:[0-9]+]] = trunc <3 x i32> %[[SEL_32]] to <3 x i16> +; SI: ret <3 x i16> %[[SEL_16]] +define <3 x i16> @select_sge_3xi16(<3 x i16> %a, <3 x i16> %b) { + %cmp = icmp sge <3 x i16> %a, %b + %sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b + ret <3 x i16> %sel +} + +; SI-LABEL: @select_slt_3xi16( +; SI: %[[A_32_0:[0-9]+]] = sext <3 x i16> %a to <3 x i32> +; SI: %[[B_32_0:[0-9]+]] = sext <3 x i16> %b to <3 x i32> +; SI: %[[CMP:[0-9]+]] = 
icmp slt <3 x i32> %[[A_32_0]], %[[B_32_0]] +; SI: %[[A_32_1:[0-9]+]] = sext <3 x i16> %a to <3 x i32> +; SI: %[[B_32_1:[0-9]+]] = sext <3 x i16> %b to <3 x i32> +; SI: %[[SEL_32:[0-9]+]] = select <3 x i1> %[[CMP]], <3 x i32> %[[A_32_1]], <3 x i32> %[[B_32_1]] +; SI: %[[SEL_16:[0-9]+]] = trunc <3 x i32> %[[SEL_32]] to <3 x i16> +; SI: ret <3 x i16> %[[SEL_16]] +define <3 x i16> @select_slt_3xi16(<3 x i16> %a, <3 x i16> %b) { + %cmp = icmp slt <3 x i16> %a, %b + %sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b + ret <3 x i16> %sel +} + +; SI-LABEL: @select_sle_3xi16( +; SI: %[[A_32_0:[0-9]+]] = sext <3 x i16> %a to <3 x i32> +; SI: %[[B_32_0:[0-9]+]] = sext <3 x i16> %b to <3 x i32> +; SI: %[[CMP:[0-9]+]] = icmp sle <3 x i32> %[[A_32_0]], %[[B_32_0]] +; SI: %[[A_32_1:[0-9]+]] = sext <3 x i16> %a to <3 x i32> +; SI: %[[B_32_1:[0-9]+]] = sext <3 x i16> %b to <3 x i32> +; SI: %[[SEL_32:[0-9]+]] = select <3 x i1> %[[CMP]], <3 x i32> %[[A_32_1]], <3 x i32> %[[B_32_1]] +; SI: %[[SEL_16:[0-9]+]] = trunc <3 x i32> %[[SEL_32]] to <3 x i16> +; SI: ret <3 x i16> %[[SEL_16]] +define <3 x i16> @select_sle_3xi16(<3 x i16> %a, <3 x i16> %b) { + %cmp = icmp sle <3 x i16> %a, %b + %sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b + ret <3 x i16> %sel +} Index: test/CodeGen/AMDGPU/amdgpu-codegenprepare.ll =================================================================== --- test/CodeGen/AMDGPU/amdgpu-codegenprepare.ll +++ /dev/null @@ -1,246 +0,0 @@ -; RUN: opt -S -mtriple=amdgcn-- -amdgpu-codegenprepare %s | FileCheck %s -; RUN: opt -S -amdgpu-codegenprepare %s | FileCheck -check-prefix=NOOP %s -; Make sure this doesn't crash with no triple - -; NOOP-LABEL: @noop_fdiv_fpmath( -; NOOP: %md.25ulp = fdiv float %a, %b, !fpmath !0 -define void @noop_fdiv_fpmath(float addrspace(1)* %out, float %a, float %b) #3 { - %md.25ulp = fdiv float %a, %b, !fpmath !0 - store volatile float %md.25ulp, float addrspace(1)* %out - ret void -} - -; CHECK-LABEL: @fdiv_fpmath( -; CHECK: 
%no.md = fdiv float %a, %b{{$}} -; CHECK: %md.half.ulp = fdiv float %a, %b, !fpmath !1 -; CHECK: %md.1ulp = fdiv float %a, %b, !fpmath !2 -; CHECK: %md.25ulp = call float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0 -; CHECK: %md.3ulp = call float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !3 -; CHECK: %fast.md.25ulp = call fast float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0 -; CHECK: arcp.md.25ulp = call arcp float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0 -define void @fdiv_fpmath(float addrspace(1)* %out, float %a, float %b) #1 { - %no.md = fdiv float %a, %b - store volatile float %no.md, float addrspace(1)* %out - - %md.half.ulp = fdiv float %a, %b, !fpmath !1 - store volatile float %md.half.ulp, float addrspace(1)* %out - - %md.1ulp = fdiv float %a, %b, !fpmath !2 - store volatile float %md.1ulp, float addrspace(1)* %out - - %md.25ulp = fdiv float %a, %b, !fpmath !0 - store volatile float %md.25ulp, float addrspace(1)* %out - - %md.3ulp = fdiv float %a, %b, !fpmath !3 - store volatile float %md.3ulp, float addrspace(1)* %out - - %fast.md.25ulp = fdiv fast float %a, %b, !fpmath !0 - store volatile float %fast.md.25ulp, float addrspace(1)* %out - - %arcp.md.25ulp = fdiv arcp float %a, %b, !fpmath !0 - store volatile float %arcp.md.25ulp, float addrspace(1)* %out - - ret void -} - -; CHECK-LABEL: @rcp_fdiv_fpmath( -; CHECK: %no.md = fdiv float 1.000000e+00, %x{{$}} -; CHECK: %md.25ulp = fdiv float 1.000000e+00, %x, !fpmath !0 -; CHECK: %md.half.ulp = fdiv float 1.000000e+00, %x, !fpmath !1 -; CHECK: %arcp.no.md = fdiv arcp float 1.000000e+00, %x{{$}} -; CHECK: %arcp.25ulp = fdiv arcp float 1.000000e+00, %x, !fpmath !0 -; CHECK: %fast.no.md = fdiv fast float 1.000000e+00, %x{{$}} -; CHECK: %fast.25ulp = fdiv fast float 1.000000e+00, %x, !fpmath !0 -define void @rcp_fdiv_fpmath(float addrspace(1)* %out, float %x) #1 { - %no.md = fdiv float 1.0, %x - store volatile float %no.md, float addrspace(1)* %out - - %md.25ulp = 
fdiv float 1.0, %x, !fpmath !0 - store volatile float %md.25ulp, float addrspace(1)* %out - - %md.half.ulp = fdiv float 1.0, %x, !fpmath !1 - store volatile float %md.half.ulp, float addrspace(1)* %out - - %arcp.no.md = fdiv arcp float 1.0, %x - store volatile float %arcp.no.md, float addrspace(1)* %out - - %arcp.25ulp = fdiv arcp float 1.0, %x, !fpmath !0 - store volatile float %arcp.25ulp, float addrspace(1)* %out - - %fast.no.md = fdiv fast float 1.0, %x - store volatile float %fast.no.md, float addrspace(1)* %out - - %fast.25ulp = fdiv fast float 1.0, %x, !fpmath !0 - store volatile float %fast.25ulp, float addrspace(1)* %out - - ret void -} - -; CHECK-LABEL: @fdiv_fpmath_vector( -; CHECK: %no.md = fdiv <2 x float> %a, %b{{$}} -; CHECK: %md.half.ulp = fdiv <2 x float> %a, %b, !fpmath !1 -; CHECK: %md.1ulp = fdiv <2 x float> %a, %b, !fpmath !2 - -; CHECK: %[[A0:[0-9]+]] = extractelement <2 x float> %a, i64 0 -; CHECK: %[[B0:[0-9]+]] = extractelement <2 x float> %b, i64 0 -; CHECK: %[[FDIV0:[0-9]+]] = call float @llvm.amdgcn.fdiv.fast(float %[[A0]], float %[[B0]]), !fpmath !0 -; CHECK: %[[INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[FDIV0]], i64 0 -; CHECK: %[[A1:[0-9]+]] = extractelement <2 x float> %a, i64 1 -; CHECK: %[[B1:[0-9]+]] = extractelement <2 x float> %b, i64 1 -; CHECK: %[[FDIV1:[0-9]+]] = call float @llvm.amdgcn.fdiv.fast(float %[[A1]], float %[[B1]]), !fpmath !0 -; CHECK: %md.25ulp = insertelement <2 x float> %[[INS0]], float %[[FDIV1]], i64 1 -define void @fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #1 { - %no.md = fdiv <2 x float> %a, %b - store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out - - %md.half.ulp = fdiv <2 x float> %a, %b, !fpmath !1 - store volatile <2 x float> %md.half.ulp, <2 x float> addrspace(1)* %out - - %md.1ulp = fdiv <2 x float> %a, %b, !fpmath !2 - store volatile <2 x float> %md.1ulp, <2 x float> addrspace(1)* %out - - %md.25ulp = fdiv <2 x float> %a, %b, 
!fpmath !0 - store volatile <2 x float> %md.25ulp, <2 x float> addrspace(1)* %out - - ret void -} - -; CHECK-LABEL: @rcp_fdiv_fpmath_vector( -; CHECK: %no.md = fdiv <2 x float> , %x{{$}} -; CHECK: %md.half.ulp = fdiv <2 x float> , %x, !fpmath !1 -; CHECK: %arcp.no.md = fdiv arcp <2 x float> , %x{{$}} -; CHECK: %fast.no.md = fdiv fast <2 x float> , %x{{$}} - -; CHECK: extractelement <2 x float> %x -; CHECK: fdiv arcp float 1.000000e+00, %{{[0-9]+}}, !fpmath !0 -; CHECK: extractelement <2 x float> %x -; CHECK: fdiv arcp float 1.000000e+00, %{{[0-9]+}}, !fpmath !0 -; CHECK: store volatile <2 x float> %arcp.25ulp - -; CHECK: fdiv fast float 1.000000e+00, %{{[0-9]+}}, !fpmath !0 -; CHECK: fdiv fast float 1.000000e+00, %{{[0-9]+}}, !fpmath !0 -; CHECK: store volatile <2 x float> %fast.25ulp, <2 x float> addrspace(1)* %out -define void @rcp_fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2 x float> %x) #1 { - %no.md = fdiv <2 x float> , %x - store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out - - %md.half.ulp = fdiv <2 x float> , %x, !fpmath !1 - store volatile <2 x float> %md.half.ulp, <2 x float> addrspace(1)* %out - - %arcp.no.md = fdiv arcp <2 x float> , %x - store volatile <2 x float> %arcp.no.md, <2 x float> addrspace(1)* %out - - %fast.no.md = fdiv fast <2 x float> , %x - store volatile <2 x float> %fast.no.md, <2 x float> addrspace(1)* %out - - %arcp.25ulp = fdiv arcp <2 x float> , %x, !fpmath !0 - store volatile <2 x float> %arcp.25ulp, <2 x float> addrspace(1)* %out - - %fast.25ulp = fdiv fast <2 x float> , %x, !fpmath !0 - store volatile <2 x float> %fast.25ulp, <2 x float> addrspace(1)* %out - - ret void -} - -; CHECK-LABEL: @rcp_fdiv_fpmath_vector_nonsplat( -; CHECK: %no.md = fdiv <2 x float> , %x -; CHECK: %arcp.no.md = fdiv arcp <2 x float> , %x -; CHECK: %fast.no.md = fdiv fast <2 x float> , %x{{$}} - -; CHECK: %[[X0:[0-9]+]] = extractelement <2 x float> %x, i64 0 -; CHECK: fdiv arcp float 1.000000e+00, %[[X0]], !fpmath !0 -; CHECK: 
%[[X1:[0-9]+]] = extractelement <2 x float> %x, i64 1 -; CHECK: fdiv arcp float 2.000000e+00, %[[X1]], !fpmath !0 -; CHECK: store volatile <2 x float> %arcp.25ulp - -; CHECK: %[[X0:[0-9]+]] = extractelement <2 x float> %x, i64 0 -; CHECK: fdiv fast float 1.000000e+00, %[[X0]], !fpmath !0 -; CHECK: %[[X1:[0-9]+]] = extractelement <2 x float> %x, i64 1 -; CHECK: fdiv fast float 2.000000e+00, %[[X1]], !fpmath !0 -; CHECK: store volatile <2 x float> %fast.25ulp -define void @rcp_fdiv_fpmath_vector_nonsplat(<2 x float> addrspace(1)* %out, <2 x float> %x) #1 { - %no.md = fdiv <2 x float> , %x - store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out - - %arcp.no.md = fdiv arcp <2 x float> , %x - store volatile <2 x float> %arcp.no.md, <2 x float> addrspace(1)* %out - - %fast.no.md = fdiv fast <2 x float> , %x - store volatile <2 x float> %fast.no.md, <2 x float> addrspace(1)* %out - - %arcp.25ulp = fdiv arcp <2 x float> , %x, !fpmath !0 - store volatile <2 x float> %arcp.25ulp, <2 x float> addrspace(1)* %out - - %fast.25ulp = fdiv fast <2 x float> , %x, !fpmath !0 - store volatile <2 x float> %fast.25ulp, <2 x float> addrspace(1)* %out - - ret void -} - -; FIXME: Should be able to get fdiv for 1.0 component -; CHECK-LABEL: @rcp_fdiv_fpmath_vector_partial_constant( -; CHECK: call arcp float @llvm.amdgcn.fdiv.fast(float %{{[0-9]+}}, float %{{[0-9]+}}), !fpmath !0 -; CHECK: call arcp float @llvm.amdgcn.fdiv.fast(float %{{[0-9]+}}, float %{{[0-9]+}}), !fpmath !0 -; CHECK: store volatile <2 x float> %arcp.25ulp - -; CHECK: call fast float @llvm.amdgcn.fdiv.fast(float %{{[0-9]+}}, float %{{[0-9]+}}), !fpmath !0 -; CHECK: call fast float @llvm.amdgcn.fdiv.fast(float %{{[0-9]+}}, float %{{[0-9]+}}), !fpmath !0 -; CHECK: store volatile <2 x float> %fast.25ulp -define void @rcp_fdiv_fpmath_vector_partial_constant(<2 x float> addrspace(1)* %out, <2 x float> %x, <2 x float> %y) #1 { - %x.insert = insertelement <2 x float> %x, float 1.0, i32 0 - - %arcp.25ulp = fdiv arcp <2 
x float> %x.insert, %y, !fpmath !0 - store volatile <2 x float> %arcp.25ulp, <2 x float> addrspace(1)* %out - - %fast.25ulp = fdiv fast <2 x float> %x.insert, %y, !fpmath !0 - store volatile <2 x float> %fast.25ulp, <2 x float> addrspace(1)* %out - - ret void -} - -; CHECK-LABEL: @fdiv_fpmath_f32_denormals( -; CHECK: %no.md = fdiv float %a, %b{{$}} -; CHECK: %md.half.ulp = fdiv float %a, %b, !fpmath !1 -; CHECK: %md.1ulp = fdiv float %a, %b, !fpmath !2 -; CHECK: %md.25ulp = fdiv float %a, %b, !fpmath !0 -; CHECK: %md.3ulp = fdiv float %a, %b, !fpmath !3 -; CHECK: call fast float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0 -; CHECK: call arcp float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0 -define void @fdiv_fpmath_f32_denormals(float addrspace(1)* %out, float %a, float %b) #2 { - %no.md = fdiv float %a, %b - store volatile float %no.md, float addrspace(1)* %out - - %md.half.ulp = fdiv float %a, %b, !fpmath !1 - store volatile float %md.half.ulp, float addrspace(1)* %out - - %md.1ulp = fdiv float %a, %b, !fpmath !2 - store volatile float %md.1ulp, float addrspace(1)* %out - - %md.25ulp = fdiv float %a, %b, !fpmath !0 - store volatile float %md.25ulp, float addrspace(1)* %out - - %md.3ulp = fdiv float %a, %b, !fpmath !3 - store volatile float %md.3ulp, float addrspace(1)* %out - - %fast.md.25ulp = fdiv fast float %a, %b, !fpmath !0 - store volatile float %fast.md.25ulp, float addrspace(1)* %out - - %arcp.md.25ulp = fdiv arcp float %a, %b, !fpmath !0 - store volatile float %arcp.md.25ulp, float addrspace(1)* %out - - ret void -} - -attributes #0 = { nounwind optnone noinline } -attributes #1 = { nounwind } -attributes #2 = { nounwind "target-features"="+fp32-denormals" } - -; CHECK: !0 = !{float 2.500000e+00} -; CHECK: !1 = !{float 5.000000e-01} -; CHECK: !2 = !{float 1.000000e+00} -; CHECK: !3 = !{float 3.000000e+00} - -!0 = !{float 2.500000e+00} -!1 = !{float 5.000000e-01} -!2 = !{float 1.000000e+00} -!3 = !{float 3.000000e+00} Index: 
test/CodeGen/AMDGPU/ctlz.ll =================================================================== --- test/CodeGen/AMDGPU/ctlz.ll +++ test/CodeGen/AMDGPU/ctlz.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s ; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s declare i7 @llvm.ctlz.i7(i7, i1) nounwind readnone @@ -17,13 +17,13 @@ declare i32 @llvm.r600.read.tidig.x() nounwind readnone ; FUNC-LABEL: {{^}}s_ctlz_i32: -; SI: s_load_dword [[VAL:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} -; SI-DAG: s_flbit_i32_b32 [[CTLZ:s[0-9]+]], [[VAL]] -; SI-DAG: v_cmp_eq_i32_e64 [[CMPZ:s\[[0-9]+:[0-9]+\]]], [[VAL]], 0{{$}} -; SI-DAG: v_mov_b32_e32 [[VCTLZ:v[0-9]+]], [[CTLZ]] -; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], [[VCTLZ]], 32, [[CMPZ]] -; SI: buffer_store_dword [[RESULT]] -; SI: s_endpgm +; GCN: s_load_dword [[VAL:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} +; GCN-DAG: s_flbit_i32_b32 [[CTLZ:s[0-9]+]], [[VAL]] +; GCN-DAG: v_cmp_eq_i32_e64 [[CMPZ:s\[[0-9]+:[0-9]+\]]], [[VAL]], 0{{$}} +; GCN-DAG: v_mov_b32_e32 [[VCTLZ:v[0-9]+]], [[CTLZ]] +; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], [[VCTLZ]], 32, [[CMPZ]] +; GCN: buffer_store_dword [[RESULT]] +; GCN: s_endpgm ; EG: FFBH_UINT ; EG: CNDE_INT @@ -34,12 +34,12 @@ } ; FUNC-LABEL: {{^}}v_ctlz_i32: -; SI: buffer_load_dword [[VAL:v[0-9]+]], -; SI-DAG: v_ffbh_u32_e32 [[CTLZ:v[0-9]+]], [[VAL]] -; SI-DAG: v_cmp_eq_i32_e32 vcc, 0, [[CTLZ]] -; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], [[CTLZ]], 32, vcc -; SI: buffer_store_dword 
[[RESULT]], -; SI: s_endpgm +; GCN: buffer_load_dword [[VAL:v[0-9]+]], +; GCN-DAG: v_ffbh_u32_e32 [[CTLZ:v[0-9]+]], [[VAL]] +; GCN-DAG: v_cmp_eq_i32_e32 vcc, 0, [[CTLZ]] +; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], [[CTLZ]], 32, vcc +; GCN: buffer_store_dword [[RESULT]], +; GCN: s_endpgm ; EG: FFBH_UINT ; EG: CNDE_INT @@ -51,11 +51,11 @@ } ; FUNC-LABEL: {{^}}v_ctlz_v2i32: -; SI: buffer_load_dwordx2 -; SI: v_ffbh_u32_e32 -; SI: v_ffbh_u32_e32 -; SI: buffer_store_dwordx2 -; SI: s_endpgm +; GCN: buffer_load_dwordx2 +; GCN: v_ffbh_u32_e32 +; GCN: v_ffbh_u32_e32 +; GCN: buffer_store_dwordx2 +; GCN: s_endpgm ; EG: FFBH_UINT ; EG: CNDE_INT @@ -69,13 +69,13 @@ } ; FUNC-LABEL: {{^}}v_ctlz_v4i32: -; SI: buffer_load_dwordx4 -; SI: v_ffbh_u32_e32 -; SI: v_ffbh_u32_e32 -; SI: v_ffbh_u32_e32 -; SI: v_ffbh_u32_e32 -; SI: buffer_store_dwordx4 -; SI: s_endpgm +; GCN: buffer_load_dwordx4 +; GCN: v_ffbh_u32_e32 +; GCN: v_ffbh_u32_e32 +; GCN: v_ffbh_u32_e32 +; GCN: v_ffbh_u32_e32 +; GCN: buffer_store_dwordx4 +; GCN: s_endpgm ; EG-DAG: FFBH_UINT @@ -97,12 +97,12 @@ } ; FUNC-LABEL: {{^}}v_ctlz_i8: -; SI: buffer_load_ubyte [[VAL:v[0-9]+]], -; SI-DAG: v_ffbh_u32_e32 [[FFBH:v[0-9]+]], [[VAL]] -; SI-DAG: v_cmp_eq_i32_e32 vcc, 0, [[CTLZ]] -; SI-DAG: v_cndmask_b32_e64 [[CORRECTED_FFBH:v[0-9]+]], [[FFBH]], 32, vcc -; SI: v_add_i32_e32 [[RESULT:v[0-9]+]], vcc, 0xffffffe8, [[CORRECTED_FFBH]] -; SI: buffer_store_byte [[RESULT]], +; GCN: buffer_load_ubyte [[VAL:v[0-9]+]], +; GCN-DAG: v_ffbh_u32_e32 [[FFBH:v[0-9]+]], [[VAL]] +; GCN-DAG: v_cmp_eq_i32_e32 vcc, 0, [[CTLZ]] +; GCN-DAG: v_cndmask_b32_e64 [[CORRECTED_FFBH:v[0-9]+]], [[FFBH]], 32, vcc +; GCN: v_add_i32_e32 [[RESULT:v[0-9]+]], vcc, 0xffffffe8, [[CORRECTED_FFBH]] +; GCN: buffer_store_byte [[RESULT]], define void @v_ctlz_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind { %val = load i8, i8 addrspace(1)* %valptr %ctlz = call i8 @llvm.ctlz.i8(i8 %val, i1 false) nounwind readnone @@ -111,16 +111,16 @@ } ; 
FUNC-LABEL: {{^}}s_ctlz_i64: -; SI: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} -; SI-DAG: v_cmp_eq_i32_e64 vcc, s[[HI]], 0{{$}} -; SI-DAG: s_flbit_i32_b32 [[FFBH_LO:s[0-9]+]], s[[LO]] -; SI-DAG: s_add_i32 [[ADD:s[0-9]+]], [[FFBH_LO]], 32 -; SI-DAG: s_flbit_i32_b32 [[FFBH_HI:s[0-9]+]], s[[HI]] -; SI-DAG: v_mov_b32_e32 [[VFFBH_LO:v[0-9]+]], [[ADD]] -; SI-DAG: v_mov_b32_e32 [[VFFBH_HI:v[0-9]+]], [[FFBH_HI]] -; SI-DAG: v_cndmask_b32_e32 v[[CTLZ:[0-9]+]], [[VFFBH_HI]], [[VFFBH_LO]] -; SI-DAG: v_mov_b32_e32 v[[CTLZ_HI:[0-9]+]], 0{{$}} -; SI: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[CTLZ]]:[[CTLZ_HI]]{{\]}} +; GCN: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} +; GCN-DAG: v_cmp_eq_i32_e64 vcc, s[[HI]], 0{{$}} +; GCN-DAG: s_flbit_i32_b32 [[FFBH_LO:s[0-9]+]], s[[LO]] +; GCN-DAG: s_add_i32 [[ADD:s[0-9]+]], [[FFBH_LO]], 32 +; GCN-DAG: s_flbit_i32_b32 [[FFBH_HI:s[0-9]+]], s[[HI]] +; GCN-DAG: v_mov_b32_e32 [[VFFBH_LO:v[0-9]+]], [[ADD]] +; GCN-DAG: v_mov_b32_e32 [[VFFBH_HI:v[0-9]+]], [[FFBH_HI]] +; GCN-DAG: v_cndmask_b32_e32 v[[CTLZ:[0-9]+]], [[VFFBH_HI]], [[VFFBH_LO]] +; GCN-DAG: v_mov_b32_e32 v[[CTLZ_HI:[0-9]+]], 0{{$}} +; GCN: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[CTLZ]]:[[CTLZ_HI]]{{\]}} define void @s_ctlz_i64(i64 addrspace(1)* noalias %out, i64 %val) nounwind { %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false) store i64 %ctlz, i64 addrspace(1)* %out @@ -136,17 +136,17 @@ } ; FUNC-LABEL: {{^}}v_ctlz_i64: -; SI-DAG: v_mov_b32_e32 v[[CTLZ_HI:[0-9]+]], 0{{$}} -; SI-DAG: {{buffer|flat}}_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}} -; SI-DAG: v_cmp_eq_i32_e64 [[CMPHI:s\[[0-9]+:[0-9]+\]]], 0, v[[HI]] -; SI-DAG: v_ffbh_u32_e32 [[FFBH_LO:v[0-9]+]], v[[LO]] -; SI-DAG: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, 32, [[FFBH_LO]] -; SI-DAG: v_ffbh_u32_e32 [[FFBH_HI:v[0-9]+]], v[[HI]] -; SI-DAG: v_cndmask_b32_e64 v[[CTLZ:[0-9]+]], [[FFBH_HI]], [[ADD]], [[CMPHI]] -; SI-DAG: 
v_or_b32_e32 [[OR:v[0-9]+]], v[[HI]], v[[LO]] -; SI-DAG: v_cmp_eq_i32_e32 vcc, 0, [[OR]] -; SI-DAG: v_cndmask_b32_e64 v[[CLTZ_LO:[0-9]+]], v[[CTLZ:[0-9]+]], 64, vcc -; SI: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[CLTZ_LO]]:[[CTLZ_HI]]{{\]}} +; GCN-DAG: v_mov_b32_e32 v[[CTLZ_HI:[0-9]+]], 0{{$}} +; GCN-DAG: {{buffer|flat}}_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}} +; GCN-DAG: v_cmp_eq_i32_e64 [[CMPHI:s\[[0-9]+:[0-9]+\]]], 0, v[[HI]] +; GCN-DAG: v_ffbh_u32_e32 [[FFBH_LO:v[0-9]+]], v[[LO]] +; GCN-DAG: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, 32, [[FFBH_LO]] +; GCN-DAG: v_ffbh_u32_e32 [[FFBH_HI:v[0-9]+]], v[[HI]] +; GCN-DAG: v_cndmask_b32_e64 v[[CTLZ:[0-9]+]], [[FFBH_HI]], [[ADD]], [[CMPHI]] +; GCN-DAG: v_or_b32_e32 [[OR:v[0-9]+]], v[[HI]], v[[LO]] +; GCN-DAG: v_cmp_eq_i32_e32 vcc, 0, [[OR]] +; GCN-DAG: v_cndmask_b32_e64 v[[CLTZ_LO:[0-9]+]], v[[CTLZ:[0-9]+]], 64, vcc +; GCN: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[CLTZ_LO]]:[[CTLZ_HI]]{{\]}} define void @v_ctlz_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { %tid = call i32 @llvm.r600.read.tidig.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid @@ -170,10 +170,10 @@ } ; FUNC-LABEL: {{^}}v_ctlz_i32_sel_eq_neg1: -; SI: buffer_load_dword [[VAL:v[0-9]+]], -; SI: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]] -; SI: buffer_store_dword [[RESULT]], -; SI: s_endpgm +; GCN: buffer_load_dword [[VAL:v[0-9]+]], +; GCN: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]] +; GCN: buffer_store_dword [[RESULT]], +; GCN: s_endpgm define void @v_ctlz_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { %val = load i32, i32 addrspace(1)* %valptr %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone @@ -184,10 +184,10 @@ } ; FUNC-LABEL: {{^}}v_ctlz_i32_sel_ne_neg1: -; SI: buffer_load_dword [[VAL:v[0-9]+]], -; SI: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]] -; SI: buffer_store_dword [[RESULT]], -; SI: s_endpgm +; GCN: buffer_load_dword 
[[VAL:v[0-9]+]], +; GCN: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]] +; GCN: buffer_store_dword [[RESULT]], +; GCN: s_endpgm define void @v_ctlz_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { %val = load i32, i32 addrspace(1)* %valptr %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone @@ -199,11 +199,11 @@ ; TODO: Should be able to eliminate select here as well. ; FUNC-LABEL: {{^}}v_ctlz_i32_sel_eq_bitwidth: -; SI: buffer_load_dword -; SI: v_ffbh_u32_e32 -; SI: v_cmp -; SI: v_cndmask -; SI: s_endpgm +; GCN: buffer_load_dword +; GCN: v_ffbh_u32_e32 +; GCN: v_cmp +; GCN: v_cndmask +; GCN: s_endpgm define void @v_ctlz_i32_sel_eq_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { %val = load i32, i32 addrspace(1)* %valptr %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone @@ -214,11 +214,11 @@ } ; FUNC-LABEL: {{^}}v_ctlz_i32_sel_ne_bitwidth: -; SI: buffer_load_dword -; SI: v_ffbh_u32_e32 -; SI: v_cmp -; SI: v_cndmask -; SI: s_endpgm +; GCN: buffer_load_dword +; GCN: v_ffbh_u32_e32 +; GCN: v_cmp +; GCN: v_cndmask +; GCN: s_endpgm define void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { %val = load i32, i32 addrspace(1)* %valptr %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone @@ -229,9 +229,9 @@ } ; FUNC-LABEL: {{^}}v_ctlz_i8_sel_eq_neg1: -; SI: buffer_load_ubyte [[VAL:v[0-9]+]], -; SI: v_ffbh_u32_e32 [[FFBH:v[0-9]+]], [[VAL]] -; SI: buffer_store_byte [[FFBH]], +; GCN: buffer_load_ubyte [[VAL:v[0-9]+]], +; GCN: v_ffbh_u32_e32 [[FFBH:v[0-9]+]], [[VAL]] +; GCN: buffer_store_byte [[FFBH]], define void @v_ctlz_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind { %val = load i8, i8 addrspace(1)* %valptr %ctlz = call i8 @llvm.ctlz.i8(i8 %val, i1 false) nounwind readnone @@ -242,9 +242,9 @@ } ; FUNC-LABEL: {{^}}v_ctlz_i16_sel_eq_neg1: -; SI: 
buffer_load_ushort [[VAL:v[0-9]+]], -; SI: v_ffbh_u32_e32 [[FFBH:v[0-9]+]], [[VAL]] -; SI: buffer_store_short [[FFBH]], +; VI: buffer_load_ushort [[VAL:v[0-9]+]], +; VI: v_ffbh_u32_e32 [[FFBH:v[0-9]+]], [[VAL]] +; VI: buffer_store_short [[FFBH]], define void @v_ctlz_i16_sel_eq_neg1(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %valptr) nounwind { %val = load i16, i16 addrspace(1)* %valptr %ctlz = call i16 @llvm.ctlz.i16(i16 %val, i1 false) nounwind readnone @@ -255,10 +255,10 @@ } ; FUNC-LABEL: {{^}}v_ctlz_i7_sel_eq_neg1: -; SI: buffer_load_ubyte [[VAL:v[0-9]+]], -; SI: v_ffbh_u32_e32 [[FFBH:v[0-9]+]], [[VAL]] -; SI: v_and_b32_e32 [[TRUNC:v[0-9]+]], 0x7f, [[FFBH]] -; SI: buffer_store_byte [[TRUNC]], +; GCN: buffer_load_ubyte [[VAL:v[0-9]+]], +; GCN: v_ffbh_u32_e32 [[FFBH:v[0-9]+]], [[VAL]] +; GCN: v_and_b32_e32 [[TRUNC:v[0-9]+]], 0x7f, [[FFBH]] +; GCN: buffer_store_byte [[TRUNC]], define void @v_ctlz_i7_sel_eq_neg1(i7 addrspace(1)* noalias %out, i7 addrspace(1)* noalias %valptr) nounwind { %val = load i7, i7 addrspace(1)* %valptr %ctlz = call i7 @llvm.ctlz.i7(i7 %val, i1 false) nounwind readnone Index: test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll @@ -0,0 +1,212 @@ +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s + +declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone +declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone + +; FUNC-LABEL: {{^}}test_umul24_i32: +; GCN: v_mul_u32_u24 +define void @test_umul24_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { +entry: + %0 = shl i32 %a, 8 + %a_24 = lshr i32 %0, 8 + %1 = shl i32 %b, 8 + %b_24 = lshr i32 %1, 8 + %2 = mul i32 %a_24, %b_24 + store i32 %2, i32 addrspace(1)* %out + 
ret void +} + +; FUNC-LABEL: {{^}}test_umul24_i16_sext: +; SI: s_mul_i32 [[SI_MUL:s[0-9]]], s{{[0-9]}}, s{{[0-9]}} +; SI: s_sext_i32_i16 s{{[0-9]}}, [[SI_MUL]] +; VI: v_mul_u32_u24_e{{(32|64)}} [[VI_MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}} +; VI: v_bfe_i32 v{{[0-9]}}, [[VI_MUL]], 0, 16 +define void @test_umul24_i16_sext(i32 addrspace(1)* %out, i16 %a, i16 %b) { +entry: + %mul = mul i16 %a, %b + %ext = sext i16 %mul to i32 + store i32 %ext, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_umul24_i16_vgpr_sext: +; GCN: v_mul_u32_u24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}} +; GCN: v_bfe_i32 v{{[0-9]}}, [[MUL]], 0, 16 +define void @test_umul24_i16_vgpr_sext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) { + %tid.x = call i32 @llvm.amdgcn.workitem.id.x() + %tid.y = call i32 @llvm.amdgcn.workitem.id.y() + %ptr_a = getelementptr i16, i16 addrspace(1)* %in, i32 %tid.x + %ptr_b = getelementptr i16, i16 addrspace(1)* %in, i32 %tid.y + %a = load i16, i16 addrspace(1)* %ptr_a + %b = load i16, i16 addrspace(1)* %ptr_b + %mul = mul i16 %a, %b + %val = sext i16 %mul to i32 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_umul24_i16: +; SI: s_mul_i32 +; SI: s_and_b32 +; SI: v_mov_b32_e32 +; VI: s_and_b32 +; VI: v_mul_u32_u24_e32 +; VI: v_and_b32_e32 +define void @test_umul24_i16(i32 addrspace(1)* %out, i16 %a, i16 %b) { +entry: + %mul = mul i16 %a, %b + %ext = zext i16 %mul to i32 + store i32 %ext, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_umul24_i16_vgpr: +; GCN: v_mul_u32_u24_e32 +; GCN: v_and_b32_e32 +define void @test_umul24_i16_vgpr(i32 addrspace(1)* %out, i16 addrspace(1)* %in) { + %tid.x = call i32 @llvm.amdgcn.workitem.id.x() + %tid.y = call i32 @llvm.amdgcn.workitem.id.y() + %ptr_a = getelementptr i16, i16 addrspace(1)* %in, i32 %tid.x + %ptr_b = getelementptr i16, i16 addrspace(1)* %in, i32 %tid.y + %a = load i16, i16 addrspace(1)* %ptr_a + %b = load i16, i16 addrspace(1)* %ptr_b + %mul = 
mul i16 %a, %b + %val = zext i16 %mul to i32 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_umul24_i8: +; GCN: v_mul_u32_u24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}} +; GCN: v_bfe_i32 v{{[0-9]}}, [[MUL]], 0, 8 +define void @test_umul24_i8(i32 addrspace(1)* %out, i8 %a, i8 %b) { +entry: + %mul = mul i8 %a, %b + %ext = sext i8 %mul to i32 + store i32 %ext, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_umulhi24_i32_i64: +; GCN-NOT: and +; GCN: v_mul_hi_u32_u24_e32 [[RESULT:v[0-9]+]], +; GCN-NEXT: buffer_store_dword [[RESULT]] +define void @test_umulhi24_i32_i64(i32 addrspace(1)* %out, i32 %a, i32 %b) { +entry: + %a.24 = and i32 %a, 16777215 + %b.24 = and i32 %b, 16777215 + %a.24.i64 = zext i32 %a.24 to i64 + %b.24.i64 = zext i32 %b.24 to i64 + %mul48 = mul i64 %a.24.i64, %b.24.i64 + %mul48.hi = lshr i64 %mul48, 32 + %mul24hi = trunc i64 %mul48.hi to i32 + store i32 %mul24hi, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_umulhi24: +; GCN-NOT: and +; GCN: v_mul_hi_u32_u24_e32 [[RESULT:v[0-9]+]], +; GCN-NEXT: buffer_store_dword [[RESULT]] +define void @test_umulhi24(i32 addrspace(1)* %out, i64 %a, i64 %b) { +entry: + %a.24 = and i64 %a, 16777215 + %b.24 = and i64 %b, 16777215 + %mul48 = mul i64 %a.24, %b.24 + %mul48.hi = lshr i64 %mul48, 32 + %mul24.hi = trunc i64 %mul48.hi to i32 + store i32 %mul24.hi, i32 addrspace(1)* %out + ret void +} + +; Multiply with 24-bit inputs and 64-bit output. +; FUNC-LABEL: {{^}}test_umul24_i64: +; GCN-NOT: and +; GCN-NOT: lshr +; GCN-DAG: v_mul_u32_u24_e32 +; GCN-DAG: v_mul_hi_u32_u24_e32 +; GCN: buffer_store_dwordx2 +define void @test_umul24_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { +entry: + %tmp0 = shl i64 %a, 40 + %a_24 = lshr i64 %tmp0, 40 + %tmp1 = shl i64 %b, 40 + %b_24 = lshr i64 %tmp1, 40 + %tmp2 = mul i64 %a_24, %b_24 + store i64 %tmp2, i64 addrspace(1)* %out + ret void +} + +; FIXME: Should be able to eliminate the and. 
+; FUNC-LABEL: {{^}}test_umul24_i64_square: +; GCN: s_load_dword [[A:s[0-9]+]] +; GCN: s_and_b32 [[TRUNC:s[0-9]+]], [[A]], 0xffffff{{$}} +; GCN-DAG: v_mul_hi_u32_u24_e64 v{{[0-9]+}}, [[TRUNC]], [[TRUNC]] +; GCN-DAG: v_mul_u32_u24_e64 v{{[0-9]+}}, [[TRUNC]], [[TRUNC]] +define void @test_umul24_i64_square(i64 addrspace(1)* %out, i64 %a) { +entry: + %tmp0 = shl i64 %a, 40 + %a.24 = lshr i64 %tmp0, 40 + %tmp2 = mul i64 %a.24, %a.24 + store i64 %tmp2, i64 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_umulhi16_i32: +; GCN: s_and_b32 +; GCN: s_and_b32 +; GCN: v_mul_u32_u24_e32 [[MUL24:v[0-9]+]] +; GCN: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, [[MUL24]] +define void @test_umulhi16_i32(i16 addrspace(1)* %out, i32 %a, i32 %b) { +entry: + %a.16 = and i32 %a, 65535 + %b.16 = and i32 %b, 65535 + %mul = mul i32 %a.16, %b.16 + %hi = lshr i32 %mul, 16 + %mulhi = trunc i32 %hi to i16 + store i16 %mulhi, i16 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_umul24_i33: +; GCN: s_load_dword s +; GCN: s_load_dword s +; GCN-NOT: and +; GCN-NOT: lshr +; GCN-DAG: v_mul_u32_u24_e32 v[[MUL_LO:[0-9]+]], +; GCN-DAG: v_mul_hi_u32_u24_e32 v[[MUL_HI:[0-9]+]], +; GCN-DAG: v_and_b32_e32 v[[HI:[0-9]+]], 1, v[[MUL_HI]] +; GCN: buffer_store_dwordx2 v{{\[}}[[MUL_LO]]:[[HI]]{{\]}} +define void @test_umul24_i33(i64 addrspace(1)* %out, i33 %a, i33 %b) { +entry: + %tmp0 = shl i33 %a, 9 + %a_24 = lshr i33 %tmp0, 9 + %tmp1 = shl i33 %b, 9 + %b_24 = lshr i33 %tmp1, 9 + %tmp2 = mul i33 %a_24, %b_24 + %ext = zext i33 %tmp2 to i64 + store i64 %ext, i64 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_umulhi24_i33: +; GCN: s_load_dword s +; GCN: s_load_dword s +; GCN-NOT: and +; GCN-NOT: lshr +; GCN: v_mul_hi_u32_u24_e32 v[[MUL_HI:[0-9]+]], +; GCN-NEXT: v_and_b32_e32 v[[HI:[0-9]+]], 1, v[[MUL_HI]] +; GCN-NEXT: buffer_store_dword v[[HI]] +define void @test_umulhi24_i33(i32 addrspace(1)* %out, i33 %a, i33 %b) { +entry: + %tmp0 = shl i33 %a, 9 + %a_24 = lshr i33 %tmp0, 9 + %tmp1 = shl 
i33 %b, 9 + %b_24 = lshr i33 %tmp1, 9 + %tmp2 = mul i33 %a_24, %b_24 + %hi = lshr i33 %tmp2, 32 + %trunc = trunc i33 %hi to i32 + store i32 %trunc, i32 addrspace(1)* %out + ret void +} Index: test/CodeGen/AMDGPU/mul_uint24-r600.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/mul_uint24-r600.ll @@ -0,0 +1,83 @@ +; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}test_umul24_i32: +; EG: MUL_UINT24 {{[* ]*}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, KC0[2].W +define void @test_umul24_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { +entry: + %0 = shl i32 %a, 8 + %a_24 = lshr i32 %0, 8 + %1 = shl i32 %b, 8 + %b_24 = lshr i32 %1, 8 + %2 = mul i32 %a_24, %b_24 + store i32 %2, i32 addrspace(1)* %out + ret void +} + +; The result must be sign-extended. +; FUNC-LABEL: {{^}}test_umul24_i16_sext: +; EG: MUL_UINT24 {{[* ]*}}T{{[0-9]}}.[[MUL_CHAN:[XYZW]]] +; EG: BFE_INT {{[* ]*}}T{{[0-9]}}.{{[XYZW]}}, PV.[[MUL_CHAN]], 0.0, literal.x +; EG: 16 +define void @test_umul24_i16_sext(i32 addrspace(1)* %out, i16 %a, i16 %b) { +entry: + %mul = mul i16 %a, %b + %ext = sext i16 %mul to i32 + store i32 %ext, i32 addrspace(1)* %out + ret void +} + +; The result must be sign-extended. 
+; FUNC-LABEL: {{^}}test_umul24_i8: +; EG: MUL_UINT24 {{[* ]*}}T{{[0-9]}}.[[MUL_CHAN:[XYZW]]] +; EG: BFE_INT {{[* ]*}}T{{[0-9]}}.{{[XYZW]}}, PV.[[MUL_CHAN]], 0.0, literal.x +define void @test_umul24_i8(i32 addrspace(1)* %out, i8 %a, i8 %b) { +entry: + %mul = mul i8 %a, %b + %ext = sext i8 %mul to i32 + store i32 %ext, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_umulhi24_i32_i64: +; EG: MULHI_UINT24 {{[* ]*}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, KC0[2].W +define void @test_umulhi24_i32_i64(i32 addrspace(1)* %out, i32 %a, i32 %b) { +entry: + %a.24 = and i32 %a, 16777215 + %b.24 = and i32 %b, 16777215 + %a.24.i64 = zext i32 %a.24 to i64 + %b.24.i64 = zext i32 %b.24 to i64 + %mul48 = mul i64 %a.24.i64, %b.24.i64 + %mul48.hi = lshr i64 %mul48, 32 + %mul24hi = trunc i64 %mul48.hi to i32 + store i32 %mul24hi, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}test_umulhi24: +; EG: MULHI_UINT24 {{[* ]*}}T{{[0-9]\.[XYZW]}}, KC0[2].W, KC0[3].Y +define void @test_umulhi24(i32 addrspace(1)* %out, i64 %a, i64 %b) { +entry: + %a.24 = and i64 %a, 16777215 + %b.24 = and i64 %b, 16777215 + %mul48 = mul i64 %a.24, %b.24 + %mul48.hi = lshr i64 %mul48, 32 + %mul24.hi = trunc i64 %mul48.hi to i32 + store i32 %mul24.hi, i32 addrspace(1)* %out + ret void +} + +; Multiply with 24-bit inputs and 64-bit output. 
+; FUNC-LABEL: {{^}}test_umul24_i64: +; EG: MUL_UINT24 +; EG: MULHI +define void @test_umul24_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { +entry: + %tmp0 = shl i64 %a, 40 + %a_24 = lshr i64 %tmp0, 40 + %tmp1 = shl i64 %b, 40 + %b_24 = lshr i64 %tmp1, 40 + %tmp2 = mul i64 %a_24, %b_24 + store i64 %tmp2, i64 addrspace(1)* %out + ret void +} Index: test/CodeGen/AMDGPU/mul_uint24.ll =================================================================== --- test/CodeGen/AMDGPU/mul_uint24.ll +++ /dev/null @@ -1,197 +0,0 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -; FUNC-LABEL: {{^}}test_umul24_i32: -; EG: MUL_UINT24 {{[* ]*}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, KC0[2].W -; SI: v_mul_u32_u24 -define void @test_umul24_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { -entry: - %0 = shl i32 %a, 8 - %a_24 = lshr i32 %0, 8 - %1 = shl i32 %b, 8 - %b_24 = lshr i32 %1, 8 - %2 = mul i32 %a_24, %b_24 - store i32 %2, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}test_umul24_i16_sext: -; EG: MUL_UINT24 {{[* ]*}}T{{[0-9]}}.[[MUL_CHAN:[XYZW]]] -; The result must be sign-extended -; EG: BFE_INT {{[* ]*}}T{{[0-9]}}.{{[XYZW]}}, PV.[[MUL_CHAN]], 0.0, literal.x -; EG: 16 - -; SI: v_mul_u32_u24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}} -; SI: v_bfe_i32 v{{[0-9]}}, [[MUL]], 0, 16 -define void @test_umul24_i16_sext(i32 addrspace(1)* %out, i16 %a, i16 %b) { -entry: - %mul = mul i16 %a, %b - %ext = sext i16 %mul to i32 - store i32 %ext, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}test_umul24_i16: -; SI: s_and_b32 -; SI: v_mul_u32_u24_e32 -; SI: v_and_b32_e32 -define void @test_umul24_i16(i32 addrspace(1)*
%out, i16 %a, i16 %b) { -entry: - %mul = mul i16 %a, %b - %ext = zext i16 %mul to i32 - store i32 %ext, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}test_umul24_i8: -; EG: MUL_UINT24 {{[* ]*}}T{{[0-9]}}.[[MUL_CHAN:[XYZW]]] -; The result must be sign-extended -; EG: BFE_INT {{[* ]*}}T{{[0-9]}}.{{[XYZW]}}, PV.[[MUL_CHAN]], 0.0, literal.x -; SI: v_mul_u32_u24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}} -; SI: v_bfe_i32 v{{[0-9]}}, [[MUL]], 0, 8 - -define void @test_umul24_i8(i32 addrspace(1)* %out, i8 %a, i8 %b) { -entry: - %mul = mul i8 %a, %b - %ext = sext i8 %mul to i32 - store i32 %ext, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}test_umulhi24_i32_i64: -; SI-NOT: and -; SI: v_mul_hi_u32_u24_e32 [[RESULT:v[0-9]+]], -; SI-NEXT: buffer_store_dword [[RESULT]] - -; EG: MULHI_UINT24 {{[* ]*}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, KC0[2].W -define void @test_umulhi24_i32_i64(i32 addrspace(1)* %out, i32 %a, i32 %b) { -entry: - %a.24 = and i32 %a, 16777215 - %b.24 = and i32 %b, 16777215 - %a.24.i64 = zext i32 %a.24 to i64 - %b.24.i64 = zext i32 %b.24 to i64 - %mul48 = mul i64 %a.24.i64, %b.24.i64 - %mul48.hi = lshr i64 %mul48, 32 - %mul24hi = trunc i64 %mul48.hi to i32 - store i32 %mul24hi, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}test_umulhi24: -; SI-NOT: and -; SI: v_mul_hi_u32_u24_e32 [[RESULT:v[0-9]+]], -; SI-NEXT: buffer_store_dword [[RESULT]] - -; EG: MULHI_UINT24 {{[* ]*}}T{{[0-9]\.[XYZW]}}, KC0[2].W, KC0[3].Y -define void @test_umulhi24(i32 addrspace(1)* %out, i64 %a, i64 %b) { -entry: - %a.24 = and i64 %a, 16777215 - %b.24 = and i64 %b, 16777215 - %mul48 = mul i64 %a.24, %b.24 - %mul48.hi = lshr i64 %mul48, 32 - %mul24.hi = trunc i64 %mul48.hi to i32 - store i32 %mul24.hi, i32 addrspace(1)* %out - ret void -} - -; Multiply with 24-bit inputs and 64-bit output -; FUNC-LABEL: {{^}}test_umul24_i64: -; EG; MUL_UINT24 -; EG: MULHI - -; SI-NOT: and -; SI-NOT: lshr - -; SI-DAG: v_mul_u32_u24_e32 -; SI-DAG: 
v_mul_hi_u32_u24_e32 - -; SI: buffer_store_dwordx2 -define void @test_umul24_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { -entry: - %tmp0 = shl i64 %a, 40 - %a_24 = lshr i64 %tmp0, 40 - %tmp1 = shl i64 %b, 40 - %b_24 = lshr i64 %tmp1, 40 - %tmp2 = mul i64 %a_24, %b_24 - store i64 %tmp2, i64 addrspace(1)* %out - ret void -} - -; FIXME: Should be able to eliminate the and -; FUNC-LABEL: {{^}}test_umul24_i64_square: -; SI: s_load_dword [[A:s[0-9]+]] -; SI: s_and_b32 [[TRUNC:s[0-9]+]], [[A]], 0xffffff{{$}} -; SI-DAG: v_mul_hi_u32_u24_e64 v{{[0-9]+}}, [[TRUNC]], [[TRUNC]] -; SI-DAG: v_mul_u32_u24_e64 v{{[0-9]+}}, [[TRUNC]], [[TRUNC]] -define void @test_umul24_i64_square(i64 addrspace(1)* %out, i64 %a) { -entry: - %tmp0 = shl i64 %a, 40 - %a.24 = lshr i64 %tmp0, 40 - %tmp2 = mul i64 %a.24, %a.24 - store i64 %tmp2, i64 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}test_umulhi16_i32: -; SI: s_and_b32 -; SI: s_and_b32 -; SI: v_mul_u32_u24_e32 [[MUL24:v[0-9]+]] -; SI: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, [[MUL24]] -define void @test_umulhi16_i32(i16 addrspace(1)* %out, i32 %a, i32 %b) { -entry: - %a.16 = and i32 %a, 65535 - %b.16 = and i32 %b, 65535 - %mul = mul i32 %a.16, %b.16 - %hi = lshr i32 %mul, 16 - %mulhi = trunc i32 %hi to i16 - store i16 %mulhi, i16 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}test_umul24_i33: -; SI: s_load_dword s -; SI: s_load_dword s - -; SI-NOT: and -; SI-NOT: lshr - -; SI-DAG: v_mul_u32_u24_e32 v[[MUL_LO:[0-9]+]], -; SI-DAG: v_mul_hi_u32_u24_e32 v[[MUL_HI:[0-9]+]], -; SI-DAG: v_and_b32_e32 v[[HI:[0-9]+]], 1, v[[MUL_HI]] -; SI: buffer_store_dwordx2 v{{\[}}[[MUL_LO]]:[[HI]]{{\]}} -define void @test_umul24_i33(i64 addrspace(1)* %out, i33 %a, i33 %b) { -entry: - %tmp0 = shl i33 %a, 9 - %a_24 = lshr i33 %tmp0, 9 - %tmp1 = shl i33 %b, 9 - %b_24 = lshr i33 %tmp1, 9 - %tmp2 = mul i33 %a_24, %b_24 - %ext = zext i33 %tmp2 to i64 - store i64 %ext, i64 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}test_umulhi24_i33: -; SI: 
s_load_dword s -; SI: s_load_dword s - -; SI-NOT: and -; SI-NOT: lshr - -; SI: v_mul_hi_u32_u24_e32 v[[MUL_HI:[0-9]+]], -; SI-NEXT: v_and_b32_e32 v[[HI:[0-9]+]], 1, v[[MUL_HI]] -; SI-NEXT: buffer_store_dword v[[HI]] -define void @test_umulhi24_i33(i32 addrspace(1)* %out, i33 %a, i33 %b) { -entry: - %tmp0 = shl i33 %a, 9 - %a_24 = lshr i33 %tmp0, 9 - %tmp1 = shl i33 %b, 9 - %b_24 = lshr i33 %tmp1, 9 - %tmp2 = mul i33 %a_24, %b_24 - %hi = lshr i33 %tmp2, 32 - %trunc = trunc i33 %hi to i32 - store i32 %trunc, i32 addrspace(1)* %out - ret void -} Index: test/CodeGen/AMDGPU/sad.ll =================================================================== --- test/CodeGen/AMDGPU/sad.ll +++ test/CodeGen/AMDGPU/sad.ll @@ -1,6 +1,7 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s -; GCN-LABEL: {{^}}v_sad_u32_pat1: +; FUNC-LABEL: {{^}}v_sad_u32_pat1: ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define void @v_sad_u32_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { %icmp0 = icmp ugt i32 %a, %b @@ -16,7 +17,7 @@ ret void } -; GCN-LABEL: {{^}}v_sad_u32_constant_pat1: +; FUNC-LABEL: {{^}}v_sad_u32_constant_pat1: ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, 20 define void @v_sad_u32_constant_pat1(i32 addrspace(1)* %out, i32 %a) { %icmp0 = icmp ugt i32 %a, 90 @@ -32,7 +33,7 @@ ret void } -; GCN-LABEL: {{^}}v_sad_u32_pat2: +; FUNC-LABEL: {{^}}v_sad_u32_pat2: ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define void @v_sad_u32_pat2(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { %icmp0 = icmp ugt i32 %a, %b @@ -46,7 +47,7 @@ ret void } -; GCN-LABEL: {{^}}v_sad_u32_multi_use_sub_pat1: +; FUNC-LABEL: {{^}}v_sad_u32_multi_use_sub_pat1: ; 
GCN: s_max_u32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} ; GCN: s_min_u32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} ; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} @@ -66,7 +67,7 @@ ret void } -; GCN-LABEL: {{^}}v_sad_u32_multi_use_add_pat1: +; FUNC-LABEL: {{^}}v_sad_u32_multi_use_add_pat1: ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define void @v_sad_u32_multi_use_add_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { %icmp0 = icmp ugt i32 %a, %b @@ -82,7 +83,7 @@ ret void } -; GCN-LABEL: {{^}}v_sad_u32_multi_use_max_pat1: +; FUNC-LABEL: {{^}}v_sad_u32_multi_use_max_pat1: ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define void @v_sad_u32_multi_use_max_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { %icmp0 = icmp ugt i32 %a, %b @@ -99,7 +100,7 @@ ret void } -; GCN-LABEL: {{^}}v_sad_u32_multi_use_min_pat1: +; FUNC-LABEL: {{^}}v_sad_u32_multi_use_min_pat1: ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define void @v_sad_u32_multi_use_min_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { %icmp0 = icmp ugt i32 %a, %b @@ -117,7 +118,7 @@ ret void } -; GCN-LABEL: {{^}}v_sad_u32_multi_use_sub_pat2: +; FUNC-LABEL: {{^}}v_sad_u32_multi_use_sub_pat2: ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define void @v_sad_u32_multi_use_sub_pat2(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { %icmp0 = icmp ugt i32 %a, %b @@ -132,7 +133,7 @@ ret void } -; GCN-LABEL: {{^}}v_sad_u32_multi_use_select_pat2: +; FUNC-LABEL: {{^}}v_sad_u32_multi_use_select_pat2: ; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} ; GCN: v_cmp_gt_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} @@ -149,7 +150,7 @@ ret void } -; GCN-LABEL: {{^}}v_sad_u32_vector_pat1: +; FUNC-LABEL: {{^}}v_sad_u32_vector_pat1: ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; GCN: 
v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} @@ -168,7 +169,7 @@ ret void } -; GCN-LABEL: {{^}}v_sad_u32_vector_pat2: +; FUNC-LABEL: {{^}}v_sad_u32_vector_pat2: ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} @@ -185,7 +186,7 @@ ret void } -; GCN-LABEL: {{^}}v_sad_u32_i16_pat1: +; FUNC-LABEL: {{^}}v_sad_u32_i16_pat1: ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define void @v_sad_u32_i16_pat1(i16 addrspace(1)* %out, i16 %a, i16 %b, i16 %c) { @@ -202,8 +203,8 @@ ret void } -; GCN-LABEL: {{^}}v_sad_u32_i16_pat2: -; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; FUNC-LABEL: {{^}}v_sad_u32_i16_pat2: +; VI: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define void @v_sad_u32_i16_pat2(i16 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b, i16 zeroext %c) { %icmp0 = icmp ugt i16 %a, %b %sub0 = sub i16 %a, %b @@ -216,7 +217,22 @@ ret void } -; GCN-LABEL: {{^}}v_sad_u32_i8_pat1: +; FUNC-LABEL: {{^}}v_sad_u32_2xi16_pat2: +; GCN: v_sad_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_sad_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +define void @v_sad_u32_2xi16_pat2(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b, <2 x i16> %c) { + %icmp0 = icmp ugt <2 x i16> %a, %b + %sub0 = sub <2 x i16> %a, %b + %sub1 = sub <2 x i16> %b, %a + %ret0 = select <2 x i1> %icmp0, <2 x i16> %sub0, <2 x i16> %sub1 + + %ret = add <2 x i16> %ret0, %c + + store <2 x i16> %ret, <2 x i16> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}v_sad_u32_i8_pat1: ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define void @v_sad_u32_i8_pat1(i8 addrspace(1)* %out, i8 %a, i8 %b, i8 %c) { %icmp0 = icmp ugt i8 %a, %b @@ -232,7 +248,7 @@ ret void } -; GCN-LABEL: {{^}}v_sad_u32_i8_pat2: +; FUNC-LABEL: {{^}}v_sad_u32_i8_pat2: ; 
GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define void @v_sad_u32_i8_pat2(i8 addrspace(1)* %out, i8 zeroext %a, i8 zeroext %b, i8 zeroext %c) { %icmp0 = icmp ugt i8 %a, %b @@ -246,7 +262,7 @@ ret void } -; GCN-LABEL: {{^}}v_sad_u32_mismatched_operands_pat1: +; FUNC-LABEL: {{^}}v_sad_u32_mismatched_operands_pat1: ; GCN: v_cmp_le_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; GCN: s_max_u32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} ; GCN: v_sub_i32_e32 v{{[0-9]+}}, vcc, s{{[0-9]+}}, v{{[0-9]+}} @@ -265,7 +281,7 @@ ret void } -; GCN-LABEL: {{^}}v_sad_u32_mismatched_operands_pat2: +; FUNC-LABEL: {{^}}v_sad_u32_mismatched_operands_pat2: ; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} ; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} ; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc, s{{[0-9]+}}, v{{[0-9]+}}