Index: lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
===================================================================
--- lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -197,6 +197,10 @@
     return isRegKind() || isInlinableImm(type);
   }
 
+  bool isRegOrImmWithInt16InputMods() const {
+    return isRegOrImmWithInputMods(MVT::i16);
+  }
+
   bool isRegOrImmWithInt32InputMods() const {
     return isRegOrImmWithInputMods(MVT::i32);
   }
@@ -205,6 +209,10 @@
     return isRegOrImmWithInputMods(MVT::i64);
   }
 
+  bool isRegOrImmWithFP16InputMods() const {
+    return isRegOrImmWithInputMods(MVT::f16);
+  }
+
   bool isRegOrImmWithFP32InputMods() const {
     return isRegOrImmWithInputMods(MVT::f32);
   }
@@ -256,6 +264,10 @@
 
   bool isRegClass(unsigned RCID) const;
 
+  bool isSCSrcB16() const {
+    return isRegClass(AMDGPU::SReg_32RegClassID) || isInlinableImm(MVT::i16);
+  }
+
   bool isSCSrcB32() const {
     return isRegClass(AMDGPU::SReg_32RegClassID) || isInlinableImm(MVT::i32);
   }
@@ -264,6 +276,10 @@
     return isRegClass(AMDGPU::SReg_64RegClassID) || isInlinableImm(MVT::i64);
   }
 
+  bool isSCSrcF16() const {
+    return isRegClass(AMDGPU::SReg_32RegClassID) || isInlinableImm(MVT::f16);
+  }
+
   bool isSCSrcF32() const {
     return isRegClass(AMDGPU::SReg_32RegClassID) || isInlinableImm(MVT::f32);
   }
@@ -276,6 +292,10 @@
     return isSCSrcB32() || isLiteralImm(MVT::i32) || isExpr();
   }
 
+  bool isSSrcB16() const {
+    return isSCSrcB16() || isLiteralImm(MVT::i16);
+  }
+
   bool isSSrcB64() const {
     // TODO: Find out how SALU supports extension of 32-bit literals to 64 bits.
     // See isVSrc64().
@@ -290,6 +310,10 @@
     return isSCSrcB64() || isLiteralImm(MVT::f64);
   }
 
+  bool isSSrcF16() const {
+    return isSCSrcB16() || isLiteralImm(MVT::f16);
+  }
+
   bool isVCSrcB32() const {
     return isRegClass(AMDGPU::VS_32RegClassID) || isInlinableImm(MVT::i32);
   }
@@ -298,6 +322,10 @@
     return isRegClass(AMDGPU::VS_64RegClassID) || isInlinableImm(MVT::i64);
   }
 
+  bool isVCSrcB16() const {
+    return isRegClass(AMDGPU::VS_32RegClassID) || isInlinableImm(MVT::i16);
+  }
+
   bool isVCSrcF32() const {
     return isRegClass(AMDGPU::VS_32RegClassID) || isInlinableImm(MVT::f32);
   }
@@ -306,6 +334,10 @@
     return isRegClass(AMDGPU::VS_64RegClassID) || isInlinableImm(MVT::f64);
   }
 
+  bool isVCSrcF16() const {
+    return isRegClass(AMDGPU::VS_32RegClassID) || isInlinableImm(MVT::f16);
+  }
+
   bool isVSrcB32() const {
     return isVCSrcF32() || isLiteralImm(MVT::i32);
   }
@@ -314,6 +346,10 @@
     return isVCSrcF64() || isLiteralImm(MVT::i64);
   }
 
+  bool isVSrcB16() const {
+    return isVCSrcF16() || isLiteralImm(MVT::i16);
+  }
+
   bool isVSrcF32() const {
     return isVCSrcF32() || isLiteralImm(MVT::f32);
   }
@@ -322,10 +358,18 @@
     return isVCSrcF64() || isLiteralImm(MVT::f64);
   }
 
+  bool isVSrcF16() const {
+    return isVCSrcF16() || isLiteralImm(MVT::f16);
+  }
+
   bool isKImmFP32() const {
     return isLiteralImm(MVT::f32);
   }
 
+  bool isKImmFP16() const {
+    return isLiteralImm(MVT::f16);
+  }
+
   bool isMem() const override {
     return false;
   }
@@ -414,7 +458,16 @@
 
   void addLiteralImmOperand(MCInst &Inst, int64_t Val) const;
 
-  void addKImmFP32Operands(MCInst &Inst, unsigned N) const;
+  template <unsigned Bitwidth>
+  void addKImmFPOperands(MCInst &Inst, unsigned N) const;
+
+  void addKImmFP16Operands(MCInst &Inst, unsigned N) const {
+    addKImmFPOperands<16>(Inst, N);
+  }
+
+  void addKImmFP32Operands(MCInst &Inst, unsigned N) const {
+    addKImmFPOperands<32>(Inst, N);
+  }
 
   void addRegOperands(MCInst &Inst, unsigned N) const;
 
@@ -783,19 +836,23 @@
 };
 
 // May be called with integer type with equivalent bitwidth.
-static const fltSemantics *getFltSemantics(MVT VT) {
-  switch (VT.getSizeInBits()) {
-  case 32:
+static const fltSemantics *getFltSemantics(unsigned Size) {
+  switch (Size) { // Size is the operand size in bytes, not bits.
+  case 4:
     return &APFloat::IEEEsingle;
-  case 64:
+  case 8:
     return &APFloat::IEEEdouble;
-  case 16:
+  case 2:
     return &APFloat::IEEEhalf;
   default:
     llvm_unreachable("unsupported fp type");
   }
 }
 
+static const fltSemantics *getFltSemantics(MVT VT) {
+  return getFltSemantics(VT.getSizeInBits() / 8);
+}
+
 }
 
 //===----------------------------------------------------------------------===//
@@ -854,6 +911,12 @@
                                         AsmParser->hasInv2PiInlineImm());
   }
 
+  if (type.getScalarSizeInBits() == 16) {
+    return AMDGPU::isInlinableLiteral16(
+      static_cast<int16_t>(Literal.getLoBits(16).getSExtValue()),
+      AsmParser->hasInv2PiInlineImm());
+  }
+
   return AMDGPU::isInlinableLiteral32(
     static_cast<int32_t>(Literal.getLoBits(32).getZExtValue()),
     AsmParser->hasInv2PiInlineImm());
@@ -868,9 +931,13 @@
   if (!Imm.IsFPImm) {
     // We got int literal token.
 
+    unsigned Size = type.getSizeInBits();
+    if (Size == 64)
+      Size = 32;
+
     // FIXME: 64-bit operands can zero extend, sign extend, or pad zeroes for FP
     // types.
-    return isUInt<32>(Imm.Val) || isInt<32>(Imm.Val);
+    return isUIntN(Size, Imm.Val) || isIntN(Size, Imm.Val);
   }
 
   // We got fp literal token
@@ -906,7 +973,8 @@
     }
   }
 
-  if (AMDGPU::isSISrcOperand(AsmParser->getMII()->get(Inst.getOpcode()), Inst.getNumOperands())) {
+  if (AMDGPU::isSISrcOperand(AsmParser->getMII()->get(Inst.getOpcode()),
+                             Inst.getNumOperands())) {
     addLiteralImmOperand(Inst, Val);
   } else {
     Inst.addOperand(MCOperand::createImm(Val));
@@ -919,69 +987,112 @@
   // Check that this operand accepts literals
   assert(AMDGPU::isSISrcOperand(InstDesc, OpNum));
 
-  APInt Literal(64, Val);
-  auto OpSize = AMDGPU::getRegOperandSize(AsmParser->getMRI(), InstDesc, OpNum); // expected operand size
+  auto OpSize = AMDGPU::getOperandSize(InstDesc, OpNum); // expected operand size
 
   if (Imm.IsFPImm) { // We got fp literal token
-    if (OpSize == 8) { // Expected 64-bit operand
-      // Check if literal is inlinable
+    APInt Literal(64, Val);
+
+    switch (OpSize) {
+    case 8: {
       if (AMDGPU::isInlinableLiteral64(Literal.getZExtValue(),
                                        AsmParser->hasInv2PiInlineImm())) {
         Inst.addOperand(MCOperand::createImm(Literal.getZExtValue()));
-      } else if (AMDGPU::isSISrcFPOperand(InstDesc, OpNum)) { // Expected 64-bit fp operand
+        return;
+      }
+
+      // Non-inlineable
+      if (AMDGPU::isSISrcFPOperand(InstDesc, OpNum)) { // Expected 64-bit fp operand
         // For fp operands we check if low 32 bits are zeros
         if (Literal.getLoBits(32) != 0) {
           const_cast<AMDGPUAsmParser *>(AsmParser)->Warning(Inst.getLoc(),
-                            "Can't encode literal as exact 64-bit"
-                            " floating-point operand. Low 32-bits will be"
-                            " set to zero");
+          "Can't encode literal as exact 64-bit floating-point operand. "
+          "Low 32-bits will be set to zero");
         }
+
         Inst.addOperand(MCOperand::createImm(Literal.lshr(32).getZExtValue()));
-      } else {
-        // We don't allow fp literals in 64-bit integer instructions. It is
-        // unclear how we should encode them. This case should be checked earlier
-        // in predicate methods (isLiteralImm())
-        llvm_unreachable("fp literal in 64-bit integer instruction.");
+        return;
       }
-    } else { // Expected 32-bit operand
+
+      // We don't allow fp literals in 64-bit integer instructions. It is
+      // unclear how we should encode them. This case should be checked earlier
+      // in predicate methods (isLiteralImm())
+      llvm_unreachable("fp literal in 64-bit integer instruction.");
+    }
+    case 4:
+    case 2: {
       bool lost;
       APFloat FPLiteral(APFloat::IEEEdouble, Literal);
       // Convert literal to single precision
-      FPLiteral.convert(APFloat::IEEEsingle, APFloat::rmNearestTiesToEven, &lost);
+      FPLiteral.convert(*getFltSemantics(OpSize),
+                        APFloat::rmNearestTiesToEven, &lost);
       // We allow precision lost but not overflow or underflow. This should be
       // checked earlier in isLiteralImm()
       Inst.addOperand(MCOperand::createImm(FPLiteral.bitcastToAPInt().getZExtValue()));
+      return;
     }
-  } else { // We got int literal token
-    if (OpSize == 8) { // Expected 64-bit operand
-      auto LiteralVal = Literal.getZExtValue();
-      if (AMDGPU::isInlinableLiteral64(LiteralVal,
-                                       AsmParser->hasInv2PiInlineImm())) {
-        Inst.addOperand(MCOperand::createImm(LiteralVal));
-        return;
-      }
-    } else { // Expected 32-bit operand
-      auto LiteralVal = static_cast<int32_t>(Literal.getLoBits(32).getZExtValue());
-      if (AMDGPU::isInlinableLiteral32(LiteralVal,
-                                       AsmParser->hasInv2PiInlineImm())) {
-        Inst.addOperand(MCOperand::createImm(LiteralVal));
-        return;
-      }
+    default:
+      llvm_unreachable("invalid operand size");
+    }
+
+    return;
+  }
+
+  // We got int literal token.
+  // Only sign extend inline immediates.
+  // FIXME: No errors on truncation
+  switch (OpSize) {
+  case 4: {
+    if (isInt<32>(Val) &&
+        AMDGPU::isInlinableLiteral32(static_cast<int32_t>(Val),
+                                     AsmParser->hasInv2PiInlineImm())) {
+      Inst.addOperand(MCOperand::createImm(Val));
+      return;
     }
-    Inst.addOperand(MCOperand::createImm(Literal.getLoBits(32).getZExtValue()));
+
+    Inst.addOperand(MCOperand::createImm(Val & 0xffffffff));
+    return;
+  }
+  case 8: {
+    if (AMDGPU::isInlinableLiteral64(Val,
+                                     AsmParser->hasInv2PiInlineImm())) {
+      Inst.addOperand(MCOperand::createImm(Val));
+      return;
+    }
+
+    Inst.addOperand(MCOperand::createImm(Lo_32(Val)));
+    return;
+  }
+  case 2: {
+    if (isInt<16>(Val) &&
+        AMDGPU::isInlinableLiteral16(static_cast<int16_t>(Val),
+                                     AsmParser->hasInv2PiInlineImm())) {
+      Inst.addOperand(MCOperand::createImm(Val));
+      return;
+    }
+
+    Inst.addOperand(MCOperand::createImm(Val & 0xffff));
+    return;
+  }
+  default:
+    llvm_unreachable("invalid operand size");
   }
 }
 
-void AMDGPUOperand::addKImmFP32Operands(MCInst &Inst, unsigned N) const {
+template <unsigned Bitwidth>
+void AMDGPUOperand::addKImmFPOperands(MCInst &Inst, unsigned N) const {
   APInt Literal(64, Imm.Val);
-  if (Imm.IsFPImm) { // We got fp literal
-    bool lost;
-    APFloat FPLiteral(APFloat::IEEEdouble, Literal);
-    FPLiteral.convert(APFloat::IEEEsingle, APFloat::rmNearestTiesToEven, &lost);
-    Inst.addOperand(MCOperand::createImm(FPLiteral.bitcastToAPInt().getZExtValue()));
-  } else { // We got int literal token
-    Inst.addOperand(MCOperand::createImm(Literal.getLoBits(32).getZExtValue()));
+
+  if (!Imm.IsFPImm) {
+    // We got int literal token.
+    Inst.addOperand(MCOperand::createImm(Literal.getLoBits(Bitwidth).getZExtValue()));
+    return;
   }
+
+  bool Lost;
+  APFloat FPLiteral(APFloat::IEEEdouble, Literal);
+  FPLiteral.convert(*getFltSemantics(Bitwidth / 8),
+                    APFloat::rmNearestTiesToEven, &Lost);
+  Inst.addOperand(MCOperand::createImm(FPLiteral.bitcastToAPInt().getZExtValue()));
 }
 
 void AMDGPUOperand::addRegOperands(MCInst &Inst, unsigned N) const {
Index: lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
===================================================================
--- lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
+++ lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
@@ -62,6 +62,7 @@
   MCOperand decodeOperand_VGPR_32(unsigned Val) const;
   MCOperand decodeOperand_VS_32(unsigned Val) const;
   MCOperand decodeOperand_VS_64(unsigned Val) const;
+  MCOperand decodeOperand_VSrc16(unsigned Val) const;
 
   MCOperand decodeOperand_VReg_64(unsigned Val) const;
   MCOperand decodeOperand_VReg_96(unsigned Val) const;
@@ -79,6 +80,7 @@
     OPW32,
     OPW64,
     OPW128,
+    OPW16,
     OPW_LAST_,
     OPW_FIRST_ = OPW32
   };
@@ -87,7 +89,7 @@
   unsigned getTtmpClassId(const OpWidthTy Width) const;
 
   static MCOperand decodeIntImmed(unsigned Imm);
-  static MCOperand decodeFPImmed(bool Is32, unsigned Imm);
+  static MCOperand decodeFPImmed(OpWidthTy Width, unsigned Imm);
   MCOperand decodeLiteralConstant() const;
 
   MCOperand decodeSrcOp(const OpWidthTy Width, unsigned Val) const;
Index: lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
===================================================================
--- lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -88,6 +88,15 @@
 DECODE_OPERAND(SReg_256)
 DECODE_OPERAND(SReg_512)
 
+
+static DecodeStatus decodeOperand_VSrc16(MCInst &Inst,
+                                         unsigned Imm,
+                                         uint64_t Addr,
+                                         const void *Decoder) {
+  auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
+  return addOperand(Inst, DAsm->decodeOperand_VSrc16(Imm));
+}
+
 #define GET_SUBTARGETINFO_ENUM
 #include "AMDGPUGenSubtargetInfo.inc"
 #undef GET_SUBTARGETINFO_ENUM
@@ -250,6 +259,10 @@
   return decodeSrcOp(OPW64, Val);
 }
 
+MCOperand AMDGPUDisassembler::decodeOperand_VSrc16(unsigned Val) const {
+  return decodeSrcOp(OPW16, Val);
+}
+
 MCOperand AMDGPUDisassembler::decodeOperand_VGPR_32(unsigned Val) const {
   // Some instructions have operand restrictions beyond what the encoding
   // allows. Some ordinarily VSrc_32 operands are VGPR_32, so clear the extra
@@ -324,28 +337,96 @@
       // Cast prevents negative overflow.
 }
 
-MCOperand AMDGPUDisassembler::decodeFPImmed(bool Is32, unsigned Imm) {
+static int64_t getInlineImmVal32(unsigned Imm) {
+  switch (Imm) {
+  case 240:
+    return FloatToBits(0.5f);
+  case 241:
+    return FloatToBits(-0.5f);
+  case 242:
+    return FloatToBits(1.0f);
+  case 243:
+    return FloatToBits(-1.0f);
+  case 244:
+    return FloatToBits(2.0f);
+  case 245:
+    return FloatToBits(-2.0f);
+  case 246:
+    return FloatToBits(4.0f);
+  case 247:
+    return FloatToBits(-4.0f);
+  case 248: // 1 / (2 * PI)
+    return 0x3e22f983;
+  default:
+    llvm_unreachable("invalid fp inline imm");
+  }
+}
+
+static int64_t getInlineImmVal64(unsigned Imm) {
+  switch (Imm) {
+  case 240:
+    return DoubleToBits(0.5);
+  case 241:
+    return DoubleToBits(-0.5);
+  case 242:
+    return DoubleToBits(1.0);
+  case 243:
+    return DoubleToBits(-1.0);
+  case 244:
+    return DoubleToBits(2.0);
+  case 245:
+    return DoubleToBits(-2.0);
+  case 246:
+    return DoubleToBits(4.0);
+  case 247:
+    return DoubleToBits(-4.0);
+  case 248: // 1 / (2 * PI)
+    return 0x3fc45f306dc9c882;
+  default:
+    llvm_unreachable("invalid fp inline imm");
+  }
+}
+
+static int64_t getInlineImmVal16(unsigned Imm) {
+  switch (Imm) { // Half-precision bit patterns of the inline FP constants.
+  case 240:
+    return 0x3800; // 0.5
+  case 241:
+    return 0xB800; // -0.5
+  case 242:
+    return 0x3C00; // 1.0
+  case 243:
+    return 0xBC00; // -1.0
+  case 244:
+    return 0x4000; // 2.0
+  case 245:
+    return 0xC000; // -2.0
+  case 246:
+    return 0x4400; // 4.0
+  case 247:
+    return 0xC400; // -4.0
+  case 248: // 1 / (2 * PI)
+    return 0x3118;
+  default:
+    llvm_unreachable("invalid fp inline imm");
+  }
+}
+
+MCOperand AMDGPUDisassembler::decodeFPImmed(OpWidthTy Width, unsigned Imm) {
   assert(Imm >= AMDGPU::EncValues::INLINE_FLOATING_C_MIN
       && Imm <= AMDGPU::EncValues::INLINE_FLOATING_C_MAX);
+
   // ToDo: case 248: 1/(2*PI) - is allowed only on VI
-  // ToDo: AMDGPUInstPrinter does not support 1/(2*PI). It consider 1/(2*PI) as
-  // literal constant.
-  float V = 0.0f;
-  switch (Imm) {
-  case 240: V =  0.5f; break;
-  case 241: V = -0.5f; break;
-  case 242: V =  1.0f; break;
-  case 243: V = -1.0f; break;
-  case 244: V =  2.0f; break;
-  case 245: V = -2.0f; break;
-  case 246: V =  4.0f; break;
-  case 247: V = -4.0f; break;
-  case 248: return MCOperand::createImm(Is32 ?         // 1/(2*PI)
-                                          0x3e22f983 :
-                                          0x3fc45f306dc9c882);
-  default: break;
+  switch (Width) {
+  case OPW32:
+    return MCOperand::createImm(getInlineImmVal32(Imm));
+  case OPW64:
+    return MCOperand::createImm(getInlineImmVal64(Imm));
+  case OPW16:
+    return MCOperand::createImm(getInlineImmVal16(Imm));
+  default:
+    llvm_unreachable("implement me");
   }
-  return MCOperand::createImm(Is32? FloatToBits(V) : DoubleToBits(V));
 }
 
 unsigned AMDGPUDisassembler::getVgprClassId(const OpWidthTy Width) const {
@@ -353,7 +434,9 @@
   assert(OPW_FIRST_ <= Width && Width < OPW_LAST_);
   switch (Width) {
   default: // fall
-  case OPW32: return VGPR_32RegClassID;
+  case OPW32:
+  case OPW16:
+    return VGPR_32RegClassID;
   case OPW64: return VReg_64RegClassID;
   case OPW128: return VReg_128RegClassID;
   }
@@ -364,7 +447,9 @@
   assert(OPW_FIRST_ <= Width && Width < OPW_LAST_);
   switch (Width) {
   default: // fall
-  case OPW32: return SGPR_32RegClassID;
+  case OPW32:
+  case OPW16:
+    return SGPR_32RegClassID;
   case OPW64: return SGPR_64RegClassID;
   case OPW128: return SGPR_128RegClassID;
   }
@@ -375,7 +460,9 @@
   assert(OPW_FIRST_ <= Width && Width < OPW_LAST_);
   switch (Width) {
   default: // fall
-  case OPW32: return TTMP_32RegClassID;
+  case OPW32:
+  case OPW16:
+    return TTMP_32RegClassID;
   case OPW64: return TTMP_64RegClassID;
   case OPW128: return TTMP_128RegClassID;
   }
@@ -396,19 +483,26 @@
     return createSRegOperand(getTtmpClassId(Width), Val - TTMP_MIN);
   }
 
-  assert(Width == OPW32 || Width == OPW64);
-  const bool Is32 = (Width == OPW32);
+  assert(Width == OPW16 || Width == OPW32 || Width == OPW64);
 
   if (INLINE_INTEGER_C_MIN <= Val && Val <= INLINE_INTEGER_C_MAX)
     return decodeIntImmed(Val);
 
   if (INLINE_FLOATING_C_MIN <= Val && Val <= INLINE_FLOATING_C_MAX)
-    return decodeFPImmed(Is32, Val);
+    return decodeFPImmed(Width, Val);
 
   if (Val == LITERAL_CONST)
     return decodeLiteralConstant();
 
-  return Is32 ? decodeSpecialReg32(Val) : decodeSpecialReg64(Val);
+  switch (Width) {
+  case OPW32:
+  case OPW16:
+    return decodeSpecialReg32(Val);
+  case OPW64:
+    return decodeSpecialReg64(Val);
+  default:
+    llvm_unreachable("unexpected immediate type");
+  }
 }
 
 MCOperand AMDGPUDisassembler::decodeSpecialReg32(unsigned Val) const {
Index: lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h
===================================================================
--- lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h
+++ lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h
@@ -83,6 +83,8 @@
   void printRegOperand(unsigned RegNo, raw_ostream &O);
   void printVOPDst(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
                    raw_ostream &O);
+  void printImmediate16(uint32_t Imm, const MCSubtargetInfo &STI,
+                        raw_ostream &O);
   void printImmediate32(uint32_t Imm, const MCSubtargetInfo &STI,
                         raw_ostream &O);
   void printImmediate64(uint64_t Imm, const MCSubtargetInfo &STI,
Index: lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp
===================================================================
--- lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp
+++ lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp
@@ -47,7 +47,13 @@
 void AMDGPUInstPrinter::printU16ImmOperand(const MCInst *MI, unsigned OpNo,
                                            const MCSubtargetInfo &STI,
                                            raw_ostream &O) {
-  O << formatHex(MI->getOperand(OpNo).getImm() & 0xffff);
+  // It's possible to end up with a 32-bit literal used with a 16-bit operand
+  // with ignored high bits. Print as 32-bit anyway in that case.
+  int64_t Imm = MI->getOperand(OpNo).getImm();
+  if (isInt<16>(Imm) || isUInt<16>(Imm))
+    O << formatHex(static_cast<uint64_t>(Imm & 0xffff));
+  else
+    printU32ImmOperand(MI, OpNo, STI, O);
 }
 
 void AMDGPUInstPrinter::printU4ImmDecOperand(const MCInst *MI, unsigned OpNo,
@@ -322,6 +328,38 @@
   printOperand(MI, OpNo, STI, O);
 }
 
+/// Print a 16-bit operand immediate: inline constants by value, else as hex.
+void AMDGPUInstPrinter::printImmediate16(uint32_t Imm,
+                                         const MCSubtargetInfo &STI,
+                                         raw_ostream &O) {
+  int16_t SImm = static_cast<int16_t>(Imm);
+  if (SImm >= -16 && SImm <= 64) {
+    O << SImm;
+    return;
+  }
+  if (Imm == 0x3C00)
+    O << "1.0";
+  else if (Imm == 0xBC00)
+    O << "-1.0";
+  else if (Imm == 0x3800)
+    O << "0.5";
+  else if (Imm == 0xB800)
+    O << "-0.5";
+  else if (Imm == 0x4000)
+    O << "2.0";
+  else if (Imm == 0xC000)
+    O << "-2.0";
+  else if (Imm == 0x4400)
+    O << "4.0";
+  else if (Imm == 0xC400)
+    O << "-4.0";
+  else if (Imm == 0x3118 && // 1.0 / (2.0 * pi); inline only with Inv2Pi
+           STI.getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm])
+    O << "0.15915494";
+  else
+    O << formatHex(static_cast<uint64_t>(Imm));
+}
+
 void AMDGPUInstPrinter::printImmediate32(uint32_t Imm,
                                          const MCSubtargetInfo &STI,
                                          raw_ostream &O) {
@@ -417,22 +455,39 @@
     }
   } else if (Op.isImm()) {
     const MCInstrDesc &Desc = MII.get(MI->getOpcode());
-    int RCID = Desc.OpInfo[OpNo].RegClass;
-    if (RCID != -1) {
-      unsigned RCBits = AMDGPU::getRegBitWidth(MRI.getRegClass(RCID));
-      if (RCBits == 32)
-        printImmediate32(Op.getImm(), STI, O);
-      else if (RCBits == 64)
-        printImmediate64(Op.getImm(), STI, O);
-      else
-        llvm_unreachable("Invalid register class size");
-    } else if (Desc.OpInfo[OpNo].OperandType == MCOI::OPERAND_IMMEDIATE) {
+    switch (Desc.OpInfo[OpNo].OperandType) {
+    case AMDGPU::OPERAND_REG_IMM_INT32:
+    case AMDGPU::OPERAND_REG_IMM_FP32:
+    case AMDGPU::OPERAND_REG_INLINE_C_INT32:
+    case AMDGPU::OPERAND_REG_INLINE_C_FP32:
+    case MCOI::OPERAND_IMMEDIATE:
       printImmediate32(Op.getImm(), STI, O);
-    } else {
+      break;
+    case AMDGPU::OPERAND_REG_IMM_INT64:
+    case AMDGPU::OPERAND_REG_IMM_FP64:
+    case AMDGPU::OPERAND_REG_INLINE_C_INT64:
+    case AMDGPU::OPERAND_REG_INLINE_C_FP64:
+      printImmediate64(Op.getImm(), STI, O);
+      break;
+    case AMDGPU::OPERAND_REG_INLINE_C_INT16:
+    case AMDGPU::OPERAND_REG_INLINE_C_FP16:
+    case AMDGPU::OPERAND_REG_IMM_INT16:
+    case AMDGPU::OPERAND_REG_IMM_FP16:
+      printImmediate16(Op.getImm(), STI, O);
+      break;
+    case MCOI::OPERAND_UNKNOWN:
+    case MCOI::OPERAND_PCREL:
+      O << formatDec(Op.getImm());
+      break;
+    case MCOI::OPERAND_REGISTER:
+      // FIXME: This should be removed and handled somewhere else. Seems to come
+      // from a disassembler bug.
+      O << "/*invalid immediate*/";
+      break;
+    default:
       // We hit this for the immediate instruction bits that don't yet have a
       // custom printer.
-      // TODO: Eventually this should be unnecessary.
-      O << formatDec(Op.getImm());
+      llvm_unreachable("unexpected immediate operand type");
     }
   } else if (Op.isFPImm()) {
     // We special case 0.0 because otherwise it will be printed as an integer.
Index: lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
===================================================================
--- lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
+++ lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
@@ -39,7 +39,7 @@
   const MCRegisterInfo &MRI;
 
   /// \brief Encode an fp or int literal
-  uint32_t getLitEncoding(const MCOperand &MO, unsigned OpSize,
+  uint32_t getLitEncoding(const MCOperand &MO, const MCOperandInfo &OpInfo,
                           const MCSubtargetInfo &STI) const;
 
 public:
@@ -87,6 +87,42 @@
   return 0;
 }
 
+/// Map the 16-bit value \p Val to its source-operand encoding: an inline
+/// integer or half-precision constant when one matches, otherwise 255 to
+/// signal that a literal has to be emitted.
+static uint32_t getLit16Encoding(uint16_t Val, const MCSubtargetInfo &STI) {
+  uint16_t InlineInt = getIntInlineImmEncoding(static_cast<int16_t>(Val));
+  if (InlineInt != 0)
+    return InlineInt;
+
+  switch (Val) {
+  case 0x3800: // 0.5
+    return 240;
+  case 0xB800: // -0.5
+    return 241;
+  case 0x3C00: // 1.0
+    return 242;
+  case 0xBC00: // -1.0
+    return 243;
+  case 0x4000: // 2.0
+    return 244;
+  case 0xC000: // -2.0
+    return 245;
+  case 0x4400: // 4.0
+    return 246;
+  case 0xC400: // -4.0
+    return 247;
+  case 0x3118: // 1.0 / (2.0 * pi)
+    if (STI.getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm])
+      return 248;
+    break;
+  default:
+    break;
+  }
+
+  return 255;
+}
+
 static uint32_t getLit32Encoding(uint32_t Val, const MCSubtargetInfo &STI) {
   uint32_t IntImm = getIntInlineImmEncoding(static_cast<int32_t>(Val));
   if (IntImm != 0)
@@ -160,7 +196,7 @@
 }
 
 uint32_t SIMCCodeEmitter::getLitEncoding(const MCOperand &MO,
-                                         unsigned OpSize,
+                                         const MCOperandInfo &OpInfo,
                                          const MCSubtargetInfo &STI) const {
 
   int64_t Imm;
@@ -180,12 +216,16 @@
     Imm = MO.getImm();
   }
 
-  if (OpSize == 4)
+  switch (AMDGPU::getOperandSize(OpInfo)) {
+  case 4:
     return getLit32Encoding(static_cast<uint32_t>(Imm), STI);
-
-  assert(OpSize == 8);
-
-  return getLit64Encoding(static_cast<uint64_t>(Imm), STI);
+  case 8:
+    return getLit64Encoding(static_cast<uint64_t>(Imm), STI);
+  case 2:
+    return getLit16Encoding(static_cast<uint16_t>(Imm), STI);
+  default:
+    llvm_unreachable("invalid operand size");
+  }
 }
 
 void SIMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
@@ -212,12 +252,9 @@
     if (!AMDGPU::isSISrcOperand(Desc, i))
       continue;
 
-    int RCID = Desc.OpInfo[i].RegClass;
-    const MCRegisterClass &RC = MRI.getRegClass(RCID);
-
     // Is this operand a literal immediate?
     const MCOperand &Op = MI.getOperand(i);
-    if (getLitEncoding(Op, AMDGPU::getRegBitWidth(RC) / 8, STI) != 255)
+    if (getLitEncoding(Op, Desc.OpInfo[i], STI) != 255)
       continue;
 
     // Yes! Encode it
@@ -282,9 +319,7 @@
 
   const MCInstrDesc &Desc = MCII.get(MI.getOpcode());
   if (AMDGPU::isSISrcOperand(Desc, OpNo)) {
-    uint32_t Enc = getLitEncoding(MO,
-                                  AMDGPU::getRegOperandSize(&MRI, Desc, OpNo),
-                                  STI);
+    uint32_t Enc = getLitEncoding(MO, Desc.OpInfo[OpNo], STI);
     if (Enc != ~0U && (Enc != 255 || Desc.getSize() == 4))
       return Enc;
 
Index: lib/Target/AMDGPU/SIDefines.h
===================================================================
--- lib/Target/AMDGPU/SIDefines.h
+++ lib/Target/AMDGPU/SIDefines.h
@@ -54,17 +54,36 @@
 namespace AMDGPU {
   enum OperandType {
     /// Operands with register or 32-bit immediate
-    OPERAND_REG_IMM32_INT = MCOI::OPERAND_FIRST_TARGET,
-    OPERAND_REG_IMM32_FP,
+    OPERAND_REG_IMM_INT32 = MCOI::OPERAND_FIRST_TARGET,
+    OPERAND_REG_IMM_INT64,
+    OPERAND_REG_IMM_INT16,
+    OPERAND_REG_IMM_FP32,
+    OPERAND_REG_IMM_FP64,
+    OPERAND_REG_IMM_FP16,
+
     /// Operands with register or inline constant
-    OPERAND_REG_INLINE_C_INT,
-    OPERAND_REG_INLINE_C_FP,
+    OPERAND_REG_INLINE_C_INT16,
+    OPERAND_REG_INLINE_C_INT32,
+    OPERAND_REG_INLINE_C_INT64,
+    OPERAND_REG_INLINE_C_FP16,
+    OPERAND_REG_INLINE_C_FP32,
+    OPERAND_REG_INLINE_C_FP64,
+
+    OPERAND_REG_IMM_FIRST = OPERAND_REG_IMM_INT32,
+    OPERAND_REG_IMM_LAST = OPERAND_REG_IMM_FP16,
+
+    OPERAND_REG_INLINE_C_FIRST = OPERAND_REG_INLINE_C_INT16,
+    OPERAND_REG_INLINE_C_LAST = OPERAND_REG_INLINE_C_FP64,
+
+    OPERAND_SRC_FIRST = OPERAND_REG_IMM_INT32,
+    OPERAND_SRC_LAST = OPERAND_REG_INLINE_C_LAST,
 
     // Operand for source modifiers for VOP instructions
     OPERAND_INPUT_MODS,
 
     /// Operand with 32-bit immediate that uses the constant bus.
-    OPERAND_KIMM32
+    OPERAND_KIMM32,
+    OPERAND_KIMM16
   };
 }
 }
Index: lib/Target/AMDGPU/SIFoldOperands.cpp
===================================================================
--- lib/Target/AMDGPU/SIFoldOperands.cpp
+++ lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -308,12 +308,14 @@
     return;
   }
 
-  APInt Imm(64, OpToFold.getImm());
 
   const MCInstrDesc &FoldDesc = OpToFold.getParent()->getDesc();
   const TargetRegisterClass *FoldRC =
     TRI.getRegClass(FoldDesc.OpInfo[0].RegClass);
 
+  APInt Imm(TII->operandBitWidth(FoldDesc.OpInfo[1].OperandType),
+            OpToFold.getImm());
+
   // Split 64-bit constants into 32-bits for folding.
   if (UseOp.getSubReg() && AMDGPU::getRegBitWidth(FoldRC->getID()) == 64) {
     unsigned UseReg = UseOp.getReg();
@@ -322,6 +324,8 @@
       MRI.getRegClass(UseReg) :
       TRI.getPhysRegClass(UseReg);
 
+    assert(Imm.getBitWidth() == 64);
+
     if (AMDGPU::getRegBitWidth(UseRC->getID()) != 64)
       return;
 
@@ -498,7 +502,6 @@
       if (!isSafeToFold(MI))
         continue;
 
-      unsigned OpSize = TII->getOpSize(MI, 1);
       MachineOperand &OpToFold = MI.getOperand(1);
       bool FoldingImm = OpToFold.isImm() || OpToFold.isFI();
 
@@ -552,14 +555,15 @@
                Use = MRI.use_begin(Dst.getReg()), E = MRI.use_end();
              Use != E; ++Use) {
           MachineInstr *UseMI = Use->getParent();
+          unsigned OpNo = Use.getOperandNo();
 
-          if (TII->isInlineConstant(OpToFold, OpSize)) {
-            foldOperand(OpToFold, UseMI, Use.getOperandNo(), FoldList,
+          if (TII->isInlineConstant(*UseMI, OpNo, OpToFold)) {
+            foldOperand(OpToFold, UseMI, OpNo, FoldList,
                         CopiesToReplace, TII, TRI, MRI);
           } else {
             if (++NumLiteralUses == 1) {
               NonInlineUse = &*Use;
-              NonInlineUseOpNo = Use.getOperandNo();
+              NonInlineUseOpNo = OpNo;
             }
           }
         }
Index: lib/Target/AMDGPU/SIInstrInfo.h
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.h
+++ lib/Target/AMDGPU/SIInstrInfo.h
@@ -454,15 +454,96 @@
     return !RI.isSGPRReg(MRI, Dest);
   }
 
+  static int operandBitWidth(uint8_t OperandType) { // Result is in bits.
+    switch (OperandType) {
+    case AMDGPU::OPERAND_REG_IMM_INT32:
+    case AMDGPU::OPERAND_REG_IMM_FP32:
+    case AMDGPU::OPERAND_REG_INLINE_C_INT32:
+    case AMDGPU::OPERAND_REG_INLINE_C_FP32:
+      return 32;
+    case AMDGPU::OPERAND_REG_IMM_INT64:
+    case AMDGPU::OPERAND_REG_IMM_FP64:
+    case AMDGPU::OPERAND_REG_INLINE_C_INT64:
+    case AMDGPU::OPERAND_REG_INLINE_C_FP64:
+      return 64;
+    case AMDGPU::OPERAND_REG_INLINE_C_INT16:
+    case AMDGPU::OPERAND_REG_INLINE_C_FP16:
+    case AMDGPU::OPERAND_REG_IMM_INT16:
+    case AMDGPU::OPERAND_REG_IMM_FP16:
+      return 16;
+    default: // Only the REG_IMM and REG_INLINE_C types above are handled.
+      llvm_unreachable("unexpected operand type");
+    }
+  }
+
   bool isInlineConstant(const APInt &Imm) const;
-  bool isInlineConstant(const MachineOperand &MO, unsigned OpSize) const;
-  bool isLiteralConstant(const MachineOperand &MO, unsigned OpSize) const;
+
+  bool isInlineConstant(const MachineOperand &MO, uint8_t OperandType) const;
+
+  bool isInlineConstant(const MachineOperand &MO,
+                        const MCOperandInfo &OpInfo) const {
+    return isInlineConstant(MO, OpInfo.OperandType);
+  }
+
+  /// \returns true if substituting \p DefMO for \p UseMO in \p MI would yield
+  /// an inline immediate; false if \p MI lacks operand info for that index.
+  bool isInlineConstant(const MachineInstr &MI,
+                        const MachineOperand &UseMO,
+                        const MachineOperand &DefMO) const {
+    assert(UseMO.getParent() == &MI);
+    int OpIdx = MI.getOperandNo(&UseMO);
+    // OpInfo holds NumOperands entries; index NumOperands is out of range.
+    if (!MI.getDesc().OpInfo || OpIdx >= MI.getDesc().NumOperands)
+      return false;
+
+    return isInlineConstant(DefMO, MI.getDesc().OpInfo[OpIdx]);
+  }
+
+  /// \returns true if the operand \p OpIdx in \p MI is a valid inline
+  /// immediate.
+  bool isInlineConstant(const MachineInstr &MI, unsigned OpIdx) const {
+    const MachineOperand &MO = MI.getOperand(OpIdx);
+    return isInlineConstant(MO, MI.getDesc().OpInfo[OpIdx].OperandType);
+  }
+
+  /// \returns true if \p MO would be an inline immediate as operand \p OpIdx.
+  bool isInlineConstant(const MachineInstr &MI, unsigned OpIdx,
+                        const MachineOperand &MO) const {
+    // OpInfo holds NumOperands entries; index NumOperands is out of range.
+    if (!MI.getDesc().OpInfo || OpIdx >= MI.getDesc().NumOperands)
+      return false;
+
+    if (MI.isCopy()) {
+      unsigned Size = getOpSize(MI, OpIdx);
+      assert(Size == 8 || Size == 4);
+      uint8_t OpType = (Size == 8) ?
+        AMDGPU::OPERAND_REG_IMM_INT64 : AMDGPU::OPERAND_REG_IMM_INT32;
+      return isInlineConstant(MO, OpType);
+    }
+    return isInlineConstant(MO, MI.getDesc().OpInfo[OpIdx].OperandType);
+  }
+
+  bool isInlineConstant(const MachineOperand &MO) const {
+    const MachineInstr *Parent = MO.getParent();
+    return isInlineConstant(*Parent, Parent->getOperandNo(&MO));
+  }
+
+  bool isLiteralConstant(const MachineOperand &MO,
+                         const MCOperandInfo &OpInfo) const {
+    return MO.isImm() && !isInlineConstant(MO, OpInfo.OperandType);
+  }
+
+  bool isLiteralConstant(const MachineInstr &MI, int OpIdx) const {
+    const MachineOperand &MO = MI.getOperand(OpIdx);
+    return MO.isImm() && !isInlineConstant(MI, OpIdx);
+  }
 
   // Returns true if this operand could potentially require a 32-bit literal
   // operand, but not necessarily. A FrameIndex for example could resolve to an
   // inline immediate value that will not require an additional 4-bytes; this
   // assumes that it will.
-  bool isLiteralConstantLike(const MachineOperand &MO, unsigned OpSize) const;
+  bool isLiteralConstantLike(const MachineOperand &MO,
+                             const MCOperandInfo &OpInfo) const;
 
   bool isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
                          const MachineOperand &MO) const;
@@ -474,7 +555,7 @@
   /// \brief Returns true if this operand uses the constant bus.
   bool usesConstantBus(const MachineRegisterInfo &MRI,
                        const MachineOperand &MO,
-                       unsigned OpSize) const;
+                       const MCOperandInfo &OpInfo) const;
 
   /// \brief Return true if this instruction has any modifiers.
   ///  e.g. src[012]_mod, omod, clamp.
Index: lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.cpp
+++ lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1415,10 +1415,12 @@
     // If this is a free constant, there's no reason to do this.
     // TODO: We could fold this here instead of letting SIFoldOperands do it
     // later.
-    if (isInlineConstant(ImmOp, 4))
+    MachineOperand *Src0 = getNamedOperand(UseMI, AMDGPU::OpName::src0);
+
+    // Any src operand can be used for the legality check.
+    if (isInlineConstant(UseMI, *Src0, ImmOp))
       return false;
 
-    MachineOperand *Src0 = getNamedOperand(UseMI, AMDGPU::OpName::src0);
     MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
     MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
 
@@ -1620,8 +1622,10 @@
   case AMDGPU::V_MAC_F16_e32:
     IsF16 = true;
   case AMDGPU::V_MAC_F32_e32: {
-    const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
-    if (Src0->isImm() && !isInlineConstant(*Src0, 4))
+    int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
+                                             AMDGPU::OpName::src0);
+    const MachineOperand *Src0 = &MI.getOperand(Src0Idx);
+    if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0))
       return nullptr;
     break;
   }
@@ -1680,46 +1684,55 @@
   case 64:
     return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(),
                                         ST.hasInv2PiInlineImm());
+  case 16:
+    return AMDGPU::isInlinableLiteral16(Imm.getSExtValue(),
+                                        ST.hasInv2PiInlineImm());
   default:
     llvm_unreachable("invalid bitwidth");
   }
 }
 
 bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
-                                   unsigned OpSize) const {
-  if (MO.isImm()) {
-    // MachineOperand provides no way to tell the true operand size, since it
-    // only records a 64-bit value. We need to know the size to determine if a
-    // 32-bit floating point immediate bit pattern is legal for an integer
-    // immediate. It would be for any 32-bit integer operand, but would not be
-    // for a 64-bit one.
-    switch (OpSize) {
-    case 4:
-      return AMDGPU::isInlinableLiteral32(static_cast<int32_t>(MO.getImm()),
-                                          ST.hasInv2PiInlineImm());
-    case 8:
-      return AMDGPU::isInlinableLiteral64(MO.getImm(),
-                                          ST.hasInv2PiInlineImm());
-    default:
-      llvm_unreachable("invalid bitwidth");
-    }
-  }
+                                   uint8_t OperandType) const {
+  if (!MO.isImm() || OperandType < MCOI::OPERAND_FIRST_TARGET)
+    return false;
 
-  return false;
-}
+  // MachineOperand provides no way to tell the true operand size, since it only
+  // records a 64-bit value. We need to know the size to determine if a 32-bit
+  // floating point immediate bit pattern is legal for an integer immediate. It
+  // would be for any 32-bit integer operand, but would not be for a 64-bit one.
 
-bool SIInstrInfo::isLiteralConstant(const MachineOperand &MO,
-                                    unsigned OpSize) const {
-  return MO.isImm() && !isInlineConstant(MO, OpSize);
+  int64_t Imm = MO.getImm();
+  switch (operandBitWidth(OperandType)) {
+  case 32: {
+    int32_t Trunc = static_cast<int32_t>(Imm);
+    return Trunc == Imm &&
+           AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm());
+  }
+  case 64: {
+    return AMDGPU::isInlinableLiteral64(MO.getImm(),
+                                        ST.hasInv2PiInlineImm());
+  }
+  case 16: {
+    if (isInt<16>(Imm) || isUInt<16>(Imm)) {
+      int16_t Trunc = static_cast<int16_t>(Imm);
+      return AMDGPU::isInlinableLiteral16(Trunc, ST.hasInv2PiInlineImm());
+    }
+
+    return false;
+  }
+  default:
+    llvm_unreachable("invalid bitwidth");
+  }
 }
 
 bool SIInstrInfo::isLiteralConstantLike(const MachineOperand &MO,
-                                        unsigned OpSize) const {
+                                        const MCOperandInfo &OpInfo) const {
   switch (MO.getType()) {
   case MachineOperand::MO_Register:
     return false;
   case MachineOperand::MO_Immediate:
-    return !isInlineConstant(MO, OpSize);
+    return !isInlineConstant(MO, OpInfo);
   case MachineOperand::MO_FrameIndex:
   case MachineOperand::MO_MachineBasicBlock:
   case MachineOperand::MO_ExternalSymbol:
@@ -1758,11 +1771,10 @@
   if (OpInfo.RegClass < 0)
     return false;
 
-  unsigned OpSize = RI.getRegClass(OpInfo.RegClass)->getSize();
-  if (isLiteralConstant(MO, OpSize))
-    return RI.opCanUseLiteralConstant(OpInfo.OperandType);
+  if (MO.isImm() && isInlineConstant(MO, OpInfo))
+    return RI.opCanUseInlineConstant(OpInfo.OperandType);
 
-  return RI.opCanUseInlineConstant(OpInfo.OperandType);
+  return RI.opCanUseLiteralConstant(OpInfo.OperandType);
 }
 
 bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
@@ -1789,12 +1801,17 @@
 
 bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
                                   const MachineOperand &MO,
-                                  unsigned OpSize) const {
+                                  const MCOperandInfo &OpInfo) const {
   // Literal constants use the constant bus.
-  if (isLiteralConstant(MO, OpSize))
-    return true;
+  //if (isLiteralConstantLike(MO, OpInfo))
+  // return true;
+  if (MO.isImm())
+    return !isInlineConstant(MO, OpInfo);
 
-  if (!MO.isReg() || !MO.isUse())
+  if (!MO.isReg())
+    return true; // Misc other operands like FrameIndex
+
+  if (!MO.isUse())
     return false;
 
   if (TargetRegisterInfo::isVirtualRegister(MO.getReg()))
@@ -1923,17 +1940,22 @@
         return false;
       }
       break;
-    case AMDGPU::OPERAND_REG_IMM32_INT:
-    case AMDGPU::OPERAND_REG_IMM32_FP:
+    case AMDGPU::OPERAND_REG_IMM_INT32:
+    case AMDGPU::OPERAND_REG_IMM_FP32:
       break;
-    case AMDGPU::OPERAND_REG_INLINE_C_INT:
-    case AMDGPU::OPERAND_REG_INLINE_C_FP:
-      if (isLiteralConstant(MI.getOperand(i),
-                            RI.getRegClass(RegClass)->getSize())) {
+    case AMDGPU::OPERAND_REG_INLINE_C_INT32:
+    case AMDGPU::OPERAND_REG_INLINE_C_FP32:
+    case AMDGPU::OPERAND_REG_INLINE_C_INT64:
+    case AMDGPU::OPERAND_REG_INLINE_C_FP64:
+    case AMDGPU::OPERAND_REG_INLINE_C_INT16:
+    case AMDGPU::OPERAND_REG_INLINE_C_FP16: {
+      const MachineOperand &MO = MI.getOperand(i);
+      if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) {
         ErrInfo = "Illegal immediate value for operand.";
         return false;
       }
       break;
+    }
     case MCOI::OPERAND_IMMEDIATE:
     case AMDGPU::OPERAND_KIMM32:
       // Check if this operand is an immediate.
@@ -1985,7 +2007,7 @@
       if (OpIdx == -1)
         break;
       const MachineOperand &MO = MI.getOperand(OpIdx);
-      if (usesConstantBus(MRI, MO, getOpSize(Opcode, OpIdx))) {
+      if (usesConstantBus(MRI, MO, MI.getDesc().OpInfo[OpIdx])) {
         if (MO.isReg()) {
           if (MO.getReg() != SGPRUsed)
             ++ConstantBusCount;
@@ -2328,7 +2350,7 @@
   if (!MO)
     MO = &MI.getOperand(OpIdx);
 
-  if (isVALU(MI) && usesConstantBus(MRI, *MO, DefinedRC->getSize())) {
+  if (isVALU(MI) && usesConstantBus(MRI, *MO, OpInfo)) {
 
     RegSubRegPair SGPRUsed;
     if (MO->isReg())
@@ -2340,7 +2362,7 @@
       const MachineOperand &Op = MI.getOperand(i);
       if (Op.isReg()) {
         if ((Op.getReg() != SGPRUsed.Reg || Op.getSubReg() != SGPRUsed.SubReg) &&
-            usesConstantBus(MRI, Op, getOpSize(MI, i))) {
+            usesConstantBus(MRI, Op, InstDesc.OpInfo[i])) {
           return false;
         }
       } else if (InstDesc.OpInfo[i].OperandType == AMDGPU::OPERAND_KIMM32) {
@@ -3537,14 +3559,14 @@
     if (Src0Idx == -1)
       return 4; // No operands.
 
-    if (isLiteralConstantLike(MI.getOperand(Src0Idx), getOpSize(MI, Src0Idx)))
+    if (isLiteralConstantLike(MI.getOperand(Src0Idx), Desc.OpInfo[Src0Idx]))
       return 8;
 
     int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
     if (Src1Idx == -1)
       return 4;
 
-    if (isLiteralConstantLike(MI.getOperand(Src1Idx), getOpSize(MI, Src1Idx)))
+    if (isLiteralConstantLike(MI.getOperand(Src1Idx), Desc.OpInfo[Src1Idx]))
       return 8;
 
     return 4;
Index: lib/Target/AMDGPU/SIInstrInfo.td
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.td
+++ lib/Target/AMDGPU/SIInstrInfo.td
@@ -402,22 +402,30 @@
 
 } // End OperandType = "OPERAND_IMMEDIATE"
 
-
-// 32-bit VALU immediate operand that uses the constant bus.
-def KImmFP32MatchClass : AsmOperandClass {
-  let Name = "KImmFP32";
-  let PredicateMethod = "isKImmFP32";
+class KImmMatchClass<int size> : AsmOperandClass {
+  let Name = "KImmFP"#size;
+  let PredicateMethod = "isKImmFP"#size;
   let ParserMethod = "parseImm";
-  let RenderMethod = "addKImmFP32Operands";
+  let RenderMethod = "addKImmFP"#size#"Operands";
 }
 
-def f32kimm : Operand<i32> {
+class kimmOperand<ValueType vt> : Operand<vt> {
   let OperandNamespace = "AMDGPU";
-  let OperandType = "OPERAND_KIMM32";
-  let PrintMethod = "printU32ImmOperand";
-  let ParserMatchClass = KImmFP32MatchClass;
+  let OperandType = "OPERAND_KIMM"#vt.Size;
+  let PrintMethod = "printU"#vt.Size#"ImmOperand";
+  let ParserMatchClass = !cast<AsmOperandClass>("KImmFP"#vt.Size#"MatchClass");
 }
 
+// 32-bit VALU immediate operand that uses the constant bus.
+def KImmFP32MatchClass : KImmMatchClass<32>;
+def f32kimm : kimmOperand<i32>;
+
+// 32-bit VALU immediate operand with a 16-bit value that uses the
+// constant bus.
+def KImmFP16MatchClass : KImmMatchClass<16>;
+def f16kimm : kimmOperand<i16>;
+
+
 def VOPDstS64 : VOPDstOperand <SReg_64>;
 
 class FPInputModsMatchClass <int opSize> : AsmOperandClass {
@@ -425,6 +433,7 @@
   let ParserMethod = "parseRegOrImmWithFPInputMods";
   let PredicateMethod = "isRegOrImmWithFP"#opSize#"InputMods";
 }
+def FP16InputModsMatchClass : FPInputModsMatchClass<16>;
 def FP32InputModsMatchClass : FPInputModsMatchClass<32>;
 def FP64InputModsMatchClass : FPInputModsMatchClass<64>;
 
@@ -437,6 +446,8 @@
 class FPInputMods <FPInputModsMatchClass matchClass> : InputMods <matchClass> {
   let PrintMethod = "printOperandAndFPInputMods";
 }
+
+def FP16InputMods : FPInputMods<FP16InputModsMatchClass>;
 def FP32InputMods : FPInputMods<FP32InputModsMatchClass>;
 def FP64InputMods : FPInputMods<FP64InputModsMatchClass>;
 
@@ -577,8 +588,8 @@
              !if(!eq(VT.Value, f64.Value), 1,
              0)));
   RegisterOperand ret = !if(isFP,
-                            !if(!eq(VT.Size, 64), VSrc_f64, VSrc_f32),
-                            !if(!eq(VT.Size, 64), VSrc_b64, VSrc_b32));
+                            !if(!eq(VT.Size, 64), VSrc_f64, !if(!eq(VT.Size, 16), VSrc_f16, VSrc_f32)),
+                            !if(!eq(VT.Size, 64), VSrc_b64, !if(!eq(VT.Size, 16), VSrc_b16, VSrc_b32)));
 }
 
 // Returns the vreg register class to use for source operand given VT
@@ -605,8 +616,9 @@
         !if(!eq(VT.Value, i1.Value),
             SCSrc_b64,
             !if(isFP,
-                VCSrc_f32,
-                VCSrc_b32)
+                !if(!eq(VT.Size, 16), VCSrc_f16, VCSrc_f32),
+                !if(!eq(VT.Size, 16), VCSrc_b16, VCSrc_b32)
+            )
          )
 	   )
      );
@@ -639,7 +651,13 @@
                0)));
   Operand ret =  !if(!eq(VT.Size, 64),
                      !if(isFP, FP64InputMods, Int64InputMods),
-                     !if(isFP, FP32InputMods, Int32InputMods));
+                       !if(isFP,
+                         !if(!eq(VT.Value, f16.Value),
+                            FP16InputMods,
+                            FP32InputMods
+                          ),
+                         Int32InputMods)
+                     );
 }
 
 // Returns the input arguments for VOP[12C] instructions for the given SrcVT.
Index: lib/Target/AMDGPU/SIInstructions.td
===================================================================
--- lib/Target/AMDGPU/SIInstructions.td
+++ lib/Target/AMDGPU/SIInstructions.td
@@ -106,9 +106,8 @@
 
 // 64-bit vector move instruction.  This is mainly used by the SIFoldOperands
 // pass to enable folding of inline immediates.
-def V_MOV_B64_PSEUDO : PseudoInstSI <(outs VReg_64:$vdst), (ins VSrc_b64:$src0)> {
-  let VALU = 1;
-}
+def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64:$vdst),
+                                      (ins VSrc_b64:$src0)>;
 } // End let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC]
 
 let usesCustomInserter = 1, SALU = 1 in {
Index: lib/Target/AMDGPU/SIRegisterInfo.h
===================================================================
--- lib/Target/AMDGPU/SIRegisterInfo.h
+++ lib/Target/AMDGPU/SIRegisterInfo.h
@@ -16,6 +16,7 @@
 #define LLVM_LIB_TARGET_AMDGPU_SIREGISTERINFO_H
 
 #include "AMDGPURegisterInfo.h"
+#include "SIDefines.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 
 namespace llvm {
@@ -135,12 +136,19 @@
 
   /// \returns True if operands defined with this operand type can accept
   /// a literal constant (i.e. any 32-bit immediate).
-  bool opCanUseLiteralConstant(unsigned OpType) const;
+  bool opCanUseLiteralConstant(unsigned OpType) const {
+    // TODO: 64-bit operands have extending behavior from 32-bit literal.
+    return OpType >= AMDGPU::OPERAND_REG_IMM_FIRST &&
+           OpType <= AMDGPU::OPERAND_REG_IMM_LAST;
+  }
 
   /// \returns True if operands defined with this operand type can accept
   /// an inline constant. i.e. An integer value in the range (-16, 64) or
   /// -4.0f, -2.0f, -1.0f, -0.5f, 0.0f, 0.5f, 1.0f, 2.0f, 4.0f.
-  bool opCanUseInlineConstant(unsigned OpType) const;
+  bool opCanUseInlineConstant(unsigned OpType) const {
+    return OpType >= AMDGPU::OPERAND_SRC_FIRST &&
+           OpType <= AMDGPU::OPERAND_SRC_LAST;
+  }
 
   enum PreloadedValue {
     // SGPRS:
Index: lib/Target/AMDGPU/SIRegisterInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -1081,19 +1081,6 @@
   return getCommonSubClass(DefRC, SrcRC) != nullptr;
 }
 
-bool SIRegisterInfo::opCanUseLiteralConstant(unsigned OpType) const {
-  return OpType == AMDGPU::OPERAND_REG_IMM32_INT ||
-         OpType == AMDGPU::OPERAND_REG_IMM32_FP;
-}
-
-bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const {
-  if (opCanUseLiteralConstant(OpType))
-    return true;
-
-  return OpType == AMDGPU::OPERAND_REG_INLINE_C_INT ||
-         OpType == AMDGPU::OPERAND_REG_INLINE_C_FP;
-}
-
 // FIXME: Most of these are flexible with HSA and we don't need to reserve them
 // as input registers if unused. Whether the dispatch ptr is necessary should be
 // easy to detect from used intrinsics. Scratch setup is harder to know.
Index: lib/Target/AMDGPU/SIRegisterInfo.td
===================================================================
--- lib/Target/AMDGPU/SIRegisterInfo.td
+++ lib/Target/AMDGPU/SIRegisterInfo.td
@@ -384,31 +384,43 @@
 
 multiclass SIRegOperand <string rc, string MatchName, string opType> {
   let OperandNamespace = "AMDGPU" in {
+    def _b16 : RegisterOperand<!cast<RegisterClass>(rc#"_32")> {
+      let OperandType = opType#"_INT16";
+      let ParserMatchClass = RegImmMatcher<MatchName#"B16">;
+      let DecoderMethod = "decodeOperand_VSrc16";
+    }
+
+    def _f16 : RegisterOperand<!cast<RegisterClass>(rc#"_32")> {
+      let OperandType = opType#"_FP16";
+      let ParserMatchClass = RegImmMatcher<MatchName#"F16">;
+      let DecoderMethod = "decodeOperand_VSrc16";
+    }
 
     def _b32 : RegisterOperand<!cast<RegisterClass>(rc#"_32")> {
-      let OperandType = opType#"_INT";
+      let OperandType = opType#"_INT32";
       let ParserMatchClass = RegImmMatcher<MatchName#"B32">;
     }
 
     def _f32 : RegisterOperand<!cast<RegisterClass>(rc#"_32")> {
-      let OperandType = opType#"_FP";
+      let OperandType = opType#"_FP32";
       let ParserMatchClass = RegImmMatcher<MatchName#"F32">;
     }
 
     def _b64 : RegisterOperand<!cast<RegisterClass>(rc#"_64")> {
-      let OperandType = opType#"_INT";
+      let OperandType = opType#"_INT64";
       let ParserMatchClass = RegImmMatcher<MatchName#"B64">;
     }
 
     def _f64 : RegisterOperand<!cast<RegisterClass>(rc#"_64")> {
-      let OperandType = opType#"_FP";
+      let OperandType = opType#"_FP64";
       let ParserMatchClass = RegImmMatcher<MatchName#"F64">;
     }
   }
 }
 
+// FIXME: 64-bit sources can sometimes use 32-bit constants.
 multiclass RegImmOperand <string rc, string MatchName>
-  : SIRegOperand<rc, MatchName, "OPERAND_REG_IMM32">;
+  : SIRegOperand<rc, MatchName, "OPERAND_REG_IMM">;
 
 multiclass RegInlineOperand <string rc, string MatchName>
   : SIRegOperand<rc, MatchName, "OPERAND_REG_INLINE_C">;
Index: lib/Target/AMDGPU/SIShrinkInstructions.cpp
===================================================================
--- lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -134,15 +134,14 @@
   assert(TII->isVOP1(MI) || TII->isVOP2(MI) || TII->isVOPC(MI));
 
   int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
-  MachineOperand &Src0 = MI.getOperand(Src0Idx);
 
   // Only one literal constant is allowed per instruction, so if src0 is a
   // literal constant then we can't do any folding.
-  if (Src0.isImm() &&
-      TII->isLiteralConstant(Src0, TII->getOpSize(MI, Src0Idx)))
+  if (TII->isLiteralConstant(MI, Src0Idx))
     return;
 
   // Try to fold Src0
+  MachineOperand &Src0 = MI.getOperand(Src0Idx);
   if (Src0.isReg() && MRI.hasOneUse(Src0.getReg())) {
     unsigned Reg = Src0.getReg();
     MachineInstr *Def = MRI.getUniqueVRegDef(Reg);
@@ -184,11 +183,15 @@
 }
 
 static bool isKImmOperand(const SIInstrInfo *TII, const MachineOperand &Src) {
-  return isInt<16>(Src.getImm()) && !TII->isInlineConstant(Src, 4);
+  return isInt<16>(Src.getImm()) &&
+    !TII->isInlineConstant(*Src.getParent(),
+                           Src.getParent()->getOperandNo(&Src));
 }
 
 static bool isKUImmOperand(const SIInstrInfo *TII, const MachineOperand &Src) {
-  return isUInt<16>(Src.getImm()) && !TII->isInlineConstant(Src, 4);
+  return isUInt<16>(Src.getImm()) &&
+    !TII->isInlineConstant(*Src.getParent(),
+                           Src.getParent()->getOperandNo(&Src));
 }
 
 static bool isKImmOrKUImmOperand(const SIInstrInfo *TII,
@@ -196,12 +199,12 @@
                                  bool &IsUnsigned) {
   if (isInt<16>(Src.getImm())) {
     IsUnsigned = false;
-    return !TII->isInlineConstant(Src, 4);
+    return !TII->isInlineConstant(Src);
   }
 
   if (isUInt<16>(Src.getImm())) {
     IsUnsigned = true;
-    return !TII->isInlineConstant(Src, 4);
+    return !TII->isInlineConstant(Src);
   }
 
   return false;
@@ -212,7 +215,7 @@
 static bool isReverseInlineImm(const SIInstrInfo *TII,
                                const MachineOperand &Src,
                                int32_t &ReverseImm) {
-  if (!isInt<32>(Src.getImm()) || TII->isInlineConstant(Src, 4))
+  if (!isInt<32>(Src.getImm()) || TII->isInlineConstant(Src))
     return false;
 
   ReverseImm = reverseBits<int32_t>(static_cast<int32_t>(Src.getImm()));
Index: lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
===================================================================
--- lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -13,6 +13,8 @@
 #include "AMDKernelCodeT.h"
 #include "llvm/IR/CallingConv.h"
 
+#include "SIDefines.h"
+
 #define GET_INSTRINFO_OPERAND_ENUM
 #include "AMDGPUGenInstrInfo.inc"
 #undef GET_INSTRINFO_OPERAND_ENUM
@@ -167,6 +169,37 @@
 unsigned getRegOperandSize(const MCRegisterInfo *MRI, const MCInstrDesc &Desc,
                            unsigned OpNo);
 
+LLVM_READNONE
+inline unsigned getOperandSize(const MCOperandInfo &OpInfo) { // In bytes.
+  switch (OpInfo.OperandType) {
+  case AMDGPU::OPERAND_REG_IMM_INT32:
+  case AMDGPU::OPERAND_REG_IMM_FP32:
+  case AMDGPU::OPERAND_REG_INLINE_C_INT32:
+  case AMDGPU::OPERAND_REG_INLINE_C_FP32:
+    return 4;
+
+  case AMDGPU::OPERAND_REG_IMM_INT64:
+  case AMDGPU::OPERAND_REG_IMM_FP64:
+  case AMDGPU::OPERAND_REG_INLINE_C_INT64:
+  case AMDGPU::OPERAND_REG_INLINE_C_FP64:
+    return 8;
+
+  case AMDGPU::OPERAND_REG_IMM_INT16:
+  case AMDGPU::OPERAND_REG_IMM_FP16:
+  case AMDGPU::OPERAND_REG_INLINE_C_INT16:
+  case AMDGPU::OPERAND_REG_INLINE_C_FP16:
+    return 2;
+
+  default: // Only the REG_IMM and REG_INLINE_C types above are handled.
+    llvm_unreachable("unhandled operand type");
+  }
+}
+
+LLVM_READNONE
+inline unsigned getOperandSize(const MCInstrDesc &Desc, unsigned OpNo) {
+  return getOperandSize(Desc.OpInfo[OpNo]);
+}
+
 /// \brief Is this literal inlinable
 LLVM_READNONE
 bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi);
@@ -174,6 +207,8 @@
 LLVM_READNONE
 bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi);
 
+LLVM_READNONE
+bool isInlinableLiteral16(int16_t Literal, bool HasInv2Pi);
 
 } // end namespace AMDGPU
 } // end namespace llvm
Index: lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
===================================================================
--- lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -329,25 +329,29 @@
 
 bool isSISrcOperand(const MCInstrDesc &Desc, unsigned OpNo) {
   unsigned OpType = Desc.OpInfo[OpNo].OperandType;
-
-  return OpType == AMDGPU::OPERAND_REG_IMM32_INT ||
-         OpType == AMDGPU::OPERAND_REG_IMM32_FP ||
-         OpType == AMDGPU::OPERAND_REG_INLINE_C_INT ||
-         OpType == AMDGPU::OPERAND_REG_INLINE_C_FP;
+  return OpType >= AMDGPU::OPERAND_SRC_FIRST &&
+         OpType <= AMDGPU::OPERAND_SRC_LAST;
 }
 
 bool isSISrcFPOperand(const MCInstrDesc &Desc, unsigned OpNo) {
   unsigned OpType = Desc.OpInfo[OpNo].OperandType;
-
-  return OpType == AMDGPU::OPERAND_REG_IMM32_FP ||
-         OpType == AMDGPU::OPERAND_REG_INLINE_C_FP;
+  switch (OpType) {
+  case AMDGPU::OPERAND_REG_IMM_FP32:
+  case AMDGPU::OPERAND_REG_IMM_FP64:
+  case AMDGPU::OPERAND_REG_IMM_FP16:
+  case AMDGPU::OPERAND_REG_INLINE_C_FP32:
+  case AMDGPU::OPERAND_REG_INLINE_C_FP64:
+  case AMDGPU::OPERAND_REG_INLINE_C_FP16:
+    return true;
+  default:
+    return false;
+  }
 }
 
 bool isSISrcInlinableOperand(const MCInstrDesc &Desc, unsigned OpNo) {
   unsigned OpType = Desc.OpInfo[OpNo].OperandType;
-
-  return OpType == AMDGPU::OPERAND_REG_INLINE_C_INT ||
-         OpType == AMDGPU::OPERAND_REG_INLINE_C_FP;
+  return OpType >= AMDGPU::OPERAND_REG_INLINE_C_FIRST &&
+         OpType <= AMDGPU::OPERAND_REG_INLINE_C_LAST;
 }
 
 // Avoid using MCRegisterClass::getSize, since that function will go away
@@ -413,6 +417,15 @@
   if (Literal >= -16 && Literal <= 64)
     return true;
 
+  // The actual type of the operand does not seem to matter as long
+  // as the bits match one of the inline immediate values.  For example:
+  //
+  // -nan has the hexadecimal encoding of 0xfffffffe which is -2 in decimal,
+  // so it is a legal inline immediate.
+  //
+  // 1065353216 has the hexadecimal encoding 0x3f800000 which is 1.0f in
+  // floating-point, so it is a legal inline immediate.
+
   uint32_t Val = static_cast<uint32_t>(Literal);
   return (Val == FloatToBits(0.0f)) ||
          (Val == FloatToBits(1.0f)) ||
@@ -426,6 +439,23 @@
          (Val == 0x3e22f983 && HasInv2Pi);
 }
 
+bool isInlinableLiteral16(int16_t Literal, bool HasInv2Pi) {
+  if (!HasInv2Pi) // Reject instead of asserting so NDEBUG builds agree.
+    return false;
+  if (Literal >= -16 && Literal <= 64)
+    return true;
+
+  uint16_t Val = static_cast<uint16_t>(Literal);
+  return Val == 0x3C00 || // 1.0
+         Val == 0xBC00 || // -1.0
+         Val == 0x3800 || // 0.5
+         Val == 0xB800 || // -0.5
+         Val == 0x4000 || // 2.0
+         Val == 0xC000 || // -2.0
+         Val == 0x4400 || // 4.0
+         Val == 0xC400 || // -4.0
+         Val == 0x3118;   // 1/2pi
+}
 
 } // End namespace AMDGPU
 } // End namespace llvm
Index: lib/Target/AMDGPU/VOP2Instructions.td
===================================================================
--- lib/Target/AMDGPU/VOP2Instructions.td
+++ lib/Target/AMDGPU/VOP2Instructions.td
@@ -134,7 +134,8 @@
 }
 
 class VOP_MADAK <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> {
-  field dag Ins32 = (ins VCSrc_f32:$src0, VGPR_32:$src1, f32kimm:$imm);
+  field Operand ImmOpType = !if(!eq(vt.Size, 32), f32kimm, f16kimm);
+  field dag Ins32 = (ins VCSrc_f32:$src0, VGPR_32:$src1, ImmOpType:$imm);
   field string Asm32 = "$vdst, $src0, $src1, $imm";
   field bit HasExt = 0;
 }
@@ -143,7 +144,8 @@
 def VOP_MADAK_F32 : VOP_MADAK <f32>;
 
 class VOP_MADMK <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> {
-  field dag Ins32 = (ins VCSrc_f32:$src0, f32kimm:$imm, VGPR_32:$src1);
+  field Operand ImmOpType = !if(!eq(vt.Size, 32), f32kimm, f16kimm);
+  field dag Ins32 = (ins VCSrc_f32:$src0, ImmOpType:$imm, VGPR_32:$src1);
   field string Asm32 = "$vdst, $src0, $imm, $src1";
   field bit HasExt = 0;
 }
Index: test/CodeGen/AMDGPU/br_cc.f16.ll
===================================================================
--- test/CodeGen/AMDGPU/br_cc.f16.ll
+++ test/CodeGen/AMDGPU/br_cc.f16.ll
@@ -40,13 +40,13 @@
 }
 
 ; GCN-LABEL: {{^}}br_cc_f16_imm_a
-; GCN: v_mov_b32_e32 v[[A_F16:[0-9]+]], 0x380{{0|1}}{{$}}
+; SI: v_mov_b32_e32 v[[A_F16:[0-9]+]], 0x3800{{$}}
 ; SI:  v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
 ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
 
 ; SI:  v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
 ; SI:  v_cmp_ngt_f32_e32 vcc, v[[B_F32]], v[[A_F32]]
-; VI:  v_cmp_nlt_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
+; VI:  v_cmp_nlt_f16_e32 vcc, 0.5, v[[B_F16]]
 ; GCN: s_cbranch_vccnz
 
 ; GCN: one{{$}}
@@ -76,13 +76,13 @@
 }
 
 ; GCN-LABEL: {{^}}br_cc_f16_imm_b
-; GCN: v_mov_b32_e32 v[[B_F16:[0-9]+]], {{0x37ff|0x3800}}{{$}}
+; SI: v_mov_b32_e32 v[[B_F16:[0-9]+]], 0x3800{{$}}
 ; SI:  v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
 
 ; SI:  v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
 ; SI:  v_cmp_nlt_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
-; VI:  v_cmp_ngt_f16_e32 vcc, v[[B_F16]], v[[A_F16]]
+; VI:  v_cmp_ngt_f16_e32 vcc, 0.5, v[[A_F16]]
 ; GCN: s_cbranch_vccnz
 
 ; GCN: one{{$}}
Index: test/CodeGen/AMDGPU/commute-compares.ll
===================================================================
--- test/CodeGen/AMDGPU/commute-compares.ll
+++ test/CodeGen/AMDGPU/commute-compares.ll
@@ -693,11 +693,16 @@
   ret void
 }
 
+
+; FIXME: Should be able to fold this frameindex
 ; Without commuting the frame index in the pre-regalloc run of
 ; SIShrinkInstructions, this was using the VOP3 compare.
 
 ; GCN-LABEL: {{^}}commute_frameindex:
-; GCN: v_cmp_eq_u32_e32 vcc, 0, v{{[0-9]+}}
+; XGCN: v_cmp_eq_u32_e32 vcc, 0, v{{[0-9]+}}
+
+; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}
+; GCN: v_cmp_eq_u32_e32 vcc, [[FI]], v{{[0-9]+}}
 define void @commute_frameindex(i32 addrspace(1)* nocapture %out) #0 {
 entry:
   %stack0 = alloca i32
Index: test/CodeGen/AMDGPU/fadd.f16.ll
===================================================================
--- test/CodeGen/AMDGPU/fadd.f16.ll
+++ test/CodeGen/AMDGPU/fadd.f16.ll
@@ -29,7 +29,7 @@
 ; SI:  v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
 ; SI:  v_add_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]], v[[B_F32]]
 ; SI:  v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
-; VI:  v_add_f16_e32 v[[R_F16:[0-9]+]], 0x3c00, v[[B_F16]]
+; VI:  v_add_f16_e32 v[[R_F16:[0-9]+]], 1.0, v[[B_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
 define void @fadd_f16_imm_a(
@@ -48,7 +48,7 @@
 ; SI:  v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
 ; SI:  v_add_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]]
 ; SI:  v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
-; VI:  v_add_f16_e32 v[[R_F16:[0-9]+]], 0x4000, v[[A_F16]]
+; VI:  v_add_f16_e32 v[[R_F16:[0-9]+]], 2.0, v[[A_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
 define void @fadd_f16_imm_b(
@@ -104,8 +104,8 @@
 ; SI:  v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
 ; SI:  v_add_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]]
 ; SI:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; VI:  v_add_f16_e32 v[[R_F16_0:[0-9]+]], 0x3c00, v[[B_V2_F16]]
-; VI:  v_add_f16_e32 v[[R_F16_1:[0-9]+]], 0x4000, v[[B_F16_1]]
+; VI:  v_add_f16_e32 v[[R_F16_0:[0-9]+]], 1.0, v[[B_V2_F16]]
+; VI:  v_add_f16_e32 v[[R_F16_1:[0-9]+]], 2.0, v[[B_F16_1]]
 ; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
 ; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
@@ -132,8 +132,8 @@
 ; SI:  v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
 ; SI:  v_add_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]]
 ; SI:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; VI:  v_add_f16_e32 v[[R_F16_0:[0-9]+]], 0x4000, v[[A_V2_F16]]
-; VI:  v_add_f16_e32 v[[R_F16_1:[0-9]+]], 0x3c00, v[[A_F16_1]]
+; VI:  v_add_f16_e32 v[[R_F16_0:[0-9]+]], 2.0, v[[A_V2_F16]]
+; VI:  v_add_f16_e32 v[[R_F16_1:[0-9]+]], 1.0, v[[A_F16_1]]
 ; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
 ; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
Index: test/CodeGen/AMDGPU/fmul.f16.ll
===================================================================
--- test/CodeGen/AMDGPU/fmul.f16.ll
+++ test/CodeGen/AMDGPU/fmul.f16.ll
@@ -48,7 +48,7 @@
 ; SI:  v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
 ; SI:  v_mul_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]]
 ; SI:  v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
-; VI:  v_mul_f16_e32 v[[R_F16:[0-9]+]], 0x4400, v[[A_F16]]
+; VI:  v_mul_f16_e32 v[[R_F16:[0-9]+]], 4.0, v[[A_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
 define void @fmul_f16_imm_b(
@@ -105,7 +105,7 @@
 ; SI:  v_mul_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]]
 ; SI:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
 ; VI:  v_mul_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]]
-; VI:  v_mul_f16_e32 v[[R_F16_1:[0-9]+]], 0x4400, v[[B_F16_1]]
+; VI:  v_mul_f16_e32 v[[R_F16_1:[0-9]+]], 4.0, v[[B_F16_1]]
 ; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
 ; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
@@ -132,7 +132,7 @@
 ; SI:  v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
 ; SI:  v_mul_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]]
 ; SI:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; VI:  v_mul_f16_e32 v[[R_F16_0:[0-9]+]], 0x4400, v[[A_V2_F16]]
+; VI:  v_mul_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]]
 ; VI:  v_mul_f16_e32 v[[R_F16_1:[0-9]+]], 0x4200, v[[A_F16_1]]
 ; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
 ; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
Index: test/CodeGen/AMDGPU/fsub.f16.ll
===================================================================
--- test/CodeGen/AMDGPU/fsub.f16.ll
+++ test/CodeGen/AMDGPU/fsub.f16.ll
@@ -29,7 +29,7 @@
 ; SI:  v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
 ; SI:  v_subrev_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]]
 ; SI:  v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
-; VI:  v_sub_f16_e32 v[[R_F16:[0-9]+]], 0x3c00, v[[B_F16]]
+; VI:  v_sub_f16_e32 v[[R_F16:[0-9]+]], 1.0, v[[B_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
 define void @fsub_f16_imm_a(
@@ -48,7 +48,7 @@
 ; SI:  v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
 ; SI:  v_add_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]]
 ; SI:  v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
-; VI:  v_add_f16_e32 v[[R_F16:[0-9]+]], 0xc000, v[[A_F16]]
+; VI:  v_add_f16_e32 v[[R_F16:[0-9]+]], -2.0, v[[A_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
 define void @fsub_f16_imm_b(
@@ -104,8 +104,8 @@
 ; SI:  v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
 ; SI:  v_subrev_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]]
 ; SI:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; VI:  v_sub_f16_e32 v[[R_F16_0:[0-9]+]], 0x3c00, v[[B_V2_F16]]
-; VI:  v_sub_f16_e32 v[[R_F16_1:[0-9]+]], 0x4000, v[[B_F16_1]]
+; VI:  v_sub_f16_e32 v[[R_F16_0:[0-9]+]], 1.0, v[[B_V2_F16]]
+; VI:  v_sub_f16_e32 v[[R_F16_1:[0-9]+]], 2.0, v[[B_F16_1]]
 ; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
 ; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
@@ -132,8 +132,8 @@
 ; SI:  v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
 ; SI:  v_subrev_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]]
 ; SI:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; VI:  v_add_f16_e32 v[[R_F16_0:[0-9]+]], 0xc000, v[[A_V2_F16]]
-; VI:  v_add_f16_e32 v[[R_F16_1:[0-9]+]], 0xbc00, v[[A_F16_1]]
+; VI:  v_add_f16_e32 v[[R_F16_0:[0-9]+]], -2.0, v[[A_V2_F16]]
+; VI:  v_add_f16_e32 v[[R_F16_1:[0-9]+]], -1.0, v[[A_F16_1]]
 ; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
 ; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
Index: test/CodeGen/AMDGPU/imm16.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/imm16.ll
@@ -0,0 +1,316 @@
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+
+; FIXME: Merge into imm.ll
+
+; GCN-LABEL: {{^}}store_inline_imm_neg_0.0_i16:
+; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x8000{{$}}
+; VI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffff8000{{$}}
+; GCN: buffer_store_short [[REG]]
+define void @store_inline_imm_neg_0.0_i16(i16 addrspace(1)* %out) {
+  store volatile i16 -32768, i16 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}store_inline_imm_0.0_f16:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}}
+; GCN: buffer_store_short [[REG]]
+define void @store_inline_imm_0.0_f16(half addrspace(1)* %out) {
+  store half 0.0, half addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}store_imm_neg_0.0_f16:
+; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x8000{{$}}
+; VI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffff8000{{$}}
+; GCN: buffer_store_short [[REG]]
+define void @store_imm_neg_0.0_f16(half addrspace(1)* %out) {
+  store half -0.0, half addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}store_inline_imm_0.5_f16:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3800{{$}}
+; GCN: buffer_store_short [[REG]]
+define void @store_inline_imm_0.5_f16(half addrspace(1)* %out) {
+  store half 0.5, half addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}store_inline_imm_m_0.5_f16:
+; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xb800{{$}}
+; VI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffffb800{{$}}
+; GCN: buffer_store_short [[REG]]
+define void @store_inline_imm_m_0.5_f16(half addrspace(1)* %out) {
+  store half -0.5, half addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}store_inline_imm_1.0_f16:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3c00{{$}}
+; GCN: buffer_store_short [[REG]]
+define void @store_inline_imm_1.0_f16(half addrspace(1)* %out) {
+  store half 1.0, half addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}store_inline_imm_m_1.0_f16:
+; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xbc00{{$}}
+; VI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffffbc00{{$}}
+; GCN: buffer_store_short [[REG]]
+define void @store_inline_imm_m_1.0_f16(half addrspace(1)* %out) {
+  store half -1.0, half addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}store_inline_imm_2.0_f16:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x4000{{$}}
+; GCN: buffer_store_short [[REG]]
+define void @store_inline_imm_2.0_f16(half addrspace(1)* %out) {
+  store half 2.0, half addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}store_inline_imm_m_2.0_f16:
+; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xc000{{$}}
+; VI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffffc000{{$}}
+; GCN: buffer_store_short [[REG]]
+define void @store_inline_imm_m_2.0_f16(half addrspace(1)* %out) {
+  store half -2.0, half addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}store_inline_imm_4.0_f16:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x4400{{$}}
+; GCN: buffer_store_short [[REG]]
+define void @store_inline_imm_4.0_f16(half addrspace(1)* %out) {
+  store half 4.0, half addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}store_inline_imm_m_4.0_f16:
+; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xc400{{$}}
+; VI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffffc400{{$}}
+; GCN: buffer_store_short [[REG]]
+define void @store_inline_imm_m_4.0_f16(half addrspace(1)* %out) {
+  store half -4.0, half addrspace(1)* %out
+  ret void
+}
+
+
+; GCN-LABEL: {{^}}store_inline_imm_inv_2pi_f16:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3118{{$}}
+; GCN: buffer_store_short [[REG]]
+define void @store_inline_imm_inv_2pi_f16(half addrspace(1)* %out) {
+  store half 0xH3118, half addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}store_inline_imm_m_inv_2pi_f16:
+; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xb118{{$}}
+; VI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffffb118{{$}}
+; GCN: buffer_store_short [[REG]]
+define void @store_inline_imm_m_inv_2pi_f16(half addrspace(1)* %out) {
+  store half 0xHB118, half addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}store_literal_imm_f16:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x6c00
+; GCN: buffer_store_short [[REG]]
+define void @store_literal_imm_f16(half addrspace(1)* %out) {
+  store half 4096.0, half addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}add_inline_imm_0.0_f16:
+; VI: buffer_load_ushort [[VAL:v[0-9]+]]
+; VI: v_add_f16_e32 [[REG:v[0-9]+]], 0, [[VAL]]{{$}}
+; VI: buffer_store_short [[REG]]
+define void @add_inline_imm_0.0_f16(half addrspace(1)* %out, half %x) {
+  %y = fadd half %x, 0.0
+  store half %y, half addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}add_inline_imm_0.5_f16:
+; VI: buffer_load_ushort [[VAL:v[0-9]+]]
+; VI: v_add_f16_e32 [[REG:v[0-9]+]], 0.5, [[VAL]]{{$}}
+; VI: buffer_store_short [[REG]]
+define void @add_inline_imm_0.5_f16(half addrspace(1)* %out, half %x) {
+  %y = fadd half %x, 0.5
+  store half %y, half addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}add_inline_imm_neg_0.5_f16:
+; VI: buffer_load_ushort [[VAL:v[0-9]+]]
+; VI: v_add_f16_e32 [[REG:v[0-9]+]], -0.5, [[VAL]]{{$}}
+; VI: buffer_store_short [[REG]]
+define void @add_inline_imm_neg_0.5_f16(half addrspace(1)* %out, half %x) {
+  %y = fadd half %x, -0.5
+  store half %y, half addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}add_inline_imm_1.0_f16:
+; VI: buffer_load_ushort [[VAL:v[0-9]+]]
+; VI: v_add_f16_e32 [[REG:v[0-9]+]], 1.0, [[VAL]]{{$}}
+; VI: buffer_store_short [[REG]]
+define void @add_inline_imm_1.0_f16(half addrspace(1)* %out, half %x) {
+  %y = fadd half %x, 1.0
+  store half %y, half addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}add_inline_imm_neg_1.0_f16:
+; VI: buffer_load_ushort [[VAL:v[0-9]+]]
+; VI: v_add_f16_e32 [[REG:v[0-9]+]], -1.0, [[VAL]]{{$}}
+; VI: buffer_store_short [[REG]]
+define void @add_inline_imm_neg_1.0_f16(half addrspace(1)* %out, half %x) {
+  %y = fadd half %x, -1.0
+  store half %y, half addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}add_inline_imm_2.0_f16:
+; VI: buffer_load_ushort [[VAL:v[0-9]+]]
+; VI: v_add_f16_e32 [[REG:v[0-9]+]], 2.0, [[VAL]]{{$}}
+; VI: buffer_store_short [[REG]]
+define void @add_inline_imm_2.0_f16(half addrspace(1)* %out, half %x) {
+  %y = fadd half %x, 2.0
+  store half %y, half addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}add_inline_imm_neg_2.0_f16:
+; VI: buffer_load_ushort [[VAL:v[0-9]+]]
+; VI: v_add_f16_e32 [[REG:v[0-9]+]], -2.0, [[VAL]]{{$}}
+; VI: buffer_store_short [[REG]]
+define void @add_inline_imm_neg_2.0_f16(half addrspace(1)* %out, half %x) {
+  %y = fadd half %x, -2.0
+  store half %y, half addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}add_inline_imm_4.0_f16:
+; VI: buffer_load_ushort [[VAL:v[0-9]+]]
+; VI: v_add_f16_e32 [[REG:v[0-9]+]], 4.0, [[VAL]]{{$}}
+; VI: buffer_store_short [[REG]]
+define void @add_inline_imm_4.0_f16(half addrspace(1)* %out, half %x) {
+  %y = fadd half %x, 4.0
+  store half %y, half addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}add_inline_imm_neg_4.0_f16:
+; VI: buffer_load_ushort [[VAL:v[0-9]+]]
+; VI: v_add_f16_e32 [[REG:v[0-9]+]], -4.0, [[VAL]]{{$}}
+; VI: buffer_store_short [[REG]]
+define void @add_inline_imm_neg_4.0_f16(half addrspace(1)* %out, half %x) {
+  %y = fadd half %x, -4.0
+  store half %y, half addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}commute_add_inline_imm_0.5_f16:
+; VI: buffer_load_ushort [[VAL:v[0-9]+]]
+; VI: v_add_f16_e32 [[REG:v[0-9]+]], 0.5, [[VAL]]
+; VI: buffer_store_short [[REG]]
+define void @commute_add_inline_imm_0.5_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
+  %x = load half, half addrspace(1)* %in
+  %y = fadd half %x, 0.5
+  store half %y, half addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}commute_add_literal_f16:
+; VI: buffer_load_ushort [[VAL:v[0-9]+]]
+; VI: v_add_f16_e32 [[REG:v[0-9]+]], 0x6400, [[VAL]]
+; VI: buffer_store_short [[REG]]
+define void @commute_add_literal_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
+  %x = load half, half addrspace(1)* %in
+  %y = fadd half %x, 1024.0
+  store half %y, half addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}add_inline_imm_1_f16:
+; VI: buffer_load_ushort [[VAL:v[0-9]+]]
+; VI: v_add_f16_e32 [[REG:v[0-9]+]], 1, [[VAL]]{{$}}
+; VI: buffer_store_short [[REG]]
+define void @add_inline_imm_1_f16(half addrspace(1)* %out, half %x) {
+  %y = fadd half %x, 0xH0001
+  store half %y, half addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}add_inline_imm_2_f16:
+; VI: buffer_load_ushort [[VAL:v[0-9]+]]
+; VI: v_add_f16_e32 [[REG:v[0-9]+]], 2, [[VAL]]{{$}}
+; VI: buffer_store_short [[REG]]
+define void @add_inline_imm_2_f16(half addrspace(1)* %out, half %x) {
+  %y = fadd half %x, 0xH0002
+  store half %y, half addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}add_inline_imm_16_f16:
+; VI: buffer_load_ushort [[VAL:v[0-9]+]]
+; VI: v_add_f16_e32 [[REG:v[0-9]+]], 16, [[VAL]]{{$}}
+; VI: buffer_store_short [[REG]]
+define void @add_inline_imm_16_f16(half addrspace(1)* %out, half %x) {
+  %y = fadd half %x, 0xH0010
+  store half %y, half addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}add_inline_imm_neg_1_f16:
+; VI: buffer_load_ushort [[VAL:v[0-9]+]]
+; VI: v_add_f16_e32 [[REG:v[0-9]+]], -1, [[VAL]]{{$}}
+; VI: buffer_store_short [[REG]]
+define void @add_inline_imm_neg_1_f16(half addrspace(1)* %out, half %x) {
+  %y = fadd half %x, 0xHFFFF
+  store half %y, half addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}add_inline_imm_neg_2_f16:
+; VI: buffer_load_ushort [[VAL:v[0-9]+]]
+; VI: v_add_f16_e32 [[REG:v[0-9]+]], -2, [[VAL]]{{$}}
+; VI: buffer_store_short [[REG]]
+define void @add_inline_imm_neg_2_f16(half addrspace(1)* %out, half %x) {
+  %y = fadd half %x, 0xHFFFE
+  store half %y, half addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}add_inline_imm_neg_16_f16:
+; VI: buffer_load_ushort [[VAL:v[0-9]+]]
+; VI: v_add_f16_e32 [[REG:v[0-9]+]], -16, [[VAL]]{{$}}
+; VI: buffer_store_short [[REG]]
+define void @add_inline_imm_neg_16_f16(half addrspace(1)* %out, half %x) {
+  %y = fadd half %x, 0xHFFF0
+  store half %y, half addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}add_inline_imm_63_f16:
+; VI: buffer_load_ushort [[VAL:v[0-9]+]]
+; VI: v_add_f16_e32 [[REG:v[0-9]+]], 63, [[VAL]]
+; VI: buffer_store_short [[REG]]
+define void @add_inline_imm_63_f16(half addrspace(1)* %out, half %x) {
+  %y = fadd half %x, 0xH003F
+  store half %y, half addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}add_inline_imm_64_f16:
+; VI: buffer_load_ushort [[VAL:v[0-9]+]]
+; VI: v_add_f16_e32 [[REG:v[0-9]+]], 64, [[VAL]]
+; VI: buffer_store_short [[REG]]
+define void @add_inline_imm_64_f16(half addrspace(1)* %out, half %x) {
+  %y = fadd half %x, 0xH0040
+  store half %y, half addrspace(1)* %out
+  ret void
+}
Index: test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.f16.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.f16.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.f16.ll
@@ -20,7 +20,7 @@
 
 ; GCN-LABEL: {{^}}ldexp_f16_imm_a
 ; GCN: buffer_load_dword v[[B_I32:[0-9]+]]
-; VI: v_ldexp_f16_e32 v[[R_F16:[0-9]+]], 0x4000, v[[B_I32]]
+; VI: v_ldexp_f16_e32 v[[R_F16:[0-9]+]], 2.0, v[[B_I32]]
 ; GCN: buffer_store_short v[[R_F16]]
 define void @ldexp_f16_imm_a(
     half addrspace(1)* %r,
Index: test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
+++ test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
@@ -51,7 +51,7 @@
 ; SI:  v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
 ; SI:  v_max_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]]
 ; SI:  v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
-; VI:  v_max_f16_e32 v[[R_F16:[0-9]+]], 0x4400, v[[A_F16]]
+; VI:  v_max_f16_e32 v[[R_F16:[0-9]+]], 4.0, v[[A_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
 define void @maxnum_f16_imm_b(
@@ -108,7 +108,7 @@
 ; SI:  v_max_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]]
 ; SI:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
 ; VI:  v_max_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]]
-; VI:  v_max_f16_e32 v[[R_F16_1:[0-9]+]], 0x4400, v[[B_F16_1]]
+; VI:  v_max_f16_e32 v[[R_F16_1:[0-9]+]], 4.0, v[[B_F16_1]]
 ; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
 ; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
@@ -135,7 +135,7 @@
 ; SI:  v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
 ; SI:  v_max_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]]
 ; SI:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; VI:  v_max_f16_e32 v[[R_F16_0:[0-9]+]], 0x4400, v[[A_V2_F16]]
+; VI:  v_max_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]]
 ; VI:  v_max_f16_e32 v[[R_F16_1:[0-9]+]], 0x4200, v[[A_F16_1]]
 ; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
 ; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
Index: test/CodeGen/AMDGPU/llvm.minnum.f16.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.minnum.f16.ll
+++ test/CodeGen/AMDGPU/llvm.minnum.f16.ll
@@ -51,7 +51,7 @@
 ; SI:  v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
 ; SI:  v_min_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]]
 ; SI:  v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
-; VI:  v_min_f16_e32 v[[R_F16:[0-9]+]], 0x4400, v[[A_F16]]
+; VI:  v_min_f16_e32 v[[R_F16:[0-9]+]], 4.0, v[[A_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
 define void @minnum_f16_imm_b(
@@ -108,7 +108,7 @@
 ; SI:  v_min_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]]
 ; SI:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
 ; VI:  v_min_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]]
-; VI:  v_min_f16_e32 v[[R_F16_1:[0-9]+]], 0x4400, v[[B_F16_1]]
+; VI:  v_min_f16_e32 v[[R_F16_1:[0-9]+]], 4.0, v[[B_F16_1]]
 ; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
 ; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
@@ -135,7 +135,7 @@
 ; SI:  v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
 ; SI:  v_min_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]]
 ; SI:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; VI:  v_min_f16_e32 v[[R_F16_0:[0-9]+]], 0x4400, v[[A_V2_F16]]
+; VI:  v_min_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]]
 ; VI:  v_min_f16_e32 v[[R_F16_1:[0-9]+]], 0x4200, v[[A_F16_1]]
 ; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
 ; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
Index: test/CodeGen/AMDGPU/select.f16.ll
===================================================================
--- test/CodeGen/AMDGPU/select.f16.ll
+++ test/CodeGen/AMDGPU/select.f16.ll
@@ -45,8 +45,7 @@
 ; SI:  v_cvt_f32_f16_e32 v[[D_F32:[0-9]+]], v[[D_F16]]
 ; SI:  v_cndmask_b32_e32 v[[R_F32:[0-9]+]], v[[D_F32]], v[[C_F32]]
 ; SI:  v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
-; VI:  v_mov_b32_e32 v[[A_F16:[0-9]+]], 0x3800{{$}}
-; VI:  v_cmp_lt_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
+; VI:  v_cmp_lt_f16_e32 vcc, 0.5, v[[B_F16]]
 ; VI:  v_cndmask_b32_e32 v[[R_F16:[0-9]+]], v[[D_F16]], v[[C_F16]], vcc
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
@@ -76,8 +75,7 @@
 ; SI:  v_cvt_f32_f16_e32 v[[D_F32:[0-9]+]], v[[D_F16]]
 ; SI:  v_cndmask_b32_e32 v[[R_F32:[0-9]+]], v[[D_F32]], v[[C_F32]]
 ; SI:  v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
-; VI:  v_mov_b32_e32 v[[B_F16:[0-9]+]], 0x3800{{$}}
-; VI:  v_cmp_gt_f16_e32 vcc, v[[B_F16]], v[[A_F16]]
+; VI:  v_cmp_gt_f16_e32 vcc, 0.5, v[[A_F16]]
 ; VI:  v_cndmask_b32_e32 v[[R_F16:[0-9]+]], v[[D_F16]], v[[C_F16]], vcc
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
Index: test/CodeGen/MIR/AMDGPU/fold-imm-f16-f32.mir
===================================================================
--- /dev/null
+++ test/CodeGen/MIR/AMDGPU/fold-imm-f16-f32.mir
@@ -0,0 +1,709 @@
+# RUN: llc --mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs -run-pass si-fold-operands,si-shrink-instructions %s -o - | FileCheck %s
+--- |
+  define void @add_f32_1.0_one_f16_use() #0 {
+    %f16.val0 = load volatile half, half addrspace(1)* undef
+    %f16.val1 = load volatile half, half addrspace(1)* undef
+    %f32.val = load volatile float, float addrspace(1)* undef
+    %f16.add0 = fadd half %f16.val0, 0xH3C00
+    %f32.add = fadd float %f32.val, 1.000000e+00
+    store volatile half %f16.add0, half addrspace(1)* undef
+    store volatile float %f32.add, float addrspace(1)* undef
+    ret void
+  }
+
+  define void @add_f32_1.0_multi_f16_use() #0 {
+    %f16.val0 = load volatile half, half addrspace(1)* undef
+    %f16.val1 = load volatile half, half addrspace(1)* undef
+    %f32.val = load volatile float, float addrspace(1)* undef
+    %f16.add0 = fadd half %f16.val0, 0xH3C00
+    %f32.add = fadd float %f32.val, 1.000000e+00
+    store volatile half %f16.add0, half addrspace(1)* undef
+    store volatile float %f32.add, float addrspace(1)* undef
+    ret void
+  }
+
+  define void @add_f32_1.0_one_f32_use_one_f16_use () #0 {
+    %f16.val0 = load volatile half, half addrspace(1)* undef
+    %f16.val1 = load volatile half, half addrspace(1)* undef
+    %f32.val = load volatile float, float addrspace(1)* undef
+    %f16.add0 = fadd half %f16.val0, 0xH3C00
+    %f32.add = fadd float %f32.val, 1.000000e+00
+    store volatile half %f16.add0, half addrspace(1)* undef
+    store volatile float %f32.add, float addrspace(1)* undef
+    ret void
+  }
+
+  define void @add_f32_1.0_one_f32_use_multi_f16_use () #0 {
+    %f16.val0 = load volatile half, half addrspace(1)* undef
+    %f16.val1 = load volatile half, half addrspace(1)* undef
+    %f32.val = load volatile float, float addrspace(1)* undef
+    %f16.add0 = fadd half %f16.val0, 0xH3C00
+    %f16.add1 = fadd half %f16.val1, 0xH3C00
+    %f32.add = fadd float %f32.val, 1.000000e+00
+    store volatile half %f16.add0, half addrspace(1)* undef
+    store volatile half %f16.add1, half addrspace(1)* undef
+    store volatile float %f32.add, float addrspace(1)* undef
+    ret void
+  }
+
+  define void @add_i32_1_multi_f16_use() #0 {
+    %f16.val0 = load volatile half, half addrspace(1)* undef
+    %f16.val1 = load volatile half, half addrspace(1)* undef
+    %f16.add0 = fadd half %f16.val0, 0xH0001
+    %f16.add1 = fadd half %f16.val1, 0xH0001
+    store volatile half %f16.add0, half addrspace(1)* undef
+    store volatile half %f16.add1,half addrspace(1)* undef
+    ret void
+  }
+
+  define void @add_i32_m2_one_f32_use_multi_f16_use () #0 {
+    %f16.val0 = load volatile half, half addrspace(1)* undef
+    %f16.val1 = load volatile half, half addrspace(1)* undef
+    %f32.val = load volatile float, float addrspace(1)* undef
+    %f16.add0 = fadd half %f16.val0, 0xHFFFE
+    %f16.add1 = fadd half %f16.val1, 0xHFFFE
+    %f32.add = fadd float %f32.val, 0xffffffffc0000000
+    store volatile half %f16.add0, half addrspace(1)* undef
+    store volatile half %f16.add1, half addrspace(1)* undef
+    store volatile float %f32.add, float addrspace(1)* undef
+    ret void
+  }
+
+  define void @add_f16_1.0_multi_f32_use() #0 {
+    %f32.val0 = load volatile float, float addrspace(1)* undef
+    %f32.val1 = load volatile float, float addrspace(1)* undef
+    %f32.val = load volatile float, float addrspace(1)* undef
+    %f32.add0 = fadd float %f32.val0, 1.0
+    %f32.add1 = fadd float %f32.val1, 1.0
+    store volatile float %f32.add0, float addrspace(1)* undef
+    store volatile float %f32.add1, float addrspace(1)* undef
+    ret void
+  }
+
+  define void @add_f16_1.0_other_high_bits_multi_f16_use() #0 {
+    %f16.val0 = load volatile half, half addrspace(1)* undef
+    %f16.val1 = load volatile half, half addrspace(1)* undef
+    %f32.val = load volatile half, half addrspace(1)* undef
+    %f16.add0 = fadd half %f16.val0, 0xH3C00
+    %f32.add = fadd half %f32.val, 1.000000e+00
+    store volatile half %f16.add0, half addrspace(1)* undef
+    store volatile half %f32.add, half addrspace(1)* undef
+    ret void
+  }
+
+  define void @add_f16_1.0_other_high_bits_use_f16_f32() #0 {
+    %f16.val0 = load volatile half, half addrspace(1)* undef
+    %f16.val1 = load volatile half, half addrspace(1)* undef
+    %f32.val = load volatile half, half addrspace(1)* undef
+    %f16.add0 = fadd half %f16.val0, 0xH3C00
+    %f32.add = fadd half %f32.val, 1.000000e+00
+    store volatile half %f16.add0, half addrspace(1)* undef
+    store volatile half %f32.add, half addrspace(1)* undef
+    ret void
+  }
+
+  attributes #0 = { nounwind }
+
+...
+---
+
+# f32 1.0 with a single f16 use should be folded as the low 32 bits of a
+#  literal constant.
+
+# CHECK-LABEL: name: add_f32_1.0_one_f16_use
+# CHECK: %13 = V_ADD_F16_e32  1065353216, killed %11, implicit %exec
+
+name:            add_f32_1.0_one_f16_use
+alignment:       0
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: sreg_64 }
+  - { id: 1, class: sreg_32 }
+  - { id: 2, class: sgpr_32 }
+  - { id: 3, class: vgpr_32 }
+  - { id: 4, class: sreg_64 }
+  - { id: 5, class: sreg_32 }
+  - { id: 6, class: sreg_64 }
+  - { id: 7, class: sreg_32 }
+  - { id: 8, class: sreg_32 }
+  - { id: 9, class: sreg_32 }
+  - { id: 10, class: sreg_128 }
+  - { id: 11, class: vgpr_32 }
+  - { id: 12, class: vgpr_32 }
+  - { id: 13, class: vgpr_32 }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    0
+  adjustsStack:    false
+  hasCalls:        false
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+body:             |
+  bb.0 (%ir-block.0):
+    %4 = IMPLICIT_DEF
+    %5 = COPY %4.sub1
+    %6 = IMPLICIT_DEF
+    %7 = COPY %6.sub0
+    %8 = S_MOV_B32 61440
+    %9 = S_MOV_B32 -1
+    %10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4
+    %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 2 from `half addrspace(1)* undef`)
+    %12 = V_MOV_B32_e32 1065353216, implicit %exec
+    %13 = V_ADD_F16_e64 0, killed %11, 0, %12, 0, 0, implicit %exec
+    BUFFER_STORE_SHORT_OFFSET killed %13, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 2 into `half addrspace(1)* undef`)
+    S_ENDPGM
+
+...
+---
+# A materialized f32 inline immediate with multiple f16 uses should not be
+# folded into the f16 operands.
+
+# CHECK-LABEL: name: add_f32_1.0_multi_f16_use
+# CHECK: %13 = V_MOV_B32_e32 1065353216, implicit %exec
+# CHECK: %14 = V_ADD_F16_e32 %13, killed %11, implicit %exec
+# CHECK: %15 = V_ADD_F16_e32 killed %13, killed %12, implicit %exec
+
+
+name:            add_f32_1.0_multi_f16_use
+alignment:       0
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: sreg_64 }
+  - { id: 1, class: sreg_32 }
+  - { id: 2, class: sgpr_32 }
+  - { id: 3, class: vgpr_32 }
+  - { id: 4, class: sreg_64 }
+  - { id: 5, class: sreg_32 }
+  - { id: 6, class: sreg_64 }
+  - { id: 7, class: sreg_32 }
+  - { id: 8, class: sreg_32 }
+  - { id: 9, class: sreg_32 }
+  - { id: 10, class: sreg_128 }
+  - { id: 11, class: vgpr_32 }
+  - { id: 12, class: vgpr_32 }
+  - { id: 13, class: vgpr_32 }
+  - { id: 14, class: vgpr_32 }
+  - { id: 15, class: vgpr_32 }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    0
+  adjustsStack:    false
+  hasCalls:        false
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+body:             |
+  bb.0 (%ir-block.0):
+    %4 = IMPLICIT_DEF
+    %5 = COPY %4.sub1
+    %6 = IMPLICIT_DEF
+    %7 = COPY %6.sub0
+    %8 = S_MOV_B32 61440
+    %9 = S_MOV_B32 -1
+    %10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4
+    %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 2 from `half addrspace(1)* undef`)
+    %12 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from `float addrspace(1)* undef`)
+    %13 = V_MOV_B32_e32 1065353216, implicit %exec
+    %14 = V_ADD_F16_e64 0, killed %11, 0, %13, 0, 0, implicit %exec
+    %15 = V_ADD_F16_e64 0, killed %12, 0, killed %13, 0, 0, implicit %exec
+    BUFFER_STORE_SHORT_OFFSET killed %14, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 2 into `half addrspace(1)* undef`)
+    BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 2 into `half addrspace(1)* undef`)
+    S_ENDPGM
+
+...
+---
+
+# f32 1.0 should be folded into the single f32 use as an inline
+#  immediate, and folded into the single f16 use as a literal constant
+
+# CHECK-LABEL: name: add_f32_1.0_one_f32_use_one_f16_use
+# CHECK: %15 = V_ADD_F16_e32 1065353216, %11, implicit %exec
+# CHECK: %16 = V_ADD_F32_e32 1065353216, killed %13, implicit %exec
+
+name:            add_f32_1.0_one_f32_use_one_f16_use
+alignment:       0
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: sreg_64 }
+  - { id: 1, class: sreg_32 }
+  - { id: 2, class: sgpr_32 }
+  - { id: 3, class: vgpr_32 }
+  - { id: 4, class: sreg_64 }
+  - { id: 5, class: sreg_32 }
+  - { id: 6, class: sreg_64 }
+  - { id: 7, class: sreg_32 }
+  - { id: 8, class: sreg_32 }
+  - { id: 9, class: sreg_32 }
+  - { id: 10, class: sreg_128 }
+  - { id: 11, class: vgpr_32 }
+  - { id: 12, class: vgpr_32 }
+  - { id: 13, class: vgpr_32 }
+  - { id: 14, class: vgpr_32 }
+  - { id: 15, class: vgpr_32 }
+  - { id: 16, class: vgpr_32 }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    0
+  adjustsStack:    false
+  hasCalls:        false
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+body:             |
+  bb.0 (%ir-block.0):
+    %4 = IMPLICIT_DEF
+    %5 = COPY %4.sub1
+    %6 = IMPLICIT_DEF
+    %7 = COPY %6.sub0
+    %8 = S_MOV_B32 61440
+    %9 = S_MOV_B32 -1
+    %10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4
+    %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 2 from `half addrspace(1)* undef`)
+    %12 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 2 from `half addrspace(1)* undef`)
+    %13 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from `float addrspace(1)* undef`)
+    %14 = V_MOV_B32_e32 1065353216, implicit %exec
+    %15 = V_ADD_F16_e64 0, %11, 0, %14, 0, 0, implicit %exec
+    %16 = V_ADD_F32_e64 0, killed %13, 0, killed %14, 0, 0, implicit %exec
+    BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 2 into `half addrspace(1)* undef`)
+    BUFFER_STORE_DWORD_OFFSET killed %16, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into `float addrspace(1)* undef`)
+    S_ENDPGM
+
+...
+---
+
+# f32 1.0 should be folded for the single f32 use as an inline
+#  constant, and not folded as a multi-use literal for the f16 cases
+
+# CHECK-LABEL: name: add_f32_1.0_one_f32_use_multi_f16_use
+# CHECK: %14 = V_MOV_B32_e32 1065353216, implicit %exec
+# CHECK: %15 = V_ADD_F16_e32  %14, %11, implicit %exec
+# CHECK: %16 = V_ADD_F16_e32 %14,  %12, implicit %exec
+# CHECK: %17 = V_ADD_F32_e32 1065353216, killed %13, implicit %exec
+
+name:            add_f32_1.0_one_f32_use_multi_f16_use
+alignment:       0
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: sreg_64 }
+  - { id: 1, class: sreg_32 }
+  - { id: 2, class: sgpr_32 }
+  - { id: 3, class: vgpr_32 }
+  - { id: 4, class: sreg_64 }
+  - { id: 5, class: sreg_32 }
+  - { id: 6, class: sreg_64 }
+  - { id: 7, class: sreg_32 }
+  - { id: 8, class: sreg_32 }
+  - { id: 9, class: sreg_32 }
+  - { id: 10, class: sreg_128 }
+  - { id: 11, class: vgpr_32 }
+  - { id: 12, class: vgpr_32 }
+  - { id: 13, class: vgpr_32 }
+  - { id: 14, class: vgpr_32 }
+  - { id: 15, class: vgpr_32 }
+  - { id: 16, class: vgpr_32 }
+  - { id: 17, class: vgpr_32 }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    0
+  adjustsStack:    false
+  hasCalls:        false
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+body:             |
+  bb.0 (%ir-block.0):
+    %4 = IMPLICIT_DEF
+    %5 = COPY %4.sub1
+    %6 = IMPLICIT_DEF
+    %7 = COPY %6.sub0
+    %8 = S_MOV_B32 61440
+    %9 = S_MOV_B32 -1
+    %10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4
+    %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 2 from `half addrspace(1)* undef`)
+    %12 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 2 from `half addrspace(1)* undef`)
+    %13 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from `float addrspace(1)* undef`)
+    %14 = V_MOV_B32_e32 1065353216, implicit %exec
+    %15 = V_ADD_F16_e64 0, %11, 0, %14, 0, 0, implicit %exec
+    %16 = V_ADD_F16_e64 0, %12, 0, %14, 0, 0, implicit %exec
+    %17 = V_ADD_F32_e64 0, killed %13, 0, killed %14, 0, 0, implicit %exec
+    BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 2 into `half addrspace(1)* undef`)
+    BUFFER_STORE_SHORT_OFFSET killed %16, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 2 into `half addrspace(1)* undef`)
+    BUFFER_STORE_DWORD_OFFSET killed %17, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into `float addrspace(1)* undef`)
+    S_ENDPGM
+
+...
+---
+# CHECK-LABEL: name: add_i32_1_multi_f16_use
+# CHECK: %13 = V_MOV_B32_e32 1, implicit %exec
+# CHECK: %14 = V_ADD_F16_e32 1, killed %11, implicit %exec
+# CHECK: %15 = V_ADD_F16_e32 1, killed %12, implicit %exec
+
+
+name:            add_i32_1_multi_f16_use
+alignment:       0
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: sreg_64 }
+  - { id: 1, class: sreg_32 }
+  - { id: 2, class: sgpr_32 }
+  - { id: 3, class: vgpr_32 }
+  - { id: 4, class: sreg_64 }
+  - { id: 5, class: sreg_32 }
+  - { id: 6, class: sreg_64 }
+  - { id: 7, class: sreg_32 }
+  - { id: 8, class: sreg_32 }
+  - { id: 9, class: sreg_32 }
+  - { id: 10, class: sreg_128 }
+  - { id: 11, class: vgpr_32 }
+  - { id: 12, class: vgpr_32 }
+  - { id: 13, class: vgpr_32 }
+  - { id: 14, class: vgpr_32 }
+  - { id: 15, class: vgpr_32 }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    0
+  adjustsStack:    false
+  hasCalls:        false
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+body:             |
+  bb.0 (%ir-block.0):
+    %4 = IMPLICIT_DEF
+    %5 = COPY %4.sub1
+    %6 = IMPLICIT_DEF
+    %7 = COPY %6.sub0
+    %8 = S_MOV_B32 61440
+    %9 = S_MOV_B32 -1
+    %10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4
+    %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 2 from `half addrspace(1)* undef`)
+    %12 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from `float addrspace(1)* undef`)
+    %13 = V_MOV_B32_e32 1, implicit %exec
+    %14 = V_ADD_F16_e64 0, killed %11, 0, %13, 0, 0, implicit %exec
+    %15 = V_ADD_F16_e64 0, killed %12, 0, killed %13, 0, 0, implicit %exec
+    BUFFER_STORE_SHORT_OFFSET killed %14, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 2 into `half addrspace(1)* undef`)
+    BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 2 into `half addrspace(1)* undef`)
+    S_ENDPGM
+
+...
+---
+
+# CHECK-LABEL: name: add_i32_m2_one_f32_use_multi_f16_use
+# CHECK: %14 = V_MOV_B32_e32 -2, implicit %exec
+# CHECK: %15 = V_ADD_F16_e32 -2, %11, implicit %exec
+# CHECK: %16 = V_ADD_F16_e32 -2, %12, implicit %exec
+# CHECK: %17 = V_ADD_F32_e32 -2, killed %13, implicit %exec
+
+name:            add_i32_m2_one_f32_use_multi_f16_use
+alignment:       0
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: sreg_64 }
+  - { id: 1, class: sreg_32 }
+  - { id: 2, class: sgpr_32 }
+  - { id: 3, class: vgpr_32 }
+  - { id: 4, class: sreg_64 }
+  - { id: 5, class: sreg_32 }
+  - { id: 6, class: sreg_64 }
+  - { id: 7, class: sreg_32 }
+  - { id: 8, class: sreg_32 }
+  - { id: 9, class: sreg_32 }
+  - { id: 10, class: sreg_128 }
+  - { id: 11, class: vgpr_32 }
+  - { id: 12, class: vgpr_32 }
+  - { id: 13, class: vgpr_32 }
+  - { id: 14, class: vgpr_32 }
+  - { id: 15, class: vgpr_32 }
+  - { id: 16, class: vgpr_32 }
+  - { id: 17, class: vgpr_32 }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    0
+  adjustsStack:    false
+  hasCalls:        false
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+body:             |
+  bb.0 (%ir-block.0):
+    %4 = IMPLICIT_DEF
+    %5 = COPY %4.sub1
+    %6 = IMPLICIT_DEF
+    %7 = COPY %6.sub0
+    %8 = S_MOV_B32 61440
+    %9 = S_MOV_B32 -1
+    %10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4
+    %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 2 from `half addrspace(1)* undef`)
+    %12 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 2 from `half addrspace(1)* undef`)
+    %13 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from `float addrspace(1)* undef`)
+    %14 = V_MOV_B32_e32 -2, implicit %exec
+    %15 = V_ADD_F16_e64 0, %11, 0, %14, 0, 0, implicit %exec
+    %16 = V_ADD_F16_e64 0, %12, 0, %14, 0, 0, implicit %exec
+    %17 = V_ADD_F32_e64 0, killed %13, 0, killed %14, 0, 0, implicit %exec
+    BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 2 into `half addrspace(1)* undef`)
+    BUFFER_STORE_SHORT_OFFSET killed %16, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 2 into `half addrspace(1)* undef`)
+    BUFFER_STORE_DWORD_OFFSET killed %17, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into `float addrspace(1)* undef`)
+    S_ENDPGM
+
+...
+---
+
+# f16 1.0 (15360 = 0x3c00) is not an inline immediate for f32 operands,
+#  so it should not be folded as a multi-use literal into the f32 uses
+
+# CHECK-LABEL: name: add_f16_1.0_multi_f32_use
+# CHECK: %13 = V_MOV_B32_e32 15360, implicit %exec
+# CHECK: %14 = V_ADD_F32_e32 %13, %11, implicit %exec
+# CHECK: %15 = V_ADD_F32_e32 %13, %12, implicit %exec
+
+name:            add_f16_1.0_multi_f32_use
+alignment:       0
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: sreg_64 }
+  - { id: 1, class: sreg_32 }
+  - { id: 2, class: sgpr_32 }
+  - { id: 3, class: vgpr_32 }
+  - { id: 4, class: sreg_64 }
+  - { id: 5, class: sreg_32 }
+  - { id: 6, class: sreg_64 }
+  - { id: 7, class: sreg_32 }
+  - { id: 8, class: sreg_32 }
+  - { id: 9, class: sreg_32 }
+  - { id: 10, class: sreg_128 }
+  - { id: 11, class: vgpr_32 }
+  - { id: 12, class: vgpr_32 }
+  - { id: 13, class: vgpr_32 }
+  - { id: 14, class: vgpr_32 }
+  - { id: 15, class: vgpr_32 }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    0
+  adjustsStack:    false
+  hasCalls:        false
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+body:             |
+  bb.0 (%ir-block.0):
+    %4 = IMPLICIT_DEF
+    %5 = COPY %4.sub1
+    %6 = IMPLICIT_DEF
+    %7 = COPY %6.sub0
+    %8 = S_MOV_B32 61440
+    %9 = S_MOV_B32 -1
+    %10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4
+    %11 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from `float addrspace(1)* undef`)
+    %12 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from `float addrspace(1)* undef`)
+    %13 = V_MOV_B32_e32 15360, implicit %exec
+    %14 = V_ADD_F32_e64 0, %11, 0, %13, 0, 0, implicit %exec
+    %15 = V_ADD_F32_e64 0, %12, 0, %13, 0, 0, implicit %exec
+    BUFFER_STORE_DWORD_OFFSET killed %14, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into `float addrspace(1)* undef`)
+    BUFFER_STORE_DWORD_OFFSET killed %15, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into `float addrspace(1)* undef`)
+    S_ENDPGM
+
+...
+---
+
+# The low 16 bits are an inline immediate, but the high bits are junk
+# FIXME: Should be able to fold this
+
+# CHECK-LABEL: name: add_f16_1.0_other_high_bits_multi_f16_use
+# CHECK: %13 = V_MOV_B32_e32 80886784, implicit %exec
+# CHECK: %14 = V_ADD_F16_e32 %13, %11, implicit %exec
+# CHECK: %15 = V_ADD_F16_e32 %13, %12, implicit %exec
+
+name:            add_f16_1.0_other_high_bits_multi_f16_use
+alignment:       0
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: sreg_64 }
+  - { id: 1, class: sreg_32 }
+  - { id: 2, class: sgpr_32 }
+  - { id: 3, class: vgpr_32 }
+  - { id: 4, class: sreg_64 }
+  - { id: 5, class: sreg_32 }
+  - { id: 6, class: sreg_64 }
+  - { id: 7, class: sreg_32 }
+  - { id: 8, class: sreg_32 }
+  - { id: 9, class: sreg_32 }
+  - { id: 10, class: sreg_128 }
+  - { id: 11, class: vgpr_32 }
+  - { id: 12, class: vgpr_32 }
+  - { id: 13, class: vgpr_32 }
+  - { id: 14, class: vgpr_32 }
+  - { id: 15, class: vgpr_32 }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    0
+  adjustsStack:    false
+  hasCalls:        false
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+body:             |
+  bb.0 (%ir-block.0):
+    %4 = IMPLICIT_DEF
+    %5 = COPY %4.sub1
+    %6 = IMPLICIT_DEF
+    %7 = COPY %6.sub0
+    %8 = S_MOV_B32 61440
+    %9 = S_MOV_B32 -1
+    %10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4
+    %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 2 from `half addrspace(1)* undef`)
+    %12 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 2 from `half addrspace(1)* undef`)
+    %13 = V_MOV_B32_e32 80886784, implicit %exec
+    %14 = V_ADD_F16_e64 0, %11, 0, %13, 0, 0, implicit %exec
+    %15 = V_ADD_F16_e64 0, %12, 0, %13, 0, 0, implicit %exec
+    BUFFER_STORE_SHORT_OFFSET killed %14, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 2 into `half addrspace(1)* undef`)
+    BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 2 into `half addrspace(1)* undef`)
+    S_ENDPGM
+
+...
+---
+
+# FIXME: Should fold inline immediate into f16 and literal use into
+# f32 instruction.
+
+# CHECK-LABEL: name: add_f16_1.0_other_high_bits_use_f16_f32
+# CHECK: %13 = V_MOV_B32_e32 305413120, implicit %exec
+# CHECK: %14 = V_ADD_F32_e32 %13, %11, implicit %exec
+# CHECK: %15 = V_ADD_F16_e32 %13, %12, implicit %exec
+name:            add_f16_1.0_other_high_bits_use_f16_f32
+alignment:       0
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: sreg_64 }
+  - { id: 1, class: sreg_32 }
+  - { id: 2, class: sgpr_32 }
+  - { id: 3, class: vgpr_32 }
+  - { id: 4, class: sreg_64 }
+  - { id: 5, class: sreg_32 }
+  - { id: 6, class: sreg_64 }
+  - { id: 7, class: sreg_32 }
+  - { id: 8, class: sreg_32 }
+  - { id: 9, class: sreg_32 }
+  - { id: 10, class: sreg_128 }
+  - { id: 11, class: vgpr_32 }
+  - { id: 12, class: vgpr_32 }
+  - { id: 13, class: vgpr_32 }
+  - { id: 14, class: vgpr_32 }
+  - { id: 15, class: vgpr_32 }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    0
+  adjustsStack:    false
+  hasCalls:        false
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+body:             |
+  bb.0 (%ir-block.0):
+    %4 = IMPLICIT_DEF
+    %5 = COPY %4.sub1
+    %6 = IMPLICIT_DEF
+    %7 = COPY %6.sub0
+    %8 = S_MOV_B32 61440
+    %9 = S_MOV_B32 -1
+    %10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4
+    %11 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from `float addrspace(1)* undef`)
+    %12 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 2 from `half addrspace(1)* undef`)
+    %13 = V_MOV_B32_e32 305413120, implicit %exec
+    %14 = V_ADD_F32_e64 0, %11, 0, %13, 0, 0, implicit %exec
+    %15 = V_ADD_F16_e64 0, %12, 0, %13, 0, 0, implicit %exec
+    BUFFER_STORE_DWORD_OFFSET killed %14, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into `float addrspace(1)* undef`)
+    BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 2 into `half addrspace(1)* undef`)
+    S_ENDPGM
+
+...
Index: test/MC/AMDGPU/literal16.s
===================================================================
--- /dev/null
+++ test/MC/AMDGPU/literal16.s
@@ -0,0 +1,148 @@
+// RUN: llvm-mc -arch=amdgcn -mcpu=tonga -show-encoding %s | FileCheck -check-prefix=GCN -check-prefix=CIVI -check-prefix=VI %s
+
+v_add_f16 v1, 0, v2
+// VI: v_add_f16_e32 v1, 0, v2 ; encoding: [0x80,0x04,0x02,0x3e]
+
+v_add_f16 v1, 0.0, v2
+// VI: v_add_f16_e32 v1, 0, v2 ; encoding: [0x80,0x04,0x02,0x3e]
+
+v_add_f16 v1, v2, 0
+// VI: v_add_f16_e64 v1, v2, 0 ; encoding: [0x01,0x00,0x1f,0xd1,0x02,0x01,0x01,0x00]
+
+v_add_f16 v1, v2, 0.0
+// VI: v_add_f16_e64 v1, v2, 0 ; encoding: [0x01,0x00,0x1f,0xd1,0x02,0x01,0x01,0x00]
+
+v_add_f16 v1, -0.0, v2
+// VI: v_add_f16_e32 v1, 0x8000, v2 ; encoding: [0xff,0x04,0x02,0x3e,0x00,0x80,0x00,0x00]
+
+v_add_f16 v1, 1.0, v2
+// VI: v_add_f16_e32 v1, 1.0, v2 ; encoding: [0xf2,0x04,0x02,0x3e]
+
+v_add_f16 v1, -1.0, v2
+// VI: v_add_f16_e32 v1, -1.0, v2 ; encoding: [0xf3,0x04,0x02,0x3e]
+
+v_add_f16 v1, -0.5, v2
+// VI: v_add_f16_e32 v1, -0.5, v2 ; encoding: [0xf1,0x04,0x02,0x3e]
+
+v_add_f16 v1, 0.5, v2
+// VI: v_add_f16_e32 v1, 0.5, v2 ; encoding: [0xf0,0x04,0x02,0x3e]
+
+v_add_f16 v1, 2.0, v2
+// VI: v_add_f16_e32 v1, 2.0, v2 ; encoding: [0xf4,0x04,0x02,0x3e]
+
+v_add_f16 v1, -2.0, v2
+// VI: v_add_f16_e32 v1, -2.0, v2 ; encoding: [0xf5,0x04,0x02,0x3e]
+
+v_add_f16 v1, 4.0, v2
+// VI: v_add_f16_e32 v1, 4.0, v2 ; encoding: [0xf6,0x04,0x02,0x3e]
+
+v_add_f16 v1, -4.0, v2
+// VI: v_add_f16_e32 v1, -4.0, v2 ; encoding: [0xf7,0x04,0x02,0x3e]
+
+v_add_f16 v1, 0.15915494, v2
+// VI: v_add_f16_e32 v1, 0.15915494, v2 ; encoding: [0xf8,0x04,0x02,0x3e]
+
+v_add_f16 v1, -0.15915494, v2
+// VI: v_add_f16_e32 v1, 0xb118, v2 ; encoding: [0xff,0x04,0x02,0x3e,0x18,0xb1,0x00,0x00]
+
+v_add_f16 v1, -1, v2
+// VI: v_add_f16_e32 v1, -1, v2 ; encoding: [0xc1,0x04,0x02,0x3e]
+
+
+v_add_f16 v1, -2, v2
+// VI: v_add_f16_e32 v1, -2, v2 ; encoding: [0xc2,0x04,0x02,0x3e]
+
+v_add_f16 v1, -3, v2
+// VI: v_add_f16_e32 v1, -3, v2 ; encoding: [0xc3,0x04,0x02,0x3e]
+
+v_add_f16 v1, -16, v2
+// VI: v_add_f16_e32 v1, -16, v2 ; encoding: [0xd0,0x04,0x02,0x3e]
+
+v_add_f16 v1, 1, v2
+// VI: v_add_f16_e32 v1, 1, v2 ; encoding: [0x81,0x04,0x02,0x3e]
+
+v_add_f16 v1, 2, v2
+// VI: v_add_f16_e32 v1, 2, v2 ; encoding: [0x82,0x04,0x02,0x3e]
+
+v_add_f16 v1, 3, v2
+// VI: v_add_f16_e32 v1, 3, v2 ; encoding: [0x83,0x04,0x02,0x3e]
+
+v_add_f16 v1, 4, v2
+// VI: v_add_f16_e32 v1, 4, v2 ; encoding: [0x84,0x04,0x02,0x3e]
+
+v_add_f16 v1, 15, v2
+// VI: v_add_f16_e32 v1, 15, v2 ; encoding: [0x8f,0x04,0x02,0x3e]
+
+v_add_f16 v1, 16, v2
+// VI: v_add_f16_e32 v1, 16, v2 ; encoding: [0x90,0x04,0x02,0x3e]
+
+v_add_f16 v1, 63, v2
+// VI: v_add_f16_e32 v1, 63, v2 ; encoding: [0xbf,0x04,0x02,0x3e]
+
+v_add_f16 v1, 64, v2
+// VI: v_add_f16_e32 v1, 64, v2 ; encoding: [0xc0,0x04,0x02,0x3e]
+
+v_add_f16 v1, 0x0001, v2
+// VI: v_add_f16_e32 v1, 1, v2 ; encoding: [0x81,0x04,0x02,0x3e]
+
+v_add_f16 v1, 0xffff, v2
+// VI: v_add_f16_e32 v1, -1, v2 ; encoding: [0xc1,0x04,0x02,0x3e]
+
+v_add_f16 v1, -17, v2
+// VI: v_add_f16_e32 v1, 0xffef, v2 ; encoding: [0xff,0x04,0x02,0x3e,0xef,0xff,0x00,0x00]
+
+v_add_f16 v1, 65, v2
+// VI: v_add_f16_e32 v1, 0x41, v2 ; encoding: [0xff,0x04,0x02,0x3e,0x41,0x00,0x00,0x00]
+
+v_add_f16 v1, 0x3c00, v2
+// VI: v_add_f16_e32 v1, 1.0, v2 ; encoding: [0xf2,0x04,0x02,0x3e]
+
+v_add_f16 v1, 0xbc00, v2
+// VI: v_add_f16_e32 v1, -1.0, v2 ; encoding: [0xf3,0x04,0x02,0x3e]
+
+v_add_f16 v1, 0x3800, v2
+// VI: v_add_f16_e32 v1, 0.5, v2 ; encoding: [0xf0,0x04,0x02,0x3e]
+
+v_add_f16 v1, 0xb800, v2
+// VI: v_add_f16_e32 v1, -0.5, v2 ; encoding: [0xf1,0x04,0x02,0x3e]
+
+v_add_f16 v1, 0x4000, v2
+// VI: v_add_f16_e32 v1, 2.0, v2 ; encoding: [0xf4,0x04,0x02,0x3e]
+
+v_add_f16 v1, 0xc000, v2
+// VI: v_add_f16_e32 v1, -2.0, v2 ; encoding: [0xf5,0x04,0x02,0x3e]
+
+v_add_f16 v1, 0x4400, v2
+// VI: v_add_f16_e32 v1, 4.0, v2 ; encoding: [0xf6,0x04,0x02,0x3e]
+
+v_add_f16 v1, 0xc400, v2
+// VI: v_add_f16_e32 v1, -4.0, v2 ; encoding: [0xf7,0x04,0x02,0x3e]
+
+v_add_f16 v1, 0x3118, v2
+// VI: v_add_f16_e32 v1, 0.15915494, v2 ; encoding: [0xf8,0x04,0x02,0x3e]
+
+v_add_f16 v1, -32768, v2
+// VI: v_add_f16_e32 v1, 0x8000, v2 ; encoding: [0xff,0x04,0x02,0x3e,0x00,0x80,0x00,0x00]
+
+v_add_f16 v1, 32767, v2
+// VI: v_add_f16_e32 v1, 0x7fff, v2 ; encoding: [0xff,0x04,0x02,0x3e,0xff,0x7f,0x00,0x00]
+
+v_add_f16 v1, 65535, v2
+// VI: v_add_f16_e32 v1, -1, v2 ; encoding: [0xc1,0x04,0x02,0x3e]
+
+
+// K-constant
+v_madmk_f16 v1, v2, 0x4280, v3
+// VI: v_madmk_f16_e32 v1, v2, 0x4280, v3 ; encoding: [0x02,0x07,0x02,0x48,0x80,0x42,0x00,0x00]
+
+v_madmk_f16 v1, v2, 1.0, v3
+// VI: v_madmk_f16_e32 v1, v2, 0x3c00, v3 ; encoding: [0x02,0x07,0x02,0x48,0x00,0x3c,0x00,0x00]
+
+v_madmk_f16 v1, v2, 1, v3
+// VI: v_madmk_f16_e32 v1, v2, 0x1, v3 ; encoding: [0x02,0x07,0x02,0x48,0x01,0x00,0x00,0x00]
+
+v_madmk_f16 v1, v2, 64.0, v3
+// VI: v_madmk_f16_e32 v1, v2, 0x5400, v3 ; encoding: [0x02,0x07,0x02,0x48,0x00,0x54,0x00,0x00]
+
+
+v_add_f16_e32 v1, 64.0, v2
Index: test/MC/AMDGPU/vop2.s
===================================================================
--- test/MC/AMDGPU/vop2.s
+++ test/MC/AMDGPU/vop2.s
@@ -422,12 +422,12 @@
 
 // NOSICI: error: instruction not supported on this GPU
 // NOSICI: v_madmk_f16 v1, v2, 64.0, v3
-// VI:     v_madmk_f16_e32 v1, v2, 0x42800000, v3 ; encoding: [0x02,0x07,0x02,0x48,0x00,0x00,0x80,0x42]
+// VI:     v_madmk_f16_e32 v1, v2, 0x5400, v3 ; encoding: [0x02,0x07,0x02,0x48,0x00,0x54,0x00,0x00]
 v_madmk_f16 v1, v2, 64.0, v3
 
 // NOSICI: error: instruction not supported on this GPU
 // NOSICI: v_madak_f16 v1, v2, v3, 64.0
-// VI:     v_madak_f16_e32 v1, v2, v3, 0x42800000 ; encoding: [0x02,0x07,0x02,0x4a,0x00,0x00,0x80,0x42]
+// VI:     v_madak_f16_e32 v1, v2, v3, 0x5400 ; encoding: [0x02,0x07,0x02,0x4a,0x00,0x54,0x00,0x00]
 v_madak_f16 v1, v2, v3, 64.0
 
 // NOSICI: error: instruction not supported on this GPU
Index: test/MC/Disassembler/AMDGPU/literal16_vi.txt
===================================================================
--- /dev/null
+++ test/MC/Disassembler/AMDGPU/literal16_vi.txt
@@ -0,0 +1,54 @@
+# RUN: llvm-mc -arch=amdgcn -mcpu=tonga -disassemble -show-encoding %s | FileCheck -check-prefix=VI %s
+
+# VI: v_add_f16_e32 v1, 0.5, v3 ; encoding: [0xf0,0x06,0x02,0x3e]
+0xf0 0x06 0x02 0x3e
+
+# VI: v_add_f16_e32 v1, -0.5, v3 ; encoding: [0xf1,0x06,0x02,0x3e]
+0xf1 0x06 0x02 0x3e
+
+# VI: v_add_f16_e32 v1, 1.0, v3 ; encoding: [0xf2,0x06,0x02,0x3e]
+0xf2 0x06 0x02 0x3e
+
+# VI: v_add_f16_e32 v1, -1.0, v3 ; encoding: [0xf3,0x06,0x02,0x3e]
+0xf3 0x06 0x02 0x3e
+
+# VI: v_add_f16_e32 v1, 2.0, v3 ; encoding: [0xf4,0x06,0x02,0x3e]
+0xf4 0x06 0x02 0x3e
+
+# VI: v_add_f16_e32 v1, -2.0, v3 ; encoding: [0xf5,0x06,0x02,0x3e]
+0xf5 0x06 0x02 0x3e
+
+# VI: v_add_f16_e32 v1, 4.0, v3 ; encoding: [0xf6,0x06,0x02,0x3e]
+0xf6 0x06 0x02 0x3e
+
+# VI: v_add_f16_e32 v1, -4.0, v3 ; encoding: [0xf7,0x06,0x02,0x3e]
+0xf7 0x06 0x02 0x3e
+
+# VI: v_add_f16_e32 v1, 0.15915494, v3 ; encoding: [0xf8,0x06,0x02,0x3e]
+0xf8 0x06 0x02 0x3e
+
+# VI: v_add_f16_e32 v1, 0x41, v3 ; encoding: [0xff,0x06,0x02,0x3e,0x41,0x00,0x00,0x00]
+0xff 0x06 0x02 0x3e 0x41 0x00 0x00 0x00
+
+# VI: v_add_f16_e32 v1, 0x100, v3 ; encoding: [0xff,0x06,0x02,0x3e,0x00,0x01,0x00,0x00]
+0xff 0x06 0x02 0x3e 0x00 0x01 0x00 0x00
+
+# non-zero unused bits in constant
+# VI: v_add_f16_e32 v1, 0x10041, v3 ; encoding: [0xff,0x06,0x02,0x3e,0x41,0x00,0x01,0x00]
+0xff 0x06 0x02 0x3e 0x41 0x00 0x01 0x00
+
+# VI: v_add_f16_e32 v1, 0x1000041, v3 ; encoding: [0xff,0x06,0x02,0x3e,0x41,0x00,0x00,0x01]
+0xff 0x06 0x02 0x3e 0x41 0x00 0x00 0x01
+
+# FIXME: A literal of 0 is printed as inline constant 0, so the 8-byte literal encoding does not round trip
+# VI: v_add_f16_e32 v1, 0, v3 ; encoding: [0x80,0x06,0x02,0x3e]
+0xff 0x06 0x02 0x3e 0x00 0x00 0x00 0x00
+
+# VI: v_madmk_f16_e32 v1, v2, 0x41, v3 ; encoding: [0x02,0x07,0x02,0x48,0x41,0x00,0x00,0x00]
+0x02 0x07 0x02 0x48 0x41 0x00 0x00 0x00
+
+# VI: v_madmk_f16_e32 v1, v2, 0x10041, v3 ; encoding: [0x02,0x07,0x02,0x48,0x41,0x00,0x01,0x00]
+0x02 0x07 0x02 0x48 0x41 0x00 0x01 0x00
+
+# VI: v_madmk_f16_e32 v1, v2, 0x1000041, v3 ; encoding: [0x02,0x07,0x02,0x48,0x41,0x00,0x00,0x01]
+0x02 0x07 0x02 0x48 0x41 0x00 0x00 0x01
Index: test/MC/Disassembler/AMDGPU/missing_op.txt
===================================================================
--- test/MC/Disassembler/AMDGPU/missing_op.txt
+++ test/MC/Disassembler/AMDGPU/missing_op.txt
@@ -1,5 +1,5 @@
 # RUN: llvm-mc -arch=amdgcn -mcpu=fiji -disassemble < %s | FileCheck %s -check-prefix=VI
 
 #TODO: this test will fail when we fix v_interp_p2_f32 signature, remove it then
-#VI: v_interp_p2_f32 16, [/*Missing OP1*/], /*Missing OP2*/, /*Missing OP3*/, /*Missing OP4*/
+#VI: v_interp_p2_f32 v7, [v7], /*invalid immediate*/, /*Missing OP3*/, /*Missing OP4*/
 0xd4 0x41 0x1d 0xd4
Index: test/MC/Disassembler/AMDGPU/vop1.txt
===================================================================
--- test/MC/Disassembler/AMDGPU/vop1.txt
+++ test/MC/Disassembler/AMDGPU/vop1.txt
@@ -246,5 +246,5 @@
 # CHECK: v_cvt_f16_i16_e32 v123, 0x21c2  ; encoding: [0xff,0x74,0xf6,0x7e,0xc2,0x21,0x00,0x00]
 0xff 0x74 0xf6 0x7e 0xc2 0x21 0x00 0x00
 
-# CHECK: v_cvt_u16_f16_e32 v123, 0x3f200000 ; encoding: [0xff,0x76,0xf6,0x7e,0x00,0x00,0x20,0x3f]
-0xff 0x76 0xf6 0x7e 0x00 0x00 0x20 0x3f
\ No newline at end of file
+# CHECK: v_cvt_u16_f16_e32 v123, 0x3f20 ; encoding: [0xff,0x76,0xf6,0x7e,0x20,0x3f,0x00,0x00]
+0xff 0x76 0xf6 0x7e 0x20 0x3f 0x00 0x00