diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -273,6 +273,10 @@
     return isRegOrImmWithInputMods(AMDGPU::VS_32RegClassID, MVT::i16);
   }
 
+  bool isRegOrImmWithIntT16InputMods() const {
+    return isRegOrImmWithInputMods(AMDGPU::VS_16RegClassID, MVT::i16);
+  }
+
   bool isRegOrImmWithInt32InputMods() const {
     return isRegOrImmWithInputMods(AMDGPU::VS_32RegClassID, MVT::i32);
   }
@@ -293,6 +297,10 @@
     return isRegOrImmWithInputMods(AMDGPU::VS_32RegClassID, MVT::f16);
   }
 
+  bool isRegOrImmWithFPT16InputMods() const {
+    return isRegOrImmWithInputMods(AMDGPU::VS_16RegClassID, MVT::f16);
+  }
+
   bool isRegOrImmWithFP32InputMods() const {
     return isRegOrImmWithInputMods(AMDGPU::VS_32RegClassID, MVT::f32);
   }
@@ -512,7 +520,15 @@
     return isRegOrInlineNoMods(AMDGPU::VS_64RegClassID, MVT::i64);
   }
 
+  bool isVCSrcTB16() const {
+    return isRegOrInlineNoMods(AMDGPU::VS_16RegClassID, MVT::i16);
+  }
+
   bool isVCSrcTB16_Lo128() const {
+    return isRegOrInlineNoMods(AMDGPU::VS_16_Lo128RegClassID, MVT::i16);
+  }
+
+  bool isVCSrcFake16B16_Lo128() const {
     return isRegOrInlineNoMods(AMDGPU::VS_32_Lo128RegClassID, MVT::i16);
   }
@@ -532,7 +548,15 @@
     return isRegOrInlineNoMods(AMDGPU::VS_64RegClassID, MVT::f64);
   }
 
+  bool isVCSrcTF16() const {
+    return isRegOrInlineNoMods(AMDGPU::VS_16RegClassID, MVT::f16);
+  }
+
   bool isVCSrcTF16_Lo128() const {
+    return isRegOrInlineNoMods(AMDGPU::VS_16_Lo128RegClassID, MVT::f16);
+  }
+
+  bool isVCSrcFake16F16_Lo128() const {
     return isRegOrInlineNoMods(AMDGPU::VS_32_Lo128RegClassID, MVT::f16);
   }
@@ -552,10 +576,16 @@
     return isVCSrcF64() || isLiteralImm(MVT::i64);
   }
 
+  bool isVSrcTB16() const { return isVCSrcTB16() || isLiteralImm(MVT::i16); }
+
   bool isVSrcTB16_Lo128() const {
     return isVCSrcTB16_Lo128() || isLiteralImm(MVT::i16);
   }
 
+  bool isVSrcFake16B16_Lo128() const {
+    return isVCSrcFake16B16_Lo128() || isLiteralImm(MVT::i16);
+  }
+
   bool isVSrcB16() const {
     return isVCSrcB16() || isLiteralImm(MVT::i16);
   }
@@ -588,10 +618,16 @@
     return isVCSrcF64() || isLiteralImm(MVT::f64);
   }
 
+  bool isVSrcTF16() const { return isVCSrcTF16() || isLiteralImm(MVT::f16); }
+
   bool isVSrcTF16_Lo128() const {
     return isVCSrcTF16_Lo128() || isLiteralImm(MVT::f16);
   }
 
+  bool isVSrcFake16F16_Lo128() const {
+    return isVCSrcFake16F16_Lo128() || isLiteralImm(MVT::f16);
+  }
+
   bool isVSrcF16() const {
     return isVCSrcF16() || isLiteralImm(MVT::f16);
   }
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
@@ -234,6 +234,12 @@
                         bool MandatoryLiteral = false,
                         unsigned ImmWidth = 0) const;
 
+  MCOperand decodeVGPR_16(unsigned Val) const;
+  MCOperand decodeVGPR_16_Lo128(unsigned Val) const;
+  MCOperand decodeNonVGPRSrcOp(const OpWidthTy Width, unsigned Val,
+                               bool MandatoryLiteral = false,
+                               unsigned ImmWidth = 0) const;
+
   MCOperand decodeVOPDDstYOp(MCInst &Inst, unsigned Val) const;
   MCOperand decodeSpecialReg32(unsigned Val) const;
   MCOperand decodeSpecialReg64(unsigned Val) const;
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -260,6 +260,76 @@
 DECODE_OPERAND_SRC_REG_OR_IMM_DEFERRED_9(VS_32, OPW16, 16)
 DECODE_OPERAND_SRC_REG_OR_IMM_DEFERRED_9(VS_32, OPW32, 32)
 
+inline MCOperand AMDGPUDisassembler::decodeVGPR_16(unsigned Val) const {
+  // Move the suffix bit from pos 9 to pos 0.
+  return createRegOperand(AMDGPU::VGPR_16RegClassID,
+                          ((Val & 255) << 1) | (Val >> 9));
+}
+
+static DecodeStatus DecodeVGPR_16RegisterClass(MCInst &Inst, unsigned Imm,
+                                               uint64_t /*Addr*/,
+                                               const MCDisassembler *Decoder) {
+  // Imm{0-7} is 8-bit VGPR number like for VGPR_32 and Imm{9} is
+  // opsel_lo for dst and acts like a True16 modifier (.h or .l).
+  // Imm{8} is not used.
+  assert(isUInt<10>(Imm) && "10-bit encoding expected");
+  assert((Imm & (1 << 8)) == 0 && "Imm{8} should not be used");
+
+  auto DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);
+  return addOperand(Inst, DAsm->decodeVGPR_16(Imm));
+}
+
+inline MCOperand AMDGPUDisassembler::decodeVGPR_16_Lo128(unsigned Val) const {
+  // Move the suffix bit from pos 7 to pos 0.
+  assert(isUInt<8>(Val));
+  return createRegOperand(AMDGPU::VGPR_16_Lo128RegClassID,
+                          ((Val & 127) << 1) | (Val >> 7));
+}
+
+static DecodeStatus
+DecodeVGPR_16_Lo128RegisterClass(MCInst &Inst, unsigned Imm, uint64_t /*Addr*/,
+                                 const MCDisassembler *Decoder) {
+  // This uses 8-bit encoding but instead of being 8-bit VGPR number
+  // like for VGPR_32 this is 7-bit VGPR number and Imm{7} is
+  // True16 modifier (.h or .l). Used on instructions without opsel.
+  assert(isUInt<8>(Imm) && "8-bit encoding expected");
+
+  auto DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);
+  return addOperand(Inst, DAsm->decodeVGPR_16_Lo128(Imm));
+}
+
+static DecodeStatus decodeOperand_VSrcT16_Lo128(MCInst &Inst, unsigned Imm,
+                                                uint64_t /*Addr*/,
+                                                const MCDisassembler *Decoder) {
+  const auto *DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);
+  assert(isUInt<9>(Imm) && "9-bit encoding expected");
+
+  if (Imm & AMDGPU::EncValues::IS_VGPR) {
+    // When Imm{8} is set (IS_VGPR), Imm{0-7} corresponds to vgpr number.
+    // Here Imm{0-6} is vgpr number Imm{7} is True16 modifier (.h or .l).
+    // Note: instructions that use this don't have opsel.
+    return addOperand(Inst, DAsm->decodeVGPR_16_Lo128(Imm & 0xFF));
+  }
+  return addOperand(Inst, DAsm->decodeNonVGPRSrcOp(AMDGPUDisassembler::OPW16,
+                                                   Imm & 0xFF, false, 16));
+}
+
+static DecodeStatus decodeOperand_VSrcT16(MCInst &Inst, unsigned Imm,
+                                          uint64_t /*Addr*/,
+                                          const MCDisassembler *Decoder) {
+  const auto *DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);
+  assert(isUInt<10>(Imm) && "10-bit encoding expected");
+
+  if (Imm & AMDGPU::EncValues::IS_VGPR) {
+    // Imm{0-8} is standard 9-bit encoding for Src operand. Imm{9} is opsel and
+    // act as True16 modifier (.h or .l). Set Imm{8} to 0 to use decodeVGPR_16
+    // helper function.
+    return addOperand(Inst, DAsm->decodeVGPR_16(Imm & 0x2FF));
+  }
+  return addOperand(Inst, DAsm->decodeNonVGPRSrcOp(AMDGPUDisassembler::OPW16,
+                                                   Imm & 0xFF, false, 16));
+}
+
 static DecodeStatus decodeOperand_KImmFP(MCInst &Inst, unsigned Imm,
                                          uint64_t Addr,
                                          const MCDisassembler *Decoder) {
@@ -1406,6 +1476,52 @@
     return createSRegOperand(getTtmpClassId(Width), TTmpIdx);
   }
 
+  if (INLINE_INTEGER_C_MIN <= Val && Val <= INLINE_INTEGER_C_MAX)
+    return decodeIntImmed(Val);
+
+  if (INLINE_FLOATING_C_MIN <= Val && Val <= INLINE_FLOATING_C_MAX)
+    return decodeFPImmed(ImmWidth, Val);
+
+  if (Val == LITERAL_CONST) {
+    if (MandatoryLiteral)
+      // Keep a sentinel value for deferred setting.
+      return MCOperand::createImm(LITERAL_CONST);
+    else
+      return decodeLiteralConstant();
+  }
+
+  switch (Width) {
+  case OPW32:
+  case OPW16:
+  case OPWV216:
+    return decodeSpecialReg32(Val);
+  case OPW64:
+  case OPWV232:
+    return decodeSpecialReg64(Val);
+  default:
+    llvm_unreachable("unexpected immediate type");
+  }
+}
+
+MCOperand AMDGPUDisassembler::decodeNonVGPRSrcOp(const OpWidthTy Width,
+                                                 unsigned Val,
+                                                 bool MandatoryLiteral,
+                                                 unsigned ImmWidth) const {
+  // Cases when Val{8} is 1 (vgpr, agpr or true 16 vgpr) should have been
+  // decoded earlier.
+  assert(Val < (1 << 8) && "9-bit Src encoding when Val{8} is 0");
+  using namespace AMDGPU::EncValues;
+
+  if (Val <= SGPR_MAX) {
+    // "SGPR_MIN <= Val" is always true and causes compilation warning.
+    static_assert(SGPR_MIN == 0);
+    return createSRegOperand(getSgprClassId(Width), Val - SGPR_MIN);
+  }
+
+  int TTmpIdx = getTTmpIdx(Val);
+  if (TTmpIdx >= 0)
+    return createSRegOperand(getTtmpClassId(Width), TTmpIdx);
+
   if (INLINE_INTEGER_C_MIN <= Val && Val <= INLINE_INTEGER_C_MAX)
     return decodeIntImmed(Val);
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
@@ -49,6 +49,14 @@
                          SmallVectorImpl<MCFixup> &Fixups,
                          const MCSubtargetInfo &STI) const;
 
+  void getMachineOpValueT16(const MCInst &MI, unsigned OpNo, APInt &Op,
+                            SmallVectorImpl<MCFixup> &Fixups,
+                            const MCSubtargetInfo &STI) const;
+
+  void getMachineOpValueT16Lo128(const MCInst &MI, unsigned OpNo, APInt &Op,
+                                 SmallVectorImpl<MCFixup> &Fixups,
+                                 const MCSubtargetInfo &STI) const;
+
   /// Use a fixup to encode the simm16 field for SOPP branch
   ///  instructions.
   void getSOPPBrEncoding(const MCInst &MI, unsigned OpNo, APInt &Op,
@@ -547,6 +555,72 @@
   getMachineOpValueCommon(MI, MO, OpNo, Op, Fixups, STI);
 }
 
+void AMDGPUMCCodeEmitter::getMachineOpValueT16(
+    const MCInst &MI, unsigned OpNo, APInt &Op,
+    SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpNo);
+  if (MO.isReg()) {
+    Op = MRI.getEncodingValue(MO.getReg());
+    return;
+  }
+
+  getMachineOpValueCommon(MI, MO, OpNo, Op, Fixups, STI);
+
+  // VGPRs include the suffix/op_sel bit in the register encoding, but
+  // immediates and SGPRs include it in src_modifiers. Therefore, copy the
+  // op_sel bit from the src operands into src_modifier operands if Op is
+  // src_modifiers and the corresponding src is a VGPR.
+  unsigned OpSelBits = 0;
+  int SrcMOIdx = -1;
+  assert(OpNo < INT_MAX);
+  if ((int)OpNo == AMDGPU::getNamedOperandIdx(MI.getOpcode(),
+                                              AMDGPU::OpName::src0_modifiers)) {
+    SrcMOIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
+    int VDstMOIdx =
+        AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst);
+    if (VDstMOIdx != -1) {
+      auto DstVal = MRI.getEncodingValue(MI.getOperand(VDstMOIdx).getReg());
+      if (AMDGPU::isHi(DstVal, MRI))
+        OpSelBits |= SISrcMods::DST_OP_SEL;
+    }
+  } else if ((int)OpNo == AMDGPU::getNamedOperandIdx(
+                              MI.getOpcode(), AMDGPU::OpName::src1_modifiers))
+    SrcMOIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src1);
+  else if ((int)OpNo == AMDGPU::getNamedOperandIdx(
+                            MI.getOpcode(), AMDGPU::OpName::src2_modifiers))
+    SrcMOIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src2);
+  if (SrcMOIdx == -1)
+    return;
+
+  const MCOperand &SrcMO = MI.getOperand(SrcMOIdx);
+  if (!SrcMO.isReg())
+    return;
+
+  auto SrcReg = SrcMO.getReg();
+  if (AMDGPU::isSGPR(SrcReg, &MRI))
+    return;
+
+  if (AMDGPU::isHi(SrcReg, MRI))
+    OpSelBits |= SISrcMods::OP_SEL_0;
+  Op |= OpSelBits;
+}
+
+void AMDGPUMCCodeEmitter::getMachineOpValueT16Lo128(
+    const MCInst &MI, unsigned OpNo, APInt &Op,
+    SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpNo);
+  if (MO.isReg()) {
+    auto Encoding = MRI.getEncodingValue(MO.getReg());
+    if ((Encoding & (1 << 9))) { // isVGPR
+      assert((Encoding & (1 << 8)) == 0 && "Did not expect VGPR RegNo > 127");
+      Encoding = ((Encoding & 1) << 8) | Encoding;
+    }
+    Op = Encoding;
+    return;
+  }
+  getMachineOpValueCommon(MI, MO, OpNo, Op, Fixups, STI);
+}
+
 void AMDGPUMCCodeEmitter::getMachineOpValueCommon(
     const MCInst &MI, const MCOperand &MO, unsigned OpNo, APInt &Op,
     SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const {
diff --git a/llvm/lib/Target/AMDGPU/SIInstrFormats.td b/llvm/lib/Target/AMDGPU/SIInstrFormats.td
--- a/llvm/lib/Target/AMDGPU/SIInstrFormats.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrFormats.td
@@ -299,6 +299,16 @@
 
 class VOPDstOperand <RegisterClass rc> : RegisterOperand <rc, "printVOPDst">;
 
+def VOPDstOperand_t16 : VOPDstOperand<VGPR_16> {
+  let EncoderMethod = "getMachineOpValueT16";
+  let DecoderMethod = "DecodeVGPR_16RegisterClass";
+}
+
+def VOPDstOperand_t16Lo128 : VOPDstOperand<VGPR_16_Lo128> {
+  let EncoderMethod = "getMachineOpValueT16Lo128";
+  let DecoderMethod = "DecodeVGPR_16_Lo128RegisterClass";
+}
+
 class VINTRPe <bits<2> op> : Enc32 {
   bits<8> vdst;
   bits<8> vsrc;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -1170,6 +1170,10 @@
 }
 def FP16InputModsMatchClass : FPInputModsMatchClass<16>;
+def FPT16InputModsMatchClass : FPInputModsMatchClass<16> {
+  let Name = "RegOrImmWithFPT16InputMods";
+  let PredicateMethod = "isRegOrImmWithFPT16InputMods";
+}
 def FP32InputModsMatchClass : FPInputModsMatchClass<32>;
 def FP64InputModsMatchClass : FPInputModsMatchClass<64>;
@@ -1187,6 +1191,7 @@
 }
 
 def FP16InputMods : FPInputMods<FP16InputModsMatchClass>;
+def FPT16InputMods : FPInputMods<FPT16InputModsMatchClass>;
 def FP32InputMods : FPInputMods<FP32InputModsMatchClass>;
 def FP64InputMods : FPInputMods<FP64InputModsMatchClass>;
@@ -1202,6 +1207,10 @@
   let Name = "RegOrInlineImmWithInt"#opSize#"InputMods";
   let PredicateMethod = "isRegOrInlineImmWithInt"#opSize#"InputMods";
 }
+def IntT16InputModsMatchClass : IntInputModsMatchClass<16> {
+  let Name = "RegOrImmWithIntT16InputMods";
+  let PredicateMethod = "isRegOrImmWithIntT16InputMods";
+}
 def Int32InputModsMatchClass : IntInputModsMatchClass<32>;
 def Int64InputModsMatchClass : IntInputModsMatchClass<64>;
 def Int32VCSrcInputModsMatchClass : IntVCSrcInputModsMatchClass<32>;
@@ -1209,6 +1218,7 @@
 class IntInputMods <IntInputModsMatchClass matchClass> : InputMods <matchClass> {
   let PrintMethod = "printOperandAndIntInputMods";
 }
+def IntT16InputMods : IntInputMods<IntT16InputModsMatchClass>;
 def Int32InputMods : IntInputMods<Int32InputModsMatchClass>;
 def Int64InputMods : IntInputMods<Int64InputModsMatchClass>;
 def Int32VCSrcInputMods : IntInputMods<Int32VCSrcInputModsMatchClass>;
@@ -1454,15 +1464,18 @@
 
 // Returns the register class to use for the destination of VOP[123C]
 // instructions for the given VT.
-class getVALUDstForVT<ValueType VT> {
+class getVALUDstForVT<ValueType VT, bit IsTrue16 = 0, bit IsVOP3Encoding = 0> {
+  defvar op16 = !if(IsTrue16, !if (IsVOP3Encoding, VOPDstOperand_t16,
+                                   VOPDstOperand_t16Lo128),
+                              VOPDstOperand<VGPR_32>);
   RegisterOperand ret = !if(!eq(VT.Size, 32), VOPDstOperand<VGPR_32>,
                           !if(!eq(VT.Size, 128), VOPDstOperand<VReg_128>,
                             !if(!eq(VT.Size, 64), VOPDstOperand<VReg_64>,
-                              !if(!eq(VT.Size, 16), VOPDstOperand<VGPR_32>,
+                              !if(!eq(VT.Size, 16), op16,
                                 VOPDstS64orS32)))); // else VT == i1
 }
 
-class getVALUDstForVT_t16<ValueType VT> {
+class getVALUDstForVT_not16<ValueType VT> {
   RegisterOperand ret = !if(!eq(VT.Size, 32), VOPDstOperand<VGPR_32>,
                           !if(!eq(VT.Size, 128), VOPDstOperand<VReg_128>,
                             !if(!eq(VT.Size, 64), VOPDstOperand<VReg_64>,
@@ -1480,7 +1493,7 @@
 
 // Returns the register class to use for source 0 of VOP[12C]
 // instructions for the given VT.
-class getVOPSrc0ForVT<ValueType VT, bit IsTrue16> {
+class getVOPSrc0ForVT<ValueType VT, bit IsTrue16, bit IsFake16 = 1> {
   bit isFP = isFloatType<VT>.ret;
 
   RegisterOperand ret =
@@ -1489,7 +1502,7 @@
           VSrc_f64,
           !if(!eq(VT.Value, f16.Value),
             !if(IsTrue16,
-              VSrcT_f16_Lo128,
+              !if(IsFake16, VSrcFake16_f16_Lo128, VSrcT_f16_Lo128),
               VSrc_f16
             ),
             !if(!eq(VT.Value, v2f16.Value),
@@ -1505,7 +1518,7 @@
           VSrc_b64,
           !if(!eq(VT.Value, i16.Value),
             !if(IsTrue16,
-              VSrcT_b16_Lo128,
+              !if(IsFake16, VSrcFake16_b16_Lo128, VSrcT_b16_Lo128),
              VSrc_b16
            ),
            !if(!eq(VT.Value, v2i16.Value),
@@ -1530,12 +1543,13 @@
                             VGPR_32))));
 }
 
-class getVregSrcForVT_t16<ValueType VT> {
+class getVregSrcForVT_t16<ValueType VT, bit IsFake16 = 1> {
   RegisterClass ret = !if(!eq(VT.Size, 128), VReg_128,
                         !if(!eq(VT.Size, 96), VReg_96,
                           !if(!eq(VT.Size, 64), VReg_64,
                             !if(!eq(VT.Size, 48), VReg_64,
-                              !if(!eq(VT.Size, 16), VGPR_32_Lo128,
+                              !if(!eq(VT.Size, 16),
+                                !if(IsFake16, VGPR_32_Lo128, VGPR_16_Lo128),
                                 VGPR_32)))));
 }
@@ -1548,7 +1562,7 @@
 
 // Returns the register class to use for sources of VOP3 instructions for the
 // given VT.
-class getVOP3SrcForVT<ValueType VT> {
+class getVOP3SrcForVT<ValueType VT, bit IsTrue16 = 0> {
   bit isFP = isFloatType<VT>.ret;
   RegisterOperand ret =
   !if(!eq(VT.Size, 128),
@@ -1565,7 +1579,7 @@
         SSrc_i1,
         !if(isFP,
           !if(!eq(VT.Value, f16.Value),
-            VSrc_f16,
+            !if(IsTrue16, VSrcT_f16, VSrc_f16),
             !if(!eq(VT.Value, v2f16.Value),
               VSrc_v2f16,
               !if(!eq(VT.Value, v4f16.Value),
@@ -1575,7 +1589,7 @@
             )
           ),
           !if(!eq(VT.Value, i16.Value),
-            VSrc_b16,
+            !if(IsTrue16, VSrcT_b16, VSrc_b16),
             !if(!eq(VT.Value, v2i16.Value),
               VSrc_v2b16,
               VSrc_b32
@@ -1622,18 +1636,15 @@
 }
 
 // Return type of input modifiers operand for specified input operand
-class getSrcMod <ValueType VT> {
+class getSrcMod <ValueType VT, bit IsTrue16 = 0> {
   bit isFP = isFloatType<VT>.ret;
   bit isPacked = isPackedType<VT>.ret;
   Operand ret =  !if(!eq(VT.Size, 64),
                      !if(isFP, FP64InputMods, Int64InputMods),
-                     !if(isFP,
-                       !if(!eq(VT.Value, f16.Value),
-                          FP16InputMods,
-                          FP32InputMods
-                        ),
-                        Int32InputMods)
-                     );
+                     !if(!eq(VT.Size, 16),
+                       !if(isFP, !if(IsTrue16, FPT16InputMods, FP16InputMods),
+                                 !if(IsTrue16, IntT16InputMods, IntOpSelMods)),
+                       !if(isFP, FP32InputMods, Int32InputMods)));
 }
 
 class getOpSelMod <ValueType VT> {
@@ -2447,7 +2458,7 @@
 class VOPProfile_True16<VOPProfile P> : VOPProfile<P.ArgVT> {
   let IsTrue16 = 1;
   // Most DstVT are 16-bit, but not all
-  let DstRC = getVALUDstForVT_t16<DstVT>.ret;
+  let DstRC = getVALUDstForVT_not16<DstVT>.ret;
   let DstRC64 = getVALUDstForVT<DstVT>.ret;
   let Src1RC32 = RegisterOperand<getVregSrcForVT_t16<Src1VT>.ret>;
   let Src0DPP = getVregSrcForVT_t16<Src0VT>.ret;
@@ -2462,7 +2473,7 @@
   let IsTrue16 = 1;
   let IsFake16 = 1;
   // Most DstVT are 16-bit, but not all
-  let DstRC = getVALUDstForVT_t16<DstVT>.ret;
+  let DstRC = getVALUDstForVT_not16<DstVT>.ret;
   let DstRC64 = getVALUDstForVT<DstVT>.ret;
   let Src1RC32 = RegisterOperand<getVregSrcForVT_t16<Src1VT>.ret>;
   let Src0DPP = getVregSrcForVT_t16<Src0VT>.ret;
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -1130,6 +1130,30 @@
   : RegOrImmOperand <RegisterClass, OperandTypePrefix # "_FP16",
                      !subst("_f16", "F16", NAME), "_Imm16">;
 
+class RegOrB16T <string RegisterClass, string OperandTypePrefix>
+  : RegOrImmOperand <RegisterClass, OperandTypePrefix # "_INT16",
+                     !subst("_b16", "B16", NAME), "_Imm16"> {
+  let EncoderMethod = "getMachineOpValueT16";
+}
+
+class RegOrF16T <string RegisterClass, string OperandTypePrefix>
+  : RegOrImmOperand <RegisterClass, OperandTypePrefix # "_FP16",
+                     !subst("_f16", "F16", NAME), "_Imm16"> {
+  let EncoderMethod = "getMachineOpValueT16";
+}
+
+class RegOrB16_Lo128T <string RegisterClass, string OperandTypePrefix>
+  : RegOrImmOperand <RegisterClass, OperandTypePrefix # "_INT16",
+                     !subst("_b16_Lo128", "B16_Lo128", NAME), "_Imm16"> {
+  let EncoderMethod = "getMachineOpValueT16Lo128";
+}
+
+class RegOrF16_Lo128T <string RegisterClass, string OperandTypePrefix>
+  : RegOrImmOperand <RegisterClass, OperandTypePrefix # "_FP16",
+                     !subst("_f16_Lo128", "F16_Lo128", NAME), "_Imm16"> {
+  let EncoderMethod = "getMachineOpValueT16Lo128";
+}
+
 class RegOrB32 <string RegisterClass, string OperandTypePrefix>
   : RegOrImmOperand <RegisterClass, OperandTypePrefix # "_INT32",
                      !subst("_b32", "B32", NAME), "_Imm32">;
@@ -1185,6 +1209,7 @@
   : RegOrImmOperand <RegisterClass, OperandTypePrefix # "_FP16_DEFERRED",
                      !subst("_f16_Lo128_Deferred", "F16", NAME), "_Deferred_Imm16">;
 
+
 //===----------------------------------------------------------------------===//
 // SSrc_* Operands with an SGPR or a 32-bit immediate
 //===----------------------------------------------------------------------===//
@@ -1208,6 +1233,24 @@
 
 def VSrc_b16 : RegOrB16 <"VS_32", "OPERAND_REG_IMM">;
 def VSrc_f16 : RegOrF16 <"VS_32", "OPERAND_REG_IMM">;
+
+def VSrcT_b16 : RegOrB16T <"VS_16", "OPERAND_REG_IMM"> {
+  let DecoderMethod = "decodeOperand_VSrcT16";
+}
+def VSrcT_f16 : RegOrF16T <"VS_16", "OPERAND_REG_IMM"> {
+  let DecoderMethod = "decodeOperand_VSrcT16";
+}
+
+def VSrcT_b16_Lo128 : RegOrB16_Lo128T <"VS_16_Lo128", "OPERAND_REG_IMM"> {
+  let DecoderMethod = "decodeOperand_VSrcT16_Lo128";
+}
+def VSrcT_f16_Lo128 : RegOrF16_Lo128T <"VS_16_Lo128", "OPERAND_REG_IMM"> {
+  let DecoderMethod = "decodeOperand_VSrcT16_Lo128";
+}
+
+def VSrcFake16_b16_Lo128 : RegOrB16_Lo128 <"VS_32_Lo128", "OPERAND_REG_IMM">;
+def VSrcFake16_f16_Lo128 : RegOrF16_Lo128 <"VS_32_Lo128", "OPERAND_REG_IMM">;
+
 def VSrc_b32 : RegOrB32 <"VS_32", "OPERAND_REG_IMM">;
 def VSrc_f32 : RegOrF32 <"VS_32", "OPERAND_REG_IMM">;
 def VSrc_v2b16 : RegOrV2B16 <"VS_32", "OPERAND_REG_IMM">;
@@ -1217,9 +1260,6 @@
 def VSrc_v2b32 : RegOrV2B32 <"VS_64", "OPERAND_REG_IMM">;
 def VSrc_v2f32 : RegOrV2F32 <"VS_64", "OPERAND_REG_IMM">;
 
-def VSrcT_b16_Lo128 : RegOrB16_Lo128 <"VS_32_Lo128", "OPERAND_REG_IMM">;
-def VSrcT_f16_Lo128 : RegOrF16_Lo128 <"VS_32_Lo128", "OPERAND_REG_IMM">;
-
 //===----------------------------------------------------------------------===//
 // VSrc_*_Deferred Operands with an SGPR, VGPR or a 32-bit immediate for use
 // with FMAMK/FMAAK
@@ -1228,8 +1268,8 @@
 def VSrc_f16_Deferred : RegOrF16_Deferred<"VS_32", "OPERAND_REG_IMM">;
 def VSrc_f32_Deferred : RegOrF32_Deferred<"VS_32", "OPERAND_REG_IMM">;
 
-def VSrcT_f16_Lo128_Deferred : RegOrF16_Lo128_Deferred<"VS_32_Lo128",
-                                                       "OPERAND_REG_IMM">;
+def VSrcFake16_f16_Lo128_Deferred : RegOrF16_Lo128_Deferred<"VS_32_Lo128",
+                                                            "OPERAND_REG_IMM">;
 
 //===----------------------------------------------------------------------===//
 // VRegSrc_* Operands with a VGPR
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -1173,6 +1173,10 @@
 /// Is Reg - scalar register
 bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI);
 
+/// \returns if \p Reg occupies the high 16-bits of a 32-bit register.
+/// The bit indicating isHi is the LSB of the encoding.
+bool isHi(unsigned Reg, const MCRegisterInfo &MRI);
+
 /// If \p Reg is a pseudo reg, return the correct hardware register given
 /// \p STI otherwise return \p Reg.
 unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI);
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -2120,6 +2120,10 @@
          Reg == AMDGPU::SCC;
 }
 
+bool isHi(unsigned Reg, const MCRegisterInfo &MRI) {
+  return MRI.getEncodingValue(Reg) & 1;
+}
+
 #define MAP_REG2REG \
   using namespace AMDGPU; \
   switch(Reg) { \
diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -381,7 +381,7 @@
 def VOP_MADAK_F16_t16 : VOP_MADAK <f16> {
   let IsTrue16 = 1;
   let DstRC = VOPDstOperand<VGPR_32_Lo128>;
-  let Ins32 = (ins VSrcT_f16_Lo128_Deferred:$src0, VGPR_32_Lo128:$src1, ImmOpType:$imm);
+  let Ins32 = (ins VSrcFake16_f16_Lo128_Deferred:$src0, VGPR_32_Lo128:$src1, ImmOpType:$imm);
 }
 def VOP_MADAK_F32 : VOP_MADAK <f32>;
@@ -406,7 +406,7 @@
 def VOP_MADMK_F16_t16 : VOP_MADMK <f16> {
   let IsTrue16 = 1;
   let DstRC = VOPDstOperand<VGPR_32_Lo128>;
-  let Ins32 = (ins VSrcT_f16_Lo128_Deferred:$src0, ImmOpType:$imm, VGPR_32_Lo128:$src1);
+  let Ins32 = (ins VSrcFake16_f16_Lo128_Deferred:$src0, ImmOpType:$imm, VGPR_32_Lo128:$src1);
 }
 def VOP_MADMK_F32 : VOP_MADMK <f32>;
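
Note on the bit layout handled by this patch: for True16 operands the VGPR_16 register-class index keeps the .h/.l suffix in bit 0 (see decodeVGPR_16 and isHi above), while the instruction field keeps it in bit 9 of the 10-bit VOP3 src/dst encoding (op_sel) or in bit 7 of the 8-bit Lo128 encoding. The standalone C++ sketch below is not part of the patch and uses made-up helper names; only the two bit swizzles mirror the decoders above.

// Standalone illustration of the True16 decode swizzles in this patch.
// Helper names are hypothetical; only the bit manipulation matches
// decodeVGPR_16 / decodeVGPR_16_Lo128.
#include <cassert>
#include <cstdio>

// Register-class index: (vgpr# << 1) | isHi, i.e. the .h/.l bit is the LSB.
unsigned regIdxFromVOP3Field(unsigned Val) {  // 10-bit field, suffix in bit 9
  return ((Val & 255) << 1) | (Val >> 9);
}

unsigned regIdxFromLo128Field(unsigned Val) { // 8-bit field, suffix in bit 7
  return ((Val & 127) << 1) | (Val >> 7);
}

int main() {
  // v5.h in the 10-bit VOP3 form: vgpr# = 5 in bits 0-7, op_sel in bit 9.
  unsigned EncVOP3 = 5 | (1u << 9);
  assert(regIdxFromVOP3Field(EncVOP3) == ((5u << 1) | 1)); // index 11 -> v5.h

  // v5.h in the 8-bit Lo128 form: vgpr# = 5 in bits 0-6, suffix in bit 7.
  unsigned EncLo128 = 5 | (1u << 7);
  assert(regIdxFromLo128Field(EncLo128) == ((5u << 1) | 1));

  printf("both encodings map to register-class index %u (v5.h)\n",
         regIdxFromVOP3Field(EncVOP3));
  return 0;
}

The encoder side (getMachineOpValueT16 and getMachineOpValueT16Lo128) performs the inverse mapping, and additionally copies the .h/.l choice into the op_sel bits of src_modifiers when the source is an SGPR or an immediate, since only VGPR encodings carry the suffix bit directly.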