diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -3322,6 +3322,131 @@ }}; } +// Strip G_BITCAST and G_FNEG and return the source register of the result in +// OutSrc. Toggle FNeg for each G_FNEG encountered. +static void stripBitcastAndFNeg(Register Src, Register &OutSrc, + const MachineRegisterInfo &MRI, bool &FNeg) { + MachineInstr *MI = MRI.getVRegDef(Src); + while (MI->getOpcode() == AMDGPU::G_BITCAST || + MI->getOpcode() == AMDGPU::G_FNEG) { + if (MI->getOpcode() == AMDGPU::G_FNEG) + FNeg ^= true; + OutSrc = MI->getOperand(1).getReg(); + MI = MRI.getVRegDef(OutSrc); + } +} + +// Figure out if the source register is actually the high 16 bits of another +// register. +static bool isHiElt(Register Src, Register &OutSrc, + const MachineRegisterInfo &MRI, bool &FNeg) { + bool CurrentFNeg = false; + stripBitcastAndFNeg(Src, Src, MRI, CurrentFNeg); + MachineInstr *MI = MRI.getVRegDef(Src); + + // Strip EXT instructions + if (MI->getOpcode() == AMDGPU::G_ANYEXT || + MI->getOpcode() == AMDGPU::G_ZEXT || MI->getOpcode() == AMDGPU::G_SEXT) { + Src = MI->getOperand(1).getReg(); + MI = MRI.getVRegDef(Src); + } + + // G_BITCAST or G_FNEG can appear after extensions, so strip them + // before continuing. + stripBitcastAndFNeg(Src, Src, MRI, CurrentFNeg); + MI = MRI.getVRegDef(Src); + + if (MI->getOpcode() != AMDGPU::G_TRUNC && MI->getOpcode() != AMDGPU::G_LSHR) + return false; + + if (MI->getOpcode() == AMDGPU::G_TRUNC) { + Src = MI->getOperand(1).getReg(); + MI = MRI.getVRegDef(Src); + } + + if (MI->getOpcode() == AMDGPU::G_LSHR) { + Register ShiftAmt = MI->getOperand(2).getReg(); + auto MaybeImmVal = getConstantVRegValWithLookThrough(ShiftAmt, MRI); + if (MaybeImmVal && MaybeImmVal->Value.getSExtValue() == 16) { + OutSrc = MI->getOperand(1).getReg(); + FNeg ^= CurrentFNeg; + return true; + } + } + + return false; +} + +// Determine if we are looking at the low 16 bits of a dword register. +static void stripTruncLoElt(Register Src, Register &OutSrc, + const MachineRegisterInfo &MRI, bool &FNeg) { + bool CurrentFNeg = false; + stripBitcastAndFNeg(Src, Src, MRI, CurrentFNeg); + MachineInstr *MI = MRI.getVRegDef(Src); + + // Strip EXT instructions + if (MI->getOpcode() == AMDGPU::G_ANYEXT || + MI->getOpcode() == AMDGPU::G_ZEXT || MI->getOpcode() == AMDGPU::G_SEXT) { + Src = MI->getOperand(1).getReg(); + MI = MRI.getVRegDef(Src); + } + + // G_BITCAST or G_FNEG can appear after extensions, so strip them + // before continuing. + stripBitcastAndFNeg(Src, Src, MRI, CurrentFNeg); + MI = MRI.getVRegDef(Src); + + if (MI->getOpcode() == AMDGPU::G_TRUNC) { + Register TruncSrc = MI->getOperand(1).getReg(); + const LLT Ty = MRI.getType(TruncSrc); + if (Ty.getSizeInBits() == 32) { + FNeg ^= CurrentFNeg; + OutSrc = TruncSrc; + } + } +} + +// Determine if the instruction pattern matches that of a G_BUILD_VECTOR_TRUNC +// that has been lowered. If so, return true and return the sources in +// LoSrcOut and HiSrcOut.
+static bool isBuildVectorTrunc(MachineInstr *MI, Register &LoSrcOut, + Register &HiSrcOut, + const MachineRegisterInfo &MRI) { + // Strip G_BITCAST + Register Src = MI->getOperand(1).getReg(); + MI = MRI.getVRegDef(Src); + + // After the G_BITCAST there should be a G_OR + if (MI->getOpcode() == AMDGPU::G_OR) { + Register LoSrc = MI->getOperand(1).getReg(); + Register HiSrc = MI->getOperand(2).getReg(); + + MachineInstr *LoMI = MRI.getVRegDef(LoSrc); + MachineInstr *HiMI = MRI.getVRegDef(HiSrc); + + // The G_OR operands should be the results of a G_AND 0xffff (Lo) and a + // G_SHL 16 (Hi) + if (LoMI->getOpcode() == AMDGPU::G_AND && + HiMI->getOpcode() == AMDGPU::G_SHL) { + Register AndMask = LoMI->getOperand(2).getReg(); + auto MaybeImmVal1 = getConstantVRegValWithLookThrough(AndMask, MRI); + Register ShiftAmt = HiMI->getOperand(2).getReg(); + auto MaybeImmVal2 = getConstantVRegValWithLookThrough(ShiftAmt, MRI); + + if (MaybeImmVal1 && MaybeImmVal1->Value.getSExtValue() == 0xffff && + MaybeImmVal2 && MaybeImmVal2->Value.getSExtValue() == 16) { + // The pattern matches a G_BUILD_VECTOR_TRUNC; return the source + // registers for the Lo and Hi sources. + LoSrcOut = LoMI->getOperand(1).getReg(); + HiSrcOut = HiMI->getOperand(1).getReg(); + return true; + } + } + } + + return false; +} + std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3PModsImpl( Register Src, const MachineRegisterInfo &MRI) const { @@ -3337,7 +3462,114 @@ MI = MRI.getVRegDef(Src); } - // TODO: Match op_sel through g_build_vector_trunc and g_shuffle_vector. + Register VecSrc = Src; + unsigned VecMods = Mods; + + SmallVector<int, 2> Mask = {0, 1}; + // Match op_sel through G_SHUFFLE_VECTOR or set mask values if a possibility + // of a G_BUILD_VECTOR_TRUNC is detected. + if (MI->getOpcode() == AMDGPU::G_SHUFFLE_VECTOR) { + ArrayRef<int> ShufMask = MI->getOperand(3).getShuffleMask(); + assert(ShufMask.size() == 2); + assert(ShufMask[0] != -1 && ShufMask[1] != -1); + + // Set mask values for G_SHUFFLE_VECTOR + Mask[0] = ShufMask[0]; + Mask[1] = ShufMask[1]; + + // Strip G_SHUFFLE_VECTOR + Src = MI->getOperand(1).getReg(); + MI = MRI.getVRegDef(Src); + + // Strip any G_FNEG before a potential G_BUILD_VECTOR_TRUNC + if (MI->getOpcode() == AMDGPU::G_FNEG) { + Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI); + Src = MI->getOperand(1).getReg(); + MI = MRI.getVRegDef(Src); + } + + // Lowering of G_BUILD_VECTOR_TRUNC always inserts a G_BITCAST. If we do + // not see one, do not look any further and just set op_sel based on the + // shuffle mask. + if (MI->getOpcode() != AMDGPU::G_BITCAST) { + // Add op_sel modifiers based on the shuffle mask. + if (Mask[0] == Mask[1] && Mask[0] == 1) + // ShuffleMask of (1,1). Both selects are for the high 16 bits. + Mods |= (SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1); + else if (Mask[0] == 1) + // ShuffleMask of (1,0). The first select is for the high 16 bits while + // the second select is for the low 16 bits. + Mods |= SISrcMods::OP_SEL_0; + else + // ShuffleMask of (0,1). The first select is for the low 16 bits while + // the second select is for the high 16 bits. + Mods |= SISrcMods::OP_SEL_1; + + return std::make_pair(Src, Mods); + } + } + + // Match op_sel through G_BUILD_VECTOR_TRUNC, which always inserts a G_BITCAST. + if (MI && MI->getOpcode() == AMDGPU::G_BITCAST) { + Register LoSrc; + Register HiSrc; + // Look for the pattern of a lowered G_BUILD_VECTOR_TRUNC and return the + // registers of the source elements. + if (isBuildVectorTrunc(MI, LoSrc, HiSrc, MRI)) { + // In the case of G_SHUFFLE_VECTOR, use the mask to select the Lo and Hi MIs. + // The default mask is (0,1).
+ MachineInstr *LoMI = + Mask[0] == 0 ? MRI.getVRegDef(LoSrc) : MRI.getVRegDef(HiSrc); + MachineInstr *HiMI = + Mask[1] == 1 ? MRI.getVRegDef(HiSrc) : MRI.getVRegDef(LoSrc); + + // Update Lo and Hi source registers + LoSrc = LoMI->getOperand(0).getReg(); + HiSrc = HiMI->getOperand(0).getReg(); + + // Keep track of G_FNEG modifiers when we strip them. + bool FNegLo = false; + bool FNegHi = false; + + // Determine if LoSrc is actually from a high 16-bit source + if (isHiElt(LoSrc, LoSrc, MRI, FNegLo)) + Mods |= SISrcMods::OP_SEL_0; + + // Determine if HiSrc is actually from a high 16-bit source + if (isHiElt(HiSrc, HiSrc, MRI, FNegHi)) + Mods |= SISrcMods::OP_SEL_1; + + stripTruncLoElt(LoSrc, LoSrc, MRI, FNegLo); + + stripTruncLoElt(HiSrc, HiSrc, MRI, FNegHi); + + // Final strip of G_BITCASTs and G_FNEGs + stripBitcastAndFNeg(LoSrc, LoSrc, MRI, FNegLo); + stripBitcastAndFNeg(HiSrc, HiSrc, MRI, FNegHi); + + // Apply G_FNEG modifiers + if (FNegLo) + Mods ^= SISrcMods::NEG; + if (FNegHi) + Mods ^= SISrcMods::NEG_HI; + + LoMI = MRI.getVRegDef(LoSrc); + HiMI = MRI.getVRegDef(HiSrc); + if (LoMI == HiMI) { + Src = LoSrc; + return std::make_pair(Src, Mods); + } + } else { + bool FNeg = false; + stripBitcastAndFNeg(Src, Src, MRI, FNeg); + if (FNeg) + Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI); + + return std::make_pair(Src, Mods); + } + } + Src = VecSrc; + Mods = VecMods; // Packed instructions do not have abs modifiers. Mods |= SISrcMods::OP_SEL_1; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll @@ -272,23 +272,20 @@ ; GFX906-LABEL: v_sdot2_fnegv2f16_c: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 -; GFX906-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 +; GFX906-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1] ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: v_sdot2_fnegv2f16_c: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 -; GFX908-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 +; GFX908-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sdot2_fnegv2f16_c: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 -; GFX10-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 +; GFX10-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1] ; GFX10-NEXT: s_setpc_b64 s[30:31] %neg.c = fneg <2 x half> %c %cast.neg.c = bitcast <2 x half> %neg.c to i32 @@ -300,23 +297,20 @@ ; GFX906-LABEL: v_sdot2_shuffle10_a: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_alignbit_b32 v0, v0, v0, 16 -; GFX906-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 +; GFX906-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[0,1,1] ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: v_sdot2_shuffle10_a: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_alignbit_b32 v0, v0, v0, 16 -; GFX908-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 +; GFX908-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[0,1,1] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: 
v_sdot2_shuffle10_a: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_alignbit_b32 v0, v0, v0, 16 -; GFX10-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 +; GFX10-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[0,1,1] ; GFX10-NEXT: s_setpc_b64 s[30:31] %shuf.a = shufflevector <2 x i16> %a, <2 x i16> undef, <2 x i32> <i32 1, i32 0> %r = call i32 @llvm.amdgcn.sdot2(<2 x i16> %shuf.a, <2 x i16> %b, i32 %c, i1 false) @@ -327,23 +321,20 @@ ; GFX906-LABEL: v_sdot2_shuffle10_b: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_alignbit_b32 v1, v1, v1, 16 -; GFX906-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 +; GFX906-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 op_sel:[0,1,0] op_sel_hi:[1,0,1] ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: v_sdot2_shuffle10_b: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_alignbit_b32 v1, v1, v1, 16 -; GFX908-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 +; GFX908-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 op_sel:[0,1,0] op_sel_hi:[1,0,1] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sdot2_shuffle10_b: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_alignbit_b32 v1, v1, v1, 16 -; GFX10-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 +; GFX10-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 op_sel:[0,1,0] op_sel_hi:[1,0,1] ; GFX10-NEXT: s_setpc_b64 s[30:31] %shuf.b = shufflevector <2 x i16> %b, <2 x i16> undef, <2 x i32> <i32 1, i32 0> %r = call i32 @llvm.amdgcn.sdot2(<2 x i16> %a, <2 x i16> %shuf.b, i32 %c, i1 false) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll @@ -114,16 +114,14 @@ ; GFX906-LABEL: v_sdot4_fnegv2f16_a: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 -; GFX906-NEXT: v_dot4_i32_i8 v0, v0, v1, v2 +; GFX906-NEXT: v_dot4_i32_i8 v0, v0, v1, v2 op_sel_hi:[0,1,1] neg_lo:[1,0,0] neg_hi:[1,0,0] ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sdot4_fnegv2f16_a: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 -; GFX10-NEXT: v_dot4_i32_i8 v0, v0, v1, v2 +; GFX10-NEXT: v_dot4_i32_i8 v0, v0, v1, v2 op_sel_hi:[0,1,1] neg_lo:[1,0,0] neg_hi:[1,0,0] ; GFX10-NEXT: s_setpc_b64 s[30:31] %neg.a = fneg <2 x half> %a %cast.neg.a = bitcast <2 x half> %neg.a to i32 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot8.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot8.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot8.ll @@ -70,16 +70,14 @@ ; GFX906-LABEL: v_sdot8_fnegv2f16_a: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 -; GFX906-NEXT: v_dot8_i32_i4 v0, v0, v1, v2 +; GFX906-NEXT: v_dot8_i32_i4 v0, v0, v1, v2 op_sel_hi:[0,1,1] neg_lo:[1,0,0] neg_hi:[1,0,0] ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sdot8_fnegv2f16_a: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 -; GFX10-NEXT: v_dot8_i32_i4 v0, v0, v1, v2
+; GFX10-NEXT: v_dot8_i32_i4 v0, v0, v1, v2 op_sel_hi:[0,1,1] neg_lo:[1,0,0] neg_hi:[1,0,0] ; GFX10-NEXT: s_setpc_b64 s[30:31] %neg.a = fneg <2 x half> %a %cast.neg.a = bitcast <2 x half> %neg.a to i32 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll @@ -272,23 +272,20 @@ ; GFX906-LABEL: v_udot2_fnegv2f16_c: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 -; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 +; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1] ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: v_udot2_fnegv2f16_c: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 -; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 +; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_udot2_fnegv2f16_c: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 -; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 +; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1] ; GFX10-NEXT: s_setpc_b64 s[30:31] %neg.c = fneg <2 x half> %c %cast.neg.c = bitcast <2 x half> %neg.c to i32 @@ -300,23 +297,20 @@ ; GFX906-LABEL: v_udot2_shuffle10_a: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_alignbit_b32 v0, v0, v0, 16 -; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 +; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[0,1,1] ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: v_udot2_shuffle10_a: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_alignbit_b32 v0, v0, v0, 16 -; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 +; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[0,1,1] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_udot2_shuffle10_a: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_alignbit_b32 v0, v0, v0, 16 -; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 +; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[0,1,1] ; GFX10-NEXT: s_setpc_b64 s[30:31] %shuf.a = shufflevector <2 x i16> %a, <2 x i16> undef, <2 x i32> <i32 1, i32 0> %r = call i32 @llvm.amdgcn.udot2(<2 x i16> %shuf.a, <2 x i16> %b, i32 %c, i1 false) @@ -327,23 +321,20 @@ ; GFX906-LABEL: v_udot2_shuffle10_b: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_alignbit_b32 v1, v1, v1, 16 -; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 +; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 op_sel:[0,1,0] op_sel_hi:[1,0,1] ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: v_udot2_shuffle10_b: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_alignbit_b32 v1, v1, v1, 16 -; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 +; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 op_sel:[0,1,0] op_sel_hi:[1,0,1] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_udot2_shuffle10_b: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_alignbit_b32 v1, v1, v1, 16 -; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 +; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 op_sel:[0,1,0] op_sel_hi:[1,0,1] ; GFX10-NEXT: s_setpc_b64 s[30:31] %shuf.b = shufflevector <2 x i16> %b, <2 x i16> undef, <2 x i32> <i32 1, i32 0> %r = call i32 @llvm.amdgcn.udot2(<2 x i16> %a, <2 x i16> %shuf.b, i32 %c, i1 false) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll @@ -114,16 +114,14 @@ ; GFX906-LABEL: v_udot4_fnegv2f16_a: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 -; GFX906-NEXT: v_dot4_u32_u8 v0, v0, v1, v2 +; GFX906-NEXT: v_dot4_u32_u8 v0, v0, v1, v2 op_sel_hi:[0,1,1] neg_lo:[1,0,0] neg_hi:[1,0,0] ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_udot4_fnegv2f16_a: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 -; GFX10-NEXT: v_dot4_u32_u8 v0, v0, v1, v2 +; GFX10-NEXT: v_dot4_u32_u8 v0, v0, v1, v2 op_sel_hi:[0,1,1] neg_lo:[1,0,0] neg_hi:[1,0,0] ; GFX10-NEXT: s_setpc_b64 s[30:31] %neg.a = fneg <2 x half> %a %cast.neg.a = bitcast <2 x half> %neg.a to i32 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot8.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot8.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot8.ll @@ -70,16 +70,14 @@ ; GFX906-LABEL: v_udot8_fnegv2f16_a: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 -; GFX906-NEXT: v_dot8_u32_u4 v0, v0, v1, v2 +; GFX906-NEXT: v_dot8_u32_u4 v0, v0, v1, v2 op_sel_hi:[0,1,1] neg_lo:[1,0,0] neg_hi:[1,0,0] ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_udot8_fnegv2f16_a: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 -; GFX10-NEXT: v_dot8_u32_u4 v0, v0, v1, v2 +; GFX10-NEXT: v_dot8_u32_u4 v0, v0, v1, v2 op_sel_hi:[0,1,1] neg_lo:[1,0,0] neg_hi:[1,0,0] ; GFX10-NEXT: s_setpc_b64 s[30:31] %neg.a = fneg <2 x half> %a %cast.neg.a = bitcast <2 x half> %neg.a to i32 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/op-sel.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/op-sel.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/op-sel.ll @@ -0,0 +1,728 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s + +define amdgpu_kernel void @fma_vector_vector_scalar_lo_no_shuffle(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 { +; GCN-LABEL: fma_vector_vector_scalar_lo_no_shuffle: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: ds_read_b32 v2, v0 +; GCN-NEXT: ds_read_b32 v0, v0 offset:4 +; GCN-NEXT: ds_read_u16 v1, v1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_pk_fma_f16 v0, v2, v0, v1 op_sel_hi:[1,1,0] +;
GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: global_store_dword v1, v0, s[2:3] +; GCN-NEXT: s_endpgm +bb: + %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1 + + %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4 + %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4 + %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2 + + %scalar0.vec1 = insertelement <2 x half> undef, half %scalar0, i32 0 + %scalar0.vec2 = insertelement <2 x half> %scalar0.vec1, half %scalar0, i32 1 + + %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %scalar0.vec2) + store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @fma_vector_vector_scalar_lo(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 { +; GCN-LABEL: fma_vector_vector_scalar_lo: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: ds_read_b32 v2, v0 +; GCN-NEXT: ds_read_b32 v0, v0 offset:4 +; GCN-NEXT: ds_read_u16 v1, v1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_pk_fma_f16 v0, v2, v0, v1 op_sel_hi:[1,1,0] +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: global_store_dword v1, v0, s[2:3] +; GCN-NEXT: s_endpgm +bb: + %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1 + + %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4 + %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4 + %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2 + + %scalar0.vec = insertelement <2 x half> undef, half %scalar0, i32 0 + %scalar0.broadcast = shufflevector <2 x half> %scalar0.vec, <2 x half> undef, <2 x i32> zeroinitializer + + %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %scalar0.broadcast) + store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @fma_vector_vector_neg_broadcast_scalar_lo(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 { +; GCN-LABEL: fma_vector_vector_neg_broadcast_scalar_lo: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: ds_read_b32 v2, v0 +; GCN-NEXT: ds_read_b32 v0, v0 offset:4 +; GCN-NEXT: ds_read_u16 v1, v1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_pk_fma_f16 v0, v2, v0, v1 op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: global_store_dword v1, v0, s[2:3] +; GCN-NEXT: s_endpgm +bb: + %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1 + + %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4 + %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4 + %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2 + + %scalar0.vec = insertelement <2 x half> undef, half %scalar0, i32 0 + %scalar0.broadcast = shufflevector <2 x half> %scalar0.vec, <2 x half> undef, <2 x i32> zeroinitializer + %neg.scalar0.broadcast = fneg <2 x half> %scalar0.broadcast + + %result = tail call <2 x half> @llvm.fma.v2f16(<2 x 
half> %vec0, <2 x half> %vec1, <2 x half> %neg.scalar0.broadcast) + store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @fma_vector_vector_neg_scalar_lo(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 { +; GCN-LABEL: fma_vector_vector_neg_scalar_lo: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: ds_read_b32 v2, v0 +; GCN-NEXT: ds_read_b32 v0, v0 offset:4 +; GCN-NEXT: ds_read_u16 v1, v1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_pk_fma_f16 v0, v2, v0, v1 op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: global_store_dword v1, v0, s[2:3] +; GCN-NEXT: s_endpgm +bb: + %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1 + + %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4 + %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4 + %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2 + + %neg.scalar0 = fneg half %scalar0 + %neg.scalar0.vec = insertelement <2 x half> undef, half %neg.scalar0, i32 0 + %neg.scalar0.broadcast = shufflevector <2 x half> %neg.scalar0.vec, <2 x half> undef, <2 x i32> zeroinitializer + + %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.scalar0.broadcast) + store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @fma_vector_vector_neg_broadcast_neg_scalar_lo(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 { +; GCN-LABEL: fma_vector_vector_neg_broadcast_neg_scalar_lo: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: ds_read_b32 v2, v0 +; GCN-NEXT: ds_read_b32 v0, v0 offset:4 +; GCN-NEXT: ds_read_u16 v1, v1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_pk_fma_f16 v0, v2, v0, v1 op_sel_hi:[1,1,0] +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: global_store_dword v1, v0, s[2:3] +; GCN-NEXT: s_endpgm +bb: + %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1 + + %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4 + %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4 + %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2 + + %neg.scalar0 = fneg half %scalar0 + %neg.scalar0.vec = insertelement <2 x half> undef, half %neg.scalar0, i32 0 + %neg.scalar0.broadcast = shufflevector <2 x half> %neg.scalar0.vec, <2 x half> undef, <2 x i32> zeroinitializer + %neg.neg.scalar0.broadcast = fneg <2 x half> %neg.scalar0.broadcast + + %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.neg.scalar0.broadcast) + store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @fma_vector_vector_scalar_neg_lo(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 { +; GCN-LABEL: fma_vector_vector_scalar_neg_lo: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; 
GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: ds_read_b32 v2, v0 +; GCN-NEXT: ds_read_b32 v0, v0 offset:4 +; GCN-NEXT: ds_read_u16 v1, v1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_pk_fma_f16 v0, v2, v0, v1 op_sel_hi:[1,1,0] neg_lo:[0,0,1] +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: global_store_dword v1, v0, s[2:3] +; GCN-NEXT: s_endpgm +bb: + %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1 + + %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4 + %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4 + %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2 + + %neg.scalar0 = fneg half %scalar0 + %neg.scalar0.vec = insertelement <2 x half> undef, half %neg.scalar0, i32 0 + %neg.scalar0.scalar0 = insertelement <2 x half> %neg.scalar0.vec, half %scalar0, i32 1 + %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.scalar0.scalar0) + store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @fma_vector_vector_scalar_neg_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 { +; GCN-LABEL: fma_vector_vector_scalar_neg_hi: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: ds_read_b32 v2, v0 +; GCN-NEXT: ds_read_b32 v0, v0 offset:4 +; GCN-NEXT: ds_read_u16 v1, v1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_pk_fma_f16 v0, v2, v0, v1 op_sel_hi:[1,1,0] neg_hi:[0,0,1] +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: global_store_dword v1, v0, s[2:3] +; GCN-NEXT: s_endpgm +bb: + %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1 + + %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4 + %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4 + %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2 + + %neg.scalar0 = fneg half %scalar0 + %neg.scalar0.vec = insertelement <2 x half> undef, half %scalar0, i32 0 + %scalar0.neg.scalar0 = insertelement <2 x half> %neg.scalar0.vec, half %neg.scalar0, i32 1 + %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %scalar0.neg.scalar0) + store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @add_vector_neg_bitcast_scalar_lo(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 { +; GCN-LABEL: add_vector_neg_bitcast_scalar_lo: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: ds_read_b32 v0, v0 +; GCN-NEXT: ds_read_u16 v1, v1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_pk_add_u16 v0, v0, v1 op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1] +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: global_store_dword v1, v0, s[2:3] +; GCN-NEXT: s_endpgm +bb: + %vec0 = load volatile <2 x i16>, <2 x i16> addrspace(3)* %lds, align 4 + %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2 + %neg.scalar0 = fneg half %scalar0 + %neg.scalar0.bc = bitcast half %neg.scalar0 to i16 + + %neg.scalar0.vec = 
insertelement <2 x i16> undef, i16 %neg.scalar0.bc, i32 0 + %neg.scalar0.broadcast = shufflevector <2 x i16> %neg.scalar0.vec, <2 x i16> undef, <2 x i32> zeroinitializer + + %result = add <2 x i16> %vec0, %neg.scalar0.broadcast + store <2 x i16> %result, <2 x i16> addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @fma_vector_vector_neg_vector_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 { +; GCN-LABEL: fma_vector_vector_neg_vector_hi: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: ds_read_b32 v1, v0 +; GCN-NEXT: ds_read_b32 v2, v0 offset:4 +; GCN-NEXT: ds_read_b32 v0, v0 offset:8 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 op_sel:[0,0,1] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: global_store_dword v1, v0, s[2:3] +; GCN-NEXT: s_endpgm +bb: + %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1 + %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2 + + %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4 + %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4 + %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4 + + %vec2.fneg = fneg <2 x half> %vec2 + %vec2.fneg.elt1.broadcast = shufflevector <2 x half> %vec2.fneg, <2 x half> undef, <2 x i32> <i32 1, i32 1> + + %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %vec2.fneg.elt1.broadcast) + store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @fma_vector_vector_vector_neg_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 { +; GCN-LABEL: fma_vector_vector_vector_neg_hi: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: ds_read_b32 v1, v0 +; GCN-NEXT: ds_read_b32 v2, v0 offset:4 +; GCN-NEXT: ds_read_b32 v0, v0 offset:8 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 neg_hi:[0,0,1] +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: global_store_dword v1, v0, s[2:3] +; GCN-NEXT: s_endpgm +bb: + %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1 + %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2 + + %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4 + %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4 + %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4 + + %vec2.elt1 = extractelement <2 x half> %vec2, i32 1 + %neg.vec2.elt1 = fneg half %vec2.elt1 + + %neg.vec2.elt1.insert = insertelement <2 x half> %vec2, half %neg.vec2.elt1, i32 1 + %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.vec2.elt1.insert) + store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @add_vector_scalar_hi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(3)* %lds) #0 { +; GCN-LABEL: add_vector_scalar_hi: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: ds_read_b32 v1, v0 +; GCN-NEXT: ds_read_b32 v0, v0 offset:4 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_pk_add_u16 v0, v1, v0 op_sel:[0,1] +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: global_store_dword v1, v0, s[2:3] +; GCN-NEXT: s_endpgm +bb: + %lds.gep1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(3)* %lds, i32 1 + + %vec0 = load volatile <2 x i16>, <2 x i16> addrspace(3)* %lds, align 4 + %vec1 = load volatile <2 x i16>, <2 x i16> addrspace(3)* %lds.gep1, align 4 + + %vec1.elt1.broadcast = shufflevector <2 x i16> %vec1, <2 x i16> undef, <2 x i32> <i32 1, i32 1> + %result = add <2 x i16> %vec0, %vec1.elt1.broadcast + + store <2 x i16> %result, <2 x i16> addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @fma_vector_vector_scalar_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 { +; GCN-LABEL: fma_vector_vector_scalar_hi: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: ds_read_b32 v1, v0 +; GCN-NEXT: ds_read_b32 v2, v0 offset:4 +; GCN-NEXT: ds_read_b32 v0, v0 offset:8 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 op_sel:[0,0,1] +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: global_store_dword v1, v0, s[2:3] +; GCN-NEXT: s_endpgm +bb: + %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1 + %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2 + + %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4 + %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4 + %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4 + + %vec2.elt1.broadcast = shufflevector <2 x half> %vec2, <2 x half> undef, <2 x i32> <i32 1, i32 1> + + %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %vec2.elt1.broadcast) + + store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @fma_vector_vector_neg_vector_lo_neg_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 { +; GCN-LABEL: fma_vector_vector_neg_vector_lo_neg_hi: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: ds_read_b32 v1, v0 +; GCN-NEXT: ds_read_b32 v2, v0 offset:4 +; GCN-NEXT: ds_read_b32 v0, v0 offset:8 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: global_store_dword v1, v0, s[2:3] +; GCN-NEXT: s_endpgm +bb: + %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1 + %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2 + + %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4 + %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4 + %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4 + + %neg.vec2 = fneg <2 x half> %vec2 + %neg.vec2.elt1 = extractelement <2 x half> %neg.vec2, i32 1 + %neg.neg.vec2.elt1 = fneg half %neg.vec2.elt1 + %neg.neg.vec2.elt1.insert = insertelement <2 x half> %vec2, half %neg.neg.vec2.elt1, i32 1 + + %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.neg.vec2.elt1.insert) + store <2 x half> %result,
<2 x half> addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @fma_vector_vector_swap_vector(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 { +; GCN-LABEL: fma_vector_vector_swap_vector: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: ds_read_b32 v1, v0 +; GCN-NEXT: ds_read_b32 v2, v0 offset:4 +; GCN-NEXT: ds_read_b32 v0, v0 offset:8 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 op_sel:[0,0,1] op_sel_hi:[1,1,0] +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: global_store_dword v1, v0, s[2:3] +; GCN-NEXT: s_endpgm +bb: + %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1 + %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2 + + %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4 + %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4 + %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4 + + %vec2.swap = shufflevector <2 x half> %vec2, <2 x half> undef, <2 x i32> <i32 1, i32 0> + %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %vec2.swap) + + store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @fma_vector_vector_swap_neg_vector(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 { +; GCN-LABEL: fma_vector_vector_swap_neg_vector: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: ds_read_b32 v1, v0 +; GCN-NEXT: ds_read_b32 v2, v0 offset:4 +; GCN-NEXT: ds_read_b32 v0, v0 offset:8 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 op_sel:[0,0,1] op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: global_store_dword v1, v0, s[2:3] +; GCN-NEXT: s_endpgm +bb: + %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1 + %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2 + + %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4 + %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4 + %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4 + %neg.vec2 = fneg <2 x half> %vec2 + + %neg.vec2.swap = shufflevector <2 x half> %neg.vec2, <2 x half> undef, <2 x i32> <i32 1, i32 0> + %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.vec2.swap) + + store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @fma_vector_vector_blend_vector_neg_vector_0(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 { +; GCN-LABEL: fma_vector_vector_blend_vector_neg_vector_0: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: ds_read_b32 v1, v0 +; GCN-NEXT: ds_read_b32 v2, v0 offset:4 +; GCN-NEXT: ds_read_b32 v0, v0 offset:8 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 op_sel:[0,0,1] op_sel_hi:[1,1,0] neg_lo:[0,0,1] +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: global_store_dword v1, v0, s[2:3] +; GCN-NEXT: s_endpgm +bb: + %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1 + %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2 + + %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4 + %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4 + %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4 + %neg.vec2 = fneg <2 x half> %vec2 + %combined = shufflevector <2 x half> %vec2, <2 x half> %neg.vec2, <2 x i32> <i32 3, i32 0> + %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %combined) + + store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @fma_vector_vector_blend_vector_neg_vector_1(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 { +; GCN-LABEL: fma_vector_vector_blend_vector_neg_vector_1: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: ds_read_b32 v1, v0 +; GCN-NEXT: ds_read_b32 v2, v0 offset:4 +; GCN-NEXT: ds_read_b32 v0, v0 offset:8 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 neg_lo:[0,0,1] +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: global_store_dword v1, v0, s[2:3] +; GCN-NEXT: s_endpgm +bb: + %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1 + %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2 + + %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4 + %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4 + %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4 + %neg.vec2 = fneg <2 x half> %vec2 + %combined = shufflevector <2 x half> %vec2, <2 x half> %neg.vec2, <2 x i32> <i32 2, i32 1> + %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %combined) + + store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @fma_vector_vector_blend_vector_neg_vector_2(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 { +; GCN-LABEL: fma_vector_vector_blend_vector_neg_vector_2: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: ds_read_b32 v1, v0 +; GCN-NEXT: ds_read_b32 v2, v0 offset:4 +; GCN-NEXT: ds_read_b32 v0, v0 offset:8 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 neg_hi:[0,0,1] +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: global_store_dword v1, v0, s[2:3] +; GCN-NEXT: s_endpgm +bb: + %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1 + %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2 + + %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4 + %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4 + %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4 + %neg.vec2 = fneg <2 x half> %vec2 + %combined = shufflevector <2 x half> %vec2, <2 x half> %neg.vec2, <2 x i32> <i32 0, i32 3> + %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %combined) + + store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4 + ret
void +} + +define amdgpu_kernel void @fma_vector_vector_blend_vector_neg_vector_3(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 { +; GCN-LABEL: fma_vector_vector_blend_vector_neg_vector_3: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: ds_read_b32 v1, v0 +; GCN-NEXT: ds_read_b32 v2, v0 offset:4 +; GCN-NEXT: ds_read_b32 v0, v0 offset:8 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 op_sel:[0,0,1] neg_lo:[0,0,1] +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: global_store_dword v1, v0, s[2:3] +; GCN-NEXT: s_endpgm +bb: + %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1 + %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2 + + %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4 + %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4 + %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4 + %neg.vec2 = fneg <2 x half> %vec2 + %combined = shufflevector <2 x half> %vec2, <2 x half> %neg.vec2, <2 x i32> + %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %combined) + + store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @bitcast_fneg_f32(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 { +; GCN-LABEL: bitcast_fneg_f32: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: ds_read_b32 v0, v0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: ds_read_b32 v1, v0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_pk_add_f16 v0, v0, v1 op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1] +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: global_store_dword v1, v0, s[2:3] +; GCN-NEXT: s_endpgm +bb: + %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4 + %f32 = load volatile float, float addrspace(3)* undef, align 4 + %neg.f32 = fneg float %f32 + %bc = bitcast float %neg.f32 to <2 x half> + %result = fadd <2 x half> %vec0, %bc + + store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @shuffle_bitcast_fneg_f32(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 { +; GCN-LABEL: shuffle_bitcast_fneg_f32: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: ds_read_b32 v0, v0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: ds_read_b32 v1, v0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_pk_add_f16 v0, v0, v1 op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1] +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: global_store_dword v1, v0, s[2:3] +; GCN-NEXT: s_endpgm +bb: + %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4 + + %f32 = load volatile float, float addrspace(3)* undef, align 4 + %neg.f32 = fneg float %f32 + %bc = bitcast float %neg.f32 to <2 x half> + %shuf = shufflevector <2 x half> %bc, <2 x half> undef, <2 x i32> + %result = fadd <2 x half> %vec0, %shuf + store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @bitcast_lo_elt_op_sel(<2 x half> 
addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 { +; GCN-LABEL: bitcast_lo_elt_op_sel: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: ds_read_b32 v1, v0 +; GCN-NEXT: ds_read_b32 v2, v0 offset:4 +; GCN-NEXT: ds_read_b32 v0, v0 offset:8 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: global_load_ushort v3, v[0:1], off glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_pk_add_f16 v0, v0, 2.0 op_sel_hi:[1,0] +; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 op_sel:[0,0,1] op_sel_hi:[1,1,0] +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: global_store_dword v1, v0, s[2:3] +; GCN-NEXT: s_endpgm +bb: + %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1 + %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2 + + %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4 + %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4 + %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4 + + %scalar0 = load volatile i16, i16 addrspace(1)* undef + %shl = shl i16 %scalar0, 1 + %shl.bc = bitcast i16 %shl to half + + %fadd = fadd <2 x half> %vec2, + %shuffle = shufflevector <2 x half> %fadd, <2 x half> %vec2, <2 x i32> + + %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %shuffle) + store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @mix_elt_types_op_sel(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 { +; GCN-LABEL: mix_elt_types_op_sel: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: ds_read_b32 v1, v0 +; GCN-NEXT: ds_read_b32 v2, v0 offset:4 +; GCN-NEXT: ds_read_b32 v0, v0 offset:8 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: global_load_ushort v3, v[0:1], off glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: global_load_ushort v3, v[0:1], off glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; kill: killed $vgpr0_vgpr1 +; GCN-NEXT: v_pk_add_f16 v0, v0, 2.0 op_sel_hi:[1,0] +; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 op_sel:[0,0,1] op_sel_hi:[1,1,0] +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: global_store_dword v1, v0, s[2:3] +; GCN-NEXT: s_endpgm +bb: + %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1 + %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2 + + %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4 + %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4 + %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4 + + %scalar0 = load volatile i16, i16 addrspace(1)* undef + %scalar1 = load volatile half, half addrspace(1)* undef + %shl = shl i16 %scalar0, 1 + %shl.bc = bitcast i16 %shl to half + + %insert0 = insertelement <2 x half> undef, half %shl.bc, i32 0 + + %fadd = fadd <2 x half> %vec2, + %insert1 = shufflevector <2 x half> %fadd, <2 x half> %insert0, <2 x i32> + + %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %insert1) + store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4 + ret void +} + +declare <2 x half> @llvm.fma.v2f16(<2 x half>, <2 x half>, <2 x half>) #1 + +attributes #0 
= { nounwind } +attributes #1 = { nounwind readnone } +
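
Note on the matched patterns: by the time selectVOP3PModsImpl runs, G_BUILD_VECTOR_TRUNC has already been lowered, so isBuildVectorTrunc/isHiElt recognize the generic MIR shape it leaves behind rather than the opcode itself. A minimal sketch of that shape is shown below; it is illustrative only, with register names invented here rather than taken from an actual -stop-after dump.

    ; Lowered G_BUILD_VECTOR_TRUNC, the shape isBuildVectorTrunc() looks for:
    %mask:_(s32) = G_CONSTANT i32 65535
    %sixteen:_(s32) = G_CONSTANT i32 16
    %lo_bits:_(s32) = G_AND %lo32, %mask       ; Lo element kept in bits [15:0]
    %hi_bits:_(s32) = G_SHL %hi32, %sixteen    ; Hi element moved to bits [31:16]
    %packed:_(s32) = G_OR %lo_bits, %hi_bits
    %vec:_(<2 x s16>) = G_BITCAST %packed

    ; Hi-element source, the shape isHiElt() looks for behind %lo32/%hi32:
    %shifted:_(s32) = G_LSHR %full, %sixteen
    %elt:_(s16) = G_TRUNC %shifted

Once the G_AND 0xffff / G_SHL 16 / G_OR / G_BITCAST chain is found, the Lo and Hi sources are traced back through G_TRUNC, extensions, G_BITCAST and G_FNEG so that op_sel, neg_lo and neg_hi can be folded into the VOP3P instruction, which is what removes the v_xor_b32/v_alignbit_b32 instructions from the test checks above.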