Index: llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -3405,26 +3405,124 @@
   }};
 }
 
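+// If Src is defined by a G_BITCAST, return the bitcast's source, else Src.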
+static Register stripBitcast(const MachineRegisterInfo &MRI, Register Src) {
+  const MachineInstr *MI = MRI.getVRegDef(Src);
+  return MI->getOpcode() == AMDGPU::G_BITCAST ? MI->getOperand(1).getReg()
+                                              : Src;
+}
+
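+// Match In = (Out lshr 16): In's low 16 bits are the high 16 bits of Out.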
+static bool isShiftHiToLo(const MachineRegisterInfo &MRI, Register In,
+                          Register &Out) {
+  Register Tmp;
+  if (mi_match(In, MRI, m_GLShr(m_Reg(Tmp), m_SpecificICst(16)))) {
+    Out = Tmp;
+    return true;
+  }
+  return false;
+}
+
 std::pair<Register, unsigned>
 AMDGPUInstructionSelector::selectVOP3PModsImpl(
   Register Src, const MachineRegisterInfo &MRI) const {
-  unsigned Mods = 0;
-  MachineInstr *MI = MRI.getVRegDef(Src);
+  unsigned Mods = SISrcMods::OP_SEL_1; // op_sel_hi is set by default.
+
+  // Instructions that can be folded into source modifiers can appear an
+  // arbitrary number of times and in any order, so iterate to a fixed point.
+  Register OldSrc;
+  while (OldSrc != Src) {
+    OldSrc = Src;
+
+    Register LoSrc, HiSrc;
+    MachineInstr *MI = MRI.getVRegDef(Src);
+    const unsigned Opcode = MI->getOpcode();
+
+    if (Opcode == AMDGPU::G_FNEG &&
+        // It's possible to see an f32 fneg here, but unlikely.
+        // TODO: Treat f32 fneg as only high bit.
+        MRI.getType(Src) == LLT::fixed_vector(2, 16)) {
+      Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
+      Src = MI->getOperand(1).getReg();
+    } else if (Opcode == AMDGPU::G_BITCAST) {
+      // Strip bitcast
+      Src = MI->getOperand(1).getReg();
+    } else if (Opcode == AMDGPU::G_SHUFFLE_VECTOR) {
+      // Check if the <2 x s16> vector is shuffled and update op_sel modifiers.
+      ArrayRef<int> ShuffleMask = MI->getOperand(3).getShuffleMask();
+      if (ShuffleMask.size() == 2 &&
+          MRI.getType(Src) == LLT::fixed_vector(2, 16)) {
+        Register Vec1 = MI->getOperand(1).getReg();
+        Register Vec2 = MI->getOperand(2).getReg();
+        Register NewSrc;
+        if ((ShuffleMask[0] < 2 && ShuffleMask[1] < 2) || (Vec1 == Vec2))
+          NewSrc = Vec1;
+        else if (ShuffleMask[0] > 1 && ShuffleMask[1] > 1)
+          NewSrc = Vec2;
+
+        if (NewSrc) {
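+          // An odd mask index selects the high half of NewSrc, an even index
+          // the low half; toggle the op_sel bits to match.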
+          if (ShuffleMask[0] & 1)
+            Mods ^= SISrcMods::OP_SEL_0;
+          if (!(ShuffleMask[1] & 1))
+            Mods ^= SISrcMods::OP_SEL_1;
+          Src = NewSrc;
+        }
+      }
+    } else if (mi_match(Src, MRI,
+                        m_GOr(m_GAnd(m_Reg(LoSrc), m_SpecificICst(0xffff)),
+                              m_GShl(m_Reg(HiSrc), m_SpecificICst(16))))) {
+      // LoSrc and HiSrc represent s32 registers whose low 16 bits are used to
+      // form the <2 x s16> operand of a packed instruction.
+
+      bool NegLo = false, LoIsShifted = false;
+      bool LoIsImplicitDef =
+          MRI.getVRegDef(LoSrc)->getOpcode() == AMDGPU::G_IMPLICIT_DEF;
+      // Ignore LoSrc if it's undef.
+      if (!LoIsImplicitDef) {
+        // Check for an fneg of a 16-bit float. Since LoSrc is s32, the fneg
+        // will be wrapped in anyext and trunc.
+        NegLo =
+            mi_match(LoSrc, MRI, m_GAnyExt(m_GFNeg(m_GTrunc(m_Reg(LoSrc)))));
+        // Check whether the high 16 bits of LoSrc are used instead.
+        LoIsShifted = isShiftHiToLo(MRI, LoSrc, LoSrc);
+        // Look through a <2 x s16> to s32 bitcast.
+        LoSrc = stripBitcast(MRI, LoSrc);
+        // A potential fneg of <2 x s16> will not be wrapped in anyext and
+        // trunc like the one above.
+        NegLo ^= mi_match(LoSrc, MRI, m_GFNeg(m_Reg(LoSrc)));
+      }
 
-  if (MI && MI->getOpcode() == AMDGPU::G_FNEG &&
-      // It's possible to see an f32 fneg here, but unlikely.
-      // TODO: Treat f32 fneg as only high bit.
-      MRI.getType(Src) == LLT::fixed_vector(2, 16)) {
-    Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
-    Src = MI->getOperand(1).getReg();
-    MI = MRI.getVRegDef(Src);
-  }
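+      // HiIsShifted defaults to true so an undef HiSrc keeps OP_SEL_1 set.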
+      bool NegHi = false, HiIsShifted = true;
+      bool HiIsImplicitDef =
+          MRI.getVRegDef(HiSrc)->getOpcode() == AMDGPU::G_IMPLICIT_DEF;
+      if (!HiIsImplicitDef) {
+        NegHi =
+            mi_match(HiSrc, MRI, m_GAnyExt(m_GFNeg(m_GTrunc(m_Reg(HiSrc)))));
+        HiIsShifted = isShiftHiToLo(MRI, HiSrc, HiSrc);
+        HiSrc = stripBitcast(MRI, HiSrc);
+        NegHi ^= mi_match(HiSrc, MRI, m_GFNeg(m_Reg(HiSrc)));
+      }
 
-  // TODO: Match op_sel through g_build_vector_trunc and g_shuffle_vector.
+      // If LoSrc and HiSrc are the same register (or at least one is undef),
+      // we can use it as the new source with appropriate source modifiers.
+      if (LoIsImplicitDef || HiIsImplicitDef || LoSrc == HiSrc) {
+        if (NegLo)
+          Mods ^= SISrcMods::NEG;
+        if (LoIsShifted)
+          Mods ^= SISrcMods::OP_SEL_0;
+        if (NegHi)
+          Mods ^= SISrcMods::NEG_HI;
+        if (!HiIsShifted)
+          Mods ^= SISrcMods::OP_SEL_1;
+        Src = LoIsImplicitDef ? HiSrc : LoSrc;
+      }
+    }
+  }
 
   // Packed instructions do not have abs modifiers.
-  Mods |= SISrcMods::OP_SEL_1;
-
   return std::make_pair(Src, Mods);
 }
 
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll
@@ -786,229 +786,117 @@
 ; GFX9-LABEL: test_3xhalf_add_mul_rhs:
 ; GFX9:       ; %bb.0: ; %.entry
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
-; GFX9-NEXT:    v_mov_b32_e32 v9, 0xffff
-; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX9-NEXT:    v_and_or_b32 v0, v0, v9, v6
-; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
-; GFX9-NEXT:    v_and_or_b32 v2, v2, v9, v6
-; GFX9-NEXT:    s_lshl_b32 s4, s4, 16
 ; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 16, v4
-; GFX9-NEXT:    v_and_or_b32 v1, v1, v9, s4
-; GFX9-NEXT:    v_and_or_b32 v3, v3, v9, s4
-; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX9-NEXT:    v_pk_add_f16 v0, v4, v0
 ; GFX9-NEXT:    v_pk_mul_f16 v1, v1, v3
-; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v8
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-NEXT:    v_and_or_b32 v3, v4, v9, v3
-; GFX9-NEXT:    v_and_or_b32 v0, v0, v9, v2
-; GFX9-NEXT:    v_pk_add_f16 v0, v3, v0
-; GFX9-NEXT:    v_and_or_b32 v4, v5, v9, s4
-; GFX9-NEXT:    v_and_or_b32 v1, v1, v9, s4
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX9-NEXT:    v_pk_add_f16 v1, v4, v1
+; GFX9-NEXT:    v_pk_add_f16 v1, v5, v1
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0xffff
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-NEXT:    v_and_or_b32 v0, v0, v9, v2
-; GFX9-NEXT:    v_and_or_b32 v1, v1, v9, s4
+; GFX9-NEXT:    s_lshl_b32 s4, s4, 16
+; GFX9-NEXT:    v_and_or_b32 v0, v0, v3, v2
+; GFX9-NEXT:    v_and_or_b32 v1, v1, v3, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-CONTRACT-LABEL: test_3xhalf_add_mul_rhs:
 ; GFX9-CONTRACT:       ; %bb.0: ; %.entry
 ; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-CONTRACT-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
-; GFX9-CONTRACT-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
-; GFX9-CONTRACT-NEXT:    v_mov_b32_e32 v9, 0xffff
-; GFX9-CONTRACT-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX9-CONTRACT-NEXT:    v_lshrrev_b32_e32 v8, 16, v4
-; GFX9-CONTRACT-NEXT:    v_and_or_b32 v0, v0, v9, v6
-; GFX9-CONTRACT-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
-; GFX9-CONTRACT-NEXT:    v_and_or_b32 v2, v2, v9, v6
-; GFX9-CONTRACT-NEXT:    v_lshlrev_b32_e32 v6, 16, v8
-; GFX9-CONTRACT-NEXT:    v_and_or_b32 v4, v4, v9, v6
-; GFX9-CONTRACT-NEXT:    s_lshl_b32 s4, s4, 16
 ; GFX9-CONTRACT-NEXT:    v_pk_fma_f16 v0, v0, v2, v4
-; GFX9-CONTRACT-NEXT:    v_and_or_b32 v1, v1, v9, s4
-; GFX9-CONTRACT-NEXT:    v_and_or_b32 v3, v3, v9, s4
-; GFX9-CONTRACT-NEXT:    v_and_or_b32 v5, v5, v9, s4
 ; GFX9-CONTRACT-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX9-CONTRACT-NEXT:    v_pk_fma_f16 v1, v1, v3, v5
+; GFX9-CONTRACT-NEXT:    v_mov_b32_e32 v3, 0xffff
 ; GFX9-CONTRACT-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-CONTRACT-NEXT:    v_and_or_b32 v0, v0, v9, v2
-; GFX9-CONTRACT-NEXT:    v_and_or_b32 v1, v1, v9, s4
+; GFX9-CONTRACT-NEXT:    s_lshl_b32 s4, s4, 16
+; GFX9-CONTRACT-NEXT:    v_and_or_b32 v0, v0, v3, v2
+; GFX9-CONTRACT-NEXT:    v_and_or_b32 v1, v1, v3, s4
 ; GFX9-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-DENORM-LABEL: test_3xhalf_add_mul_rhs:
 ; GFX9-DENORM:       ; %bb.0: ; %.entry
 ; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-DENORM-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
-; GFX9-DENORM-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
-; GFX9-DENORM-NEXT:    v_mov_b32_e32 v9, 0xffff
-; GFX9-DENORM-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX9-DENORM-NEXT:    v_and_or_b32 v0, v0, v9, v6
-; GFX9-DENORM-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
-; GFX9-DENORM-NEXT:    v_and_or_b32 v2, v2, v9, v6
-; GFX9-DENORM-NEXT:    s_lshl_b32 s4, s4, 16
 ; GFX9-DENORM-NEXT:    v_pk_mul_f16 v0, v0, v2
-; GFX9-DENORM-NEXT:    v_lshrrev_b32_e32 v8, 16, v4
-; GFX9-DENORM-NEXT:    v_and_or_b32 v1, v1, v9, s4
-; GFX9-DENORM-NEXT:    v_and_or_b32 v3, v3, v9, s4
-; GFX9-DENORM-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX9-DENORM-NEXT:    v_pk_add_f16 v0, v4, v0
 ; GFX9-DENORM-NEXT:    v_pk_mul_f16 v1, v1, v3
-; GFX9-DENORM-NEXT:    v_lshlrev_b32_e32 v3, 16, v8
-; GFX9-DENORM-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-DENORM-NEXT:    v_and_or_b32 v3, v4, v9, v3
-; GFX9-DENORM-NEXT:    v_and_or_b32 v0, v0, v9, v2
-; GFX9-DENORM-NEXT:    v_pk_add_f16 v0, v3, v0
-; GFX9-DENORM-NEXT:    v_and_or_b32 v4, v5, v9, s4
-; GFX9-DENORM-NEXT:    v_and_or_b32 v1, v1, v9, s4
 ; GFX9-DENORM-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX9-DENORM-NEXT:    v_pk_add_f16 v1, v4, v1
+; GFX9-DENORM-NEXT:    v_pk_add_f16 v1, v5, v1
+; GFX9-DENORM-NEXT:    v_mov_b32_e32 v3, 0xffff
 ; GFX9-DENORM-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-DENORM-NEXT:    v_and_or_b32 v0, v0, v9, v2
-; GFX9-DENORM-NEXT:    v_and_or_b32 v1, v1, v9, s4
+; GFX9-DENORM-NEXT:    s_lshl_b32 s4, s4, 16
+; GFX9-DENORM-NEXT:    v_and_or_b32 v0, v0, v3, v2
+; GFX9-DENORM-NEXT:    v_and_or_b32 v1, v1, v3, s4
 ; GFX9-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-UNSAFE-LABEL: test_3xhalf_add_mul_rhs:
 ; GFX9-UNSAFE:       ; %bb.0: ; %.entry
 ; GFX9-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-UNSAFE-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
-; GFX9-UNSAFE-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
-; GFX9-UNSAFE-NEXT:    v_mov_b32_e32 v9, 0xffff
-; GFX9-UNSAFE-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX9-UNSAFE-NEXT:    v_lshrrev_b32_e32 v8, 16, v4
-; GFX9-UNSAFE-NEXT:    v_and_or_b32 v0, v0, v9, v6
-; GFX9-UNSAFE-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
-; GFX9-UNSAFE-NEXT:    v_and_or_b32 v2, v2, v9, v6
-; GFX9-UNSAFE-NEXT:    v_lshlrev_b32_e32 v6, 16, v8
-; GFX9-UNSAFE-NEXT:    v_and_or_b32 v4, v4, v9, v6
-; GFX9-UNSAFE-NEXT:    s_lshl_b32 s4, s4, 16
 ; GFX9-UNSAFE-NEXT:    v_pk_fma_f16 v0, v0, v2, v4
-; GFX9-UNSAFE-NEXT:    v_and_or_b32 v1, v1, v9, s4
-; GFX9-UNSAFE-NEXT:    v_and_or_b32 v3, v3, v9, s4
-; GFX9-UNSAFE-NEXT:    v_and_or_b32 v5, v5, v9, s4
 ; GFX9-UNSAFE-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX9-UNSAFE-NEXT:    v_pk_fma_f16 v1, v1, v3, v5
+; GFX9-UNSAFE-NEXT:    v_mov_b32_e32 v3, 0xffff
 ; GFX9-UNSAFE-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-UNSAFE-NEXT:    v_and_or_b32 v0, v0, v9, v2
-; GFX9-UNSAFE-NEXT:    v_and_or_b32 v1, v1, v9, s4
+; GFX9-UNSAFE-NEXT:    s_lshl_b32 s4, s4, 16
+; GFX9-UNSAFE-NEXT:    v_and_or_b32 v0, v0, v3, v2
+; GFX9-UNSAFE-NEXT:    v_and_or_b32 v1, v1, v3, s4
 ; GFX9-UNSAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_3xhalf_add_mul_rhs:
 ; GFX10:       ; %bb.0: ; %.entry
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
-; GFX10-NEXT:    v_mov_b32_e32 v8, 0xffff
-; GFX10-NEXT:    s_lshl_b32 s4, s4, 16
-; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX10-NEXT:    v_and_or_b32 v1, v1, v8, s4
-; GFX10-NEXT:    v_and_or_b32 v3, v3, v8, s4
-; GFX10-NEXT:    v_and_or_b32 v0, v0, v8, v6
-; GFX10-NEXT:    v_and_or_b32 v2, v2, v8, v7
-; GFX10-NEXT:    v_pk_mul_f16 v1, v1, v3
 ; GFX10-NEXT:    v_pk_mul_f16 v0, v0, v2
-; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v4
-; GFX10-NEXT:    v_and_or_b32 v1, v1, v8, s4
-; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
+; GFX10-NEXT:    v_pk_mul_f16 v1, v1, v3
+; GFX10-NEXT:    v_mov_b32_e32 v3, 0xffff
+; GFX10-NEXT:    s_lshl_b32 s4, s4, 16
+; GFX10-NEXT:    v_pk_add_f16 v0, v4, v0
+; GFX10-NEXT:    v_pk_add_f16 v1, v5, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX10-NEXT:    v_and_or_b32 v1, v1, v3, s4
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX10-NEXT:    v_and_or_b32 v2, v4, v8, v2
-; GFX10-NEXT:    v_and_or_b32 v0, v0, v8, v6
-; GFX10-NEXT:    v_pk_add_f16 v0, v2, v0
-; GFX10-NEXT:    v_and_or_b32 v2, v5, v8, s4
-; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX10-NEXT:    v_pk_add_f16 v1, v2, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX10-NEXT:    v_and_or_b32 v1, v1, v8, s4
-; GFX10-NEXT:    v_and_or_b32 v0, v0, v8, v3
+; GFX10-NEXT:    v_and_or_b32 v0, v0, v3, v2
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-CONTRACT-LABEL: test_3xhalf_add_mul_rhs:
 ; GFX10-CONTRACT:       ; %bb.0: ; %.entry
 ; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-CONTRACT-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-CONTRACT-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
-; GFX10-CONTRACT-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
-; GFX10-CONTRACT-NEXT:    v_lshrrev_b32_e32 v8, 16, v4
-; GFX10-CONTRACT-NEXT:    v_mov_b32_e32 v9, 0xffff
-; GFX10-CONTRACT-NEXT:    s_lshl_b32 s4, s4, 16
-; GFX10-CONTRACT-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX10-CONTRACT-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX10-CONTRACT-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX10-CONTRACT-NEXT:    v_and_or_b32 v1, v1, v9, s4
-; GFX10-CONTRACT-NEXT:    v_and_or_b32 v0, v0, v9, v6
-; GFX10-CONTRACT-NEXT:    v_and_or_b32 v2, v2, v9, v7
-; GFX10-CONTRACT-NEXT:    v_and_or_b32 v4, v4, v9, v8
 ; GFX10-CONTRACT-NEXT:    v_pk_fma_f16 v0, v0, v2, v4
-; GFX10-CONTRACT-NEXT:    v_and_or_b32 v2, v3, v9, s4
-; GFX10-CONTRACT-NEXT:    v_and_or_b32 v4, v5, v9, s4
-; GFX10-CONTRACT-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX10-CONTRACT-NEXT:    v_pk_fma_f16 v1, v1, v2, v4
-; GFX10-CONTRACT-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX10-CONTRACT-NEXT:    v_and_or_b32 v1, v1, v9, s4
-; GFX10-CONTRACT-NEXT:    v_and_or_b32 v0, v0, v9, v3
+; GFX10-CONTRACT-NEXT:    v_mov_b32_e32 v4, 0xffff
+; GFX10-CONTRACT-NEXT:    v_pk_fma_f16 v1, v1, v3, v5
+; GFX10-CONTRACT-NEXT:    s_lshl_b32 s4, s4, 16
+; GFX10-CONTRACT-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX10-CONTRACT-NEXT:    v_and_or_b32 v1, v1, v4, s4
+; GFX10-CONTRACT-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-CONTRACT-NEXT:    v_and_or_b32 v0, v0, v4, v2
 ; GFX10-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-DENORM-LABEL: test_3xhalf_add_mul_rhs:
 ; GFX10-DENORM:       ; %bb.0: ; %.entry
 ; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-DENORM-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
-; GFX10-DENORM-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
-; GFX10-DENORM-NEXT:    v_mov_b32_e32 v8, 0xffff
-; GFX10-DENORM-NEXT:    s_lshl_b32 s4, s4, 16
-; GFX10-DENORM-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX10-DENORM-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX10-DENORM-NEXT:    v_and_or_b32 v1, v1, v8, s4
-; GFX10-DENORM-NEXT:    v_and_or_b32 v3, v3, v8, s4
-; GFX10-DENORM-NEXT:    v_and_or_b32 v0, v0, v8, v6
-; GFX10-DENORM-NEXT:    v_and_or_b32 v2, v2, v8, v7
-; GFX10-DENORM-NEXT:    v_pk_mul_f16 v1, v1, v3
 ; GFX10-DENORM-NEXT:    v_pk_mul_f16 v0, v0, v2
-; GFX10-DENORM-NEXT:    v_lshrrev_b32_e32 v2, 16, v4
-; GFX10-DENORM-NEXT:    v_and_or_b32 v1, v1, v8, s4
-; GFX10-DENORM-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
+; GFX10-DENORM-NEXT:    v_pk_mul_f16 v1, v1, v3
+; GFX10-DENORM-NEXT:    v_mov_b32_e32 v3, 0xffff
+; GFX10-DENORM-NEXT:    s_lshl_b32 s4, s4, 16
+; GFX10-DENORM-NEXT:    v_pk_add_f16 v0, v4, v0
+; GFX10-DENORM-NEXT:    v_pk_add_f16 v1, v5, v1
+; GFX10-DENORM-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX10-DENORM-NEXT:    v_and_or_b32 v1, v1, v3, s4
 ; GFX10-DENORM-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-DENORM-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX10-DENORM-NEXT:    v_and_or_b32 v2, v4, v8, v2
-; GFX10-DENORM-NEXT:    v_and_or_b32 v0, v0, v8, v6
-; GFX10-DENORM-NEXT:    v_pk_add_f16 v0, v2, v0
-; GFX10-DENORM-NEXT:    v_and_or_b32 v2, v5, v8, s4
-; GFX10-DENORM-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX10-DENORM-NEXT:    v_pk_add_f16 v1, v2, v1
-; GFX10-DENORM-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX10-DENORM-NEXT:    v_and_or_b32 v1, v1, v8, s4
-; GFX10-DENORM-NEXT:    v_and_or_b32 v0, v0, v8, v3
+; GFX10-DENORM-NEXT:    v_and_or_b32 v0, v0, v3, v2
 ; GFX10-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-UNSAFE-LABEL: test_3xhalf_add_mul_rhs:
 ; GFX10-UNSAFE:       ; %bb.0: ; %.entry
 ; GFX10-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-UNSAFE-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-UNSAFE-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
-; GFX10-UNSAFE-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
-; GFX10-UNSAFE-NEXT:    v_lshrrev_b32_e32 v8, 16, v4
-; GFX10-UNSAFE-NEXT:    v_mov_b32_e32 v9, 0xffff
-; GFX10-UNSAFE-NEXT:    s_lshl_b32 s4, s4, 16
-; GFX10-UNSAFE-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX10-UNSAFE-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX10-UNSAFE-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX10-UNSAFE-NEXT:    v_and_or_b32 v1, v1, v9, s4
-; GFX10-UNSAFE-NEXT:    v_and_or_b32 v0, v0, v9, v6
-; GFX10-UNSAFE-NEXT:    v_and_or_b32 v2, v2, v9, v7
-; GFX10-UNSAFE-NEXT:    v_and_or_b32 v4, v4, v9, v8
 ; GFX10-UNSAFE-NEXT:    v_pk_fma_f16 v0, v0, v2, v4
-; GFX10-UNSAFE-NEXT:    v_and_or_b32 v2, v3, v9, s4
-; GFX10-UNSAFE-NEXT:    v_and_or_b32 v4, v5, v9, s4
-; GFX10-UNSAFE-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX10-UNSAFE-NEXT:    v_pk_fma_f16 v1, v1, v2, v4
-; GFX10-UNSAFE-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX10-UNSAFE-NEXT:    v_and_or_b32 v1, v1, v9, s4
-; GFX10-UNSAFE-NEXT:    v_and_or_b32 v0, v0, v9, v3
+; GFX10-UNSAFE-NEXT:    v_mov_b32_e32 v4, 0xffff
+; GFX10-UNSAFE-NEXT:    v_pk_fma_f16 v1, v1, v3, v5
+; GFX10-UNSAFE-NEXT:    s_lshl_b32 s4, s4, 16
+; GFX10-UNSAFE-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX10-UNSAFE-NEXT:    v_and_or_b32 v1, v1, v4, s4
+; GFX10-UNSAFE-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-UNSAFE-NEXT:    v_and_or_b32 v0, v0, v4, v2
 ; GFX10-UNSAFE-NEXT:    s_setpc_b64 s[30:31]
 .entry:
   %a = fmul <3 x half> %x, %y
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll
@@ -272,23 +272,20 @@
 ; GFX906-LABEL: v_sdot2_fnegv2f16_c:
 ; GFX906:       ; %bb.0:
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
-; GFX906-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2
+; GFX906-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: v_sdot2_fnegv2f16_c:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
-; GFX908-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2
+; GFX908-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_sdot2_fnegv2f16_c:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
-; GFX10-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2
+; GFX10-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %neg.c = fneg <2 x half> %c
   %cast.neg.c = bitcast <2 x half> %neg.c to i32
@@ -300,23 +297,20 @@
 ; GFX906-LABEL: v_sdot2_shuffle10_a:
 ; GFX906:       ; %bb.0:
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT:    v_alignbit_b32 v0, v0, v0, 16
-; GFX906-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2
+; GFX906-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[0,1,1]
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: v_sdot2_shuffle10_a:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    v_alignbit_b32 v0, v0, v0, 16
-; GFX908-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2
+; GFX908-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[0,1,1]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_sdot2_shuffle10_a:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_alignbit_b32 v0, v0, v0, 16
-; GFX10-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2
+; GFX10-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[0,1,1]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %shuf.a = shufflevector <2 x i16> %a, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
   %r = call i32 @llvm.amdgcn.sdot2(<2 x i16> %shuf.a, <2 x i16> %b, i32 %c, i1 false)
@@ -327,23 +321,20 @@
 ; GFX906-LABEL: v_sdot2_shuffle10_b:
 ; GFX906:       ; %bb.0:
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT:    v_alignbit_b32 v1, v1, v1, 16
-; GFX906-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2
+; GFX906-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2 op_sel:[0,1,0] op_sel_hi:[1,0,1]
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: v_sdot2_shuffle10_b:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    v_alignbit_b32 v1, v1, v1, 16
-; GFX908-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2
+; GFX908-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2 op_sel:[0,1,0] op_sel_hi:[1,0,1]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_sdot2_shuffle10_b:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_alignbit_b32 v1, v1, v1, 16
-; GFX10-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2
+; GFX10-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2 op_sel:[0,1,0] op_sel_hi:[1,0,1]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %shuf.b = shufflevector <2 x i16> %b, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
   %r = call i32 @llvm.amdgcn.sdot2(<2 x i16> %a, <2 x i16> %shuf.b, i32 %c, i1 false)
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll
@@ -114,16 +114,14 @@
 ; GFX906-LABEL: v_sdot4_fnegv2f16_a:
 ; GFX906:       ; %bb.0:
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
-; GFX906-NEXT:    v_dot4_i32_i8 v0, v0, v1, v2
+; GFX906-NEXT:    v_dot4_i32_i8 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_sdot4_fnegv2f16_a:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
-; GFX10-NEXT:    v_dot4_i32_i8 v0, v0, v1, v2
+; GFX10-NEXT:    v_dot4_i32_i8 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %neg.a = fneg <2 x half> %a
   %cast.neg.a = bitcast <2 x half> %neg.a to i32
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot8.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot8.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot8.ll
@@ -70,16 +70,14 @@
 ; GFX906-LABEL: v_sdot8_fnegv2f16_a:
 ; GFX906:       ; %bb.0:
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
-; GFX906-NEXT:    v_dot8_i32_i4 v0, v0, v1, v2
+; GFX906-NEXT:    v_dot8_i32_i4 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_sdot8_fnegv2f16_a:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
-; GFX10-NEXT:    v_dot8_i32_i4 v0, v0, v1, v2
+; GFX10-NEXT:    v_dot8_i32_i4 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %neg.a = fneg <2 x half> %a
   %cast.neg.a = bitcast <2 x half> %neg.a to i32
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll
@@ -272,23 +272,20 @@
 ; GFX906-LABEL: v_udot2_fnegv2f16_c:
 ; GFX906:       ; %bb.0:
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
-; GFX906-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2
+; GFX906-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: v_udot2_fnegv2f16_c:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
-; GFX908-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2
+; GFX908-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_udot2_fnegv2f16_c:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
-; GFX10-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2
+; GFX10-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %neg.c = fneg <2 x half> %c
   %cast.neg.c = bitcast <2 x half> %neg.c to i32
@@ -300,23 +297,20 @@
 ; GFX906-LABEL: v_udot2_shuffle10_a:
 ; GFX906:       ; %bb.0:
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT:    v_alignbit_b32 v0, v0, v0, 16
-; GFX906-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2
+; GFX906-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[0,1,1]
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: v_udot2_shuffle10_a:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    v_alignbit_b32 v0, v0, v0, 16
-; GFX908-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2
+; GFX908-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[0,1,1]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_udot2_shuffle10_a:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_alignbit_b32 v0, v0, v0, 16
-; GFX10-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2
+; GFX10-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[0,1,1]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %shuf.a = shufflevector <2 x i16> %a, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
   %r = call i32 @llvm.amdgcn.udot2(<2 x i16> %shuf.a, <2 x i16> %b, i32 %c, i1 false)
@@ -327,23 +321,20 @@
 ; GFX906-LABEL: v_udot2_shuffle10_b:
 ; GFX906:       ; %bb.0:
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT:    v_alignbit_b32 v1, v1, v1, 16
-; GFX906-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2
+; GFX906-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2 op_sel:[0,1,0] op_sel_hi:[1,0,1]
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: v_udot2_shuffle10_b:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    v_alignbit_b32 v1, v1, v1, 16
-; GFX908-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2
+; GFX908-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2 op_sel:[0,1,0] op_sel_hi:[1,0,1]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_udot2_shuffle10_b:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_alignbit_b32 v1, v1, v1, 16
-; GFX10-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2
+; GFX10-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2 op_sel:[0,1,0] op_sel_hi:[1,0,1]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %shuf.b = shufflevector <2 x i16> %b, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
   %r = call i32 @llvm.amdgcn.udot2(<2 x i16> %a, <2 x i16> %shuf.b, i32 %c, i1 false)
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll
@@ -114,16 +114,14 @@
 ; GFX906-LABEL: v_udot4_fnegv2f16_a:
 ; GFX906:       ; %bb.0:
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
-; GFX906-NEXT:    v_dot4_u32_u8 v0, v0, v1, v2
+; GFX906-NEXT:    v_dot4_u32_u8 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_udot4_fnegv2f16_a:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
-; GFX10-NEXT:    v_dot4_u32_u8 v0, v0, v1, v2
+; GFX10-NEXT:    v_dot4_u32_u8 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %neg.a = fneg <2 x half> %a
   %cast.neg.a = bitcast <2 x half> %neg.a to i32
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot8.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot8.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot8.ll
@@ -70,16 +70,14 @@
 ; GFX906-LABEL: v_udot8_fnegv2f16_a:
 ; GFX906:       ; %bb.0:
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
-; GFX906-NEXT:    v_dot8_u32_u4 v0, v0, v1, v2
+; GFX906-NEXT:    v_dot8_u32_u4 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_udot8_fnegv2f16_a:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
-; GFX10-NEXT:    v_dot8_u32_u4 v0, v0, v1, v2
+; GFX10-NEXT:    v_dot8_u32_u4 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %neg.a = fneg <2 x half> %a
   %cast.neg.a = bitcast <2 x half> %neg.a to i32
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/packed-op-sel.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/packed-op-sel.ll
@@ -0,0 +1,215 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -global-isel -verify-machineinstrs < %s | FileCheck %s
+
+declare <2 x half> @llvm.fma.v2f16(<2 x half>, <2 x half>, <2 x half>)
+declare <3 x half> @llvm.fma.v3f16(<3 x half>, <3 x half>, <3 x half>)
+declare <4 x half> @llvm.fma.v4f16(<4 x half>, <4 x half>, <4 x half>)
+
+; flip elements by extracting and inserting
+define <2 x half> @insert_flip(<2 x half> %vecx, <2 x half> %vecy, <2 x half> %vecz) {
+; CHECK-LABEL: insert_flip:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_waitcnt_vscnt null, 0x0
+; CHECK-NEXT:    v_pk_fma_f16 v0, v0, v1, v2 op_sel:[0,0,1] op_sel_hi:[1,1,0]
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %vecz0 = extractelement <2 x half> %vecz, i32 0
+  %vecz1 = extractelement <2 x half> %vecz, i32 1
+  %newvecz0 = insertelement <2 x half> undef, half %vecz0, i32 1
+  %newvecz = insertelement <2 x half> %newvecz0, half %vecz1, i32 0
+  %res = call <2 x half> @llvm.fma.v2f16(<2 x half> %vecx, <2 x half> %vecy, <2 x half> %newvecz)
+  ret <2 x half> %res
+}
+
+; flip elements by extracting and inserting with various negates
+define <2 x half> @insert_flip_negate(<2 x half> %vecx, <2 x half> %vecy, <2 x half> %vecz) {
+; CHECK-LABEL: insert_flip_negate:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_waitcnt_vscnt null, 0x0
+; CHECK-NEXT:    v_pk_fma_f16 v0, v0, v1, v2 op_sel:[0,0,1] op_sel_hi:[1,1,0] neg_lo:[0,0,1]
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %negz = fneg <2 x half> %vecz
+  %vecz0 = extractelement <2 x half> %negz, i32 1
+  %vecz1 = extractelement <2 x half> %vecz, i32 0
+  %negz0 = fneg half %vecz0
+  %negz1 = fneg half %vecz1
+  %newvecz0 = insertelement <2 x half> undef, half %negz0, i32 0
+  %newvecz = insertelement <2 x half> %newvecz0, half %negz1, i32 1
+  %negnewvecz = fneg <2 x half> %newvecz
+  %res = call <2 x half> @llvm.fma.v2f16(<2 x half> %vecx, <2 x half> %vecy, <2 x half> %negnewvecz)
+  ret <2 x half> %res
+}
+
+; make <2 x s16> vectors where either the low or the high half is undef
+define <2 x half> @insert_with_undef(<2 x half> %vecx, <2 x half> %vecy, <2 x half> %vecz) {
+; CHECK-LABEL: insert_with_undef:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_waitcnt_vscnt null, 0x0
+; CHECK-NEXT:    v_pk_fma_f16 v0, v0, v1, v2
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %vecx1 = extractelement <2 x half> %vecx, i32 1
+  %newvecx = insertelement <2 x half> undef, half %vecx1, i32 1
+
+  %vecy0 = extractelement <2 x half> %vecy, i32 0
+  %newvecy = insertelement <2 x half> undef, half %vecy0, i32 0
+  %res = call <2 x half> @llvm.fma.v2f16(<2 x half> %newvecx, <2 x half> %newvecy, <2 x half> %vecz)
+  ret <2 x half> %res
+}
+
+; multiple flips by extract/insert
+define <2 x half> @multi_insert_flip_negate(<2 x half> %vecx, <2 x half> %vecy, <2 x half> %vecz) {
+; CHECK-LABEL: multi_insert_flip_negate:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_waitcnt_vscnt null, 0x0
+; CHECK-NEXT:    v_pk_fma_f16 v0, v0, v1, v2 op_sel:[0,0,1] op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1]
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %Ae0 = extractelement <2 x half> %vecz, i32 0
+  %Ae1 = extractelement <2 x half> %vecz, i32 1
+  %Avec0 = insertelement <2 x half> undef, half %Ae0, i32 0
+  %Avec = insertelement <2 x half> %Avec0, half %Ae1, i32 1
+
+  %Anegvec = fneg <2 x half> %Avec
+
+  %Be0 = extractelement <2 x half> %Anegvec, i32 0
+  %Be1 = extractelement <2 x half> %Anegvec, i32 1
+  %Bvec0 = insertelement <2 x half> undef, half %Be1, i32 0
+  %Bvec = insertelement <2 x half> %Bvec0, half %Be0, i32 1
+
+  %res = call <2 x half> @llvm.fma.v2f16(<2 x half> %vecx, <2 x half> %vecy, <2 x half> %Bvec)
+  ret <2 x half> %res
+}
+
+; shuffle elements
+define <2 x half> @shuffle_v2f16(<2 x half> %vecx, <2 x half> %vecy, <2 x half> %vecz) {
+; CHECK-LABEL: shuffle_v2f16:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_waitcnt_vscnt null, 0x0
+; CHECK-NEXT:    v_pk_fma_f16 v0, v0, v1, v2 op_sel:[1,0,1] op_sel_hi:[1,1,0]
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %shufflex = shufflevector <2 x half> %vecx, <2 x half> undef, <2 x i32> <i32 1, i32 1>
+  %shuffley = shufflevector <2 x half> undef, <2 x half> %vecy, <2 x i32> <i32 2, i32 3>
+  %shufflez = shufflevector <2 x half> %vecz, <2 x half> %vecz, <2 x i32> <i32 3, i32 0>
+  %res = call <2 x half> @llvm.fma.v2f16(<2 x half> %shufflex, <2 x half> %shuffley, <2 x half> %shufflez)
+  ret <2 x half> %res
+}
+
+; shuffle elements then negate
+define <2 x half> @shuffle_v2f16_negate_after(<2 x half> %vecx, <2 x half> %vecy, <2 x half> %vecz) {
+; CHECK-LABEL: shuffle_v2f16_negate_after:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_waitcnt_vscnt null, 0x0
+; CHECK-NEXT:    v_pk_fma_f16 v0, v0, v1, v2 op_sel:[0,0,1] op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1]
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %flipz = shufflevector <2 x half> %vecz, <2 x half> undef, <2 x i32> <i32 1, i32 0>
+  %negz = fneg <2 x half> %flipz
+  %res = call <2 x half> @llvm.fma.v2f16(<2 x half> %vecx, <2 x half> %vecy, <2 x half> %negz)
+  ret <2 x half> %res
+}
+
+; negate elements then shuffle
+define <2 x half> @shuffle_v2f16_negate_before(<2 x half> %vecx, <2 x half> %vecy, <2 x half> %vecz) {
+; CHECK-LABEL: shuffle_v2f16_negate_before:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_waitcnt_vscnt null, 0x0
+; CHECK-NEXT:    v_pk_fma_f16 v0, v0, v1, v2 op_sel:[0,0,1] op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1]
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %negz = fneg <2 x half> %vecz
+  %flipz = shufflevector <2 x half> %negz, <2 x half> undef, <2 x i32> <i32 1, i32 0>
+  %res = call <2 x half> @llvm.fma.v2f16(<2 x half> %vecx, <2 x half> %vecy, <2 x half> %flipz)
+  ret <2 x half> %res
+}
+
+; consecutive shuffles
+define <2 x half> @multi_shuffle_v2f16(<2 x half> %vecx, <2 x half> %vecy, <2 x half> %vecz) {
+; CHECK-LABEL: multi_shuffle_v2f16:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_waitcnt_vscnt null, 0x0
+; CHECK-NEXT:    v_pk_fma_f16 v0, v0, v1, v2 op_sel:[0,0,1]
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %flipz = shufflevector <2 x half> %vecz, <2 x half> undef, <2 x i32> <i32 1, i32 0>
+  %flipzz = shufflevector <2 x half> %flipz, <2 x half> undef, <2 x i32> <i32 0, i32 0>
+  %flipzzz = shufflevector <2 x half> %flipzz, <2 x half> undef, <2 x i32> <i32 0, i32 1>
+  %res = call <2 x half> @llvm.fma.v2f16(<2 x half> %vecx, <2 x half> %vecy, <2 x half> %flipzzz)
+  ret <2 x half> %res
+}
+
+; shuffle elements (shufflevector on vectors with more than 2 elements does not use G_SHUFFLE_VECTOR)
+define <4 x half> @shuffle_v4f16(<4 x half> %vecx, <4 x half> %vecy, <4 x half> %vecz) {
+; CHECK-LABEL: shuffle_v4f16:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_waitcnt_vscnt null, 0x0
+; CHECK-NEXT:    v_pk_fma_f16 v0, v0, v2, v4 op_sel:[0,0,1] op_sel_hi:[1,1,0]
+; CHECK-NEXT:    v_pk_fma_f16 v1, v1, v3, v5 op_sel_hi:[1,1,0]
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %flipz = shufflevector <4 x half> %vecz, <4 x half> undef, <4 x i32> <i32 1, i32 0, i32 2, i32 2>
+  %res = call <4 x half> @llvm.fma.v4f16(<4 x half> %vecx, <4 x half> %vecy, <4 x half> %flipz)
+  ret <4 x half> %res
+}
+
+; shuffle that picks elements from both operands, but both operands are the same vector
+define <2 x half> @shuffle_v2f16_same_source(<2 x half> %vecx, <2 x half> %vecy, <2 x half> %vecz) {
+; CHECK-LABEL: shuffle_v2f16_same_source:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_waitcnt_vscnt null, 0x0
+; CHECK-NEXT:    v_pk_fma_f16 v0, v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[0,0,1]
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %flipX = shufflevector <2 x half> %vecx, <2 x half> %vecx, <2 x i32> <i32 1, i32 2>
+  %duplY = shufflevector <2 x half> %vecy, <2 x half> %vecy, <2 x i32> <i32 0, i32 2>
+  %sameZ = shufflevector <2 x half> %vecz, <2 x half> %vecz, <2 x i32> <i32 0, i32 3>
+  %res = call <2 x half> @llvm.fma.v2f16(<2 x half> %flipX, <2 x half> %duplY, <2 x half> %sameZ)
+  ret <2 x half> %res
+}
+
+; flip elements (implicit def case)
+define <3 x half> @insert_flip_v3f16(<3 x half> %vecx, <3 x half> %vecy, <3 x half> %vecz) {
+; CHECK-LABEL: insert_flip_v3f16:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_waitcnt_vscnt null, 0x0
+; CHECK-NEXT:    v_pk_fma_f16 v0, v0, v2, v4 op_sel:[0,0,1] op_sel_hi:[1,1,0]
+; CHECK-NEXT:    v_mov_b32_e32 v4, 0xffff
+; CHECK-NEXT:    v_pk_fma_f16 v1, v1, v3, v5
+; CHECK-NEXT:    s_lshl_b32 s4, s4, 16
+; CHECK-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; CHECK-NEXT:    v_and_or_b32 v1, v1, v4, s4
+; CHECK-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; CHECK-NEXT:    v_and_or_b32 v0, v0, v4, v2
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %vecz0 = extractelement <3 x half> %vecz, i32 0
+  %vecz1 = extractelement <3 x half> %vecz, i32 1
+  %vecz2 = extractelement <3 x half> %vecz, i32 2
+  %newvecz0 = insertelement <3 x half> undef,     half %vecz0, i32 1
+  %newvecz1 = insertelement <3 x half> %newvecz0, half %vecz1, i32 0
+  %newvecz  = insertelement <3 x half> %newvecz1, half %vecz2, i32 2
+  %res = call <3 x half> @llvm.fma.v3f16(<3 x half> %vecx, <3 x half> %vecy, <3 x half> %newvecz)
+  ret <3 x half> %res
+}
+
+; shuffle elements (implicit def case)
+define <3 x half> @shuffle_v3f16(<3 x half> %vecx, <3 x half> %vecy, <3 x half> %vecz) {
+; CHECK-LABEL: shuffle_v3f16:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_waitcnt_vscnt null, 0x0
+; CHECK-NEXT:    v_pk_fma_f16 v0, v0, v2, v4 op_sel:[0,0,1] op_sel_hi:[1,1,0]
+; CHECK-NEXT:    v_mov_b32_e32 v4, 0xffff
+; CHECK-NEXT:    v_pk_fma_f16 v1, v1, v3, v5
+; CHECK-NEXT:    s_lshl_b32 s4, s4, 16
+; CHECK-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; CHECK-NEXT:    v_and_or_b32 v1, v1, v4, s4
+; CHECK-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; CHECK-NEXT:    v_and_or_b32 v0, v0, v4, v2
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %flipz = shufflevector <3 x half> %vecz, <3 x half> %vecz, <3 x i32> <i32 1, i32 0, i32 2>
+  %res = call <3 x half> @llvm.fma.v3f16(<3 x half> %vecx, <3 x half> %vecy, <3 x half> %flipz)
+  ret <3 x half> %res
+}