Index: llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h
===================================================================
--- llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h
+++ llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h
@@ -444,6 +444,30 @@
   return BinaryOp_match<LHS, RHS, TargetOpcode::G_SMIN, false>(L, R);
 }
 
+template <typename SrcTy> struct Extract_match {
+  SrcTy L;
+  int64_t &CR;
+
+  Extract_match(const SrcTy &LHS, int64_t &C) : L(LHS), CR(C) {}
+  template <typename OpTy>
+  bool match(const MachineRegisterInfo &MRI, OpTy &&Op) {
+    MachineInstr *TmpMI;
+    if (mi_match(Op, MRI, m_MInstr(TmpMI))) {
+      if (TmpMI->getOpcode() == TargetOpcode::G_EXTRACT &&
+          TmpMI->getOperand(2).isImm()) {
+        CR = TmpMI->getOperand(2).getImm();
+        return L.match(MRI, TmpMI->getOperand(1).getReg());
+      }
+    }
+    return false;
+  }
+};
+
+template <typename SrcTy>
+inline Extract_match<SrcTy> m_GExtract(const SrcTy &Src, int64_t &C) {
+  return Extract_match<SrcTy>(Src, C);
+}
+
 // Helper for unary instructions (G_[ZSA]EXT/G_TRUNC) etc
 template <typename SrcTy, unsigned Opcode> struct UnaryOp_match {
   SrcTy L;
Index: llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -3385,25 +3385,131 @@
   }};
 }
 
+static Register stripBitcast(const MachineRegisterInfo &MRI, Register Src) {
+  const MachineInstr *MI = MRI.getVRegDef(Src);
+  return (MI && MI->getOpcode() == AMDGPU::G_BITCAST)
+             ? MI->getOperand(1).getReg()
+             : Src;
+}
+
+// Look through a 32-bit G_EXTRACT and return the G_CONCAT_VECTORS or
+// G_BUILD_VECTOR source operand that supplies the extracted bits.
+static Register stripExtract(const MachineRegisterInfo &MRI, Register Src) {
+  if (MRI.getType(Src).getSizeInBits() != 32)
+    return Src;
+
+  MachineInstr *Extract;
+  int64_t Value;
+
+  if (mi_match(Src, MRI, m_GExtract(m_MInstr(Extract), Value))) {
+    if (Extract->getOpcode() == AMDGPU::G_BITCAST)
+      Extract = MRI.getVRegDef(Extract->getOperand(1).getReg());
+
+    // Operand 0 is the def, so the source operands start at index 1.
+    const int64_t ElementNo = Value / 32 + 1;
+    const unsigned Opcode = Extract->getOpcode();
+    if (Opcode == AMDGPU::G_CONCAT_VECTORS || Opcode == AMDGPU::G_BUILD_VECTOR)
+      return Extract->getOperand(ElementNo).getReg();
+  }
+
+  return Src;
+}
+
+// Match In = G_LSHR X, 16 and return X in Out, i.e. the high half of X ends
+// up in the low half of In.
+static bool isShiftHiToLo(const MachineRegisterInfo &MRI, Register In,
+                          Register &Out) {
+  Register Tmp;
+  if (mi_match(In, MRI, m_GLShr(m_Reg(Tmp), m_SpecificICst(16)))) {
+    Out = Tmp;
+    return true;
+  }
+  return false;
+}
+
+static bool isImplicitDef(const MachineRegisterInfo &MRI, Register Src) {
+  const MachineInstr *MI = MRI.getVRegDef(Src);
+  return MI && MI->getOpcode() == TargetOpcode::G_IMPLICIT_DEF;
+}
+
 std::pair<Register, unsigned>
 AMDGPUInstructionSelector::selectVOP3PModsImpl(
   Register Src, const MachineRegisterInfo &MRI) const {
   unsigned Mods = 0;
   MachineInstr *MI = MRI.getVRegDef(Src);
 
-  if (MI && MI->getOpcode() == AMDGPU::G_FNEG &&
-      // It's possible to see an f32 fneg here, but unlikely.
-      // TODO: Treat f32 fneg as only high bit.
-      MRI.getType(Src) == LLT::fixed_vector(2, 16)) {
-    Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
-    Src = MI->getOperand(1).getReg();
-    MI = MRI.getVRegDef(Src);
-  }
+  // Used to set the op_sel and op_sel_hi modifiers.
+  bool lowDstUsesLowSrc = true;
+  bool highDstUsesHighSrc = true;
+
+  // Instructions that can be folded into source modifiers can appear an
+  // arbitrary number of times and in arbitrary order.
+  Register OldSrc = 0;
+  while (OldSrc != Src) {
+    OldSrc = Src;
+
+    if (MI && MI->getOpcode() == AMDGPU::G_FNEG &&
+        // It's possible to see an f32 fneg here, but unlikely.
+        // TODO: Treat f32 fneg as only high bit.
+        MRI.getType(Src) == LLT::fixed_vector(2, 16)) {
+      Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
+      Src = MI->getOperand(1).getReg();
+      MI = MRI.getVRegDef(Src);
+    }
 
-  // TODO: Match op_sel through g_build_vector_trunc and g_shuffle_vector.
+    // Strip bitcast.
+    if (MI && MI->getOpcode() == AMDGPU::G_BITCAST) {
+      Src = MI->getOperand(1).getReg();
+      MI = MRI.getVRegDef(Src);
+    }
+
+    // Check if the <2 x s16> vector is shuffled and update the op_sel
+    // modifiers.
+    if (MI && MI->getOpcode() == AMDGPU::G_SHUFFLE_VECTOR) {
+      const auto ShuffleMask = MI->getOperand(3).getShuffleMask();
+      if (ShuffleMask.size() == 2) {
+        lowDstUsesLowSrc = (ShuffleMask[0] == 0) == lowDstUsesLowSrc;
+        highDstUsesHighSrc = (ShuffleMask[1] == 1) == highDstUsesHighSrc;
+        Src = MI->getOperand(1).getReg();
+        MI = MRI.getVRegDef(Src);
+      }
+    }
+
+    // Get the low and high parts that construct the register used in the
+    // packed instruction and check whether they originate from the same
+    // register, so that op_sel modifiers can be used instead.
+    Register LoSrc, HiSrc;
+    if (mi_match(Src, MRI,
+                 m_GOr(m_GAnd(m_Reg(LoSrc), m_SpecificICst(0xffff)),
+                       m_GShl(m_Reg(HiSrc), m_SpecificICst(16))))) {
+      if (mi_match(LoSrc, MRI, m_GAnyExt(m_GFNeg(m_GTrunc(m_Reg(LoSrc))))))
+        Mods ^= SISrcMods::NEG;
+      lowDstUsesLowSrc =
+          (!isShiftHiToLo(MRI, LoSrc, LoSrc)) == lowDstUsesLowSrc;
+      LoSrc = stripBitcast(MRI, LoSrc);
+      LoSrc = stripExtract(MRI, LoSrc);
+
+      bool HiIsImplicitDef = isImplicitDef(MRI, HiSrc);
+      if (!HiIsImplicitDef) {
+        if (mi_match(HiSrc, MRI, m_GAnyExt(m_GFNeg(m_GTrunc(m_Reg(HiSrc))))))
+          Mods ^= SISrcMods::NEG_HI;
+        highDstUsesHighSrc =
+            isShiftHiToLo(MRI, HiSrc, HiSrc) == highDstUsesHighSrc;
+        HiSrc = stripBitcast(MRI, HiSrc);
+        HiSrc = stripExtract(MRI, HiSrc);
+      }
+
+      if (mi_match(LoSrc, MRI, m_GFNeg(m_Reg(LoSrc))))
+        Mods ^= SISrcMods::NEG;
+      if (mi_match(HiSrc, MRI, m_GFNeg(m_Reg(HiSrc))))
+        Mods ^= SISrcMods::NEG_HI;
+
+      if (HiIsImplicitDef || LoSrc == HiSrc) {
+        Src = LoSrc;
+        MI = MRI.getVRegDef(Src);
+      }
+    }
+  }
 
   // Packed instructions do not have abs modifiers.
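+  // For example, if Src was built as shufflevector %z, undef, <1, 0>, the
+  // loop above ends with lowDstUsesLowSrc == false and
+  // highDstUsesHighSrc == false: set OP_SEL_0 so the low result reads the
+  // high half of %z, and leave OP_SEL_1 unset so the high result reads the
+  // low half of %z.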
-  Mods |= SISrcMods::OP_SEL_1;
+  if (!lowDstUsesLowSrc)
+    Mods |= SISrcMods::OP_SEL_0;
+  if (highDstUsesHighSrc)
+    Mods |= SISrcMods::OP_SEL_1;
 
   return std::make_pair(Src, Mods);
 }
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll
@@ -622,229 +622,117 @@
 ; GFX9-LABEL: test_3xhalf_add_mul_rhs:
 ; GFX9:       ; %bb.0: ; %.entry
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
-; GFX9-NEXT:    v_mov_b32_e32 v9, 0xffff
-; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX9-NEXT:    v_and_or_b32 v0, v0, v9, v6
-; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
-; GFX9-NEXT:    v_and_or_b32 v2, v2, v9, v6
-; GFX9-NEXT:    s_lshl_b32 s4, s4, 16
 ; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 16, v4
-; GFX9-NEXT:    v_and_or_b32 v1, v1, v9, s4
-; GFX9-NEXT:    v_and_or_b32 v3, v3, v9, s4
-; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX9-NEXT:    v_pk_add_f16 v0, v4, v0
 ; GFX9-NEXT:    v_pk_mul_f16 v1, v1, v3
-; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v8
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-NEXT:    v_and_or_b32 v3, v4, v9, v3
-; GFX9-NEXT:    v_and_or_b32 v0, v0, v9, v2
-; GFX9-NEXT:    v_pk_add_f16 v0, v3, v0
-; GFX9-NEXT:    v_and_or_b32 v4, v5, v9, s4
-; GFX9-NEXT:    v_and_or_b32 v1, v1, v9, s4
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX9-NEXT:    v_pk_add_f16 v1, v4, v1
+; GFX9-NEXT:    v_pk_add_f16 v1, v5, v1
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0xffff
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-NEXT:    v_and_or_b32 v0, v0, v9, v2
-; GFX9-NEXT:    v_and_or_b32 v1, v1, v9, s4
+; GFX9-NEXT:    s_lshl_b32 s4, s4, 16
+; GFX9-NEXT:    v_and_or_b32 v0, v0, v3, v2
+; GFX9-NEXT:    v_and_or_b32 v1, v1, v3, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-CONTRACT-LABEL: test_3xhalf_add_mul_rhs:
 ; GFX9-CONTRACT:       ; %bb.0: ; %.entry
 ; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-CONTRACT-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
-; GFX9-CONTRACT-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
-; GFX9-CONTRACT-NEXT:    v_mov_b32_e32 v9, 0xffff
-; GFX9-CONTRACT-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX9-CONTRACT-NEXT:    v_lshrrev_b32_e32 v8, 16, v4
-; GFX9-CONTRACT-NEXT:    v_and_or_b32 v0, v0, v9, v6
-; GFX9-CONTRACT-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
-; GFX9-CONTRACT-NEXT:    v_and_or_b32 v2, v2, v9, v6
-; GFX9-CONTRACT-NEXT:    v_lshlrev_b32_e32 v6, 16, v8
-; GFX9-CONTRACT-NEXT:    v_and_or_b32 v4, v4, v9, v6
-; GFX9-CONTRACT-NEXT:    s_lshl_b32 s4, s4, 16
 ; GFX9-CONTRACT-NEXT:    v_pk_fma_f16 v0, v0, v2, v4
-; GFX9-CONTRACT-NEXT:    v_and_or_b32 v1, v1, v9, s4
-; GFX9-CONTRACT-NEXT:    v_and_or_b32 v3, v3, v9, s4
-; GFX9-CONTRACT-NEXT:    v_and_or_b32 v5, v5, v9, s4
 ; GFX9-CONTRACT-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX9-CONTRACT-NEXT:    v_pk_fma_f16 v1, v1, v3, v5
+; GFX9-CONTRACT-NEXT:    v_mov_b32_e32 v3, 0xffff
 ; GFX9-CONTRACT-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-CONTRACT-NEXT:    v_and_or_b32 v0, v0, v9, v2
-; GFX9-CONTRACT-NEXT:    v_and_or_b32 v1, v1, v9, s4
+; GFX9-CONTRACT-NEXT:    s_lshl_b32 s4, s4, 16
+; GFX9-CONTRACT-NEXT:    v_and_or_b32 v0, v0, v3, v2
+; GFX9-CONTRACT-NEXT:    v_and_or_b32 v1, v1, v3, s4
 ; GFX9-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-DENORM-LABEL: test_3xhalf_add_mul_rhs:
 ; GFX9-DENORM:       ; %bb.0: ; %.entry
 ; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-DENORM-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
-; GFX9-DENORM-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
-; GFX9-DENORM-NEXT:    v_mov_b32_e32 v9, 0xffff
-; GFX9-DENORM-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX9-DENORM-NEXT:    v_and_or_b32 v0, v0, v9, v6
-; GFX9-DENORM-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
-; GFX9-DENORM-NEXT:    v_and_or_b32 v2, v2, v9, v6
-; GFX9-DENORM-NEXT:    s_lshl_b32 s4, s4, 16
 ; GFX9-DENORM-NEXT:    v_pk_mul_f16 v0, v0, v2
-; GFX9-DENORM-NEXT:    v_lshrrev_b32_e32 v8, 16, v4
-; GFX9-DENORM-NEXT:    v_and_or_b32 v1, v1, v9, s4
-; GFX9-DENORM-NEXT:    v_and_or_b32 v3, v3, v9, s4
-; GFX9-DENORM-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX9-DENORM-NEXT:    v_pk_add_f16 v0, v4, v0
 ; GFX9-DENORM-NEXT:    v_pk_mul_f16 v1, v1, v3
-; GFX9-DENORM-NEXT:    v_lshlrev_b32_e32 v3, 16, v8
-; GFX9-DENORM-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-DENORM-NEXT:    v_and_or_b32 v3, v4, v9, v3
-; GFX9-DENORM-NEXT:    v_and_or_b32 v0, v0, v9, v2
-; GFX9-DENORM-NEXT:    v_pk_add_f16 v0, v3, v0
-; GFX9-DENORM-NEXT:    v_and_or_b32 v4, v5, v9, s4
-; GFX9-DENORM-NEXT:    v_and_or_b32 v1, v1, v9, s4
 ; GFX9-DENORM-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX9-DENORM-NEXT:    v_pk_add_f16 v1, v4, v1
+; GFX9-DENORM-NEXT:    v_pk_add_f16 v1, v5, v1
+; GFX9-DENORM-NEXT:    v_mov_b32_e32 v3, 0xffff
 ; GFX9-DENORM-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-DENORM-NEXT:    v_and_or_b32 v0, v0, v9, v2
-; GFX9-DENORM-NEXT:    v_and_or_b32 v1, v1, v9, s4
+; GFX9-DENORM-NEXT:    s_lshl_b32 s4, s4, 16
+; GFX9-DENORM-NEXT:    v_and_or_b32 v0, v0, v3, v2
+; GFX9-DENORM-NEXT:    v_and_or_b32 v1, v1, v3, s4
 ; GFX9-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-UNSAFE-LABEL: test_3xhalf_add_mul_rhs:
 ; GFX9-UNSAFE:       ; %bb.0: ; %.entry
 ; GFX9-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-UNSAFE-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
-; GFX9-UNSAFE-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
-; GFX9-UNSAFE-NEXT:    v_mov_b32_e32 v9, 0xffff
-; GFX9-UNSAFE-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX9-UNSAFE-NEXT:    v_lshrrev_b32_e32 v8, 16, v4
-; GFX9-UNSAFE-NEXT:    v_and_or_b32 v0, v0, v9, v6
-; GFX9-UNSAFE-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
-; GFX9-UNSAFE-NEXT:    v_and_or_b32 v2, v2, v9, v6
-; GFX9-UNSAFE-NEXT:    v_lshlrev_b32_e32 v6, 16, v8
-; GFX9-UNSAFE-NEXT:    v_and_or_b32 v4, v4, v9, v6
-; GFX9-UNSAFE-NEXT:    s_lshl_b32 s4, s4, 16
 ; GFX9-UNSAFE-NEXT:    v_pk_fma_f16 v0, v0, v2, v4
-; GFX9-UNSAFE-NEXT:    v_and_or_b32 v1, v1, v9, s4
-; GFX9-UNSAFE-NEXT:    v_and_or_b32 v3, v3, v9, s4
-; GFX9-UNSAFE-NEXT:    v_and_or_b32 v5, v5, v9, s4
 ; GFX9-UNSAFE-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX9-UNSAFE-NEXT:    v_pk_fma_f16 v1, v1, v3, v5
+; GFX9-UNSAFE-NEXT:    v_mov_b32_e32 v3, 0xffff
 ; GFX9-UNSAFE-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-UNSAFE-NEXT:    v_and_or_b32 v0, v0, v9, v2
-; GFX9-UNSAFE-NEXT:    v_and_or_b32 v1, v1, v9, s4
+; GFX9-UNSAFE-NEXT:    s_lshl_b32 s4, s4, 16
+; GFX9-UNSAFE-NEXT:    v_and_or_b32 v0, v0, v3, v2
+; GFX9-UNSAFE-NEXT:    v_and_or_b32 v1, v1, v3, s4
 ; GFX9-UNSAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_3xhalf_add_mul_rhs:
 ; GFX10:       ; %bb.0: ; %.entry
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
-; GFX10-NEXT:    v_mov_b32_e32 v8, 0xffff
-; GFX10-NEXT:    s_lshl_b32 s4, s4, 16
-; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX10-NEXT:    v_and_or_b32 v1, v1, v8, s4
-; GFX10-NEXT:    v_and_or_b32 v3, v3, v8, s4
-; GFX10-NEXT:    v_and_or_b32 v0, v0, v8, v6
-; GFX10-NEXT:    v_and_or_b32 v2, v2, v8, v7
-; GFX10-NEXT:    v_pk_mul_f16 v1, v1, v3
 ; GFX10-NEXT:    v_pk_mul_f16 v0, v0, v2
-; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v4
-; GFX10-NEXT:    v_and_or_b32 v1, v1, v8, s4
-; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
+; GFX10-NEXT:    v_pk_mul_f16 v1, v1, v3
+; GFX10-NEXT:    v_mov_b32_e32 v3, 0xffff
+; GFX10-NEXT:    s_lshl_b32 s4, s4, 16
+; GFX10-NEXT:    v_pk_add_f16 v0, v4, v0
+; GFX10-NEXT:    v_pk_add_f16 v1, v5, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX10-NEXT:    v_and_or_b32 v1, v1, v3, s4
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX10-NEXT:    v_and_or_b32 v2, v4, v8, v2
-; GFX10-NEXT:    v_and_or_b32 v0, v0, v8, v6
-; GFX10-NEXT:    v_pk_add_f16 v0, v2, v0
-; GFX10-NEXT:    v_and_or_b32 v2, v5, v8, s4
-; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX10-NEXT:    v_pk_add_f16 v1, v2, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX10-NEXT:    v_and_or_b32 v1, v1, v8, s4
-; GFX10-NEXT:    v_and_or_b32 v0, v0, v8, v3
+; GFX10-NEXT:    v_and_or_b32 v0, v0, v3, v2
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-CONTRACT-LABEL: test_3xhalf_add_mul_rhs:
 ; GFX10-CONTRACT:       ; %bb.0: ; %.entry
 ; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-CONTRACT-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-CONTRACT-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
-; GFX10-CONTRACT-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
-; GFX10-CONTRACT-NEXT:    v_lshrrev_b32_e32 v8, 16, v4
-; GFX10-CONTRACT-NEXT:    v_mov_b32_e32 v9, 0xffff
-; GFX10-CONTRACT-NEXT:    s_lshl_b32 s4, s4, 16
-; GFX10-CONTRACT-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX10-CONTRACT-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX10-CONTRACT-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX10-CONTRACT-NEXT:    v_and_or_b32 v1, v1, v9, s4
-; GFX10-CONTRACT-NEXT:    v_and_or_b32 v0, v0, v9, v6
-; GFX10-CONTRACT-NEXT:    v_and_or_b32 v2, v2, v9, v7
-; GFX10-CONTRACT-NEXT:    v_and_or_b32 v4, v4, v9, v8
 ; GFX10-CONTRACT-NEXT:    v_pk_fma_f16 v0, v0, v2, v4
-; GFX10-CONTRACT-NEXT:    v_and_or_b32 v2, v3, v9, s4
-; GFX10-CONTRACT-NEXT:    v_and_or_b32 v4, v5, v9, s4
-; GFX10-CONTRACT-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX10-CONTRACT-NEXT:    v_pk_fma_f16 v1, v1, v2, v4
-; GFX10-CONTRACT-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX10-CONTRACT-NEXT:    v_and_or_b32 v1, v1, v9, s4
-; GFX10-CONTRACT-NEXT:    v_and_or_b32 v0, v0, v9, v3
+; GFX10-CONTRACT-NEXT:    v_mov_b32_e32 v4, 0xffff
+; GFX10-CONTRACT-NEXT:    v_pk_fma_f16 v1, v1, v3, v5
+; GFX10-CONTRACT-NEXT:    s_lshl_b32 s4, s4, 16
+; GFX10-CONTRACT-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX10-CONTRACT-NEXT:    v_and_or_b32 v1, v1, v4, s4
+; GFX10-CONTRACT-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-CONTRACT-NEXT:    v_and_or_b32 v0, v0, v4, v2
 ; GFX10-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-DENORM-LABEL: test_3xhalf_add_mul_rhs:
 ; GFX10-DENORM:       ; %bb.0: ; %.entry
 ; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-DENORM-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
-; GFX10-DENORM-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
-; GFX10-DENORM-NEXT:    v_mov_b32_e32 v8, 0xffff
-; GFX10-DENORM-NEXT:    s_lshl_b32 s4, s4, 16
-; GFX10-DENORM-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX10-DENORM-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX10-DENORM-NEXT:    v_and_or_b32 v1, v1, v8, s4
-; GFX10-DENORM-NEXT:    v_and_or_b32 v3, v3, v8, s4
-; GFX10-DENORM-NEXT:    v_and_or_b32 v0, v0, v8, v6
-; GFX10-DENORM-NEXT:    v_and_or_b32 v2, v2, v8, v7
-; GFX10-DENORM-NEXT:    v_pk_mul_f16 v1, v1, v3
 ; GFX10-DENORM-NEXT:    v_pk_mul_f16 v0, v0, v2
-; GFX10-DENORM-NEXT:    v_lshrrev_b32_e32 v2, 16, v4
-; GFX10-DENORM-NEXT:    v_and_or_b32 v1, v1, v8, s4
-; GFX10-DENORM-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
+; GFX10-DENORM-NEXT:    v_pk_mul_f16 v1, v1, v3
+; GFX10-DENORM-NEXT:    v_mov_b32_e32 v3, 0xffff
+; GFX10-DENORM-NEXT:    s_lshl_b32 s4, s4, 16
+; GFX10-DENORM-NEXT:    v_pk_add_f16 v0, v4, v0
+; GFX10-DENORM-NEXT:    v_pk_add_f16 v1, v5, v1
+; GFX10-DENORM-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX10-DENORM-NEXT:    v_and_or_b32 v1, v1, v3, s4
 ; GFX10-DENORM-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-DENORM-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX10-DENORM-NEXT:    v_and_or_b32 v2, v4, v8, v2
-; GFX10-DENORM-NEXT:    v_and_or_b32 v0, v0, v8, v6
-; GFX10-DENORM-NEXT:    v_pk_add_f16 v0, v2, v0
-; GFX10-DENORM-NEXT:    v_and_or_b32 v2, v5, v8, s4
-; GFX10-DENORM-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX10-DENORM-NEXT:    v_pk_add_f16 v1, v2, v1
-; GFX10-DENORM-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX10-DENORM-NEXT:    v_and_or_b32 v1, v1, v8, s4
-; GFX10-DENORM-NEXT:    v_and_or_b32 v0, v0, v8, v3
+; GFX10-DENORM-NEXT:    v_and_or_b32 v0, v0, v3, v2
 ; GFX10-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-UNSAFE-LABEL: test_3xhalf_add_mul_rhs:
 ; GFX10-UNSAFE:       ; %bb.0: ; %.entry
 ; GFX10-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-UNSAFE-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-UNSAFE-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
-; GFX10-UNSAFE-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
-; GFX10-UNSAFE-NEXT:    v_lshrrev_b32_e32 v8, 16, v4
-; GFX10-UNSAFE-NEXT:    v_mov_b32_e32 v9, 0xffff
-; GFX10-UNSAFE-NEXT:    s_lshl_b32 s4, s4, 16
-; GFX10-UNSAFE-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX10-UNSAFE-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX10-UNSAFE-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX10-UNSAFE-NEXT:    v_and_or_b32 v1, v1, v9, s4
-; GFX10-UNSAFE-NEXT:    v_and_or_b32 v0, v0, v9, v6
-; GFX10-UNSAFE-NEXT:    v_and_or_b32 v2, v2, v9, v7
-; GFX10-UNSAFE-NEXT:    v_and_or_b32 v4, v4, v9, v8
 ; GFX10-UNSAFE-NEXT:    v_pk_fma_f16 v0, v0, v2, v4
-; GFX10-UNSAFE-NEXT:    v_and_or_b32 v2, v3, v9, s4
-; GFX10-UNSAFE-NEXT:    v_and_or_b32 v4, v5, v9, s4
-; GFX10-UNSAFE-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX10-UNSAFE-NEXT:    v_pk_fma_f16 v1, v1, v2, v4
-; GFX10-UNSAFE-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX10-UNSAFE-NEXT:    v_and_or_b32 v1, v1, v9, s4
-; GFX10-UNSAFE-NEXT:    v_and_or_b32 v0, v0, v9, v3
+; GFX10-UNSAFE-NEXT:    v_mov_b32_e32 v4, 0xffff
+; GFX10-UNSAFE-NEXT:    v_pk_fma_f16 v1, v1, v3, v5
+; GFX10-UNSAFE-NEXT:    s_lshl_b32 s4, s4, 16
+; GFX10-UNSAFE-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX10-UNSAFE-NEXT:    v_and_or_b32 v1, v1, v4, s4
+; GFX10-UNSAFE-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-UNSAFE-NEXT:    v_and_or_b32 v0, v0, v4, v2
 ; GFX10-UNSAFE-NEXT:    s_setpc_b64 s[30:31]
 .entry:
   %a = fmul <3 x half> %x, %y
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll
@@ -272,23 +272,20 @@
 ; GFX906-LABEL: v_sdot2_fnegv2f16_c:
 ; GFX906:       ; %bb.0:
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
-; GFX906-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2
+; GFX906-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: v_sdot2_fnegv2f16_c:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
-; GFX908-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2
+; GFX908-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_sdot2_fnegv2f16_c:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
-; GFX10-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2
+; GFX10-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %neg.c = fneg <2 x half> %c
   %cast.neg.c = bitcast <2 x half> %neg.c to i32
@@ -300,23 +297,20 @@
 ; GFX906-LABEL: v_sdot2_shuffle10_a:
 ; GFX906:       ; %bb.0:
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT:    v_alignbit_b32 v0, v0, v0, 16
-; GFX906-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2
+; GFX906-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[0,1,1]
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: v_sdot2_shuffle10_a:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    v_alignbit_b32 v0, v0, v0, 16
-; GFX908-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2
+; GFX908-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[0,1,1]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_sdot2_shuffle10_a:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_alignbit_b32 v0, v0, v0, 16
-; GFX10-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2
+; GFX10-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[0,1,1]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %shuf.a = shufflevector <2 x i16> %a, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
   %r = call i32 @llvm.amdgcn.sdot2(<2 x i16> %shuf.a, <2 x i16> %b, i32 %c, i1 false)
@@ -327,23 +321,20 @@
 ; GFX906-LABEL: v_sdot2_shuffle10_b:
 ; GFX906:       ; %bb.0:
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT:    v_alignbit_b32 v1, v1, v1, 16
-; GFX906-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2
+; GFX906-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2 op_sel:[0,1,0] op_sel_hi:[1,0,1]
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: v_sdot2_shuffle10_b:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    v_alignbit_b32 v1, v1, v1, 16
-; GFX908-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2
+; GFX908-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2 op_sel:[0,1,0] op_sel_hi:[1,0,1]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_sdot2_shuffle10_b:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_alignbit_b32 v1, v1, v1, 16
-; GFX10-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2
+; GFX10-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2 op_sel:[0,1,0] op_sel_hi:[1,0,1]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %shuf.b = shufflevector <2 x i16> %b, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
   %r = call i32 @llvm.amdgcn.sdot2(<2 x i16> %a, <2 x i16> %shuf.b, i32 %c, i1 false)
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll
@@ -114,16 +114,14 @@
 ; GFX906-LABEL: v_sdot4_fnegv2f16_a:
 ; GFX906:       ; %bb.0:
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
-; GFX906-NEXT:    v_dot4_i32_i8 v0, v0, v1, v2
+; GFX906-NEXT:    v_dot4_i32_i8 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_sdot4_fnegv2f16_a:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
-; GFX10-NEXT:    v_dot4_i32_i8 v0, v0, v1, v2
+; GFX10-NEXT:    v_dot4_i32_i8 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %neg.a = fneg <2 x half> %a
   %cast.neg.a = bitcast <2 x half> %neg.a to i32
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot8.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot8.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot8.ll
@@ -70,16 +70,14 @@
 ; GFX906-LABEL: v_sdot8_fnegv2f16_a:
 ; GFX906:       ; %bb.0:
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
-; GFX906-NEXT:    v_dot8_i32_i4 v0, v0, v1, v2
+; GFX906-NEXT:    v_dot8_i32_i4 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_sdot8_fnegv2f16_a:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
-; GFX10-NEXT:    v_dot8_i32_i4 v0, v0, v1, v2
+; GFX10-NEXT:    v_dot8_i32_i4 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %neg.a = fneg <2 x half> %a
   %cast.neg.a = bitcast <2 x half> %neg.a to i32
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll
@@ -272,23 +272,20 @@
 ; GFX906-LABEL: v_udot2_fnegv2f16_c:
 ; GFX906:       ; %bb.0:
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
-; GFX906-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2
+; GFX906-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: v_udot2_fnegv2f16_c:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
-; GFX908-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2
+; GFX908-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_udot2_fnegv2f16_c:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
-; GFX10-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2
+; GFX10-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %neg.c = fneg <2 x half> %c
   %cast.neg.c = bitcast <2 x half> %neg.c to i32
@@ -300,23 +297,20 @@
 ; GFX906-LABEL: v_udot2_shuffle10_a:
 ; GFX906:       ; %bb.0:
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT:    v_alignbit_b32 v0, v0, v0, 16
-; GFX906-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2
+; GFX906-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[0,1,1]
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: v_udot2_shuffle10_a:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    v_alignbit_b32 v0, v0, v0, 16
-; GFX908-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2
+; GFX908-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[0,1,1]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_udot2_shuffle10_a:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_alignbit_b32 v0, v0, v0, 16
-; GFX10-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2
+; GFX10-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[0,1,1]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %shuf.a = shufflevector <2 x i16> %a, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
   %r = call i32 @llvm.amdgcn.udot2(<2 x i16> %shuf.a, <2 x i16> %b, i32 %c, i1 false)
@@ -327,23 +321,20 @@
 ; GFX906-LABEL: v_udot2_shuffle10_b:
 ; GFX906:       ; %bb.0:
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT:    v_alignbit_b32 v1, v1, v1, 16
-; GFX906-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2
+; GFX906-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2 op_sel:[0,1,0] op_sel_hi:[1,0,1]
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: v_udot2_shuffle10_b:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    v_alignbit_b32 v1, v1, v1, 16
-; GFX908-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2
+; GFX908-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2 op_sel:[0,1,0] op_sel_hi:[1,0,1]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_udot2_shuffle10_b:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_alignbit_b32 v1, v1, v1, 16
-; GFX10-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2
+; GFX10-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2 op_sel:[0,1,0] op_sel_hi:[1,0,1]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %shuf.b = shufflevector <2 x i16> %b, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
   %r = call i32 @llvm.amdgcn.udot2(<2 x i16> %a, <2 x i16> %shuf.b, i32 %c, i1 false)
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll
@@ -114,16 +114,14 @@
 ; GFX906-LABEL: v_udot4_fnegv2f16_a:
 ; GFX906:       ; %bb.0:
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
-; GFX906-NEXT:    v_dot4_u32_u8 v0, v0, v1, v2
+; GFX906-NEXT:    v_dot4_u32_u8 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_udot4_fnegv2f16_a:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
-; GFX10-NEXT:    v_dot4_u32_u8 v0, v0, v1, v2
+; GFX10-NEXT:    v_dot4_u32_u8 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %neg.a = fneg <2 x half> %a
   %cast.neg.a = bitcast <2 x half> %neg.a to i32
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot8.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot8.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot8.ll
@@ -70,16 +70,14 @@
 ; GFX906-LABEL: v_udot8_fnegv2f16_a:
 ; GFX906:       ; %bb.0:
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
-; GFX906-NEXT:    v_dot8_u32_u4 v0, v0, v1, v2
+; GFX906-NEXT:    v_dot8_u32_u4 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_udot8_fnegv2f16_a:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
-; GFX10-NEXT:    v_dot8_u32_u4 v0, v0, v1, v2
+; GFX10-NEXT:    v_dot8_u32_u4 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %neg.a = fneg <2 x half> %a
   %cast.neg.a = bitcast <2 x half> %neg.a to i32
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/packed-op-sel.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/packed-op-sel.ll
@@ -0,0 +1,181 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -global-isel -verify-machineinstrs < %s | FileCheck %s
+
+declare <2 x half> @llvm.fma.v2f16(<2 x half>, <2 x half>, <2 x half>)
+declare <3 x half> @llvm.fma.v3f16(<3 x half>, <3 x half>, <3 x half>)
+declare <4 x half> @llvm.fma.v4f16(<4 x half>, <4 x half>, <4 x half>)
+
+; flip elements by extracting and inserting
+define <2 x half> @insert_flip(<2 x half> %vecx, <2 x half> %vecy, <2 x half> %vecz) {
+; CHECK-LABEL: insert_flip:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_waitcnt_vscnt null, 0x0
+; CHECK-NEXT:    v_pk_fma_f16 v0, v0, v1, v2 op_sel:[0,0,1] op_sel_hi:[1,1,0]
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %vecz0 = extractelement <2 x half> %vecz, i32 0
+  %vecz1 = extractelement <2 x half> %vecz, i32 1
+  %newvecz0 = insertelement <2 x half> undef, half %vecz0, i32 1
+  %newvecz = insertelement <2 x half> %newvecz0, half %vecz1, i32 0
+  %res = call <2 x half> @llvm.fma.v2f16(<2 x half> %vecx, <2 x half> %vecy, <2 x half> %newvecz)
+  ret <2 x half> %res
+}
+
+; flip elements by extracting and inserting with various negates
+define <2 x half> @insert_flip_negate(<2 x half> %vecx, <2 x half> %vecy, <2 x half> %vecz) {
+; CHECK-LABEL: insert_flip_negate:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_waitcnt_vscnt null, 0x0
+; CHECK-NEXT:    v_pk_fma_f16 v0, v0, v1, v2 op_sel:[0,0,1] op_sel_hi:[1,1,0] neg_lo:[0,0,1]
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %negz = fneg <2 x half> %vecz
+  %vecz0 = extractelement <2 x half> %negz, i32 1
+  %vecz1 = extractelement <2 x half> %vecz, i32 0
+  %negz0 = fneg half %vecz0
+  %negz1 = fneg half %vecz1
+  %newvecz0 = insertelement <2 x half> undef, half %negz0, i32 0
+  %newvecz = insertelement <2 x half> %newvecz0, half %negz1, i32 1
+  %negnewvecz = fneg <2 x half> %newvecz
+  %res = call <2 x half> @llvm.fma.v2f16(<2 x half> %vecx, <2 x half> %vecy, <2 x half> %negnewvecz)
+  ret <2 x half> %res
+}
+
+; multiple flips by extract/insert
+define <2 x half> @multi_insert_flip_negate(<2 x half> %vecx, <2 x half> %vecy, <2 x half> %vecz) {
+; CHECK-LABEL: multi_insert_flip_negate:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_waitcnt_vscnt null, 0x0
+; CHECK-NEXT:    v_pk_fma_f16 v0, v0, v1, v2 op_sel:[0,0,1] op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1]
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %Ae0 = extractelement <2 x half> %vecz, i32 0
+  %Ae1 = extractelement <2 x half> %vecz, i32 1
+  %Avec0 = insertelement <2 x half> undef, half %Ae0, i32 0
+  %Avec = insertelement <2 x half> %Avec0, half %Ae1, i32 1
+
+  %Anegvec = fneg <2 x half> %Avec
+
+  %Be0 = extractelement <2 x half> %Anegvec, i32 0
+  %Be1 = extractelement <2 x half> %Anegvec, i32 1
+  %Bvec0 = insertelement <2 x half> undef, half %Be1, i32 0
+  %Bvec = insertelement <2 x half> %Bvec0, half %Be0, i32 1
+
+  %res = call <2 x half> @llvm.fma.v2f16(<2 x half> %vecx, <2 x half> %vecy, <2 x half> %Bvec)
+  ret <2 x half> %res
+}
+
+; shuffle elements
+define <2 x half> @shuffle_v2f16(<2 x half> %vecx, <2 x half> %vecy, <2 x half> %vecz) {
+; CHECK-LABEL: shuffle_v2f16:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_waitcnt_vscnt null, 0x0
+; CHECK-NEXT:    v_pk_fma_f16 v0, v0, v1, v2 op_sel:[0,0,1] op_sel_hi:[1,1,0]
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
  %flipz = shufflevector <2 x half> %vecz, <2 x half> undef, <2 x i32> <i32 1, i32 0>
+  %res = call <2 x half> @llvm.fma.v2f16(<2 x half> %vecx, <2 x half> %vecy, <2 x half> %flipz)
+  ret <2 x half> %res
+}
+
+; shuffle elements then negate
+define <2 x half> @shuffle_v2f16_negate_after(<2 x half> %vecx, <2 x half> %vecy, <2 x half> %vecz) {
+; CHECK-LABEL: shuffle_v2f16_negate_after:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_waitcnt_vscnt null, 0x0
+; CHECK-NEXT:    v_pk_fma_f16 v0, v0, v1, v2 op_sel:[0,0,1] op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1]
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %flipz = shufflevector <2 x half> %vecz, <2 x half> undef, <2 x i32> <i32 1, i32 0>
+  %negz = fneg <2 x half> %flipz
+  %res = call <2 x half> @llvm.fma.v2f16(<2 x half> %vecx, <2 x half> %vecy, <2 x half> %negz)
+  ret <2 x half> %res
+}
+
+; negate elements then shuffle
+define <2 x half> @shuffle_v2f16_negate_before(<2 x half> %vecx, <2 x half> %vecy, <2 x half> %vecz) {
+; CHECK-LABEL: shuffle_v2f16_negate_before:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_waitcnt_vscnt null, 0x0
+; CHECK-NEXT:    v_pk_fma_f16 v0, v0, v1, v2 op_sel:[0,0,1] op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1]
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %negz = fneg <2 x half> %vecz
+  %flipz = shufflevector <2 x half> %negz, <2 x half> undef, <2 x i32> <i32 1, i32 0>
+  %res = call <2 x half> @llvm.fma.v2f16(<2 x half> %vecx, <2 x half> %vecy, <2 x half> %flipz)
+  ret <2 x half> %res
+}
+
+; multiple shuffles
+define <2 x half> @multi_shuffle_v2f16(<2 x half> %vecx, <2 x half> %vecy, <2 x half> %vecz) {
+; CHECK-LABEL: multi_shuffle_v2f16:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_waitcnt_vscnt null, 0x0
+; CHECK-NEXT:    v_pk_fma_f16 v0, v0, v1, v2 op_sel:[0,0,1]
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %flipz = shufflevector <2 x half> %vecz, <2 x half> undef, <2 x i32> <i32 1, i32 0>
+  %flipzz = shufflevector <2 x half> %flipz, <2 x half> undef, <2 x i32> <i32 1, i32 0>
+  %flipzzz = shufflevector <2 x half> %flipzz, <2 x half> undef, <2 x i32> <i32 1, i32 1>
+  %res = call <2 x half> @llvm.fma.v2f16(<2 x half> %vecx, <2 x half> %vecy, <2 x half> %flipzzz)
+  ret <2 x half> %res
+}
+
+; flip elements (implicit def case)
+define <3 x half> @insert_flip_v3f16(<3 x half> %vecx, <3 x half> %vecy, <3 x half> %vecz) {
+; CHECK-LABEL: insert_flip_v3f16:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_waitcnt_vscnt null, 0x0
+; CHECK-NEXT:    v_pk_fma_f16 v0, v0, v2, v4 op_sel:[0,0,1] op_sel_hi:[1,1,0]
+; CHECK-NEXT:    v_mov_b32_e32 v4, 0xffff
+; CHECK-NEXT:    v_pk_fma_f16 v1, v1, v3, v5
+; CHECK-NEXT:    s_lshl_b32 s4, s4, 16
+; CHECK-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; CHECK-NEXT:    v_and_or_b32 v1, v1, v4, s4
+; CHECK-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; CHECK-NEXT:    v_and_or_b32 v0, v0, v4, v2
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %vecz0 = extractelement <3 x half> %vecz, i32 0
+  %vecz1 = extractelement <3 x half> %vecz, i32 1
+  %vecz2 = extractelement <3 x half> %vecz, i32 2
+  %newvecz0 = insertelement <3 x half> undef, half %vecz0, i32 1
+  %newvecz1 = insertelement <3 x half> %newvecz0, half %vecz1, i32 0
+  %newvecz = insertelement <3 x half> %newvecz1, half %vecz2, i32 2
+  %res = call <3 x half> @llvm.fma.v3f16(<3 x half> %vecx, <3 x half> %vecy, <3 x half> %newvecz)
+  ret <3 x half> %res
+}
+
+; shuffle elements (implicit def case)
+define <3 x half> @shuffle_v3f16(<3 x half> %vecx, <3 x half> %vecy, <3 x half> %vecz) {
+; CHECK-LABEL: shuffle_v3f16:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_waitcnt_vscnt null, 0x0
+; CHECK-NEXT:    v_pk_fma_f16 v0, v0, v2, v4 op_sel:[0,0,1] op_sel_hi:[1,1,0]
+; CHECK-NEXT:    v_mov_b32_e32 v4, 0xffff
+; CHECK-NEXT:    v_pk_fma_f16 v1, v1, v3, v5
+; CHECK-NEXT:    s_lshl_b32 s4, s4, 16
+; CHECK-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; CHECK-NEXT:    v_and_or_b32 v1, v1, v4, s4
+; CHECK-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; CHECK-NEXT:    v_and_or_b32 v0, v0, v4, v2
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %flipz = shufflevector <3 x half> %vecz, <3 x half> %vecz, <3 x i32> <i32 1, i32 0, i32 2>
+  %res = call <3 x half> @llvm.fma.v3f16(<3 x half> %vecx, <3 x half> %vecy, <3 x half> %flipz)
+  ret <3 x half> %res
+}
+
+; shuffle elements (shufflevector of vectors with more than 2 elements does not use G_SHUFFLE_VECTOR)
+define <4 x half> @shuffle_v4f16(<4 x half> %vecx, <4 x half> %vecy, <4 x half> %vecz) {
+; CHECK-LABEL: shuffle_v4f16:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_waitcnt_vscnt null, 0x0
+; CHECK-NEXT:    v_pk_fma_f16 v0, v0, v2, v4 op_sel:[0,0,1] op_sel_hi:[1,1,0]
+; CHECK-NEXT:    v_pk_fma_f16 v1, v1, v3, v5 op_sel_hi:[1,1,0]
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %flipz = shufflevector <4 x half> %vecz, <4 x half> %vecz, <4 x i32> <i32 1, i32 0, i32 2, i32 2>
+  %res = call <4 x half> @llvm.fma.v4f16(<4 x half> %vecx, <4 x half> %vecy, <4 x half> %flipz)
+  ret <4 x half> %res
+}
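
Usage sketch (illustration, not part of the patch): the new m_GExtract matcher
composes with the other MIPatternMatch combinators, so a caller can bind both
the extract source and the bit offset in a single mi_match call. The names
below (Reg, ExtractSrc, BitOffset) are hypothetical:

  Register ExtractSrc;
  int64_t BitOffset;
  if (mi_match(Reg, MRI, m_GExtract(m_Reg(ExtractSrc), BitOffset))) {
    // Reg is defined by G_EXTRACT ExtractSrc, BitOffset. stripExtract()
    // above uses BitOffset / 32 to pick the matching 32-bit source operand
    // of a G_CONCAT_VECTORS/G_BUILD_VECTOR.
  }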