Index: llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -3405,26 +3405,119 @@
   }};
 }
 
+static Register stripBitcast(const MachineRegisterInfo &MRI, Register Src) {
+  const MachineInstr *MI = MRI.getVRegDef(Src);
+  return MI->getOpcode() == AMDGPU::G_BITCAST ? MI->getOperand(1).getReg()
+                                              : Src;
+}
+
+static bool isShiftHiToLo(const MachineRegisterInfo &MRI, Register In,
+                          Register &Out) {
+  Register Tmp;
+  if (mi_match(In, MRI, m_GLShr(m_Reg(Tmp), m_SpecificICst(16)))) {
+    Out = Tmp;
+    return true;
+  }
+  return false;
+}
+
 std::pair<Register, unsigned>
 AMDGPUInstructionSelector::selectVOP3PModsImpl(
     Register Src, const MachineRegisterInfo &MRI) const {
-  unsigned Mods = 0;
-  MachineInstr *MI = MRI.getVRegDef(Src);
+  unsigned Mods = SISrcMods::OP_SEL_1; // default
+
+  // Instructions that can be folded into source modifiers can appear an
+  // arbitrary number of times and in arbitrary order.
+  Register OldSrc;
+  while (OldSrc != Src) {
+    OldSrc = Src;
+
+    Register LoSrc, HiSrc;
+    MachineInstr *MI = MRI.getVRegDef(Src);
+    const unsigned Opcode = MI->getOpcode();
+
+    if (Opcode == AMDGPU::G_FNEG &&
+        // It's possible to see an f32 fneg here, but unlikely.
+        // TODO: Treat f32 fneg as only high bit.
+        MRI.getType(Src) == LLT::fixed_vector(2, 16)) {
+      Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
+      Src = MI->getOperand(1).getReg();
+    } else if (Opcode == AMDGPU::G_BITCAST) {
+      // Strip bitcast
+      Src = MI->getOperand(1).getReg();
+    } else if (Opcode == AMDGPU::G_SHUFFLE_VECTOR) {
+      // Check if a <2 x s16> vector is shuffled and update op_sel modifiers.
+      ArrayRef<int> ShuffleMask = MI->getOperand(3).getShuffleMask();
+      if (ShuffleMask.size() == 2 &&
+          MRI.getType(Src) == LLT::fixed_vector(2, 16)) {
+        Register Vec1 = MI->getOperand(1).getReg();
+        Register Vec2 = MI->getOperand(2).getReg();
+        Register NewSrc;
+        if ((ShuffleMask[0] < 2 && ShuffleMask[1] < 2) || (Vec1 == Vec2))
+          NewSrc = Vec1;
+        else if (ShuffleMask[0] > 1 && ShuffleMask[1] > 1)
+          NewSrc = Vec2;
+
+        if (NewSrc) {
+          if (ShuffleMask[0] & 1)
+            Mods ^= SISrcMods::OP_SEL_0;
+          if (!(ShuffleMask[1] & 1))
+            Mods ^= SISrcMods::OP_SEL_1;
+          Src = NewSrc;
+        }
+      }
+    } else if (mi_match(Src, MRI,
+                        m_GOr(m_GAnd(m_Reg(LoSrc), m_SpecificICst(0xffff)),
+                              m_GShl(m_Reg(HiSrc), m_SpecificICst(16))))) {
+      // LoSrc and HiSrc represent s32 registers whose low 16 bits are used to
+      // form the <2 x s16> operand of a packed instruction.
+
+      bool NegLo = false, LoIsShifted = false;
+      bool LoIsImplicitDef =
+          MRI.getVRegDef(LoSrc)->getOpcode() == AMDGPU::G_IMPLICIT_DEF;
+      // Ignore LoSrc if it's undef.
+      if (!LoIsImplicitDef) {
+        // Check for fneg of a 16-bit float. Since LoSrc is s32, it will be
+        // wrapped with anyext and trunc.
+        NegLo =
+            mi_match(LoSrc, MRI, m_GAnyExt(m_GFNeg(m_GTrunc(m_Reg(LoSrc)))));
+        // Check if the high 16 bits of LoSrc are used instead.
+        LoIsShifted = isShiftHiToLo(MRI, LoSrc, LoSrc);
+        // Look through a <2 x s16> to s32 bitcast.
+        LoSrc = stripBitcast(MRI, LoSrc);
+        // A potential fneg on <2 x s16> will not be wrapped with anyext and
+        // trunc like above.
+        NegLo ^= mi_match(LoSrc, MRI, m_GFNeg(m_Reg(LoSrc)));
+      }
 
-  if (MI && MI->getOpcode() == AMDGPU::G_FNEG &&
-      // It's possible to see an f32 fneg here, but unlikely.
-      // TODO: Treat f32 fneg as only high bit.
-      MRI.getType(Src) == LLT::fixed_vector(2, 16)) {
-    Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
-    Src = MI->getOperand(1).getReg();
-    MI = MRI.getVRegDef(Src);
-  }
+      bool NegHi = false, HiIsShifted = true;
+      bool HiIsImplicitDef =
+          MRI.getVRegDef(HiSrc)->getOpcode() == AMDGPU::G_IMPLICIT_DEF;
+      if (!HiIsImplicitDef) {
+        NegHi =
+            mi_match(HiSrc, MRI, m_GAnyExt(m_GFNeg(m_GTrunc(m_Reg(HiSrc)))));
+        HiIsShifted = isShiftHiToLo(MRI, HiSrc, HiSrc);
+        HiSrc = stripBitcast(MRI, HiSrc);
+        NegHi ^= mi_match(HiSrc, MRI, m_GFNeg(m_Reg(HiSrc)));
+      }
 
-  // TODO: Match op_sel through g_build_vector_trunc and g_shuffle_vector.
+      // If LoSrc and HiSrc are the same register (or at least one of them is
+      // undef) we can use it as the new source with appropriate source
+      // modifiers.
+      if (LoIsImplicitDef || HiIsImplicitDef || LoSrc == HiSrc) {
+        if (NegLo)
+          Mods ^= SISrcMods::NEG;
+        if (LoIsShifted)
+          Mods ^= SISrcMods::OP_SEL_0;
+        if (NegHi)
+          Mods ^= SISrcMods::NEG_HI;
+        if (!HiIsShifted)
+          Mods ^= SISrcMods::OP_SEL_1;
+        Src = LoIsImplicitDef ? HiSrc : LoSrc;
+      }
+    }
+  }
 
   // Packed instructions do not have abs modifiers.
-  Mods |= SISrcMods::OP_SEL_1;
-
   return std::make_pair(Src, Mods);
 }
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll
@@ -786,229 +786,117 @@
 ; GFX9-LABEL: test_3xhalf_add_mul_rhs:
 ; GFX9: ; %bb.0: ; %.entry
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v0
-; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v2
-; GFX9-NEXT: v_mov_b32_e32 v9, 0xffff
-; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; GFX9-NEXT: v_and_or_b32 v0, v0, v9, v6
-; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v7
-; GFX9-NEXT: v_and_or_b32 v2, v2, v9, v6
-; GFX9-NEXT: s_lshl_b32 s4, s4, 16
 ; GFX9-NEXT: v_pk_mul_f16 v0, v0, v2
-; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v4
-; GFX9-NEXT: v_and_or_b32 v1, v1, v9, s4
-; GFX9-NEXT: v_and_or_b32 v3, v3, v9, s4
-; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX9-NEXT: v_pk_add_f16 v0, v4, v0
 ; GFX9-NEXT: v_pk_mul_f16 v1, v1, v3
-; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v8
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-NEXT: v_and_or_b32 v3, v4, v9, v3
-; GFX9-NEXT: v_and_or_b32 v0, v0, v9, v2
-; GFX9-NEXT: v_pk_add_f16 v0, v3, v0
-; GFX9-NEXT: v_and_or_b32 v4, v5, v9, s4
-; GFX9-NEXT: v_and_or_b32 v1, v1, v9, s4
 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX9-NEXT: v_pk_add_f16 v1, v4, v1
+; GFX9-NEXT: v_pk_add_f16 v1, v5, v1
+; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff
 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-NEXT: v_and_or_b32 v0, v0, v9, v2
-; GFX9-NEXT: v_and_or_b32 v1, v1, v9, s4
+; GFX9-NEXT: s_lshl_b32 s4, s4, 16
+; GFX9-NEXT: v_and_or_b32 v0, v0, v3, v2
+; GFX9-NEXT: v_and_or_b32 v1, v1, v3, s4
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX9-CONTRACT-LABEL: test_3xhalf_add_mul_rhs:
 ; GFX9-CONTRACT: ; %bb.0: ; %.entry
 ; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-CONTRACT-NEXT: v_lshrrev_b32_e32 v6, 16, v0
-; GFX9-CONTRACT-NEXT: v_lshrrev_b32_e32 v7, 16, v2
-; GFX9-CONTRACT-NEXT: v_mov_b32_e32 v9, 0xffff
-; GFX9-CONTRACT-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; GFX9-CONTRACT-NEXT: v_lshrrev_b32_e32 v8, 16, v4
-; GFX9-CONTRACT-NEXT: v_and_or_b32 v0, v0, v9, v6
-; GFX9-CONTRACT-NEXT: v_lshlrev_b32_e32 v6, 16, v7
-; GFX9-CONTRACT-NEXT: v_and_or_b32 v2, v2, v9, v6
-; GFX9-CONTRACT-NEXT: v_lshlrev_b32_e32 v6, 16, v8
-; GFX9-CONTRACT-NEXT: v_and_or_b32 v4, v4, v9, v6
-; GFX9-CONTRACT-NEXT: s_lshl_b32 s4, s4, 16
 ; GFX9-CONTRACT-NEXT: v_pk_fma_f16 v0, v0, v2, v4
-; GFX9-CONTRACT-NEXT: v_and_or_b32 v1, v1, v9, s4
-; GFX9-CONTRACT-NEXT: v_and_or_b32 v3, v3, v9, s4
-; GFX9-CONTRACT-NEXT: v_and_or_b32 v5, v5, v9, s4
 ; GFX9-CONTRACT-NEXT: v_lshrrev_b32_e32 v2, 16, v0
 ; GFX9-CONTRACT-NEXT: v_pk_fma_f16 v1, v1, v3, v5
+; GFX9-CONTRACT-NEXT: v_mov_b32_e32 v3, 0xffff
 ; GFX9-CONTRACT-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-CONTRACT-NEXT: v_and_or_b32 v0, v0, v9, v2
-; GFX9-CONTRACT-NEXT: v_and_or_b32 v1, v1, v9, s4
+; GFX9-CONTRACT-NEXT: s_lshl_b32 s4, s4, 16
+; GFX9-CONTRACT-NEXT: v_and_or_b32 v0, v0, v3, v2
+; GFX9-CONTRACT-NEXT: v_and_or_b32 v1, v1, v3, s4
 ; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX9-DENORM-LABEL: test_3xhalf_add_mul_rhs:
 ; GFX9-DENORM: ; %bb.0: ; %.entry
 ; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-DENORM-NEXT: v_lshrrev_b32_e32 v6, 16, v0
-; GFX9-DENORM-NEXT: v_lshrrev_b32_e32 v7, 16, v2
-; GFX9-DENORM-NEXT: v_mov_b32_e32 v9, 0xffff
-; GFX9-DENORM-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; GFX9-DENORM-NEXT: v_and_or_b32 v0, v0, v9, v6
-; GFX9-DENORM-NEXT: v_lshlrev_b32_e32 v6, 16, v7
-; GFX9-DENORM-NEXT: v_and_or_b32 v2, v2, v9, v6
-; GFX9-DENORM-NEXT: s_lshl_b32 s4, s4, 16
 ; GFX9-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2
-; GFX9-DENORM-NEXT: v_lshrrev_b32_e32 v8, 16, v4
-; GFX9-DENORM-NEXT: v_and_or_b32 v1, v1, v9, s4
-; GFX9-DENORM-NEXT: v_and_or_b32 v3, v3, v9, s4
-; GFX9-DENORM-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX9-DENORM-NEXT: v_pk_add_f16 v0, v4, v0
 ; GFX9-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3
-; GFX9-DENORM-NEXT: v_lshlrev_b32_e32 v3, 16, v8
-; GFX9-DENORM-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-DENORM-NEXT: v_and_or_b32 v3, v4, v9, v3
-; GFX9-DENORM-NEXT: v_and_or_b32 v0, v0, v9, v2
-; GFX9-DENORM-NEXT: v_pk_add_f16 v0, v3, v0
-; GFX9-DENORM-NEXT: v_and_or_b32 v4, v5, v9, s4
-; GFX9-DENORM-NEXT: v_and_or_b32 v1, v1, v9, s4
 ; GFX9-DENORM-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX9-DENORM-NEXT: v_pk_add_f16 v1, v4, v1
+; GFX9-DENORM-NEXT: v_pk_add_f16 v1, v5, v1
+; GFX9-DENORM-NEXT: v_mov_b32_e32 v3, 0xffff
 ; GFX9-DENORM-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-DENORM-NEXT: v_and_or_b32 v0, v0, v9, v2
-; GFX9-DENORM-NEXT: v_and_or_b32 v1, v1, v9, s4
+; GFX9-DENORM-NEXT: s_lshl_b32 s4, s4, 16
+; GFX9-DENORM-NEXT: v_and_or_b32 v0, v0, v3, v2
+; GFX9-DENORM-NEXT: v_and_or_b32 v1, v1, v3, s4
 ; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX9-UNSAFE-LABEL: test_3xhalf_add_mul_rhs:
 ; GFX9-UNSAFE: ; %bb.0: ; %.entry
 ; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-UNSAFE-NEXT: v_lshrrev_b32_e32 v6, 16, v0
-; GFX9-UNSAFE-NEXT: v_lshrrev_b32_e32 v7, 16, v2
-; GFX9-UNSAFE-NEXT: v_mov_b32_e32 v9, 0xffff
-; GFX9-UNSAFE-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; GFX9-UNSAFE-NEXT: v_lshrrev_b32_e32 v8, 16, v4
-; GFX9-UNSAFE-NEXT: v_and_or_b32 v0, v0, v9, v6
-; GFX9-UNSAFE-NEXT: v_lshlrev_b32_e32 v6, 16, v7
-; GFX9-UNSAFE-NEXT: v_and_or_b32 v2, v2, v9, v6
-; GFX9-UNSAFE-NEXT: v_lshlrev_b32_e32 v6, 16, v8
-; GFX9-UNSAFE-NEXT: v_and_or_b32 v4, v4, v9, v6
-; GFX9-UNSAFE-NEXT: s_lshl_b32 s4, s4, 16
 ; GFX9-UNSAFE-NEXT: v_pk_fma_f16 v0, v0, v2, v4
-; GFX9-UNSAFE-NEXT: v_and_or_b32 v1, v1, v9, s4
-; GFX9-UNSAFE-NEXT: v_and_or_b32 v3, v3, v9, s4
-; GFX9-UNSAFE-NEXT: v_and_or_b32 v5, v5, v9, s4
 ; GFX9-UNSAFE-NEXT: v_lshrrev_b32_e32 v2, 16, v0
 ; GFX9-UNSAFE-NEXT: v_pk_fma_f16 v1, v1, v3, v5
+; GFX9-UNSAFE-NEXT: v_mov_b32_e32 v3, 0xffff
 ; GFX9-UNSAFE-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-UNSAFE-NEXT: v_and_or_b32 v0, v0, v9, v2
-; GFX9-UNSAFE-NEXT: v_and_or_b32 v1, v1, v9, s4
+; GFX9-UNSAFE-NEXT: s_lshl_b32 s4, s4, 16
+; GFX9-UNSAFE-NEXT: v_and_or_b32 v0, v0, v3, v2
+; GFX9-UNSAFE-NEXT: v_and_or_b32 v1, v1, v3, s4
 ; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_3xhalf_add_mul_rhs:
 ; GFX10: ; %bb.0: ; %.entry
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v0
-; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v2
-; GFX10-NEXT: v_mov_b32_e32 v8, 0xffff
-; GFX10-NEXT: s_lshl_b32 s4, s4, 16
-; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; GFX10-NEXT: v_and_or_b32 v1, v1, v8, s4
-; GFX10-NEXT: v_and_or_b32 v3, v3, v8, s4
-; GFX10-NEXT: v_and_or_b32 v0, v0, v8, v6
-; GFX10-NEXT: v_and_or_b32 v2, v2, v8, v7
-; GFX10-NEXT: v_pk_mul_f16 v1, v1, v3
 ; GFX10-NEXT: v_pk_mul_f16 v0, v0, v2
-; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v4
-; GFX10-NEXT: v_and_or_b32 v1, v1, v8, s4
-; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v0
+; GFX10-NEXT: v_pk_mul_f16 v1, v1, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, 0xffff
+; GFX10-NEXT: s_lshl_b32 s4, s4, 16
+; GFX10-NEXT: v_pk_add_f16 v0, v4, v0
+; GFX10-NEXT: v_pk_add_f16 v1, v5, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX10-NEXT: v_and_or_b32 v1, v1, v3, s4
 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; GFX10-NEXT: v_and_or_b32 v2, v4, v8, v2
-; GFX10-NEXT: v_and_or_b32 v0, v0, v8, v6
-; GFX10-NEXT: v_pk_add_f16 v0, v2, v0
-; GFX10-NEXT: v_and_or_b32 v2, v5, v8, s4
-; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v0
-; GFX10-NEXT: v_pk_add_f16 v1, v2, v1
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX10-NEXT: v_and_or_b32 v1, v1, v8, s4
-; GFX10-NEXT: v_and_or_b32 v0, v0, v8, v3
+; GFX10-NEXT: v_and_or_b32 v0, v0, v3, v2
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-CONTRACT-LABEL: test_3xhalf_add_mul_rhs:
 ; GFX10-CONTRACT: ; %bb.0: ; %.entry
 ; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-CONTRACT-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-CONTRACT-NEXT: v_lshrrev_b32_e32 v6, 16, v0
-; GFX10-CONTRACT-NEXT: v_lshrrev_b32_e32 v7, 16, v2
-; GFX10-CONTRACT-NEXT: v_lshrrev_b32_e32 v8, 16, v4
-; GFX10-CONTRACT-NEXT: v_mov_b32_e32 v9, 0xffff
-; GFX10-CONTRACT-NEXT: s_lshl_b32 s4, s4, 16
-; GFX10-CONTRACT-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; GFX10-CONTRACT-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; GFX10-CONTRACT-NEXT: v_lshlrev_b32_e32 v8, 16, v8
-; GFX10-CONTRACT-NEXT: v_and_or_b32 v1, v1, v9, s4
-; GFX10-CONTRACT-NEXT: v_and_or_b32 v0, v0, v9, v6
-; GFX10-CONTRACT-NEXT: v_and_or_b32 v2, v2, v9, v7
-; GFX10-CONTRACT-NEXT: v_and_or_b32 v4, v4, v9, v8
 ; GFX10-CONTRACT-NEXT: v_pk_fma_f16 v0, v0, v2, v4
-; GFX10-CONTRACT-NEXT: v_and_or_b32 v2, v3, v9, s4
-; GFX10-CONTRACT-NEXT: v_and_or_b32 v4, v5, v9, s4
-; GFX10-CONTRACT-NEXT: v_lshrrev_b32_e32 v3, 16, v0
-; GFX10-CONTRACT-NEXT: v_pk_fma_f16 v1, v1, v2, v4
-; GFX10-CONTRACT-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX10-CONTRACT-NEXT: v_and_or_b32 v1, v1, v9, s4
-; GFX10-CONTRACT-NEXT: v_and_or_b32 v0, v0, v9, v3
+; GFX10-CONTRACT-NEXT: v_mov_b32_e32 v4, 0xffff
+; GFX10-CONTRACT-NEXT: v_pk_fma_f16 v1, v1, v3, v5
+; GFX10-CONTRACT-NEXT: s_lshl_b32 s4, s4, 16
+; GFX10-CONTRACT-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX10-CONTRACT-NEXT: v_and_or_b32 v1, v1, v4, s4
+; GFX10-CONTRACT-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-CONTRACT-NEXT: v_and_or_b32 v0, v0, v4, v2
 ; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-DENORM-LABEL: test_3xhalf_add_mul_rhs:
 ; GFX10-DENORM: ; %bb.0: ; %.entry
 ; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-DENORM-NEXT: v_lshrrev_b32_e32 v6, 16, v0
-; GFX10-DENORM-NEXT: v_lshrrev_b32_e32 v7, 16, v2
-; GFX10-DENORM-NEXT: v_mov_b32_e32 v8, 0xffff
-; GFX10-DENORM-NEXT: s_lshl_b32 s4, s4, 16
-; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; GFX10-DENORM-NEXT: v_and_or_b32 v1, v1, v8, s4
-; GFX10-DENORM-NEXT: v_and_or_b32 v3, v3, v8, s4
-; GFX10-DENORM-NEXT: v_and_or_b32 v0, v0, v8, v6
-; GFX10-DENORM-NEXT: v_and_or_b32 v2, v2, v8, v7
-; GFX10-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3
 ; GFX10-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2
-; GFX10-DENORM-NEXT: v_lshrrev_b32_e32 v2, 16, v4
-; GFX10-DENORM-NEXT: v_and_or_b32 v1, v1, v8, s4
-; GFX10-DENORM-NEXT: v_lshrrev_b32_e32 v6, 16, v0
+; GFX10-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3
+; GFX10-DENORM-NEXT: v_mov_b32_e32 v3, 0xffff
+; GFX10-DENORM-NEXT: s_lshl_b32 s4, s4, 16
+; GFX10-DENORM-NEXT: v_pk_add_f16 v0, v4, v0
+; GFX10-DENORM-NEXT: v_pk_add_f16 v1, v5, v1
+; GFX10-DENORM-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX10-DENORM-NEXT: v_and_or_b32 v1, v1, v3, s4
 ; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; GFX10-DENORM-NEXT: v_and_or_b32 v2, v4, v8, v2
-; GFX10-DENORM-NEXT: v_and_or_b32 v0, v0, v8, v6
-; GFX10-DENORM-NEXT: v_pk_add_f16 v0, v2, v0
-; GFX10-DENORM-NEXT: v_and_or_b32 v2, v5, v8, s4
-; GFX10-DENORM-NEXT: v_lshrrev_b32_e32 v3, 16, v0
-; GFX10-DENORM-NEXT: v_pk_add_f16 v1, v2, v1
-; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX10-DENORM-NEXT: v_and_or_b32 v1, v1, v8, s4
-; GFX10-DENORM-NEXT: v_and_or_b32 v0, v0, v8, v3
+; GFX10-DENORM-NEXT: v_and_or_b32 v0, v0, v3, v2
 ; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-UNSAFE-LABEL: test_3xhalf_add_mul_rhs:
 ; GFX10-UNSAFE: ; %bb.0: ; %.entry
 ; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-UNSAFE-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-UNSAFE-NEXT: v_lshrrev_b32_e32 v6, 16, v0
-; GFX10-UNSAFE-NEXT: v_lshrrev_b32_e32 v7, 16, v2
-; GFX10-UNSAFE-NEXT: v_lshrrev_b32_e32 v8, 16, v4
-; GFX10-UNSAFE-NEXT: v_mov_b32_e32 v9, 0xffff
-; GFX10-UNSAFE-NEXT: s_lshl_b32 s4, s4, 16
-; GFX10-UNSAFE-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; GFX10-UNSAFE-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; GFX10-UNSAFE-NEXT: v_lshlrev_b32_e32 v8, 16, v8
-; GFX10-UNSAFE-NEXT: v_and_or_b32 v1, v1, v9, s4
-; GFX10-UNSAFE-NEXT: v_and_or_b32 v0, v0, v9, v6
-; GFX10-UNSAFE-NEXT: v_and_or_b32 v2, v2, v9, v7
-; GFX10-UNSAFE-NEXT: v_and_or_b32 v4, v4, v9, v8
 ; GFX10-UNSAFE-NEXT: v_pk_fma_f16 v0, v0, v2, v4
-; GFX10-UNSAFE-NEXT: v_and_or_b32 v2, v3, v9, s4
-; GFX10-UNSAFE-NEXT: v_and_or_b32 v4, v5, v9, s4
-; GFX10-UNSAFE-NEXT: v_lshrrev_b32_e32 v3, 16, v0
-; GFX10-UNSAFE-NEXT: v_pk_fma_f16 v1, v1, v2, v4
-; GFX10-UNSAFE-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX10-UNSAFE-NEXT: v_and_or_b32 v1, v1, v9, s4
-; GFX10-UNSAFE-NEXT: v_and_or_b32 v0, v0, v9, v3
+; GFX10-UNSAFE-NEXT: v_mov_b32_e32 v4, 0xffff
+; GFX10-UNSAFE-NEXT: v_pk_fma_f16 v1, v1, v3, v5
+; GFX10-UNSAFE-NEXT: s_lshl_b32 s4, s4, 16
+; GFX10-UNSAFE-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX10-UNSAFE-NEXT: v_and_or_b32 v1, v1, v4, s4
+; GFX10-UNSAFE-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-UNSAFE-NEXT: v_and_or_b32 v0, v0, v4, v2
 ; GFX10-UNSAFE-NEXT: s_setpc_b64 s[30:31]
 .entry:
   %a = fmul <3 x half> %x, %y
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll
@@ -272,23 +272,20 @@
 ; GFX906-LABEL: v_sdot2_fnegv2f16_c:
 ; GFX906: ; %bb.0:
 ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
-; GFX906-NEXT: v_dot2_i32_i16 v0, v0, v1, v2
+; GFX906-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
 ; GFX906-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: v_sdot2_fnegv2f16_c:
 ; GFX908: ; %bb.0:
 ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
-; GFX908-NEXT: v_dot2_i32_i16 v0, v0, v1, v2
+; GFX908-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
 ; GFX908-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_sdot2_fnegv2f16_c:
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
-; GFX10-NEXT: v_dot2_i32_i16 v0, v0, v1, v2
+; GFX10-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
   %neg.c = fneg <2 x half> %c
   %cast.neg.c = bitcast <2 x half> %neg.c to i32
@@ -300,23 +297,20 @@
 ; GFX906-LABEL: v_sdot2_shuffle10_a:
 ; GFX906: ; %bb.0:
 ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_alignbit_b32 v0, v0, v0, 16
-; GFX906-NEXT: v_dot2_i32_i16 v0, v0, v1, v2
+; GFX906-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[0,1,1]
 ; GFX906-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: v_sdot2_shuffle10_a:
 ; GFX908: ; %bb.0:
 ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_alignbit_b32 v0, v0, v0, 16
-; GFX908-NEXT: v_dot2_i32_i16 v0, v0, v1, v2
+; GFX908-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[0,1,1]
 ; GFX908-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_sdot2_shuffle10_a:
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_alignbit_b32 v0, v0, v0, 16
-; GFX10-NEXT: v_dot2_i32_i16 v0, v0, v1, v2
+; GFX10-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[0,1,1]
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
   %shuf.a = shufflevector <2 x i16> %a, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
   %r = call i32 @llvm.amdgcn.sdot2(<2 x i16> %shuf.a, <2 x i16> %b, i32 %c, i1 false)
@@ -327,23 +321,20 @@
 ; GFX906-LABEL: v_sdot2_shuffle10_b:
 ; GFX906: ; %bb.0:
 ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_alignbit_b32 v1, v1, v1, 16
-; GFX906-NEXT: v_dot2_i32_i16 v0, v0, v1, v2
+; GFX906-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 op_sel:[0,1,0] op_sel_hi:[1,0,1]
 ; GFX906-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: v_sdot2_shuffle10_b:
 ; GFX908: ; %bb.0:
 ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_alignbit_b32 v1, v1, v1, 16
-; GFX908-NEXT: v_dot2_i32_i16 v0, v0, v1, v2
+; GFX908-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 op_sel:[0,1,0] op_sel_hi:[1,0,1]
 ; GFX908-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_sdot2_shuffle10_b:
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_alignbit_b32 v1, v1, v1, 16
-; GFX10-NEXT: v_dot2_i32_i16 v0, v0, v1, v2
+; GFX10-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 op_sel:[0,1,0] op_sel_hi:[1,0,1]
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
   %shuf.b = shufflevector <2 x i16> %b, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
   %r = call i32 @llvm.amdgcn.sdot2(<2 x i16> %a, <2 x i16> %shuf.b, i32 %c, i1 false)
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll
@@ -114,16 +114,14 @@
 ; GFX906-LABEL: v_sdot4_fnegv2f16_a:
 ; GFX906: ; %bb.0:
 ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
-; GFX906-NEXT: v_dot4_i32_i8 v0, v0, v1, v2
+; GFX906-NEXT: v_dot4_i32_i8 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
 ; GFX906-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_sdot4_fnegv2f16_a:
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
-; GFX10-NEXT: v_dot4_i32_i8 v0, v0, v1, v2
+; GFX10-NEXT: v_dot4_i32_i8 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
   %neg.a = fneg <2 x half> %a
   %cast.neg.a = bitcast <2 x half> %neg.a to i32
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot8.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot8.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot8.ll
@@ -70,16 +70,14 @@
 ; GFX906-LABEL: v_sdot8_fnegv2f16_a:
 ; GFX906: ; %bb.0:
 ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
-; GFX906-NEXT: v_dot8_i32_i4 v0, v0, v1, v2
+; GFX906-NEXT: v_dot8_i32_i4 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
 ; GFX906-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_sdot8_fnegv2f16_a:
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
-; GFX10-NEXT: v_dot8_i32_i4 v0, v0, v1, v2
+; GFX10-NEXT: v_dot8_i32_i4 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
   %neg.a = fneg <2 x half> %a
   %cast.neg.a = bitcast <2 x half> %neg.a to i32
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll
@@ -272,23 +272,20 @@
 ; GFX906-LABEL: v_udot2_fnegv2f16_c:
 ; GFX906: ; %bb.0:
 ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
-; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, v2
+; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
 ; GFX906-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: v_udot2_fnegv2f16_c:
 ; GFX908: ; %bb.0:
 ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
-; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, v2
+; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
 ; GFX908-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_udot2_fnegv2f16_c:
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
-; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, v2
+; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
   %neg.c = fneg <2 x half> %c
   %cast.neg.c = bitcast <2 x half> %neg.c to i32
@@ -300,23 +297,20 @@
 ; GFX906-LABEL: v_udot2_shuffle10_a:
 ; GFX906: ; %bb.0:
 ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_alignbit_b32 v0, v0, v0, 16
-; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, v2
+; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[0,1,1]
 ; GFX906-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: v_udot2_shuffle10_a:
 ; GFX908: ; %bb.0:
 ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_alignbit_b32 v0, v0, v0, 16
-; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, v2
+; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[0,1,1]
 ; GFX908-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_udot2_shuffle10_a:
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_alignbit_b32 v0, v0, v0, 16
-; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, v2
+; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[0,1,1]
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
   %shuf.a = shufflevector <2 x i16> %a, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
   %r = call i32 @llvm.amdgcn.udot2(<2 x i16> %shuf.a, <2 x i16> %b, i32 %c, i1 false)
@@ -327,23 +321,20 @@
 ; GFX906-LABEL: v_udot2_shuffle10_b:
 ; GFX906: ; %bb.0:
 ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_alignbit_b32 v1, v1, v1, 16
-; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, v2
+; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 op_sel:[0,1,0] op_sel_hi:[1,0,1]
 ; GFX906-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: v_udot2_shuffle10_b:
 ; GFX908: ; %bb.0:
 ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_alignbit_b32 v1, v1, v1, 16
-; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, v2
+; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 op_sel:[0,1,0] op_sel_hi:[1,0,1]
 ; GFX908-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_udot2_shuffle10_b:
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_alignbit_b32 v1, v1, v1, 16
-; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, v2
+; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 op_sel:[0,1,0] op_sel_hi:[1,0,1]
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
   %shuf.b = shufflevector <2 x i16> %b, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
   %r = call i32 @llvm.amdgcn.udot2(<2 x i16> %a, <2 x i16> %shuf.b, i32 %c, i1 false)
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll
@@ -114,16 +114,14 @@
 ; GFX906-LABEL: v_udot4_fnegv2f16_a:
 ; GFX906: ; %bb.0:
 ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
-; GFX906-NEXT: v_dot4_u32_u8 v0, v0, v1, v2
+; GFX906-NEXT: v_dot4_u32_u8 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
 ; GFX906-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_udot4_fnegv2f16_a:
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
-; GFX10-NEXT: v_dot4_u32_u8 v0, v0, v1, v2
+; GFX10-NEXT: v_dot4_u32_u8 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
   %neg.a = fneg <2 x half> %a
   %cast.neg.a = bitcast <2 x half> %neg.a to i32
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot8.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot8.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot8.ll
@@ -70,16 +70,14 @@
 ; GFX906-LABEL: v_udot8_fnegv2f16_a:
 ; GFX906: ; %bb.0:
 ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
-; GFX906-NEXT: v_dot8_u32_u4 v0, v0, v1, v2
+; GFX906-NEXT: v_dot8_u32_u4 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
 ; GFX906-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_udot8_fnegv2f16_a:
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
-; GFX10-NEXT: v_dot8_u32_u4 v0, v0, v1, v2
+; GFX10-NEXT: v_dot8_u32_u4 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
   %neg.a = fneg <2 x half> %a
   %cast.neg.a = bitcast <2 x half> %neg.a to i32
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/packed-op-sel.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/packed-op-sel.ll
@@ -0,0 +1,215 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -global-isel -verify-machineinstrs < %s | FileCheck %s
+
+declare <2 x half> @llvm.fma.v2f16(<2 x half>, <2 x half>, <2 x half>)
+declare <3 x half> @llvm.fma.v3f16(<3 x half>, <3 x half>, <3 x half>)
+declare <4 x half> @llvm.fma.v4f16(<4 x half>, <4 x half>, <4 x half>)
+
+; flip elements by extracting and inserting
+define <2 x half> @insert_flip(<2 x half> %vecx, <2 x half> %vecy, <2 x half> %vecz) {
+; CHECK-LABEL: insert_flip:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_waitcnt_vscnt null, 0x0
+; CHECK-NEXT: v_pk_fma_f16 v0, v0, v1, v2 op_sel:[0,0,1] op_sel_hi:[1,1,0]
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+  %vecz0 = extractelement <2 x half> %vecz, i32 0
+  %vecz1 = extractelement <2 x half> %vecz, i32 1
+  %newvecz0 = insertelement <2 x half> undef, half %vecz0, i32 1
+  %newvecz = insertelement <2 x half> %newvecz0, half %vecz1, i32 0
+  %res = call <2 x half> @llvm.fma.v2f16(<2 x half> %vecx, <2 x half> %vecy, <2 x half> %newvecz)
+  ret <2 x half> %res
+}
+
+; flip elements by extracting and inserting with various negates
+define <2 x half> @insert_flip_negate(<2 x half> %vecx, <2 x half> %vecy, <2 x half> %vecz) {
+; CHECK-LABEL: insert_flip_negate:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_waitcnt_vscnt null, 0x0
+; CHECK-NEXT: v_pk_fma_f16 v0, v0, v1, v2 op_sel:[0,0,1] op_sel_hi:[1,1,0] neg_lo:[0,0,1]
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+  %negz = fneg <2 x half> %vecz
+  %vecz0 = extractelement <2 x half> %negz, i32 1
+  %vecz1 = extractelement <2 x half> %vecz, i32 0
+  %negz0 = fneg half %vecz0
+  %negz1 = fneg half %vecz1
+  %newvecz0 = insertelement <2 x half> undef, half %negz0, i32 0
+  %newvecz = insertelement <2 x half> %newvecz0, half %negz1, i32 1
+  %negnewvecz = fneg <2 x half> %newvecz
+  %res = call <2 x half> @llvm.fma.v2f16(<2 x half> %vecx, <2 x half> %vecy, <2 x half> %negnewvecz)
+  ret <2 x half> %res
+}
+
+; make <2 x s16> vectors where either low or high part is undef
+define <2 x half> @insert_with_undef(<2 x half> %vecx, <2 x half> %vecy, <2 x half> %vecz) {
+; CHECK-LABEL: insert_with_undef:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_waitcnt_vscnt null, 0x0
+; CHECK-NEXT: v_pk_fma_f16 v0, v0, v1, v2
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+  %vecx1 = extractelement <2 x half> %vecx, i32 1
+  %newvecx = insertelement <2 x half> undef, half %vecx1, i32 1
+
+  %vecy0 = extractelement <2 x half> %vecy, i32 0
+  %newvecy = insertelement <2 x half> undef, half %vecy0, i32 0
+  %res = call <2 x half> @llvm.fma.v2f16(<2 x half> %newvecx, <2 x half> %newvecy, <2 x half> %vecz)
+  ret <2 x half> %res
+}
+
+; multiple flips by extract/insert
+define <2 x half> @multi_insert_flip_negate(<2 x half> %vecx, <2 x half> %vecy, <2 x half> %vecz) {
+; CHECK-LABEL: multi_insert_flip_negate:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_waitcnt_vscnt null, 0x0
+; CHECK-NEXT: v_pk_fma_f16 v0, v0, v1, v2 op_sel:[0,0,1] op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1]
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+  %Ae0 = extractelement <2 x half> %vecz, i32 0
+  %Ae1 = extractelement <2 x half> %vecz, i32 1
+  %Avec0 = insertelement <2 x half> undef, half %Ae0, i32 0
+  %Avec = insertelement <2 x half> %Avec0, half %Ae1, i32 1
+
+  %Anegvec = fneg <2 x half> %Avec
+
+  %Be0 = extractelement <2 x half> %Anegvec, i32 0
+  %Be1 = extractelement <2 x half> %Anegvec, i32 1
+  %Bvec0 = insertelement <2 x half> undef, half %Be1, i32 0
+  %Bvec = insertelement <2 x half> %Bvec0, half %Be0, i32 1
+
+  %res = call <2 x half> @llvm.fma.v2f16(<2 x half> %vecx, <2 x half> %vecy, <2 x half> %Bvec)
+  ret <2 x half> %res
+}
+
+; shuffle elements
+define <2 x half> @shuffle_v2f16(<2 x half> %vecx, <2 x half> %vecy, <2 x half> %vecz) {
+; CHECK-LABEL: shuffle_v2f16:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_waitcnt_vscnt null, 0x0
+; CHECK-NEXT: v_pk_fma_f16 v0, v0, v1, v2 op_sel:[1,0,1] op_sel_hi:[1,1,0]
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+  %shufflex = shufflevector <2 x half> %vecx, <2 x half> undef, <2 x i32> <i32 1, i32 1>
+  %shuffley = shufflevector <2 x half> undef, <2 x half> %vecy, <2 x i32> <i32 2, i32 3>
+  %shufflez = shufflevector <2 x half> %vecz, <2 x half> %vecz, <2 x i32> <i32 1, i32 0>
+  %res = call <2 x half> @llvm.fma.v2f16(<2 x half> %shufflex, <2 x half> %shuffley, <2 x half> %shufflez)
+  ret <2 x half> %res
+}
+
+; shuffle elements then negate
+define <2 x half> @shuffle_v2f16_negate_after(<2 x half> %vecx, <2 x half> %vecy, <2 x half> %vecz) {
+; CHECK-LABEL: shuffle_v2f16_negate_after:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_waitcnt_vscnt null, 0x0
+; CHECK-NEXT: v_pk_fma_f16 v0, v0, v1, v2 op_sel:[0,0,1] op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1]
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+  %flipz = shufflevector <2 x half> %vecz, <2 x half> undef, <2 x i32> <i32 1, i32 0>
+  %negz = fneg <2 x half> %flipz
+  %res = call <2 x half> @llvm.fma.v2f16(<2 x half> %vecx, <2 x half> %vecy, <2 x half> %negz)
+  ret <2 x half> %res
+}
+
+; negate elements then shuffle
+define <2 x half> @shuffle_v2f16_negate_before(<2 x half> %vecx, <2 x half> %vecy, <2 x half> %vecz) {
+; CHECK-LABEL: shuffle_v2f16_negate_before:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_waitcnt_vscnt null, 0x0
+; CHECK-NEXT: v_pk_fma_f16 v0, v0, v1, v2 op_sel:[0,0,1] op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1]
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+  %negz = fneg <2 x half> %vecz
+  %flipz = shufflevector <2 x half> %negz, <2 x half> undef, <2 x i32> <i32 1, i32 0>
+  %res = call <2 x half> @llvm.fma.v2f16(<2 x half> %vecx, <2 x half> %vecy, <2 x half> %flipz)
+  ret <2 x half> %res
+}
+
+; consecutive shuffles
+define <2 x half> @multi_shuffle_v2f16(<2 x half> %vecx, <2 x half> %vecy, <2 x half> %vecz) {
+; CHECK-LABEL: multi_shuffle_v2f16:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_waitcnt_vscnt null, 0x0
+; CHECK-NEXT: v_pk_fma_f16 v0, v0, v1, v2 op_sel:[0,0,1]
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+  %flipz = shufflevector <2 x half> %vecz, <2 x half> undef, <2 x i32>
+  %flipzz = shufflevector <2 x half> %flipz, <2 x half> undef, <2 x i32>
+  %flipzzz = shufflevector <2 x half> %flipzz, <2 x half> undef, <2 x i32>
+  %res = call <2 x half> @llvm.fma.v2f16(<2 x half> %vecx, <2 x half> %vecy, <2 x half> %flipzzz)
+  ret <2 x half> %res
+}
+
+; shuffle elements (shuffle vector for vectors with more than 2 elements does not use G_SHUFFLE_VECTOR)
+define <4 x half> @shuffle_v4f16(<4 x half> %vecx, <4 x half> %vecy, <4 x half> %vecz) {
+; CHECK-LABEL: shuffle_v4f16:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_waitcnt_vscnt null, 0x0
+; CHECK-NEXT: v_pk_fma_f16 v0, v0, v2, v4 op_sel:[0,0,1] op_sel_hi:[1,1,0]
+; CHECK-NEXT: v_pk_fma_f16 v1, v1, v3, v5 op_sel_hi:[1,1,0]
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+  %flipz = shufflevector <4 x half> %vecz, <4 x half> undef, <4 x i32> <i32 1, i32 0, i32 2, i32 2>
+  %res = call <4 x half> @llvm.fma.v4f16(<4 x half> %vecx, <4 x half> %vecy, <4 x half> %flipz)
+  ret <4 x half> %res
+}
+
+; shuffle while picking elements from different arguments but arguments are identical
+define <2 x half> @shuffle_v2f16_same_source(<2 x half> %vecx, <2 x half> %vecy, <2 x half> %vecz) {
+; CHECK-LABEL: shuffle_v2f16_same_source:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_waitcnt_vscnt null, 0x0
+; CHECK-NEXT: v_pk_fma_f16 v0, v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[0,0,1]
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+  %flipX = shufflevector <2 x half> %vecx, <2 x half> %vecx, <2 x i32> <i32 1, i32 2>
+  %duplY = shufflevector <2 x half> %vecy, <2 x half> %vecy, <2 x i32> <i32 0, i32 2>
+  %sameZ = shufflevector <2 x half> %vecz, <2 x half> %vecz, <2 x i32> <i32 0, i32 3>
+  %res = call <2 x half> @llvm.fma.v2f16(<2 x half> %flipX, <2 x half> %duplY, <2 x half> %sameZ)
+  ret <2 x half> %res
+}
+
+; flip elements (implicit def case)
+define <3 x half> @insert_flip_v3f16(<3 x half> %vecx, <3 x half> %vecy, <3 x half> %vecz) {
+; CHECK-LABEL: insert_flip_v3f16:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_waitcnt_vscnt null, 0x0
+; CHECK-NEXT: v_pk_fma_f16 v0, v0, v2, v4 op_sel:[0,0,1] op_sel_hi:[1,1,0]
+; CHECK-NEXT: v_mov_b32_e32 v4, 0xffff
+; CHECK-NEXT: v_pk_fma_f16 v1, v1, v3, v5
+; CHECK-NEXT: s_lshl_b32 s4, s4, 16
+; CHECK-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; CHECK-NEXT: v_and_or_b32 v1, v1, v4, s4
+; CHECK-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; CHECK-NEXT: v_and_or_b32 v0, v0, v4, v2
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+  %vecz0 = extractelement <3 x half> %vecz, i32 0
+  %vecz1 = extractelement <3 x half> %vecz, i32 1
+  %vecz2 = extractelement <3 x half> %vecz, i32 2
+  %newvecz0 = insertelement <3 x half> undef, half %vecz0, i32 1
+  %newvecz1 = insertelement <3 x half> %newvecz0, half %vecz1, i32 0
+  %newvecz = insertelement <3 x half> %newvecz1, half %vecz2, i32 2
+  %res = call <3 x half> @llvm.fma.v3f16(<3 x half> %vecx, <3 x half> %vecy, <3 x half> %newvecz)
+  ret <3 x half> %res
+}
+
+; shuffle elements (implicit def case)
+define <3 x half> @shuffle_v3f16(<3 x half> %vecx, <3 x half> %vecy, <3 x half> %vecz) {
+; CHECK-LABEL: shuffle_v3f16:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_waitcnt_vscnt null, 0x0
+; CHECK-NEXT: v_pk_fma_f16 v0, v0, v2, v4 op_sel:[0,0,1] op_sel_hi:[1,1,0]
+; CHECK-NEXT: v_mov_b32_e32 v4, 0xffff
+; CHECK-NEXT: v_pk_fma_f16 v1, v1, v3, v5
+; CHECK-NEXT: s_lshl_b32 s4, s4, 16
+; CHECK-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; CHECK-NEXT: v_and_or_b32 v1, v1, v4, s4
+; CHECK-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; CHECK-NEXT: v_and_or_b32 v0, v0, v4, v2
+; CHECK-NEXT: s_setpc_b64 s[30:31]
  %flipz = shufflevector <3 x half> %vecz, <3 x half> %vecz, <3 x i32> <i32 1, i32 0, i32 2>
  %res = call <3 x half> @llvm.fma.v3f16(<3 x half> %vecx, <3 x half> %vecy, <3 x half> %flipz)
  ret <3 x half> %res
}