Index: llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -181,6 +181,9 @@
   InstructionSelector::ComplexRendererFns
   selectVOP3OpSelMods(MachineOperand &Root) const;
 
+  std::pair<Register, unsigned>
+  selectVOP3OpSelModsImpl(MachineOperand &Root) const;
+
   InstructionSelector::ComplexRendererFns
   selectSmrdImm(MachineOperand &Root) const;
   InstructionSelector::ComplexRendererFns
Index: llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -3394,12 +3394,29 @@
   }};
 }
 
+/// Match \p In against G_TRUNC(G_LSHR(X, 16)). On a match, store X in \p Out
+/// and return true; otherwise return false and leave \p Out untouched. \p In
+/// and \p Out may refer to the same register variable.
+static bool isExtractHiElt(const MachineRegisterInfo &MRI, Register In,
+                           Register &Out) {
+  Register LShrSrc;
+  if (mi_match(In, MRI,
+               (m_GTrunc(m_GLShr(m_Reg(LShrSrc), (m_SpecificICst(16))))))) {
+    Out = LShrSrc;
+    return true;
+  }
+  return false;
+}
+
 InstructionSelector::ComplexRendererFns
 AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const {
   Register Reg = Root.getReg();
   const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
+  // Only isExtractHiElt's match result matters here; Tmp is a throwaway sink.
+  Register Tmp;
   if (Def && (Def->getOpcode() == AMDGPU::G_FNEG ||
-              Def->getOpcode() == AMDGPU::G_FABS))
+              Def->getOpcode() == AMDGPU::G_FABS ||
+              isExtractHiElt(*MRI, Def->getOperand(0).getReg(), Tmp)))
     return {};
   return {{
       [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
@@ -3542,13 +3559,77 @@
 
 InstructionSelector::ComplexRendererFns
 AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
-  // FIXME: Handle op_sel
+  Register Src;
+  unsigned Mods;
+  std::tie(Src, Mods) = selectVOP3OpSelModsImpl(Root);
+
   return {{
-    [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); },
-    [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // src_mods
+      [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
+      [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
   }};
 }
 
+/// Walk up the def chain of \p Root, folding G_FNEG / G_FABS into the NEG / ABS
+/// bits and high-half reads into the OP_SEL_0 bit.
+/// \returns the last register reached and the accumulated SISrcMods bits.
+std::pair<Register, unsigned>
+AMDGPUInstructionSelector::selectVOP3OpSelModsImpl(MachineOperand &Root) const {
+  unsigned Mods = 0;
+  Register Src = Root.getReg();
+
+  // Instructions that can be folded into source modifiers can appear arbitrary
+  // number of times and in arbitrary order.
+  Register OldSrc;
+  while (OldSrc != Src) {
+    OldSrc = Src;
+
+    MachineInstr *MI = MRI->getVRegDef(Src);
+    // Nothing left to fold once the chain reaches a register with no def.
+    if (!MI)
+      break;
+    const unsigned Opcode = MI->getOpcode();
+
+    if (Opcode == AMDGPU::G_FNEG) {
+      Src = MI->getOperand(1).getReg();
+      // If there is an abs after neg, then this neg can be ignored.
+      if (!(Mods & SISrcMods::ABS))
+        Mods ^= SISrcMods::NEG;
+    } else if (Opcode == AMDGPU::G_FABS) {
+      Src = MI->getOperand(1).getReg();
+      Mods |= SISrcMods::ABS;
+    } else if (Opcode == AMDGPU::G_BITCAST) {
+      // Strip bitcast
+      Src = MI->getOperand(1).getReg();
+    } else if (isExtractHiElt(*MRI, Src, Src)) {
+      // isExtractHiElt already updated Src
+      Mods ^= SISrcMods::OP_SEL_0;
+    } else if (Opcode == AMDGPU::G_SHUFFLE_VECTOR) {
+      const ArrayRef<int> ShuffleMask = MI->getOperand(3).getShuffleMask();
+      if (ShuffleMask.size() == 2) {
+        // The mask element that matters is the one for the lane being read:
+        // lane 1 if OP_SEL_0 is already set, lane 0 otherwise.
+        const bool ReadHi = (Mods & SISrcMods::OP_SEL_0) != 0;
+        const int Lane = ShuffleMask[ReadHi ? 1 : 0];
+        // Only elements 0 and 1 come from the first source vector; leave
+        // anything else (second source, undef) unfolded.
+        if (Lane == 0 || Lane == 1) {
+          // Flip OP_SEL_0 only when the shuffle moves the value to the other half.
+          if ((Lane == 1) != ReadHi)
+            Mods ^= SISrcMods::OP_SEL_0;
+          Src = MI->getOperand(1).getReg();
+        }
+      }
+    } else if (Opcode == AMDGPU::G_TRUNC) {
+      // Strip trunc(bitcast(x))
+      MachineInstr *TruncSrc = MRI->getVRegDef(MI->getOperand(1).getReg());
+      if (TruncSrc && TruncSrc->getOpcode() == AMDGPU::G_BITCAST)
+        Src = TruncSrc->getOperand(1).getReg();
+    }
+  }
+
+  return std::make_pair(Src, Mods);
+}
+
 InstructionSelector::ComplexRendererFns
 AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
   SmallVector<GEPInfo, 4> AddrInfo;
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll
@@ -114,8 +114,7 @@
 ; GFX9-CONTRACT-LABEL: test_half_sub_mul:
 ; 
GFX9-CONTRACT: ; %bb.0: ; %.entry ; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-CONTRACT-NEXT: v_xor_b32_e32 v2, 0x8000, v2 -; GFX9-CONTRACT-NEXT: v_fma_f16 v0, v0, v1, v2 +; GFX9-CONTRACT-NEXT: v_fma_f16 v0, v0, v1, -v2 ; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-DENORM-LABEL: test_half_sub_mul: @@ -136,8 +135,7 @@ ; GFX10-CONTRACT: ; %bb.0: ; %.entry ; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-CONTRACT-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CONTRACT-NEXT: v_xor_b32_e32 v2, 0x8000, v2 -; GFX10-CONTRACT-NEXT: v_fma_f16 v0, v0, v1, v2 +; GFX10-CONTRACT-NEXT: v_fma_f16 v0, v0, v1, -v2 ; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-DENORM-LABEL: test_half_sub_mul: @@ -164,8 +162,7 @@ ; GFX9-CONTRACT-LABEL: test_half_sub_mul_rhs: ; GFX9-CONTRACT: ; %bb.0: ; %.entry ; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-CONTRACT-NEXT: v_xor_b32_e32 v0, 0x8000, v0 -; GFX9-CONTRACT-NEXT: v_fma_f16 v0, v0, v1, v2 +; GFX9-CONTRACT-NEXT: v_fma_f16 v0, -v0, v1, v2 ; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-DENORM-LABEL: test_half_sub_mul_rhs: @@ -186,8 +183,7 @@ ; GFX10-CONTRACT: ; %bb.0: ; %.entry ; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-CONTRACT-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CONTRACT-NEXT: v_xor_b32_e32 v0, 0x8000, v0 -; GFX10-CONTRACT-NEXT: v_fma_f16 v0, v0, v1, v2 +; GFX10-CONTRACT-NEXT: v_fma_f16 v0, -v0, v1, v2 ; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-DENORM-LABEL: test_half_sub_mul_rhs: Index: llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll @@ -66,10 +66,7 @@ ; GFX9-CONTRACT-LABEL: test_f16_sub_ext_neg_mul: ; GFX9-CONTRACT: ; %bb.0: ; %entry ; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GFX9-CONTRACT-NEXT: s_mov_b32 s4, 0x8000 -; GFX9-CONTRACT-NEXT: v_xor_b32_e32 v1, s4, v1 -; GFX9-CONTRACT-NEXT: v_xor_b32_e32 v2, s4, v2 -; GFX9-CONTRACT-NEXT: v_fma_f16 v0, v0, v1, v2 +; GFX9-CONTRACT-NEXT: v_fma_f16 v0, v0, -v1, -v2 ; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-DENORM-LABEL: test_f16_sub_ext_neg_mul: @@ -90,10 +87,7 @@ ; GFX10-CONTRACT: ; %bb.0: ; %entry ; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-CONTRACT-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CONTRACT-NEXT: s_mov_b32 s4, 0x8000 -; GFX10-CONTRACT-NEXT: v_xor_b32_e32 v1, s4, v1 -; GFX10-CONTRACT-NEXT: v_xor_b32_e32 v2, s4, v2 -; GFX10-CONTRACT-NEXT: v_fma_f16 v0, v0, v1, v2 +; GFX10-CONTRACT-NEXT: v_fma_f16 v0, v0, -v1, -v2 ; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-DENORM-LABEL: test_f16_sub_ext_neg_mul: Index: llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll @@ -592,43 +592,39 @@ ; GFX9-LABEL: v_fdiv_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX9-NEXT: v_cvt_f32_f16_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-NEXT: v_cvt_f32_f16_e32 v4, v0 +; GFX9-NEXT: v_cvt_f32_f16_sdwa v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX9-NEXT: v_rcp_f32_e32 v2, v2 -; GFX9-NEXT: v_cvt_f32_f16_e32 v7, v6 -; GFX9-NEXT: v_rcp_f32_e32 v5, v5 -; GFX9-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX9-NEXT: v_rcp_f32_e32 v3, v3 +; GFX9-NEXT: v_mul_f32_e32 v2, v4, v2 +; GFX9-NEXT: v_mul_f32_e32 v3, v5, v3 ; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX9-NEXT: v_mul_f32_e32 v3, v7, v5 ; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX9-NEXT: 
v_div_fixup_f16 v0, v2, v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX9-NEXT: v_div_fixup_f16 v1, v3, v4, v6 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v1 +; GFX9-NEXT: v_div_fixup_f16 v2, v2, v1, v0 +; GFX9-NEXT: v_div_fixup_f16 v0, v3, v1, v0 op_sel:[0,1,1,0] +; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_and_or_b32 v0, v2, v1, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fdiv_v2f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX10-NEXT: v_cvt_f32_f16_e32 v7, v0 -; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX10-NEXT: v_rcp_f32_e32 v4, v4 -; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX10-NEXT: v_cvt_f32_f16_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX10-NEXT: v_cvt_f32_f16_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v0 +; GFX10-NEXT: v_rcp_f32_e32 v2, v2 ; GFX10-NEXT: v_rcp_f32_e32 v3, v3 -; GFX10-NEXT: v_mul_f32_e32 v4, v7, v4 -; GFX10-NEXT: v_mul_f32_e32 v3, v6, v3 -; GFX10-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX10-NEXT: v_mul_f32_e32 v2, v4, v2 +; GFX10-NEXT: v_mul_f32_e32 v3, v5, v3 +; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX10-NEXT: v_div_fixup_f16 v0, v4, v1, v0 -; GFX10-NEXT: v_div_fixup_f16 v2, v3, v2, v5 +; GFX10-NEXT: v_div_fixup_f16 v2, v2, v1, v0 op_sel:[0,1,1,0] +; GFX10-NEXT: v_div_fixup_f16 v0, v3, v1, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -786,43 +782,39 @@ ; GFX9-LABEL: v_fdiv_v2f16_ulp25: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, 
v1 ; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX9-NEXT: v_cvt_f32_f16_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-NEXT: v_cvt_f32_f16_e32 v4, v0 +; GFX9-NEXT: v_cvt_f32_f16_sdwa v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX9-NEXT: v_rcp_f32_e32 v2, v2 -; GFX9-NEXT: v_cvt_f32_f16_e32 v7, v6 -; GFX9-NEXT: v_rcp_f32_e32 v5, v5 -; GFX9-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX9-NEXT: v_rcp_f32_e32 v3, v3 +; GFX9-NEXT: v_mul_f32_e32 v2, v4, v2 +; GFX9-NEXT: v_mul_f32_e32 v3, v5, v3 ; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX9-NEXT: v_mul_f32_e32 v3, v7, v5 ; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX9-NEXT: v_div_fixup_f16 v0, v2, v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX9-NEXT: v_div_fixup_f16 v1, v3, v4, v6 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v1 +; GFX9-NEXT: v_div_fixup_f16 v2, v2, v1, v0 +; GFX9-NEXT: v_div_fixup_f16 v0, v3, v1, v0 op_sel:[0,1,1,0] +; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_and_or_b32 v0, v2, v1, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fdiv_v2f16_ulp25: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX10-NEXT: v_cvt_f32_f16_e32 v7, v0 -; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX10-NEXT: v_rcp_f32_e32 v4, v4 -; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX10-NEXT: v_cvt_f32_f16_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX10-NEXT: v_cvt_f32_f16_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v0 +; GFX10-NEXT: v_rcp_f32_e32 v2, v2 ; GFX10-NEXT: 
v_rcp_f32_e32 v3, v3 -; GFX10-NEXT: v_mul_f32_e32 v4, v7, v4 -; GFX10-NEXT: v_mul_f32_e32 v3, v6, v3 -; GFX10-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX10-NEXT: v_mul_f32_e32 v2, v4, v2 +; GFX10-NEXT: v_mul_f32_e32 v3, v5, v3 +; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX10-NEXT: v_div_fixup_f16 v0, v4, v1, v0 -; GFX10-NEXT: v_div_fixup_f16 v2, v3, v2, v5 +; GFX10-NEXT: v_div_fixup_f16 v2, v2, v1, v0 op_sel:[0,1,1,0] +; GFX10-NEXT: v_div_fixup_f16 v0, v3, v1, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -925,39 +917,37 @@ ; GFX9-LABEL: v_rcp_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX9-NEXT: v_cvt_f32_f16_e32 v4, 1.0 +; GFX9-NEXT: v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-NEXT: v_cvt_f32_f16_e32 v3, 1.0 ; GFX9-NEXT: v_rcp_f32_e32 v1, v1 -; GFX9-NEXT: v_rcp_f32_e32 v3, v3 -; GFX9-NEXT: v_mul_f32_e32 v1, v4, v1 -; GFX9-NEXT: v_mul_f32_e32 v3, v4, v3 +; GFX9-NEXT: v_rcp_f32_e32 v2, v2 +; GFX9-NEXT: v_mul_f32_e32 v1, v3, v1 +; GFX9-NEXT: v_mul_f32_e32 v2, v3, v2 ; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX9-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 -; GFX9-NEXT: v_div_fixup_f16 v1, v3, v2, 1.0 +; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX9-NEXT: v_div_fixup_f16 v1, v1, v0, 1.0 +; GFX9-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0 op_sel:[0,1,0,0] ; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_and_or_b32 v0, v1, v2, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_rcp_v2f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; 
GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX10-NEXT: v_cvt_f32_f16_e32 v4, 1.0 -; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GFX10-NEXT: v_rcp_f32_e32 v3, v3 +; GFX10-NEXT: v_cvt_f32_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GFX10-NEXT: v_cvt_f32_f16_e32 v3, 1.0 +; GFX10-NEXT: v_rcp_f32_e32 v1, v1 ; GFX10-NEXT: v_rcp_f32_e32 v2, v2 -; GFX10-NEXT: v_mul_f32_e32 v3, v4, v3 -; GFX10-NEXT: v_mul_f32_e32 v2, v4, v2 -; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX10-NEXT: v_mul_f32_e32 v1, v3, v1 +; GFX10-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX10-NEXT: v_div_fixup_f16 v0, v3, v0, 1.0 -; GFX10-NEXT: v_div_fixup_f16 v1, v2, v1, 1.0 +; GFX10-NEXT: v_div_fixup_f16 v1, v1, v0, 1.0 op_sel:[0,1,0,0] +; GFX10-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -1060,39 +1050,37 @@ ; GFX9-LABEL: v_rcp_v2f16_arcp: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX9-NEXT: v_cvt_f32_f16_e32 v4, 1.0 +; GFX9-NEXT: v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-NEXT: v_cvt_f32_f16_e32 v3, 1.0 ; GFX9-NEXT: v_rcp_f32_e32 v1, v1 -; GFX9-NEXT: v_rcp_f32_e32 v3, v3 -; GFX9-NEXT: v_mul_f32_e32 v1, v4, v1 -; GFX9-NEXT: v_mul_f32_e32 v3, v4, v3 +; GFX9-NEXT: v_rcp_f32_e32 v2, v2 +; GFX9-NEXT: v_mul_f32_e32 v1, v3, v1 +; GFX9-NEXT: v_mul_f32_e32 v2, v3, v2 ; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX9-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 -; GFX9-NEXT: v_div_fixup_f16 v1, v3, v2, 1.0 +; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX9-NEXT: v_div_fixup_f16 v1, v1, v0, 1.0 +; GFX9-NEXT: 
v_div_fixup_f16 v0, v2, v0, 1.0 op_sel:[0,1,0,0] ; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_and_or_b32 v0, v1, v2, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_rcp_v2f16_arcp: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX10-NEXT: v_cvt_f32_f16_e32 v4, 1.0 -; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GFX10-NEXT: v_rcp_f32_e32 v3, v3 +; GFX10-NEXT: v_cvt_f32_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GFX10-NEXT: v_cvt_f32_f16_e32 v3, 1.0 +; GFX10-NEXT: v_rcp_f32_e32 v1, v1 ; GFX10-NEXT: v_rcp_f32_e32 v2, v2 -; GFX10-NEXT: v_mul_f32_e32 v3, v4, v3 -; GFX10-NEXT: v_mul_f32_e32 v2, v4, v2 -; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX10-NEXT: v_mul_f32_e32 v1, v3, v1 +; GFX10-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX10-NEXT: v_div_fixup_f16 v0, v3, v0, 1.0 -; GFX10-NEXT: v_div_fixup_f16 v1, v2, v1, 1.0 +; GFX10-NEXT: v_div_fixup_f16 v1, v1, v0, 1.0 op_sel:[0,1,0,0] +; GFX10-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -1399,43 +1387,39 @@ ; GFX9-LABEL: v_fdiv_v2f16_arcp_ulp25: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX9-NEXT: v_cvt_f32_f16_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-NEXT: v_cvt_f32_f16_e32 v4, v0 +; GFX9-NEXT: v_cvt_f32_f16_sdwa v5, v0 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX9-NEXT: v_rcp_f32_e32 v2, v2 -; GFX9-NEXT: v_cvt_f32_f16_e32 v7, v6 -; GFX9-NEXT: v_rcp_f32_e32 v5, v5 -; GFX9-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX9-NEXT: v_rcp_f32_e32 v3, v3 +; GFX9-NEXT: v_mul_f32_e32 v2, v4, v2 +; GFX9-NEXT: v_mul_f32_e32 v3, v5, v3 ; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX9-NEXT: v_mul_f32_e32 v3, v7, v5 ; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX9-NEXT: v_div_fixup_f16 v0, v2, v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX9-NEXT: v_div_fixup_f16 v1, v3, v4, v6 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v1 +; GFX9-NEXT: v_div_fixup_f16 v2, v2, v1, v0 +; GFX9-NEXT: v_div_fixup_f16 v0, v3, v1, v0 op_sel:[0,1,1,0] +; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_and_or_b32 v0, v2, v1, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fdiv_v2f16_arcp_ulp25: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX10-NEXT: v_cvt_f32_f16_e32 v7, v0 -; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX10-NEXT: v_rcp_f32_e32 v4, v4 -; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX10-NEXT: v_cvt_f32_f16_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX10-NEXT: v_cvt_f32_f16_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v0 +; GFX10-NEXT: v_rcp_f32_e32 v2, v2 ; GFX10-NEXT: v_rcp_f32_e32 v3, v3 -; GFX10-NEXT: v_mul_f32_e32 v4, v7, v4 -; GFX10-NEXT: v_mul_f32_e32 v3, v6, v3 -; GFX10-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX10-NEXT: v_mul_f32_e32 v2, v4, v2 +; GFX10-NEXT: v_mul_f32_e32 v3, v5, v3 +; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX10-NEXT: v_div_fixup_f16 v0, v4, 
v1, v0 -; GFX10-NEXT: v_div_fixup_f16 v2, v3, v2, v5 +; GFX10-NEXT: v_div_fixup_f16 v2, v2, v1, v0 op_sel:[0,1,1,0] +; GFX10-NEXT: v_div_fixup_f16 v0, v3, v1, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] Index: llvm/test/CodeGen/AMDGPU/GlobalISel/vop3-op-sel.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/GlobalISel/vop3-op-sel.ll @@ -0,0 +1,88 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx1010 -global-isel -verify-machineinstrs < %s | FileCheck %s + +declare half @llvm.fma.f16(half, half, half) +declare half @llvm.fabs.f16(half) +declare <2 x half> @llvm.fabs.v2f16(<2 x half>) + +define half @neg_with_opsel_after(half %x, half %y, <2 x half> %vecz) { +; CHECK-LABEL: neg_with_opsel_after: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_waitcnt_vscnt null, 0x0 +; CHECK-NEXT: v_fma_f16 v0, -v0, v1, -v2 op_sel:[0,0,1,0] +; CHECK-NEXT: s_setpc_b64 s[30:31] + %negX = fneg half %x + %negvecz = fneg <2 x half> %vecz + %z = extractelement <2 x half> %negvecz, i32 1 + %res = call half @llvm.fma.f16(half %negX, half %y, half %z) + ret half %res +} + +define half @neg_with_opsel_before(half %x, half %y, <2 x half> %vecz) { +; CHECK-LABEL: neg_with_opsel_before: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_waitcnt_vscnt null, 0x0 +; CHECK-NEXT: v_fma_f16 v0, v0, v1, -v2 op_sel:[0,0,1,0] +; CHECK-NEXT: s_setpc_b64 s[30:31] + %z = extractelement <2 x half> %vecz, i32 1 + %negz = fneg half %z + %res = call half @llvm.fma.f16(half %x, half %y, half %negz) + ret half %res +} + +define half @shuffle_with_two_negs(half %x, half %y, <2 x half> %vecz) { +; CHECK-LABEL: shuffle_with_two_negs: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: 
s_waitcnt_vscnt null, 0x0 +; CHECK-NEXT: v_fma_f16 v0, v0, v1, v2 op_sel:[0,0,1,0] +; CHECK-NEXT: s_setpc_b64 s[30:31] + %negvecz = fneg <2 x half> %vecz + %shufflevecz = shufflevector <2 x half> %negvecz, <2 x half> undef, <2 x i32> <i32 1, i32 0> + %z = extractelement <2 x half> %shufflevecz, i32 0 + %negz = fneg half %z + %res = call half @llvm.fma.f16(half %x, half %y, half %negz) + ret half %res +} + +define half @abs_with_opsel_after(half %x, half %y, <2 x half> %vecz) { +; CHECK-LABEL: abs_with_opsel_after: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_waitcnt_vscnt null, 0x0 +; CHECK-NEXT: v_fma_f16 v0, v0, v1, |v2| op_sel:[0,0,1,0] +; CHECK-NEXT: s_setpc_b64 s[30:31] + %absvecz = call <2 x half> @llvm.fabs.v2f16(<2 x half> %vecz) + %z = extractelement <2 x half> %absvecz, i32 1 + %res = call half @llvm.fma.f16(half %x, half %y, half %z) + ret half %res +} + +define half @abs_with_opsel_before(half %x, half %y, <2 x half> %vecz) { +; CHECK-LABEL: abs_with_opsel_before: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_waitcnt_vscnt null, 0x0 +; CHECK-NEXT: v_fma_f16 v0, v0, v1, |v2| op_sel:[0,0,1,0] +; CHECK-NEXT: s_setpc_b64 s[30:31] + %z = extractelement <2 x half> %vecz, i32 1 + %absz = call half @llvm.fabs.f16(half %z) + %res = call half @llvm.fma.f16(half %x, half %y, half %absz) + ret half %res +} + +define half @neg_before_abs(half %x, half %y, <2 x half> %vecz) { +; CHECK-LABEL: neg_before_abs: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_waitcnt_vscnt null, 0x0 +; CHECK-NEXT: v_fma_f16 v0, v0, v1, |v2| op_sel:[0,0,1,0] +; CHECK-NEXT: s_setpc_b64 s[30:31] + %negvecz = fneg <2 x half> %vecz + %z = extractelement <2 x half> %negvecz, i32 1 + %absz = call half @llvm.fabs.f16(half %z) + %res = call half @llvm.fma.f16(half %x, half %y, half %absz) + ret half %res +}