Index: llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -167,8 +167,6 @@ InstructionSelector::ComplexRendererFns selectVOP3BMods(MachineOperand &Root) const; - ComplexRendererFns selectVOP3NoMods(MachineOperand &Root) const; - InstructionSelector::ComplexRendererFns selectVOP3Mods_nnan(MachineOperand &Root) const; @@ -181,6 +179,12 @@ InstructionSelector::ComplexRendererFns selectVOP3OpSelMods(MachineOperand &Root) const; + std::pair<Register, unsigned> + selectVOP3OpSelModsImpl(MachineOperand &Root) const; + + InstructionSelector::ComplexRendererFns + selectVOP3NoMods(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns selectSmrdImm(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns Index: llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -3393,18 +3393,6 @@ }}; } -InstructionSelector::ComplexRendererFns -AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const { - Register Reg = Root.getReg(); - const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI); - if (Def && (Def->getOpcode() == AMDGPU::G_FNEG || - Def->getOpcode() == AMDGPU::G_FABS)) - return {}; - return {{ - [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); }, - }}; -} - static Register stripBitcast(const MachineRegisterInfo &MRI, Register Src) { const MachineInstr *MI = MRI.getVRegDef(Src); return MI->getOpcode() == AMDGPU::G_BITCAST ? 
MI->getOperand(1).getReg() @@ -3550,12 +3538,141 @@ }}; } +static Register isExtractHiElt(const MachineRegisterInfo &MRI, Register In) { + if (MRI.getType(In) == LLT::scalar(16)) + if (!mi_match(In, MRI, m_GTrunc(m_Reg(In)))) + return Register(); + + Register Tmp; + if (MRI.getType(In) == LLT::scalar(32) && + mi_match(In, MRI, m_GLShr(m_Reg(Tmp), m_SpecificICst(16)))) + return Tmp; + return Register(); +} + InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const { - // FIXME: Handle op_sel + Register Src; + unsigned Mods; + std::tie(Src, Mods) = selectVOP3OpSelModsImpl(Root); + + return {{ + [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods + }}; +} + +static void selectVOPOpSelModsForInst(const MachineRegisterInfo &MRI, + Register &OldSrc, Register &Src, + unsigned &Mods) { + MachineInstr *MI = MRI.getVRegDef(Src); + unsigned Opcode = MI->getOpcode(); + + // Look through no-op instructions + if (Opcode == AMDGPU::G_BITCAST) { + // Strip bitcast + Src = MI->getOperand(1).getReg(); + } else if (Opcode == AMDGPU::G_TRUNC) { + // Strip trunc(bitcast(x)) + MachineInstr *TmpMI = MRI.getVRegDef(MI->getOperand(1).getReg()); + if (TmpMI->getOpcode() == AMDGPU::G_BITCAST) + Src = TmpMI->getOperand(1).getReg(); + } + + // FIXME: Add combine for eliminating consecutive bitcasts. + Src = stripBitcast(MRI, Src); + + // Look through 32bit registers (before bitcast to <2 x 16>) that are formed + // from low 16 bits of two other registers. + Register LoSrc, HiSrc; + if (MRI.getType(Src) == LLT::scalar(32) && + mi_match(Src, MRI, + m_GOr(m_GAnd(m_Reg(LoSrc), m_SpecificICst(0xffff)), + m_GShl(m_Reg(HiSrc), m_SpecificICst(16))))) { + Src = Mods & SISrcMods::OP_SEL_0 ? HiSrc : LoSrc; + Mods &= ~SISrcMods::OP_SEL_0; + } + + if (OldSrc != Src) { + OldSrc = Src; // If no mods are selected below we do not need to enter + // next iteration. 
+ MI = MRI.getVRegDef(Src); + Opcode = MI->getOpcode(); + } + + LLT SrcTy = MRI.getType(Src); + + // Select source modifiers + if (Opcode == AMDGPU::G_FNEG) { + if (SrcTy == LLT::scalar(16) || SrcTy == LLT::fixed_vector(2, 16)) { + Src = MI->getOperand(1).getReg(); + // If there is an abs after neg, then this neg can be ignored. + if (!(Mods & SISrcMods::ABS)) + Mods ^= SISrcMods::NEG; + } + } else if (Opcode == AMDGPU::G_FABS) { + if (SrcTy == LLT::scalar(16) || SrcTy == LLT::fixed_vector(2, 16)) { + Src = MI->getOperand(1).getReg(); + Mods |= SISrcMods::ABS; + } + } else if (Register NewSrc = isExtractHiElt(MRI, Src)) { + Src = NewSrc; + Mods ^= SISrcMods::OP_SEL_0; + } else if (Opcode == AMDGPU::G_SHUFFLE_VECTOR) { + ArrayRef<int> ShufMask = MI->getOperand(3).getShuffleMask(); + if (ShufMask.size() == 2 && SrcTy == LLT::fixed_vector(2, 16)) { + unsigned Idx = Mods & SISrcMods::OP_SEL_0 ? ShufMask[1] : ShufMask[0]; + + // Odd value means we're taking hi part of pre-shuffled <2 x 16> register, + // even means low part. Set op_sel modifier accordingly. + if (Idx & 1) + Mods |= SISrcMods::OP_SEL_0; + else + Mods &= ~SISrcMods::OP_SEL_0; + + Src = (Idx & 2) ? MI->getOperand(2).getReg() : MI->getOperand(1).getReg(); + } + } +} + +std::pair<Register, unsigned> +AMDGPUInstructionSelector::selectVOP3OpSelModsImpl(MachineOperand &Root) const { + unsigned Mods = 0; + Register Src = Root.getReg(); + + // Instructions that can be folded into source modifiers can appear an arbitrary + // number of times and in arbitrary order. 
+ Register OldSrc; + while (OldSrc != Src) { + OldSrc = Src; + + selectVOPOpSelModsForInst(*MRI, OldSrc, Src, Mods); + } + + return std::make_pair(Src, Mods); +} + +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const { + Register Reg = Root.getReg(); + + if (MRI->getType(Reg) == LLT::scalar(16)) { + // Run first iteration for selecting source modifiers for VOP3OpSel + Register Src = Reg; + Register OldSrc = Src; + unsigned Mods = 0; + selectVOPOpSelModsForInst(*MRI, OldSrc, Src, Mods); + if (Mods != 0) + return {}; + } else { + const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI); + if (Def && (Def->getOpcode() == AMDGPU::G_FNEG || + Def->getOpcode() == AMDGPU::G_FABS)) + return {}; + } + return {{ - [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); }, - [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // src_mods + [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); }, }}; } Index: llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll @@ -114,8 +114,7 @@ ; GFX9-CONTRACT-LABEL: test_half_sub_mul: ; GFX9-CONTRACT: ; %bb.0: ; %.entry ; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-CONTRACT-NEXT: v_xor_b32_e32 v2, 0x8000, v2 -; GFX9-CONTRACT-NEXT: v_fma_f16 v0, v0, v1, v2 +; GFX9-CONTRACT-NEXT: v_fma_f16 v0, v0, v1, -v2 ; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-DENORM-LABEL: test_half_sub_mul: @@ -136,8 +135,7 @@ ; GFX10-CONTRACT: ; %bb.0: ; %.entry ; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-CONTRACT-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CONTRACT-NEXT: v_xor_b32_e32 v2, 0x8000, v2 -; GFX10-CONTRACT-NEXT: v_fma_f16 v0, v0, v1, v2 +; GFX10-CONTRACT-NEXT: v_fma_f16 v0, v0, v1, -v2 ; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31] ; ; 
GFX10-DENORM-LABEL: test_half_sub_mul: @@ -164,8 +162,7 @@ ; GFX9-CONTRACT-LABEL: test_half_sub_mul_rhs: ; GFX9-CONTRACT: ; %bb.0: ; %.entry ; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-CONTRACT-NEXT: v_xor_b32_e32 v0, 0x8000, v0 -; GFX9-CONTRACT-NEXT: v_fma_f16 v0, v0, v1, v2 +; GFX9-CONTRACT-NEXT: v_fma_f16 v0, -v0, v1, v2 ; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-DENORM-LABEL: test_half_sub_mul_rhs: @@ -186,8 +183,7 @@ ; GFX10-CONTRACT: ; %bb.0: ; %.entry ; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-CONTRACT-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CONTRACT-NEXT: v_xor_b32_e32 v0, 0x8000, v0 -; GFX10-CONTRACT-NEXT: v_fma_f16 v0, v0, v1, v2 +; GFX10-CONTRACT-NEXT: v_fma_f16 v0, -v0, v1, v2 ; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-DENORM-LABEL: test_half_sub_mul_rhs: Index: llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll @@ -66,10 +66,7 @@ ; GFX9-CONTRACT-LABEL: test_f16_sub_ext_neg_mul: ; GFX9-CONTRACT: ; %bb.0: ; %entry ; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-CONTRACT-NEXT: s_mov_b32 s4, 0x8000 -; GFX9-CONTRACT-NEXT: v_xor_b32_e32 v1, s4, v1 -; GFX9-CONTRACT-NEXT: v_xor_b32_e32 v2, s4, v2 -; GFX9-CONTRACT-NEXT: v_fma_f16 v0, v0, v1, v2 +; GFX9-CONTRACT-NEXT: v_fma_f16 v0, v0, -v1, -v2 ; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-DENORM-LABEL: test_f16_sub_ext_neg_mul: @@ -90,10 +87,7 @@ ; GFX10-CONTRACT: ; %bb.0: ; %entry ; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-CONTRACT-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CONTRACT-NEXT: s_mov_b32 s4, 0x8000 -; GFX10-CONTRACT-NEXT: v_xor_b32_e32 v1, s4, v1 -; GFX10-CONTRACT-NEXT: v_xor_b32_e32 v2, s4, v2 -; GFX10-CONTRACT-NEXT: v_fma_f16 v0, v0, v1, v2 +; 
GFX10-CONTRACT-NEXT: v_fma_f16 v0, v0, -v1, -v2 ; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-DENORM-LABEL: test_f16_sub_ext_neg_mul: Index: llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll @@ -592,43 +592,39 @@ ; GFX9-LABEL: v_fdiv_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX9-NEXT: v_cvt_f32_f16_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-NEXT: v_cvt_f32_f16_e32 v4, v0 +; GFX9-NEXT: v_cvt_f32_f16_sdwa v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX9-NEXT: v_rcp_f32_e32 v2, v2 -; GFX9-NEXT: v_cvt_f32_f16_e32 v7, v6 -; GFX9-NEXT: v_rcp_f32_e32 v5, v5 -; GFX9-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX9-NEXT: v_rcp_f32_e32 v3, v3 +; GFX9-NEXT: v_mul_f32_e32 v2, v4, v2 +; GFX9-NEXT: v_mul_f32_e32 v3, v5, v3 ; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX9-NEXT: v_mul_f32_e32 v3, v7, v5 ; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX9-NEXT: v_div_fixup_f16 v0, v2, v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX9-NEXT: v_div_fixup_f16 v1, v3, v4, v6 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v1 +; GFX9-NEXT: v_div_fixup_f16 v2, v2, v1, v0 +; GFX9-NEXT: v_div_fixup_f16 v0, v3, v1, v0 op_sel:[0,1,1,0] +; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_and_or_b32 v0, v2, v1, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fdiv_v2f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v1 -; 
GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX10-NEXT: v_cvt_f32_f16_e32 v7, v0 -; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX10-NEXT: v_rcp_f32_e32 v4, v4 -; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX10-NEXT: v_cvt_f32_f16_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX10-NEXT: v_cvt_f32_f16_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v0 +; GFX10-NEXT: v_rcp_f32_e32 v2, v2 ; GFX10-NEXT: v_rcp_f32_e32 v3, v3 -; GFX10-NEXT: v_mul_f32_e32 v4, v7, v4 -; GFX10-NEXT: v_mul_f32_e32 v3, v6, v3 -; GFX10-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX10-NEXT: v_mul_f32_e32 v2, v4, v2 +; GFX10-NEXT: v_mul_f32_e32 v3, v5, v3 +; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX10-NEXT: v_div_fixup_f16 v0, v4, v1, v0 -; GFX10-NEXT: v_div_fixup_f16 v2, v3, v2, v5 +; GFX10-NEXT: v_div_fixup_f16 v2, v2, v1, v0 op_sel:[0,1,1,0] +; GFX10-NEXT: v_div_fixup_f16 v0, v3, v1, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -786,43 +782,39 @@ ; GFX9-LABEL: v_fdiv_v2f16_ulp25: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX9-NEXT: v_cvt_f32_f16_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-NEXT: v_cvt_f32_f16_e32 v4, v0 +; GFX9-NEXT: v_cvt_f32_f16_sdwa v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX9-NEXT: v_rcp_f32_e32 v2, v2 -; GFX9-NEXT: v_cvt_f32_f16_e32 v7, v6 -; GFX9-NEXT: v_rcp_f32_e32 v5, v5 -; GFX9-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX9-NEXT: v_rcp_f32_e32 v3, v3 +; GFX9-NEXT: v_mul_f32_e32 v2, v4, v2 +; GFX9-NEXT: v_mul_f32_e32 v3, v5, v3 ; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v2 -; 
GFX9-NEXT: v_mul_f32_e32 v3, v7, v5 ; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX9-NEXT: v_div_fixup_f16 v0, v2, v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX9-NEXT: v_div_fixup_f16 v1, v3, v4, v6 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v1 +; GFX9-NEXT: v_div_fixup_f16 v2, v2, v1, v0 +; GFX9-NEXT: v_div_fixup_f16 v0, v3, v1, v0 op_sel:[0,1,1,0] +; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_and_or_b32 v0, v2, v1, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fdiv_v2f16_ulp25: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX10-NEXT: v_cvt_f32_f16_e32 v7, v0 -; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX10-NEXT: v_rcp_f32_e32 v4, v4 -; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX10-NEXT: v_cvt_f32_f16_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX10-NEXT: v_cvt_f32_f16_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v0 +; GFX10-NEXT: v_rcp_f32_e32 v2, v2 ; GFX10-NEXT: v_rcp_f32_e32 v3, v3 -; GFX10-NEXT: v_mul_f32_e32 v4, v7, v4 -; GFX10-NEXT: v_mul_f32_e32 v3, v6, v3 -; GFX10-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX10-NEXT: v_mul_f32_e32 v2, v4, v2 +; GFX10-NEXT: v_mul_f32_e32 v3, v5, v3 +; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX10-NEXT: v_div_fixup_f16 v0, v4, v1, v0 -; GFX10-NEXT: v_div_fixup_f16 v2, v3, v2, v5 +; GFX10-NEXT: v_div_fixup_f16 v2, v2, v1, v0 op_sel:[0,1,1,0] +; GFX10-NEXT: v_div_fixup_f16 v0, v3, v1, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -925,39 +917,37 @@ ; GFX9-LABEL: v_rcp_v2f16: ; GFX9: ; %bb.0: ; 
GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX9-NEXT: v_cvt_f32_f16_e32 v4, 1.0 +; GFX9-NEXT: v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-NEXT: v_cvt_f32_f16_e32 v3, 1.0 ; GFX9-NEXT: v_rcp_f32_e32 v1, v1 -; GFX9-NEXT: v_rcp_f32_e32 v3, v3 -; GFX9-NEXT: v_mul_f32_e32 v1, v4, v1 -; GFX9-NEXT: v_mul_f32_e32 v3, v4, v3 +; GFX9-NEXT: v_rcp_f32_e32 v2, v2 +; GFX9-NEXT: v_mul_f32_e32 v1, v3, v1 +; GFX9-NEXT: v_mul_f32_e32 v2, v3, v2 ; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX9-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 -; GFX9-NEXT: v_div_fixup_f16 v1, v3, v2, 1.0 +; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX9-NEXT: v_div_fixup_f16 v1, v1, v0, 1.0 +; GFX9-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0 op_sel:[0,1,0,0] ; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_and_or_b32 v0, v1, v2, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_rcp_v2f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX10-NEXT: v_cvt_f32_f16_e32 v4, 1.0 -; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GFX10-NEXT: v_rcp_f32_e32 v3, v3 +; GFX10-NEXT: v_cvt_f32_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GFX10-NEXT: v_cvt_f32_f16_e32 v3, 1.0 +; GFX10-NEXT: v_rcp_f32_e32 v1, v1 ; GFX10-NEXT: v_rcp_f32_e32 v2, v2 -; GFX10-NEXT: v_mul_f32_e32 v3, v4, v3 -; GFX10-NEXT: v_mul_f32_e32 v2, v4, v2 -; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX10-NEXT: v_mul_f32_e32 v1, v3, v1 +; GFX10-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1 ; 
GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX10-NEXT: v_div_fixup_f16 v0, v3, v0, 1.0 -; GFX10-NEXT: v_div_fixup_f16 v1, v2, v1, 1.0 +; GFX10-NEXT: v_div_fixup_f16 v1, v1, v0, 1.0 op_sel:[0,1,0,0] +; GFX10-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -1060,39 +1050,37 @@ ; GFX9-LABEL: v_rcp_v2f16_arcp: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX9-NEXT: v_cvt_f32_f16_e32 v4, 1.0 +; GFX9-NEXT: v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-NEXT: v_cvt_f32_f16_e32 v3, 1.0 ; GFX9-NEXT: v_rcp_f32_e32 v1, v1 -; GFX9-NEXT: v_rcp_f32_e32 v3, v3 -; GFX9-NEXT: v_mul_f32_e32 v1, v4, v1 -; GFX9-NEXT: v_mul_f32_e32 v3, v4, v3 +; GFX9-NEXT: v_rcp_f32_e32 v2, v2 +; GFX9-NEXT: v_mul_f32_e32 v1, v3, v1 +; GFX9-NEXT: v_mul_f32_e32 v2, v3, v2 ; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX9-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 -; GFX9-NEXT: v_div_fixup_f16 v1, v3, v2, 1.0 +; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX9-NEXT: v_div_fixup_f16 v1, v1, v0, 1.0 +; GFX9-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0 op_sel:[0,1,0,0] ; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_and_or_b32 v0, v1, v2, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_rcp_v2f16_arcp: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX10-NEXT: v_cvt_f32_f16_e32 v4, 1.0 -; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GFX10-NEXT: v_rcp_f32_e32 v3, v3 +; GFX10-NEXT: v_cvt_f32_f16_sdwa v1, v0 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GFX10-NEXT: v_cvt_f32_f16_e32 v3, 1.0 +; GFX10-NEXT: v_rcp_f32_e32 v1, v1 ; GFX10-NEXT: v_rcp_f32_e32 v2, v2 -; GFX10-NEXT: v_mul_f32_e32 v3, v4, v3 -; GFX10-NEXT: v_mul_f32_e32 v2, v4, v2 -; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX10-NEXT: v_mul_f32_e32 v1, v3, v1 +; GFX10-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX10-NEXT: v_div_fixup_f16 v0, v3, v0, 1.0 -; GFX10-NEXT: v_div_fixup_f16 v1, v2, v1, 1.0 +; GFX10-NEXT: v_div_fixup_f16 v1, v1, v0, 1.0 op_sel:[0,1,0,0] +; GFX10-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -1399,43 +1387,39 @@ ; GFX9-LABEL: v_fdiv_v2f16_arcp_ulp25: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX9-NEXT: v_cvt_f32_f16_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-NEXT: v_cvt_f32_f16_e32 v4, v0 +; GFX9-NEXT: v_cvt_f32_f16_sdwa v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX9-NEXT: v_rcp_f32_e32 v2, v2 -; GFX9-NEXT: v_cvt_f32_f16_e32 v7, v6 -; GFX9-NEXT: v_rcp_f32_e32 v5, v5 -; GFX9-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX9-NEXT: v_rcp_f32_e32 v3, v3 +; GFX9-NEXT: v_mul_f32_e32 v2, v4, v2 +; GFX9-NEXT: v_mul_f32_e32 v3, v5, v3 ; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX9-NEXT: v_mul_f32_e32 v3, v7, v5 ; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX9-NEXT: v_div_fixup_f16 v0, v2, v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX9-NEXT: v_div_fixup_f16 v1, v3, v4, v6 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v1 +; GFX9-NEXT: v_div_fixup_f16 v2, v2, 
v1, v0 +; GFX9-NEXT: v_div_fixup_f16 v0, v3, v1, v0 op_sel:[0,1,1,0] +; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_and_or_b32 v0, v2, v1, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fdiv_v2f16_arcp_ulp25: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX10-NEXT: v_cvt_f32_f16_e32 v7, v0 -; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX10-NEXT: v_rcp_f32_e32 v4, v4 -; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX10-NEXT: v_cvt_f32_f16_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX10-NEXT: v_cvt_f32_f16_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v0 +; GFX10-NEXT: v_rcp_f32_e32 v2, v2 ; GFX10-NEXT: v_rcp_f32_e32 v3, v3 -; GFX10-NEXT: v_mul_f32_e32 v4, v7, v4 -; GFX10-NEXT: v_mul_f32_e32 v3, v6, v3 -; GFX10-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX10-NEXT: v_mul_f32_e32 v2, v4, v2 +; GFX10-NEXT: v_mul_f32_e32 v3, v5, v3 +; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX10-NEXT: v_div_fixup_f16 v0, v4, v1, v0 -; GFX10-NEXT: v_div_fixup_f16 v2, v3, v2, v5 +; GFX10-NEXT: v_div_fixup_f16 v2, v2, v1, v0 op_sel:[0,1,1,0] +; GFX10-NEXT: v_div_fixup_f16 v0, v3, v1, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] Index: llvm/test/CodeGen/AMDGPU/GlobalISel/vop3-op-sel.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/GlobalISel/vop3-op-sel.ll @@ -0,0 +1,156 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx1010 -global-isel -verify-machineinstrs < %s | FileCheck %s + 
+declare half @llvm.fma.f16(half, half, half) +declare half @llvm.fabs.f16(half) +declare float @llvm.fabs.f32(float) +declare <2 x half> @llvm.fabs.v2f16(<2 x half>) + +define half @neg_with_opsel_after(half %x, half %y, <2 x half> %vecz) { +; CHECK-LABEL: neg_with_opsel_after: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_waitcnt_vscnt null, 0x0 +; CHECK-NEXT: v_fma_f16 v0, -v0, v1, -v2 op_sel:[0,0,1,0] +; CHECK-NEXT: s_setpc_b64 s[30:31] + %negX = fneg half %x + %negvecz = fneg <2 x half> %vecz + %z = extractelement <2 x half> %negvecz, i32 1 + %res = call half @llvm.fma.f16(half %negX, half %y, half %z) + ret half %res +} + +define half @neg_with_opsel_before(half %x, half %y, <2 x half> %vecz) { +; CHECK-LABEL: neg_with_opsel_before: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_waitcnt_vscnt null, 0x0 +; CHECK-NEXT: v_fma_f16 v0, v0, v1, -v2 op_sel:[0,0,1,0] +; CHECK-NEXT: s_setpc_b64 s[30:31] + %z = extractelement <2 x half> %vecz, i32 1 + %negz = fneg half %z + %res = call half @llvm.fma.f16(half %x, half %y, half %negz) + ret half %res +} + +define half @shuffle_with_two_negs(half %x, half %y, <2 x half> %vecz) { +; CHECK-LABEL: shuffle_with_two_negs: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_waitcnt_vscnt null, 0x0 +; CHECK-NEXT: v_fma_f16 v0, v0, v1, v2 op_sel:[0,0,1,0] +; CHECK-NEXT: s_setpc_b64 s[30:31] + %negvecz = fneg <2 x half> %vecz + %shufflevecz = shufflevector <2 x half> %negvecz, <2 x half> undef, <2 x i32> <i32 1, i32 0> + %z = extractelement <2 x half> %shufflevecz, i32 0 + %negz = fneg half %z + %res = call half @llvm.fma.f16(half %x, half %y, half %negz) + ret half %res +} + +define half @abs_with_opsel_after(half %x, half %y, <2 x half> %vecz) { +; CHECK-LABEL: abs_with_opsel_after: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_waitcnt_vscnt null, 0x0 +; CHECK-NEXT: 
v_fma_f16 v0, v0, v1, |v2| op_sel:[0,0,1,0] +; CHECK-NEXT: s_setpc_b64 s[30:31] + %absvecz = call <2 x half> @llvm.fabs.v2f16(<2 x half> %vecz) + %z = extractelement <2 x half> %absvecz, i32 1 + %res = call half @llvm.fma.f16(half %x, half %y, half %z) + ret half %res +} + +define half @abs_with_opsel_before(half %x, half %y, <2 x half> %vecz) { +; CHECK-LABEL: abs_with_opsel_before: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_waitcnt_vscnt null, 0x0 +; CHECK-NEXT: v_fma_f16 v0, v0, v1, |v2| op_sel:[0,0,1,0] +; CHECK-NEXT: s_setpc_b64 s[30:31] + %z = extractelement <2 x half> %vecz, i32 1 + %absz = call half @llvm.fabs.f16(half %z) + %res = call half @llvm.fma.f16(half %x, half %y, half %absz) + ret half %res +} + +define half @neg_before_abs(half %x, half %y, <2 x half> %vecz) { +; CHECK-LABEL: neg_before_abs: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_waitcnt_vscnt null, 0x0 +; CHECK-NEXT: v_fma_f16 v0, v0, v1, |v2| op_sel:[0,0,1,0] +; CHECK-NEXT: s_setpc_b64 s[30:31] + %negvecz = fneg <2 x half> %vecz + %z = extractelement <2 x half> %negvecz, i32 1 + %absz = call half @llvm.fabs.f16(half %z) + %res = call half @llvm.fma.f16(half %x, half %y, half %absz) + ret half %res +} + +; FIXME: These fneg and shufflevector instructions should be eliminated before +; instruction select so we can use vop2 instruction instead. 
+define half @redundant_modifiers(<2 x half> %z, half %x, half %y) { +; CHECK-LABEL: redundant_modifiers: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_waitcnt_vscnt null, 0x0 +; CHECK-NEXT: v_fma_f16 v0, v1, v2, v0 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %z1 = fneg <2 x half> %z + %z2 = shufflevector <2 x half> %z1, <2 x half> undef, <2 x i32> <i32 1, i32 0> + %z3 = fneg <2 x half> %z2 + %z4 = shufflevector <2 x half> %z3, <2 x half> undef, <2 x i32> <i32 1, i32 0> + %z5 = extractelement <2 x half> %z4, i32 0 + %res = call half @llvm.fma.f16(half %x, half %y, half %z5) + ret half %res +} + +; Do not fold f32 neg and abs +define half @type_check_fneg_fabs(float %x, float %y, <2 x half> %vecz) { +; CHECK-LABEL: type_check_fneg_fabs: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_waitcnt_vscnt null, 0x0 +; CHECK-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; CHECK-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; CHECK-NEXT: v_fma_f16 v0, v0, v1, -|v2| op_sel:[1,1,1,0] +; CHECK-NEXT: s_setpc_b64 s[30:31] + %negX = fneg float %x + %vecX = bitcast float %negX to <2 x half> + %hiX = extractelement <2 x half> %vecX, i32 1 + + %absY = call float @llvm.fabs.f32(float %y) + %vecY = bitcast float %absY to <2 x half> + %hiY = extractelement <2 x half> %vecY, i32 1 + + %absZ = call <2 x half> @llvm.fabs.v2f16(<2 x half> %vecz) + %hiZ = extractelement <2 x half> %absZ, i32 1 + %negZ = fneg half %hiZ + + %res = call half @llvm.fma.f16(half %hiX, half %hiY, half %negZ) + ret half %res +} + +define half @shuffle_tests(<2 x half> %vecx, <2 x half> %vecy, <2 x half> %vecz, <2 x half> %vecz2) { +; CHECK-LABEL: shuffle_tests: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_waitcnt_vscnt null, 0x0 +; CHECK-NEXT: v_fma_f16 v0, -|v0|, -v1, v3 op_sel:[0,1,1,0] +; CHECK-NEXT: s_setpc_b64 s[30:31] + %absvecx = call <2 x half> @llvm.fabs.v2f16(<2 x half> %vecx) + %newvecx = shufflevector <2 x half> undef, 
<2 x half> %absvecx, <2 x i32> <i32 3, i32 2> + %negvecx = fneg <2 x half> %newvecx + %x = extractelement <2 x half> %negvecx, i32 1 + + %newvecy = shufflevector <2 x half> %vecy, <2 x half> undef, <2 x i32> <i32 1, i32 1> + %negvecy = fneg <2 x half> %newvecy + %y = extractelement <2 x half> %negvecy, i32 1 + + %newvecz = shufflevector <2 x half> %vecz, <2 x half> %vecz2, <2 x i32> <i32 3, i32 0> + %z = extractelement <2 x half> %newvecz, i32 0 + + %res = call half @llvm.fma.f16(half %x, half %y, half %z) + ret half %res +}