Index: llvm/lib/Target/AMDGPU/AMDGPUGISel.td
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -47,6 +47,14 @@
     GIComplexOperandMatcher<s32, "selectVOP3OMods">,
     GIComplexPatternEquiv<VOP3OMods>;
 
+def gi_vop3pmods :
+    GIComplexOperandMatcher<s32, "selectVOP3PMods">,
+    GIComplexPatternEquiv<VOP3PMods>;
+
+def gi_vop3pmods0 :
+    GIComplexOperandMatcher<s32, "selectVOP3PMods0">,
+    GIComplexPatternEquiv<VOP3PMods0>;
+
 def gi_vop3opselmods :
     GIComplexOperandMatcher<s32, "selectVOP3OpSelMods">,
     GIComplexPatternEquiv<VOP3OpSelMods>;
Index: llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -147,6 +147,15 @@
   InstructionSelector::ComplexRendererFns
   selectVOP3Mods_nnan(MachineOperand &Root) const;
 
+  std::pair<Register, unsigned>
+  selectVOP3PModsImpl(Register Src, const MachineRegisterInfo &MRI) const;
+
+  InstructionSelector::ComplexRendererFns
+  selectVOP3PMods(MachineOperand &Root) const;
+
+  InstructionSelector::ComplexRendererFns
+  selectVOP3PMods0(MachineOperand &Root) const;
+
   InstructionSelector::ComplexRendererFns
   selectVOP3OpSelMods0(MachineOperand &Root) const;
   InstructionSelector::ComplexRendererFns
Index: llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -2387,6 +2387,60 @@
   }};
 }
 
+std::pair<Register, unsigned>
+AMDGPUInstructionSelector::selectVOP3PModsImpl(
+  Register Src, const MachineRegisterInfo &MRI) const {
+  unsigned Mods = 0;
+  MachineInstr *MI = MRI.getVRegDef(Src);
+
+  if (MI && MI->getOpcode() == AMDGPU::G_FNEG) {
+    Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
+    Src = MI->getOperand(1).getReg();
+    MI = MRI.getVRegDef(Src);
+  }
+
+  // TODO: Match op_sel through g_build_vector_trunc and g_shuffle_vector.
+
+  // Packed instructions do not have abs modifiers.
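+  // OP_SEL_1 is the neutral op_sel_hi encoding: the high half of the result
+  // reads the high half of each source operand.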
+  Mods |= SISrcMods::OP_SEL_1;
+
+  return std::make_pair(Src, Mods);
+}
+
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const {
+  MachineRegisterInfo &MRI
+    = Root.getParent()->getParent()->getParent()->getRegInfo();
+
+  Register Src;
+  unsigned Mods;
+  std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI);
+
+  return {{
+      [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
+      [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
+  }};
+}
+
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectVOP3PMods0(MachineOperand &Root) const {
+  MachineRegisterInfo &MRI
+    = Root.getParent()->getParent()->getParent()->getRegInfo();
+
+  Register Src;
+  unsigned Mods;
+  std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI);
+
+  return {{
+      [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
+      [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src_mods
+      // FIXME: Handle clamp and op_sel
+      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }
+  }};
+}
+
 InstructionSelector::ComplexRendererFns
 AMDGPUInstructionSelector::selectVOP3Mods_nnan(MachineOperand &Root) const {
   Register Src;
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll
@@ -0,0 +1,542 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefix=GFX8 %s
+
+define <2 x half> @v_fmul_v2f16(<2 x half> %a, <2 x half> %b) {
+; GFX9-LABEL: v_fmul_v2f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_mul_f16 v0, v0, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fmul_v2f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mul_f16_e32 v2, v0, v1
+; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_mov_b32_e32 v1, 16
+; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+  %mul = fmul <2 x half> %a, %b
+  ret <2 x half> %mul
+}
+
+define <2 x half> @v_fmul_v2f16_fneg_lhs(<2 x half> %a, <2 x half> %b) {
+; GFX9-LABEL: v_fmul_v2f16_fneg_lhs:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_mul_f16 v0, v0, v1 neg_lo:[1,0] neg_hi:[1,0]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fmul_v2f16_fneg_lhs:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
+; GFX8-NEXT: v_mul_f16_e32 v2, v0, v1
+; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_mov_b32_e32 v1, 16
+; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+  %neg.a = fneg <2 x half> %a
+  %mul = fmul <2 x half> %neg.a, %b
+  ret <2 x half> %mul
+}
+
+define <2 x half> @v_fmul_v2f16_fneg_rhs(<2 x half> %a, 
<2 x half> %b) { +; GFX9-LABEL: v_fmul_v2f16_fneg_rhs: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_mul_f16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fmul_v2f16_fneg_rhs: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 +; GFX8-NEXT: v_mul_f16_e32 v2, v0, v1 +; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mov_b32_e32 v1, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: s_setpc_b64 s[30:31] + %neg.b = fneg <2 x half> %b + %mul = fmul <2 x half> %a, %neg.b + ret <2 x half> %mul +} + +define <2 x half> @v_fmul_v2f16_fneg_lhs_fneg_rhs(<2 x half> %a, <2 x half> %b) { +; GFX9-LABEL: v_fmul_v2f16_fneg_lhs_fneg_rhs: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_mul_f16 v0, v0, v1 neg_lo:[1,1] neg_hi:[1,1] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fmul_v2f16_fneg_lhs_fneg_rhs: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 s4, 0x80008000 +; GFX8-NEXT: v_xor_b32_e32 v0, s4, v0 +; GFX8-NEXT: v_xor_b32_e32 v1, s4, v1 +; GFX8-NEXT: v_mul_f16_e32 v2, v0, v1 +; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mov_b32_e32 v1, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: s_setpc_b64 s[30:31] + %neg.a = fneg <2 x half> %a + %neg.b = fneg <2 x half> %b + %mul = fmul <2 x half> %neg.a, %neg.b + ret <2 x half> %mul +} + +; FIXME +; define <3 x half> @v_fmul_v3f16(<3 x half> %a, <3 x half> %b) { +; %mul = fmul <3 x half> %a, %b +; ret <3 x half> %mul +; } + +; define <3 x half> @v_fmul_v3f16_fneg_lhs(<3 x half> %a, <3 x half> %b) { +; %neg.a = fneg <3 x half> %a +; %mul = fmul <3 x half> %neg.a, %b +; ret <3 x half> %mul +; } + +; define <3 x half> @v_fmul_v3f16_fneg_rhs(<3 x half> %a, <3 x half> %b) { +; %neg.b = fneg <3 x half> %b +; %mul = fmul <3 x half> %a, %neg.b +; ret <3 x half> %mul +; } + +; define <3 x half> @v_fmul_v3f16_fneg_lhs_fneg_rhs(<3 x half> %a, <3 x half> %b) { +; %neg.a = fneg <3 x half> %a +; %neg.b = fneg <3 x half> %b +; %mul = fmul <3 x half> %neg.a, %neg.b +; ret <3 x half> %mul +; } + +define <4 x half> @v_fmul_v4f16(<4 x half> %a, <4 x half> %b) { +; GFX9-LABEL: v_fmul_v4f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_mul_f16 v0, v0, v2 +; GFX9-NEXT: v_pk_mul_f16 v1, v1, v3 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fmul_v4f16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mul_f16_e32 v4, v0, v2 +; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mul_f16_e32 v2, v1, v3 +; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mov_b32_e32 v3, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: 
v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: s_setpc_b64 s[30:31] + %mul = fmul <4 x half> %a, %b + ret <4 x half> %mul +} + +define <4 x half> @v_fmul_v4f16_fneg_lhs(<4 x half> %a, <4 x half> %b) { +; GFX9-LABEL: v_fmul_v4f16_fneg_lhs: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_mul_f16 v0, v0, v2 neg_lo:[1,0] neg_hi:[1,0] +; GFX9-NEXT: v_pk_mul_f16 v1, v1, v3 neg_lo:[1,0] neg_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fmul_v4f16_fneg_lhs: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 s4, 0x80008000 +; GFX8-NEXT: v_xor_b32_e32 v0, s4, v0 +; GFX8-NEXT: v_xor_b32_e32 v1, s4, v1 +; GFX8-NEXT: v_mul_f16_e32 v4, v0, v2 +; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mul_f16_e32 v2, v1, v3 +; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mov_b32_e32 v3, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: s_setpc_b64 s[30:31] + %neg.a = fneg <4 x half> %a + %mul = fmul <4 x half> %neg.a, %b + ret <4 x half> %mul +} + +define <4 x half> @v_fmul_v4f16_fneg_rhs(<4 x half> %a, <4 x half> %b) { +; GFX9-LABEL: v_fmul_v4f16_fneg_rhs: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_mul_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1] +; GFX9-NEXT: v_pk_mul_f16 v1, v1, v3 neg_lo:[0,1] neg_hi:[0,1] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fmul_v4f16_fneg_rhs: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 s4, 0x80008000 +; GFX8-NEXT: v_xor_b32_e32 v2, s4, v2 +; GFX8-NEXT: v_xor_b32_e32 v3, s4, v3 +; GFX8-NEXT: v_mul_f16_e32 v4, v0, v2 +; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mul_f16_e32 v2, v1, v3 +; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mov_b32_e32 v3, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: s_setpc_b64 s[30:31] + %neg.b = fneg <4 x half> %b + %mul = fmul <4 x half> %a, %neg.b + ret <4 x half> %mul +} + +define <4 x half> @v_fmul_v4f16_fneg_lhs_fneg_rhs(<4 x half> %a, <4 x half> %b) { +; GFX9-LABEL: v_fmul_v4f16_fneg_lhs_fneg_rhs: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: 
v_pk_mul_f16 v0, v0, v2 neg_lo:[1,1] neg_hi:[1,1] +; GFX9-NEXT: v_pk_mul_f16 v1, v1, v3 neg_lo:[1,1] neg_hi:[1,1] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fmul_v4f16_fneg_lhs_fneg_rhs: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 s4, 0x80008000 +; GFX8-NEXT: v_xor_b32_e32 v0, s4, v0 +; GFX8-NEXT: v_xor_b32_e32 v2, s4, v2 +; GFX8-NEXT: v_xor_b32_e32 v1, s4, v1 +; GFX8-NEXT: v_xor_b32_e32 v3, s4, v3 +; GFX8-NEXT: v_mul_f16_e32 v4, v0, v2 +; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mul_f16_e32 v2, v1, v3 +; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mov_b32_e32 v3, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: s_setpc_b64 s[30:31] + %neg.a = fneg <4 x half> %a + %neg.b = fneg <4 x half> %b + %mul = fmul <4 x half> %neg.a, %neg.b + ret <4 x half> %mul +} + +define <6 x half> @v_fmul_v6f16(<6 x half> %a, <6 x half> %b) { +; GFX9-LABEL: v_fmul_v6f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_mul_f16 v0, v0, v3 +; GFX9-NEXT: v_pk_mul_f16 v1, v1, v4 +; GFX9-NEXT: v_pk_mul_f16 v2, v2, v5 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fmul_v6f16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mul_f16_e32 v6, v0, v3 +; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mul_f16_e32 v3, v1, v4 +; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mul_f16_e32 v4, v2, v5 +; GFX8-NEXT: v_mul_f16_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mov_b32_e32 v5, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v3, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: s_setpc_b64 s[30:31] + %mul = fmul <6 x half> %a, %b + ret <6 x half> %mul +} + +define <6 x half> @v_fmul_v6f16_fneg_lhs(<6 x half> %a, <6 x half> %b) { +; GFX9-LABEL: v_fmul_v6f16_fneg_lhs: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_mul_f16 v0, v0, v3 neg_lo:[1,0] neg_hi:[1,0] +; GFX9-NEXT: v_pk_mul_f16 v1, v1, v4 neg_lo:[1,0] neg_hi:[1,0] +; GFX9-NEXT: v_pk_mul_f16 v2, v2, v5 neg_lo:[1,0] neg_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fmul_v6f16_fneg_lhs: +; GFX8: ; %bb.0: +; GFX8-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 s4, 0x80008000 +; GFX8-NEXT: v_xor_b32_e32 v0, s4, v0 +; GFX8-NEXT: v_xor_b32_e32 v1, s4, v1 +; GFX8-NEXT: v_xor_b32_e32 v2, s4, v2 +; GFX8-NEXT: v_mul_f16_e32 v6, v0, v3 +; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mul_f16_e32 v3, v1, v4 +; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mul_f16_e32 v4, v2, v5 +; GFX8-NEXT: v_mul_f16_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mov_b32_e32 v5, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v3, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: s_setpc_b64 s[30:31] + %neg.a = fneg <6 x half> %a + %mul = fmul <6 x half> %neg.a, %b + ret <6 x half> %mul +} + +define <6 x half> @v_fmul_v6f16_fneg_rhs(<6 x half> %a, <6 x half> %b) { +; GFX9-LABEL: v_fmul_v6f16_fneg_rhs: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_mul_f16 v0, v0, v3 neg_lo:[0,1] neg_hi:[0,1] +; GFX9-NEXT: v_pk_mul_f16 v1, v1, v4 neg_lo:[0,1] neg_hi:[0,1] +; GFX9-NEXT: v_pk_mul_f16 v2, v2, v5 neg_lo:[0,1] neg_hi:[0,1] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fmul_v6f16_fneg_rhs: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 s4, 0x80008000 +; GFX8-NEXT: v_xor_b32_e32 v3, s4, v3 +; GFX8-NEXT: v_xor_b32_e32 v4, s4, v4 +; GFX8-NEXT: v_xor_b32_e32 v5, s4, v5 +; GFX8-NEXT: v_mul_f16_e32 v6, v0, v3 +; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mul_f16_e32 v3, v1, v4 +; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mul_f16_e32 v4, v2, v5 +; GFX8-NEXT: v_mul_f16_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mov_b32_e32 v5, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v3, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: s_setpc_b64 s[30:31] + %neg.b = fneg <6 x half> %b + %mul = fmul <6 x half> %a, %neg.b + ret <6 x half> %mul +} + +define <6 x half> @v_fmul_v6f16_fneg_lhs_fneg_rhs(<6 x half> %a, <6 x half> %b) { +; GFX9-LABEL: 
v_fmul_v6f16_fneg_lhs_fneg_rhs: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_mul_f16 v0, v0, v3 neg_lo:[1,1] neg_hi:[1,1] +; GFX9-NEXT: v_pk_mul_f16 v1, v1, v4 neg_lo:[1,1] neg_hi:[1,1] +; GFX9-NEXT: v_pk_mul_f16 v2, v2, v5 neg_lo:[1,1] neg_hi:[1,1] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fmul_v6f16_fneg_lhs_fneg_rhs: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 s4, 0x80008000 +; GFX8-NEXT: v_xor_b32_e32 v0, s4, v0 +; GFX8-NEXT: v_xor_b32_e32 v3, s4, v3 +; GFX8-NEXT: v_xor_b32_e32 v1, s4, v1 +; GFX8-NEXT: v_xor_b32_e32 v2, s4, v2 +; GFX8-NEXT: v_xor_b32_e32 v4, s4, v4 +; GFX8-NEXT: v_xor_b32_e32 v5, s4, v5 +; GFX8-NEXT: v_mul_f16_e32 v6, v0, v3 +; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mul_f16_e32 v3, v1, v4 +; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mul_f16_e32 v4, v2, v5 +; GFX8-NEXT: v_mul_f16_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mov_b32_e32 v5, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v3, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: s_setpc_b64 s[30:31] + %neg.a = fneg <6 x half> %a + %neg.b = fneg <6 x half> %b + %mul = fmul <6 x half> %neg.a, %neg.b + ret <6 x half> %mul +} + +define <8 x half> @v_fmul_v8f16(<8 x half> %a, <8 x half> %b) { +; GFX9-LABEL: v_fmul_v8f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_mul_f16 v0, v0, v4 +; GFX9-NEXT: v_pk_mul_f16 v1, v1, v5 +; GFX9-NEXT: v_pk_mul_f16 v2, v2, v6 +; GFX9-NEXT: v_pk_mul_f16 v3, v3, v7 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fmul_v8f16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mul_f16_e32 v8, v0, v4 +; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mul_f16_e32 v4, v1, v5 +; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mul_f16_e32 v5, v2, v6 +; GFX8-NEXT: v_mul_f16_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mul_f16_e32 v6, v3, v7 +; GFX8-NEXT: v_mul_f16_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mov_b32_e32 v7, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_mov_b32_e32 v7, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v7, v3 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v0, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: s_setpc_b64 s[30:31] + %mul = fmul <8 x half> %a, %b + ret <8 x half> %mul +} + +define <8 x half> @v_fmul_v8f16_fneg_lhs(<8 x half> %a, <8 x half> %b) { +; GFX9-LABEL: v_fmul_v8f16_fneg_lhs: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_mul_f16 v0, v0, v4 neg_lo:[1,0] neg_hi:[1,0] +; GFX9-NEXT: v_pk_mul_f16 v1, v1, v5 neg_lo:[1,0] neg_hi:[1,0] +; GFX9-NEXT: v_pk_mul_f16 v2, v2, v6 neg_lo:[1,0] neg_hi:[1,0] +; GFX9-NEXT: v_pk_mul_f16 v3, v3, v7 neg_lo:[1,0] neg_hi:[1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fmul_v8f16_fneg_lhs: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 s4, 0x80008000 +; GFX8-NEXT: v_xor_b32_e32 v0, s4, v0 +; GFX8-NEXT: v_xor_b32_e32 v1, s4, v1 +; GFX8-NEXT: v_xor_b32_e32 v2, s4, v2 +; GFX8-NEXT: v_xor_b32_e32 v3, s4, v3 +; GFX8-NEXT: v_mul_f16_e32 v8, v0, v4 +; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mul_f16_e32 v4, v1, v5 +; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mul_f16_e32 v5, v2, v6 +; GFX8-NEXT: v_mul_f16_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mul_f16_e32 v6, v3, v7 +; GFX8-NEXT: v_mul_f16_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mov_b32_e32 v7, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_mov_b32_e32 v7, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v0, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: s_setpc_b64 s[30:31] + %neg.a = fneg <8 x half> %a + %mul = fmul <8 x half> %neg.a, %b + ret <8 x half> %mul +} + +define <8 x half> @v_fmul_v8f16_fneg_rhs(<8 x half> %a, <8 x half> %b) { +; GFX9-LABEL: v_fmul_v8f16_fneg_rhs: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_mul_f16 v0, v0, v4 neg_lo:[0,1] neg_hi:[0,1] +; GFX9-NEXT: v_pk_mul_f16 v1, v1, v5 neg_lo:[0,1] neg_hi:[0,1] +; GFX9-NEXT: v_pk_mul_f16 v2, v2, v6 neg_lo:[0,1] neg_hi:[0,1] +; GFX9-NEXT: v_pk_mul_f16 v3, v3, v7 neg_lo:[0,1] neg_hi:[0,1] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fmul_v8f16_fneg_rhs: +; GFX8: ; 
%bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 s4, 0x80008000 +; GFX8-NEXT: v_xor_b32_e32 v4, s4, v4 +; GFX8-NEXT: v_xor_b32_e32 v5, s4, v5 +; GFX8-NEXT: v_xor_b32_e32 v6, s4, v6 +; GFX8-NEXT: v_xor_b32_e32 v7, s4, v7 +; GFX8-NEXT: v_mul_f16_e32 v8, v0, v4 +; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mul_f16_e32 v4, v1, v5 +; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mul_f16_e32 v5, v2, v6 +; GFX8-NEXT: v_mul_f16_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mul_f16_e32 v6, v3, v7 +; GFX8-NEXT: v_mul_f16_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mov_b32_e32 v7, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_mov_b32_e32 v7, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v0, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: s_setpc_b64 s[30:31] + %neg.b = fneg <8 x half> %b + %mul = fmul <8 x half> %a, %neg.b + ret <8 x half> %mul +} + +define <8 x half> @v_fmul_v8f16_fneg_lhs_fneg_rhs(<8 x half> %a, <8 x half> %b) { +; GFX9-LABEL: v_fmul_v8f16_fneg_lhs_fneg_rhs: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_mul_f16 v0, v0, v4 neg_lo:[1,1] neg_hi:[1,1] +; GFX9-NEXT: v_pk_mul_f16 v1, v1, v5 neg_lo:[1,1] neg_hi:[1,1] +; GFX9-NEXT: v_pk_mul_f16 v2, v2, v6 neg_lo:[1,1] neg_hi:[1,1] +; GFX9-NEXT: v_pk_mul_f16 v3, v3, v7 neg_lo:[1,1] neg_hi:[1,1] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fmul_v8f16_fneg_lhs_fneg_rhs: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 s4, 0x80008000 +; GFX8-NEXT: v_xor_b32_e32 v0, s4, v0 +; GFX8-NEXT: v_xor_b32_e32 v4, s4, v4 +; GFX8-NEXT: v_xor_b32_e32 v1, s4, v1 +; GFX8-NEXT: v_xor_b32_e32 v2, s4, v2 +; GFX8-NEXT: v_xor_b32_e32 v3, s4, v3 +; GFX8-NEXT: v_xor_b32_e32 v5, s4, v5 +; GFX8-NEXT: v_xor_b32_e32 v6, s4, v6 +; GFX8-NEXT: v_xor_b32_e32 v7, s4, v7 +; GFX8-NEXT: v_mul_f16_e32 v8, v0, v4 +; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mul_f16_e32 v4, v1, v5 +; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mul_f16_e32 v5, v2, v6 +; GFX8-NEXT: v_mul_f16_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mul_f16_e32 v6, v3, v7 +; GFX8-NEXT: v_mul_f16_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mov_b32_e32 v7, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v7, v0 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_mov_b32_e32 v7, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v0, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: s_setpc_b64 s[30:31] + %neg.a = fneg <8 x half> %a + %neg.b = fneg <8 x half> %b + %mul = fmul <8 x half> %neg.a, %neg.b + ret <8 x half> %mul +} Index: llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ashr.v2s16.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ashr.v2s16.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ashr.v2s16.mir @@ -7,9 +7,6 @@ # ERR-NOT: remark # ERR-GFX910: remark: :0:0: cannot select: %2:sgpr(<2 x s16>) = G_ASHR %0:sgpr, %1:sgpr(<2 x s16>) (in function: ashr_v2s16_ss) -# ERR-GFX910-NEXT: remark: :0:0: cannot select: %2:vgpr(<2 x s16>) = G_ASHR %0:sgpr, %1:vgpr(<2 x s16>) (in function: ashr_v2s16_sv) -# ERR-GFX910-NEXT: remark: :0:0: cannot select: %2:vgpr(<2 x s16>) = G_ASHR %0:vgpr, %1:sgpr(<2 x s16>) (in function: ashr_v2s16_vs) -# ERR-GFX910-NEXT: remark: :0:0: cannot select: %2:vgpr(<2 x s16>) = G_ASHR %0:vgpr, %1:vgpr(<2 x s16>) (in function: ashr_v2s16_vv) # ERR-NOT: remark --- @@ -75,15 +72,16 @@ ; GFX8: [[ASHR:%[0-9]+]]:vgpr(<2 x s16>) = G_ASHR [[COPY]], [[COPY1]](<2 x s16>) ; GFX8: S_ENDPGM 0, implicit [[ASHR]](<2 x s16>) ; GFX9-LABEL: name: ashr_v2s16_sv - ; GFX9: [[COPY:%[0-9]+]]:sgpr(<2 x s16>) = COPY $sgpr0 - ; GFX9: [[COPY1:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0 - ; GFX9: [[ASHR:%[0-9]+]]:vgpr(<2 x s16>) = G_ASHR [[COPY]], [[COPY1]](<2 x s16>) - ; GFX9: S_ENDPGM 0, implicit [[ASHR]](<2 x s16>) + ; GFX9: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9: [[V_PK_ASHRREV_I16_:%[0-9]+]]:vgpr_32 = V_PK_ASHRREV_I16 8, [[COPY1]], 8, [[COPY]], 0, 0, 0, 0, 0, implicit $exec + ; GFX9: S_ENDPGM 0, implicit [[V_PK_ASHRREV_I16_]] ; GFX10-LABEL: name: ashr_v2s16_sv - ; GFX10: [[COPY:%[0-9]+]]:sgpr(<2 x s16>) = COPY $sgpr0 - ; GFX10: [[COPY1:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0 - ; GFX10: [[ASHR:%[0-9]+]]:vgpr(<2 x s16>) = G_ASHR [[COPY]], [[COPY1]](<2 x s16>) - ; GFX10: S_ENDPGM 0, implicit [[ASHR]](<2 x s16>) + ; GFX10: $vcc_hi = IMPLICIT_DEF + ; GFX10: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX10: [[V_PK_ASHRREV_I16_:%[0-9]+]]:vgpr_32 = V_PK_ASHRREV_I16 8, [[COPY1]], 8, [[COPY]], 0, 0, 0, 0, 0, implicit $exec + ; GFX10: S_ENDPGM 0, implicit [[V_PK_ASHRREV_I16_]] %0:sgpr(<2 x s16>) = COPY $sgpr0 %1:vgpr(<2 x s16>) = COPY $vgpr0 %2:vgpr(<2 x s16>) = G_ASHR %0, %1 @@ -114,15 +112,16 @@ ; GFX8: [[ASHR:%[0-9]+]]:vgpr(<2 x s16>) = G_ASHR [[COPY]], [[COPY1]](<2 x s16>) ; GFX8: S_ENDPGM 0, implicit [[ASHR]](<2 x s16>) ; GFX9-LABEL: name: ashr_v2s16_vs - ; GFX9: [[COPY:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0 - ; GFX9: 
[[COPY1:%[0-9]+]]:sgpr(<2 x s16>) = COPY $sgpr0 - ; GFX9: [[ASHR:%[0-9]+]]:vgpr(<2 x s16>) = G_ASHR [[COPY]], [[COPY1]](<2 x s16>) - ; GFX9: S_ENDPGM 0, implicit [[ASHR]](<2 x s16>) + ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX9: [[V_PK_ASHRREV_I16_:%[0-9]+]]:vgpr_32 = V_PK_ASHRREV_I16 8, [[COPY1]], 8, [[COPY]], 0, 0, 0, 0, 0, implicit $exec + ; GFX9: S_ENDPGM 0, implicit [[V_PK_ASHRREV_I16_]] ; GFX10-LABEL: name: ashr_v2s16_vs - ; GFX10: [[COPY:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0 - ; GFX10: [[COPY1:%[0-9]+]]:sgpr(<2 x s16>) = COPY $sgpr0 - ; GFX10: [[ASHR:%[0-9]+]]:vgpr(<2 x s16>) = G_ASHR [[COPY]], [[COPY1]](<2 x s16>) - ; GFX10: S_ENDPGM 0, implicit [[ASHR]](<2 x s16>) + ; GFX10: $vcc_hi = IMPLICIT_DEF + ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX10: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX10: [[V_PK_ASHRREV_I16_:%[0-9]+]]:vgpr_32 = V_PK_ASHRREV_I16 8, [[COPY1]], 8, [[COPY]], 0, 0, 0, 0, 0, implicit $exec + ; GFX10: S_ENDPGM 0, implicit [[V_PK_ASHRREV_I16_]] %0:vgpr(<2 x s16>) = COPY $vgpr0 %1:sgpr(<2 x s16>) = COPY $sgpr0 %2:vgpr(<2 x s16>) = G_ASHR %0, %1 @@ -153,15 +152,16 @@ ; GFX8: [[ASHR:%[0-9]+]]:vgpr(<2 x s16>) = G_ASHR [[COPY]], [[COPY1]](<2 x s16>) ; GFX8: S_ENDPGM 0, implicit [[ASHR]](<2 x s16>) ; GFX9-LABEL: name: ashr_v2s16_vv - ; GFX9: [[COPY:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0 - ; GFX9: [[COPY1:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr1 - ; GFX9: [[ASHR:%[0-9]+]]:vgpr(<2 x s16>) = G_ASHR [[COPY]], [[COPY1]](<2 x s16>) - ; GFX9: S_ENDPGM 0, implicit [[ASHR]](<2 x s16>) + ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX9: [[V_PK_ASHRREV_I16_:%[0-9]+]]:vgpr_32 = V_PK_ASHRREV_I16 8, [[COPY1]], 8, [[COPY]], 0, 0, 0, 0, 0, implicit $exec + ; GFX9: S_ENDPGM 0, implicit [[V_PK_ASHRREV_I16_]] ; GFX10-LABEL: name: ashr_v2s16_vv - ; GFX10: [[COPY:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0 - ; GFX10: [[COPY1:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr1 - ; GFX10: [[ASHR:%[0-9]+]]:vgpr(<2 x s16>) = G_ASHR [[COPY]], [[COPY1]](<2 x s16>) - ; GFX10: S_ENDPGM 0, implicit [[ASHR]](<2 x s16>) + ; GFX10: $vcc_hi = IMPLICIT_DEF + ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX10: [[V_PK_ASHRREV_I16_:%[0-9]+]]:vgpr_32 = V_PK_ASHRREV_I16 8, [[COPY1]], 8, [[COPY]], 0, 0, 0, 0, 0, implicit $exec + ; GFX10: S_ENDPGM 0, implicit [[V_PK_ASHRREV_I16_]] %0:vgpr(<2 x s16>) = COPY $vgpr0 %1:vgpr(<2 x s16>) = COPY $vgpr1 %2:vgpr(<2 x s16>) = G_ASHR %0, %1 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fcanonicalize.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fcanonicalize.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fcanonicalize.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -global-isel-abort=2 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s +# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s --- @@ -108,9 +108,9 @@ liveins: $vgpr0 ; GFX9-LABEL: name: fcanonicalize_v2f16_denorm - ; GFX9: [[COPY:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0 - ; GFX9: [[FCANONICALIZE:%[0-9]+]]:vgpr(<2 x s16>) = G_FCANONICALIZE [[COPY]] - ; GFX9: S_ENDPGM 0, implicit [[FCANONICALIZE]](<2 x s16>) + ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = 
COPY $vgpr0 + ; GFX9: [[V_PK_MAX_F16_:%[0-9]+]]:vgpr_32 = V_PK_MAX_F16 8, [[COPY]], 8, [[COPY]], 0, 0, 0, 0, 0, implicit $exec + ; GFX9: S_ENDPGM 0, implicit [[V_PK_MAX_F16_]] %0:vgpr(<2 x s16>) = COPY $vgpr0 %1:vgpr(<2 x s16>) = G_FCANONICALIZE %0 S_ENDPGM 0, implicit %1 @@ -131,9 +131,9 @@ liveins: $vgpr0 ; GFX9-LABEL: name: fcanonicalize_v2f16_flush - ; GFX9: [[COPY:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0 - ; GFX9: [[FCANONICALIZE:%[0-9]+]]:vgpr(<2 x s16>) = G_FCANONICALIZE [[COPY]] - ; GFX9: S_ENDPGM 0, implicit [[FCANONICALIZE]](<2 x s16>) + ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9: [[V_PK_MUL_F16_:%[0-9]+]]:vgpr_32 = V_PK_MUL_F16 0, 15360, 8, [[COPY]], 0, 0, 0, 0, 0, implicit $exec + ; GFX9: S_ENDPGM 0, implicit [[V_PK_MUL_F16_]] %0:vgpr(<2 x s16>) = COPY $vgpr0 %1:vgpr(<2 x s16>) = G_FCANONICALIZE %0 S_ENDPGM 0, implicit %1 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmaxnum-ieee.v2s16.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmaxnum-ieee.v2s16.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmaxnum-ieee.v2s16.mir @@ -1,4 +1,5 @@ -# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -global-isel-abort=0 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s --- name: fmaxnum_ieee_v2f16_vv @@ -10,10 +11,10 @@ liveins: $sgpr0, $sgpr1 ; GFX9-LABEL: name: fmaxnum_ieee_v2f16_vv - ; GFX9: [[COPY:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0 - ; GFX9: [[COPY1:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr1 - ; GFX9: [[FMAXNUM_IEEE:%[0-9]+]]:vgpr(<2 x s16>) = G_FMAXNUM_IEEE [[COPY]], [[COPY1]] - ; GFX9: S_ENDPGM 0, implicit [[FMAXNUM_IEEE]](<2 x s16>) + ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX9: [[V_PK_MAX_F16_:%[0-9]+]]:vgpr_32 = V_PK_MAX_F16 8, [[COPY]], 8, [[COPY1]], 0, 0, 0, 0, 0, implicit $exec + ; GFX9: S_ENDPGM 0, implicit [[V_PK_MAX_F16_]] %0:vgpr(<2 x s16>) = COPY $vgpr0 %1:vgpr(<2 x s16>) = COPY $vgpr1 %2:vgpr(<2 x s16>) = G_FMAXNUM_IEEE %0, %1 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmaxnum.v2s16.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmaxnum.v2s16.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmaxnum.v2s16.mir @@ -1,4 +1,5 @@ -# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -global-isel-abort=0 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s # FIXME: Ideally this would fail to select with ieee mode enabled. 
--- @@ -11,10 +12,10 @@ liveins: $sgpr0, $sgpr1 ; GFX9-LABEL: name: fmaxnum_v2f16_vv - ; GFX9: [[COPY:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0 - ; GFX9: [[COPY1:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr1 - ; GFX9: [[FMAXNUM:%[0-9]+]]:vgpr(<2 x s16>) = G_FMAXNUM [[COPY]], [[COPY1]] - ; GFX9: S_ENDPGM 0, implicit [[FMAXNUM]](<2 x s16>) + ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX9: [[V_PK_MAX_F16_:%[0-9]+]]:vgpr_32 = V_PK_MAX_F16 8, [[COPY]], 8, [[COPY1]], 0, 0, 0, 0, 0, implicit $exec + ; GFX9: S_ENDPGM 0, implicit [[V_PK_MAX_F16_]] %0:vgpr(<2 x s16>) = COPY $vgpr0 %1:vgpr(<2 x s16>) = COPY $vgpr1 %2:vgpr(<2 x s16>) = G_FMAXNUM %0, %1 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fminnum-ieee.v2s16.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fminnum-ieee.v2s16.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fminnum-ieee.v2s16.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s +# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s --- name: fminnum_ieee_v2f16_vv @@ -11,10 +11,10 @@ liveins: $sgpr0, $sgpr1 ; GFX9-LABEL: name: fminnum_ieee_v2f16_vv - ; GFX9: [[COPY:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0 - ; GFX9: [[COPY1:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr1 - ; GFX9: [[FMINNUM_IEEE:%[0-9]+]]:vgpr(<2 x s16>) = G_FMINNUM_IEEE [[COPY]], [[COPY1]] - ; GFX9: S_ENDPGM 0, implicit [[FMINNUM_IEEE]](<2 x s16>) + ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX9: [[V_PK_MIN_F16_:%[0-9]+]]:vgpr_32 = V_PK_MIN_F16 8, [[COPY]], 8, [[COPY1]], 0, 0, 0, 0, 0, implicit $exec + ; GFX9: S_ENDPGM 0, implicit [[V_PK_MIN_F16_]] %0:vgpr(<2 x s16>) = COPY $vgpr0 %1:vgpr(<2 x s16>) = COPY $vgpr1 %2:vgpr(<2 x s16>) = G_FMINNUM_IEEE %0, %1 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fminnum.v2s16.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fminnum.v2s16.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fminnum.v2s16.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -global-isel-abort=0 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s +# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s --- name: fminnum_v2f16_vv @@ -11,10 +11,10 @@ liveins: $sgpr0, $sgpr1 ; GFX9-LABEL: name: fminnum_v2f16_vv - ; GFX9: [[COPY:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0 - ; GFX9: [[COPY1:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr1 - ; GFX9: [[FMINNUM:%[0-9]+]]:vgpr(<2 x s16>) = G_FMINNUM [[COPY]], [[COPY1]] - ; GFX9: S_ENDPGM 0, implicit [[FMINNUM]](<2 x s16>) + ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX9: [[V_PK_MIN_F16_:%[0-9]+]]:vgpr_32 = V_PK_MIN_F16 8, [[COPY]], 8, [[COPY1]], 0, 0, 0, 0, 0, implicit $exec + ; GFX9: S_ENDPGM 0, implicit [[V_PK_MIN_F16_]] %0:vgpr(<2 x s16>) = COPY $vgpr0 %1:vgpr(<2 x s16>) = COPY $vgpr1 %2:vgpr(<2 x s16>) = G_FMINNUM %0, %1 Index: 
llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmul.v2s16.mir =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmul.v2s16.mir @@ -0,0 +1,74 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -global-isel-abort=0 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s + +--- +name: fmul_v2f16_vv +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + ; GFX9-LABEL: name: fmul_v2f16_vv + ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX9: [[V_PK_MUL_F16_:%[0-9]+]]:vgpr_32 = V_PK_MUL_F16 8, [[COPY]], 8, [[COPY1]], 0, 0, 0, 0, 0, implicit $exec + ; GFX9: S_ENDPGM 0, implicit [[V_PK_MUL_F16_]] + %0:vgpr(<2 x s16>) = COPY $vgpr0 + %1:vgpr(<2 x s16>) = COPY $vgpr1 + %2:vgpr(<2 x s16>) = G_FMUL %0, %1 + S_ENDPGM 0, implicit %2 +... + +--- +name: fmul_v2f16_fneg_v_fneg_v +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + ; GFX9-LABEL: name: fmul_v2f16_fneg_v_fneg_v + ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX9: [[V_PK_MUL_F16_:%[0-9]+]]:vgpr_32 = V_PK_MUL_F16 11, [[COPY]], 11, [[COPY1]], 0, 0, 0, 0, 0, implicit $exec + ; GFX9: S_ENDPGM 0, implicit [[V_PK_MUL_F16_]] + %0:vgpr(<2 x s16>) = COPY $vgpr0 + %1:vgpr(<2 x s16>) = COPY $vgpr1 + %2:vgpr(<2 x s16>) = G_FNEG %0 + %3:vgpr(<2 x s16>) = G_FNEG %1 + %4:vgpr(<2 x s16>) = G_FMUL %2, %3 + S_ENDPGM 0, implicit %4 +... + +--- +name: fmul_v2f16_fneg_lo_v_v +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + + ; GFX9-LABEL: name: fmul_v2f16_fneg_lo_v_v + ; GFX9: [[COPY:%[0-9]+]]:vgpr_32(<2 x s16>) = COPY $vgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; GFX9: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 + ; GFX9: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY1]](s32) + ; GFX9: [[FNEG:%[0-9]+]]:vgpr(s16) = G_FNEG [[TRUNC]] + ; GFX9: [[ANYEXT:%[0-9]+]]:vgpr(s32) = G_ANYEXT [[FNEG]](s16) + ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:vgpr_32(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT]](s32), [[COPY2]](s32) + ; GFX9: [[V_PK_MUL_F16_:%[0-9]+]]:vgpr_32(<2 x s16>) = V_PK_MUL_F16 8, [[BUILD_VECTOR_TRUNC]](<2 x s16>), 8, [[COPY]](<2 x s16>), 0, 0, 0, 0, 0, implicit $exec + ; GFX9: S_ENDPGM 0, implicit [[V_PK_MUL_F16_]](<2 x s16>) + %0:vgpr(<2 x s16>) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s32) = COPY $vgpr2 + %3:vgpr(s16) = G_TRUNC %1 + %4:vgpr(s16) = G_FNEG %3 + %5:vgpr(s32) = G_ANYEXT %4 + %6:vgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %5, %2 + %7:vgpr(<2 x s16>) = G_FMUL %6, %0 + S_ENDPGM 0, implicit %7 +... 
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-lshr.v2s16.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-lshr.v2s16.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-lshr.v2s16.mir @@ -7,9 +7,6 @@ # ERR-NOT: remark # ERR-GFX910: remark: :0:0: cannot select: %2:sgpr(<2 x s16>) = G_LSHR %0:sgpr, %1:sgpr(<2 x s16>) (in function: lshr_v2s16_ss) -# ERR-GFX910-NEXT: remark: :0:0: cannot select: %2:vgpr(<2 x s16>) = G_LSHR %0:sgpr, %1:vgpr(<2 x s16>) (in function: lshr_v2s16_sv) -# ERR-GFX910-NEXT: remark: :0:0: cannot select: %2:vgpr(<2 x s16>) = G_LSHR %0:vgpr, %1:sgpr(<2 x s16>) (in function: lshr_v2s16_vs) -# ERR-GFX910-NEXT: remark: :0:0: cannot select: %2:vgpr(<2 x s16>) = G_LSHR %0:vgpr, %1:vgpr(<2 x s16>) (in function: lshr_v2s16_vv) # ERR-NOT: remark --- @@ -75,15 +72,16 @@ ; GFX8: [[LSHR:%[0-9]+]]:vgpr(<2 x s16>) = G_LSHR [[COPY]], [[COPY1]](<2 x s16>) ; GFX8: S_ENDPGM 0, implicit [[LSHR]](<2 x s16>) ; GFX9-LABEL: name: lshr_v2s16_sv - ; GFX9: [[COPY:%[0-9]+]]:sgpr(<2 x s16>) = COPY $sgpr0 - ; GFX9: [[COPY1:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0 - ; GFX9: [[LSHR:%[0-9]+]]:vgpr(<2 x s16>) = G_LSHR [[COPY]], [[COPY1]](<2 x s16>) - ; GFX9: S_ENDPGM 0, implicit [[LSHR]](<2 x s16>) + ; GFX9: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9: [[V_PK_LSHRREV_B16_:%[0-9]+]]:vgpr_32 = V_PK_LSHRREV_B16 8, [[COPY1]], 8, [[COPY]], 0, 0, 0, 0, 0, implicit $exec + ; GFX9: S_ENDPGM 0, implicit [[V_PK_LSHRREV_B16_]] ; GFX10-LABEL: name: lshr_v2s16_sv - ; GFX10: [[COPY:%[0-9]+]]:sgpr(<2 x s16>) = COPY $sgpr0 - ; GFX10: [[COPY1:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0 - ; GFX10: [[LSHR:%[0-9]+]]:vgpr(<2 x s16>) = G_LSHR [[COPY]], [[COPY1]](<2 x s16>) - ; GFX10: S_ENDPGM 0, implicit [[LSHR]](<2 x s16>) + ; GFX10: $vcc_hi = IMPLICIT_DEF + ; GFX10: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX10: [[V_PK_LSHRREV_B16_:%[0-9]+]]:vgpr_32 = V_PK_LSHRREV_B16 8, [[COPY1]], 8, [[COPY]], 0, 0, 0, 0, 0, implicit $exec + ; GFX10: S_ENDPGM 0, implicit [[V_PK_LSHRREV_B16_]] %0:sgpr(<2 x s16>) = COPY $sgpr0 %1:vgpr(<2 x s16>) = COPY $vgpr0 %2:vgpr(<2 x s16>) = G_LSHR %0, %1 @@ -114,15 +112,16 @@ ; GFX8: [[LSHR:%[0-9]+]]:vgpr(<2 x s16>) = G_LSHR [[COPY]], [[COPY1]](<2 x s16>) ; GFX8: S_ENDPGM 0, implicit [[LSHR]](<2 x s16>) ; GFX9-LABEL: name: lshr_v2s16_vs - ; GFX9: [[COPY:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0 - ; GFX9: [[COPY1:%[0-9]+]]:sgpr(<2 x s16>) = COPY $sgpr0 - ; GFX9: [[LSHR:%[0-9]+]]:vgpr(<2 x s16>) = G_LSHR [[COPY]], [[COPY1]](<2 x s16>) - ; GFX9: S_ENDPGM 0, implicit [[LSHR]](<2 x s16>) + ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX9: [[V_PK_LSHRREV_B16_:%[0-9]+]]:vgpr_32 = V_PK_LSHRREV_B16 8, [[COPY1]], 8, [[COPY]], 0, 0, 0, 0, 0, implicit $exec + ; GFX9: S_ENDPGM 0, implicit [[V_PK_LSHRREV_B16_]] ; GFX10-LABEL: name: lshr_v2s16_vs - ; GFX10: [[COPY:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0 - ; GFX10: [[COPY1:%[0-9]+]]:sgpr(<2 x s16>) = COPY $sgpr0 - ; GFX10: [[LSHR:%[0-9]+]]:vgpr(<2 x s16>) = G_LSHR [[COPY]], [[COPY1]](<2 x s16>) - ; GFX10: S_ENDPGM 0, implicit [[LSHR]](<2 x s16>) + ; GFX10: $vcc_hi = IMPLICIT_DEF + ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX10: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX10: [[V_PK_LSHRREV_B16_:%[0-9]+]]:vgpr_32 = V_PK_LSHRREV_B16 8, [[COPY1]], 8, [[COPY]], 0, 0, 0, 0, 0, implicit $exec + ; GFX10: S_ENDPGM 0, 
implicit [[V_PK_LSHRREV_B16_]] %0:vgpr(<2 x s16>) = COPY $vgpr0 %1:sgpr(<2 x s16>) = COPY $sgpr0 %2:vgpr(<2 x s16>) = G_LSHR %0, %1 @@ -153,15 +152,16 @@ ; GFX8: [[LSHR:%[0-9]+]]:vgpr(<2 x s16>) = G_LSHR [[COPY]], [[COPY1]](<2 x s16>) ; GFX8: S_ENDPGM 0, implicit [[LSHR]](<2 x s16>) ; GFX9-LABEL: name: lshr_v2s16_vv - ; GFX9: [[COPY:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0 - ; GFX9: [[COPY1:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr1 - ; GFX9: [[LSHR:%[0-9]+]]:vgpr(<2 x s16>) = G_LSHR [[COPY]], [[COPY1]](<2 x s16>) - ; GFX9: S_ENDPGM 0, implicit [[LSHR]](<2 x s16>) + ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX9: [[V_PK_LSHRREV_B16_:%[0-9]+]]:vgpr_32 = V_PK_LSHRREV_B16 8, [[COPY1]], 8, [[COPY]], 0, 0, 0, 0, 0, implicit $exec + ; GFX9: S_ENDPGM 0, implicit [[V_PK_LSHRREV_B16_]] ; GFX10-LABEL: name: lshr_v2s16_vv - ; GFX10: [[COPY:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0 - ; GFX10: [[COPY1:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr1 - ; GFX10: [[LSHR:%[0-9]+]]:vgpr(<2 x s16>) = G_LSHR [[COPY]], [[COPY1]](<2 x s16>) - ; GFX10: S_ENDPGM 0, implicit [[LSHR]](<2 x s16>) + ; GFX10: $vcc_hi = IMPLICIT_DEF + ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX10: [[V_PK_LSHRREV_B16_:%[0-9]+]]:vgpr_32 = V_PK_LSHRREV_B16 8, [[COPY1]], 8, [[COPY]], 0, 0, 0, 0, 0, implicit $exec + ; GFX10: S_ENDPGM 0, implicit [[V_PK_LSHRREV_B16_]] %0:vgpr(<2 x s16>) = COPY $vgpr0 %1:vgpr(<2 x s16>) = COPY $vgpr1 %2:vgpr(<2 x s16>) = G_LSHR %0, %1 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-shl.v2s16.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-shl.v2s16.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-shl.v2s16.mir @@ -1,3 +1,4 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -global-isel-abort=2 -pass-remarks-missed='gisel*' -verify-machineinstrs %s -o - 2>%t | FileCheck -check-prefix=GFX9 %s # RUN: FileCheck -check-prefixes=ERR-GFX910,ERR %s < %t @@ -6,9 +7,6 @@ # ERR-NOT: remark # ERR-GFX910: remark: :0:0: cannot select: %2:sgpr(<2 x s16>) = G_SHL %0:sgpr, %1:sgpr(<2 x s16>) (in function: shl_v2s16_ss) -# ERR-GFX910-NEXT: remark: :0:0: cannot select: %2:vgpr(<2 x s16>) = G_SHL %0:sgpr, %1:vgpr(<2 x s16>) (in function: shl_v2s16_sv) -# ERR-GFX910-NEXT: remark: :0:0: cannot select: %2:vgpr(<2 x s16>) = G_SHL %0:vgpr, %1:sgpr(<2 x s16>) (in function: shl_v2s16_vs) -# ERR-GFX910-NEXT: remark: :0:0: cannot select: %2:vgpr(<2 x s16>) = G_SHL %0:vgpr, %1:vgpr(<2 x s16>) (in function: shl_v2s16_vv) # ERR-NOT: remark --- @@ -74,15 +72,16 @@ ; GFX8: [[SHL:%[0-9]+]]:vgpr(<2 x s16>) = G_SHL [[COPY]], [[COPY1]](<2 x s16>) ; GFX8: S_ENDPGM 0, implicit [[SHL]](<2 x s16>) ; GFX9-LABEL: name: shl_v2s16_sv - ; GFX9: [[COPY:%[0-9]+]]:sgpr(<2 x s16>) = COPY $sgpr0 - ; GFX9: [[COPY1:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0 - ; GFX9: [[SHL:%[0-9]+]]:vgpr(<2 x s16>) = G_SHL [[COPY]], [[COPY1]](<2 x s16>) - ; GFX9: S_ENDPGM 0, implicit [[SHL]](<2 x s16>) + ; GFX9: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9: [[V_PK_LSHLREV_B16_:%[0-9]+]]:vgpr_32 = V_PK_LSHLREV_B16 8, [[COPY1]], 8, [[COPY]], 0, 0, 0, 0, 0, implicit $exec + ; GFX9: S_ENDPGM 0, implicit [[V_PK_LSHLREV_B16_]] ; GFX10-LABEL: name: shl_v2s16_sv - ; GFX10: [[COPY:%[0-9]+]]:sgpr(<2 x s16>) = COPY $sgpr0 - ; GFX10: 
[[COPY1:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0 - ; GFX10: [[SHL:%[0-9]+]]:vgpr(<2 x s16>) = G_SHL [[COPY]], [[COPY1]](<2 x s16>) - ; GFX10: S_ENDPGM 0, implicit [[SHL]](<2 x s16>) + ; GFX10: $vcc_hi = IMPLICIT_DEF + ; GFX10: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX10: [[V_PK_LSHLREV_B16_:%[0-9]+]]:vgpr_32 = V_PK_LSHLREV_B16 8, [[COPY1]], 8, [[COPY]], 0, 0, 0, 0, 0, implicit $exec + ; GFX10: S_ENDPGM 0, implicit [[V_PK_LSHLREV_B16_]] %0:sgpr(<2 x s16>) = COPY $sgpr0 %1:vgpr(<2 x s16>) = COPY $vgpr0 %2:vgpr(<2 x s16>) = G_SHL %0, %1 @@ -113,15 +112,16 @@ ; GFX8: [[SHL:%[0-9]+]]:vgpr(<2 x s16>) = G_SHL [[COPY]], [[COPY1]](<2 x s16>) ; GFX8: S_ENDPGM 0, implicit [[SHL]](<2 x s16>) ; GFX9-LABEL: name: shl_v2s16_vs - ; GFX9: [[COPY:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0 - ; GFX9: [[COPY1:%[0-9]+]]:sgpr(<2 x s16>) = COPY $sgpr0 - ; GFX9: [[SHL:%[0-9]+]]:vgpr(<2 x s16>) = G_SHL [[COPY]], [[COPY1]](<2 x s16>) - ; GFX9: S_ENDPGM 0, implicit [[SHL]](<2 x s16>) + ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX9: [[V_PK_LSHLREV_B16_:%[0-9]+]]:vgpr_32 = V_PK_LSHLREV_B16 8, [[COPY1]], 8, [[COPY]], 0, 0, 0, 0, 0, implicit $exec + ; GFX9: S_ENDPGM 0, implicit [[V_PK_LSHLREV_B16_]] ; GFX10-LABEL: name: shl_v2s16_vs - ; GFX10: [[COPY:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0 - ; GFX10: [[COPY1:%[0-9]+]]:sgpr(<2 x s16>) = COPY $sgpr0 - ; GFX10: [[SHL:%[0-9]+]]:vgpr(<2 x s16>) = G_SHL [[COPY]], [[COPY1]](<2 x s16>) - ; GFX10: S_ENDPGM 0, implicit [[SHL]](<2 x s16>) + ; GFX10: $vcc_hi = IMPLICIT_DEF + ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX10: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX10: [[V_PK_LSHLREV_B16_:%[0-9]+]]:vgpr_32 = V_PK_LSHLREV_B16 8, [[COPY1]], 8, [[COPY]], 0, 0, 0, 0, 0, implicit $exec + ; GFX10: S_ENDPGM 0, implicit [[V_PK_LSHLREV_B16_]] %0:vgpr(<2 x s16>) = COPY $vgpr0 %1:sgpr(<2 x s16>) = COPY $sgpr0 %2:vgpr(<2 x s16>) = G_SHL %0, %1 @@ -152,15 +152,16 @@ ; GFX8: [[SHL:%[0-9]+]]:vgpr(<2 x s16>) = G_SHL [[COPY]], [[COPY1]](<2 x s16>) ; GFX8: S_ENDPGM 0, implicit [[SHL]](<2 x s16>) ; GFX9-LABEL: name: shl_v2s16_vv - ; GFX9: [[COPY:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0 - ; GFX9: [[COPY1:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr1 - ; GFX9: [[SHL:%[0-9]+]]:vgpr(<2 x s16>) = G_SHL [[COPY]], [[COPY1]](<2 x s16>) - ; GFX9: S_ENDPGM 0, implicit [[SHL]](<2 x s16>) + ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX9: [[V_PK_LSHLREV_B16_:%[0-9]+]]:vgpr_32 = V_PK_LSHLREV_B16 8, [[COPY1]], 8, [[COPY]], 0, 0, 0, 0, 0, implicit $exec + ; GFX9: S_ENDPGM 0, implicit [[V_PK_LSHLREV_B16_]] ; GFX10-LABEL: name: shl_v2s16_vv - ; GFX10: [[COPY:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0 - ; GFX10: [[COPY1:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr1 - ; GFX10: [[SHL:%[0-9]+]]:vgpr(<2 x s16>) = G_SHL [[COPY]], [[COPY1]](<2 x s16>) - ; GFX10: S_ENDPGM 0, implicit [[SHL]](<2 x s16>) + ; GFX10: $vcc_hi = IMPLICIT_DEF + ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX10: [[V_PK_LSHLREV_B16_:%[0-9]+]]:vgpr_32 = V_PK_LSHLREV_B16 8, [[COPY1]], 8, [[COPY]], 0, 0, 0, 0, 0, implicit $exec + ; GFX10: S_ENDPGM 0, implicit [[V_PK_LSHLREV_B16_]] %0:vgpr(<2 x s16>) = COPY $vgpr0 %1:vgpr(<2 x s16>) = COPY $vgpr1 %2:vgpr(<2 x s16>) = G_SHL %0, %1