diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -3322,6 +3322,131 @@ }}; } +// Strip G_BITCAST and G_FNEG and return the source register of the result in +// OutSrc. Toggle FNeg for each G_FNEG encountered. +static void stripBitcastAndFNeg(Register Src, Register &OutSrc, + const MachineRegisterInfo &MRI, bool &FNeg) { + MachineInstr *MI = MRI.getVRegDef(Src); + while (MI->getOpcode() == AMDGPU::G_BITCAST || + MI->getOpcode() == AMDGPU::G_FNEG) { + if (MI->getOpcode() == AMDGPU::G_FNEG) + FNeg ^= true; + OutSrc = MI->getOperand(1).getReg(); + MI = MRI.getVRegDef(OutSrc); + } +} + +// Figure out if the source register is actually the high 16 bits of another +// register. +static bool isHiElt(Register Src, Register &OutSrc, + const MachineRegisterInfo &MRI, bool &FNeg) { + bool CurrentFNeg = false; + stripBitcastAndFNeg(Src, Src, MRI, CurrentFNeg); + MachineInstr *MI = MRI.getVRegDef(Src); + + // Strip EXT instructions + if (MI->getOpcode() == AMDGPU::G_ANYEXT || + MI->getOpcode() == AMDGPU::G_ZEXT || MI->getOpcode() == AMDGPU::G_SEXT) { + Src = MI->getOperand(1).getReg(); + MI = MRI.getVRegDef(Src); + } + + // G_BITCAST or G_FNEG can appear after extensions, so strip them + // before continuing. + stripBitcastAndFNeg(Src, Src, MRI, CurrentFNeg); + MI = MRI.getVRegDef(Src); + + if (MI->getOpcode() != AMDGPU::G_TRUNC && MI->getOpcode() != AMDGPU::G_LSHR) + return false; + + if (MI->getOpcode() == AMDGPU::G_TRUNC) { + Src = MI->getOperand(1).getReg(); + MI = MRI.getVRegDef(Src); + } + + if (MI->getOpcode() == AMDGPU::G_LSHR) { + Register ShiftAmt = MI->getOperand(2).getReg(); + auto MaybeImmVal = getConstantVRegValWithLookThrough(ShiftAmt, MRI); + if (MaybeImmVal && MaybeImmVal->Value.getSExtValue() == 16) { + OutSrc = MI->getOperand(1).getReg(); + FNeg ^= CurrentFNeg; + return true; + } + } + + return false; +} + +// Determine if we are looking at the low 16 bits of a dword register. +static void stripTruncLoElt(Register Src, Register &OutSrc, + const MachineRegisterInfo &MRI, bool &FNeg) { + bool CurrentFNeg = false; + stripBitcastAndFNeg(Src, Src, MRI, CurrentFNeg); + MachineInstr *MI = MRI.getVRegDef(Src); + + // Strip EXT instructions + if (MI->getOpcode() == AMDGPU::G_ANYEXT || + MI->getOpcode() == AMDGPU::G_ZEXT || MI->getOpcode() == AMDGPU::G_SEXT) { + Src = MI->getOperand(1).getReg(); + MI = MRI.getVRegDef(Src); + } + + // G_BITCAST or G_FNEG can appear after extensions, so strip them + // before continuing. + stripBitcastAndFNeg(Src, Src, MRI, CurrentFNeg); + MI = MRI.getVRegDef(Src); + + if (MI->getOpcode() == AMDGPU::G_TRUNC) { + Register TruncSrc = MI->getOperand(1).getReg(); + const LLT Ty = MRI.getType(TruncSrc); + if (Ty.getSizeInBits() == 32) { + FNeg ^= CurrentFNeg; + OutSrc = TruncSrc; + } + } +} + +// Determine if the instruction pattern matches that of a G_BUILD_VECTOR_TRUNC +// that has been lowered. If so, return true and return the sources in +// LoSrcOut and HiSrcOut.
+static bool isBuildVectorTrunc(MachineInstr *MI, Register &LoSrcOut, + Register &HiSrcOut, + const MachineRegisterInfo &MRI) { + // Strip G_BITCAST + Register Src = MI->getOperand(1).getReg(); + MI = MRI.getVRegDef(Src); + + // After the G_BITCAST there should be a G_OR + if (MI->getOpcode() == AMDGPU::G_OR) { + Register LoSrc = MI->getOperand(1).getReg(); + Register HiSrc = MI->getOperand(2).getReg(); + + MachineInstr *LoMI = MRI.getVRegDef(LoSrc); + MachineInstr *HiMI = MRI.getVRegDef(HiSrc); + + // The G_OR operands should be the results of a G_AND 0xffff (Lo) and a + // G_SHL 16 (Hi) + if (LoMI->getOpcode() == AMDGPU::G_AND && + HiMI->getOpcode() == AMDGPU::G_SHL) { + Register AndMask = LoMI->getOperand(2).getReg(); + auto MaybeImmVal1 = getConstantVRegValWithLookThrough(AndMask, MRI); + Register ShiftAmt = HiMI->getOperand(2).getReg(); + auto MaybeImmVal2 = getConstantVRegValWithLookThrough(ShiftAmt, MRI); + + if (MaybeImmVal1 && MaybeImmVal1->Value.getSExtValue() == 0xffff && + MaybeImmVal2 && MaybeImmVal2->Value.getSExtValue() == 16) { + // The pattern matches a G_BUILD_VECTOR_TRUNC; return the source + // registers for the Lo and Hi sources. + LoSrcOut = LoMI->getOperand(1).getReg(); + HiSrcOut = HiMI->getOperand(1).getReg(); + return true; + } + } + } + + return false; +} + std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3PModsImpl( Register Src, const MachineRegisterInfo &MRI) const { @@ -3337,7 +3462,114 @@ MI = MRI.getVRegDef(Src); } - // TODO: Match op_sel through g_build_vector_trunc and g_shuffle_vector. + Register VecSrc = Src; + unsigned VecMods = Mods; + + SmallVector<int, 2> Mask = {0, 1}; + // Match op_sel through G_SHUFFLE_VECTOR or set mask values if a possibility + // of a G_BUILD_VECTOR_TRUNC is detected. + if (MI->getOpcode() == AMDGPU::G_SHUFFLE_VECTOR) { + ArrayRef<int> ShufMask = MI->getOperand(3).getShuffleMask(); + assert(ShufMask.size() == 2); + assert(ShufMask[0] != -1 && ShufMask[1] != -1); + + // Set mask values for G_SHUFFLE_VECTOR + Mask[0] = ShufMask[0]; + Mask[1] = ShufMask[1]; + + // Strip G_SHUFFLE_VECTOR + Src = MI->getOperand(1).getReg(); + MI = MRI.getVRegDef(Src); + + // Strip any G_FNEG before a potential G_BUILD_VECTOR_TRUNC + if (MI->getOpcode() == AMDGPU::G_FNEG) { + Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI); + Src = MI->getOperand(1).getReg(); + MI = MRI.getVRegDef(Src); + } + + // Lowering of G_BUILD_VECTOR_TRUNC always inserts a G_BITCAST. If we do + // not see one, do not look any further and just set op_sel based on the + // shuffle mask. + if (MI->getOpcode() != AMDGPU::G_BITCAST) { + // Add op_sel modifiers based on the shuffle mask. + if (Mask[0] == Mask[1] && Mask[0] == 1) + // ShuffleMask of (1,1). Both selects are for the high 16 bits. + Mods |= (SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1); + else if (Mask[0] == 1) + // ShuffleMask of (1,0). The first select is for the high 16 bits while + // the second select is for the low 16 bits. + Mods |= SISrcMods::OP_SEL_0; + else + // ShuffleMask of (0,1). The first select is for the low 16 bits while + // the second select is for the high 16 bits. + Mods |= SISrcMods::OP_SEL_1; + + return std::make_pair(Src, Mods); + } + } + + // Match op_sel through G_BUILD_VECTOR_TRUNC, which always inserts a G_BITCAST. + if (MI && MI->getOpcode() == AMDGPU::G_BITCAST) { + Register LoSrc; + Register HiSrc; + // Look for the pattern of a lowered G_BUILD_VECTOR_TRUNC and return the + // registers of the source elements. + if (isBuildVectorTrunc(MI, LoSrc, HiSrc, MRI)) { + // In the case of G_SHUFFLE_VECTOR, use the mask to select the Lo and Hi MIs. + // The default mask is (0,1).
+ MachineInstr *LoMI = + Mask[0] == 0 ? MRI.getVRegDef(LoSrc) : MRI.getVRegDef(HiSrc); + MachineInstr *HiMI = + Mask[1] == 1 ? MRI.getVRegDef(HiSrc) : MRI.getVRegDef(LoSrc); + + // Update Lo and Hi source registers + LoSrc = LoMI->getOperand(0).getReg(); + HiSrc = HiMI->getOperand(0).getReg(); + + // Keep track of G_FNEG modifiers when we strip them. + bool FNegLo = false; + bool FNegHi = false; + + // Determine if LoSrc is actually from a high 16-bit source + if (isHiElt(LoSrc, LoSrc, MRI, FNegLo)) + Mods |= SISrcMods::OP_SEL_0; + + // Determine if HiSrc is actually from a high 16-bit source + if (isHiElt(HiSrc, HiSrc, MRI, FNegHi)) + Mods |= SISrcMods::OP_SEL_1; + + stripTruncLoElt(LoSrc, LoSrc, MRI, FNegLo); + + stripTruncLoElt(HiSrc, HiSrc, MRI, FNegHi); + + // Final strip of G_BITCASTs and G_FNEGs + stripBitcastAndFNeg(LoSrc, LoSrc, MRI, FNegLo); + stripBitcastAndFNeg(HiSrc, HiSrc, MRI, FNegHi); + + // Apply G_FNEG modifiers + if (FNegLo) + Mods ^= SISrcMods::NEG; + if (FNegHi) + Mods ^= SISrcMods::NEG_HI; + + LoMI = MRI.getVRegDef(LoSrc); + HiMI = MRI.getVRegDef(HiSrc); + if (LoMI == HiMI) { + Src = LoSrc; + return std::make_pair(Src, Mods); + } + } else { + bool FNeg = false; + stripBitcastAndFNeg(Src, Src, MRI, FNeg); + if (FNeg) + Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI); + + return std::make_pair(Src, Mods); + } + } + Src = VecSrc; + Mods = VecMods; // Packed instructions do not have abs modifiers. Mods |= SISrcMods::OP_SEL_1; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll @@ -272,23 +272,20 @@ ; GFX906-LABEL: v_sdot2_fnegv2f16_c: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 -; GFX906-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 +; GFX906-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1] ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: v_sdot2_fnegv2f16_c: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 -; GFX908-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 +; GFX908-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sdot2_fnegv2f16_c: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 -; GFX10-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 +; GFX10-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1] ; GFX10-NEXT: s_setpc_b64 s[30:31] %neg.c = fneg <2 x half> %c %cast.neg.c = bitcast <2 x half> %neg.c to i32 @@ -300,23 +297,20 @@ ; GFX906-LABEL: v_sdot2_shuffle10_a: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_alignbit_b32 v0, v0, v0, 16 -; GFX906-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 +; GFX906-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[0,1,1] ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: v_sdot2_shuffle10_a: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_alignbit_b32 v0, v0, v0, 16 -; GFX908-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 +; GFX908-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[0,1,1] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: 
v_sdot2_shuffle10_a: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_alignbit_b32 v0, v0, v0, 16 -; GFX10-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 +; GFX10-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[0,1,1] ; GFX10-NEXT: s_setpc_b64 s[30:31] %shuf.a = shufflevector <2 x i16> %a, <2 x i16> undef, <2 x i32> <i32 1, i32 0> %r = call i32 @llvm.amdgcn.sdot2(<2 x i16> %shuf.a, <2 x i16> %b, i32 %c, i1 false) @@ -327,23 +321,20 @@ ; GFX906-LABEL: v_sdot2_shuffle10_b: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_alignbit_b32 v1, v1, v1, 16 -; GFX906-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 +; GFX906-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 op_sel:[0,1,0] op_sel_hi:[1,0,1] ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: v_sdot2_shuffle10_b: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_alignbit_b32 v1, v1, v1, 16 -; GFX908-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 +; GFX908-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 op_sel:[0,1,0] op_sel_hi:[1,0,1] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sdot2_shuffle10_b: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_alignbit_b32 v1, v1, v1, 16 -; GFX10-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 +; GFX10-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 op_sel:[0,1,0] op_sel_hi:[1,0,1] ; GFX10-NEXT: s_setpc_b64 s[30:31] %shuf.b = shufflevector <2 x i16> %b, <2 x i16> undef, <2 x i32> <i32 1, i32 0> %r = call i32 @llvm.amdgcn.sdot2(<2 x i16> %a, <2 x i16> %shuf.b, i32 %c, i1 false) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll @@ -114,16 +114,14 @@ ; GFX906-LABEL: v_sdot4_fnegv2f16_a: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 -; GFX906-NEXT: v_dot4_i32_i8 v0, v0, v1, v2 +; GFX906-NEXT: v_dot4_i32_i8 v0, v0, v1, v2 op_sel_hi:[0,1,1] neg_lo:[1,0,0] neg_hi:[1,0,0] ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sdot4_fnegv2f16_a: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 -; GFX10-NEXT: v_dot4_i32_i8 v0, v0, v1, v2 +; GFX10-NEXT: v_dot4_i32_i8 v0, v0, v1, v2 op_sel_hi:[0,1,1] neg_lo:[1,0,0] neg_hi:[1,0,0] ; GFX10-NEXT: s_setpc_b64 s[30:31] %neg.a = fneg <2 x half> %a %cast.neg.a = bitcast <2 x half> %neg.a to i32 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot8.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot8.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot8.ll @@ -70,16 +70,14 @@ ; GFX906-LABEL: v_sdot8_fnegv2f16_a: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 -; GFX906-NEXT: v_dot8_i32_i4 v0, v0, v1, v2 +; GFX906-NEXT: v_dot8_i32_i4 v0, v0, v1, v2 op_sel_hi:[0,1,1] neg_lo:[1,0,0] neg_hi:[1,0,0] ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sdot8_fnegv2f16_a: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 -; GFX10-NEXT: v_dot8_i32_i4 v0, v0, v1, v2
+; GFX10-NEXT: v_dot8_i32_i4 v0, v0, v1, v2 op_sel_hi:[0,1,1] neg_lo:[1,0,0] neg_hi:[1,0,0] ; GFX10-NEXT: s_setpc_b64 s[30:31] %neg.a = fneg <2 x half> %a %cast.neg.a = bitcast <2 x half> %neg.a to i32 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll @@ -272,23 +272,20 @@ ; GFX906-LABEL: v_udot2_fnegv2f16_c: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 -; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 +; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1] ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: v_udot2_fnegv2f16_c: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 -; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 +; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_udot2_fnegv2f16_c: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 -; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 +; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1] ; GFX10-NEXT: s_setpc_b64 s[30:31] %neg.c = fneg <2 x half> %c %cast.neg.c = bitcast <2 x half> %neg.c to i32 @@ -300,23 +297,20 @@ ; GFX906-LABEL: v_udot2_shuffle10_a: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_alignbit_b32 v0, v0, v0, 16 -; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 +; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[0,1,1] ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: v_udot2_shuffle10_a: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_alignbit_b32 v0, v0, v0, 16 -; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 +; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[0,1,1] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_udot2_shuffle10_a: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_alignbit_b32 v0, v0, v0, 16 -; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 +; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[0,1,1] ; GFX10-NEXT: s_setpc_b64 s[30:31] %shuf.a = shufflevector <2 x i16> %a, <2 x i16> undef, <2 x i32> <i32 1, i32 0> %r = call i32 @llvm.amdgcn.udot2(<2 x i16> %shuf.a, <2 x i16> %b, i32 %c, i1 false) @@ -327,23 +321,20 @@ ; GFX906-LABEL: v_udot2_shuffle10_b: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_alignbit_b32 v1, v1, v1, 16 -; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 +; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 op_sel:[0,1,0] op_sel_hi:[1,0,1] ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: v_udot2_shuffle10_b: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_alignbit_b32 v1, v1, v1, 16 -; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 +; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 op_sel:[0,1,0] op_sel_hi:[1,0,1] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_udot2_shuffle10_b: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_alignbit_b32 v1, v1, v1, 16 -; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 +; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 op_sel:[0,1,0] op_sel_hi:[1,0,1] ; GFX10-NEXT: s_setpc_b64 s[30:31] %shuf.b = shufflevector <2 x i16> %b, <2 x i16> undef, <2 x i32> <i32 1, i32 0> %r = call i32 @llvm.amdgcn.udot2(<2 x i16> %a, <2 x i16> %shuf.b, i32 %c, i1 false) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll @@ -114,16 +114,14 @@ ; GFX906-LABEL: v_udot4_fnegv2f16_a: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 -; GFX906-NEXT: v_dot4_u32_u8 v0, v0, v1, v2 +; GFX906-NEXT: v_dot4_u32_u8 v0, v0, v1, v2 op_sel_hi:[0,1,1] neg_lo:[1,0,0] neg_hi:[1,0,0] ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_udot4_fnegv2f16_a: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 -; GFX10-NEXT: v_dot4_u32_u8 v0, v0, v1, v2 +; GFX10-NEXT: v_dot4_u32_u8 v0, v0, v1, v2 op_sel_hi:[0,1,1] neg_lo:[1,0,0] neg_hi:[1,0,0] ; GFX10-NEXT: s_setpc_b64 s[30:31] %neg.a = fneg <2 x half> %a %cast.neg.a = bitcast <2 x half> %neg.a to i32 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot8.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot8.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot8.ll @@ -70,16 +70,14 @@ ; GFX906-LABEL: v_udot8_fnegv2f16_a: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 -; GFX906-NEXT: v_dot8_u32_u4 v0, v0, v1, v2 +; GFX906-NEXT: v_dot8_u32_u4 v0, v0, v1, v2 op_sel_hi:[0,1,1] neg_lo:[1,0,0] neg_hi:[1,0,0] ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_udot8_fnegv2f16_a: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 -; GFX10-NEXT: v_dot8_u32_u4 v0, v0, v1, v2 +; GFX10-NEXT: v_dot8_u32_u4 v0, v0, v1, v2 op_sel_hi:[0,1,1] neg_lo:[1,0,0] neg_hi:[1,0,0] ; GFX10-NEXT: s_setpc_b64 s[30:31] %neg.a = fneg <2 x half> %a %cast.neg.a = bitcast <2 x half> %neg.a to i32 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/op-sel.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/op-sel.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/op-sel.ll @@ -0,0 +1,728 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s + +define amdgpu_kernel void @fma_vector_vector_scalar_lo_no_shuffle(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 { +; GCN-LABEL: fma_vector_vector_scalar_lo_no_shuffle: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: ds_read_b32 v2, v0 +; GCN-NEXT: ds_read_b32 v0, v0 offset:4 +; GCN-NEXT: ds_read_u16 v1, v1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_pk_fma_f16 v0, v2, v0, v1 op_sel_hi:[1,1,0] +;
GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: global_store_dword v1, v0, s[2:3] +; GCN-NEXT: s_endpgm +bb: + %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1 + + %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4 + %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4 + %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2 + + %scalar0.vec1 = insertelement <2 x half> undef, half %scalar0, i32 0 + %scalar0.vec2 = insertelement <2 x half> %scalar0.vec1, half %scalar0, i32 1 + + %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %scalar0.vec2) + store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @fma_vector_vector_scalar_lo(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 { +; GCN-LABEL: fma_vector_vector_scalar_lo: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: ds_read_b32 v2, v0 +; GCN-NEXT: ds_read_b32 v0, v0 offset:4 +; GCN-NEXT: ds_read_u16 v1, v1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_pk_fma_f16 v0, v2, v0, v1 op_sel_hi:[1,1,0] +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: global_store_dword v1, v0, s[2:3] +; GCN-NEXT: s_endpgm +bb: + %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1 + + %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4 + %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4 + %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2 + + %scalar0.vec = insertelement <2 x half> undef, half %scalar0, i32 0 + %scalar0.broadcast = shufflevector <2 x half> %scalar0.vec, <2 x half> undef, <2 x i32> zeroinitializer + + %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %scalar0.broadcast) + store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @fma_vector_vector_neg_broadcast_scalar_lo(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 { +; GCN-LABEL: fma_vector_vector_neg_broadcast_scalar_lo: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: ds_read_b32 v2, v0 +; GCN-NEXT: ds_read_b32 v0, v0 offset:4 +; GCN-NEXT: ds_read_u16 v1, v1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_pk_fma_f16 v0, v2, v0, v1 op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: global_store_dword v1, v0, s[2:3] +; GCN-NEXT: s_endpgm +bb: + %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1 + + %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4 + %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4 + %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2 + + %scalar0.vec = insertelement <2 x half> undef, half %scalar0, i32 0 + %scalar0.broadcast = shufflevector <2 x half> %scalar0.vec, <2 x half> undef, <2 x i32> zeroinitializer + %neg.scalar0.broadcast = fneg <2 x half> %scalar0.broadcast + + %result = tail call <2 x half> @llvm.fma.v2f16(<2 x 
half> %vec0, <2 x half> %vec1, <2 x half> %neg.scalar0.broadcast) + store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @fma_vector_vector_neg_scalar_lo(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 { +; GCN-LABEL: fma_vector_vector_neg_scalar_lo: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: ds_read_b32 v2, v0 +; GCN-NEXT: ds_read_b32 v0, v0 offset:4 +; GCN-NEXT: ds_read_u16 v1, v1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_pk_fma_f16 v0, v2, v0, v1 op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: global_store_dword v1, v0, s[2:3] +; GCN-NEXT: s_endpgm +bb: + %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1 + + %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4 + %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4 + %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2 + + %neg.scalar0 = fneg half %scalar0 + %neg.scalar0.vec = insertelement <2 x half> undef, half %neg.scalar0, i32 0 + %neg.scalar0.broadcast = shufflevector <2 x half> %neg.scalar0.vec, <2 x half> undef, <2 x i32> zeroinitializer + + %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.scalar0.broadcast) + store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @fma_vector_vector_neg_broadcast_neg_scalar_lo(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 { +; GCN-LABEL: fma_vector_vector_neg_broadcast_neg_scalar_lo: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: ds_read_b32 v2, v0 +; GCN-NEXT: ds_read_b32 v0, v0 offset:4 +; GCN-NEXT: ds_read_u16 v1, v1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_pk_fma_f16 v0, v2, v0, v1 op_sel_hi:[1,1,0] +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: global_store_dword v1, v0, s[2:3] +; GCN-NEXT: s_endpgm +bb: + %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1 + + %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4 + %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4 + %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2 + + %neg.scalar0 = fneg half %scalar0 + %neg.scalar0.vec = insertelement <2 x half> undef, half %neg.scalar0, i32 0 + %neg.scalar0.broadcast = shufflevector <2 x half> %neg.scalar0.vec, <2 x half> undef, <2 x i32> zeroinitializer + %neg.neg.scalar0.broadcast = fneg <2 x half> %neg.scalar0.broadcast + + %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.neg.scalar0.broadcast) + store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @fma_vector_vector_scalar_neg_lo(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 { +; GCN-LABEL: fma_vector_vector_scalar_neg_lo: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; 
GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: ds_read_b32 v2, v0 +; GCN-NEXT: ds_read_b32 v0, v0 offset:4 +; GCN-NEXT: ds_read_u16 v1, v1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_pk_fma_f16 v0, v2, v0, v1 op_sel_hi:[1,1,0] neg_lo:[0,0,1] +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: global_store_dword v1, v0, s[2:3] +; GCN-NEXT: s_endpgm +bb: + %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1 + + %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4 + %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4 + %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2 + + %neg.scalar0 = fneg half %scalar0 + %neg.scalar0.vec = insertelement <2 x half> undef, half %neg.scalar0, i32 0 + %neg.scalar0.scalar0 = insertelement <2 x half> %neg.scalar0.vec, half %scalar0, i32 1 + %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.scalar0.scalar0) + store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @fma_vector_vector_scalar_neg_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 { +; GCN-LABEL: fma_vector_vector_scalar_neg_hi: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: ds_read_b32 v2, v0 +; GCN-NEXT: ds_read_b32 v0, v0 offset:4 +; GCN-NEXT: ds_read_u16 v1, v1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_pk_fma_f16 v0, v2, v0, v1 op_sel_hi:[1,1,0] neg_hi:[0,0,1] +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: global_store_dword v1, v0, s[2:3] +; GCN-NEXT: s_endpgm +bb: + %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1 + + %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4 + %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4 + %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2 + + %neg.scalar0 = fneg half %scalar0 + %neg.scalar0.vec = insertelement <2 x half> undef, half %scalar0, i32 0 + %scalar0.neg.scalar0 = insertelement <2 x half> %neg.scalar0.vec, half %neg.scalar0, i32 1 + %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %scalar0.neg.scalar0) + store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @add_vector_neg_bitcast_scalar_lo(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 { +; GCN-LABEL: add_vector_neg_bitcast_scalar_lo: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: ds_read_b32 v0, v0 +; GCN-NEXT: ds_read_u16 v1, v1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_pk_add_u16 v0, v0, v1 op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1] +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: global_store_dword v1, v0, s[2:3] +; GCN-NEXT: s_endpgm +bb: + %vec0 = load volatile <2 x i16>, <2 x i16> addrspace(3)* %lds, align 4 + %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2 + %neg.scalar0 = fneg half %scalar0 + %neg.scalar0.bc = bitcast half %neg.scalar0 to i16 + + %neg.scalar0.vec = 
insertelement <2 x i16> undef, i16 %neg.scalar0.bc, i32 0 + %neg.scalar0.broadcast = shufflevector <2 x i16> %neg.scalar0.vec, <2 x i16> undef, <2 x i32> zeroinitializer + + %result = add <2 x i16> %vec0, %neg.scalar0.broadcast + store <2 x i16> %result, <2 x i16> addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @fma_vector_vector_neg_vector_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 { +; GCN-LABEL: fma_vector_vector_neg_vector_hi: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: ds_read_b32 v1, v0 +; GCN-NEXT: ds_read_b32 v2, v0 offset:4 +; GCN-NEXT: ds_read_b32 v0, v0 offset:8 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 op_sel:[0,0,1] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: global_store_dword v1, v0, s[2:3] +; GCN-NEXT: s_endpgm +bb: + %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1 + %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2 + + %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4 + %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4 + %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4 + + %vec2.fneg = fneg <2 x half> %vec2 + %vec2.fneg.elt1.broadcast = shufflevector <2 x half> %vec2.fneg, <2 x half> undef, <2 x i32> <i32 1, i32 1> + + %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %vec2.fneg.elt1.broadcast) + store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @fma_vector_vector_vector_neg_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 { +; GCN-LABEL: fma_vector_vector_vector_neg_hi: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: ds_read_b32 v1, v0 +; GCN-NEXT: ds_read_b32 v2, v0 offset:4 +; GCN-NEXT: ds_read_b32 v0, v0 offset:8 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 neg_hi:[0,0,1] +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: global_store_dword v1, v0, s[2:3] +; GCN-NEXT: s_endpgm +bb: + %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1 + %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2 + + %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4 + %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4 + %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4 + + %vec2.elt1 = extractelement <2 x half> %vec2, i32 1 + %neg.vec2.elt1 = fneg half %vec2.elt1 + + %neg.vec2.elt1.insert = insertelement <2 x half> %vec2, half %neg.vec2.elt1, i32 1 + %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.vec2.elt1.insert) + store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @add_vector_scalar_hi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(3)* %lds) #0 { +; GCN-LABEL: add_vector_scalar_hi: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: ds_read_b32 v1, v0 +; GCN-NEXT: ds_read_b32 v0, v0 offset:4 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_pk_add_u16 v0, v1, v0 op_sel:[0,1] +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: global_store_dword v1, v0, s[2:3] +; GCN-NEXT: s_endpgm +bb: + %lds.gep1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(3)* %lds, i32 1 + + %vec0 = load volatile <2 x i16>, <2 x i16> addrspace(3)* %lds, align 4 + %vec1 = load volatile <2 x i16>, <2 x i16> addrspace(3)* %lds.gep1, align 4 + + %vec1.elt1.broadcast = shufflevector <2 x i16> %vec1, <2 x i16> undef, <2 x i32> <i32 1, i32 1> + %result = add <2 x i16> %vec0, %vec1.elt1.broadcast + + store <2 x i16> %result, <2 x i16> addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @fma_vector_vector_scalar_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 { +; GCN-LABEL: fma_vector_vector_scalar_hi: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: ds_read_b32 v1, v0 +; GCN-NEXT: ds_read_b32 v2, v0 offset:4 +; GCN-NEXT: ds_read_b32 v0, v0 offset:8 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 op_sel:[0,0,1] +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: global_store_dword v1, v0, s[2:3] +; GCN-NEXT: s_endpgm +bb: + %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1 + %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2 + + %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4 + %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4 + %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4 + + %vec2.elt1.broadcast = shufflevector <2 x half> %vec2, <2 x half> undef, <2 x i32> <i32 1, i32 1> + + %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %vec2.elt1.broadcast) + + store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @fma_vector_vector_neg_vector_lo_neg_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 { +; GCN-LABEL: fma_vector_vector_neg_vector_lo_neg_hi: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: ds_read_b32 v1, v0 +; GCN-NEXT: ds_read_b32 v2, v0 offset:4 +; GCN-NEXT: ds_read_b32 v0, v0 offset:8 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: global_store_dword v1, v0, s[2:3] +; GCN-NEXT: s_endpgm +bb: + %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1 + %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2 + + %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4 + %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4 + %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4 + + %neg.vec2 = fneg <2 x half> %vec2 + %neg.vec2.elt1 = extractelement <2 x half> %neg.vec2, i32 1 + %neg.neg.vec2.elt1 = fneg half %neg.vec2.elt1 + %neg.neg.vec2.elt1.insert = insertelement <2 x half> %vec2, half %neg.neg.vec2.elt1, i32 1 + + %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.neg.vec2.elt1.insert) + store <2 x half> %result,
<2 x half> addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @fma_vector_vector_swap_vector(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 { +; GCN-LABEL: fma_vector_vector_swap_vector: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: ds_read_b32 v1, v0 +; GCN-NEXT: ds_read_b32 v2, v0 offset:4 +; GCN-NEXT: ds_read_b32 v0, v0 offset:8 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 op_sel:[0,0,1] op_sel_hi:[1,1,0] +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: global_store_dword v1, v0, s[2:3] +; GCN-NEXT: s_endpgm +bb: + %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1 + %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2 + + %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4 + %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4 + %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4 + + %vec2.swap = shufflevector <2 x half> %vec2, <2 x half> undef, <2 x i32> <i32 1, i32 0> + %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %vec2.swap) + + store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @fma_vector_vector_swap_neg_vector(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 { +; GCN-LABEL: fma_vector_vector_swap_neg_vector: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: ds_read_b32 v1, v0 +; GCN-NEXT: ds_read_b32 v2, v0 offset:4 +; GCN-NEXT: ds_read_b32 v0, v0 offset:8 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 op_sel:[0,0,1] op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: global_store_dword v1, v0, s[2:3] +; GCN-NEXT: s_endpgm +bb: + %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1 + %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2 + + %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4 + %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4 + %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4 + %neg.vec2 = fneg <2 x half> %vec2 + + %neg.vec2.swap = shufflevector <2 x half> %neg.vec2, <2 x half> undef, <2 x i32> <i32 1, i32 0> + %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.vec2.swap) + + store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @fma_vector_vector_blend_vector_neg_vector_0(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 { +; GCN-LABEL: fma_vector_vector_blend_vector_neg_vector_0: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: ds_read_b32 v1, v0 +; GCN-NEXT: ds_read_b32 v2, v0 offset:4 +; GCN-NEXT: ds_read_b32 v0, v0 offset:8 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 op_sel:[0,0,1] op_sel_hi:[1,1,0] neg_lo:[0,0,1] +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: global_store_dword v1, v0, s[2:3] +; GCN-NEXT: s_endpgm +bb: + %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1 + %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2 + + %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4 + %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4 + %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4 + %neg.vec2 = fneg <2 x half> %vec2 + %combined = shufflevector <2 x half> %vec2, <2 x half> %neg.vec2, <2 x i32> <i32 3, i32 0> + %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %combined) + + store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @fma_vector_vector_blend_vector_neg_vector_1(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 { +; GCN-LABEL: fma_vector_vector_blend_vector_neg_vector_1: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: ds_read_b32 v1, v0 +; GCN-NEXT: ds_read_b32 v2, v0 offset:4 +; GCN-NEXT: ds_read_b32 v0, v0 offset:8 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 neg_lo:[0,0,1] +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: global_store_dword v1, v0, s[2:3] +; GCN-NEXT: s_endpgm +bb: + %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1 + %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2 + + %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4 + %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4 + %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4 + %neg.vec2 = fneg <2 x half> %vec2 + %combined = shufflevector <2 x half> %vec2, <2 x half> %neg.vec2, <2 x i32> <i32 2, i32 1> + %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %combined) + + store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @fma_vector_vector_blend_vector_neg_vector_2(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 { +; GCN-LABEL: fma_vector_vector_blend_vector_neg_vector_2: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: ds_read_b32 v1, v0 +; GCN-NEXT: ds_read_b32 v2, v0 offset:4 +; GCN-NEXT: ds_read_b32 v0, v0 offset:8 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 neg_hi:[0,0,1] +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: global_store_dword v1, v0, s[2:3] +; GCN-NEXT: s_endpgm +bb: + %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1 + %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2 + + %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4 + %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4 + %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4 + %neg.vec2 = fneg <2 x half> %vec2 + %combined = shufflevector <2 x half> %vec2, <2 x half> %neg.vec2, <2 x i32> <i32 0, i32 3> + %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %combined) + + store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4 + ret
void +} + +define amdgpu_kernel void @fma_vector_vector_blend_vector_neg_vector_3(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 { +; GCN-LABEL: fma_vector_vector_blend_vector_neg_vector_3: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: ds_read_b32 v1, v0 +; GCN-NEXT: ds_read_b32 v2, v0 offset:4 +; GCN-NEXT: ds_read_b32 v0, v0 offset:8 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 op_sel:[0,0,1] neg_lo:[0,0,1] +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: global_store_dword v1, v0, s[2:3] +; GCN-NEXT: s_endpgm +bb: + %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1 + %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2 + + %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4 + %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4 + %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4 + %neg.vec2 = fneg <2 x half> %vec2 + %combined = shufflevector <2 x half> %vec2, <2 x half> %neg.vec2, <2 x i32> + %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %combined) + + store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @bitcast_fneg_f32(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 { +; GCN-LABEL: bitcast_fneg_f32: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: ds_read_b32 v0, v0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: ds_read_b32 v1, v0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_pk_add_f16 v0, v0, v1 op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1] +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: global_store_dword v1, v0, s[2:3] +; GCN-NEXT: s_endpgm +bb: + %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4 + %f32 = load volatile float, float addrspace(3)* undef, align 4 + %neg.f32 = fneg float %f32 + %bc = bitcast float %neg.f32 to <2 x half> + %result = fadd <2 x half> %vec0, %bc + + store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @shuffle_bitcast_fneg_f32(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 { +; GCN-LABEL: shuffle_bitcast_fneg_f32: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: ds_read_b32 v0, v0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: ds_read_b32 v1, v0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_pk_add_f16 v0, v0, v1 op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1] +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: global_store_dword v1, v0, s[2:3] +; GCN-NEXT: s_endpgm +bb: + %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4 + + %f32 = load volatile float, float addrspace(3)* undef, align 4 + %neg.f32 = fneg float %f32 + %bc = bitcast float %neg.f32 to <2 x half> + %shuf = shufflevector <2 x half> %bc, <2 x half> undef, <2 x i32> + %result = fadd <2 x half> %vec0, %shuf + store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @bitcast_lo_elt_op_sel(<2 x half> 
addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 { +; GCN-LABEL: bitcast_lo_elt_op_sel: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: ds_read_b32 v1, v0 +; GCN-NEXT: ds_read_b32 v2, v0 offset:4 +; GCN-NEXT: ds_read_b32 v0, v0 offset:8 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: global_load_ushort v3, v[0:1], off glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_pk_add_f16 v0, v0, 2.0 op_sel_hi:[1,0] +; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 op_sel:[0,0,1] op_sel_hi:[1,1,0] +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: global_store_dword v1, v0, s[2:3] +; GCN-NEXT: s_endpgm +bb: + %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1 + %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2 + + %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4 + %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4 + %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4 + + %scalar0 = load volatile i16, i16 addrspace(1)* undef + %shl = shl i16 %scalar0, 1 + %shl.bc = bitcast i16 %shl to half + + %fadd = fadd <2 x half> %vec2, + %shuffle = shufflevector <2 x half> %fadd, <2 x half> %vec2, <2 x i32> + + %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %shuffle) + store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @mix_elt_types_op_sel(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 { +; GCN-LABEL: mix_elt_types_op_sel: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: ds_read_b32 v1, v0 +; GCN-NEXT: ds_read_b32 v2, v0 offset:4 +; GCN-NEXT: ds_read_b32 v0, v0 offset:8 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: global_load_ushort v3, v[0:1], off glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: global_load_ushort v3, v[0:1], off glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; kill: killed $vgpr0_vgpr1 +; GCN-NEXT: v_pk_add_f16 v0, v0, 2.0 op_sel_hi:[1,0] +; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 op_sel:[0,0,1] op_sel_hi:[1,1,0] +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: global_store_dword v1, v0, s[2:3] +; GCN-NEXT: s_endpgm +bb: + %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1 + %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2 + + %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4 + %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4 + %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4 + + %scalar0 = load volatile i16, i16 addrspace(1)* undef + %scalar1 = load volatile half, half addrspace(1)* undef + %shl = shl i16 %scalar0, 1 + %shl.bc = bitcast i16 %shl to half + + %insert0 = insertelement <2 x half> undef, half %shl.bc, i32 0 + + %fadd = fadd <2 x half> %vec2, + %insert1 = shufflevector <2 x half> %fadd, <2 x half> %insert0, <2 x i32> + + %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %insert1) + store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4 + ret void +} + +declare <2 x half> @llvm.fma.v2f16(<2 x half>, <2 x half>, <2 x half>) #1 + +attributes #0 
= { nounwind } +attributes #1 = { nounwind readnone } +
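
Note on the matched patterns: by the time selectVOP3PModsImpl runs, G_BUILD_VECTOR_TRUNC has already been lowered, so isBuildVectorTrunc/isHiElt recognize the generic MIR shape it leaves behind rather than the opcode itself. A minimal sketch of that shape is shown below; it is illustrative only, with register names invented here rather than taken from an actual -stop-after dump.

    ; Lowered G_BUILD_VECTOR_TRUNC, the shape isBuildVectorTrunc() looks for:
    %mask:_(s32) = G_CONSTANT i32 65535
    %sixteen:_(s32) = G_CONSTANT i32 16
    %lo_bits:_(s32) = G_AND %lo32, %mask       ; Lo element kept in bits [15:0]
    %hi_bits:_(s32) = G_SHL %hi32, %sixteen    ; Hi element moved to bits [31:16]
    %packed:_(s32) = G_OR %lo_bits, %hi_bits
    %vec:_(<2 x s16>) = G_BITCAST %packed

    ; Hi-element source, the shape isHiElt() looks for behind %lo32/%hi32:
    %shifted:_(s32) = G_LSHR %full, %sixteen
    %elt:_(s16) = G_TRUNC %shifted

Once the G_AND 0xffff / G_SHL 16 / G_OR / G_BITCAST chain is found, the Lo and Hi sources are traced back through G_TRUNC, extensions, G_BITCAST and G_FNEG so that op_sel, neg_lo and neg_hi can be folded into the VOP3P instruction, which is what removes the v_xor_b32/v_alignbit_b32 instructions from the test checks above.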