diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -3322,6 +3322,136 @@ }}; } +// Strip G_BITCAST and G_FNEG and return the source register of the result. If +// any G_FNEGs were encountered, xor the value in FNeg. +static Register +stripBitcastAndFNeg(Register Src, const MachineRegisterInfo &MRI, bool &FNeg) { + LLT Ty = MRI.getType(Src); + + MachineInstr *MI = MRI.getVRegDef(Src); + while (MI->getOpcode() == AMDGPU::G_BITCAST || + // We can only strip the FNEG of packed components or an individual 16-bit + // component + (MI->getOpcode() == AMDGPU::G_FNEG && + (Ty == LLT::vector(2, 16) || Ty.getSizeInBits() == 16))) { + if (MI->getOpcode() == AMDGPU::G_FNEG) + FNeg ^= true; + + Src = MI->getOperand(1).getReg(); + MI = MRI.getVRegDef(Src); + Ty = MRI.getType(Src); + } + + return Src; +} + +// Figure out if the source register is actually the high 16 bits of another +// register. +static Register isHiElt(Register Src, const MachineRegisterInfo &MRI, + bool &FNeg) { + bool CurrentFNeg = false; + Src = stripBitcastAndFNeg(Src, MRI, CurrentFNeg); + MachineInstr *MI = MRI.getVRegDef(Src); + + // Strip EXT instructions + if (MI->getOpcode() == AMDGPU::G_ANYEXT || + MI->getOpcode() == AMDGPU::G_ZEXT || MI->getOpcode() == AMDGPU::G_SEXT) { + Src = MI->getOperand(1).getReg(); + MI = MRI.getVRegDef(Src); + } + + // G_BITCAST or G_FNEG can appear after extensions so look to strip them + // before continuing. + Src = stripBitcastAndFNeg(Src, MRI, CurrentFNeg); + MI = MRI.getVRegDef(Src); + + if (MI->getOpcode() == AMDGPU::G_TRUNC) { + Src = MI->getOperand(1).getReg(); + MI = MRI.getVRegDef(Src); + } + + if (MI->getOpcode() == AMDGPU::G_LSHR) { + Register ShiftAmt = MI->getOperand(2).getReg(); + auto MaybeImmVal = getConstantVRegValWithLookThrough(ShiftAmt, MRI); + if (MaybeImmVal && MaybeImmVal->Value.getSExtValue() == 16) { + FNeg ^= CurrentFNeg; + return MI->getOperand(1).getReg(); + } + } + + return MCRegister::NoRegister; +} + +// Determine if we are looking at the low 16 bits of a dword register. +static Register stripTruncLoElt(Register Src, const MachineRegisterInfo &MRI, + bool &FNeg) { + bool CurrentFNeg = false; + Src = stripBitcastAndFNeg(Src, MRI, CurrentFNeg); + MachineInstr *MI = MRI.getVRegDef(Src); + + // Strip EXT instructions + if (MI->getOpcode() == AMDGPU::G_ANYEXT || + MI->getOpcode() == AMDGPU::G_ZEXT || MI->getOpcode() == AMDGPU::G_SEXT) { + Src = MI->getOperand(1).getReg(); + MI = MRI.getVRegDef(Src); + } + + // G_BITCAST or G_FNEG can appear after extensions so look to strip them + // before continuing. + Src = stripBitcastAndFNeg(Src, MRI, CurrentFNeg); + MI = MRI.getVRegDef(Src); + + if (MI->getOpcode() == AMDGPU::G_TRUNC) { + Register TruncSrc = MI->getOperand(1).getReg(); + const LLT Ty = MRI.getType(TruncSrc); + if (Ty.getSizeInBits() == 32) { + FNeg ^= CurrentFNeg; + return TruncSrc; + } + } + + return MCRegister::NoRegister; +} + +// Determine if the instruction pattern matches that of a G_BUILD_VECTOR_TRUNC +// that has been lowered. If so, return the two source registers.
+static std::pair<Register, Register> +isBuildVectorTrunc(MachineInstr *MI, const MachineRegisterInfo &MRI) { + // Strip G_BITCAST + assert(MI->getOpcode() == AMDGPU::G_BITCAST); + Register Src = MI->getOperand(1).getReg(); + MI = MRI.getVRegDef(Src); + + // After the G_BITCAST there should be a G_OR + if (MI->getOpcode() == AMDGPU::G_OR) { + Register LoSrc = MI->getOperand(1).getReg(); + Register HiSrc = MI->getOperand(2).getReg(); + + MachineInstr *LoMI = MRI.getVRegDef(LoSrc); + MachineInstr *HiMI = MRI.getVRegDef(HiSrc); + + // G_OR operands should be the results of a G_AND 0xffff (Lo) and a G_SHL 16 + // (Hi) + if (LoMI->getOpcode() == AMDGPU::G_AND && + HiMI->getOpcode() == AMDGPU::G_SHL) { + Register AndMask = LoMI->getOperand(2).getReg(); + auto MaybeImmVal1 = getConstantVRegValWithLookThrough(AndMask, MRI); + Register ShiftAmt = HiMI->getOperand(2).getReg(); + auto MaybeImmVal2 = getConstantVRegValWithLookThrough(ShiftAmt, MRI); + + if (MaybeImmVal1 && MaybeImmVal1->Value.getSExtValue() == 0xffff && + MaybeImmVal2 && MaybeImmVal2->Value.getSExtValue() == 16) { + // Pattern matches a G_BUILD_VECTOR_TRUNC, return the source registers for + // the Lo and Hi sources. + return std::make_pair(LoMI->getOperand(1).getReg(), + HiMI->getOperand(1).getReg()); + } + } + } + + return std::make_pair(MCRegister::NoRegister, MCRegister::NoRegister); +} + std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3PModsImpl( Register Src, const MachineRegisterInfo &MRI) const { @@ -3337,7 +3467,128 @@ MI = MRI.getVRegDef(Src); } - // TODO: Match op_sel through g_build_vector_trunc and g_shuffle_vector. + Register VecSrc = Src; + unsigned VecMods = Mods; + + int Mask[2] = {0, 1}; + // Match op_sel through G_SHUFFLE_VECTOR or set mask values if a possibility + // of a G_BUILD_VECTOR_TRUNC is detected. + if (MI->getOpcode() == AMDGPU::G_SHUFFLE_VECTOR) { + ArrayRef<int> ShufMask = MI->getOperand(3).getShuffleMask(); + assert(ShufMask.size() == 2); + assert(ShufMask[0] != -1 && ShufMask[1] != -1); + + // Set mask values for G_SHUFFLE_VECTOR + Mask[0] = ShufMask[0]; + Mask[1] = ShufMask[1]; + + // Strip G_SHUFFLE_VECTOR + Src = MI->getOperand(1).getReg(); + MI = MRI.getVRegDef(Src); + + // Strip any G_FNEG before a potential G_BUILD_VECTOR_TRUNC + if (MI->getOpcode() == AMDGPU::G_FNEG) { + Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI); + Src = MI->getOperand(1).getReg(); + MI = MRI.getVRegDef(Src); + } + + // Lowering of G_BUILD_VECTOR_TRUNC always inserts a G_BITCAST. If we do not + // see it, do not look any further and just set op_sel based on the + // shuffle mask. + if (MI->getOpcode() != AMDGPU::G_BITCAST) { + // Add op_sel modifiers based on the shuffle mask. + if (Mask[0] == Mask[1] && Mask[0] == 1) + // ShuffleMask of (1,1). Both selects are for the high 16 bits + Mods |= (SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1); + else if (Mask[0] == 1) + // ShuffleMask of (1,0). First select is for the high 16 bits while the + // second select is for the low 16 bits + Mods |= SISrcMods::OP_SEL_0; + else + // ShuffleMask of (0,1). First select is for the low 16 bits while the + // second select is for the high 16 bits + Mods |= SISrcMods::OP_SEL_1; + + return std::make_pair(Src, Mods); + } + } + + // Match op_sel through G_BUILD_VECTOR_TRUNC which always inserts a G_BITCAST + if (MI && MI->getOpcode() == AMDGPU::G_BITCAST) { + Register LoSrc; + Register HiSrc; + + // Look for the pattern of G_BUILD_VECTOR_TRUNC and return the registers of + // the source elements + std::tie(LoSrc, HiSrc) = isBuildVectorTrunc(MI, MRI); + if (LoSrc && HiSrc) { + // In case of G_SHUFFLE_VECTOR, use the mask to select the Lo and Hi MIs. + // Default mask is (0,1). + MachineInstr *LoMI = + Mask[0] == 0 ? MRI.getVRegDef(LoSrc) : MRI.getVRegDef(HiSrc); + MachineInstr *HiMI = + Mask[1] == 1 ? MRI.getVRegDef(HiSrc) : MRI.getVRegDef(LoSrc); + + // Update Lo and Hi source registers + LoSrc = LoMI->getOperand(0).getReg(); + HiSrc = HiMI->getOperand(0).getReg(); + + // Keep track of G_FNEG modifiers when we strip them. + bool FNegLo = false; + bool FNegHi = false; + + // Determine if LoSrc is actually from a high 16-bit source + if (auto HiEltReg = isHiElt(LoSrc, MRI, FNegLo)) { + Mods |= SISrcMods::OP_SEL_0; + LoSrc = HiEltReg; + } + + // Determine if HiSrc is actually from a high 16-bit source + if (auto HiEltReg = isHiElt(HiSrc, MRI, FNegHi)) { + Mods |= SISrcMods::OP_SEL_1; + HiSrc = HiEltReg; + } + + if (auto LoEltReg = stripTruncLoElt(LoSrc, MRI, FNegLo)) + LoSrc = LoEltReg; + + if (auto LoEltReg = stripTruncLoElt(HiSrc, MRI, FNegHi)) + HiSrc = LoEltReg; + + // Final strip of G_BITCASTs and G_FNEGs + LoSrc = stripBitcastAndFNeg(LoSrc, MRI, FNegLo); + HiSrc = stripBitcastAndFNeg(HiSrc, MRI, FNegHi); + + // Apply G_FNEG modifiers + if (FNegLo) + Mods ^= SISrcMods::NEG; + if (FNegHi) + Mods ^= SISrcMods::NEG_HI; + + LoMI = MRI.getVRegDef(LoSrc); + HiMI = MRI.getVRegDef(HiSrc); + if (LoMI == HiMI) { + Src = LoSrc; + return std::make_pair(Src, Mods); + } + } else if (MRI.getType(Src) == LLT::vector(2, 16)) { + bool FNeg = false; + Src = stripBitcastAndFNeg(Src, MRI, FNeg); + if (FNeg) + Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI); + + // Account for G_SHUFFLE_VECTOR changing the mask and apply the op_sel mods. + if (Mask[0] == 1) + Mods |= SISrcMods::OP_SEL_0; + if (Mask[1] == 1) + Mods |= SISrcMods::OP_SEL_1; + + return std::make_pair(Src, Mods); + } + } + + Src = VecSrc; + Mods = VecMods; // Packed instructions do not have abs modifiers.
Mods |= SISrcMods::OP_SEL_1; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-op-sel.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-op-sel.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-op-sel.mir @@ -0,0 +1,1467 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s +--- +name: fma_vector_vector_scalar_lo_no_shuffle +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1 + ; GFX9-LABEL: name: fma_vector_vector_scalar_lo_no_shuffle + ; GFX9: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX9: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 36, 0 :: (dereferenceable invariant load 8, align 4, addrspace 4) + ; GFX9: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 44, 0 :: (dereferenceable invariant load 8, align 4, addrspace 4) + ; GFX9: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM1]].sub0 + ; GFX9: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM1]].sub1 + ; GFX9: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] + ; GFX9: [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[COPY3]], 0, 0, implicit $exec :: (volatile load 4, addrspace 3) + ; GFX9: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] + ; GFX9: [[DS_READ_B32_gfx9_1:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[COPY4]], 4, 0, implicit $exec :: (volatile load 4, addrspace 3) + ; GFX9: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY2]] + ; GFX9: [[DS_READ_U16_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_U16_gfx9 [[COPY5]], 0, 0, implicit $exec :: (volatile load 2, addrspace 3) + ; GFX9: %25:vgpr_32 = nofpexcept V_PK_FMA_F16 8, [[DS_READ_B32_gfx9_]], 8, [[DS_READ_B32_gfx9_1]], 0, [[DS_READ_U16_gfx9_]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX9: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], %25, [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store 4, addrspace 1) + ; GFX9: S_ENDPGM 0 + %3:sgpr(p4) = COPY $sgpr0_sgpr1 + %8:sgpr(s64) = G_CONSTANT i64 36 + %9:sgpr(p4) = G_PTR_ADD %3:sgpr, %8:sgpr(s64) + %10:sgpr(p1) = G_LOAD %9:sgpr(p4) :: (dereferenceable invariant load 8, align 4, addrspace 4) + %11:sgpr(s64) = G_CONSTANT i64 44 + %12:sgpr(p4) = G_PTR_ADD %3:sgpr, %11:sgpr(s64) + %13:sgpr(<2 x s32>) = G_LOAD %12:sgpr(p4) :: (dereferenceable invariant load 8, align 4, addrspace 4) + %14:sgpr(s32) = G_EXTRACT %13:sgpr(<2 x s32>), 0 + %17:sgpr(s32) = G_EXTRACT %13:sgpr(<2 x s32>), 32 + %22:sgpr(s32) = G_CONSTANT i32 4 + %16:sgpr(p3) = G_INTTOPTR %14:sgpr(s32) + %23:sgpr(p3) = G_PTR_ADD %16:sgpr, %22:sgpr(s32) + %33:vgpr(p3) = COPY %16:sgpr(p3) + %24:vgpr(<2 x s16>) = G_LOAD %33:vgpr(p3) :: (volatile load 4, addrspace 3) + %34:vgpr(p3) = COPY %23:sgpr(p3) + %25:vgpr(<2 x s16>) = G_LOAD %34:vgpr(p3) :: (volatile load 4, addrspace 3) + %19:sgpr(p3) = G_INTTOPTR %17:sgpr(s32) + %35:vgpr(p3) = COPY %19:sgpr(p3) + %31:vgpr(s32) = G_LOAD %35:vgpr(p3) :: (volatile load 2, addrspace 3) + %36:vgpr(s32) = G_CONSTANT i32 65535 + %37:vgpr(s32) = G_CONSTANT i32 16 + %38:vgpr(s32) = G_SHL %31:vgpr, %37:vgpr(s32) + %39:vgpr(s32) = G_AND %31:vgpr, %36:vgpr + %40:vgpr(s32) = G_OR %39:vgpr, %38:vgpr + %29:vgpr(<2 x s16>) = G_BITCAST %40:vgpr(s32) + %30:vgpr(<2 x s16>) = G_FMA %24:vgpr, %25:vgpr, %29:vgpr + %41:vgpr(p1) = COPY %10:sgpr(p1) + G_STORE %30:vgpr(<2 x 
s16>), %41:vgpr(p1) :: (store 4, addrspace 1) + S_ENDPGM 0 +... +--- +name: fma_vector_vector_scalar_lo +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1 + ; GFX9-LABEL: name: fma_vector_vector_scalar_lo + ; GFX9: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX9: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 36, 0 :: (dereferenceable invariant load 8, align 4, addrspace 4) + ; GFX9: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 44, 0 :: (dereferenceable invariant load 8, align 4, addrspace 4) + ; GFX9: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM1]].sub0 + ; GFX9: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM1]].sub1 + ; GFX9: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] + ; GFX9: [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[COPY3]], 0, 0, implicit $exec :: (volatile load 4, addrspace 3) + ; GFX9: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] + ; GFX9: [[DS_READ_B32_gfx9_1:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[COPY4]], 4, 0, implicit $exec :: (volatile load 4, addrspace 3) + ; GFX9: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY2]] + ; GFX9: [[DS_READ_U16_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_U16_gfx9 [[COPY5]], 0, 0, implicit $exec :: (volatile load 2, addrspace 3) + ; GFX9: %29:vgpr_32 = nofpexcept V_PK_FMA_F16 8, [[DS_READ_B32_gfx9_]], 8, [[DS_READ_B32_gfx9_1]], 0, [[DS_READ_U16_gfx9_]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX9: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], %29, [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store 4, addrspace 1) + ; GFX9: S_ENDPGM 0 + %3:sgpr(p4) = COPY $sgpr0_sgpr1 + %28:sgpr(<2 x s16>) = G_IMPLICIT_DEF + %8:sgpr(s64) = G_CONSTANT i64 36 + %9:sgpr(p4) = G_PTR_ADD %3:sgpr, %8:sgpr(s64) + %10:sgpr(p1) = G_LOAD %9:sgpr(p4) :: (dereferenceable invariant load 8, align 4, addrspace 4) + %11:sgpr(s64) = G_CONSTANT i64 44 + %12:sgpr(p4) = G_PTR_ADD %3:sgpr, %11:sgpr(s64) + %13:sgpr(<2 x s32>) = G_LOAD %12:sgpr(p4) :: (dereferenceable invariant load 8, align 4, addrspace 4) + %14:sgpr(s32) = G_EXTRACT %13:sgpr(<2 x s32>), 0 + %17:sgpr(s32) = G_EXTRACT %13:sgpr(<2 x s32>), 32 + %22:sgpr(s32) = G_CONSTANT i32 4 + %16:sgpr(p3) = G_INTTOPTR %14:sgpr(s32) + %23:sgpr(p3) = G_PTR_ADD %16:sgpr, %22:sgpr(s32) + %35:vgpr(p3) = COPY %16:sgpr(p3) + %24:vgpr(<2 x s16>) = G_LOAD %35:vgpr(p3) :: (volatile load 4, addrspace 3) + %36:vgpr(p3) = COPY %23:sgpr(p3) + %25:vgpr(<2 x s16>) = G_LOAD %36:vgpr(p3) :: (volatile load 4, addrspace 3) + %19:sgpr(p3) = G_INTTOPTR %17:sgpr(s32) + %37:vgpr(p3) = COPY %19:sgpr(p3) + %32:vgpr(s32) = G_LOAD %37:vgpr(p3) :: (volatile load 2, addrspace 3) + %34:sgpr(s32) = G_IMPLICIT_DEF + %38:vgpr(s32) = G_CONSTANT i32 65535 + %39:sgpr(s32) = G_CONSTANT i32 16 + %40:sgpr(s32) = G_SHL %34:sgpr, %39:sgpr(s32) + %41:vgpr(s32) = G_AND %32:vgpr, %38:vgpr + %42:vgpr(s32) = G_OR %41:vgpr, %40:sgpr + %27:vgpr(<2 x s16>) = G_BITCAST %42:vgpr(s32) + %43:vgpr(<2 x s16>) = COPY %28:sgpr(<2 x s16>) + %29:vgpr(<2 x s16>) = G_SHUFFLE_VECTOR %27:vgpr(<2 x s16>), %43:vgpr, shufflemask(0, 0) + %30:vgpr(<2 x s16>) = G_FMA %24:vgpr, %25:vgpr, %29:vgpr + %44:vgpr(p1) = COPY %10:sgpr(p1) + G_STORE %30:vgpr(<2 x s16>), %44:vgpr(p1) :: (store 4, addrspace 1) + S_ENDPGM 0 +... 
+--- +name: fma_vector_vector_neg_broadcast_scalar_lo +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1 + ; GFX9-LABEL: name: fma_vector_vector_neg_broadcast_scalar_lo + ; GFX9: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX9: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 36, 0 :: (dereferenceable invariant load 8, align 4, addrspace 4) + ; GFX9: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 44, 0 :: (dereferenceable invariant load 8, align 4, addrspace 4) + ; GFX9: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM1]].sub0 + ; GFX9: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM1]].sub1 + ; GFX9: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] + ; GFX9: [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[COPY3]], 0, 0, implicit $exec :: (volatile load 4, addrspace 3) + ; GFX9: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] + ; GFX9: [[DS_READ_B32_gfx9_1:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[COPY4]], 4, 0, implicit $exec :: (volatile load 4, addrspace 3) + ; GFX9: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY2]] + ; GFX9: [[DS_READ_U16_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_U16_gfx9 [[COPY5]], 0, 0, implicit $exec :: (volatile load 2, addrspace 3) + ; GFX9: %30:vgpr_32 = nofpexcept V_PK_FMA_F16 8, [[DS_READ_B32_gfx9_]], 8, [[DS_READ_B32_gfx9_1]], 3, [[DS_READ_U16_gfx9_]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX9: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], %30, [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store 4, addrspace 1) + ; GFX9: S_ENDPGM 0 + %3:sgpr(p4) = COPY $sgpr0_sgpr1 + %28:sgpr(<2 x s16>) = G_IMPLICIT_DEF + %8:sgpr(s64) = G_CONSTANT i64 36 + %9:sgpr(p4) = G_PTR_ADD %3:sgpr, %8:sgpr(s64) + %10:sgpr(p1) = G_LOAD %9:sgpr(p4) :: (dereferenceable invariant load 8, align 4, addrspace 4) + %11:sgpr(s64) = G_CONSTANT i64 44 + %12:sgpr(p4) = G_PTR_ADD %3:sgpr, %11:sgpr(s64) + %13:sgpr(<2 x s32>) = G_LOAD %12:sgpr(p4) :: (dereferenceable invariant load 8, align 4, addrspace 4) + %14:sgpr(s32) = G_EXTRACT %13:sgpr(<2 x s32>), 0 + %17:sgpr(s32) = G_EXTRACT %13:sgpr(<2 x s32>), 32 + %22:sgpr(s32) = G_CONSTANT i32 4 + %16:sgpr(p3) = G_INTTOPTR %14:sgpr(s32) + %23:sgpr(p3) = G_PTR_ADD %16:sgpr, %22:sgpr(s32) + %36:vgpr(p3) = COPY %16:sgpr(p3) + %24:vgpr(<2 x s16>) = G_LOAD %36:vgpr(p3) :: (volatile load 4, addrspace 3) + %37:vgpr(p3) = COPY %23:sgpr(p3) + %25:vgpr(<2 x s16>) = G_LOAD %37:vgpr(p3) :: (volatile load 4, addrspace 3) + %19:sgpr(p3) = G_INTTOPTR %17:sgpr(s32) + %38:vgpr(p3) = COPY %19:sgpr(p3) + %33:vgpr(s32) = G_LOAD %38:vgpr(p3) :: (volatile load 2, addrspace 3) + %35:sgpr(s32) = G_IMPLICIT_DEF + %39:vgpr(s32) = G_CONSTANT i32 65535 + %40:sgpr(s32) = G_CONSTANT i32 16 + %41:sgpr(s32) = G_SHL %35:sgpr, %40:sgpr(s32) + %42:vgpr(s32) = G_AND %33:vgpr, %39:vgpr + %43:vgpr(s32) = G_OR %42:vgpr, %41:sgpr + %27:vgpr(<2 x s16>) = G_BITCAST %43:vgpr(s32) + %44:vgpr(<2 x s16>) = COPY %28:sgpr(<2 x s16>) + %29:vgpr(<2 x s16>) = G_SHUFFLE_VECTOR %27:vgpr(<2 x s16>), %44:vgpr, shufflemask(0, 0) + %30:vgpr(<2 x s16>) = G_FNEG %29:vgpr + %31:vgpr(<2 x s16>) = G_FMA %24:vgpr, %25:vgpr, %30:vgpr + %45:vgpr(p1) = COPY %10:sgpr(p1) + G_STORE %31:vgpr(<2 x s16>), %45:vgpr(p1) :: (store 4, addrspace 1) + S_ENDPGM 0 +... 
+--- +name: fma_vector_vector_neg_scalar_lo +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1 + ; GFX9-LABEL: name: fma_vector_vector_neg_scalar_lo + ; GFX9: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX9: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 36, 0 :: (dereferenceable invariant load 8, align 4, addrspace 4) + ; GFX9: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 44, 0 :: (dereferenceable invariant load 8, align 4, addrspace 4) + ; GFX9: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM1]].sub0 + ; GFX9: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM1]].sub1 + ; GFX9: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] + ; GFX9: [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[COPY3]], 0, 0, implicit $exec :: (volatile load 4, addrspace 3) + ; GFX9: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] + ; GFX9: [[DS_READ_B32_gfx9_1:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[COPY4]], 4, 0, implicit $exec :: (volatile load 4, addrspace 3) + ; GFX9: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY2]] + ; GFX9: [[DS_READ_U16_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_U16_gfx9 [[COPY5]], 0, 0, implicit $exec :: (volatile load 2, addrspace 3) + ; GFX9: %32:vgpr_32 = nofpexcept V_PK_FMA_F16 8, [[DS_READ_B32_gfx9_]], 8, [[DS_READ_B32_gfx9_1]], 3, [[DS_READ_U16_gfx9_]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX9: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], %32, [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store 4, addrspace 1) + ; GFX9: S_ENDPGM 0 + %3:sgpr(p4) = COPY $sgpr0_sgpr1 + %29:sgpr(<2 x s16>) = G_IMPLICIT_DEF + %8:sgpr(s64) = G_CONSTANT i64 36 + %9:sgpr(p4) = G_PTR_ADD %3:sgpr, %8:sgpr(s64) + %10:sgpr(p1) = G_LOAD %9:sgpr(p4) :: (dereferenceable invariant load 8, align 4, addrspace 4) + %11:sgpr(s64) = G_CONSTANT i64 44 + %12:sgpr(p4) = G_PTR_ADD %3:sgpr, %11:sgpr(s64) + %13:sgpr(<2 x s32>) = G_LOAD %12:sgpr(p4) :: (dereferenceable invariant load 8, align 4, addrspace 4) + %14:sgpr(s32) = G_EXTRACT %13:sgpr(<2 x s32>), 0 + %17:sgpr(s32) = G_EXTRACT %13:sgpr(<2 x s32>), 32 + %22:sgpr(s32) = G_CONSTANT i32 4 + %16:sgpr(p3) = G_INTTOPTR %14:sgpr(s32) + %23:sgpr(p3) = G_PTR_ADD %16:sgpr, %22:sgpr(s32) + %36:vgpr(p3) = COPY %16:sgpr(p3) + %24:vgpr(<2 x s16>) = G_LOAD %36:vgpr(p3) :: (volatile load 4, addrspace 3) + %37:vgpr(p3) = COPY %23:sgpr(p3) + %25:vgpr(<2 x s16>) = G_LOAD %37:vgpr(p3) :: (volatile load 4, addrspace 3) + %19:sgpr(p3) = G_INTTOPTR %17:sgpr(s32) + %38:vgpr(p3) = COPY %19:sgpr(p3) + %33:vgpr(s32) = G_LOAD %38:vgpr(p3) :: (volatile load 2, addrspace 3) + %26:vgpr(s16) = G_TRUNC %33:vgpr(s32) + %27:vgpr(s16) = G_FNEG %26:vgpr + %34:vgpr(s32) = G_ANYEXT %27:vgpr(s16) + %35:sgpr(s32) = G_IMPLICIT_DEF + %39:vgpr(s32) = G_CONSTANT i32 65535 + %40:sgpr(s32) = G_CONSTANT i32 16 + %41:sgpr(s32) = G_SHL %35:sgpr, %40:sgpr(s32) + %42:vgpr(s32) = G_AND %34:vgpr, %39:vgpr + %43:vgpr(s32) = G_OR %42:vgpr, %41:sgpr + %28:vgpr(<2 x s16>) = G_BITCAST %43:vgpr(s32) + %44:vgpr(<2 x s16>) = COPY %29:sgpr(<2 x s16>) + %30:vgpr(<2 x s16>) = G_SHUFFLE_VECTOR %28:vgpr(<2 x s16>), %44:vgpr, shufflemask(0, 0) + %31:vgpr(<2 x s16>) = G_FMA %24:vgpr, %25:vgpr, %30:vgpr + %45:vgpr(p1) = COPY %10:sgpr(p1) + G_STORE %31:vgpr(<2 x s16>), %45:vgpr(p1) :: (store 4, addrspace 1) + S_ENDPGM 0 +... 
+--- +name: fma_vector_vector_neg_broadcast_neg_scalar_lo +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1 + ; GFX9-LABEL: name: fma_vector_vector_neg_broadcast_neg_scalar_lo + ; GFX9: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX9: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 36, 0 :: (dereferenceable invariant load 8, align 4, addrspace 4) + ; GFX9: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 44, 0 :: (dereferenceable invariant load 8, align 4, addrspace 4) + ; GFX9: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM1]].sub0 + ; GFX9: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM1]].sub1 + ; GFX9: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] + ; GFX9: [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[COPY3]], 0, 0, implicit $exec :: (volatile load 4, addrspace 3) + ; GFX9: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] + ; GFX9: [[DS_READ_B32_gfx9_1:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[COPY4]], 4, 0, implicit $exec :: (volatile load 4, addrspace 3) + ; GFX9: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY2]] + ; GFX9: [[DS_READ_U16_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_U16_gfx9 [[COPY5]], 0, 0, implicit $exec :: (volatile load 2, addrspace 3) + ; GFX9: %33:vgpr_32 = nofpexcept V_PK_FMA_F16 8, [[DS_READ_B32_gfx9_]], 8, [[DS_READ_B32_gfx9_1]], 0, [[DS_READ_U16_gfx9_]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX9: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], %33, [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store 4, addrspace 1) + ; GFX9: S_ENDPGM 0 + %3:sgpr(p4) = COPY $sgpr0_sgpr1 + %29:sgpr(<2 x s16>) = G_IMPLICIT_DEF + %8:sgpr(s64) = G_CONSTANT i64 36 + %9:sgpr(p4) = G_PTR_ADD %3:sgpr, %8:sgpr(s64) + %10:sgpr(p1) = G_LOAD %9:sgpr(p4) :: (dereferenceable invariant load 8, align 4, addrspace 4) + %11:sgpr(s64) = G_CONSTANT i64 44 + %12:sgpr(p4) = G_PTR_ADD %3:sgpr, %11:sgpr(s64) + %13:sgpr(<2 x s32>) = G_LOAD %12:sgpr(p4) :: (dereferenceable invariant load 8, align 4, addrspace 4) + %14:sgpr(s32) = G_EXTRACT %13:sgpr(<2 x s32>), 0 + %17:sgpr(s32) = G_EXTRACT %13:sgpr(<2 x s32>), 32 + %22:sgpr(s32) = G_CONSTANT i32 4 + %16:sgpr(p3) = G_INTTOPTR %14:sgpr(s32) + %23:sgpr(p3) = G_PTR_ADD %16:sgpr, %22:sgpr(s32) + %37:vgpr(p3) = COPY %16:sgpr(p3) + %24:vgpr(<2 x s16>) = G_LOAD %37:vgpr(p3) :: (volatile load 4, addrspace 3) + %38:vgpr(p3) = COPY %23:sgpr(p3) + %25:vgpr(<2 x s16>) = G_LOAD %38:vgpr(p3) :: (volatile load 4, addrspace 3) + %19:sgpr(p3) = G_INTTOPTR %17:sgpr(s32) + %39:vgpr(p3) = COPY %19:sgpr(p3) + %34:vgpr(s32) = G_LOAD %39:vgpr(p3) :: (volatile load 2, addrspace 3) + %26:vgpr(s16) = G_TRUNC %34:vgpr(s32) + %27:vgpr(s16) = G_FNEG %26:vgpr + %35:vgpr(s32) = G_ANYEXT %27:vgpr(s16) + %36:sgpr(s32) = G_IMPLICIT_DEF + %40:vgpr(s32) = G_CONSTANT i32 65535 + %41:sgpr(s32) = G_CONSTANT i32 16 + %42:sgpr(s32) = G_SHL %36:sgpr, %41:sgpr(s32) + %43:vgpr(s32) = G_AND %35:vgpr, %40:vgpr + %44:vgpr(s32) = G_OR %43:vgpr, %42:sgpr + %28:vgpr(<2 x s16>) = G_BITCAST %44:vgpr(s32) + %45:vgpr(<2 x s16>) = COPY %29:sgpr(<2 x s16>) + %30:vgpr(<2 x s16>) = G_SHUFFLE_VECTOR %28:vgpr(<2 x s16>), %45:vgpr, shufflemask(0, 0) + %31:vgpr(<2 x s16>) = G_FNEG %30:vgpr + %32:vgpr(<2 x s16>) = G_FMA %24:vgpr, %25:vgpr, %31:vgpr + %46:vgpr(p1) = COPY %10:sgpr(p1) + G_STORE %32:vgpr(<2 x s16>), %46:vgpr(p1) :: (store 4, addrspace 1) + S_ENDPGM 0 +... 
+--- +name: fma_vector_vector_scalar_neg_lo +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1 + ; GFX9-LABEL: name: fma_vector_vector_scalar_neg_lo + ; GFX9: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX9: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 36, 0 :: (dereferenceable invariant load 8, align 4, addrspace 4) + ; GFX9: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 44, 0 :: (dereferenceable invariant load 8, align 4, addrspace 4) + ; GFX9: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM1]].sub0 + ; GFX9: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM1]].sub1 + ; GFX9: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] + ; GFX9: [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[COPY3]], 0, 0, implicit $exec :: (volatile load 4, addrspace 3) + ; GFX9: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] + ; GFX9: [[DS_READ_B32_gfx9_1:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[COPY4]], 4, 0, implicit $exec :: (volatile load 4, addrspace 3) + ; GFX9: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY2]] + ; GFX9: [[DS_READ_U16_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_U16_gfx9 [[COPY5]], 0, 0, implicit $exec :: (volatile load 2, addrspace 3) + ; GFX9: %28:vgpr_32 = nofpexcept V_PK_FMA_F16 8, [[DS_READ_B32_gfx9_]], 8, [[DS_READ_B32_gfx9_1]], 1, [[DS_READ_U16_gfx9_]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX9: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], %28, [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store 4, addrspace 1) + ; GFX9: S_ENDPGM 0 + %3:sgpr(p4) = COPY $sgpr0_sgpr1 + %8:sgpr(s64) = G_CONSTANT i64 36 + %9:sgpr(p4) = G_PTR_ADD %3:sgpr, %8:sgpr(s64) + %10:sgpr(p1) = G_LOAD %9:sgpr(p4) :: (dereferenceable invariant load 8, align 4, addrspace 4) + %11:sgpr(s64) = G_CONSTANT i64 44 + %12:sgpr(p4) = G_PTR_ADD %3:sgpr, %11:sgpr(s64) + %13:sgpr(<2 x s32>) = G_LOAD %12:sgpr(p4) :: (dereferenceable invariant load 8, align 4, addrspace 4) + %14:sgpr(s32) = G_EXTRACT %13:sgpr(<2 x s32>), 0 + %17:sgpr(s32) = G_EXTRACT %13:sgpr(<2 x s32>), 32 + %22:sgpr(s32) = G_CONSTANT i32 4 + %16:sgpr(p3) = G_INTTOPTR %14:sgpr(s32) + %23:sgpr(p3) = G_PTR_ADD %16:sgpr, %22:sgpr(s32) + %35:vgpr(p3) = COPY %16:sgpr(p3) + %24:vgpr(<2 x s16>) = G_LOAD %35:vgpr(p3) :: (volatile load 4, addrspace 3) + %36:vgpr(p3) = COPY %23:sgpr(p3) + %25:vgpr(<2 x s16>) = G_LOAD %36:vgpr(p3) :: (volatile load 4, addrspace 3) + %19:sgpr(p3) = G_INTTOPTR %17:sgpr(s32) + %37:vgpr(p3) = COPY %19:sgpr(p3) + %32:vgpr(s32) = G_LOAD %37:vgpr(p3) :: (volatile load 2, addrspace 3) + %26:vgpr(s16) = G_TRUNC %32:vgpr(s32) + %27:vgpr(s16) = G_FNEG %26:vgpr + %33:vgpr(s32) = G_ANYEXT %27:vgpr(s16) + %38:vgpr(s32) = G_CONSTANT i32 65535 + %39:vgpr(s32) = G_CONSTANT i32 16 + %40:vgpr(s32) = G_SHL %32:vgpr, %39:vgpr(s32) + %41:vgpr(s32) = G_AND %33:vgpr, %38:vgpr + %42:vgpr(s32) = G_OR %41:vgpr, %40:vgpr + %30:vgpr(<2 x s16>) = G_BITCAST %42:vgpr(s32) + %31:vgpr(<2 x s16>) = G_FMA %24:vgpr, %25:vgpr, %30:vgpr + %43:vgpr(p1) = COPY %10:sgpr(p1) + G_STORE %31:vgpr(<2 x s16>), %43:vgpr(p1) :: (store 4, addrspace 1) + S_ENDPGM 0 +... 
+--- +name: fma_vector_vector_scalar_neg_hi +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1 + ; GFX9-LABEL: name: fma_vector_vector_scalar_neg_hi + ; GFX9: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX9: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 36, 0 :: (dereferenceable invariant load 8, align 4, addrspace 4) + ; GFX9: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 44, 0 :: (dereferenceable invariant load 8, align 4, addrspace 4) + ; GFX9: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM1]].sub0 + ; GFX9: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM1]].sub1 + ; GFX9: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] + ; GFX9: [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[COPY3]], 0, 0, implicit $exec :: (volatile load 4, addrspace 3) + ; GFX9: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] + ; GFX9: [[DS_READ_B32_gfx9_1:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[COPY4]], 4, 0, implicit $exec :: (volatile load 4, addrspace 3) + ; GFX9: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY2]] + ; GFX9: [[DS_READ_U16_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_U16_gfx9 [[COPY5]], 0, 0, implicit $exec :: (volatile load 2, addrspace 3) + ; GFX9: %28:vgpr_32 = nofpexcept V_PK_FMA_F16 8, [[DS_READ_B32_gfx9_]], 8, [[DS_READ_B32_gfx9_1]], 2, [[DS_READ_U16_gfx9_]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX9: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], %28, [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store 4, addrspace 1) + ; GFX9: S_ENDPGM 0 + %3:sgpr(p4) = COPY $sgpr0_sgpr1 + %8:sgpr(s64) = G_CONSTANT i64 36 + %9:sgpr(p4) = G_PTR_ADD %3:sgpr, %8:sgpr(s64) + %10:sgpr(p1) = G_LOAD %9:sgpr(p4) :: (dereferenceable invariant load 8, align 4, addrspace 4) + %11:sgpr(s64) = G_CONSTANT i64 44 + %12:sgpr(p4) = G_PTR_ADD %3:sgpr, %11:sgpr(s64) + %13:sgpr(<2 x s32>) = G_LOAD %12:sgpr(p4) :: (dereferenceable invariant load 8, align 4, addrspace 4) + %14:sgpr(s32) = G_EXTRACT %13:sgpr(<2 x s32>), 0 + %17:sgpr(s32) = G_EXTRACT %13:sgpr(<2 x s32>), 32 + %22:sgpr(s32) = G_CONSTANT i32 4 + %16:sgpr(p3) = G_INTTOPTR %14:sgpr(s32) + %23:sgpr(p3) = G_PTR_ADD %16:sgpr, %22:sgpr(s32) + %35:vgpr(p3) = COPY %16:sgpr(p3) + %24:vgpr(<2 x s16>) = G_LOAD %35:vgpr(p3) :: (volatile load 4, addrspace 3) + %36:vgpr(p3) = COPY %23:sgpr(p3) + %25:vgpr(<2 x s16>) = G_LOAD %36:vgpr(p3) :: (volatile load 4, addrspace 3) + %19:sgpr(p3) = G_INTTOPTR %17:sgpr(s32) + %37:vgpr(p3) = COPY %19:sgpr(p3) + %32:vgpr(s32) = G_LOAD %37:vgpr(p3) :: (volatile load 2, addrspace 3) + %26:vgpr(s16) = G_TRUNC %32:vgpr(s32) + %27:vgpr(s16) = G_FNEG %26:vgpr + %34:vgpr(s32) = G_ANYEXT %27:vgpr(s16) + %38:vgpr(s32) = G_CONSTANT i32 65535 + %39:vgpr(s32) = G_CONSTANT i32 16 + %40:vgpr(s32) = G_SHL %34:vgpr, %39:vgpr(s32) + %41:vgpr(s32) = G_AND %32:vgpr, %38:vgpr + %42:vgpr(s32) = G_OR %41:vgpr, %40:vgpr + %30:vgpr(<2 x s16>) = G_BITCAST %42:vgpr(s32) + %31:vgpr(<2 x s16>) = G_FMA %24:vgpr, %25:vgpr, %30:vgpr + %43:vgpr(p1) = COPY %10:sgpr(p1) + G_STORE %31:vgpr(<2 x s16>), %43:vgpr(p1) :: (store 4, addrspace 1) + S_ENDPGM 0 +... 
+--- +name: add_vector_neg_bitcast_scalar_lo +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1 + ; GFX9-LABEL: name: add_vector_neg_bitcast_scalar_lo + ; GFX9: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX9: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 36, 0 :: (dereferenceable invariant load 8, align 4, addrspace 4) + ; GFX9: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 44, 0 :: (dereferenceable invariant load 8, align 4, addrspace 4) + ; GFX9: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM1]].sub0 + ; GFX9: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM1]].sub1 + ; GFX9: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] + ; GFX9: [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[COPY3]], 0, 0, implicit $exec :: (volatile load 4, addrspace 3) + ; GFX9: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY2]] + ; GFX9: [[DS_READ_U16_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_U16_gfx9 [[COPY4]], 0, 0, implicit $exec :: (volatile load 2, addrspace 3) + ; GFX9: [[V_PK_ADD_U16_:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[DS_READ_B32_gfx9_]], 3, [[DS_READ_U16_gfx9_]], 0, 0, 0, 0, 0, implicit $exec + ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX9: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], [[V_PK_ADD_U16_]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store 4, addrspace 1) + ; GFX9: S_ENDPGM 0 + %3:sgpr(p4) = COPY $sgpr0_sgpr1 + %26:sgpr(<2 x s16>) = G_IMPLICIT_DEF + %8:sgpr(s64) = G_CONSTANT i64 36 + %9:sgpr(p4) = G_PTR_ADD %3:sgpr, %8:sgpr(s64) + %10:sgpr(p1) = G_LOAD %9:sgpr(p4) :: (dereferenceable invariant load 8, align 4, addrspace 4) + %11:sgpr(s64) = G_CONSTANT i64 44 + %12:sgpr(p4) = G_PTR_ADD %3:sgpr, %11:sgpr(s64) + %13:sgpr(<2 x s32>) = G_LOAD %12:sgpr(p4) :: (dereferenceable invariant load 8, align 4, addrspace 4) + %14:sgpr(s32) = G_EXTRACT %13:sgpr(<2 x s32>), 0 + %17:sgpr(s32) = G_EXTRACT %13:sgpr(<2 x s32>), 32 + %16:sgpr(p3) = G_INTTOPTR %14:sgpr(s32) + %33:vgpr(p3) = COPY %16:sgpr(p3) + %22:vgpr(<2 x s16>) = G_LOAD %33:vgpr(p3) :: (volatile load 4, addrspace 3) + %19:sgpr(p3) = G_INTTOPTR %17:sgpr(s32) + %34:vgpr(p3) = COPY %19:sgpr(p3) + %30:vgpr(s32) = G_LOAD %34:vgpr(p3) :: (volatile load 2, addrspace 3) + %23:vgpr(s16) = G_TRUNC %30:vgpr(s32) + %24:vgpr(s16) = G_FNEG %23:vgpr + %31:vgpr(s32) = G_ANYEXT %24:vgpr(s16) + %32:sgpr(s32) = G_IMPLICIT_DEF + %35:vgpr(s32) = G_CONSTANT i32 65535 + %36:sgpr(s32) = G_CONSTANT i32 16 + %37:sgpr(s32) = G_SHL %32:sgpr, %36:sgpr(s32) + %38:vgpr(s32) = G_AND %31:vgpr, %35:vgpr + %39:vgpr(s32) = G_OR %38:vgpr, %37:sgpr + %25:vgpr(<2 x s16>) = G_BITCAST %39:vgpr(s32) + %40:vgpr(<2 x s16>) = COPY %26:sgpr(<2 x s16>) + %27:vgpr(<2 x s16>) = G_SHUFFLE_VECTOR %25:vgpr(<2 x s16>), %40:vgpr, shufflemask(0, 0) + %28:vgpr(<2 x s16>) = G_ADD %22:vgpr, %27:vgpr + %41:vgpr(p1) = COPY %10:sgpr(p1) + G_STORE %28:vgpr(<2 x s16>), %41:vgpr(p1) :: (store 4, addrspace 1) + S_ENDPGM 0 +... 
+--- +name: fma_vector_vector_neg_vector_hi +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1 + ; GFX9-LABEL: name: fma_vector_vector_neg_vector_hi + ; GFX9: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX9: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 36, 0 :: (dereferenceable invariant load 8, align 4, addrspace 4) + ; GFX9: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], 44, 0 :: (dereferenceable invariant load 4, addrspace 4) + ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORD_IMM]] + ; GFX9: [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[COPY1]], 0, 0, implicit $exec :: (volatile load 4, addrspace 3) + ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORD_IMM]] + ; GFX9: [[DS_READ_B32_gfx9_1:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[COPY2]], 4, 0, implicit $exec :: (volatile load 4, addrspace 3) + ; GFX9: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORD_IMM]] + ; GFX9: [[DS_READ_B32_gfx9_2:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[COPY3]], 8, 0, implicit $exec :: (volatile load 4, addrspace 3) + ; GFX9: %21:vgpr_32 = nofpexcept V_PK_FMA_F16 8, [[DS_READ_B32_gfx9_]], 8, [[DS_READ_B32_gfx9_1]], 15, [[DS_READ_B32_gfx9_2]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX9: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], %21, [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store 4, addrspace 1) + ; GFX9: S_ENDPGM 0 + %2:sgpr(p4) = COPY $sgpr0_sgpr1 + %22:sgpr(<2 x s16>) = G_IMPLICIT_DEF + %7:sgpr(s64) = G_CONSTANT i64 36 + %8:sgpr(p4) = G_PTR_ADD %2:sgpr, %7:sgpr(s64) + %9:sgpr(p1) = G_LOAD %8:sgpr(p4) :: (dereferenceable invariant load 8, align 4, addrspace 4) + %10:sgpr(s64) = G_CONSTANT i64 44 + %11:sgpr(p4) = G_PTR_ADD %2:sgpr, %10:sgpr(s64) + %12:sgpr(p3) = G_LOAD %11:sgpr(p4) :: (dereferenceable invariant load 4, addrspace 4) + %13:sgpr(s32) = G_CONSTANT i32 4 + %14:sgpr(p3) = G_PTR_ADD %12:sgpr, %13:sgpr(s32) + %15:sgpr(s32) = G_CONSTANT i32 8 + %16:sgpr(p3) = G_PTR_ADD %12:sgpr, %15:sgpr(s32) + %24:vgpr(p3) = COPY %12:sgpr(p3) + %17:vgpr(<2 x s16>) = G_LOAD %24:vgpr(p3) :: (volatile load 4, addrspace 3) + %25:vgpr(p3) = COPY %14:sgpr(p3) + %18:vgpr(<2 x s16>) = G_LOAD %25:vgpr(p3) :: (volatile load 4, addrspace 3) + %26:vgpr(p3) = COPY %16:sgpr(p3) + %19:vgpr(<2 x s16>) = G_LOAD %26:vgpr(p3) :: (volatile load 4, addrspace 3) + %20:vgpr(<2 x s16>) = G_FNEG %19:vgpr + %27:vgpr(<2 x s16>) = COPY %22:sgpr(<2 x s16>) + %21:vgpr(<2 x s16>) = G_SHUFFLE_VECTOR %20:vgpr(<2 x s16>), %27:vgpr, shufflemask(1, 1) + %23:vgpr(<2 x s16>) = G_FMA %17:vgpr, %18:vgpr, %21:vgpr + %28:vgpr(p1) = COPY %9:sgpr(p1) + G_STORE %23:vgpr(<2 x s16>), %28:vgpr(p1) :: (store 4, addrspace 1) + S_ENDPGM 0 +... 
+--- +name: fma_vector_vector_vector_neg_hi +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1 + ; GFX9-LABEL: name: fma_vector_vector_vector_neg_hi + ; GFX9: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX9: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 36, 0 :: (dereferenceable invariant load 8, align 4, addrspace 4) + ; GFX9: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], 44, 0 :: (dereferenceable invariant load 4, addrspace 4) + ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORD_IMM]] + ; GFX9: [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[COPY1]], 0, 0, implicit $exec :: (volatile load 4, addrspace 3) + ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORD_IMM]] + ; GFX9: [[DS_READ_B32_gfx9_1:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[COPY2]], 4, 0, implicit $exec :: (volatile load 4, addrspace 3) + ; GFX9: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORD_IMM]] + ; GFX9: [[DS_READ_B32_gfx9_2:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[COPY3]], 8, 0, implicit $exec :: (volatile load 4, addrspace 3) + ; GFX9: %32:vgpr_32 = nofpexcept V_PK_FMA_F16 8, [[DS_READ_B32_gfx9_]], 8, [[DS_READ_B32_gfx9_1]], 10, [[DS_READ_B32_gfx9_2]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX9: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], %32, [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store 4, addrspace 1) + ; GFX9: S_ENDPGM 0 + %2:sgpr(p4) = COPY $sgpr0_sgpr1 + %7:sgpr(s64) = G_CONSTANT i64 36 + %8:sgpr(p4) = G_PTR_ADD %2:sgpr, %7:sgpr(s64) + %9:sgpr(p1) = G_LOAD %8:sgpr(p4) :: (dereferenceable invariant load 8, align 4, addrspace 4) + %10:sgpr(s64) = G_CONSTANT i64 44 + %11:sgpr(p4) = G_PTR_ADD %2:sgpr, %10:sgpr(s64) + %12:sgpr(p3) = G_LOAD %11:sgpr(p4) :: (dereferenceable invariant load 4, addrspace 4) + %13:sgpr(s32) = G_CONSTANT i32 4 + %14:sgpr(p3) = G_PTR_ADD %12:sgpr, %13:sgpr(s32) + %15:sgpr(s32) = G_CONSTANT i32 8 + %16:sgpr(p3) = G_PTR_ADD %12:sgpr, %15:sgpr(s32) + %38:vgpr(p3) = COPY %12:sgpr(p3) + %17:vgpr(<2 x s16>) = G_LOAD %38:vgpr(p3) :: (volatile load 4, addrspace 3) + %39:vgpr(p3) = COPY %14:sgpr(p3) + %18:vgpr(<2 x s16>) = G_LOAD %39:vgpr(p3) :: (volatile load 4, addrspace 3) + %40:vgpr(p3) = COPY %16:sgpr(p3) + %19:vgpr(<2 x s16>) = G_LOAD %40:vgpr(p3) :: (volatile load 4, addrspace 3) + %35:vgpr(s32) = G_BITCAST %19:vgpr(<2 x s16>) + %27:sgpr(s32) = G_CONSTANT i32 16 + %41:vgpr(s32) = COPY %27:sgpr(s32) + %37:vgpr(s32) = G_LSHR %35:vgpr, %41:vgpr(s32) + %20:vgpr(s16) = G_TRUNC %37:vgpr(s32) + %22:vgpr(s16) = G_FNEG %20:vgpr + %25:vgpr(s32) = G_BITCAST %19:vgpr(<2 x s16>) + %28:vgpr(s32) = G_ZEXT %22:vgpr(s16) + %42:vgpr(s32) = COPY %27:sgpr(s32) + %29:vgpr(s32) = G_SHL %28:vgpr, %42:vgpr(s32) + %30:sgpr(s32) = G_CONSTANT i32 65535 + %43:vgpr(s32) = COPY %30:sgpr(s32) + %33:vgpr(s32) = G_AND %25:vgpr, %43:vgpr + %34:vgpr(s32) = G_OR %33:vgpr, %29:vgpr + %23:vgpr(<2 x s16>) = G_BITCAST %34:vgpr(s32) + %24:vgpr(<2 x s16>) = G_FMA %17:vgpr, %18:vgpr, %23:vgpr + %44:vgpr(p1) = COPY %9:sgpr(p1) + G_STORE %24:vgpr(<2 x s16>), %44:vgpr(p1) :: (store 4, addrspace 1) + S_ENDPGM 0 +... 
+--- +name: add_vector_scalar_hi +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1 + ; GFX9-LABEL: name: add_vector_scalar_hi + ; GFX9: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX9: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 36, 0 :: (dereferenceable invariant load 8, align 4, addrspace 4) + ; GFX9: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], 44, 0 :: (dereferenceable invariant load 4, addrspace 4) + ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORD_IMM]] + ; GFX9: [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[COPY1]], 0, 0, implicit $exec :: (volatile load 4, addrspace 3) + ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORD_IMM]] + ; GFX9: [[DS_READ_B32_gfx9_1:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[COPY2]], 4, 0, implicit $exec :: (volatile load 4, addrspace 3) + ; GFX9: [[V_PK_ADD_U16_:%[0-9]+]]:vgpr_32 = V_PK_ADD_U16 8, [[DS_READ_B32_gfx9_]], 12, [[DS_READ_B32_gfx9_1]], 0, 0, 0, 0, 0, implicit $exec + ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX9: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], [[V_PK_ADD_U16_]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store 4, addrspace 1) + ; GFX9: S_ENDPGM 0 + %2:sgpr(p4) = COPY $sgpr0_sgpr1 + %18:sgpr(<2 x s16>) = G_IMPLICIT_DEF + %7:sgpr(s64) = G_CONSTANT i64 36 + %8:sgpr(p4) = G_PTR_ADD %2:sgpr, %7:sgpr(s64) + %9:sgpr(p1) = G_LOAD %8:sgpr(p4) :: (dereferenceable invariant load 8, align 4, addrspace 4) + %10:sgpr(s64) = G_CONSTANT i64 44 + %11:sgpr(p4) = G_PTR_ADD %2:sgpr, %10:sgpr(s64) + %12:sgpr(p3) = G_LOAD %11:sgpr(p4) :: (dereferenceable invariant load 4, addrspace 4) + %13:sgpr(s32) = G_CONSTANT i32 4 + %14:sgpr(p3) = G_PTR_ADD %12:sgpr, %13:sgpr(s32) + %20:vgpr(p3) = COPY %12:sgpr(p3) + %15:vgpr(<2 x s16>) = G_LOAD %20:vgpr(p3) :: (volatile load 4, addrspace 3) + %21:vgpr(p3) = COPY %14:sgpr(p3) + %16:vgpr(<2 x s16>) = G_LOAD %21:vgpr(p3) :: (volatile load 4, addrspace 3) + %22:vgpr(<2 x s16>) = COPY %18:sgpr(<2 x s16>) + %17:vgpr(<2 x s16>) = G_SHUFFLE_VECTOR %16:vgpr(<2 x s16>), %22:vgpr, shufflemask(1, 1) + %19:vgpr(<2 x s16>) = G_ADD %15:vgpr, %17:vgpr + %23:vgpr(p1) = COPY %9:sgpr(p1) + G_STORE %19:vgpr(<2 x s16>), %23:vgpr(p1) :: (store 4, addrspace 1) + S_ENDPGM 0 +... 
+--- +name: fma_vector_vector_scalar_hi +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1 + ; GFX9-LABEL: name: fma_vector_vector_scalar_hi + ; GFX9: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX9: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 36, 0 :: (dereferenceable invariant load 8, align 4, addrspace 4) + ; GFX9: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], 44, 0 :: (dereferenceable invariant load 4, addrspace 4) + ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORD_IMM]] + ; GFX9: [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[COPY1]], 0, 0, implicit $exec :: (volatile load 4, addrspace 3) + ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORD_IMM]] + ; GFX9: [[DS_READ_B32_gfx9_1:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[COPY2]], 4, 0, implicit $exec :: (volatile load 4, addrspace 3) + ; GFX9: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORD_IMM]] + ; GFX9: [[DS_READ_B32_gfx9_2:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[COPY3]], 8, 0, implicit $exec :: (volatile load 4, addrspace 3) + ; GFX9: %20:vgpr_32 = nofpexcept V_PK_FMA_F16 8, [[DS_READ_B32_gfx9_]], 8, [[DS_READ_B32_gfx9_1]], 12, [[DS_READ_B32_gfx9_2]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX9: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], %20, [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store 4, addrspace 1) + ; GFX9: S_ENDPGM 0 + %2:sgpr(p4) = COPY $sgpr0_sgpr1 + %21:sgpr(<2 x s16>) = G_IMPLICIT_DEF + %7:sgpr(s64) = G_CONSTANT i64 36 + %8:sgpr(p4) = G_PTR_ADD %2:sgpr, %7:sgpr(s64) + %9:sgpr(p1) = G_LOAD %8:sgpr(p4) :: (dereferenceable invariant load 8, align 4, addrspace 4) + %10:sgpr(s64) = G_CONSTANT i64 44 + %11:sgpr(p4) = G_PTR_ADD %2:sgpr, %10:sgpr(s64) + %12:sgpr(p3) = G_LOAD %11:sgpr(p4) :: (dereferenceable invariant load 4, addrspace 4) + %13:sgpr(s32) = G_CONSTANT i32 4 + %14:sgpr(p3) = G_PTR_ADD %12:sgpr, %13:sgpr(s32) + %15:sgpr(s32) = G_CONSTANT i32 8 + %16:sgpr(p3) = G_PTR_ADD %12:sgpr, %15:sgpr(s32) + %23:vgpr(p3) = COPY %12:sgpr(p3) + %17:vgpr(<2 x s16>) = G_LOAD %23:vgpr(p3) :: (volatile load 4, addrspace 3) + %24:vgpr(p3) = COPY %14:sgpr(p3) + %18:vgpr(<2 x s16>) = G_LOAD %24:vgpr(p3) :: (volatile load 4, addrspace 3) + %25:vgpr(p3) = COPY %16:sgpr(p3) + %19:vgpr(<2 x s16>) = G_LOAD %25:vgpr(p3) :: (volatile load 4, addrspace 3) + %26:vgpr(<2 x s16>) = COPY %21:sgpr(<2 x s16>) + %20:vgpr(<2 x s16>) = G_SHUFFLE_VECTOR %19:vgpr(<2 x s16>), %26:vgpr, shufflemask(1, 1) + %22:vgpr(<2 x s16>) = G_FMA %17:vgpr, %18:vgpr, %20:vgpr + %27:vgpr(p1) = COPY %9:sgpr(p1) + G_STORE %22:vgpr(<2 x s16>), %27:vgpr(p1) :: (store 4, addrspace 1) + S_ENDPGM 0 +... 
+--- +name: fma_vector_vector_neg_vector_lo_neg_hi +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1 + ; GFX9-LABEL: name: fma_vector_vector_neg_vector_lo_neg_hi + ; GFX9: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX9: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 36, 0 :: (dereferenceable invariant load 8, align 4, addrspace 4) + ; GFX9: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], 44, 0 :: (dereferenceable invariant load 4, addrspace 4) + ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORD_IMM]] + ; GFX9: [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[COPY1]], 0, 0, implicit $exec :: (volatile load 4, addrspace 3) + ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORD_IMM]] + ; GFX9: [[DS_READ_B32_gfx9_1:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[COPY2]], 4, 0, implicit $exec :: (volatile load 4, addrspace 3) + ; GFX9: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORD_IMM]] + ; GFX9: [[DS_READ_B32_gfx9_2:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[COPY3]], 8, 0, implicit $exec :: (volatile load 4, addrspace 3) + ; GFX9: %33:vgpr_32 = nofpexcept V_PK_FMA_F16 8, [[DS_READ_B32_gfx9_]], 8, [[DS_READ_B32_gfx9_1]], 8, [[DS_READ_B32_gfx9_2]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX9: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], %33, [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store 4, addrspace 1) + ; GFX9: S_ENDPGM 0 + %2:sgpr(p4) = COPY $sgpr0_sgpr1 + %7:sgpr(s64) = G_CONSTANT i64 36 + %8:sgpr(p4) = G_PTR_ADD %2:sgpr, %7:sgpr(s64) + %9:sgpr(p1) = G_LOAD %8:sgpr(p4) :: (dereferenceable invariant load 8, align 4, addrspace 4) + %10:sgpr(s64) = G_CONSTANT i64 44 + %11:sgpr(p4) = G_PTR_ADD %2:sgpr, %10:sgpr(s64) + %12:sgpr(p3) = G_LOAD %11:sgpr(p4) :: (dereferenceable invariant load 4, addrspace 4) + %13:sgpr(s32) = G_CONSTANT i32 4 + %14:sgpr(p3) = G_PTR_ADD %12:sgpr, %13:sgpr(s32) + %15:sgpr(s32) = G_CONSTANT i32 8 + %16:sgpr(p3) = G_PTR_ADD %12:sgpr, %15:sgpr(s32) + %39:vgpr(p3) = COPY %12:sgpr(p3) + %17:vgpr(<2 x s16>) = G_LOAD %39:vgpr(p3) :: (volatile load 4, addrspace 3) + %40:vgpr(p3) = COPY %14:sgpr(p3) + %18:vgpr(<2 x s16>) = G_LOAD %40:vgpr(p3) :: (volatile load 4, addrspace 3) + %41:vgpr(p3) = COPY %16:sgpr(p3) + %19:vgpr(<2 x s16>) = G_LOAD %41:vgpr(p3) :: (volatile load 4, addrspace 3) + %20:vgpr(<2 x s16>) = G_FNEG %19:vgpr + %36:vgpr(s32) = G_BITCAST %20:vgpr(<2 x s16>) + %28:sgpr(s32) = G_CONSTANT i32 16 + %42:vgpr(s32) = COPY %28:sgpr(s32) + %38:vgpr(s32) = G_LSHR %36:vgpr, %42:vgpr(s32) + %21:vgpr(s16) = G_TRUNC %38:vgpr(s32) + %23:vgpr(s16) = G_FNEG %21:vgpr + %26:vgpr(s32) = G_BITCAST %19:vgpr(<2 x s16>) + %29:vgpr(s32) = G_ZEXT %23:vgpr(s16) + %43:vgpr(s32) = COPY %28:sgpr(s32) + %30:vgpr(s32) = G_SHL %29:vgpr, %43:vgpr(s32) + %31:sgpr(s32) = G_CONSTANT i32 65535 + %44:vgpr(s32) = COPY %31:sgpr(s32) + %34:vgpr(s32) = G_AND %26:vgpr, %44:vgpr + %35:vgpr(s32) = G_OR %34:vgpr, %30:vgpr + %24:vgpr(<2 x s16>) = G_BITCAST %35:vgpr(s32) + %25:vgpr(<2 x s16>) = G_FMA %17:vgpr, %18:vgpr, %24:vgpr + %45:vgpr(p1) = COPY %9:sgpr(p1) + G_STORE %25:vgpr(<2 x s16>), %45:vgpr(p1) :: (store 4, addrspace 1) + S_ENDPGM 0 +... 
+--- +name: fma_vector_vector_swap_vector +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1 + ; GFX9-LABEL: name: fma_vector_vector_swap_vector + ; GFX9: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX9: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 36, 0 :: (dereferenceable invariant load 8, align 4, addrspace 4) + ; GFX9: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], 44, 0 :: (dereferenceable invariant load 4, addrspace 4) + ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORD_IMM]] + ; GFX9: [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[COPY1]], 0, 0, implicit $exec :: (volatile load 4, addrspace 3) + ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORD_IMM]] + ; GFX9: [[DS_READ_B32_gfx9_1:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[COPY2]], 4, 0, implicit $exec :: (volatile load 4, addrspace 3) + ; GFX9: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORD_IMM]] + ; GFX9: [[DS_READ_B32_gfx9_2:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[COPY3]], 8, 0, implicit $exec :: (volatile load 4, addrspace 3) + ; GFX9: %20:vgpr_32 = nofpexcept V_PK_FMA_F16 8, [[DS_READ_B32_gfx9_]], 8, [[DS_READ_B32_gfx9_1]], 4, [[DS_READ_B32_gfx9_2]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX9: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], %20, [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store 4, addrspace 1) + ; GFX9: S_ENDPGM 0 + %2:sgpr(p4) = COPY $sgpr0_sgpr1 + %21:sgpr(<2 x s16>) = G_IMPLICIT_DEF + %7:sgpr(s64) = G_CONSTANT i64 36 + %8:sgpr(p4) = G_PTR_ADD %2:sgpr, %7:sgpr(s64) + %9:sgpr(p1) = G_LOAD %8:sgpr(p4) :: (dereferenceable invariant load 8, align 4, addrspace 4) + %10:sgpr(s64) = G_CONSTANT i64 44 + %11:sgpr(p4) = G_PTR_ADD %2:sgpr, %10:sgpr(s64) + %12:sgpr(p3) = G_LOAD %11:sgpr(p4) :: (dereferenceable invariant load 4, addrspace 4) + %13:sgpr(s32) = G_CONSTANT i32 4 + %14:sgpr(p3) = G_PTR_ADD %12:sgpr, %13:sgpr(s32) + %15:sgpr(s32) = G_CONSTANT i32 8 + %16:sgpr(p3) = G_PTR_ADD %12:sgpr, %15:sgpr(s32) + %23:vgpr(p3) = COPY %12:sgpr(p3) + %17:vgpr(<2 x s16>) = G_LOAD %23:vgpr(p3) :: (volatile load 4, addrspace 3) + %24:vgpr(p3) = COPY %14:sgpr(p3) + %18:vgpr(<2 x s16>) = G_LOAD %24:vgpr(p3) :: (volatile load 4, addrspace 3) + %25:vgpr(p3) = COPY %16:sgpr(p3) + %19:vgpr(<2 x s16>) = G_LOAD %25:vgpr(p3) :: (volatile load 4, addrspace 3) + %26:vgpr(<2 x s16>) = COPY %21:sgpr(<2 x s16>) + %20:vgpr(<2 x s16>) = G_SHUFFLE_VECTOR %19:vgpr(<2 x s16>), %26:vgpr, shufflemask(1, 0) + %22:vgpr(<2 x s16>) = G_FMA %17:vgpr, %18:vgpr, %20:vgpr + %27:vgpr(p1) = COPY %9:sgpr(p1) + G_STORE %22:vgpr(<2 x s16>), %27:vgpr(p1) :: (store 4, addrspace 1) + S_ENDPGM 0 +... 
+--- +name: fma_vector_vector_swap_neg_vector +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1 + ; GFX9-LABEL: name: fma_vector_vector_swap_neg_vector + ; GFX9: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX9: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 36, 0 :: (dereferenceable invariant load 8, align 4, addrspace 4) + ; GFX9: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], 44, 0 :: (dereferenceable invariant load 4, addrspace 4) + ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORD_IMM]] + ; GFX9: [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[COPY1]], 0, 0, implicit $exec :: (volatile load 4, addrspace 3) + ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORD_IMM]] + ; GFX9: [[DS_READ_B32_gfx9_1:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[COPY2]], 4, 0, implicit $exec :: (volatile load 4, addrspace 3) + ; GFX9: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORD_IMM]] + ; GFX9: [[DS_READ_B32_gfx9_2:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[COPY3]], 8, 0, implicit $exec :: (volatile load 4, addrspace 3) + ; GFX9: %21:vgpr_32 = nofpexcept V_PK_FMA_F16 8, [[DS_READ_B32_gfx9_]], 8, [[DS_READ_B32_gfx9_1]], 7, [[DS_READ_B32_gfx9_2]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX9: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], %21, [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store 4, addrspace 1) + ; GFX9: S_ENDPGM 0 + %2:sgpr(p4) = COPY $sgpr0_sgpr1 + %22:sgpr(<2 x s16>) = G_IMPLICIT_DEF + %7:sgpr(s64) = G_CONSTANT i64 36 + %8:sgpr(p4) = G_PTR_ADD %2:sgpr, %7:sgpr(s64) + %9:sgpr(p1) = G_LOAD %8:sgpr(p4) :: (dereferenceable invariant load 8, align 4, addrspace 4) + %10:sgpr(s64) = G_CONSTANT i64 44 + %11:sgpr(p4) = G_PTR_ADD %2:sgpr, %10:sgpr(s64) + %12:sgpr(p3) = G_LOAD %11:sgpr(p4) :: (dereferenceable invariant load 4, addrspace 4) + %13:sgpr(s32) = G_CONSTANT i32 4 + %14:sgpr(p3) = G_PTR_ADD %12:sgpr, %13:sgpr(s32) + %15:sgpr(s32) = G_CONSTANT i32 8 + %16:sgpr(p3) = G_PTR_ADD %12:sgpr, %15:sgpr(s32) + %24:vgpr(p3) = COPY %12:sgpr(p3) + %17:vgpr(<2 x s16>) = G_LOAD %24:vgpr(p3) :: (volatile load 4, addrspace 3) + %25:vgpr(p3) = COPY %14:sgpr(p3) + %18:vgpr(<2 x s16>) = G_LOAD %25:vgpr(p3) :: (volatile load 4, addrspace 3) + %26:vgpr(p3) = COPY %16:sgpr(p3) + %19:vgpr(<2 x s16>) = G_LOAD %26:vgpr(p3) :: (volatile load 4, addrspace 3) + %20:vgpr(<2 x s16>) = G_FNEG %19:vgpr + %27:vgpr(<2 x s16>) = COPY %22:sgpr(<2 x s16>) + %21:vgpr(<2 x s16>) = G_SHUFFLE_VECTOR %20:vgpr(<2 x s16>), %27:vgpr, shufflemask(1, 0) + %23:vgpr(<2 x s16>) = G_FMA %17:vgpr, %18:vgpr, %21:vgpr + %28:vgpr(p1) = COPY %9:sgpr(p1) + G_STORE %23:vgpr(<2 x s16>), %28:vgpr(p1) :: (store 4, addrspace 1) + S_ENDPGM 0 +... 
+--- +name: fma_vector_vector_blend_vector_neg_vector_0 +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1 + ; GFX9-LABEL: name: fma_vector_vector_blend_vector_neg_vector_0 + ; GFX9: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX9: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 36, 0 :: (dereferenceable invariant load 8, align 4, addrspace 4) + ; GFX9: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], 44, 0 :: (dereferenceable invariant load 4, addrspace 4) + ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORD_IMM]] + ; GFX9: [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[COPY1]], 0, 0, implicit $exec :: (volatile load 4, addrspace 3) + ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORD_IMM]] + ; GFX9: [[DS_READ_B32_gfx9_1:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[COPY2]], 4, 0, implicit $exec :: (volatile load 4, addrspace 3) + ; GFX9: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORD_IMM]] + ; GFX9: [[DS_READ_B32_gfx9_2:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[COPY3]], 8, 0, implicit $exec :: (volatile load 4, addrspace 3) + ; GFX9: %29:vgpr_32 = nofpexcept V_PK_FMA_F16 8, [[DS_READ_B32_gfx9_]], 8, [[DS_READ_B32_gfx9_1]], 5, [[DS_READ_B32_gfx9_2]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX9: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], %29, [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store 4, addrspace 1) + ; GFX9: S_ENDPGM 0 + %2:sgpr(p4) = COPY $sgpr0_sgpr1 + %7:sgpr(s64) = G_CONSTANT i64 36 + %8:sgpr(p4) = G_PTR_ADD %2:sgpr, %7:sgpr(s64) + %9:sgpr(p1) = G_LOAD %8:sgpr(p4) :: (dereferenceable invariant load 8, align 4, addrspace 4) + %10:sgpr(s64) = G_CONSTANT i64 44 + %11:sgpr(p4) = G_PTR_ADD %2:sgpr, %10:sgpr(s64) + %12:sgpr(p3) = G_LOAD %11:sgpr(p4) :: (dereferenceable invariant load 4, addrspace 4) + %13:sgpr(s32) = G_CONSTANT i32 4 + %14:sgpr(p3) = G_PTR_ADD %12:sgpr, %13:sgpr(s32) + %15:sgpr(s32) = G_CONSTANT i32 8 + %16:sgpr(p3) = G_PTR_ADD %12:sgpr, %15:sgpr(s32) + %34:vgpr(p3) = COPY %12:sgpr(p3) + %17:vgpr(<2 x s16>) = G_LOAD %34:vgpr(p3) :: (volatile load 4, addrspace 3) + %35:vgpr(p3) = COPY %14:sgpr(p3) + %18:vgpr(<2 x s16>) = G_LOAD %35:vgpr(p3) :: (volatile load 4, addrspace 3) + %36:vgpr(p3) = COPY %16:sgpr(p3) + %19:vgpr(<2 x s16>) = G_LOAD %36:vgpr(p3) :: (volatile load 4, addrspace 3) + %20:vgpr(<2 x s16>) = G_FNEG %19:vgpr + %29:vgpr(s32) = G_BITCAST %20:vgpr(<2 x s16>) + %30:sgpr(s32) = G_CONSTANT i32 16 + %37:vgpr(s32) = COPY %30:sgpr(s32) + %31:vgpr(s32) = G_LSHR %29:vgpr, %37:vgpr(s32) + %27:vgpr(s32) = G_BITCAST %19:vgpr(<2 x s16>) + %38:vgpr(s32) = G_CONSTANT i32 65535 + %39:vgpr(s32) = G_CONSTANT i32 16 + %40:vgpr(s32) = G_SHL %27:vgpr, %39:vgpr(s32) + %41:vgpr(s32) = G_AND %31:vgpr, %38:vgpr + %42:vgpr(s32) = G_OR %41:vgpr, %40:vgpr + %21:vgpr(<2 x s16>) = G_BITCAST %42:vgpr(s32) + %22:vgpr(<2 x s16>) = G_FMA %17:vgpr, %18:vgpr, %21:vgpr + %43:vgpr(p1) = COPY %9:sgpr(p1) + G_STORE %22:vgpr(<2 x s16>), %43:vgpr(p1) :: (store 4, addrspace 1) + S_ENDPGM 0 +... 
+--- +name: fma_vector_vector_blend_vector_neg_vector_1 +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1 + ; GFX9-LABEL: name: fma_vector_vector_blend_vector_neg_vector_1 + ; GFX9: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX9: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 36, 0 :: (dereferenceable invariant load 8, align 4, addrspace 4) + ; GFX9: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], 44, 0 :: (dereferenceable invariant load 4, addrspace 4) + ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORD_IMM]] + ; GFX9: [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[COPY1]], 0, 0, implicit $exec :: (volatile load 4, addrspace 3) + ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORD_IMM]] + ; GFX9: [[DS_READ_B32_gfx9_1:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[COPY2]], 4, 0, implicit $exec :: (volatile load 4, addrspace 3) + ; GFX9: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORD_IMM]] + ; GFX9: [[DS_READ_B32_gfx9_2:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[COPY3]], 8, 0, implicit $exec :: (volatile load 4, addrspace 3) + ; GFX9: %29:vgpr_32 = nofpexcept V_PK_FMA_F16 8, [[DS_READ_B32_gfx9_]], 8, [[DS_READ_B32_gfx9_1]], 9, [[DS_READ_B32_gfx9_2]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX9: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], %29, [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store 4, addrspace 1) + ; GFX9: S_ENDPGM 0 + %2:sgpr(p4) = COPY $sgpr0_sgpr1 + %7:sgpr(s64) = G_CONSTANT i64 36 + %8:sgpr(p4) = G_PTR_ADD %2:sgpr, %7:sgpr(s64) + %9:sgpr(p1) = G_LOAD %8:sgpr(p4) :: (dereferenceable invariant load 8, align 4, addrspace 4) + %10:sgpr(s64) = G_CONSTANT i64 44 + %11:sgpr(p4) = G_PTR_ADD %2:sgpr, %10:sgpr(s64) + %12:sgpr(p3) = G_LOAD %11:sgpr(p4) :: (dereferenceable invariant load 4, addrspace 4) + %13:sgpr(s32) = G_CONSTANT i32 4 + %14:sgpr(p3) = G_PTR_ADD %12:sgpr, %13:sgpr(s32) + %15:sgpr(s32) = G_CONSTANT i32 8 + %16:sgpr(p3) = G_PTR_ADD %12:sgpr, %15:sgpr(s32) + %35:vgpr(p3) = COPY %12:sgpr(p3) + %17:vgpr(<2 x s16>) = G_LOAD %35:vgpr(p3) :: (volatile load 4, addrspace 3) + %36:vgpr(p3) = COPY %14:sgpr(p3) + %18:vgpr(<2 x s16>) = G_LOAD %36:vgpr(p3) :: (volatile load 4, addrspace 3) + %37:vgpr(p3) = COPY %16:sgpr(p3) + %19:vgpr(<2 x s16>) = G_LOAD %37:vgpr(p3) :: (volatile load 4, addrspace 3) + %20:vgpr(<2 x s16>) = G_FNEG %19:vgpr + %30:vgpr(s32) = G_BITCAST %20:vgpr(<2 x s16>) + %27:vgpr(s32) = G_BITCAST %19:vgpr(<2 x s16>) + %28:sgpr(s32) = G_CONSTANT i32 16 + %38:vgpr(s32) = COPY %28:sgpr(s32) + %29:vgpr(s32) = G_LSHR %27:vgpr, %38:vgpr(s32) + %39:vgpr(s32) = G_CONSTANT i32 65535 + %40:vgpr(s32) = G_CONSTANT i32 16 + %41:vgpr(s32) = G_SHL %29:vgpr, %40:vgpr(s32) + %42:vgpr(s32) = G_AND %30:vgpr, %39:vgpr + %43:vgpr(s32) = G_OR %42:vgpr, %41:vgpr + %21:vgpr(<2 x s16>) = G_BITCAST %43:vgpr(s32) + %22:vgpr(<2 x s16>) = G_FMA %17:vgpr, %18:vgpr, %21:vgpr + %44:vgpr(p1) = COPY %9:sgpr(p1) + G_STORE %22:vgpr(<2 x s16>), %44:vgpr(p1) :: (store 4, addrspace 1) + S_ENDPGM 0 +... 
+--- +name: fma_vector_vector_blend_vector_neg_vector_2 +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1 + ; GFX9-LABEL: name: fma_vector_vector_blend_vector_neg_vector_2 + ; GFX9: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX9: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 36, 0 :: (dereferenceable invariant load 8, align 4, addrspace 4) + ; GFX9: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], 44, 0 :: (dereferenceable invariant load 4, addrspace 4) + ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORD_IMM]] + ; GFX9: [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[COPY1]], 0, 0, implicit $exec :: (volatile load 4, addrspace 3) + ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORD_IMM]] + ; GFX9: [[DS_READ_B32_gfx9_1:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[COPY2]], 4, 0, implicit $exec :: (volatile load 4, addrspace 3) + ; GFX9: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORD_IMM]] + ; GFX9: [[DS_READ_B32_gfx9_2:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[COPY3]], 8, 0, implicit $exec :: (volatile load 4, addrspace 3) + ; GFX9: %29:vgpr_32 = nofpexcept V_PK_FMA_F16 8, [[DS_READ_B32_gfx9_]], 8, [[DS_READ_B32_gfx9_1]], 10, [[DS_READ_B32_gfx9_2]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX9: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], %29, [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store 4, addrspace 1) + ; GFX9: S_ENDPGM 0 + %2:sgpr(p4) = COPY $sgpr0_sgpr1 + %7:sgpr(s64) = G_CONSTANT i64 36 + %8:sgpr(p4) = G_PTR_ADD %2:sgpr, %7:sgpr(s64) + %9:sgpr(p1) = G_LOAD %8:sgpr(p4) :: (dereferenceable invariant load 8, align 4, addrspace 4) + %10:sgpr(s64) = G_CONSTANT i64 44 + %11:sgpr(p4) = G_PTR_ADD %2:sgpr, %10:sgpr(s64) + %12:sgpr(p3) = G_LOAD %11:sgpr(p4) :: (dereferenceable invariant load 4, addrspace 4) + %13:sgpr(s32) = G_CONSTANT i32 4 + %14:sgpr(p3) = G_PTR_ADD %12:sgpr, %13:sgpr(s32) + %15:sgpr(s32) = G_CONSTANT i32 8 + %16:sgpr(p3) = G_PTR_ADD %12:sgpr, %15:sgpr(s32) + %35:vgpr(p3) = COPY %12:sgpr(p3) + %17:vgpr(<2 x s16>) = G_LOAD %35:vgpr(p3) :: (volatile load 4, addrspace 3) + %36:vgpr(p3) = COPY %14:sgpr(p3) + %18:vgpr(<2 x s16>) = G_LOAD %36:vgpr(p3) :: (volatile load 4, addrspace 3) + %37:vgpr(p3) = COPY %16:sgpr(p3) + %19:vgpr(<2 x s16>) = G_LOAD %37:vgpr(p3) :: (volatile load 4, addrspace 3) + %20:vgpr(<2 x s16>) = G_FNEG %19:vgpr + %30:vgpr(s32) = G_BITCAST %19:vgpr(<2 x s16>) + %27:vgpr(s32) = G_BITCAST %20:vgpr(<2 x s16>) + %28:sgpr(s32) = G_CONSTANT i32 16 + %38:vgpr(s32) = COPY %28:sgpr(s32) + %29:vgpr(s32) = G_LSHR %27:vgpr, %38:vgpr(s32) + %39:vgpr(s32) = G_CONSTANT i32 65535 + %40:vgpr(s32) = G_CONSTANT i32 16 + %41:vgpr(s32) = G_SHL %29:vgpr, %40:vgpr(s32) + %42:vgpr(s32) = G_AND %30:vgpr, %39:vgpr + %43:vgpr(s32) = G_OR %42:vgpr, %41:vgpr + %21:vgpr(<2 x s16>) = G_BITCAST %43:vgpr(s32) + %22:vgpr(<2 x s16>) = G_FMA %17:vgpr, %18:vgpr, %21:vgpr + %44:vgpr(p1) = COPY %9:sgpr(p1) + G_STORE %22:vgpr(<2 x s16>), %44:vgpr(p1) :: (store 4, addrspace 1) + S_ENDPGM 0 +... 
+--- +name: fma_vector_vector_blend_vector_neg_vector_3 +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1 + ; GFX9-LABEL: name: fma_vector_vector_blend_vector_neg_vector_3 + ; GFX9: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX9: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 36, 0 :: (dereferenceable invariant load 8, align 4, addrspace 4) + ; GFX9: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], 44, 0 :: (dereferenceable invariant load 4, addrspace 4) + ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORD_IMM]] + ; GFX9: [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[COPY1]], 0, 0, implicit $exec :: (volatile load 4, addrspace 3) + ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORD_IMM]] + ; GFX9: [[DS_READ_B32_gfx9_1:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[COPY2]], 4, 0, implicit $exec :: (volatile load 4, addrspace 3) + ; GFX9: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORD_IMM]] + ; GFX9: [[DS_READ_B32_gfx9_2:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[COPY3]], 8, 0, implicit $exec :: (volatile load 4, addrspace 3) + ; GFX9: %31:vgpr_32 = nofpexcept V_PK_FMA_F16 8, [[DS_READ_B32_gfx9_]], 8, [[DS_READ_B32_gfx9_1]], 13, [[DS_READ_B32_gfx9_2]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX9: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], %31, [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store 4, addrspace 1) + ; GFX9: S_ENDPGM 0 + %2:sgpr(p4) = COPY $sgpr0_sgpr1 + %7:sgpr(s64) = G_CONSTANT i64 36 + %8:sgpr(p4) = G_PTR_ADD %2:sgpr, %7:sgpr(s64) + %9:sgpr(p1) = G_LOAD %8:sgpr(p4) :: (dereferenceable invariant load 8, align 4, addrspace 4) + %10:sgpr(s64) = G_CONSTANT i64 44 + %11:sgpr(p4) = G_PTR_ADD %2:sgpr, %10:sgpr(s64) + %12:sgpr(p3) = G_LOAD %11:sgpr(p4) :: (dereferenceable invariant load 4, addrspace 4) + %13:sgpr(s32) = G_CONSTANT i32 4 + %14:sgpr(p3) = G_PTR_ADD %12:sgpr, %13:sgpr(s32) + %15:sgpr(s32) = G_CONSTANT i32 8 + %16:sgpr(p3) = G_PTR_ADD %12:sgpr, %15:sgpr(s32) + %36:vgpr(p3) = COPY %12:sgpr(p3) + %17:vgpr(<2 x s16>) = G_LOAD %36:vgpr(p3) :: (volatile load 4, addrspace 3) + %37:vgpr(p3) = COPY %14:sgpr(p3) + %18:vgpr(<2 x s16>) = G_LOAD %37:vgpr(p3) :: (volatile load 4, addrspace 3) + %38:vgpr(p3) = COPY %16:sgpr(p3) + %19:vgpr(<2 x s16>) = G_LOAD %38:vgpr(p3) :: (volatile load 4, addrspace 3) + %20:vgpr(<2 x s16>) = G_FNEG %19:vgpr + %31:vgpr(s32) = G_BITCAST %20:vgpr(<2 x s16>) + %29:sgpr(s32) = G_CONSTANT i32 16 + %39:vgpr(s32) = COPY %29:sgpr(s32) + %33:vgpr(s32) = G_LSHR %31:vgpr, %39:vgpr(s32) + %27:vgpr(s32) = G_BITCAST %19:vgpr(<2 x s16>) + %40:vgpr(s32) = COPY %29:sgpr(s32) + %30:vgpr(s32) = G_LSHR %27:vgpr, %40:vgpr(s32) + %41:vgpr(s32) = G_CONSTANT i32 65535 + %42:vgpr(s32) = G_CONSTANT i32 16 + %43:vgpr(s32) = G_SHL %30:vgpr, %42:vgpr(s32) + %44:vgpr(s32) = G_AND %33:vgpr, %41:vgpr + %45:vgpr(s32) = G_OR %44:vgpr, %43:vgpr + %21:vgpr(<2 x s16>) = G_BITCAST %45:vgpr(s32) + %22:vgpr(<2 x s16>) = G_FMA %17:vgpr, %18:vgpr, %21:vgpr + %46:vgpr(p1) = COPY %9:sgpr(p1) + G_STORE %22:vgpr(<2 x s16>), %46:vgpr(p1) :: (store 4, addrspace 1) + S_ENDPGM 0 +... 
+--- +name: bitcast_fneg_f32 +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1 + ; GFX9-LABEL: name: bitcast_fneg_f32 + ; GFX9: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX9: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; GFX9: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 36, 0 :: (dereferenceable invariant load 8, align 4, addrspace 4) + ; GFX9: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], 44, 0 :: (dereferenceable invariant load 4, addrspace 4) + ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORD_IMM]] + ; GFX9: [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[COPY1]], 0, 0, implicit $exec :: (volatile load 4, addrspace 3) + ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[DEF]] + ; GFX9: [[DS_READ_B32_gfx9_1:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[COPY2]], 0, 0, implicit $exec :: (volatile load 4, addrspace 3) + ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648 + ; GFX9: [[V_XOR_B32_e32_:%[0-9]+]]:vgpr_32 = V_XOR_B32_e32 [[S_MOV_B32_]], [[DS_READ_B32_gfx9_1]], implicit $exec + ; GFX9: %14:vgpr_32 = nofpexcept V_PK_ADD_F16 8, [[DS_READ_B32_gfx9_]], 8, [[V_XOR_B32_e32_]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX9: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], %14, [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store 4, addrspace 1) + ; GFX9: S_ENDPGM 0 + %2:sgpr(p4) = COPY $sgpr0_sgpr1 + %15:sgpr(p3) = G_IMPLICIT_DEF + %7:sgpr(s64) = G_CONSTANT i64 36 + %8:sgpr(p4) = G_PTR_ADD %2:sgpr, %7:sgpr(s64) + %9:sgpr(p1) = G_LOAD %8:sgpr(p4) :: (dereferenceable invariant load 8, align 4, addrspace 4) + %10:sgpr(s64) = G_CONSTANT i64 44 + %11:sgpr(p4) = G_PTR_ADD %2:sgpr, %10:sgpr(s64) + %12:sgpr(p3) = G_LOAD %11:sgpr(p4) :: (dereferenceable invariant load 4, addrspace 4) + %19:vgpr(p3) = COPY %12:sgpr(p3) + %13:vgpr(<2 x s16>) = G_LOAD %19:vgpr(p3) :: (volatile load 4, addrspace 3) + %20:vgpr(p3) = COPY %15:sgpr(p3) + %14:vgpr(s32) = G_LOAD %20:vgpr(p3) :: (volatile load 4, addrspace 3) + %16:vgpr(s32) = G_FNEG %14:vgpr + %17:vgpr(<2 x s16>) = G_BITCAST %16:vgpr(s32) + %18:vgpr(<2 x s16>) = G_FADD %13:vgpr, %17:vgpr + %21:vgpr(p1) = COPY %9:sgpr(p1) + G_STORE %18:vgpr(<2 x s16>), %21:vgpr(p1) :: (store 4, addrspace 1) + S_ENDPGM 0 +... 
+--- +name: shuffle_bitcast_fneg_f32 +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1 + ; GFX9-LABEL: name: shuffle_bitcast_fneg_f32 + ; GFX9: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX9: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; GFX9: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 36, 0 :: (dereferenceable invariant load 8, align 4, addrspace 4) + ; GFX9: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], 44, 0 :: (dereferenceable invariant load 4, addrspace 4) + ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORD_IMM]] + ; GFX9: [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[COPY1]], 0, 0, implicit $exec :: (volatile load 4, addrspace 3) + ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[DEF]] + ; GFX9: [[DS_READ_B32_gfx9_1:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[COPY2]], 0, 0, implicit $exec :: (volatile load 4, addrspace 3) + ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648 + ; GFX9: [[V_XOR_B32_e32_:%[0-9]+]]:vgpr_32 = V_XOR_B32_e32 [[S_MOV_B32_]], [[DS_READ_B32_gfx9_1]], implicit $exec + ; GFX9: %17:vgpr_32 = nofpexcept V_PK_ADD_F16 8, [[DS_READ_B32_gfx9_]], 4, [[V_XOR_B32_e32_]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX9: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], %17, [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store 4, addrspace 1) + ; GFX9: S_ENDPGM 0 + %2:sgpr(p4) = COPY $sgpr0_sgpr1 + %15:sgpr(p3) = G_IMPLICIT_DEF + %19:sgpr(<2 x s16>) = G_IMPLICIT_DEF + %7:sgpr(s64) = G_CONSTANT i64 36 + %8:sgpr(p4) = G_PTR_ADD %2:sgpr, %7:sgpr(s64) + %9:sgpr(p1) = G_LOAD %8:sgpr(p4) :: (dereferenceable invariant load 8, align 4, addrspace 4) + %10:sgpr(s64) = G_CONSTANT i64 44 + %11:sgpr(p4) = G_PTR_ADD %2:sgpr, %10:sgpr(s64) + %12:sgpr(p3) = G_LOAD %11:sgpr(p4) :: (dereferenceable invariant load 4, addrspace 4) + %21:vgpr(p3) = COPY %12:sgpr(p3) + %13:vgpr(<2 x s16>) = G_LOAD %21:vgpr(p3) :: (volatile load 4, addrspace 3) + %22:vgpr(p3) = COPY %15:sgpr(p3) + %14:vgpr(s32) = G_LOAD %22:vgpr(p3) :: (volatile load 4, addrspace 3) + %16:vgpr(s32) = G_FNEG %14:vgpr + %17:vgpr(<2 x s16>) = G_BITCAST %16:vgpr(s32) + %23:vgpr(<2 x s16>) = COPY %19:sgpr(<2 x s16>) + %18:vgpr(<2 x s16>) = G_SHUFFLE_VECTOR %17:vgpr(<2 x s16>), %23:vgpr, shufflemask(1, 0) + %20:vgpr(<2 x s16>) = G_FADD %13:vgpr, %18:vgpr + %24:vgpr(p1) = COPY %9:sgpr(p1) + G_STORE %20:vgpr(<2 x s16>), %24:vgpr(p1) :: (store 4, addrspace 1) + S_ENDPGM 0 +... 
+--- +name: bitcast_lo_elt_op_sel +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1 + ; GFX9-LABEL: name: bitcast_lo_elt_op_sel + ; GFX9: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX9: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1073758208 + ; GFX9: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 36, 0 :: (dereferenceable invariant load 8, align 4, addrspace 4) + ; GFX9: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], 44, 0 :: (dereferenceable invariant load 4, addrspace 4) + ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORD_IMM]] + ; GFX9: [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[COPY1]], 0, 0, implicit $exec :: (volatile load 4, addrspace 3) + ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORD_IMM]] + ; GFX9: [[DS_READ_B32_gfx9_1:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[COPY2]], 4, 0, implicit $exec :: (volatile load 4, addrspace 3) + ; GFX9: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORD_IMM]] + ; GFX9: [[DS_READ_B32_gfx9_2:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[COPY3]], 8, 0, implicit $exec :: (volatile load 4, addrspace 3) + ; GFX9: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[DEF]] + ; GFX9: [[GLOBAL_LOAD_USHORT:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT [[COPY4]], 0, 0, implicit $exec :: (volatile load 2, addrspace 1) + ; GFX9: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GFX9: %24:vgpr_32 = nofpexcept V_PK_ADD_F16 8, [[DS_READ_B32_gfx9_2]], 8, [[COPY5]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX9: %26:vgpr_32 = nofpexcept V_PK_FMA_F16 8, [[DS_READ_B32_gfx9_]], 8, [[DS_READ_B32_gfx9_1]], 4, %24, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX9: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], %26, [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store 4, addrspace 1) + ; GFX9: S_ENDPGM 0 + %2:sgpr(p4) = COPY $sgpr0_sgpr1 + %21:sgpr(p1) = G_IMPLICIT_DEF + %23:sgpr(s16) = G_FCONSTANT half 0xH4000 + %28:sgpr(s32) = G_ANYEXT %23:sgpr(s16) + %22:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %28:sgpr(s32), %28:sgpr(s32) + %7:sgpr(s64) = G_CONSTANT i64 36 + %8:sgpr(p4) = G_PTR_ADD %2:sgpr, %7:sgpr(s64) + %9:sgpr(p1) = G_LOAD %8:sgpr(p4) :: (dereferenceable invariant load 8, align 4, addrspace 4) + %10:sgpr(s64) = G_CONSTANT i64 44 + %11:sgpr(p4) = G_PTR_ADD %2:sgpr, %10:sgpr(s64) + %12:sgpr(p3) = G_LOAD %11:sgpr(p4) :: (dereferenceable invariant load 4, addrspace 4) + %13:sgpr(s32) = G_CONSTANT i32 4 + %14:sgpr(p3) = G_PTR_ADD %12:sgpr, %13:sgpr(s32) + %15:sgpr(s32) = G_CONSTANT i32 8 + %16:sgpr(p3) = G_PTR_ADD %12:sgpr, %15:sgpr(s32) + %29:vgpr(p3) = COPY %12:sgpr(p3) + %17:vgpr(<2 x s16>) = G_LOAD %29:vgpr(p3) :: (volatile load 4, addrspace 3) + %30:vgpr(p3) = COPY %14:sgpr(p3) + %18:vgpr(<2 x s16>) = G_LOAD %30:vgpr(p3) :: (volatile load 4, addrspace 3) + %31:vgpr(p3) = COPY %16:sgpr(p3) + %19:vgpr(<2 x s16>) = G_LOAD %31:vgpr(p3) :: (volatile load 4, addrspace 3) + %32:vgpr(p1) = COPY %21:sgpr(p1) + %27:vgpr(s32) = G_LOAD %32:vgpr(p1) :: (volatile load 2, addrspace 1) + %33:vgpr(<2 x s16>) = COPY %22:sgpr(<2 x s16>) + %24:vgpr(<2 x s16>) = G_FADD %19:vgpr, %33:vgpr + %25:vgpr(<2 x s16>) = G_SHUFFLE_VECTOR %24:vgpr(<2 x s16>), %19:vgpr, shufflemask(1, 0) + %26:vgpr(<2 x s16>) = G_FMA %17:vgpr, %18:vgpr, %25:vgpr + %34:vgpr(p1) = COPY %9:sgpr(p1) + G_STORE %26:vgpr(<2 x s16>), %34:vgpr(p1) :: (store 4, addrspace 1) + S_ENDPGM 0 +... 
+--- +name: mix_elt_types_op_sel +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1 + ; GFX9-LABEL: name: mix_elt_types_op_sel + ; GFX9: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX9: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1073758208 + ; GFX9: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 36, 0 :: (dereferenceable invariant load 8, align 4, addrspace 4) + ; GFX9: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], 44, 0 :: (dereferenceable invariant load 4, addrspace 4) + ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORD_IMM]] + ; GFX9: [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[COPY1]], 0, 0, implicit $exec :: (volatile load 4, addrspace 3) + ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORD_IMM]] + ; GFX9: [[DS_READ_B32_gfx9_1:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[COPY2]], 4, 0, implicit $exec :: (volatile load 4, addrspace 3) + ; GFX9: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORD_IMM]] + ; GFX9: [[DS_READ_B32_gfx9_2:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[COPY3]], 8, 0, implicit $exec :: (volatile load 4, addrspace 3) + ; GFX9: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[DEF]] + ; GFX9: [[GLOBAL_LOAD_USHORT:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT [[COPY4]], 0, 0, implicit $exec :: (volatile load 2, addrspace 1) + ; GFX9: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[DEF]] + ; GFX9: [[GLOBAL_LOAD_USHORT1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT [[COPY5]], 0, 0, implicit $exec :: (volatile load 2, addrspace 1) + ; GFX9: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GFX9: %38:vgpr_32 = nofpexcept V_PK_ADD_F16 8, [[DS_READ_B32_gfx9_2]], 8, [[COPY6]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX9: %40:vgpr_32 = nofpexcept V_PK_FMA_F16 8, [[DS_READ_B32_gfx9_]], 8, [[DS_READ_B32_gfx9_1]], 4, %38, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX9: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], %40, [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store 4, addrspace 1) + ; GFX9: S_ENDPGM 0 + %2:sgpr(p4) = COPY $sgpr0_sgpr1 + %21:sgpr(p1) = G_IMPLICIT_DEF + %32:sgpr(s16) = G_FCONSTANT half 0xH4000 + %40:sgpr(s32) = G_ANYEXT %32:sgpr(s16) + %31:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %40:sgpr(s32), %40:sgpr(s32) + %7:sgpr(s64) = G_CONSTANT i64 36 + %8:sgpr(p4) = G_PTR_ADD %2:sgpr, %7:sgpr(s64) + %9:sgpr(p1) = G_LOAD %8:sgpr(p4) :: (dereferenceable invariant load 8, align 4, addrspace 4) + %10:sgpr(s64) = G_CONSTANT i64 44 + %11:sgpr(p4) = G_PTR_ADD %2:sgpr, %10:sgpr(s64) + %12:sgpr(p3) = G_LOAD %11:sgpr(p4) :: (dereferenceable invariant load 4, addrspace 4) + %13:sgpr(s32) = G_CONSTANT i32 4 + %14:sgpr(p3) = G_PTR_ADD %12:sgpr, %13:sgpr(s32) + %15:sgpr(s32) = G_CONSTANT i32 8 + %16:sgpr(p3) = G_PTR_ADD %12:sgpr, %15:sgpr(s32) + %43:vgpr(p3) = COPY %12:sgpr(p3) + %17:vgpr(<2 x s16>) = G_LOAD %43:vgpr(p3) :: (volatile load 4, addrspace 3) + %44:vgpr(p3) = COPY %14:sgpr(p3) + %18:vgpr(<2 x s16>) = G_LOAD %44:vgpr(p3) :: (volatile load 4, addrspace 3) + %45:vgpr(p3) = COPY %16:sgpr(p3) + %19:vgpr(<2 x s16>) = G_LOAD %45:vgpr(p3) :: (volatile load 4, addrspace 3) + %46:vgpr(p1) = COPY %21:sgpr(p1) + %24:vgpr(s32) = G_ZEXTLOAD %46:vgpr(p1) :: (volatile load 2, addrspace 1) + %47:vgpr(p1) = COPY %21:sgpr(p1) + %39:vgpr(s32) = G_LOAD %47:vgpr(p1) :: (volatile load 2, addrspace 1) + %36:vgpr(s16) = G_TRUNC %24:vgpr(s32) + %38:sgpr(s16) = G_CONSTANT i16 1 + 
%48:vgpr(s16) = COPY %38:sgpr(s16) + %27:vgpr(s16) = nuw nsw G_SHL %36:vgpr, %48:vgpr(s16) + %41:vgpr(s32) = G_ANYEXT %27:vgpr(s16) + %42:sgpr(s32) = G_IMPLICIT_DEF + %49:vgpr(s32) = G_CONSTANT i32 65535 + %50:sgpr(s32) = G_CONSTANT i32 16 + %51:sgpr(s32) = G_SHL %42:sgpr, %50:sgpr(s32) + %52:vgpr(s32) = G_AND %41:vgpr, %49:vgpr + %53:vgpr(s32) = G_OR %52:vgpr, %51:sgpr + %28:vgpr(<2 x s16>) = G_BITCAST %53:vgpr(s32) + %54:vgpr(<2 x s16>) = COPY %31:sgpr(<2 x s16>) + %33:vgpr(<2 x s16>) = G_FADD %19:vgpr, %54:vgpr + %34:vgpr(<2 x s16>) = G_SHUFFLE_VECTOR %33:vgpr(<2 x s16>), %28:vgpr, shufflemask(1, 0) + %35:vgpr(<2 x s16>) = G_FMA %17:vgpr, %18:vgpr, %34:vgpr + %55:vgpr(p1) = COPY %9:sgpr(p1) + G_STORE %35:vgpr(<2 x s16>), %55:vgpr(p1) :: (store 4, addrspace 1) + S_ENDPGM 0 +... +--- +name: fma_vector_shuffle_bitcast +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1 + ; GFX9-LABEL: name: fma_vector_shuffle_bitcast + ; GFX9: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX9: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 36, 0 :: (dereferenceable invariant load 8, align 4, addrspace 4) + ; GFX9: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 44, 0 :: (dereferenceable invariant load 8, align 4, addrspace 4) + ; GFX9: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM1]].sub0 + ; GFX9: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM1]].sub1 + ; GFX9: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] + ; GFX9: [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[COPY3]], 0, 0, implicit $exec :: (volatile load 4, addrspace 3) + ; GFX9: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] + ; GFX9: [[DS_READ_B32_gfx9_1:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[COPY4]], 4, 0, implicit $exec :: (volatile load 4, addrspace 3) + ; GFX9: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY2]] + ; GFX9: [[DS_READ_B32_gfx9_2:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[COPY5]], 0, 0, implicit $exec :: (volatile load 4, addrspace 3) + ; GFX9: %20:vgpr_32 = nofpexcept V_PK_FMA_F16 8, [[DS_READ_B32_gfx9_]], 8, [[DS_READ_B32_gfx9_1]], 8, [[DS_READ_B32_gfx9_2]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX9: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], %20, [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store 4, addrspace 1) + ; GFX9: S_ENDPGM 0 + %3:sgpr(p4) = COPY $sgpr0_sgpr1 + %8:sgpr(s64) = G_CONSTANT i64 36 + %9:sgpr(p4) = G_PTR_ADD %3:sgpr, %8:sgpr(s64) + %10:sgpr(p1) = G_LOAD %9:sgpr(p4) :: (dereferenceable invariant load 8, align 4, addrspace 4) + %11:sgpr(s64) = G_CONSTANT i64 44 + %12:sgpr(p4) = G_PTR_ADD %3:sgpr, %11:sgpr(s64) + %13:sgpr(<2 x s32>) = G_LOAD %12:sgpr(p4) :: (dereferenceable invariant load 8, align 4, addrspace 4) + %14:sgpr(s32) = G_EXTRACT %13:sgpr(<2 x s32>), 0 + %17:sgpr(s32) = G_EXTRACT %13:sgpr(<2 x s32>), 32 + %22:sgpr(s32) = G_CONSTANT i32 4 + %16:sgpr(p3) = G_INTTOPTR %14:sgpr(s32) + %23:sgpr(p3) = G_PTR_ADD %16:sgpr, %22:sgpr(s32) + %29:vgpr(p3) = COPY %16:sgpr(p3) + %24:vgpr(<2 x s16>) = G_LOAD %29:vgpr(p3) :: (volatile load 4, addrspace 3) + %30:vgpr(p3) = COPY %23:sgpr(p3) + %25:vgpr(<2 x s16>) = G_LOAD %30:vgpr(p3) :: (volatile load 4, addrspace 3) + %19:sgpr(p3) = G_INTTOPTR %17:sgpr(s32) + %31:vgpr(p3) = COPY %19:sgpr(p3) + %26:vgpr(s32) = G_LOAD %31:vgpr(p3) :: (volatile load 4, addrspace 3) + %27:vgpr(<2 x s16>) = G_BITCAST %26:vgpr(s32) + %28:vgpr(<2 x s16>) = G_FMA %24:vgpr, %25:vgpr, %27:vgpr + 
%32:vgpr(p1) = COPY %10:sgpr(p1) + G_STORE %28:vgpr(<2 x s16>), %32:vgpr(p1) :: (store 4, addrspace 1) + S_ENDPGM 0 +... +--- +name: fma_vector_shuffle_bitcast_lo +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1 + ; GFX9-LABEL: name: fma_vector_shuffle_bitcast_lo + ; GFX9: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX9: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 36, 0 :: (dereferenceable invariant load 8, align 4, addrspace 4) + ; GFX9: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 44, 0 :: (dereferenceable invariant load 8, align 4, addrspace 4) + ; GFX9: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM1]].sub0 + ; GFX9: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM1]].sub1 + ; GFX9: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] + ; GFX9: [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[COPY3]], 0, 0, implicit $exec :: (volatile load 4, addrspace 3) + ; GFX9: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] + ; GFX9: [[DS_READ_B32_gfx9_1:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[COPY4]], 4, 0, implicit $exec :: (volatile load 4, addrspace 3) + ; GFX9: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY2]] + ; GFX9: [[DS_READ_B32_gfx9_2:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[COPY5]], 0, 0, implicit $exec :: (volatile load 4, addrspace 3) + ; GFX9: %23:vgpr_32 = nofpexcept V_PK_FMA_F16 8, [[DS_READ_B32_gfx9_]], 8, [[DS_READ_B32_gfx9_1]], 0, [[DS_READ_B32_gfx9_2]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX9: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], %23, [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store 4, addrspace 1) + ; GFX9: S_ENDPGM 0 + %3:sgpr(p4) = COPY $sgpr0_sgpr1 + %29:sgpr(<2 x s16>) = G_IMPLICIT_DEF + %8:sgpr(s64) = G_CONSTANT i64 36 + %9:sgpr(p4) = G_PTR_ADD %3:sgpr, %8:sgpr(s64) + %10:sgpr(p1) = G_LOAD %9:sgpr(p4) :: (dereferenceable invariant load 8, align 4, addrspace 4) + %11:sgpr(s64) = G_CONSTANT i64 44 + %12:sgpr(p4) = G_PTR_ADD %3:sgpr, %11:sgpr(s64) + %13:sgpr(<2 x s32>) = G_LOAD %12:sgpr(p4) :: (dereferenceable invariant load 8, align 4, addrspace 4) + %14:sgpr(s32) = G_EXTRACT %13:sgpr(<2 x s32>), 0 + %17:sgpr(s32) = G_EXTRACT %13:sgpr(<2 x s32>), 32 + %22:sgpr(s32) = G_CONSTANT i32 4 + %16:sgpr(p3) = G_INTTOPTR %14:sgpr(s32) + %23:sgpr(p3) = G_PTR_ADD %16:sgpr, %22:sgpr(s32) + %31:vgpr(p3) = COPY %16:sgpr(p3) + %24:vgpr(<2 x s16>) = G_LOAD %31:vgpr(p3) :: (volatile load 4, addrspace 3) + %32:vgpr(p3) = COPY %23:sgpr(p3) + %25:vgpr(<2 x s16>) = G_LOAD %32:vgpr(p3) :: (volatile load 4, addrspace 3) + %19:sgpr(p3) = G_INTTOPTR %17:sgpr(s32) + %33:vgpr(p3) = COPY %19:sgpr(p3) + %26:vgpr(s32) = G_LOAD %33:vgpr(p3) :: (volatile load 4, addrspace 3) + %27:vgpr(<2 x s16>) = G_BITCAST %26:vgpr(s32) + %34:vgpr(<2 x s16>) = COPY %29:sgpr(<2 x s16>) + %28:vgpr(<2 x s16>) = G_SHUFFLE_VECTOR %27:vgpr(<2 x s16>), %34:vgpr, shufflemask(0, 0) + %30:vgpr(<2 x s16>) = G_FMA %24:vgpr, %25:vgpr, %28:vgpr + %35:vgpr(p1) = COPY %10:sgpr(p1) + G_STORE %30:vgpr(<2 x s16>), %35:vgpr(p1) :: (store 4, addrspace 1) + S_ENDPGM 0 +... 
+--- +name: fma_vector_shuffle_bitcast_hi +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1 + ; GFX9-LABEL: name: fma_vector_shuffle_bitcast_hi + ; GFX9: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX9: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 36, 0 :: (dereferenceable invariant load 8, align 4, addrspace 4) + ; GFX9: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 44, 0 :: (dereferenceable invariant load 8, align 4, addrspace 4) + ; GFX9: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM1]].sub0 + ; GFX9: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM1]].sub1 + ; GFX9: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] + ; GFX9: [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[COPY3]], 0, 0, implicit $exec :: (volatile load 4, addrspace 3) + ; GFX9: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] + ; GFX9: [[DS_READ_B32_gfx9_1:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[COPY4]], 4, 0, implicit $exec :: (volatile load 4, addrspace 3) + ; GFX9: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY2]] + ; GFX9: [[DS_READ_B32_gfx9_2:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[COPY5]], 0, 0, implicit $exec :: (volatile load 4, addrspace 3) + ; GFX9: %23:vgpr_32 = nofpexcept V_PK_FMA_F16 8, [[DS_READ_B32_gfx9_]], 8, [[DS_READ_B32_gfx9_1]], 12, [[DS_READ_B32_gfx9_2]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX9: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], %23, [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store 4, addrspace 1) + ; GFX9: S_ENDPGM 0 + %3:sgpr(p4) = COPY $sgpr0_sgpr1 + %29:sgpr(<2 x s16>) = G_IMPLICIT_DEF + %8:sgpr(s64) = G_CONSTANT i64 36 + %9:sgpr(p4) = G_PTR_ADD %3:sgpr, %8:sgpr(s64) + %10:sgpr(p1) = G_LOAD %9:sgpr(p4) :: (dereferenceable invariant load 8, align 4, addrspace 4) + %11:sgpr(s64) = G_CONSTANT i64 44 + %12:sgpr(p4) = G_PTR_ADD %3:sgpr, %11:sgpr(s64) + %13:sgpr(<2 x s32>) = G_LOAD %12:sgpr(p4) :: (dereferenceable invariant load 8, align 4, addrspace 4) + %14:sgpr(s32) = G_EXTRACT %13:sgpr(<2 x s32>), 0 + %17:sgpr(s32) = G_EXTRACT %13:sgpr(<2 x s32>), 32 + %22:sgpr(s32) = G_CONSTANT i32 4 + %16:sgpr(p3) = G_INTTOPTR %14:sgpr(s32) + %23:sgpr(p3) = G_PTR_ADD %16:sgpr, %22:sgpr(s32) + %31:vgpr(p3) = COPY %16:sgpr(p3) + %24:vgpr(<2 x s16>) = G_LOAD %31:vgpr(p3) :: (volatile load 4, addrspace 3) + %32:vgpr(p3) = COPY %23:sgpr(p3) + %25:vgpr(<2 x s16>) = G_LOAD %32:vgpr(p3) :: (volatile load 4, addrspace 3) + %19:sgpr(p3) = G_INTTOPTR %17:sgpr(s32) + %33:vgpr(p3) = COPY %19:sgpr(p3) + %26:vgpr(s32) = G_LOAD %33:vgpr(p3) :: (volatile load 4, addrspace 3) + %27:vgpr(<2 x s16>) = G_BITCAST %26:vgpr(s32) + %34:vgpr(<2 x s16>) = COPY %29:sgpr(<2 x s16>) + %28:vgpr(<2 x s16>) = G_SHUFFLE_VECTOR %27:vgpr(<2 x s16>), %34:vgpr, shufflemask(1, 1) + %30:vgpr(<2 x s16>) = G_FMA %24:vgpr, %25:vgpr, %28:vgpr + %35:vgpr(p1) = COPY %10:sgpr(p1) + G_STORE %30:vgpr(<2 x s16>), %35:vgpr(p1) :: (store 4, addrspace 1) + S_ENDPGM 0 +... 
+--- +name: fma_vector_shuffle_bitcast_swap +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1 + ; GFX9-LABEL: name: fma_vector_shuffle_bitcast_swap + ; GFX9: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GFX9: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 36, 0 :: (dereferenceable invariant load 8, align 4, addrspace 4) + ; GFX9: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 44, 0 :: (dereferenceable invariant load 8, align 4, addrspace 4) + ; GFX9: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM1]].sub0 + ; GFX9: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM1]].sub1 + ; GFX9: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] + ; GFX9: [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[COPY3]], 0, 0, implicit $exec :: (volatile load 4, addrspace 3) + ; GFX9: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] + ; GFX9: [[DS_READ_B32_gfx9_1:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[COPY4]], 4, 0, implicit $exec :: (volatile load 4, addrspace 3) + ; GFX9: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY2]] + ; GFX9: [[DS_READ_B32_gfx9_2:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[COPY5]], 0, 0, implicit $exec :: (volatile load 4, addrspace 3) + ; GFX9: %23:vgpr_32 = nofpexcept V_PK_FMA_F16 8, [[DS_READ_B32_gfx9_]], 8, [[DS_READ_B32_gfx9_1]], 4, [[DS_READ_B32_gfx9_2]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX9: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], %23, [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store 4, addrspace 1) + ; GFX9: S_ENDPGM 0 + %3:sgpr(p4) = COPY $sgpr0_sgpr1 + %29:sgpr(<2 x s16>) = G_IMPLICIT_DEF + %8:sgpr(s64) = G_CONSTANT i64 36 + %9:sgpr(p4) = G_PTR_ADD %3:sgpr, %8:sgpr(s64) + %10:sgpr(p1) = G_LOAD %9:sgpr(p4) :: (dereferenceable invariant load 8, align 4, addrspace 4) + %11:sgpr(s64) = G_CONSTANT i64 44 + %12:sgpr(p4) = G_PTR_ADD %3:sgpr, %11:sgpr(s64) + %13:sgpr(<2 x s32>) = G_LOAD %12:sgpr(p4) :: (dereferenceable invariant load 8, align 4, addrspace 4) + %14:sgpr(s32) = G_EXTRACT %13:sgpr(<2 x s32>), 0 + %17:sgpr(s32) = G_EXTRACT %13:sgpr(<2 x s32>), 32 + %22:sgpr(s32) = G_CONSTANT i32 4 + %16:sgpr(p3) = G_INTTOPTR %14:sgpr(s32) + %23:sgpr(p3) = G_PTR_ADD %16:sgpr, %22:sgpr(s32) + %31:vgpr(p3) = COPY %16:sgpr(p3) + %24:vgpr(<2 x s16>) = G_LOAD %31:vgpr(p3) :: (volatile load 4, addrspace 3) + %32:vgpr(p3) = COPY %23:sgpr(p3) + %25:vgpr(<2 x s16>) = G_LOAD %32:vgpr(p3) :: (volatile load 4, addrspace 3) + %19:sgpr(p3) = G_INTTOPTR %17:sgpr(s32) + %33:vgpr(p3) = COPY %19:sgpr(p3) + %26:vgpr(s32) = G_LOAD %33:vgpr(p3) :: (volatile load 4, addrspace 3) + %27:vgpr(<2 x s16>) = G_BITCAST %26:vgpr(s32) + %34:vgpr(<2 x s16>) = COPY %29:sgpr(<2 x s16>) + %28:vgpr(<2 x s16>) = G_SHUFFLE_VECTOR %27:vgpr(<2 x s16>), %34:vgpr, shufflemask(1, 0) + %30:vgpr(<2 x s16>) = G_FMA %24:vgpr, %25:vgpr, %28:vgpr + %35:vgpr(p1) = COPY %10:sgpr(p1) + G_STORE %30:vgpr(<2 x s16>), %35:vgpr(p1) :: (store 4, addrspace 1) + S_ENDPGM 0 +... 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll @@ -300,23 +300,20 @@ ; GFX906-LABEL: v_sdot2_shuffle10_a: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_alignbit_b32 v0, v0, v0, 16 -; GFX906-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 +; GFX906-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[0,1,1] ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: v_sdot2_shuffle10_a: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_alignbit_b32 v0, v0, v0, 16 -; GFX908-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 +; GFX908-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[0,1,1] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sdot2_shuffle10_a: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_alignbit_b32 v0, v0, v0, 16 -; GFX10-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 +; GFX10-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[0,1,1] ; GFX10-NEXT: s_setpc_b64 s[30:31] %shuf.a = shufflevector <2 x i16> %a, <2 x i16> undef, <2 x i32> <i32 1, i32 0> %r = call i32 @llvm.amdgcn.sdot2(<2 x i16> %shuf.a, <2 x i16> %b, i32 %c, i1 false) @@ -327,23 +324,20 @@ ; GFX906-LABEL: v_sdot2_shuffle10_b: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_alignbit_b32 v1, v1, v1, 16 -; GFX906-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 +; GFX906-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 op_sel:[0,1,0] op_sel_hi:[1,0,1] ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: v_sdot2_shuffle10_b: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_alignbit_b32 v1, v1, v1, 16 -; GFX908-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 +; GFX908-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 op_sel:[0,1,0] op_sel_hi:[1,0,1] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sdot2_shuffle10_b: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_alignbit_b32 v1, v1, v1, 16 -; GFX10-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 +; GFX10-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 op_sel:[0,1,0] op_sel_hi:[1,0,1] ; GFX10-NEXT: s_setpc_b64 s[30:31] %shuf.b = shufflevector <2 x i16> %b, <2 x i16> undef, <2 x i32> <i32 1, i32 0> %r = call i32 @llvm.amdgcn.sdot2(<2 x i16> %a, <2 x i16> %shuf.b, i32 %c, i1 false) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll @@ -300,23 +300,20 @@ ; GFX906-LABEL: v_udot2_shuffle10_a: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_alignbit_b32 v0, v0, v0, 16 -; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 +; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[0,1,1] ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: v_udot2_shuffle10_a: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_alignbit_b32 v0, v0, v0, 16 -; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 +; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[0,1,1] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_udot2_shuffle10_a: ; GFX10: ; %bb.0: ; 
GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_alignbit_b32 v0, v0, v0, 16 -; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 +; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[0,1,1] ; GFX10-NEXT: s_setpc_b64 s[30:31] %shuf.a = shufflevector <2 x i16> %a, <2 x i16> undef, <2 x i32> <i32 1, i32 0> %r = call i32 @llvm.amdgcn.udot2(<2 x i16> %shuf.a, <2 x i16> %b, i32 %c, i1 false) @@ -327,23 +324,20 @@ ; GFX906-LABEL: v_udot2_shuffle10_b: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_alignbit_b32 v1, v1, v1, 16 -; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 +; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 op_sel:[0,1,0] op_sel_hi:[1,0,1] ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: v_udot2_shuffle10_b: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_alignbit_b32 v1, v1, v1, 16 -; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 +; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 op_sel:[0,1,0] op_sel_hi:[1,0,1] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_udot2_shuffle10_b: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_alignbit_b32 v1, v1, v1, 16 -; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 +; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 op_sel:[0,1,0] op_sel_hi:[1,0,1] ; GFX10-NEXT: s_setpc_b64 s[30:31] %shuf.b = shufflevector <2 x i16> %b, <2 x i16> undef, <2 x i32> <i32 1, i32 0> %r = call i32 @llvm.amdgcn.udot2(<2 x i16> %a, <2 x i16> %shuf.b, i32 %c, i1 false) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/op-sel.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/op-sel.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/op-sel.ll @@ -0,0 +1,854 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s + +define amdgpu_kernel void @fma_vector_vector_scalar_lo_no_shuffle(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 { +; GCN-LABEL: fma_vector_vector_scalar_lo_no_shuffle: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: ds_read_b32 v2, v0 +; GCN-NEXT: ds_read_b32 v0, v0 offset:4 +; GCN-NEXT: ds_read_u16 v1, v1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_pk_fma_f16 v0, v2, v0, v1 op_sel_hi:[1,1,0] +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: global_store_dword v1, v0, s[2:3] +; GCN-NEXT: s_endpgm +bb: + %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1 + + %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4 + %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4 + %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2 + + %scalar0.vec1 = insertelement <2 x half> undef, half %scalar0, i32 0 + %scalar0.vec2 = insertelement <2 x half> %scalar0.vec1, half %scalar0, i32 1 + + %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %scalar0.vec2) + store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @fma_vector_vector_scalar_lo(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half 
addrspace(3)* %arg2) #0 { +; GCN-LABEL: fma_vector_vector_scalar_lo: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: ds_read_b32 v2, v0 +; GCN-NEXT: ds_read_b32 v0, v0 offset:4 +; GCN-NEXT: ds_read_u16 v1, v1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_pk_fma_f16 v0, v2, v0, v1 op_sel_hi:[1,1,0] +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: global_store_dword v1, v0, s[2:3] +; GCN-NEXT: s_endpgm +bb: + %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1 + + %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4 + %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4 + %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2 + + %scalar0.vec = insertelement <2 x half> undef, half %scalar0, i32 0 + %scalar0.broadcast = shufflevector <2 x half> %scalar0.vec, <2 x half> undef, <2 x i32> zeroinitializer + + %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %scalar0.broadcast) + store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @fma_vector_vector_neg_broadcast_scalar_lo(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 { +; GCN-LABEL: fma_vector_vector_neg_broadcast_scalar_lo: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: ds_read_b32 v2, v0 +; GCN-NEXT: ds_read_b32 v0, v0 offset:4 +; GCN-NEXT: ds_read_u16 v1, v1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_pk_fma_f16 v0, v2, v0, v1 op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: global_store_dword v1, v0, s[2:3] +; GCN-NEXT: s_endpgm +bb: + %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1 + + %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4 + %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4 + %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2 + + %scalar0.vec = insertelement <2 x half> undef, half %scalar0, i32 0 + %scalar0.broadcast = shufflevector <2 x half> %scalar0.vec, <2 x half> undef, <2 x i32> zeroinitializer + %neg.scalar0.broadcast = fneg <2 x half> %scalar0.broadcast + + %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.scalar0.broadcast) + store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @fma_vector_vector_neg_scalar_lo(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 { +; GCN-LABEL: fma_vector_vector_neg_scalar_lo: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: ds_read_b32 v2, v0 +; GCN-NEXT: ds_read_b32 v0, v0 offset:4 +; GCN-NEXT: ds_read_u16 v1, v1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_pk_fma_f16 v0, v2, v0, v1 op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: global_store_dword v1, v0, s[2:3] +; 
GCN-NEXT: s_endpgm +bb: + %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1 + + %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4 + %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4 + %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2 + + %neg.scalar0 = fneg half %scalar0 + %neg.scalar0.vec = insertelement <2 x half> undef, half %neg.scalar0, i32 0 + %neg.scalar0.broadcast = shufflevector <2 x half> %neg.scalar0.vec, <2 x half> undef, <2 x i32> zeroinitializer + + %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.scalar0.broadcast) + store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @fma_vector_vector_neg_broadcast_neg_scalar_lo(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 { +; GCN-LABEL: fma_vector_vector_neg_broadcast_neg_scalar_lo: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: ds_read_b32 v2, v0 +; GCN-NEXT: ds_read_b32 v0, v0 offset:4 +; GCN-NEXT: ds_read_u16 v1, v1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_pk_fma_f16 v0, v2, v0, v1 op_sel_hi:[1,1,0] +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: global_store_dword v1, v0, s[2:3] +; GCN-NEXT: s_endpgm +bb: + %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1 + + %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4 + %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4 + %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2 + + %neg.scalar0 = fneg half %scalar0 + %neg.scalar0.vec = insertelement <2 x half> undef, half %neg.scalar0, i32 0 + %neg.scalar0.broadcast = shufflevector <2 x half> %neg.scalar0.vec, <2 x half> undef, <2 x i32> zeroinitializer + %neg.neg.scalar0.broadcast = fneg <2 x half> %neg.scalar0.broadcast + + %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.neg.scalar0.broadcast) + store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @fma_vector_vector_scalar_neg_lo(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 { +; GCN-LABEL: fma_vector_vector_scalar_neg_lo: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: ds_read_b32 v2, v0 +; GCN-NEXT: ds_read_b32 v0, v0 offset:4 +; GCN-NEXT: ds_read_u16 v1, v1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_pk_fma_f16 v0, v2, v0, v1 op_sel_hi:[1,1,0] neg_lo:[0,0,1] +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: global_store_dword v1, v0, s[2:3] +; GCN-NEXT: s_endpgm +bb: + %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1 + + %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4 + %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4 + %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2 + + %neg.scalar0 = fneg half %scalar0 + %neg.scalar0.vec = insertelement <2 x half> undef, half %neg.scalar0, i32 0 + %neg.scalar0.scalar0 = insertelement <2 
x half> %neg.scalar0.vec, half %scalar0, i32 1 + %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.scalar0.scalar0) + store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @fma_vector_vector_scalar_neg_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 { +; GCN-LABEL: fma_vector_vector_scalar_neg_hi: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: ds_read_b32 v2, v0 +; GCN-NEXT: ds_read_b32 v0, v0 offset:4 +; GCN-NEXT: ds_read_u16 v1, v1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_pk_fma_f16 v0, v2, v0, v1 op_sel_hi:[1,1,0] neg_hi:[0,0,1] +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: global_store_dword v1, v0, s[2:3] +; GCN-NEXT: s_endpgm +bb: + %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1 + + %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4 + %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4 + %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2 + + %neg.scalar0 = fneg half %scalar0 + %neg.scalar0.vec = insertelement <2 x half> undef, half %scalar0, i32 0 + %scalar0.neg.scalar0 = insertelement <2 x half> %neg.scalar0.vec, half %neg.scalar0, i32 1 + %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %scalar0.neg.scalar0) + store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @add_vector_neg_bitcast_scalar_lo(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 { +; GCN-LABEL: add_vector_neg_bitcast_scalar_lo: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: ds_read_b32 v0, v0 +; GCN-NEXT: ds_read_u16 v1, v1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_pk_add_u16 v0, v0, v1 op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1] +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: global_store_dword v1, v0, s[2:3] +; GCN-NEXT: s_endpgm +bb: + %vec0 = load volatile <2 x i16>, <2 x i16> addrspace(3)* %lds, align 4 + %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2 + %neg.scalar0 = fneg half %scalar0 + %neg.scalar0.bc = bitcast half %neg.scalar0 to i16 + + %neg.scalar0.vec = insertelement <2 x i16> undef, i16 %neg.scalar0.bc, i32 0 + %neg.scalar0.broadcast = shufflevector <2 x i16> %neg.scalar0.vec, <2 x i16> undef, <2 x i32> zeroinitializer + + %result = add <2 x i16> %vec0, %neg.scalar0.broadcast + store <2 x i16> %result, <2 x i16> addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @fma_vector_vector_neg_vector_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 { +; GCN-LABEL: fma_vector_vector_neg_vector_hi: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: ds_read_b32 v1, v0 +; GCN-NEXT: ds_read_b32 v2, v0 offset:4 +; GCN-NEXT: ds_read_b32 v0, v0 offset:8 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 op_sel:[0,0,1] neg_lo:[0,0,1] 
neg_hi:[0,0,1] +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: global_store_dword v1, v0, s[2:3] +; GCN-NEXT: s_endpgm +bb: + %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1 + %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2 + + %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4 + %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4 + %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4 + + %vec2.fneg = fneg <2 x half> %vec2 + %vec2.fneg.elt1.broadcast = shufflevector <2 x half> %vec2.fneg, <2 x half> undef, <2 x i32> <i32 1, i32 1> + + %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %vec2.fneg.elt1.broadcast) + store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @fma_vector_vector_vector_neg_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 { +; GCN-LABEL: fma_vector_vector_vector_neg_hi: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: ds_read_b32 v1, v0 +; GCN-NEXT: ds_read_b32 v2, v0 offset:4 +; GCN-NEXT: ds_read_b32 v0, v0 offset:8 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 neg_hi:[0,0,1] +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: global_store_dword v1, v0, s[2:3] +; GCN-NEXT: s_endpgm +bb: + %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1 + %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2 + + %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4 + %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4 + %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4 + + %vec2.elt1 = extractelement <2 x half> %vec2, i32 1 + %neg.vec2.elt1 = fneg half %vec2.elt1 + + %neg.vec2.elt1.insert = insertelement <2 x half> %vec2, half %neg.vec2.elt1, i32 1 + %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.vec2.elt1.insert) + store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @add_vector_scalar_hi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(3)* %lds) #0 { +; GCN-LABEL: add_vector_scalar_hi: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: ds_read_b32 v1, v0 +; GCN-NEXT: ds_read_b32 v0, v0 offset:4 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_pk_add_u16 v0, v1, v0 op_sel:[0,1] +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: global_store_dword v1, v0, s[2:3] +; GCN-NEXT: s_endpgm +bb: + %lds.gep1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(3)* %lds, i32 1 + + %vec0 = load volatile <2 x i16>, <2 x i16> addrspace(3)* %lds, align 4 + %vec1 = load volatile <2 x i16>, <2 x i16> addrspace(3)* %lds.gep1, align 4 + + %vec1.elt1.broadcast = shufflevector <2 x i16> %vec1, <2 x i16> undef, <2 x i32> <i32 1, i32 1> + %result = add <2 x i16> %vec0, %vec1.elt1.broadcast + + store <2 x i16> %result, <2 x i16> addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @fma_vector_vector_scalar_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 { +; GCN-LABEL: 
fma_vector_vector_scalar_hi: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: ds_read_b32 v1, v0 +; GCN-NEXT: ds_read_b32 v2, v0 offset:4 +; GCN-NEXT: ds_read_b32 v0, v0 offset:8 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 op_sel:[0,0,1] +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: global_store_dword v1, v0, s[2:3] +; GCN-NEXT: s_endpgm +bb: + %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1 + %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2 + + %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4 + %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4 + %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4 + + %vec2.elt1.broadcast = shufflevector <2 x half> %vec2, <2 x half> undef, <2 x i32> <i32 1, i32 1> + + %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %vec2.elt1.broadcast) + + store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @fma_vector_vector_neg_vector_lo_neg_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 { +; GCN-LABEL: fma_vector_vector_neg_vector_lo_neg_hi: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: ds_read_b32 v1, v0 +; GCN-NEXT: ds_read_b32 v2, v0 offset:4 +; GCN-NEXT: ds_read_b32 v0, v0 offset:8 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: global_store_dword v1, v0, s[2:3] +; GCN-NEXT: s_endpgm +bb: + %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1 + %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2 + + %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4 + %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4 + %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4 + + %neg.vec2 = fneg <2 x half> %vec2 + %neg.vec2.elt1 = extractelement <2 x half> %neg.vec2, i32 1 + %neg.neg.vec2.elt1 = fneg half %neg.vec2.elt1 + %neg.neg.vec2.elt1.insert = insertelement <2 x half> %vec2, half %neg.neg.vec2.elt1, i32 1 + + %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.neg.vec2.elt1.insert) + store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @fma_vector_vector_swap_vector(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 { +; GCN-LABEL: fma_vector_vector_swap_vector: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: ds_read_b32 v1, v0 +; GCN-NEXT: ds_read_b32 v2, v0 offset:4 +; GCN-NEXT: ds_read_b32 v0, v0 offset:8 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 op_sel:[0,0,1] op_sel_hi:[1,1,0] +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: global_store_dword v1, v0, s[2:3] +; GCN-NEXT: s_endpgm +bb: + %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1 + %lds.gep2 = getelementptr inbounds 
+
+  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
+  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
+  %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
+
+  %vec2.swap = shufflevector <2 x half> %vec2, <2 x half> undef, <2 x i32> <i32 1, i32 0>
+  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %vec2.swap)
+
+  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @fma_vector_vector_swap_neg_vector(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
+; GCN-LABEL: fma_vector_vector_swap_neg_vector:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: ds_read_b32 v1, v0
+; GCN-NEXT: ds_read_b32 v2, v0 offset:4
+; GCN-NEXT: ds_read_b32 v0, v0 offset:8
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 op_sel:[0,0,1] op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: global_store_dword v1, v0, s[2:3]
+; GCN-NEXT: s_endpgm
+bb:
+  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
+  %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
+
+  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
+  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
+  %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
+  %neg.vec2 = fneg <2 x half> %vec2
+
+  %neg.vec2.swap = shufflevector <2 x half> %neg.vec2, <2 x half> undef, <2 x i32> <i32 1, i32 0>
+  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.vec2.swap)
+
+  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @fma_vector_vector_blend_vector_neg_vector_0(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
+; GCN-LABEL: fma_vector_vector_blend_vector_neg_vector_0:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: ds_read_b32 v1, v0
+; GCN-NEXT: ds_read_b32 v2, v0 offset:4
+; GCN-NEXT: ds_read_b32 v0, v0 offset:8
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 op_sel:[0,0,1] op_sel_hi:[1,1,0] neg_lo:[0,0,1]
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: global_store_dword v1, v0, s[2:3]
+; GCN-NEXT: s_endpgm
+bb:
+  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
+  %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
+
+  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
+  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
+  %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
+  %neg.vec2 = fneg <2 x half> %vec2
+  %combined = shufflevector <2 x half> %vec2, <2 x half> %neg.vec2, <2 x i32> <i32 3, i32 0>
+  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %combined)
+
+  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @fma_vector_vector_blend_vector_neg_vector_1(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
+; GCN-LABEL: fma_vector_vector_blend_vector_neg_vector_1:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: ds_read_b32 v1, v0
+; GCN-NEXT: ds_read_b32 v2, v0 offset:4
+; GCN-NEXT: ds_read_b32 v0, v0 offset:8
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 neg_lo:[0,0,1]
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: global_store_dword v1, v0, s[2:3]
+; GCN-NEXT: s_endpgm
+bb:
+  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
+  %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
+
+  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
+  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
+  %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
+  %neg.vec2 = fneg <2 x half> %vec2
+  %combined = shufflevector <2 x half> %vec2, <2 x half> %neg.vec2, <2 x i32> <i32 2, i32 1>
+  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %combined)
+
+  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @fma_vector_vector_blend_vector_neg_vector_2(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
+; GCN-LABEL: fma_vector_vector_blend_vector_neg_vector_2:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: ds_read_b32 v1, v0
+; GCN-NEXT: ds_read_b32 v2, v0 offset:4
+; GCN-NEXT: ds_read_b32 v0, v0 offset:8
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 neg_hi:[0,0,1]
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: global_store_dword v1, v0, s[2:3]
+; GCN-NEXT: s_endpgm
+bb:
+  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
+  %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
+
+  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
+  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
+  %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
+  %neg.vec2 = fneg <2 x half> %vec2
+  %combined = shufflevector <2 x half> %vec2, <2 x half> %neg.vec2, <2 x i32> <i32 0, i32 3>
+  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %combined)
+
+  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @fma_vector_vector_blend_vector_neg_vector_3(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
+; GCN-LABEL: fma_vector_vector_blend_vector_neg_vector_3:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: ds_read_b32 v1, v0
+; GCN-NEXT: ds_read_b32 v2, v0 offset:4
+; GCN-NEXT: ds_read_b32 v0, v0 offset:8
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 op_sel:[0,0,1] neg_lo:[0,0,1]
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: global_store_dword v1, v0, s[2:3]
+; GCN-NEXT: s_endpgm
+bb:
+  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
+  %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
+
+  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
+  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
+  %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
+  %neg.vec2 = fneg <2 x half> %vec2
+  %combined = shufflevector <2 x half> %vec2, <2 x half> %neg.vec2, <2 x i32> <i32 3, i32 1>
+  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %combined)
+
+  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @bitcast_fneg_f32(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
+; GCN-LABEL: bitcast_fneg_f32:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: ds_read_b32 v0, v0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: ds_read_b32 v1, v0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
+; GCN-NEXT: v_pk_add_f16 v0, v0, v1
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: global_store_dword v1, v0, s[2:3]
+; GCN-NEXT: s_endpgm
+bb:
+  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
+  %f32 = load volatile float, float addrspace(3)* undef, align 4
+  %neg.f32 = fneg float %f32
+  %bc = bitcast float %neg.f32 to <2 x half>
+  %result = fadd <2 x half> %vec0, %bc
+
+  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @shuffle_bitcast_fneg_f32(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
+; GCN-LABEL: shuffle_bitcast_fneg_f32:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: ds_read_b32 v0, v0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: ds_read_b32 v1, v0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
+; GCN-NEXT: v_pk_add_f16 v0, v0, v1 op_sel:[0,1] op_sel_hi:[1,0]
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: global_store_dword v1, v0, s[2:3]
+; GCN-NEXT: s_endpgm
+bb:
+  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
+
+  %f32 = load volatile float, float addrspace(3)* undef, align 4
+  %neg.f32 = fneg float %f32
+  %bc = bitcast float %neg.f32 to <2 x half>
+  %shuf = shufflevector <2 x half> %bc, <2 x half> undef, <2 x i32> <i32 1, i32 0>
+  %result = fadd <2 x half> %vec0, %shuf
+  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @bitcast_lo_elt_op_sel(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
+; GCN-LABEL: bitcast_lo_elt_op_sel:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: ds_read_b32 v1, v0
+; GCN-NEXT: ds_read_b32 v2, v0 offset:4
+; GCN-NEXT: ds_read_b32 v0, v0 offset:8
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: global_load_ushort v3, v[0:1], off glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_pk_add_f16 v0, v0, 2.0 op_sel_hi:[1,0]
+; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 op_sel:[0,0,1] op_sel_hi:[1,1,0]
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: global_store_dword v1, v0, s[2:3]
+; GCN-NEXT: s_endpgm
+bb:
+  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
+  %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
+
+  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
+  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
+  %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
+
+  %scalar0 = load volatile i16, i16 addrspace(1)* undef
+  %shl = shl i16 %scalar0, 1
+  %shl.bc = bitcast i16 %shl to half
+
+  %fadd = fadd <2 x half> %vec2, <half 2.0, half 2.0>
+  %shuffle = shufflevector <2 x half> %fadd, <2 x half> %vec2, <2 x i32> <i32 1, i32 0>
+
+  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %shuffle)
+  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @mix_elt_types_op_sel(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 {
+; GCN-LABEL: mix_elt_types_op_sel:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: ds_read_b32 v1, v0
+; GCN-NEXT: ds_read_b32 v2, v0 offset:4
+; GCN-NEXT: ds_read_b32 v0, v0 offset:8
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: global_load_ushort v3, v[0:1], off glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: global_load_ushort v3, v[0:1], off glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: ; kill: killed $vgpr0_vgpr1
+; GCN-NEXT: v_pk_add_f16 v0, v0, 2.0 op_sel_hi:[1,0]
+; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 op_sel:[0,0,1] op_sel_hi:[1,1,0]
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: global_store_dword v1, v0, s[2:3]
+; GCN-NEXT: s_endpgm
+bb:
+  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
+  %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2
+
+  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
+  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
+  %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4
+
+  %scalar0 = load volatile i16, i16 addrspace(1)* undef
+  %scalar1 = load volatile half, half addrspace(1)* undef
+  %shl = shl i16 %scalar0, 1
+  %shl.bc = bitcast i16 %shl to half
+
+  %insert0 = insertelement <2 x half> undef, half %shl.bc, i32 0
+
+  %fadd = fadd <2 x half> %vec2, <half 2.0, half 2.0>
+  %insert1 = shufflevector <2 x half> %fadd, <2 x half> %insert0, <2 x i32>
+
+  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %insert1)
+  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @fma_vector_shuffle_bitcast(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, float addrspace(3)* %arg2) #0 {
+; GCN-LABEL: fma_vector_shuffle_bitcast:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: v_mov_b32_e32 v1, s5
+; GCN-NEXT: ds_read_b32 v2, v0
+; GCN-NEXT: ds_read_b32 v0, v0 offset:4
+; GCN-NEXT: ds_read_b32 v1, v1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_pk_fma_f16 v0, v2, v0, v1
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: global_store_dword v1, v0, s[2:3]
+; GCN-NEXT: s_endpgm
+bb:
+  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
+
+  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
+  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
+  %scalar0 = load volatile float, float addrspace(3)* %arg2, align 4
+
+  %scalar0.bitcast = bitcast float %scalar0 to <2 x half>
+  %scalar0.broadcast = shufflevector <2 x half> %scalar0.bitcast, <2 x half> undef, <2 x i32> <i32 0, i32 1>
+
+  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %scalar0.broadcast)
+  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @fma_vector_shuffle_bitcast_lo(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, float addrspace(3)* %arg2) #0 {
+; GCN-LABEL: fma_vector_shuffle_bitcast_lo:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: v_mov_b32_e32 v1, s5
+; GCN-NEXT: ds_read_b32 v2, v0
+; GCN-NEXT: ds_read_b32 v0, v0 offset:4
+; GCN-NEXT: ds_read_b32 v1, v1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_pk_fma_f16 v0, v2, v0, v1 op_sel_hi:[1,1,0]
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: global_store_dword v1, v0, s[2:3]
+; GCN-NEXT: s_endpgm
+bb:
+  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
+
+  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
+  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
+  %scalar0 = load volatile float, float addrspace(3)* %arg2, align 4
+
+  %scalar0.bitcast = bitcast float %scalar0 to <2 x half>
+  %scalar0.broadcast = shufflevector <2 x half> %scalar0.bitcast, <2 x half> undef, <2 x i32> zeroinitializer
+
+  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %scalar0.broadcast)
+  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @fma_vector_shuffle_bitcast_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, float addrspace(3)* %arg2) #0 {
+; GCN-LABEL: fma_vector_shuffle_bitcast_hi:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: v_mov_b32_e32 v1, s5
+; GCN-NEXT: ds_read_b32 v2, v0
+; GCN-NEXT: ds_read_b32 v0, v0 offset:4
+; GCN-NEXT: ds_read_b32 v1, v1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_pk_fma_f16 v0, v2, v0, v1 op_sel:[0,0,1]
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: global_store_dword v1, v0, s[2:3]
+; GCN-NEXT: s_endpgm
+bb:
+  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
+
+  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
+  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
+  %scalar0 = load volatile float, float addrspace(3)* %arg2, align 4
+
+  %scalar0.bitcast = bitcast float %scalar0 to <2 x half>
+  %scalar0.broadcast = shufflevector <2 x half> %scalar0.bitcast, <2 x half> undef, <2 x i32> <i32 1, i32 1>
+
+  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %scalar0.broadcast)
+  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @fma_vector_shuffle_bitcast_swap(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, float addrspace(3)* %arg2) #0 {
+; GCN-LABEL: fma_vector_shuffle_bitcast_swap:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: v_mov_b32_e32 v1, s5
+; GCN-NEXT: ds_read_b32 v2, v0
+; GCN-NEXT: ds_read_b32 v0, v0 offset:4
+; GCN-NEXT: ds_read_b32 v1, v1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_pk_fma_f16 v0, v2, v0, v1 op_sel:[0,0,1] op_sel_hi:[1,1,0]
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: global_store_dword v1, v0, s[2:3]
+; GCN-NEXT: s_endpgm
+bb:
+  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
+
+  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
+  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
+  %scalar0 = load volatile float, float addrspace(3)* %arg2, align 4
+
+  %scalar0.bitcast = bitcast float %scalar0 to <2 x half>
+  %scalar0.broadcast = shufflevector <2 x half> %scalar0.bitcast, <2 x half> undef, <2 x i32> <i32 1, i32 0>
+
+  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %scalar0.broadcast)
+  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+  ret void
+}
+
+declare <2 x half> @llvm.fma.v2f16(<2 x half>, <2 x half>, <2 x half>) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+