Index: llvm/lib/Target/AMDGPU/AMDGPUGISel.td
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -92,6 +92,14 @@
     GIComplexOperandMatcher<s64, "selectMUBUFOffset">,
     GIComplexPatternEquiv<MUBUFOffset>;
 
+def gi_mubuf_addr64_atomic :
+    GIComplexOperandMatcher<s64, "selectMUBUFAddr64Atomic">,
+    GIComplexPatternEquiv<MUBUFAddr64Atomic>;
+
+def gi_mubuf_offset_atomic :
+    GIComplexOperandMatcher<s64, "selectMUBUFOffsetAtomic">,
+    GIComplexPatternEquiv<MUBUFOffsetAtomic>;
+
 // Separate load nodes are defined to glue m0 initialization in
 // SelectionDAG. The GISel selector can just insert m0 initialization
 // directly before selecting a glue-less load, so hide this
Index: llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -196,12 +196,25 @@
 
   MUBUFAddressData parseMUBUFAddress(Register Src) const;
 
+  bool selectMUBUFAddr64Impl(MachineOperand &Root, Register &VAddr,
+                             Register &RSrcReg, Register &SOffset,
+                             int64_t &Offset) const;
+
+  bool selectMUBUFOffsetImpl(MachineOperand &Root, Register &RSrcReg,
+                             Register &SOffset, int64_t &Offset) const;
+
   InstructionSelector::ComplexRendererFns
   selectMUBUFAddr64(MachineOperand &Root) const;
 
   InstructionSelector::ComplexRendererFns
   selectMUBUFOffset(MachineOperand &Root) const;
 
+  InstructionSelector::ComplexRendererFns
+  selectMUBUFOffsetAtomic(MachineOperand &Root) const;
+
+  InstructionSelector::ComplexRendererFns
+  selectMUBUFAddr64Atomic(MachineOperand &Root) const;
+
   void renderTruncImm32(MachineInstrBuilder &MIB, const MachineInstr &MI,
                         int OpIdx = -1) const;
 
Index: llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -2664,27 +2664,22 @@
   ImmOffset = 0;
 }
 
-InstructionSelector::ComplexRendererFns
-AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const {
+bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl(
+    MachineOperand &Root, Register &VAddr, Register &RSrcReg,
+    Register &SOffset, int64_t &Offset) const {
   // FIXME: Predicates should stop this from reaching here.
   // addr64 bit was removed for volcanic islands.
   if (!STI.hasAddr64() || STI.useFlatForGlobal())
-    return {};
+    return false;
 
   MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
   if (!shouldUseAddr64(AddrData))
-    return {};
+    return false;
 
   Register N0 = AddrData.N0;
   Register N2 = AddrData.N2;
   Register N3 = AddrData.N3;
-  int64_t Offset = AddrData.Offset;
-
-  // VGPR pointer
-  Register VAddr;
-
-  // SGPR offset.
-  Register SOffset;
+  Offset = AddrData.Offset;
 
   // Base pointer for the SRD.
   Register SRDPtr;
@@ -2715,8 +2710,40 @@
   }
 
   MachineIRBuilder B(*Root.getParent());
-  Register RSrcReg = buildAddr64RSrc(B, *MRI, TII, SRDPtr);
+  RSrcReg = buildAddr64RSrc(B, *MRI, TII, SRDPtr);
+  splitIllegalMUBUFOffset(B, SOffset, Offset);
+  return true;
+}
+
+bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl(
+    MachineOperand &Root, Register &RSrcReg, Register &SOffset,
+    int64_t &Offset) const {
+  MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
+  if (shouldUseAddr64(AddrData))
+    return false;
+
+  // N0 -> offset, or
+  // (N0 + C1) -> offset
+  Register SRDPtr = AddrData.N0;
+  Offset = AddrData.Offset;
+
+  // TODO: Look through extensions for 32-bit soffset.
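+  // buildOffsetSrc produces an SRD covering the full buffer range
+  // (num_records = -1); splitIllegalMUBUFOffset then moves an offset that
+  // does not fit the 12-bit MUBUF immediate field into SOffset.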
+ MachineIRBuilder B(*Root.getParent()); + + RSrcReg = buildOffsetSrc(B, *MRI, TII, SRDPtr); splitIllegalMUBUFOffset(B, SOffset, Offset); + return true; +} + +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const { + Register VAddr; + Register RSrcReg; + Register SOffset; + int64_t Offset = 0; + + if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset)) + return {}; // FIXME: Use defaulted operands for trailing 0s and remove from the complex // pattern. @@ -2746,21 +2773,12 @@ InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const { - MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg()); - if (shouldUseAddr64(AddrData)) - return {}; - - // N0 -> offset, or - // (N0 + C1) -> offset - Register SRDPtr = AddrData.N0; - int64_t Offset = AddrData.Offset; + Register RSrcReg; Register SOffset; + int64_t Offset = 0; - // TODO: Look through extensions for 32-bit soffset. - MachineIRBuilder B(*Root.getParent()); - - Register RSrcReg = buildOffsetSrc(B, *MRI, TII, SRDPtr); - splitIllegalMUBUFOffset(B, SOffset, Offset); + if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset)) + return {}; return {{ [=](MachineInstrBuilder &MIB) { // rsrc @@ -2781,6 +2799,62 @@ }}; } +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectMUBUFAddr64Atomic(MachineOperand &Root) const { + Register VAddr; + Register RSrcReg; + Register SOffset; + int64_t Offset = 0; + + if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset)) + return {}; + + // FIXME: Use defaulted operands for trailing 0s and remove from the complex + // pattern. + return {{ + [=](MachineInstrBuilder &MIB) { // rsrc + MIB.addReg(RSrcReg); + }, + [=](MachineInstrBuilder &MIB) { // vaddr + MIB.addReg(VAddr); + }, + [=](MachineInstrBuilder &MIB) { // soffset + if (SOffset) + MIB.addReg(SOffset); + else + MIB.addImm(0); + }, + [=](MachineInstrBuilder &MIB) { // offset + MIB.addImm(Offset); + }, + addZeroImm // slc + }}; +} + +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectMUBUFOffsetAtomic(MachineOperand &Root) const { + Register RSrcReg; + Register SOffset; + int64_t Offset = 0; + + if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset)) + return {}; + + return {{ + [=](MachineInstrBuilder &MIB) { // rsrc + MIB.addReg(RSrcReg); + }, + [=](MachineInstrBuilder &MIB) { // soffset + if (SOffset) + MIB.addReg(SOffset); + else + MIB.addImm(0); + }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset + addZeroImm // slc + }}; +} + void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const { Index: llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-add-global.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-add-global.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-add-global.mir @@ -1,6 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -march=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX6 %s -# RUN: llc -march=amdgcn -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX7 %s +# RUN: llc -march=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX6 %s +# RUN: 
llc -march=amdgcn -mcpu=hawaii -mattr=+flat-for-global -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX7 %s # RUN: llc -march=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX7 %s # RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s # RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX10 %s @@ -16,10 +16,15 @@ ; GFX6-LABEL: name: global_atomicrmw_add_s32 ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2 - ; GFX6: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX6: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 - ; GFX6: [[ATOMICRMW_ADD:%[0-9]+]]:vgpr_32(s32) = G_ATOMICRMW_ADD [[COPY]](p1), [[COPY1]] :: (load store seq_cst 4, addrspace 1) - ; GFX6: $vgpr0 = COPY [[ATOMICRMW_ADD]](s32) + ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX6: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 + ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 + ; GFX6: [[BUFFER_ATOMIC_ADD_ADDR64_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_ADDR64_RTN [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) + ; GFX6: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_ADDR64_RTN]] ; GFX7-LABEL: name: global_atomicrmw_add_s32 ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 @@ -57,9 +62,14 @@ ; GFX6-LABEL: name: global_atomicrmw_add_s32_nortn ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2 - ; GFX6: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX6: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 - ; GFX6: [[ATOMICRMW_ADD:%[0-9]+]]:vgpr(s32) = G_ATOMICRMW_ADD [[COPY]](p1), [[COPY1]] :: (load store seq_cst 4, addrspace 1) + ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX6: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 + ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 + ; GFX6: [[BUFFER_ATOMIC_ADD_ADDR64_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_ADDR64_RTN [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) ; GFX7-LABEL: name: global_atomicrmw_add_s32_nortn ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 @@ -93,12 +103,15 @@ ; GFX6-LABEL: name: global_atomicrmw_add_s32_offset2047 ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2 - ; GFX6: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX6: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 - ; GFX6: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 2047 - ; GFX6: [[PTR_ADD:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX6: [[ATOMICRMW_ADD:%[0-9]+]]:vgpr_32(s32) = G_ATOMICRMW_ADD [[PTR_ADD]](p1), [[COPY1]] :: (load store seq_cst 4, addrspace 1) - ; GFX6: $vgpr0 = COPY 
[[ATOMICRMW_ADD]](s32) + ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX6: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 + ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 + ; GFX6: [[BUFFER_ATOMIC_ADD_ADDR64_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_ADDR64_RTN [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 2047, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) + ; GFX6: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_ADDR64_RTN]] ; GFX7-LABEL: name: global_atomicrmw_add_s32_offset2047 ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 @@ -148,11 +161,14 @@ ; GFX6-LABEL: name: global_atomicrmw_add_s32_offset2047_nortn ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2 - ; GFX6: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX6: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 - ; GFX6: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 2047 - ; GFX6: [[PTR_ADD:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX6: [[ATOMICRMW_ADD:%[0-9]+]]:vgpr(s32) = G_ATOMICRMW_ADD [[PTR_ADD]](p1), [[COPY1]] :: (load store seq_cst 4, addrspace 1) + ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX6: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 + ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 + ; GFX6: [[BUFFER_ATOMIC_ADD_ADDR64_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_ADDR64_RTN [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 2047, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) ; GFX7-LABEL: name: global_atomicrmw_add_s32_offset2047_nortn ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 @@ -198,12 +214,15 @@ ; GFX6-LABEL: name: global_atomicrmw_add_s32_offset2048 ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2 - ; GFX6: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX6: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 - ; GFX6: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 2048 - ; GFX6: [[PTR_ADD:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX6: [[ATOMICRMW_ADD:%[0-9]+]]:vgpr_32(s32) = G_ATOMICRMW_ADD [[PTR_ADD]](p1), [[COPY1]] :: (load store seq_cst 4, addrspace 1) - ; GFX6: $vgpr0 = COPY [[ATOMICRMW_ADD]](s32) + ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX6: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 + ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 + ; GFX6: [[BUFFER_ATOMIC_ADD_ADDR64_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_ADDR64_RTN [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 2048, 0, implicit $exec :: (load store seq_cst 4, 
addrspace 1) + ; GFX6: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_ADDR64_RTN]] ; GFX7-LABEL: name: global_atomicrmw_add_s32_offset2048 ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 @@ -263,11 +282,14 @@ ; GFX6-LABEL: name: global_atomicrmw_add_s32_offset2048_nortn ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2 - ; GFX6: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX6: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 - ; GFX6: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 2048 - ; GFX6: [[PTR_ADD:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX6: [[ATOMICRMW_ADD:%[0-9]+]]:vgpr(s32) = G_ATOMICRMW_ADD [[PTR_ADD]](p1), [[COPY1]] :: (load store seq_cst 4, addrspace 1) + ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX6: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 + ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 + ; GFX6: [[BUFFER_ATOMIC_ADD_ADDR64_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_ADDR64_RTN [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 2048, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) ; GFX7-LABEL: name: global_atomicrmw_add_s32_offset2048_nortn ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 @@ -323,12 +345,15 @@ ; GFX6-LABEL: name: global_atomicrmw_add_s32_offset4095 ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2 - ; GFX6: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX6: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 - ; GFX6: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 4095 - ; GFX6: [[PTR_ADD:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX6: [[ATOMICRMW_ADD:%[0-9]+]]:vgpr_32(s32) = G_ATOMICRMW_ADD [[PTR_ADD]](p1), [[COPY1]] :: (load store seq_cst 4, addrspace 1) - ; GFX6: $vgpr0 = COPY [[ATOMICRMW_ADD]](s32) + ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX6: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 + ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 + ; GFX6: [[BUFFER_ATOMIC_ADD_ADDR64_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_ADDR64_RTN [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 4095, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) + ; GFX6: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_ADDR64_RTN]] ; GFX7-LABEL: name: global_atomicrmw_add_s32_offset4095 ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 @@ -388,11 +413,14 @@ ; GFX6-LABEL: name: global_atomicrmw_add_s32_offset4095_nortn ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2 - ; GFX6: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX6: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 - ; GFX6: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 4095 - ; GFX6: [[PTR_ADD:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX6: [[ATOMICRMW_ADD:%[0-9]+]]:vgpr(s32) = G_ATOMICRMW_ADD [[PTR_ADD]](p1), [[COPY1]] :: (load store seq_cst 4, addrspace 1) + ; GFX6: 
[[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX6: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 + ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 + ; GFX6: [[BUFFER_ATOMIC_ADD_ADDR64_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_ADDR64_RTN [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 4095, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) ; GFX7-LABEL: name: global_atomicrmw_add_s32_offset4095_nortn ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 @@ -448,12 +476,16 @@ ; GFX6-LABEL: name: global_atomicrmw_add_s32_offset4097 ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2 - ; GFX6: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX6: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 - ; GFX6: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 4097 - ; GFX6: [[PTR_ADD:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX6: [[ATOMICRMW_ADD:%[0-9]+]]:vgpr_32(s32) = G_ATOMICRMW_ADD [[PTR_ADD]](p1), [[COPY1]] :: (load store seq_cst 4, addrspace 1) - ; GFX6: $vgpr0 = COPY [[ATOMICRMW_ADD]](s32) + ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX6: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 + ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 + ; GFX6: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 4097 + ; GFX6: [[BUFFER_ATOMIC_ADD_ADDR64_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_ADDR64_RTN [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) + ; GFX6: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_ADDR64_RTN]] ; GFX7-LABEL: name: global_atomicrmw_add_s32_offset4097 ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 @@ -523,11 +555,15 @@ ; GFX6-LABEL: name: global_atomicrmw_add_s32_offset4097_nortn ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2 - ; GFX6: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX6: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 - ; GFX6: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 4097 - ; GFX6: [[PTR_ADD:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX6: [[ATOMICRMW_ADD:%[0-9]+]]:vgpr(s32) = G_ATOMICRMW_ADD [[PTR_ADD]](p1), [[COPY1]] :: (load store seq_cst 4, addrspace 1) + ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX6: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 + ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 + ; GFX6: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 4097 + ; GFX6: [[BUFFER_ATOMIC_ADD_ADDR64_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_ADDR64_RTN 
[[COPY1]], [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) ; GFX7-LABEL: name: global_atomicrmw_add_s32_offset4097_nortn ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 @@ -593,10 +629,15 @@ ; GFX6-LABEL: name: global_atomicrmw_add_s64 ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 - ; GFX6: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX6: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr2_vgpr3 - ; GFX6: [[ATOMICRMW_ADD:%[0-9]+]]:vreg_64(s64) = G_ATOMICRMW_ADD [[COPY]](p1), [[COPY1]] :: (load store seq_cst 8, addrspace 1) - ; GFX6: $vgpr0_vgpr1 = COPY [[ATOMICRMW_ADD]](s64) + ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX6: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 + ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX6: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 + ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 + ; GFX6: [[BUFFER_ATOMIC_ADD_X2_ADDR64_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_ADD_X2_ADDR64_RTN [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, implicit $exec :: (load store seq_cst 8, addrspace 1) + ; GFX6: $vgpr0_vgpr1 = COPY [[BUFFER_ATOMIC_ADD_X2_ADDR64_RTN]] ; GFX7-LABEL: name: global_atomicrmw_add_s64 ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 @@ -634,9 +675,14 @@ ; GFX6-LABEL: name: global_atomicrmw_add_s64_nortn ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 - ; GFX6: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX6: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr2_vgpr3 - ; GFX6: [[ATOMICRMW_ADD:%[0-9]+]]:vgpr(s64) = G_ATOMICRMW_ADD [[COPY]](p1), [[COPY1]] :: (load store seq_cst 8, addrspace 1) + ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX6: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 + ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX6: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 + ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 + ; GFX6: [[BUFFER_ATOMIC_ADD_X2_ADDR64_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_ADD_X2_ADDR64_RTN [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, implicit $exec :: (load store seq_cst 8, addrspace 1) ; GFX7-LABEL: name: global_atomicrmw_add_s64_nortn ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 @@ -670,12 +716,15 @@ ; GFX6-LABEL: name: global_atomicrmw_add_s64_offset4095 ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 - ; GFX6: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX6: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr2_vgpr3 - ; GFX6: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 4095 - ; GFX6: [[PTR_ADD:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX6: [[ATOMICRMW_ADD:%[0-9]+]]:vreg_64(s64) = G_ATOMICRMW_ADD [[PTR_ADD]](p1), [[COPY1]] :: (load store seq_cst 8, addrspace 1) - ; GFX6: $vgpr0_vgpr1 = COPY [[ATOMICRMW_ADD]](s64) + ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX6: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 + ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = 
S_MOV_B32 0 + ; GFX6: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 + ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 + ; GFX6: [[BUFFER_ATOMIC_ADD_X2_ADDR64_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_ADD_X2_ADDR64_RTN [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 4095, 0, implicit $exec :: (load store seq_cst 8, addrspace 1) + ; GFX6: $vgpr0_vgpr1 = COPY [[BUFFER_ATOMIC_ADD_X2_ADDR64_RTN]] ; GFX7-LABEL: name: global_atomicrmw_add_s64_offset4095 ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 @@ -735,11 +784,14 @@ ; GFX6-LABEL: name: global_atomicrmw_add_s64_offset4095_nortn ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 - ; GFX6: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX6: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr2_vgpr3 - ; GFX6: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 4095 - ; GFX6: [[PTR_ADD:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX6: [[ATOMICRMW_ADD:%[0-9]+]]:vgpr(s64) = G_ATOMICRMW_ADD [[PTR_ADD]](p1), [[COPY1]] :: (load store seq_cst 8, addrspace 1) + ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX6: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 + ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX6: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 + ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 + ; GFX6: [[BUFFER_ATOMIC_ADD_X2_ADDR64_RTN:%[0-9]+]]:vreg_64 = BUFFER_ATOMIC_ADD_X2_ADDR64_RTN [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 4095, 0, implicit $exec :: (load store seq_cst 8, addrspace 1) ; GFX7-LABEL: name: global_atomicrmw_add_s64_offset4095_nortn ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll @@ -1089,9 +1089,195 @@ ret float %val } -; define amdgpu_ps float @mubuf_atomicrmw_sgpr_ptr_offset4095(i32 addrspace(1)* inreg %ptr) { -; %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4095 -; %result = atomicrmw add i32 addrspace(1)* %gep, i32 2 seq_cst -; %cast = bitcast i32 %result to float -; ret float %cast -; } +define amdgpu_ps float @mubuf_atomicrmw_sgpr_ptr_offset4095(i32 addrspace(1)* inreg %ptr) { +; GFX6-LABEL: mubuf_atomicrmw_sgpr_ptr_offset4095: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_mov_b32 s0, s2 +; GFX6-NEXT: s_mov_b32 s1, s3 +; GFX6-NEXT: v_mov_b32_e32 v0, 2 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_movk_i32 s4, 0x3ffc +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_add v0, off, s[0:3], s4 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: ; return to shader part epilog +; +; GFX7-LABEL: mubuf_atomicrmw_sgpr_ptr_offset4095: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s0, s2 +; GFX7-NEXT: s_mov_b32 s1, s3 +; GFX7-NEXT: v_mov_b32_e32 v0, 2 +; GFX7-NEXT: 
s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_movk_i32 s4, 0x3ffc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_atomic_add v0, off, s[0:3], s4 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: ; return to shader part epilog + %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4095 + %result = atomicrmw add i32 addrspace(1)* %gep, i32 2 seq_cst + %cast = bitcast i32 %result to float + ret float %cast +} + +define amdgpu_ps float @mubuf_atomicrmw_sgpr_ptr_offset4294967296(i32 addrspace(1)* inreg %ptr) { +; GFX6-LABEL: mubuf_atomicrmw_sgpr_ptr_offset4294967296: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_mov_b32 s4, 0 +; GFX6-NEXT: s_mov_b32 s5, 4 +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_mov_b32 s0, s2 +; GFX6-NEXT: s_mov_b32 s1, s3 +; GFX6-NEXT: v_mov_b32_e32 v0, 2 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, s4 +; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_add v0, v[1:2], s[0:3], 0 addr64 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: ; return to shader part epilog +; +; GFX7-LABEL: mubuf_atomicrmw_sgpr_ptr_offset4294967296: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s4, 0 +; GFX7-NEXT: s_mov_b32 s5, 4 +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_mov_b32 s0, s2 +; GFX7-NEXT: s_mov_b32 s1, s3 +; GFX7-NEXT: v_mov_b32_e32 v0, 2 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s2, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_atomic_add v0, v[1:2], s[0:3], 0 addr64 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: ; return to shader part epilog + %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4294967296 + %result = atomicrmw add i32 addrspace(1)* %gep, i32 2 seq_cst + %cast = bitcast i32 %result to float + ret float %cast +} + +define amdgpu_ps float @mubuf_atomicrmw_vgpr_ptr_offset4095(i32 addrspace(1)* %ptr) { +; GFX6-LABEL: mubuf_atomicrmw_vgpr_ptr_offset4095: +; GFX6: ; %bb.0: +; GFX6-NEXT: v_mov_b32_e32 v2, 2 +; GFX6-NEXT: s_mov_b32 s2, 0 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b64 s[0:1], 0 +; GFX6-NEXT: s_movk_i32 s4, 0x3ffc +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_add v2, v[0:1], s[0:3], s4 addr64 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_mov_b32_e32 v0, v2 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: ; return to shader part epilog +; +; GFX7-LABEL: mubuf_atomicrmw_vgpr_ptr_offset4095: +; GFX7: ; %bb.0: +; GFX7-NEXT: v_mov_b32_e32 v2, 2 +; GFX7-NEXT: s_mov_b32 s2, 0 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: s_movk_i32 s4, 0x3ffc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_atomic_add v2, v[0:1], s[0:3], s4 addr64 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_mov_b32_e32 v0, v2 +; GFX7-NEXT: ; return to shader part epilog + %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4095 + %result = atomicrmw add i32 addrspace(1)* %gep, i32 2 seq_cst + %cast = bitcast i32 %result to float + ret float %cast +} + +define amdgpu_ps float @mubuf_atomicrmw_vgpr_ptr_offset4294967296(i32 addrspace(1)* %ptr) { +; GFX6-LABEL: mubuf_atomicrmw_vgpr_ptr_offset4294967296: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_mov_b32 s0, 0 +; GFX6-NEXT: s_mov_b32 s1, 
4 +; GFX6-NEXT: v_mov_b32_e32 v2, 2 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, s0 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_add v2, v[0:1], s[0:3], 0 addr64 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_mov_b32_e32 v0, v2 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: ; return to shader part epilog +; +; GFX7-LABEL: mubuf_atomicrmw_vgpr_ptr_offset4294967296: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s0, 0 +; GFX7-NEXT: s_mov_b32 s1, 4 +; GFX7-NEXT: v_mov_b32_e32 v2, 2 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s2, s0 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_atomic_add v2, v[0:1], s[0:3], 0 addr64 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_mov_b32_e32 v0, v2 +; GFX7-NEXT: ; return to shader part epilog + %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4294967296 + %result = atomicrmw add i32 addrspace(1)* %gep, i32 2 seq_cst + %cast = bitcast i32 %result to float + ret float %cast +} + +define amdgpu_ps float @mubuf_atomicrmw_sgpr_ptr_vgpr_offset(i32 addrspace(1)* inreg %ptr, i32 %voffset) { +; GFX6-LABEL: mubuf_atomicrmw_sgpr_ptr_vgpr_offset: +; GFX6: ; %bb.0: +; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX6-NEXT: v_mul_lo_u32 v3, 4, v1 +; GFX6-NEXT: v_mul_lo_u32 v2, 0, v0 +; GFX6-NEXT: v_mul_hi_u32 v4, 4, v0 +; GFX6-NEXT: v_mul_lo_u32 v1, 4, v0 +; GFX6-NEXT: s_mov_b32 s0, s2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GFX6-NEXT: s_mov_b32 s1, s3 +; GFX6-NEXT: v_mov_b32_e32 v0, 2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GFX6-NEXT: s_mov_b32 s2, 0 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_add v0, v[1:2], s[0:3], 0 addr64 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: ; return to shader part epilog +; +; GFX7-LABEL: mubuf_atomicrmw_sgpr_ptr_vgpr_offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX7-NEXT: v_mul_lo_u32 v3, 4, v1 +; GFX7-NEXT: v_mul_lo_u32 v2, 0, v0 +; GFX7-NEXT: v_mul_hi_u32 v4, 4, v0 +; GFX7-NEXT: v_mul_lo_u32 v1, 4, v0 +; GFX7-NEXT: s_mov_b32 s0, s2 +; GFX7-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GFX7-NEXT: s_mov_b32 s1, s3 +; GFX7-NEXT: v_mov_b32_e32 v0, 2 +; GFX7-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GFX7-NEXT: s_mov_b32 s2, 0 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_atomic_add v0, v[1:2], s[0:3], 0 addr64 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: ; return to shader part epilog + %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 %voffset + %result = atomicrmw add i32 addrspace(1)* %gep, i32 2 seq_cst + %cast = bitcast i32 %result to float + ret float %cast +}
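
A minimal standalone reproducer for the new selection path (a sketch: the
exact llc flags are assumed to match the lit tests, and the function name is
illustrative):

  ; llc -global-isel -march=amdgcn -mcpu=tahiti -verify-machineinstrs -o - example.ll
  define amdgpu_ps float @mubuf_atomicrmw_example(i32 addrspace(1)* %ptr) {
    %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 16
    %result = atomicrmw add i32 addrspace(1)* %gep, i32 2 seq_cst
    %cast = bitcast i32 %result to float
    ret float %cast
  }

On SI this should now select through selectMUBUFAddr64Atomic to a
buffer_atomic_add with the addr64 addressing mode, glc set, and the 64-byte
offset folded into the MUBUF immediate, instead of failing GlobalISel
instruction selection (which is why -global-isel-abort=0 could be dropped
from the tahiti RUN line above).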