Index: lib/Target/AMDGPU/AMDGPUGISel.td =================================================================== --- lib/Target/AMDGPU/AMDGPUGISel.td +++ lib/Target/AMDGPU/AMDGPUGISel.td @@ -74,6 +74,7 @@ // directly before before selecting a glue-less load, so hide this // distinction. def : GINodeEquiv; +def : GINodeEquiv; Index: lib/Target/AMDGPU/AMDGPUInstructionSelector.h =================================================================== --- lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -90,9 +90,11 @@ void getAddrModeInfo(const MachineInstr &Load, const MachineRegisterInfo &MRI, SmallVectorImpl &AddrInfo) const; bool selectSMRD(MachineInstr &I, ArrayRef AddrInfo) const; + + void initM0(MachineInstr &I) const; bool selectG_LOAD(MachineInstr &I, CodeGenCoverage &CoverageInfo) const; + bool selectG_STORE(MachineInstr &I, CodeGenCoverage &CoverageInfo) const; bool selectG_SELECT(MachineInstr &I) const; - bool selectG_STORE(MachineInstr &I) const; bool selectG_BRCOND(MachineInstr &I) const; bool selectG_FRAME_INDEX(MachineInstr &I) const; Index: lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -840,17 +840,22 @@ return Ret; } -bool AMDGPUInstructionSelector::selectG_STORE(MachineInstr &I) const { +bool AMDGPUInstructionSelector::selectG_STORE( + MachineInstr &I, CodeGenCoverage &CoverageInfo) const { MachineBasicBlock *BB = I.getParent(); MachineFunction *MF = BB->getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); - DebugLoc DL = I.getDebugLoc(); - unsigned PtrSize = RBI.getSizeInBits(I.getOperand(1).getReg(), MRI, TRI); - if (PtrSize != 64) { - LLVM_DEBUG(dbgs() << "Unhandled address space\n"); - return false; + const DebugLoc &DL = I.getDebugLoc(); + + LLT PtrTy = MRI.getType(I.getOperand(1).getReg()); + if (PtrTy.getSizeInBits() != 64) { + initM0(I); + return selectImpl(I, CoverageInfo); } + if (selectImpl(I, CoverageInfo)) + return true; + unsigned StoreSize = RBI.getSizeInBits(I.getOperand(0).getReg(), MRI, TRI); unsigned Opcode; @@ -1236,8 +1241,7 @@ return false; } -bool AMDGPUInstructionSelector::selectG_LOAD(MachineInstr &I, - CodeGenCoverage &CoverageInfo) const { +void AMDGPUInstructionSelector::initM0(MachineInstr &I) const { MachineBasicBlock *BB = I.getParent(); MachineFunction *MF = BB->getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); @@ -1250,7 +1254,11 @@ BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0) .addImm(-1); } +} +bool AMDGPUInstructionSelector::selectG_LOAD(MachineInstr &I, + CodeGenCoverage &CoverageInfo) const { + initM0(I); return selectImpl(I, CoverageInfo); } @@ -1370,12 +1378,11 @@ return selectImpl(I, CoverageInfo); case TargetOpcode::G_LOAD: return selectG_LOAD(I, CoverageInfo); + case TargetOpcode::G_SELECT: return selectG_SELECT(I); case TargetOpcode::G_STORE: - if (selectImpl(I, CoverageInfo)) - return true; - return selectG_STORE(I); + return selectG_STORE(I, CoverageInfo); case TargetOpcode::G_TRUNC: return selectG_TRUNC(I); case TargetOpcode::G_SEXT: Index: lib/Target/AMDGPU/AMDGPUInstructions.td =================================================================== --- lib/Target/AMDGPU/AMDGPUInstructions.td +++ lib/Target/AMDGPU/AMDGPUInstructions.td @@ -493,11 +493,13 @@ def store_align8_local: PatFrag<(ops node:$val, node:$ptr), (store_local node:$val, node:$ptr)>, Aligned<8> { let IsStore = 1; - + let IsTruncStore = 0; } + def store_align16_local: PatFrag<(ops node:$val, node:$ptr), (store_local node:$val, node:$ptr)>, Aligned<16> { let IsStore = 1; + let IsTruncStore = 0; } Index: lib/Target/AMDGPU/DSInstructions.td =================================================================== --- lib/Target/AMDGPU/DSInstructions.td +++ lib/Target/AMDGPU/DSInstructions.td @@ -668,7 +668,7 @@ class DSWritePat : GCNPat < (frag vt:$value, (DS1Addr1Offset i32:$ptr, i16:$offset)), - (inst $ptr, $value, offset:$offset, (i1 gds)) + (inst $ptr, getVregSrcForVT.ret:$value, offset:$offset, (i1 gds)) >; multiclass DSWritePat_mc { Index: lib/Target/AMDGPU/SIInstrInfo.td =================================================================== --- lib/Target/AMDGPU/SIInstrInfo.td +++ lib/Target/AMDGPU/SIInstrInfo.td @@ -475,13 +475,34 @@ def store_glue_align16 : PatFrag<(ops node:$val, node:$ptr), (store_glue node:$val, node:$ptr)>, Aligned<16>; -def store_local_m0 : StoreFrag, LocalAddress; -def truncstorei8_local_m0 : StoreFrag, LocalAddress; -def truncstorei16_local_m0 : StoreFrag, LocalAddress; +def store_local_m0 : PatFrag<(ops node:$val, node:$ptr), + (unindexedstore_glue node:$val, node:$ptr)> { + let IsStore = 1; + let IsTruncStore = 0; +} + +def truncstorei8_local_m0 : PatFrag<(ops node:$val, node:$ptr), + (unindexedstore_glue node:$val, node:$ptr)> { + let IsStore = 1; + let MemoryVT = i8; +} + + +def truncstorei16_local_m0 : PatFrag<(ops node:$val, node:$ptr), + (unindexedstore_glue node:$val, node:$ptr)> { + let IsStore = 1; + let MemoryVT = i16; +} + +// FIXME: atomic store doesn't work. def atomic_store_local_m0 : StoreFrag, LocalAddress; +def store_align8_local_m0 : StoreFrag, LocalAddress { + let IsTruncStore = 0; +} -def store_align8_local_m0 : StoreFrag, LocalAddress; -def store_align16_local_m0 : StoreFrag, LocalAddress; +def store_align16_local_m0 : StoreFrag, LocalAddress { + let IsTruncStore = 0; +} } def si_setcc_uniform : PatFrag < Index: test/CodeGen/AMDGPU/GlobalISel/inst-select-store-local.mir =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/GlobalISel/inst-select-store-local.mir @@ -0,0 +1,262 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX6 %s +# RUN: llc -march=amdgcn -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX7 %s +# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX7 %s +# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s +# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s + +--- + +name: store_local_s32_to_4 +legalized: true +regBankSelected: true +tracksRegLiveness: true +machineFunctionInfo: + scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 + scratchWaveOffsetReg: $sgpr4 + stackPtrOffsetReg: $sgpr32 + +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + ; GFX6-LABEL: name: store_local_s32_to_4 + ; GFX6: liveins: $vgpr0, $vgpr1 + ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX6: $m0 = S_MOV_B32 -1 + ; GFX6: DS_WRITE_B32 [[COPY1]], [[COPY]], 0, 0, implicit $m0, implicit $exec :: (store 4, addrspace 3) + ; GFX7-LABEL: name: store_local_s32_to_4 + ; GFX7: liveins: $vgpr0, $vgpr1 + ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX7: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: DS_WRITE_B32 [[COPY1]], [[COPY]], 0, 0, implicit $m0, implicit $exec :: (store 4, addrspace 3) + ; GFX9-LABEL: name: store_local_s32_to_4 + ; GFX9: liveins: $vgpr0, $vgpr1 + ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX9: DS_WRITE_B32_gfx9 [[COPY1]], [[COPY]], 0, 0, implicit $exec :: (store 4, addrspace 3) + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(p3) = COPY $vgpr1 + G_STORE %0, %1 :: (store 4, align 4, addrspace 3) + +... + +--- + +name: store_local_s32_to_2 +legalized: true +regBankSelected: true +tracksRegLiveness: true +machineFunctionInfo: + scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 + scratchWaveOffsetReg: $sgpr4 + stackPtrOffsetReg: $sgpr32 + +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + ; GFX6-LABEL: name: store_local_s32_to_2 + ; GFX6: liveins: $vgpr0, $vgpr1 + ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX6: $m0 = S_MOV_B32 -1 + ; GFX6: DS_WRITE_B16 [[COPY1]], [[COPY]], 0, 0, implicit $m0, implicit $exec :: (store 2, addrspace 3) + ; GFX7-LABEL: name: store_local_s32_to_2 + ; GFX7: liveins: $vgpr0, $vgpr1 + ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX7: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: DS_WRITE_B16 [[COPY1]], [[COPY]], 0, 0, implicit $m0, implicit $exec :: (store 2, addrspace 3) + ; GFX9-LABEL: name: store_local_s32_to_2 + ; GFX9: liveins: $vgpr0, $vgpr1 + ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX9: DS_WRITE_B16_gfx9 [[COPY1]], [[COPY]], 0, 0, implicit $exec :: (store 2, addrspace 3) + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(p3) = COPY $vgpr1 + G_STORE %0, %1 :: (store 2, align 2, addrspace 3) + +... + +--- + +name: store_local_s32_to_1 +legalized: true +regBankSelected: true +tracksRegLiveness: true +machineFunctionInfo: + scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 + scratchWaveOffsetReg: $sgpr4 + stackPtrOffsetReg: $sgpr32 + +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + ; GFX6-LABEL: name: store_local_s32_to_1 + ; GFX6: liveins: $vgpr0, $vgpr1 + ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX6: $m0 = S_MOV_B32 -1 + ; GFX6: DS_WRITE_B8 [[COPY1]], [[COPY]], 0, 0, implicit $m0, implicit $exec :: (store 1, addrspace 3) + ; GFX7-LABEL: name: store_local_s32_to_1 + ; GFX7: liveins: $vgpr0, $vgpr1 + ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX7: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: DS_WRITE_B8 [[COPY1]], [[COPY]], 0, 0, implicit $m0, implicit $exec :: (store 1, addrspace 3) + ; GFX9-LABEL: name: store_local_s32_to_1 + ; GFX9: liveins: $vgpr0, $vgpr1 + ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX9: DS_WRITE_B8_gfx9 [[COPY1]], [[COPY]], 0, 0, implicit $exec :: (store 1, addrspace 3) + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(p3) = COPY $vgpr1 + G_STORE %0, %1 :: (store 1, align 1, addrspace 3) + +... + +--- + +name: store_local_v2s16 +legalized: true +regBankSelected: true +tracksRegLiveness: true +machineFunctionInfo: + scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 + scratchWaveOffsetReg: $sgpr4 + stackPtrOffsetReg: $sgpr32 + +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + ; GFX6-LABEL: name: store_local_v2s16 + ; GFX6: liveins: $vgpr0, $vgpr1 + ; GFX6: [[COPY:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY $vgpr1 + ; GFX6: $m0 = S_MOV_B32 -1 + ; GFX6: G_STORE [[COPY]](<2 x s16>), [[COPY1]](p3) :: (store 4, addrspace 3) + ; GFX7-LABEL: name: store_local_v2s16 + ; GFX7: liveins: $vgpr0, $vgpr1 + ; GFX7: [[COPY:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0 + ; GFX7: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY $vgpr1 + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: G_STORE [[COPY]](<2 x s16>), [[COPY1]](p3) :: (store 4, addrspace 3) + ; GFX9-LABEL: name: store_local_v2s16 + ; GFX9: liveins: $vgpr0, $vgpr1 + ; GFX9: [[COPY:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY $vgpr1 + ; GFX9: G_STORE [[COPY]](<2 x s16>), [[COPY1]](p3) :: (store 4, addrspace 3) + %0:vgpr(<2 x s16>) = COPY $vgpr0 + %1:vgpr(p3) = COPY $vgpr1 + G_STORE %0, %1 :: (store 4, align 4, addrspace 3) + +... + +--- + +name: store_local_p3 +legalized: true +regBankSelected: true +tracksRegLiveness: true +machineFunctionInfo: + scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 + scratchWaveOffsetReg: $sgpr4 + stackPtrOffsetReg: $sgpr32 + +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + ; GFX6-LABEL: name: store_local_p3 + ; GFX6: liveins: $vgpr0, $vgpr1 + ; GFX6: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY $vgpr1 + ; GFX6: $m0 = S_MOV_B32 -1 + ; GFX6: G_STORE [[COPY]](p3), [[COPY1]](p3) :: (store 4, addrspace 3) + ; GFX7-LABEL: name: store_local_p3 + ; GFX7: liveins: $vgpr0, $vgpr1 + ; GFX7: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0 + ; GFX7: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY $vgpr1 + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: G_STORE [[COPY]](p3), [[COPY1]](p3) :: (store 4, addrspace 3) + ; GFX9-LABEL: name: store_local_p3 + ; GFX9: liveins: $vgpr0, $vgpr1 + ; GFX9: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY $vgpr1 + ; GFX9: G_STORE [[COPY]](p3), [[COPY1]](p3) :: (store 4, addrspace 3) + %0:vgpr(p3) = COPY $vgpr0 + %1:vgpr(p3) = COPY $vgpr1 + G_STORE %0, %1 :: (store 4, align 4, addrspace 3) + +... + +--- + +name: store_local_s32_to_1_constant_4095 +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + + ; GFX6-LABEL: name: store_local_s32_to_1_constant_4095 + ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec + ; GFX6: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX6: $m0 = S_MOV_B32 -1 + ; GFX6: DS_WRITE_B8 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], 0, 0, implicit $m0, implicit $exec :: (store 1, addrspace 3) + ; GFX7-LABEL: name: store_local_s32_to_1_constant_4095 + ; GFX7: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec + ; GFX7: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: DS_WRITE_B8 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], 0, 0, implicit $m0, implicit $exec :: (store 1, addrspace 3) + ; GFX9-LABEL: name: store_local_s32_to_1_constant_4095 + ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec + ; GFX9: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX9: DS_WRITE_B8_gfx9 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], 0, 0, implicit $exec :: (store 1, addrspace 3) + %0:vgpr(p3) = G_CONSTANT i32 4095 + %1:vgpr(s32) = G_CONSTANT i32 0 + G_STORE %1, %0 :: (store 1, align 1, addrspace 3) + +... + +--- + +name: store_local_s32_to_1_constant_4096 +legalized: true +regBankSelected: true +tracksRegLiveness: true +machineFunctionInfo: + scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 + scratchWaveOffsetReg: $sgpr4 + stackPtrOffsetReg: $sgpr32 +stack: + - { id: 0, size: 4096, alignment: 4 } + +body: | + bb.0: + + ; GFX6-LABEL: name: store_local_s32_to_1_constant_4096 + ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec + ; GFX6: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX6: $m0 = S_MOV_B32 -1 + ; GFX6: DS_WRITE_B8 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], 0, 0, implicit $m0, implicit $exec :: (store 1, addrspace 3) + ; GFX7-LABEL: name: store_local_s32_to_1_constant_4096 + ; GFX7: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec + ; GFX7: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: DS_WRITE_B8 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], 0, 0, implicit $m0, implicit $exec :: (store 1, addrspace 3) + ; GFX9-LABEL: name: store_local_s32_to_1_constant_4096 + ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec + ; GFX9: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX9: DS_WRITE_B8_gfx9 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], 0, 0, implicit $exec :: (store 1, addrspace 3) + %0:vgpr(p3) = G_CONSTANT i32 4096 + %1:vgpr(s32) = G_CONSTANT i32 0 + G_STORE %1, %0 :: (store 1, align 1, addrspace 3) + +...