Index: llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -394,15 +394,32 @@
   }
 }
 
+static bool memOpHasNoClobbered(const MachineMemOperand *MMO) {
+  const Instruction *I = dyn_cast_or_null<Instruction>(MMO->getValue());
+  return I && I->getMetadata("amdgpu.noclobber");
+}
+
 // FIXME: Returns uniform if there's no source value information. This is
 // probably wrong.
-static bool isInstrUniformNonExtLoadAlign4(const MachineInstr &MI) {
+static bool isScalarLoadLegal(const MachineInstr &MI) {
   if (!MI.hasOneMemOperand())
     return false;
 
   const MachineMemOperand *MMO = *MI.memoperands_begin();
+  const unsigned AS = MMO->getAddrSpace();
+  const bool IsConst = AS == AMDGPUAS::CONSTANT_ADDRESS ||
+                       AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
+
+  // There are no extending SMRD/SMEM loads, and they require 4-byte alignment.
   return MMO->getSize() >= 4 && MMO->getAlignment() >= 4 &&
-         AMDGPUInstrInfo::isUniformMMO(MMO);
+         // Can't do a scalar atomic load.
+         !MMO->isAtomic() &&
+         // Don't use scalar loads for volatile accesses to non-constant address
+         // spaces.
+         (IsConst || !MMO->isVolatile()) &&
+         // Memory must be known constant, or not written before this load.
+         (IsConst || MMO->isInvariant() || memOpHasNoClobbered(MMO)) &&
+         AMDGPUInstrInfo::isUniformMMO(MMO);
 }
 
 RegisterBankInfo::InstructionMappings
@@ -510,9 +527,10 @@
     unsigned PtrSize = PtrTy.getSizeInBits();
     unsigned AS = PtrTy.getAddressSpace();
     LLT LoadTy = MRI.getType(MI.getOperand(0).getReg());
+
     if ((AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&
          AS != AMDGPUAS::PRIVATE_ADDRESS) &&
-        isInstrUniformNonExtLoadAlign4(MI)) {
+        isScalarLoadLegal(MI)) {
       const InstructionMapping &SSMapping = getInstructionMapping(
           1, 1, getOperandsMapping(
                     {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
@@ -2284,7 +2302,7 @@
   if (PtrBank == &AMDGPU::SGPRRegBank &&
       (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&
        AS != AMDGPUAS::PRIVATE_ADDRESS) &&
-      isInstrUniformNonExtLoadAlign4(MI)) {
+      isScalarLoadLegal(MI)) {
     // We have a uniform instruction so we want to use an SMRD load
     ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
     PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize);
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-load.mir
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-load.mir
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-load.mir
@@ -243,9 +243,9 @@
   bb.0:
     liveins: $sgpr0_sgpr1
     ; CHECK-LABEL: name: load_global_v8i32_uniform
-    ; CHECK: (<8 x s32>) = G_LOAD %0(p1) :: (load 32, addrspace 1)
+    ; CHECK: (<8 x s32>) = G_LOAD %0(p1) :: (invariant load 32, addrspace 1)
     %0:_(p1) = COPY $sgpr0_sgpr1
-    %1:_(<8 x s32>) = G_LOAD %0 :: (load 32, addrspace 1)
+    %1:_(<8 x s32>) = G_LOAD %0 :: (invariant load 32, addrspace 1)
 ...
 
 ---
@@ -256,9 +256,9 @@
   bb.0:
     liveins: $sgpr0_sgpr1
     ; CHECK-LABEL: name: load_global_v4i64_uniform
-    ; CHECK: (<4 x s64>) = G_LOAD %0(p1) :: (load 32, addrspace 1)
+    ; CHECK: (<4 x s64>) = G_LOAD %0(p1) :: (invariant load 32, addrspace 1)
     %0:_(p1) = COPY $sgpr0_sgpr1
-    %1:_(<4 x s64>) = G_LOAD %0 :: (load 32, addrspace 1)
+    %1:_(<4 x s64>) = G_LOAD %0 :: (invariant load 32, addrspace 1)
 ...
 ---
@@ -269,9 +269,9 @@
   bb.0:
     liveins: $sgpr0_sgpr1
     ; CHECK-LABEL: name: load_global_v16i32_uniform
-    ; CHECK: (<16 x s32>) = G_LOAD %0(p1) :: (load 64, addrspace 1)
+    ; CHECK: (<16 x s32>) = G_LOAD %0(p1) :: (invariant load 64, addrspace 1)
     %0:_(p1) = COPY $sgpr0_sgpr1
-    %1:_(<16 x s32>) = G_LOAD %0 :: (load 64, addrspace 1)
+    %1:_(<16 x s32>) = G_LOAD %0 :: (invariant load 64, addrspace 1)
 ...
 
 ---
@@ -282,9 +282,9 @@
   bb.0:
     liveins: $sgpr0_sgpr1
     ; CHECK-LABEL: name: load_global_v8i64_uniform
-    ; CHECK: (<8 x s64>) = G_LOAD %0(p1) :: (load 64, addrspace 1)
+    ; CHECK: (<8 x s64>) = G_LOAD %0(p1) :: (invariant load 64, addrspace 1)
     %0:_(p1) = COPY $sgpr0_sgpr1
-    %1:_(<8 x s64>) = G_LOAD %0 :: (load 64, addrspace 1)
+    %1:_(<8 x s64>) = G_LOAD %0 :: (invariant load 64, addrspace 1)
 ...
 
 ---
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect.mir
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect.mir
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect.mir
@@ -1,68 +1,171 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 # RUN: llc -march=amdgcn -mcpu=hawaii -run-pass=regbankselect %s -verify-machineinstrs -o - | FileCheck %s
 
 --- |
-  define amdgpu_kernel void @load_constant(i32 addrspace(4)* %ptr0) { ret void }
-  define amdgpu_kernel void @load_global_uniform(i32 addrspace(1)* %ptr1) {
+  define amdgpu_kernel void @load_constant(i32 addrspace(4)* %ptr0) {
+    ret void
+  }
+
+  define amdgpu_kernel void @load_constant_volatile(i32 addrspace(4)* %ptr0) {
+    ret void
+  }
+
+  define amdgpu_kernel void @load_global_uniform_invariant(i32 addrspace(1)* %ptr1) {
+    %tmp0 = load i32, i32 addrspace(1)* %ptr1
+    ret void
+  }
+
+  define amdgpu_kernel void @load_global_uniform_noclobber(i32 addrspace(1)* %ptr1) {
+    %tmp0 = load i32, i32 addrspace(1)* %ptr1, !amdgpu.noclobber !0
+    ret void
+  }
+
+  define amdgpu_kernel void @load_global_uniform_variant(i32 addrspace(1)* %ptr1) {
+    %tmp0 = load i32, i32 addrspace(1)* %ptr1
+    ret void
+  }
+
+  define amdgpu_kernel void @load_global_uniform_volatile_invariant(i32 addrspace(1)* %ptr1) {
     %tmp0 = load i32, i32 addrspace(1)* %ptr1
     ret void
   }
+
+  define amdgpu_kernel void @load_global_uniform_atomic_invariant(i32 addrspace(1)* %ptr1) {
+    %tmp0 = load i32, i32 addrspace(1)* %ptr1
+    ret void
+  }
+
   define amdgpu_kernel void @load_global_non_uniform(i32 addrspace(1)* %ptr2) {
     %tmp0 = call i32 @llvm.amdgcn.workitem.id.x() #0
     %tmp1 = getelementptr i32, i32 addrspace(1)* %ptr2, i32 %tmp0
     %tmp2 = load i32, i32 addrspace(1)* %tmp1
     ret void
   }
+
   define void @non_power_of_2() { ret void }
+
+  define amdgpu_kernel void @load_constant_v4i16_from_6_align8(<3 x i16> addrspace(4)* %ptr0) {
+    ret void
+  }
+
   declare i32 @llvm.amdgcn.workitem.id.x() #0
   attributes #0 = { nounwind readnone }
-...
+
+  !0 = !{}
+...
 
 ---
-name : load_constant
+name: load_constant
 legalized: true
 
-# CHECK-LABEL: name: load_constant
-# CHECK: registers:
-# CHECK: - { id: 0, class: sgpr, preferred-register: '' }
-# CHECK: - { id: 1, class: sgpr, preferred-register: '' }
-
 body: |
   bb.0:
     liveins: $sgpr0_sgpr1
+    ; CHECK-LABEL: name: load_constant
+    ; CHECK: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
+    ; CHECK: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p4) :: (load 4 from %ir.ptr0, addrspace 4)
     %0:_(p4) = COPY $sgpr0_sgpr1
     %1:_(s32) = G_LOAD %0 :: (load 4 from %ir.ptr0)
 ...
 ---
-name: load_global_uniform
+name: load_constant_volatile
 legalized: true
 
-# CHECK-LABEL: name: load_global_uniform
-# CHECK: registers:
-# CHECK: - { id: 0, class: sgpr, preferred-register: '' }
-# CHECK: - { id: 1, class: sgpr, preferred-register: '' }
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1
+    ; CHECK-LABEL: name: load_constant_volatile
+    ; CHECK: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
+    ; CHECK: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p4) :: (volatile load 4 from %ir.ptr0, addrspace 4)
+    %0:_(p4) = COPY $sgpr0_sgpr1
+    %1:_(s32) = G_LOAD %0 :: (volatile load 4 from %ir.ptr0)
+...
+
+---
+name: load_global_uniform_invariant
+legalized: true
 
 body: |
   bb.0:
     liveins: $sgpr0_sgpr1
+    ; CHECK-LABEL: name: load_global_uniform_invariant
+    ; CHECK: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
+    ; CHECK: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p1) :: (invariant load 4 from %ir.ptr1, addrspace 1)
+    %0:_(p1) = COPY $sgpr0_sgpr1
+    %1:_(s32) = G_LOAD %0 :: (invariant load 4 from %ir.ptr1)
+...
+
+---
+name: load_global_uniform_noclobber
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1
+    ; CHECK-LABEL: name: load_global_uniform_noclobber
+    ; CHECK: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
+    ; CHECK: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
+    ; CHECK: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p1) :: (load 4 from %ir.ptr1, addrspace 1)
     %0:_(p1) = COPY $sgpr0_sgpr1
     %1:_(s32) = G_LOAD %0 :: (load 4 from %ir.ptr1)
 ...
 
 ---
-name: load_global_non_uniform
+name: load_global_uniform_variant
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1
+    ; CHECK-LABEL: name: load_global_uniform_variant
+    ; CHECK: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
+    ; CHECK: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
+    ; CHECK: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p1) :: (load 4 from %ir.ptr1, addrspace 1)
+    %0:_(p1) = COPY $sgpr0_sgpr1
+    %1:_(s32) = G_LOAD %0 :: (load 4 from %ir.ptr1)
+...
+
+---
+name: load_global_uniform_volatile_invariant
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1
+    ; CHECK-LABEL: name: load_global_uniform_volatile_invariant
+    ; CHECK: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
+    ; CHECK: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
+    ; CHECK: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p1) :: (volatile invariant load 4 from %ir.ptr1, addrspace 1)
+    %0:_(p1) = COPY $sgpr0_sgpr1
+    %1:_(s32) = G_LOAD %0 :: (volatile invariant load 4 from %ir.ptr1)
+...
+
+---
+name: load_global_uniform_atomic_invariant
 legalized: true
 
-# CHECK-LABEL: name: load_global_non_uniform
-# CHECK: registers:
-# CHECK: - { id: 0, class: sgpr, preferred-register: '' }
-# CHECK: - { id: 1, class: vgpr, preferred-register: '' }
-# CHECK: - { id: 2, class: vgpr, preferred-register: '' }
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1
+    ; CHECK-LABEL: name: load_global_uniform_atomic_invariant
+    ; CHECK: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
+    ; CHECK: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
+    ; CHECK: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p1) :: (invariant load acquire 4 from %ir.ptr1, addrspace 1)
+    %0:_(p1) = COPY $sgpr0_sgpr1
+    %1:_(s32) = G_LOAD %0 :: (invariant load acquire 4 from %ir.ptr1)
+...
+
+---
+name: load_global_non_uniform
+legalized: true
 
 body: |
   bb.0:
     liveins: $sgpr0_sgpr1
+    ; CHECK-LABEL: name: load_global_non_uniform
+    ; CHECK: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1
+    ; CHECK: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1)
+    ; CHECK: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p1) :: (load 4 from %ir.tmp1, addrspace 1)
     %0:_(p1) = COPY $sgpr0_sgpr1
     %1:_(s32) = G_LOAD %0 :: (load 4 from %ir.tmp1)
 ...
@@ -71,14 +174,29 @@
 name: non_power_of_2
 legalized: true
 
-# CHECK-LABEL: name: non_power_of_2
-# CHECK: [[S448:%[0-9]+]]:sgpr(s448) = G_IMPLICIT_DEF
-# CHECK: sgpr(s32) = G_EXTRACT [[S448]](s448), 0
-
 body: |
   bb.0:
+    ; CHECK-LABEL: name: non_power_of_2
+    ; CHECK: [[DEF:%[0-9]+]]:sgpr(s448) = G_IMPLICIT_DEF
+    ; CHECK: [[EXTRACT:%[0-9]+]]:sgpr(s32) = G_EXTRACT [[DEF]](s448), 0
+    ; CHECK: $sgpr0 = COPY [[EXTRACT]](s32)
+    ; CHECK: SI_RETURN_TO_EPILOG $sgpr0
     %0:_(s448) = G_IMPLICIT_DEF
     %1:_(s32) = G_EXTRACT %0:_(s448), 0
     $sgpr0 = COPY %1:_(s32)
     SI_RETURN_TO_EPILOG $sgpr0
 ...
+
+---
+name: load_constant_v4i16_from_6_align8
+legalized: true
+
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: load_constant_v4i16_from_6_align8
+    ; CHECK: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1
+    ; CHECK: [[LOAD:%[0-9]+]]:sgpr(<4 x s16>) = G_LOAD [[COPY]](p4) :: (load 6 from %ir.ptr0, align 8, addrspace 4)
+    %0:_(p4) = COPY $sgpr0_sgpr1
+    %1:_(<4 x s16>) = G_LOAD %0 :: (load 6 from %ir.ptr0, align 8, addrspace 4)
+
+...