Index: llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -1305,7 +1305,8 @@
   if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
       G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
-    if (!MFI->isModuleEntryFunction()) {
+    if (!MFI->isModuleEntryFunction() &&
+        !GV->getName().equals("llvm.amdgcn.module.lds")) {
       SDLoc DL(Op);
       const Function &Fn = DAG.getMachineFunction().getFunction();
       DiagnosticInfoUnsupported BadLDSDecl(
Index: llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -2286,7 +2286,8 @@
   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
-    if (!MFI->isModuleEntryFunction()) {
+    if (!MFI->isModuleEntryFunction() &&
+        !GV->getName().equals("llvm.amdgcn.module.lds")) {
       const Function &Fn = MF.getFunction();
       DiagnosticInfoUnsupported BadLDSDecl(
           Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
Index: llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -212,8 +212,6 @@
   Align MaxAlign =
       AMDGPU::getAlign(DL, LocalVars[0]); // was sorted on alignment

-  Constant *InstanceAddress = Constant::getIntegerValue(
-      PointerType::get(LDSTy, AMDGPUAS::LOCAL_ADDRESS), APInt(32, 0));

   GlobalVariable *SGV = new GlobalVariable(
       M, LDSTy, false, GlobalValue::InternalLinkage, UndefValue::get(LDSTy),
@@ -236,7 +234,7 @@
     GlobalVariable *GV = LocalVars[I];
     Constant *GEPIdx[] = {ConstantInt::get(I32, 0), ConstantInt::get(I32, I)};
     GV->replaceAllUsesWith(
-        ConstantExpr::getGetElementPtr(LDSTy, InstanceAddress, GEPIdx));
+        ConstantExpr::getGetElementPtr(LDSTy, SGV, GEPIdx));
     GV->eraseFromParent();
   }
Index: llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
@@ -64,7 +64,7 @@
 void AMDGPUMachineFunction::allocateModuleLDSGlobal(const Module *M) {
   if (isModuleEntryFunction()) {
-    GlobalVariable *GV = M->getGlobalVariable("llvm.amdgcn.module.lds");
+    const GlobalVariable *GV = M->getNamedGlobal("llvm.amdgcn.module.lds");
     if (GV) {
       unsigned Offset = allocateLDSGlobal(M->getDataLayout(), *GV);
       (void)Offset;
Index: llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll
+++ llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll
@@ -19,7 +19,7 @@
 ; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t undef, align 4

 ; CHECK-LABEL: @get_func()
-; CHECK: %0 = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* getelementptr (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* null, i32 0, i32 1) to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* getelementptr (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* null, i32 0, i32 1) to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
+; CHECK: %0 = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
 define i32 @get_func() local_unnamed_addr #0 {
 entry:
   %0 = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @func to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @func to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
@@ -27,7 +27,7 @@
 }

 ; CHECK-LABEL: @set_func(i32 %x)
-; CHECK: store i32 %x, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* null to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* null to i32*) to i64)) to i32*), align 4
+; CHECK: store i32 %x, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
 define void @set_func(i32 %x) local_unnamed_addr #1 {
 entry:
   store i32 %x, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @both to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @both to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
@@ -36,9 +36,9 @@

 ; CHECK-LABEL: @timestwo()
 ; CHECK: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]
-; CHECK: %ld = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* null to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @kern to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
+; CHECK: %ld = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @kern to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
 ; CHECK: %mul = mul i32 %ld, 2
-; CHECK: store i32 %mul, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @kern to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* null to i32*) to i64)) to i32*), align 4
+; CHECK: store i32 %mul, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @kern to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
 define amdgpu_kernel void @timestwo() {
   %ld = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @both to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @kern to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
   %mul = mul i32 %ld, 2
Index: llvm/test/CodeGen/AMDGPU/lower-module-lds-indirect.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/lower-module-lds-indirect.ll
+++ llvm/test/CodeGen/AMDGPU/lower-module-lds-indirect.ll
@@ -3,9 +3,9 @@
 ; CHECK: %llvm.amdgcn.module.lds.t = type { double, float }

-; CHECK: @function_indirect = addrspace(1) global float* addrspacecast (float addrspace(3)* getelementptr (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* null, i32 0, i32 1) to float*), align 8
+; CHECK: @function_indirect = addrspace(1) global float* addrspacecast (float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to float*), align 8

-; CHECK: @kernel_indirect = addrspace(1) global double* addrspacecast (double addrspace(3)* null to double*), align 8
+; CHECK: @kernel_indirect = addrspace(1) global double* addrspacecast (double addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0) to double*), align 8

 ; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t undef, align 8
Index: llvm/test/CodeGen/AMDGPU/lower-module-lds-offsets.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/lower-module-lds-offsets.ll
@@ -0,0 +1,47 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s | FileCheck -check-prefix=OPT %s
+; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck -check-prefix=OPT %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; Check that module LDS is allocated at address 0 and kernel starts its
+; allocation past module LDS.
+
+@lds.size.1.align.1 = internal unnamed_addr addrspace(3) global [1 x i8] undef, align 1
+@lds.size.16.align.16 = internal unnamed_addr addrspace(3) global [16 x i8] undef, align 16
+
+; GCN-LABEL: {{^}}k0:
+; GCN-DAG: v_mov_b32_e32 [[NULL:v[0-9]+]], 0
+; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
+; GCN: ds_write_b8 [[NULL]], [[ONE]]
+; GCN: v_mov_b32_e32 [[TWO:v[0-9]+]], 2
+; GCN: ds_write_b8 [[NULL]], [[TWO]] offset:16
+define amdgpu_kernel void @k0() {
+; OPT-LABEL: @k0(
+; OPT-NEXT: call void @llvm.donothing() [ "ExplicitUse"([[LLVM_AMDGCN_MODULE_LDS_T:%.*]] addrspace(3)* @llvm.amdgcn.module.lds) ]
+; OPT-NEXT: [[LDS_SIZE_1_ALIGN_1_BC:%.*]] = bitcast [1 x i8] addrspace(3)* getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T]], [[LLVM_AMDGCN_MODULE_LDS_T]] addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0) to i8 addrspace(3)*
+; OPT-NEXT: store i8 1, i8 addrspace(3)* [[LDS_SIZE_1_ALIGN_1_BC]], align 1
+; OPT-NEXT: [[LDS_SIZE_16_ALIGN_16_BC:%.*]] = bitcast [16 x i8] addrspace(3)* @lds.size.16.align.16 to i8 addrspace(3)*
+; OPT-NEXT: store i8 2, i8 addrspace(3)* [[LDS_SIZE_16_ALIGN_16_BC]], align 16
+; OPT-NEXT: ret void
+;
+  %lds.size.1.align.1.bc = bitcast [1 x i8] addrspace(3)* @lds.size.1.align.1 to i8 addrspace(3)*
+  store i8 1, i8 addrspace(3)* %lds.size.1.align.1.bc, align 1
+  %lds.size.16.align.16.bc = bitcast [16 x i8] addrspace(3)* @lds.size.16.align.16 to i8 addrspace(3)*
+  store i8 2, i8 addrspace(3)* %lds.size.16.align.16.bc, align 16
+  ret void
+}
+
+; GCN-LABEL: {{^}}f0:
+; GCN-DAG: v_mov_b32_e32 [[NULL:v[0-9]+]], 0
+; GCN-DAG: v_mov_b32_e32 [[TREE:v[0-9]+]], 3
+; GCN: ds_write_b8 [[NULL]], [[TREE]]
+define void @f0() {
+; OPT-LABEL: @f0(
+; OPT-NEXT: [[LDS_SIZE_1_ALIGN_1_BC:%.*]] = bitcast [1 x i8] addrspace(3)* getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T:%.*]], [[LLVM_AMDGCN_MODULE_LDS_T]] addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0) to i8 addrspace(3)*
+; OPT-NEXT: store i8 3, i8 addrspace(3)* [[LDS_SIZE_1_ALIGN_1_BC]], align 1
+; OPT-NEXT: ret void
+;
+  %lds.size.1.align.1.bc = bitcast [1 x i8] addrspace(3)* @lds.size.1.align.1 to i8 addrspace(3)*
+  store i8 3, i8 addrspace(3)* %lds.size.1.align.1.bc, align 1
+  ret void
+}
Index: llvm/test/CodeGen/AMDGPU/lower-module-lds-used-list.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/lower-module-lds-used-list.ll
+++ llvm/test/CodeGen/AMDGPU/lower-module-lds-used-list.ll
@@ -29,7 +29,7 @@
 @llvm.compiler.used = appending global [2 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (float addrspace(3)* @tolower to i8 addrspace(3)*) to i8*), i8* addrspacecast (i8 addrspace(1)* bitcast (i64 addrspace(1)* @ignored to i8 addrspace(1)*) to i8*)], section "llvm.metadata"

 ; CHECK-LABEL: @func()
-; CHECK: %dec = atomicrmw fsub float addrspace(3)* null, float 1.0
+; CHECK: %dec = atomicrmw fsub float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0), float 1.000000e+00 monotonic, align 4
 define void @func() {
   %dec = atomicrmw fsub float addrspace(3)* @tolower, float 1.0 monotonic
   %unused0 = atomicrmw add i64 addrspace(1)* @ignored, i64 1 monotonic
Index: llvm/test/CodeGen/AMDGPU/lower-module-lds.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/lower-module-lds.ll
+++ llvm/test/CodeGen/AMDGPU/lower-module-lds.ll
@@ -21,12 +21,12 @@
 ; Instance of new type, aligned to max of element alignment
 ; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t undef, align 8

-; Use in func rewritten to access struct at address zero, which prints as null
+; Use in func rewritten to access struct at address zero
 ; CHECK-LABEL: @func()
-; CHECK: %dec = atomicrmw fsub float addrspace(3)* null, float 1.0
-; CHECK: %val0 = load i32, i32 addrspace(3)* getelementptr (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* null, i32 0, i32 2), align 4
+; CHECK: %dec = atomicrmw fsub float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0), float 1.0
+; CHECK: %val0 = load i32, i32 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 2), align 4
 ; CHECK: %val1 = add i32 %val0, 4
-; CHECK: store i32 %val1, i32 addrspace(3)* getelementptr (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* null, i32 0, i32 2), align 4
+; CHECK: store i32 %val1, i32 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 2), align 4
 ; CHECK: %unused0 = atomicrmw add i64 addrspace(3)* @with_init, i64 1 monotonic
 define void @func() {
   %dec = atomicrmw fsub float addrspace(3)* @var0, float 1.0 monotonic
@@ -41,7 +41,7 @@
 ; CHECK-LABEL: @kern_call()
 ; CHECK: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]
 ; CHECK: call void @func()
-; CHECK: %dec = atomicrmw fsub float addrspace(3)* null, float 2.0
+; CHECK: %dec = atomicrmw fsub float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0), float 2.000000e+00 monotonic, align 4
 define amdgpu_kernel void @kern_call() {
   call void @func()
   %dec = atomicrmw fsub float addrspace(3)* @var0, float 2.0 monotonic