diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -31,6 +31,7 @@
 #include "Utils/AMDGPUMemoryUtils.h"
 #include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
 #include "llvm/Analysis/CallGraph.h"
 #include "llvm/IR/Constants.h"
@@ -144,6 +145,97 @@
         "");
   }
 
+  static bool eliminateConstantExprUsesOfLDSFromAllInstructions(Module &M) {
+    // Constants are uniqued within LLVM. A ConstantExpr referring to an LDS
+    // global may have uses from multiple different functions as a result.
+    // This pass specialises LDS variables with respect to the kernel that
+    // allocates them.
+
+    // This is semantically equivalent to:
+    // for (auto &F : M.functions())
+    //   for (auto &BB : F)
+    //     for (auto &I : BB)
+    //       for (Use &Op : I.operands())
+    //         if (constantExprUsesLDS(Op))
+    //           replaceConstantExprInFunction(I, Op);
+
+    bool Changed = false;
+
+    // Find all ConstantExprs that are direct users of an LDS global
+    SmallVector<ConstantExpr *> Stack;
+    for (auto &GV : M.globals())
+      if (AMDGPU::isLDSVariableToLower(GV))
+        for (User *U : GV.users())
+          if (ConstantExpr *C = dyn_cast<ConstantExpr>(U))
+            Stack.push_back(C);
+
+    // Expand to include constexpr users of direct users
+    SetVector<ConstantExpr *> ConstExprUsersOfLDS;
+    while (!Stack.empty()) {
+      ConstantExpr *V = Stack.pop_back_val();
+      if (ConstExprUsersOfLDS.contains(V))
+        continue;
+
+      ConstExprUsersOfLDS.insert(V);
+
+      for (auto *Nested : V->users())
+        if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Nested))
+          Stack.push_back(CE);
+    }
+
+    // Find all instructions that use any of the ConstExpr users of LDS
+    SetVector<Instruction *> InstructionWorklist;
+    for (ConstantExpr *CE : ConstExprUsersOfLDS)
+      for (User *U : CE->users())
+        if (auto *I = dyn_cast<Instruction>(U))
+          InstructionWorklist.insert(I);
+
+    // Replace those ConstExpr operands with instructions
+    while (!InstructionWorklist.empty()) {
+      Instruction *I = InstructionWorklist.pop_back_val();
+      for (Use &U : I->operands()) {
+
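+        // Nothing can be inserted immediately before a phi, and the operand
+        // logically belongs to an incoming edge, so materialise it in the
+        // corresponding incoming block instead.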
+        auto *BI = I;
+        if (auto *Phi = dyn_cast<PHINode>(I)) {
+          BasicBlock *BB = Phi->getIncomingBlock(U);
+          BasicBlock::iterator It = BB->getFirstInsertionPt();
+          if (It != BB->end()) {
+            BI = &(*It);
+          } else {
+            // getAsInstruction inserts before the instruction passed,
+            // so if the appropriate basic block has no instruction, create
+            // a no-op one to use
+            LLVMContext &Ctx = M.getContext();
+            IRBuilder<> Builder(Ctx);
+            Builder.SetInsertPoint(BB);
+            FunctionType *FTy = FunctionType::get(Type::getVoidTy(Ctx), {});
+            Function *Decl =
+                Intrinsic::getDeclaration(&M, Intrinsic::donothing);
+            Instruction *Call = Builder.CreateCall(FTy, Decl, {});
+            BI = Call;
+          }
+        }
+
+        if (ConstantExpr *C = dyn_cast<ConstantExpr>(U.get())) {
+          if (ConstExprUsersOfLDS.contains(C)) {
+            Changed = true;
+            Instruction *NI = C->getAsInstruction(BI);
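+            // The new instruction may itself have ConstantExpr operands over
+            // LDS, so queue it for another pass over its operands.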
+            InstructionWorklist.insert(NI);
+            U.set(NI);
+            C->removeDeadConstantUsers();
+          }
+        }
+      }
+    }
+
+    return Changed;
+  }
+
 public:
   static char ID;
 
@@ -156,6 +248,8 @@
     CallGraph CG = CallGraph(M);
     bool Changed = superAlignLDSGlobals(M);
 
+    Changed |= eliminateConstantExprUsesOfLDSFromAllInstructions(M);
+
     // Move variables used by functions into amdgcn.module.lds
     std::vector<GlobalVariable *> ModuleScopeVariables =
         AMDGPU::findLDSVariablesToLower(M, nullptr);
@@ -213,19 +307,9 @@
       if (!AMDGPU::isKernel(F.getCallingConv()))
         continue;
 
       std::vector<GlobalVariable *> KernelUsedVariables =
           AMDGPU::findLDSVariablesToLower(M, &F);
 
-      // Replace all constant uses with instructions if they belong to the
-      // current kernel. Unnecessary, removing will cause test churn.
-      for (GlobalVariable *GV : KernelUsedVariables) {
-        for (User *U : make_early_inc_range(GV->users())) {
-          if (ConstantExpr *C = dyn_cast<ConstantExpr>(U))
-            AMDGPU::replaceConstantUsesInFunction(C, &F);
-        }
-        GV->removeDeadConstantUsers();
-      }
-
       if (!KernelUsedVariables.empty()) {
         std::string VarName =
             (Twine("llvm.amdgcn.kernel.") + F.getName() + ".lds").str();
@@ -245,9 +329,14 @@
       }
     }
 
     for (auto &GV : make_early_inc_range(M.globals()))
-      if (AMDGPU::isLDSVariableToLower(GV) && GV.use_empty())
-        GV.eraseFromParent();
-
+      if (AMDGPU::isLDSVariableToLower(GV)) {
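+        // Drop dead constant uses first so that use_empty() below only sees
+        // genuine remaining uses.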
+        GV.removeDeadConstantUsers();
+        if (GV.use_empty())
+          GV.eraseFromParent();
+      }
+
     return Changed;
   }
 
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h
@@ -33,9 +33,6 @@
 std::vector<GlobalVariable *> findLDSVariablesToLower(Module &M,
                                                       const Function *F);
 
-/// Replace all uses of constant \p C with instructions in \p F.
-void replaceConstantUsesInFunction(ConstantExpr *C, const Function *F);
-
 /// Given a \p Def clobbering a load from \p Ptr according to the MSSA check
 /// if this is actually a memory update or an artificial clobber to facilitate
 /// ordering constraints.
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp
@@ -32,35 +32,6 @@
       GV->getValueType());
 }
 
-static void collectFunctionUses(User *U, const Function *F,
-                                SetVector<Instruction *> &InstUsers) {
-  SmallVector<User *> Stack{U};
-
-  while (!Stack.empty()) {
-    U = Stack.pop_back_val();
-
-    if (auto *I = dyn_cast<Instruction>(U)) {
-      if (I->getFunction() == F)
-        InstUsers.insert(I);
-      continue;
-    }
-
-    if (!isa<ConstantExpr>(U))
-      continue;
-
-    append_range(Stack, U->users());
-  }
-}
-
-void replaceConstantUsesInFunction(ConstantExpr *C, const Function *F) {
-  SetVector<Instruction *> InstUsers;
-
-  collectFunctionUses(C, F, InstUsers);
-  for (Instruction *I : InstUsers) {
-    convertConstantExprsToInstructions(I, C);
-  }
-}
-
 static bool shouldLowerLDSToStruct(const GlobalVariable &GV,
                                    const Function *F) {
   // We are not interested in kernel LDS lowering for module LDS itself.
diff --git a/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-constexpr.ll b/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-constexpr.ll
--- a/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-constexpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-constexpr.ll
@@ -107,9 +107,11 @@
 ; Multiple constexpr use in a same instruction.
 define amdgpu_kernel void @k5() {
 ; CHECK-LABEL: @k5(
-; CHECK-NEXT: %1 = addrspacecast [505 x i32] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k5.lds.t, %llvm.amdgcn.kernel.k5.lds.t addrspace(3)* @llvm.amdgcn.kernel.k5.lds, i32 0, i32 0) to [505 x i32]*
-; CHECK-NEXT: %2 = getelementptr inbounds [505 x i32], [505 x i32]* %1, i64 0, i64 0
-; CHECK-NEXT: call void undef(i32* %2, i32* %2)
+; CHECK-NEXT: %1 = addrspacecast [505 x i32] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k5.lds.t, %llvm.amdgcn.kernel.k5.lds.t addrspace(3)* @llvm.amdgcn.kernel.k5.lds, i32 0, i32 0) to [505 x i32]*
+; CHECK-NEXT: %2 = getelementptr inbounds [505 x i32], [505 x i32]* %1, i64 0, i64 0
+; CHECK-NEXT: %3 = addrspacecast [505 x i32] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k5.lds.t, %llvm.amdgcn.kernel.k5.lds.t addrspace(3)* @llvm.amdgcn.kernel.k5.lds, i32 0, i32 0) to [505 x i32]*
+; CHECK-NEXT: %4 = getelementptr inbounds [505 x i32], [505 x i32]* %3, i64 0, i64 0
+; CHECK-NEXT: call void undef(i32* %2, i32* %4)
 ;
   call void undef(i32* getelementptr inbounds ([505 x i32], [505 x i32]* addrspacecast ([505 x i32] addrspace(3)* @lds.4 to [505 x i32]*), i64 0, i64 0), i32* getelementptr inbounds ([505 x i32], [505 x i32]* addrspacecast ([505 x i32] addrspace(3)* @lds.4 to [505 x i32]*), i64 0, i64 0))
   ret void
@@ -119,13 +121,15 @@
 
 ; Both the *value* and *pointer* operands of store instruction are constant expressions, and
 ; both of these constant expression paths use same lds - @lds.5. Hence both of these constant
-; expression operands of store should be replaced by corresponding instruction sequence.
+; expression operands of store should be replaced by equivalent instruction sequences.
 define amdgpu_kernel void @k6() {
 ; CHECK-LABEL: @k6(
-; CHECK-NEXT: %1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k6.lds.t, %llvm.amdgcn.kernel.k6.lds.t addrspace(3)* @llvm.amdgcn.kernel.k6.lds, i32 0, i32 0), i32 0, i32 2
-; CHECK-NEXT: %2 = ptrtoint i32 addrspace(3)* %1 to i32
-; CHECK-NEXT: store i32 %2, i32 addrspace(3)* %1, align 8
-; CHECK-NEXT: ret void
+
+; CHECK-NEXT: %1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k6.lds.t, %llvm.amdgcn.kernel.k6.lds.t addrspace(3)* @llvm.amdgcn.kernel.k6.lds, i32 0, i32 0), i32 0, i32 2
+; CHECK-NEXT: %2 = ptrtoint i32 addrspace(3)* %1 to i32
+; CHECK-NEXT: %3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k6.lds.t, %llvm.amdgcn.kernel.k6.lds.t addrspace(3)* @llvm.amdgcn.kernel.k6.lds, i32 0, i32 0), i32 0, i32 2
+; CHECK-NEXT: store i32 %2, i32 addrspace(3)* %3, align 8
+; CHECK-NEXT: ret void
 ;
   store i32 ptrtoint (i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @lds.5, i32 0, i32 2) to i32), i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @lds.5, i32 0, i32 2)
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll
--- a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll
@@ -18,9 +18,16 @@
 
 ; CHECK-LABEL: @test
 ; CHECK: store i8 3, i8 addrspace(3)* %0, align 4, !alias.scope !0, !noalias !3
-; CHECK: tail call void @llvm.memcpy.p3i8.p3i8.i64(i8 addrspace(3)* noundef align 1 dereferenceable(3) %2, i8 addrspace(3)* noundef align 1 dereferenceable(3) %1, i64 3, i1 false), !alias.scope !5, !noalias !6
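+; The memcpy pointer operands are now separate getelementptr instructions: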
+; CHECK: %1 = getelementptr
+; CHECK: %2 = getelementptr
+; CHECK: tail call void @llvm.memcpy.p3i8.p3i8.i64(i8 addrspace(3)* noundef align 1 dereferenceable(3) %1, i8 addrspace(3)* noundef align 1 dereferenceable(3) %2, i64 3, i1 false), !alias.scope !5, !noalias !6
 ; CHECK: %4 = load i8, i8 addrspace(3)* %3, align 4, !alias.scope !3, !noalias !0
-; CHECK: tail call void @llvm.memcpy.p3i8.p3i8.i64(i8 addrspace(3)* noundef align 1 dereferenceable(3) %7, i8 addrspace(3)* noundef align 1 dereferenceable(3) %6, i64 3, i1 false), !alias.scope !5, !noalias !6
+; CHECK: %5 = getelementptr
+; CHECK: %6 = getelementptr
+; CHECK: %7 = getelementptr
+; CHECK: tail call void @llvm.memcpy.p3i8.p3i8.i64(i8 addrspace(3)* noundef align 1 dereferenceable(3) %6, i8 addrspace(3)* noundef align 1 dereferenceable(3) %7, i64 3, i1 false), !alias.scope !5, !noalias !6
+; CHECK: %8 = getelementptr
 ; CHECK: %9 = load i8, i8 addrspace(3)* %8, align 4, !alias.scope !3, !noalias !0
 
 define protected amdgpu_kernel void @test(i8 addrspace(1)* nocapture %ptr.coerce) local_unnamed_addr #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll
@@ -20,7 +20,16 @@
 ; CHECK: @llvm.amdgcn.kernel.timestwo.lds = internal addrspace(3) global %llvm.amdgcn.kernel.timestwo.lds.t undef, align 4
 
 ; CHECK-LABEL: @get_func()
-; CHECK: %0 = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
+; CHECK: %0 = bitcast float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0) to i32 addrspace(3)*
+; CHECK: %1 = addrspacecast i32 addrspace(3)* %0 to i32*
+; CHECK: %2 = ptrtoint i32* %1 to i64
+; CHECK: %3 = bitcast float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0) to i32 addrspace(3)*
+; CHECK: %4 = addrspacecast i32 addrspace(3)* %3 to i32*
+; CHECK: %5 = ptrtoint i32* %4 to i64
+; CHECK: %6 = add i64 %2, %5
+; CHECK: %7 = inttoptr i64 %6 to i32*
+; CHECK: %8 = load i32, i32* %7, align 4
+; CHECK: ret i32 %8
 define i32 @get_func() local_unnamed_addr #0 {
 entry:
   %0 = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @func to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @func to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
@@ -28,7 +37,16 @@
 }
 
 ; CHECK-LABEL: @set_func(i32 %x)
-; CHECK: store i32 %x, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
+; CHECK: %0 = bitcast float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i32 addrspace(3)*
+; CHECK: %1 = addrspacecast i32 addrspace(3)* %0 to i32*
+; CHECK: %2 = ptrtoint i32* %1 to i64
+; CHECK: %3 = bitcast float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i32 addrspace(3)*
+; CHECK: %4 = addrspacecast i32 addrspace(3)* %3 to i32*
+; CHECK: %5 = ptrtoint i32* %4 to i64
+; CHECK: %6 = add i64 %2, %5
+; CHECK: %7 = inttoptr i64 %6 to i32*
+; CHECK: store i32 %x, i32* %7, align 4
+; CHECK: ret void
 define void @set_func(i32 %x) local_unnamed_addr #1 {
 entry:
   store i32 %x, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @both to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @both to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
@@ -37,19 +55,30 @@
 
 ; CHECK-LABEL: @timestwo() #0
 ; CHECK-NOT: call void @llvm.donothing()
-; CHECK: %1 = bitcast float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.timestwo.lds.t, %llvm.amdgcn.kernel.timestwo.lds.t addrspace(3)* @llvm.amdgcn.kernel.timestwo.lds, i32 0, i32 0) to i32 addrspace(3)*
-; CHECK: %2 = addrspacecast i32 addrspace(3)* %1 to i32*
-; CHECK: %3 = ptrtoint i32* %2 to i64
-; CHECK: %4 = add i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i32 addrspace(3)*) to i32*) to i64), %3
-; CHECK: %5 = inttoptr i64 %4 to i32*
-; CHECK: %ld = load i32, i32* %5, align 4
-; CHECK: %mul = mul i32 %ld, 2
-; CHECK: %6 = bitcast float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.timestwo.lds.t, %llvm.amdgcn.kernel.timestwo.lds.t addrspace(3)* @llvm.amdgcn.kernel.timestwo.lds, i32 0, i32 0) to i32 addrspace(3)*
-; CHECK: %7 = addrspacecast i32 addrspace(3)* %6 to i32*
-; CHECK: %8 = ptrtoint i32* %7 to i64
-; CHECK: %9 = add i64 %8, ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i32 addrspace(3)*) to i32*) to i64)
-; CHECK: %10 = inttoptr i64 %9 to i32*
-; CHECK: store i32 %mul, i32* %10, align 4
+
+
+; CHECK: %1 = bitcast float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i32 addrspace(3)*
+; CHECK: %2 = addrspacecast i32 addrspace(3)* %1 to i32*
+; CHECK: %3 = ptrtoint i32* %2 to i64
+; CHECK: %4 = bitcast float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.timestwo.lds.t, %llvm.amdgcn.kernel.timestwo.lds.t addrspace(3)* @llvm.amdgcn.kernel.timestwo.lds, i32 0, i32 0) to i32 addrspace(3)*
+; CHECK: %5 = addrspacecast i32 addrspace(3)* %4 to i32*
+; CHECK: %6 = ptrtoint i32* %5 to i64
+; CHECK: %7 = add i64 %3, %6
+; CHECK: %8 = inttoptr i64 %7 to i32*
+; CHECK: %ld = load i32, i32* %8, align 4
+; CHECK: %mul = mul i32 %ld, 2
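+; The address for the store below is likewise rebuilt from instructions,
+; mixing the kernel struct and the module struct fields.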
+; CHECK: %9 = bitcast float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.timestwo.lds.t, %llvm.amdgcn.kernel.timestwo.lds.t addrspace(3)* @llvm.amdgcn.kernel.timestwo.lds, i32 0, i32 0) to i32 addrspace(3)*
+; CHECK: %10 = addrspacecast i32 addrspace(3)* %9 to i32*
+; CHECK: %11 = ptrtoint i32* %10 to i64
+; CHECK: %12 = bitcast float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i32 addrspace(3)*
+; CHECK: %13 = addrspacecast i32 addrspace(3)* %12 to i32*
+; CHECK: %14 = ptrtoint i32* %13 to i64
+; CHECK: %15 = add i64 %11, %14
+; CHECK: %16 = inttoptr i64 %15 to i32*
+; CHECK: store i32 %mul, i32* %16, align 4
+; CHECK: ret void
 define amdgpu_kernel void @timestwo() {
   %ld = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @both to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @kern to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
   %mul = mul i32 %ld, 2