diff --git a/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu b/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu
--- a/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu
+++ b/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu
@@ -56,20 +56,24 @@
   int *x;
   float *y;
 };
-// `by-val` struct will be coerced into a similar struct with all generic
-// pointers lowerd into global ones.
+// `by-val` struct is passed by-indirect-alias (a mix of by-ref and indirect
+// by-val). However, the enhanced address space inference pass should be able
+// to assume the pointers loaded from it are global pointers.
+//
 // HOST: define void @_Z22__device_stub__kernel41S(i32* %s.coerce0, float* %s.coerce1)
 // COMMON-LABEL: define amdgpu_kernel void @_Z7kernel41S(%struct.S addrspace(4)*{{.*}} byref(%struct.S) align 8 %0)
 // OPT: [[R0:%.*]] = getelementptr inbounds %struct.S, %struct.S addrspace(4)* %0, i64 0, i32 0
 // OPT: [[P0:%.*]] = load i32*, i32* addrspace(4)* [[R0]], align 8
+// OPT: [[G0:%.*]] = addrspacecast i32* [[P0]] to i32 addrspace(1)*
 // OPT: [[R1:%.*]] = getelementptr inbounds %struct.S, %struct.S addrspace(4)* %0, i64 0, i32 1
 // OPT: [[P1:%.*]] = load float*, float* addrspace(4)* [[R1]], align 8
-// OPT: [[V0:%.*]] = load i32, i32* [[P0]], align 4
+// OPT: [[G1:%.*]] = addrspacecast float* [[P1]] to float addrspace(1)*
+// OPT: [[V0:%.*]] = load i32, i32 addrspace(1)* [[G0]], align 4
 // OPT: [[INC:%.*]] = add nsw i32 [[V0]], 1
-// OPT: store i32 [[INC]], i32* [[P0]], align 4
-// OPT: [[V1:%.*]] = load float, float* [[P1]], align 4
+// OPT: store i32 [[INC]], i32 addrspace(1)* [[G0]], align 4
+// OPT: [[V1:%.*]] = load float, float addrspace(1)* [[G1]], align 4
 // OPT: [[ADD:%.*]] = fadd contract float [[V1]], 1.000000e+00
-// OPT: store float [[ADD]], float* [[P1]], align 4
+// OPT: store float [[ADD]], float addrspace(1)* [[G1]], align 4
 // OPT: ret void
 __global__ void kernel4(struct S s) {
   s.x[0]++;
@@ -87,19 +91,24 @@
 struct T {
   float *x[2];
 };
-// `by-val` array is also coerced.
+// `by-val` array is passed by-indirect-alias (a mix of by-ref and indirect
+// by-val). However, the enhanced address space inference pass should be able
+// to assume the pointers loaded from it are global pointers.
+//
 // HOST: define void @_Z22__device_stub__kernel61T(float* %t.coerce0, float* %t.coerce1)
 // COMMON-LABEL: define amdgpu_kernel void @_Z7kernel61T(%struct.T addrspace(4)*{{.*}} byref(%struct.T) align 8 %0)
 // OPT: [[R0:%.*]] = getelementptr inbounds %struct.T, %struct.T addrspace(4)* %0, i64 0, i32 0, i64 0
 // OPT: [[P0:%.*]] = load float*, float* addrspace(4)* [[R0]], align 8
+// OPT: [[G0:%.*]] = addrspacecast float* [[P0]] to float addrspace(1)*
 // OPT: [[R1:%.*]] = getelementptr inbounds %struct.T, %struct.T addrspace(4)* %0, i64 0, i32 0, i64 1
 // OPT: [[P1:%.*]] = load float*, float* addrspace(4)* [[R1]], align 8
-// OPT: [[V0:%.*]] = load float, float* [[P0]], align 4
+// OPT: [[G1:%.*]] = addrspacecast float* [[P1]] to float addrspace(1)*
+// OPT: [[V0:%.*]] = load float, float addrspace(1)* [[G0]], align 4
 // OPT: [[ADD0:%.*]] = fadd contract float [[V0]], 1.000000e+00
-// OPT: store float [[ADD0]], float* [[P0]], align 4
-// OPT: [[V1:%.*]] = load float, float* [[P1]], align 4
+// OPT: store float [[ADD0]], float addrspace(1)* [[G0]], align 4
+// OPT: [[V1:%.*]] = load float, float addrspace(1)* [[G1]], align 4
 // OPT: [[ADD1:%.*]] = fadd contract float [[V1]], 2.000000e+00
-// OPT: store float [[ADD1]], float* [[P1]], align 4
+// OPT: store float [[ADD1]], float addrspace(1)* [[G1]], align 4
 // OPT: ret void
 __global__ void kernel6(struct T t) {
   t.x[0][0] += 1.f;
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -465,9 +465,12 @@
 
   Using the constant address space indicates that the data will not change
   during the execution of the kernel. This allows scalar read instructions to
-  be used. The vector and scalar L1 caches are invalidated of volatile data
-  before each kernel dispatch execution to allow constant memory to change
-  values between kernel dispatches.
+  be used. As the constant address space can only be modified on the host
+  side, a generic pointer loaded from the constant address space can safely
+  be assumed to be a global pointer, since only device global memory is
+  visible to, and managed by, the host. The vector and scalar L1 caches are
+  invalidated of volatile data before each kernel dispatch execution to allow
+  constant memory to change values between kernel dispatches.
 
 **Region**
   The region address space uses the hardware Global Data Store (GDS). All
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -387,6 +387,8 @@
 
   bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const;
 
+  unsigned getAssumedAddrSpace(const Value *V) const;
+
   /// Rewrite intrinsic call \p II such that \p OldV will be replaced with \p
   /// NewV, which has a different address space. This should happen for every
   /// operand index that collectFlatAddressOperands returned for the intrinsic.
@@ -1384,6 +1386,7 @@
   virtual bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
                                           Intrinsic::ID IID) const = 0;
   virtual bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const = 0;
+  virtual unsigned getAssumedAddrSpace(const Value *V) const = 0;
   virtual Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
                                                   Value *OldV,
                                                   Value *NewV) const = 0;
@@ -1677,6 +1680,10 @@
     return Impl.isNoopAddrSpaceCast(FromAS, ToAS);
   }
 
+  unsigned getAssumedAddrSpace(const Value *V) const override {
+    return Impl.getAssumedAddrSpace(V);
+  }
+
   Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
                                           Value *NewV) const override {
     return Impl.rewriteIntrinsicWithAddressSpace(II, OldV, NewV);
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -89,6 +89,8 @@
 
   bool isNoopAddrSpaceCast(unsigned, unsigned) const { return false; }
 
+  unsigned getAssumedAddrSpace(const Value *V) const { return -1; }
+
   Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
                                           Value *NewV) const {
     return nullptr;
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -224,6 +224,10 @@
     return getTLI()->getTargetMachine().isNoopAddrSpaceCast(FromAS, ToAS);
   }
 
+  unsigned getAssumedAddrSpace(const Value *V) const {
+    return getTLI()->getTargetMachine().getAssumedAddrSpace(V);
+  }
+
   Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
                                           Value *NewV) const {
     return nullptr;
diff --git a/llvm/include/llvm/Target/TargetMachine.h b/llvm/include/llvm/Target/TargetMachine.h
--- a/llvm/include/llvm/Target/TargetMachine.h
+++ b/llvm/include/llvm/Target/TargetMachine.h
@@ -284,6 +284,14 @@
     return false;
   }
 
+  /// If the specified generic pointer can be assumed to point into a
+  /// specific address space, return that address space.
+  ///
+  /// In offloading programming models, the offloading target may be passed
+  /// values that were only prepared on the host side, and it can therefore
+  /// assume certain properties of those values.
+  virtual unsigned getAssumedAddrSpace(const Value *V) const { return -1; }
+
   /// Get a \c TargetIRAnalysis appropriate for the target.
   ///
   /// This is used to construct the new pass manager's target IR analysis pass,
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -297,6 +297,10 @@
   return TTIImpl->isNoopAddrSpaceCast(FromAS, ToAS);
 }
 
+unsigned TargetTransformInfo::getAssumedAddrSpace(const Value *V) const {
+  return TTIImpl->getAssumedAddrSpace(V);
+}
+
 Value *TargetTransformInfo::rewriteIntrinsicWithAddressSpace(
     IntrinsicInst *II, Value *OldV, Value *NewV) const {
   return TTIImpl->rewriteIntrinsicWithAddressSpace(II, OldV, NewV);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
@@ -64,6 +64,8 @@
   }
 
   bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override;
+
+  unsigned getAssumedAddrSpace(const Value *V) const override;
 };
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -527,6 +527,25 @@
          AMDGPU::isFlatGlobalAddrSpace(DestAS);
 }
 
+unsigned AMDGPUTargetMachine::getAssumedAddrSpace(const Value *V) const {
+  const auto *LD = dyn_cast<LoadInst>(V);
+  if (!LD)
+    return -1;
+
+  // It must be a load of a generic pointer.
+  assert(V->getType()->isPointerTy() &&
+         V->getType()->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS);
+
+  const auto *Ptr = LD->getPointerOperand();
+  if (Ptr->getType()->getPointerAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS)
+    return -1;
+  // A generic pointer loaded from the constant address space can be assumed
+  // to be a global pointer: constant memory is only populated on the host
+  // side, and the offload programming model only allows global pointers to
+  // be referenced on the host side.
+  return AMDGPUAS::GLOBAL_ADDRESS;
+}
+
 TargetTransformInfo
 R600TargetMachine::getTargetTransformInfo(const Function &F) {
   return TargetTransformInfo(R600TTIImpl(this, F));
diff --git a/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp b/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp
--- a/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp
+++ b/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp
@@ -286,7 +286,8 @@
   case Instruction::IntToPtr:
     return isNoopPtrIntCastPair(Op, DL, TTI);
   default:
-    return false;
+    // The value is an address expression if it has an assumed address space.
+    return TTI->getAssumedAddrSpace(&V) != UninitializedAddressSpace;
   }
 }
 
@@ -394,8 +395,8 @@
     return;
   }
 
-  if (isAddressExpression(*V, *DL, TTI) &&
-      V->getType()->getPointerAddressSpace() == FlatAddrSpace) {
+  if (V->getType()->getPointerAddressSpace() == FlatAddrSpace &&
+      isAddressExpression(*V, *DL, TTI)) {
     if (Visited.insert(V).second) {
       PostorderStack.emplace_back(V, false);
 
@@ -478,9 +479,12 @@
     }
     // Otherwise, adds its operands to the stack and explores them.
    PostorderStack.back().setInt(true);
-    for (Value *PtrOperand : getPointerOperands(*TopVal, *DL, TTI)) {
-      appendsFlatAddressExpressionToPostorderStack(PtrOperand, PostorderStack,
-                                                   Visited);
+    // Skip values with an assumed address space.
+    if (TTI->getAssumedAddrSpace(TopVal) == UninitializedAddressSpace) {
+      for (Value *PtrOperand : getPointerOperands(*TopVal, *DL, TTI)) {
+        appendsFlatAddressExpressionToPostorderStack(PtrOperand, PostorderStack,
+                                                     Visited);
+      }
     }
   }
   return Postorder;
 }
@@ -555,6 +559,16 @@
     return nullptr;
   }
 
+  unsigned AS = TTI->getAssumedAddrSpace(I);
+  if (AS != UninitializedAddressSpace) {
+    // For the assumed address space, insert an `addrspacecast` to make that
+    // explicit.
+    auto *NewPtrTy = I->getType()->getPointerElementType()->getPointerTo(AS);
+    auto *NewI = new AddrSpaceCastInst(I, NewPtrTy);
+    NewI->insertAfter(I);
+    return NewI;
+  }
+
   // Computes the converted pointer operands.
   SmallVector<Value *, 4> NewPointerOperands;
   for (const Use &OperandUse : I->operands()) {
@@ -700,8 +714,8 @@
     const ValueToValueMapTy &ValueWithNewAddrSpace,
     SmallVectorImpl<const Use *> *UndefUsesToFix) const {
   // All values in Postorder are flat address expressions.
-  assert(isAddressExpression(*V, *DL, TTI) &&
-         V->getType()->getPointerAddressSpace() == FlatAddrSpace);
+  assert(V->getType()->getPointerAddressSpace() == FlatAddrSpace &&
+         isAddressExpression(*V, *DL, TTI));
 
   if (Instruction *I = dyn_cast<Instruction>(V)) {
     Value *NewV = cloneInstructionWithNewAddressSpace(
@@ -848,15 +862,24 @@
     else
       NewAS = joinAddressSpaces(Src0AS, Src1AS);
   } else {
-    for (Value *PtrOperand : getPointerOperands(V, *DL, TTI)) {
-      auto I = InferredAddrSpace.find(PtrOperand);
-      unsigned OperandAS = I != InferredAddrSpace.end() ?
-        I->second : PtrOperand->getType()->getPointerAddressSpace();
-
-      // join(flat, *) = flat. So we can break if NewAS is already flat.
-      NewAS = joinAddressSpaces(NewAS, OperandAS);
-      if (NewAS == FlatAddrSpace)
-        break;
+    unsigned AS = TTI->getAssumedAddrSpace(&V);
+    if (AS != UninitializedAddressSpace) {
+      // Use the assumed address space directly.
+      NewAS = AS;
+    } else {
+      // Otherwise, infer the address space from its pointer operands.
+      for (Value *PtrOperand : getPointerOperands(V, *DL, TTI)) {
+        auto I = InferredAddrSpace.find(PtrOperand);
+        unsigned OperandAS =
+            I != InferredAddrSpace.end()
+                ? I->second
+                : PtrOperand->getType()->getPointerAddressSpace();
+
+        // join(flat, *) = flat. So we can break if NewAS is already flat.
+        NewAS = joinAddressSpaces(NewAS, OperandAS);
+        if (NewAS == FlatAddrSpace)
+          break;
+      }
     }
   }
 
@@ -1068,6 +1091,9 @@
       }
 
      User *CurUser = U.getUser();
+      // Skip if the current user is the new value itself.
+      if (CurUser == NewV)
+        continue;
      // Handle more complex cases like intrinsic that need to be remangled.
      if (auto *MI = dyn_cast<MemIntrinsic>(CurUser)) {
        if (!MI->isVolatile() && handleMemIntrinsicPtrUse(MI, V, NewV))
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll
@@ -138,7 +138,7 @@
 ; CHECK-NEXT: s_cselect_b32 s4, 1, 0
 ; CHECK-NEXT: s_and_b32 s4, s4, 1
 ; CHECK-NEXT: s_cmp_lg_u32 s4, 0
-; CHECK-NEXT: s_cbranch_scc1 BB4_6
+; CHECK-NEXT: s_cbranch_scc1 BB4_4
 ; CHECK-NEXT: ; %bb.1: ; %bb2
 ; CHECK-NEXT: s_getpc_b64 s[6:7]
 ; CHECK-NEXT: s_add_u32 s6, s6, const.ptr@gotpcrel32@lo+4
@@ -150,23 +150,23 @@
 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT: v_mov_b32_e32 v0, s6
 ; CHECK-NEXT: v_mov_b32_e32 v1, s7
-; CHECK-NEXT: flat_load_dword v0, v[0:1]
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_cmp_ngt_f32_e32 vcc, 1.0, v0
-; CHECK-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; CHECK-NEXT: global_load_dword v0, v[0:1], off
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_cmp_gt_f32_e32 vcc, 1.0, v0
+; CHECK-NEXT: s_cbranch_vccnz BB4_3
 ; CHECK-NEXT: ; %bb.2: ; %bb7
 ; CHECK-NEXT: s_mov_b32 s4, 0
-; CHECK-NEXT: ; %bb.3: ; %bb8
-; CHECK-NEXT: s_or_b64 exec, exec, s[6:7]
-; CHECK-NEXT: v_cmp_eq_u32_e64 s[6:7], s4, 0
-; CHECK-NEXT: s_and_saveexec_b64 s[4:5], s[6:7]
-; CHECK-NEXT: s_cbranch_execz BB4_5
-; CHECK-NEXT: ; %bb.4: ; %bb11
+; CHECK-NEXT: BB4_3: ; %bb8
+; CHECK-NEXT: s_cmp_lg_u32 s4, 0
+; CHECK-NEXT: s_cselect_b32 s4, 1, 0
+; CHECK-NEXT: s_and_b32 s4, s4, 1
+; CHECK-NEXT: s_cmp_lg_u32 s4, 0
+; CHECK-NEXT: s_cbranch_scc0 BB4_5
+; CHECK-NEXT: BB4_4: ; %bb12
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+; CHECK-NEXT: BB4_5: ; %bb11
 ; CHECK-NEXT: v_mov_b32_e32 v0, 4.0
 ; CHECK-NEXT: buffer_store_dword v0, v0, s[0:3], 0 offen
-; CHECK-NEXT: BB4_5: ; %Flow
-; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
-; CHECK-NEXT: BB4_6: ; %bb12
 ; CHECK-NEXT: s_waitcnt vmcnt(0)
 ; CHECK-NEXT: s_setpc_b64 s[30:31]
 bb:
diff --git a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/assumed-addrspace.ll b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/assumed-addrspace.ll
new file
--- /dev/null
+++ b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/assumed-addrspace.ll
@@ -0,0 +1,31 @@
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -infer-address-spaces -o - %s | FileCheck %s
+
+@c0 = addrspace(4) global float* undef
+
+; CHECK-LABEL: @generic_ptr_from_constant
+; CHECK: addrspacecast float* %p to float addrspace(1)*
+; CHECK-NEXT: load float, float addrspace(1)*
+define float @generic_ptr_from_constant() {
+  %p = load float*, float* addrspace(4)* @c0
+  %v = load float, float* %p
+  ret float %v
+}
+
+%struct.S = type { i32*, float* }
+
+; CHECK-LABEL: @generic_ptr_from_aggregate_argument
+; CHECK: addrspacecast i32* %p0 to i32 addrspace(1)*
+; CHECK: addrspacecast float* %p1 to float addrspace(1)*
+; CHECK: load i32, i32 addrspace(1)*
+; CHECK: store float %v1, float addrspace(1)*
+; CHECK: ret
+define amdgpu_kernel void @generic_ptr_from_aggregate_argument(%struct.S addrspace(4)* byref(%struct.S) align 8 %0) {
+  %f0 = getelementptr inbounds %struct.S, %struct.S addrspace(4)* %0, i64 0, i32 0
+  %p0 = load i32*, i32* addrspace(4)* %f0
+  %f1 = getelementptr inbounds %struct.S, %struct.S addrspace(4)* %0, i64 0, i32 1
+  %p1 = load float*, float* addrspace(4)* %f1
+  %v0 = load i32, i32* %p0
+  %v1 = sitofp i32 %v0 to float
+  store float %v1, float* %p1
+  ret void
+}
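
As a reading aid only (not part of the patch): the sketch below condenses the decision that the AMDGPUTargetMachine::getAssumedAddrSpace override above makes, written as a standalone C++ helper. The free-function name, the early returns in place of the override's assert, and the hard-coded AMDGPU address-space numbers (flat = 0, global = 1, constant = 4, per AMDGPUUsage.rst) are assumptions made for this example.

// Illustrative sketch, not part of the patch: mirrors the decision made by
// AMDGPUTargetMachine::getAssumedAddrSpace above.
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"

using namespace llvm;

// AMDGPU address-space numbers (see AMDGPUUsage.rst).
constexpr unsigned FlatAS = 0;     // generic
constexpr unsigned GlobalAS = 1;   // device global memory
constexpr unsigned ConstantAS = 4; // constant memory, written by the host

// Returns the address space a generic pointer can be assumed to point into,
// or ~0u (InferAddressSpaces' "uninitialized" marker) if nothing can be
// assumed.
static unsigned assumedAddrSpaceForAMDGPU(const Value *V) {
  // Only a load that produces a generic (flat) pointer is a candidate.
  const auto *LD = dyn_cast<LoadInst>(V);
  if (!LD || !LD->getType()->isPointerTy() ||
      LD->getType()->getPointerAddressSpace() != FlatAS)
    return ~0u;

  // Constant memory is only written by the host, and the host only sees
  // global memory, so a pointer read out of constant memory must point into
  // global memory.
  if (LD->getPointerOperand()->getType()->getPointerAddressSpace() !=
      ConstantAS)
    return ~0u;

  return GlobalAS;
}

InferAddressSpaces consumes this hint by treating such a load as an address expression whose address space is already known, and it materializes the assumption as an explicit addrspacecast inserted right after the load, which is what the addrspacecast lines in the new tests above check for.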