Index: llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
===================================================================
--- llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -32,22 +32,6 @@
 STATISTIC(NumDeadStore, "Number of dead stores eliminated");
 STATISTIC(NumGlobalCopies, "Number of allocas copied from constant global");
 
-/// pointsToConstantGlobal - Return true if V (possibly indirectly) points to
-/// some part of a constant global variable. This intentionally only accepts
-/// constant expressions because we can't rewrite arbitrary instructions.
-static bool pointsToConstantGlobal(Value *V) {
-  if (GlobalVariable *GV = dyn_cast<GlobalVariable>(V))
-    return GV->isConstant();
-
-  if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V)) {
-    if (CE->getOpcode() == Instruction::BitCast ||
-        CE->getOpcode() == Instruction::AddrSpaceCast ||
-        CE->getOpcode() == Instruction::GetElementPtr)
-      return pointsToConstantGlobal(CE->getOperand(0));
-  }
-  return false;
-}
-
 /// isOnlyCopiedFromConstantGlobal - Recursively walk the uses of a (derived)
 /// pointer to an alloca. Ignore any reads of the pointer, return false if we
 /// see any stores or other unknown uses. If we see pointer arithmetic, keep
@@ -56,7 +40,8 @@
 /// the alloca, and if the source pointer is a pointer to a constant global, we
 /// can optimize this.
 static bool
-isOnlyCopiedFromConstantGlobal(Value *V, MemTransferInst *&TheCopy,
+isOnlyCopiedFromConstantMemory(AliasAnalysis *AA,
+                               Value *V, MemTransferInst *&TheCopy,
                                SmallVectorImpl<Instruction *> &ToDelete) {
   // We track lifetime intrinsics as we encounter them. If we decide to go
   // ahead and replace the value with the global, this lets the caller quickly
@@ -145,7 +130,7 @@
         if (U.getOperandNo() != 0) return false;
 
         // If the source of the memcpy/move is not a constant global, reject it.
-        if (!pointsToConstantGlobal(MI->getSource()))
+        if (!AA->pointsToConstantMemory(MI->getSource()))
           return false;
 
         // Otherwise, the transform is safe. Remember the copy instruction.
@@ -159,10 +144,11 @@
 /// modified by a copy from a constant global. If we can prove this, we can
 /// replace any uses of the alloca with uses of the global directly.
 static MemTransferInst *
-isOnlyCopiedFromConstantGlobal(AllocaInst *AI,
+isOnlyCopiedFromConstantMemory(AliasAnalysis *AA,
+                               AllocaInst *AI,
                                SmallVectorImpl<Instruction *> &ToDelete) {
   MemTransferInst *TheCopy = nullptr;
-  if (isOnlyCopiedFromConstantGlobal(AI, TheCopy, ToDelete))
+  if (isOnlyCopiedFromConstantMemory(AA, AI, TheCopy, ToDelete))
    return TheCopy;
   return nullptr;
 }
@@ -391,13 +377,13 @@
 
   if (AI.getAlignment()) {
     // Check to see if this allocation is only modified by a memcpy/memmove from
-    // a constant global whose alignment is equal to or exceeds that of the
-    // allocation. If this is the case, we can change all users to use
-    // the constant global instead. This is commonly produced by the CFE by
-    // constructs like "void foo() { int A[] = {1,2,3,4,5,6,7,8,9...}; }" if 'A'
-    // is only subsequently read.
+    // a constant whose alignment is equal to or exceeds that of the allocation.
+    // If this is the case, we can change all users to use the constant global
+    // instead. This is commonly produced by the CFE by constructs like "void
+    // foo() { int A[] = {1,2,3,4,5,6,7,8,9...}; }" if 'A' is only subsequently
+    // read.
     SmallVector<Instruction *, 4> ToDelete;
-    if (MemTransferInst *Copy = isOnlyCopiedFromConstantGlobal(&AI, ToDelete)) {
+    if (MemTransferInst *Copy = isOnlyCopiedFromConstantMemory(AA, &AI, ToDelete)) {
       MaybeAlign AllocaAlign = AI.getAlign();
       Align SourceAlign = getOrEnforceKnownAlignment(
           Copy->getSource(), AllocaAlign, DL, &AI, &AC, &DT);
@@ -407,12 +393,12 @@
       LLVM_DEBUG(dbgs() << "  memcpy = " << *Copy << '\n');
       for (unsigned i = 0, e = ToDelete.size(); i != e; ++i)
         eraseInstFromFunction(*ToDelete[i]);
-      Constant *TheSrc = cast<Constant>(Copy->getSource());
+      Value *TheSrc = Copy->getSource();
       auto *SrcTy = TheSrc->getType();
       auto *DestTy = PointerType::get(AI.getType()->getPointerElementType(),
                                       SrcTy->getPointerAddressSpace());
-      Constant *Cast =
-          ConstantExpr::getPointerBitCastOrAddrSpaceCast(TheSrc, DestTy);
+      Value *Cast =
+          Builder.CreatePointerBitCastOrAddrSpaceCast(TheSrc, DestTy);
       if (AI.getType()->getPointerAddressSpace() ==
           SrcTy->getPointerAddressSpace()) {
         Instruction *NewI = replaceInstUsesWith(AI, Cast);
Index: llvm/test/Transforms/InstCombine/AMDGPU/memcpy-from-constant.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/InstCombine/AMDGPU/memcpy-from-constant.ll
@@ -0,0 +1,92 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -amdgpu-aa-wrapper -amdgpu-aa -instcombine -o - %s | FileCheck %s
+
+; Make sure the optimization from memcpy-from-global.ll happens, but
+; the constant source is not a global variable.
+
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
+
+; Simple memcpy to alloca from constant address space argument.
+define i8 @memcpy_constant_arg_ptr_to_alloca([32 x i8] addrspace(4)* noalias readonly align 4 dereferenceable(32) %arg, i32 %idx) {
+; CHECK-LABEL: @memcpy_constant_arg_ptr_to_alloca(
+; CHECK-NEXT:    [[TMP1:%.*]] = sext i32 [[IDX:%.*]] to i64
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr [32 x i8], [32 x i8] addrspace(4)* [[ARG:%.*]], i64 0, i64 [[TMP1]]
+; CHECK-NEXT:    [[LOAD:%.*]] = load i8, i8 addrspace(4)* [[GEP]], align 1
+; CHECK-NEXT:    ret i8 [[LOAD]]
+;
+  %alloca = alloca [32 x i8], align 4, addrspace(5)
+  %alloca.cast = bitcast [32 x i8] addrspace(5)* %alloca to i8 addrspace(5)*
+  %arg.cast = bitcast [32 x i8] addrspace(4)* %arg to i8 addrspace(4)*
+  call void @llvm.memcpy.p5i8.p4i8.i64(i8 addrspace(5)* %alloca.cast, i8 addrspace(4)* %arg.cast, i64 32, i1 false)
+  %gep = getelementptr inbounds [32 x i8], [32 x i8] addrspace(5)* %alloca, i32 0, i32 %idx
+  %load = load i8, i8 addrspace(5)* %gep
+  ret i8 %load
+}
+
+; Simple memcpy to alloca from constant address space intrinsic call
+define amdgpu_kernel void @memcpy_constant_intrinsic_ptr_to_alloca(i8 addrspace(1)* %out, i32 %idx) {
+; CHECK-LABEL: @memcpy_constant_intrinsic_ptr_to_alloca(
+; CHECK-NEXT:    [[KERNARG_SEGMENT_PTR:%.*]] = call align 16 dereferenceable(32) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+; CHECK-NEXT:    [[TMP1:%.*]] = sext i32 [[IDX:%.*]] to i64
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERNARG_SEGMENT_PTR]], i64 [[TMP1]]
+; CHECK-NEXT:    [[LOAD:%.*]] = load i8, i8 addrspace(4)* [[GEP]], align 1
+; CHECK-NEXT:    store i8 [[LOAD]], i8 addrspace(1)* [[OUT:%.*]], align 1
+; CHECK-NEXT:    ret void
+;
+  %alloca = alloca [32 x i8], align 4, addrspace(5)
+  %alloca.cast = bitcast [32 x i8] addrspace(5)* %alloca to i8 addrspace(5)*
+  %kernarg.segment.ptr = call dereferenceable(32) align 16 i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+  call void @llvm.memcpy.p5i8.p4i8.i64(i8 addrspace(5)* %alloca.cast, i8 addrspace(4)* %kernarg.segment.ptr, i64 32, i1 false)
+  %gep = getelementptr inbounds [32 x i8], [32 x i8] addrspace(5)* %alloca, i32 0, i32 %idx
+  %load = load i8, i8 addrspace(5)* %gep
+  store i8 %load, i8 addrspace(1)* %out
+  ret void
+}
+
+; Alloca is written through a flat pointer
+define i8 @memcpy_constant_arg_ptr_to_alloca_addrspacecast_to_flat([32 x i8] addrspace(4)* noalias readonly align 4 dereferenceable(32) %arg, i32 %idx) {
+; CHECK-LABEL: @memcpy_constant_arg_ptr_to_alloca_addrspacecast_to_flat(
+; CHECK-NEXT:    [[TMP1:%.*]] = sext i32 [[IDX:%.*]] to i64
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr [32 x i8], [32 x i8] addrspace(4)* [[ARG:%.*]], i64 0, i64 [[TMP1]]
+; CHECK-NEXT:    [[LOAD:%.*]] = load i8, i8 addrspace(4)* [[GEP]], align 1
+; CHECK-NEXT:    ret i8 [[LOAD]]
+;
+  %alloca = alloca [32 x i8], align 4, addrspace(5)
+  %alloca.cast = bitcast [32 x i8] addrspace(5)* %alloca to i8 addrspace(5)*
+  %alloca.cast.asc = addrspacecast i8 addrspace(5)* %alloca.cast to i8*
+  %arg.cast = bitcast [32 x i8] addrspace(4)* %arg to i8 addrspace(4)*
+  call void @llvm.memcpy.p0i8.p4i8.i64(i8* %alloca.cast.asc, i8 addrspace(4)* %arg.cast, i64 32, i1 false)
+  %gep = getelementptr inbounds [32 x i8], [32 x i8] addrspace(5)* %alloca, i32 0, i32 %idx
+  %load = load i8, i8 addrspace(5)* %gep
+  ret i8 %load
+}
+
+; Alloca is only addressed through flat pointer.
+define i8 @memcpy_constant_arg_ptr_to_alloca_addrspacecast_to_flat2([32 x i8] addrspace(4)* noalias readonly align 4 dereferenceable(32) %arg, i32 %idx) {
+; CHECK-LABEL: @memcpy_constant_arg_ptr_to_alloca_addrspacecast_to_flat2(
+; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [32 x i8], align 4, addrspace(5)
+; CHECK-NEXT:    [[ALLOCA_CAST1:%.*]] = getelementptr inbounds [32 x i8], [32 x i8] addrspace(5)* [[ALLOCA]], i32 0, i32 0
+; CHECK-NEXT:    [[ALLOCA_CAST:%.*]] = addrspacecast i8 addrspace(5)* [[ALLOCA_CAST1]] to i8*
+; CHECK-NEXT:    [[ARG_CAST:%.*]] = getelementptr inbounds [32 x i8], [32 x i8] addrspace(4)* [[ARG:%.*]], i64 0, i64 0
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p4i8.i64(i8* nonnull align 1 dereferenceable(32) [[ALLOCA_CAST]], i8 addrspace(4)* align 4 dereferenceable(32) [[ARG_CAST]], i64 32, i1 false)
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds [32 x i8], [32 x i8] addrspace(5)* [[ALLOCA]], i32 0, i32 [[IDX:%.*]]
+; CHECK-NEXT:    [[GEP:%.*]] = addrspacecast i8 addrspace(5)* [[GEP2]] to i8*
+; CHECK-NEXT:    [[LOAD:%.*]] = load i8, i8* [[GEP]], align 1
+; CHECK-NEXT:    ret i8 [[LOAD]]
+;
+  %alloca = alloca [32 x i8], align 4, addrspace(5)
+  %alloca.cast.asc = addrspacecast [32 x i8] addrspace(5)* %alloca to [32 x i8]*
+  %alloca.cast = bitcast [32 x i8]* %alloca.cast.asc to i8*
+  %arg.cast = bitcast [32 x i8] addrspace(4)* %arg to i8 addrspace(4)*
+  call void @llvm.memcpy.p0i8.p4i8.i64(i8* %alloca.cast, i8 addrspace(4)* %arg.cast, i64 32, i1 false)
+  %gep = getelementptr inbounds [32 x i8], [32 x i8]* %alloca.cast.asc, i32 0, i32 %idx
+  %load = load i8, i8* %gep
+  ret i8 %load
+}
+
+declare void @llvm.memcpy.p5i8.p4i8.i64(i8 addrspace(5)* nocapture, i8 addrspace(4)* nocapture, i64, i1) #0
+declare void @llvm.memcpy.p0i8.p4i8.i64(i8* nocapture, i8 addrspace(4)* nocapture, i64, i1) #0
+declare i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #1
+
+attributes #0 = { argmemonly nounwind willreturn }
+attributes #1 = { nounwind readnone speculatable }