diff --git a/clang/test/CodeGen/thinlto-distributed-newpm.ll b/clang/test/CodeGen/thinlto-distributed-newpm.ll
--- a/clang/test/CodeGen/thinlto-distributed-newpm.ll
+++ b/clang/test/CodeGen/thinlto-distributed-newpm.ll
@@ -91,13 +91,13 @@
 ; CHECK-O3: Running pass: CGSCCToFunctionPassAdaptor<{{.*}}PassManager{{.*}}>
 ; CHECK-O: Starting {{.*}}Function pass manager run.
 ; CHECK-O: Running pass: SROA on main
-; These next two can appear in any order since they are accessed as parameters
+; These next three can appear in any order since they are accessed as parameters
 ; on the same call to SROA::runImpl
+; CHECK-O2-DAG: Running analysis: TargetIRAnalysis on main
 ; CHECK-O-DAG: Running analysis: DominatorTreeAnalysis on main
 ; CHECK-O-DAG: Running analysis: AssumptionAnalysis on main
 ; CHECK-O: Running pass: EarlyCSEPass on main
 ; CHECK-O: Running analysis: TargetLibraryAnalysis on main
-; CHECK-O2: Running analysis: TargetIRAnalysis on main
 ; CHECK-O: Running analysis: MemorySSAAnalysis on main
 ; CHECK-O: Running analysis: AAManager on main
 ; CHECK-O: Running analysis: BasicAA on main
diff --git a/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu b/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu
--- a/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu
+++ b/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu
@@ -1,37 +1,52 @@
-// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -emit-llvm -x hip %s -o - | FileCheck %s
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -emit-llvm -x hip %s -o - | FileCheck --check-prefixes=COMMON,CHECK %s
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -emit-llvm -x hip %s -disable-O0-optnone -o - | opt -S -O2 | FileCheck %s --check-prefixes=COMMON,OPT
 // RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm -x hip %s -o - | FileCheck -check-prefix=HOST %s
 
 #include "Inputs/cuda.h"
 
 // Coerced struct from `struct S` without all generic pointers lowered into
 // global ones.
-// CHECK: %struct.S.coerce = type { i32 addrspace(1)*, float addrspace(1)* }
-// CHECK: %struct.T.coerce = type { [2 x float addrspace(1)*] }
+// COMMON: %struct.S.coerce = type { i32 addrspace(1)*, float addrspace(1)* }
+// COMMON: %struct.T.coerce = type { [2 x float addrspace(1)*] }
 
 // On the host-side compilation, generic pointer won't be coerced.
 // HOST-NOT: %struct.S.coerce
 // HOST-NOT: %struct.T.coerce
 
-// CHECK: define amdgpu_kernel void @_Z7kernel1Pi(i32 addrspace(1)* %x.coerce)
 // HOST: define void @_Z22__device_stub__kernel1Pi(i32* %x)
+// COMMON-LABEL: define amdgpu_kernel void @_Z7kernel1Pi(i32 addrspace(1)*{{.*}} %x.coerce)
+// CHECK: = addrspacecast [[TYPE:.*]] addrspace(1)* %{{.*}} to [[TYPE]]*
+// CHECK-NOT: = addrspacecast [[TYPE:.*]] addrspace(1)* %{{.*}} to [[TYPE]]*
+// OPT-NOT: alloca
+// OPT-NOT: inttoptr
 __global__ void kernel1(int *x) {
   x[0]++;
 }
 
-// CHECK: define amdgpu_kernel void @_Z7kernel2Ri(i32 addrspace(1)* nonnull align 4 dereferenceable(4) %x.coerce)
 // HOST: define void @_Z22__device_stub__kernel2Ri(i32* nonnull align 4 dereferenceable(4) %x)
+// COMMON-LABEL: define amdgpu_kernel void @_Z7kernel2Ri(i32 addrspace(1)*{{.*}} nonnull align 4 dereferenceable(4) %x.coerce)
+// CHECK: = addrspacecast [[TYPE:.*]] addrspace(1)* %{{.*}} to [[TYPE]]*
+// CHECK-NOT: = addrspacecast [[TYPE:.*]] addrspace(1)* %{{.*}} to [[TYPE]]*
+// OPT-NOT: alloca
+// OPT-NOT: ptrtoint
+// OPT-NOT: inttoptr
 __global__ void kernel2(int &x) {
   x++;
 }
 
-// CHECK: define amdgpu_kernel void @_Z7kernel3PU3AS2iPU3AS1i(i32 addrspace(2)* %x, i32 addrspace(1)* %y)
 // HOST: define void @_Z22__device_stub__kernel3PU3AS2iPU3AS1i(i32 addrspace(2)* %x, i32 addrspace(1)* %y)
+// CHECK-LABEL: define amdgpu_kernel void @_Z7kernel3PU3AS2iPU3AS1i(i32 addrspace(2)*{{.*}} %x, i32 addrspace(1)*{{.*}} %y)
+// CHECK-NOT: = addrspacecast [[TYPE:.*]] addrspace(1)* %{{.*}} to [[TYPE]]*
 __global__ void kernel3(__attribute__((address_space(2))) int *x,
                         __attribute__((address_space(1))) int *y) {
   y[0] = x[0];
 }
 
-// CHECK: define void @_Z4funcPi(i32* %x)
+// COMMON-LABEL: define void @_Z4funcPi(i32*{{.*}} %x)
+// CHECK-NOT: = addrspacecast [[TYPE:.*]] addrspace(1)* %{{.*}} to [[TYPE]]*
+// OPT-NOT: alloca
+// OPT-NOT: ptrtoint
+// OPT-NOT: inttoptr
 __device__ void func(int *x) {
   x[0]++;
 }
 
@@ -42,16 +57,22 @@
 };
 // `by-val` struct will be coerced into a similar struct with all generic
 // pointers lowered into global ones.
-// CHECK: define amdgpu_kernel void @_Z7kernel41S(%struct.S.coerce %s.coerce)
 // HOST: define void @_Z22__device_stub__kernel41S(i32* %s.coerce0, float* %s.coerce1)
+// COMMON-LABEL: define amdgpu_kernel void @_Z7kernel41S(%struct.S.coerce %s.coerce)
+// OPT-NOT: alloca
+// OPT-NOT: ptrtoint
+// OPT-NOT: inttoptr
 __global__ void kernel4(struct S s) {
   s.x[0]++;
   s.y[0] += 1.f;
 }
 
 // If a pointer to struct is passed, only the pointer itself is coerced into the global one.
-// CHECK: define amdgpu_kernel void @_Z7kernel5P1S(%struct.S addrspace(1)* %s.coerce)
 // HOST: define void @_Z22__device_stub__kernel5P1S(%struct.S* %s)
+// COMMON-LABEL: define amdgpu_kernel void @_Z7kernel5P1S(%struct.S addrspace(1)*{{.*}} %s.coerce)
+// OPT-NOT: alloca
+// OPT-NOT: ptrtoint
+// OPT-NOT: inttoptr
 __global__ void kernel5(struct S *s) {
   s->x[0]++;
   s->y[0] += 1.f;
 }
 
@@ -61,16 +82,22 @@
   float *x[2];
 };
 // `by-val` array is also coerced.
-// CHECK: define amdgpu_kernel void @_Z7kernel61T(%struct.T.coerce %t.coerce)
 // HOST: define void @_Z22__device_stub__kernel61T(float* %t.coerce0, float* %t.coerce1)
+// COMMON-LABEL: define amdgpu_kernel void @_Z7kernel61T(%struct.T.coerce %t.coerce)
+// OPT-NOT: alloca
+// OPT-NOT: ptrtoint
+// OPT-NOT: inttoptr
 __global__ void kernel6(struct T t) {
   t.x[0][0] += 1.f;
   t.x[1][0] += 2.f;
 }
 
 // Check that coerced pointers retain the noalias attribute when qualified with __restrict.
-// CHECK: define amdgpu_kernel void @_Z7kernel7Pi(i32 addrspace(1)* noalias %x.coerce)
 // HOST: define void @_Z22__device_stub__kernel7Pi(i32* noalias %x)
+// COMMON-LABEL: define amdgpu_kernel void @_Z7kernel7Pi(i32 addrspace(1)* noalias{{.*}} %x.coerce)
+// OPT-NOT: alloca
+// OPT-NOT: ptrtoint
+// OPT-NOT: inttoptr
 __global__ void kernel7(int *__restrict x) {
   x[0]++;
 }
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -376,6 +376,8 @@
   bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
                                   Intrinsic::ID IID) const;
 
+  bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const;
+
   /// Rewrite intrinsic call \p II such that \p OldV will be replaced with \p
   /// NewV, which has a different address space. This should happen for every
   /// operand index that collectFlatAddressOperands returned for the intrinsic.
@@ -1245,6 +1247,7 @@
   virtual unsigned getFlatAddressSpace() = 0;
   virtual bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
                                           Intrinsic::ID IID) const = 0;
+  virtual bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const = 0;
   virtual Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
                                                   Value *OldV,
                                                   Value *NewV) const = 0;
@@ -1517,6 +1520,10 @@
     return Impl.collectFlatAddressOperands(OpIndexes, IID);
   }
 
+  bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const override {
+    return Impl.isNoopAddrSpaceCast(FromAS, ToAS);
+  }
+
   Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
                                           Value *NewV) const override {
     return Impl.rewriteIntrinsicWithAddressSpace(II, OldV, NewV);
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -87,6 +87,8 @@
     return false;
   }
 
+  bool isNoopAddrSpaceCast(unsigned, unsigned) const { return false; }
+
   Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
                                           Value *NewV) const {
     return nullptr;
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -222,6 +222,10 @@
     return false;
   }
 
+  bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const {
+    return getTLI()->isNoopAddrSpaceCast(FromAS, ToAS);
+  }
+
   Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
                                           Value *NewV) const {
     return nullptr;
diff --git a/llvm/include/llvm/Transforms/Scalar/SROA.h b/llvm/include/llvm/Transforms/Scalar/SROA.h
--- a/llvm/include/llvm/Transforms/Scalar/SROA.h
+++ b/llvm/include/llvm/Transforms/Scalar/SROA.h
@@ -30,6 +30,7 @@
 class LLVMContext;
 class PHINode;
 class SelectInst;
+class TargetTransformInfo;
 class Use;
 
 /// A private "module" namespace for types and utilities used by SROA. These
@@ -65,6 +66,7 @@
   LLVMContext *C = nullptr;
   DominatorTree *DT = nullptr;
   AssumptionCache *AC = nullptr;
+  const TargetTransformInfo *TTI = nullptr;
 
   /// Worklist of alloca instructions to simplify.
   ///
@@ -120,7 +122,8 @@
   /// Helper used by both the public run method and by the legacy pass.
   PreservedAnalyses runImpl(Function &F, DominatorTree &RunDT,
-                            AssumptionCache &RunAC);
+                            AssumptionCache &RunAC,
+                            const TargetTransformInfo &RunTTI);
 
   bool presplitLoadsAndStores(AllocaInst &AI, sroa::AllocaSlices &AS);
   AllocaInst *rewritePartition(AllocaInst &AI, sroa::AllocaSlices &AS,
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -291,6 +291,11 @@
   return TTIImpl->collectFlatAddressOperands(OpIndexes, IID);
 }
 
+bool TargetTransformInfo::isNoopAddrSpaceCast(unsigned FromAS,
+                                              unsigned ToAS) const {
+  return TTIImpl->isNoopAddrSpaceCast(FromAS, ToAS);
+}
+
 Value *TargetTransformInfo::rewriteIntrinsicWithAddressSpace(
     IntrinsicInst *II, Value *OldV, Value *NewV) const {
   return TTIImpl->rewriteIntrinsicWithAddressSpace(II, OldV, NewV);
diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -41,6 +41,7 @@
 #include "llvm/Analysis/GlobalsModRef.h"
 #include "llvm/Analysis/Loads.h"
 #include "llvm/Analysis/PtrUseVisitor.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Config/llvm-config.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Constant.h"
@@ -1677,7 +1678,9 @@
 /// ensure that we only try to convert viable values. The strategy is that we
 /// will peel off single element struct and array wrappings to get to an
 /// underlying value, and convert that value.
-static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) {
+static bool canConvertValue(const DataLayout &DL,
+                            const TargetTransformInfo &TTI, Type *OldTy,
+                            Type *NewTy) {
   if (OldTy == NewTy)
     return true;
 
@@ -1703,8 +1706,11 @@
   NewTy = NewTy->getScalarType();
   if (NewTy->isPointerTy() || OldTy->isPointerTy()) {
     if (NewTy->isPointerTy() && OldTy->isPointerTy()) {
-      return cast<PointerType>(NewTy)->getPointerAddressSpace() ==
-             cast<PointerType>(OldTy)->getPointerAddressSpace();
+      // Pointers are convertible if they have the same address space or the
+      // address space cast between them is a no-op.
+      unsigned OldAS = cast<PointerType>(OldTy)->getPointerAddressSpace();
+      unsigned NewAS = cast<PointerType>(NewTy)->getPointerAddressSpace();
+      return OldAS == NewAS || TTI.isNoopAddrSpaceCast(OldAS, NewAS);
     }
 
     // We can convert integers to integral pointers, but not to non-integral
@@ -1729,10 +1735,11 @@
 /// This will try various different casting techniques, such as bitcasts,
 /// inttoptr, and ptrtoint casts. Use the \c canConvertValue predicate to test
 /// two types for viability with this routine.
-static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
-                           Type *NewTy) {
+static Value *convertValue(const DataLayout &DL, const TargetTransformInfo &TTI,
+                           IRBuilderTy &IRB, Value *V, Type *NewTy) {
   Type *OldTy = V->getType();
-  assert(canConvertValue(DL, OldTy, NewTy) && "Value not convertable to type");
+  assert(canConvertValue(DL, TTI, OldTy, NewTy) &&
+         "Value not convertable to type");
 
   if (OldTy == NewTy)
     return V;
@@ -1772,6 +1779,9 @@
     return IRB.CreatePtrToInt(V, NewTy);
   }
 
+  if (OldTy->isPtrOrPtrVectorTy() && NewTy->isPtrOrPtrVectorTy())
+    return IRB.CreatePointerBitCastOrAddrSpaceCast(V, NewTy);
+
   return IRB.CreateBitCast(V, NewTy);
 }
 
@@ -1782,7 +1792,8 @@
 static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S,
                                             VectorType *Ty,
                                             uint64_t ElementSize,
-                                            const DataLayout &DL) {
+                                            const DataLayout &DL,
+                                            const TargetTransformInfo &TTI) {
   // First validate the slice offsets.
   uint64_t BeginOffset =
       std::max(S.beginOffset(), P.beginOffset()) - P.beginOffset();
@@ -1826,7 +1837,7 @@
       assert(LTy->isIntegerTy());
       LTy = SplitIntTy;
     }
-    if (!canConvertValue(DL, SliceTy, LTy))
+    if (!canConvertValue(DL, TTI, SliceTy, LTy))
       return false;
   } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
     if (SI->isVolatile())
       return false;
@@ -1836,7 +1847,7 @@
       assert(STy->isIntegerTy());
       STy = SplitIntTy;
     }
-    if (!canConvertValue(DL, STy, SliceTy))
+    if (!canConvertValue(DL, TTI, STy, SliceTy))
       return false;
   } else {
     return false;
   }
@@ -1854,7 +1865,8 @@
 /// SSA value. We only can ensure this for a limited set of operations, and we
 /// don't want to do the rewrites unless we are confident that the result will
 /// be promotable, so we have an early test here.
-static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) {
+static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL,
+                                           const TargetTransformInfo &TTI) {
   // Collect the candidate types for vector-based promotion. Also track whether
   // we have different element types.
   SmallVector<VectorType *, 4> CandidateTys;
@@ -1953,11 +1965,11 @@
     ElementSize /= 8;
 
     for (const Slice &S : P)
-      if (!isVectorPromotionViableForSlice(P, S, VTy, ElementSize, DL))
+      if (!isVectorPromotionViableForSlice(P, S, VTy, ElementSize, DL, TTI))
        return false;
 
     for (const Slice *S : P.splitSliceTails())
-      if (!isVectorPromotionViableForSlice(P, *S, VTy, ElementSize, DL))
+      if (!isVectorPromotionViableForSlice(P, *S, VTy, ElementSize, DL, TTI))
        return false;
 
     return true;
@@ -1973,11 +1985,9 @@
 ///
 /// This implements the necessary checking for the \c isIntegerWideningViable
 /// test below on a single slice of the alloca.
-static bool isIntegerWideningViableForSlice(const Slice &S,
-                                            uint64_t AllocBeginOffset,
-                                            Type *AllocaTy,
-                                            const DataLayout &DL,
-                                            bool &WholeAllocaOp) {
+static bool isIntegerWideningViableForSlice(
+    const Slice &S, uint64_t AllocBeginOffset, Type *AllocaTy,
+    const DataLayout &DL, const TargetTransformInfo &TTI, bool &WholeAllocaOp) {
   uint64_t Size = DL.getTypeStoreSize(AllocaTy).getFixedSize();
   uint64_t RelBegin = S.beginOffset() - AllocBeginOffset;
@@ -2009,7 +2019,7 @@
       if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy).getFixedSize())
         return false;
     } else if (RelBegin != 0 || RelEnd != Size ||
-               !canConvertValue(DL, AllocaTy, LI->getType())) {
+               !canConvertValue(DL, TTI, AllocaTy, LI->getType())) {
       // Non-integer loads need to be convertible from the alloca type so that
       // they are promotable.
       return false;
@@ -2034,7 +2044,7 @@
       if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy).getFixedSize())
         return false;
     } else if (RelBegin != 0 || RelEnd != Size ||
-               !canConvertValue(DL, ValueTy, AllocaTy)) {
+               !canConvertValue(DL, TTI, ValueTy, AllocaTy)) {
       // Non-integer stores need to be convertible to the alloca type so that
       // they are promotable.
       return false;
@@ -2061,7 +2071,8 @@
 /// stores to a particular alloca into wider loads and stores and be able to
 /// promote the resulting alloca.
 static bool isIntegerWideningViable(Partition &P, Type *AllocaTy,
-                                    const DataLayout &DL) {
+                                    const DataLayout &DL,
+                                    const TargetTransformInfo &TTI) {
   uint64_t SizeInBits = DL.getTypeSizeInBits(AllocaTy).getFixedSize();
 
   // Don't create integer types larger than the maximum bitwidth.
   if (SizeInBits > IntegerType::MAX_INT_BITS)
     return false;
@@ -2075,8 +2086,8 @@
   // be converted to the alloca type, whatever that is. We don't want to force
   // the alloca itself to have an integer type if there is a more suitable one.
   Type *IntTy = Type::getIntNTy(AllocaTy->getContext(), SizeInBits);
-  if (!canConvertValue(DL, AllocaTy, IntTy) ||
-      !canConvertValue(DL, IntTy, AllocaTy))
+  if (!canConvertValue(DL, TTI, AllocaTy, IntTy) ||
+      !canConvertValue(DL, TTI, IntTy, AllocaTy))
     return false;
 
   // While examining uses, we ensure that the alloca has a covering load or
@@ -2090,12 +2101,12 @@
       P.begin() != P.end() ? false : DL.isLegalInteger(SizeInBits);
 
   for (const Slice &S : P)
-    if (!isIntegerWideningViableForSlice(S, P.beginOffset(), AllocaTy, DL,
+    if (!isIntegerWideningViableForSlice(S, P.beginOffset(), AllocaTy, DL, TTI,
                                          WholeAllocaOp))
       return false;
 
   for (const Slice *S : P.splitSliceTails())
-    if (!isIntegerWideningViableForSlice(*S, P.beginOffset(), AllocaTy, DL,
+    if (!isIntegerWideningViableForSlice(*S, P.beginOffset(), AllocaTy, DL, TTI,
                                          WholeAllocaOp))
       return false;
 
@@ -2247,6 +2258,7 @@
   using Base = InstVisitor<AllocaSliceRewriter, bool>;
 
   const DataLayout &DL;
+  const TargetTransformInfo &TTI;
   AllocaSlices &AS;
   SROA &Pass;
   AllocaInst &OldAI, &NewAI;
@@ -2296,14 +2308,14 @@
   IRBuilderTy IRB;
 
 public:
-  AllocaSliceRewriter(const DataLayout &DL, AllocaSlices &AS, SROA &Pass,
-                      AllocaInst &OldAI, AllocaInst &NewAI,
-                      uint64_t NewAllocaBeginOffset,
+  AllocaSliceRewriter(const DataLayout &DL, const TargetTransformInfo &TTI,
+                      AllocaSlices &AS, SROA &Pass, AllocaInst &OldAI,
+                      AllocaInst &NewAI, uint64_t NewAllocaBeginOffset,
                       uint64_t NewAllocaEndOffset, bool IsIntegerPromotable,
                       VectorType *PromotableVecTy,
                       SmallSetVector<PHINode *, 8> &PHIUsers,
                       SmallSetVector<SelectInst *, 8> &SelectUsers)
-      : DL(DL), AS(AS), Pass(Pass), OldAI(OldAI), NewAI(NewAI),
+      : DL(DL), TTI(TTI), AS(AS), Pass(Pass), OldAI(OldAI), NewAI(NewAI),
         NewAllocaBeginOffset(NewAllocaBeginOffset),
         NewAllocaEndOffset(NewAllocaEndOffset),
         NewAllocaTy(NewAI.getAllocatedType()),
@@ -2449,7 +2461,7 @@
     assert(!LI.isVolatile());
     Value *V = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
                                      NewAI.getAlign(), "load");
-    V = convertValue(DL, IRB, V, IntTy);
+    V = convertValue(DL, TTI, IRB, V, IntTy);
     assert(NewBeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
     uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
     if (Offset > 0 || NewEndOffset < NewAllocaEndOffset) {
@@ -2490,7 +2502,7 @@
       V = rewriteIntegerLoad(LI);
     } else if (NewBeginOffset == NewAllocaBeginOffset &&
                NewEndOffset == NewAllocaEndOffset &&
-               (canConvertValue(DL, NewAllocaTy, TargetTy) ||
+               (canConvertValue(DL, TTI, NewAllocaTy, TargetTy) ||
                 (IsLoadPastEnd && NewAllocaTy->isIntegerTy() &&
                  TargetTy->isIntegerTy()))) {
       LoadInst *NewLI =
           IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
@@ -2543,7 +2555,7 @@
       V = NewLI;
       IsPtrAdjusted = true;
     }
-    V = convertValue(DL, IRB, V, TargetTy);
+    V = convertValue(DL, TTI, IRB, V, TargetTy);
 
     if (IsSplit) {
       assert(!LI.isVolatile());
@@ -2589,7 +2601,7 @@
                           ? ElementTy
                           : FixedVectorType::get(ElementTy, NumElements);
       if (V->getType() != SliceTy)
-        V = convertValue(DL, IRB, V, SliceTy);
+        V = convertValue(DL, TTI, IRB, V, SliceTy);
 
       // Mix in the existing elements.
       Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
@@ -2612,12 +2624,12 @@
         IntTy->getBitWidth()) {
       Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
                                          NewAI.getAlign(), "oldload");
-      Old = convertValue(DL, IRB, Old, IntTy);
+      Old = convertValue(DL, TTI, IRB, Old, IntTy);
       assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
       uint64_t Offset = BeginOffset - NewAllocaBeginOffset;
       V = insertInteger(DL, IRB, Old, SI.getValueOperand(), Offset, "insert");
     }
-    V = convertValue(DL, IRB, V, NewAllocaTy);
+    V = convertValue(DL, TTI, IRB, V, NewAllocaTy);
     StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlign());
     Store->copyMetadata(SI, {LLVMContext::MD_mem_parallel_loop_access,
                              LLVMContext::MD_access_group});
@@ -2665,7 +2677,7 @@
     StoreInst *NewSI;
     if (NewBeginOffset == NewAllocaBeginOffset &&
         NewEndOffset == NewAllocaEndOffset &&
-        (canConvertValue(DL, V->getType(), NewAllocaTy) ||
+        (canConvertValue(DL, TTI, V->getType(), NewAllocaTy) ||
          (IsStorePastEnd && NewAllocaTy->isIntegerTy() &&
           V->getType()->isIntegerTy()))) {
       // If this is an integer store past the end of slice (and thus the bytes
@@ -2680,7 +2692,7 @@
         V = IRB.CreateTrunc(V, AITy, "load.trunc");
       }
 
-      V = convertValue(DL, IRB, V, NewAllocaTy);
+      V = convertValue(DL, TTI, IRB, V, NewAllocaTy);
       NewSI =
           IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlign(), SI.isVolatile());
     } else {
@@ -2775,7 +2787,7 @@
       const auto Len = C->getZExtValue();
       auto *Int8Ty = IntegerType::getInt8Ty(NewAI.getContext());
       auto *SrcTy = FixedVectorType::get(Int8Ty, Len);
-      return canConvertValue(DL, SrcTy, AllocaTy) &&
+      return canConvertValue(DL, TTI, SrcTy, AllocaTy) &&
             DL.isLegalInteger(DL.getTypeSizeInBits(ScalarTy).getFixedSize());
     }();
@@ -2812,7 +2824,7 @@
       Value *Splat = getIntegerSplat(
           II.getValue(), DL.getTypeSizeInBits(ElementTy).getFixedSize() / 8);
-      Splat = convertValue(DL, IRB, Splat, ElementTy);
+      Splat = convertValue(DL, TTI, IRB, Splat, ElementTy);
       if (NumElements > 1)
         Splat = getVectorSplat(Splat, NumElements);
@@ -2831,14 +2843,14 @@
                     EndOffset != NewAllocaBeginOffset)) {
         Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
                                            NewAI.getAlign(), "oldload");
-        Old = convertValue(DL, IRB, Old, IntTy);
+        Old = convertValue(DL, TTI, IRB, Old, IntTy);
         uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
         V = insertInteger(DL, IRB, Old, V, Offset, "insert");
       } else {
         assert(V->getType() == IntTy &&
                "Wrong type for an alloca wide integer!");
       }
-      V = convertValue(DL, IRB, V, AllocaTy);
+      V = convertValue(DL, TTI, IRB, V, AllocaTy);
     } else {
       // Established these invariants above.
       assert(NewBeginOffset == NewAllocaBeginOffset);
@@ -2849,7 +2861,7 @@
       if (VectorType *AllocaVecTy = dyn_cast<VectorType>(AllocaTy))
         V = getVectorSplat(V, AllocaVecTy->getNumElements());
 
-      V = convertValue(DL, IRB, V, AllocaTy);
+      V = convertValue(DL, TTI, IRB, V, AllocaTy);
     }
 
     StoreInst *New =
@@ -3023,7 +3035,7 @@
     } else if (IntTy && !IsWholeAlloca && !IsDest) {
       Src = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
                                   NewAI.getAlign(), "load");
-      Src = convertValue(DL, IRB, Src, IntTy);
+      Src = convertValue(DL, TTI, IRB, Src, IntTy);
       uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
       Src = extractInteger(DL, IRB, Src, SubIntTy, Offset, "extract");
     } else {
@@ -3041,10 +3053,10 @@
     } else if (IntTy && !IsWholeAlloca && IsDest) {
       Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
                                          NewAI.getAlign(), "oldload");
-      Old = convertValue(DL, IRB, Old, IntTy);
+      Old = convertValue(DL, TTI, IRB, Old, IntTy);
       uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
       Src = insertInteger(DL, IRB, Old, Src, Offset, "insert");
-      Src = convertValue(DL, IRB, Src, NewAllocaTy);
+      Src = convertValue(DL, TTI, IRB, Src, NewAllocaTy);
     }
 
     StoreInst *Store = cast<StoreInst>(
@@ -4231,10 +4243,10 @@
     SliceTy = ArrayType::get(Type::getInt8Ty(*C), P.size());
   assert(DL.getTypeAllocSize(SliceTy).getFixedSize() >= P.size());
 
-  bool IsIntegerPromotable = isIntegerWideningViable(P, SliceTy, DL);
+  bool IsIntegerPromotable = isIntegerWideningViable(P, SliceTy, DL, *TTI);
 
   VectorType *VecTy =
-      IsIntegerPromotable ? nullptr : isVectorPromotionViable(P, DL);
+      IsIntegerPromotable ? nullptr : isVectorPromotionViable(P, DL, *TTI);
   if (VecTy)
     SliceTy = VecTy;
@@ -4277,7 +4289,7 @@
   SmallSetVector<PHINode *, 8> PHIUsers;
   SmallSetVector<SelectInst *, 8> SelectUsers;
 
-  AllocaSliceRewriter Rewriter(DL, AS, *this, AI, *NewAI, P.beginOffset(),
+  AllocaSliceRewriter Rewriter(DL, *TTI, AS, *this, AI, *NewAI, P.beginOffset(),
                                P.endOffset(), IsIntegerPromotable, VecTy,
                                PHIUsers, SelectUsers);
   bool Promotable = true;
@@ -4653,11 +4665,13 @@
 }
 
 PreservedAnalyses SROA::runImpl(Function &F, DominatorTree &RunDT,
-                                AssumptionCache &RunAC) {
+                                AssumptionCache &RunAC,
+                                const TargetTransformInfo &RunTTI) {
   LLVM_DEBUG(dbgs() << "SROA function: " << F.getName() << "\n");
   C = &F.getContext();
   DT = &RunDT;
   AC = &RunAC;
+  TTI = &RunTTI;
 
   BasicBlock &EntryBB = F.getEntryBlock();
   for (BasicBlock::iterator I = EntryBB.begin(), E = std::prev(EntryBB.end());
@@ -4711,7 +4725,8 @@
 PreservedAnalyses SROA::run(Function &F, FunctionAnalysisManager &AM) {
   return runImpl(F, AM.getResult<DominatorTreeAnalysis>(F),
-                 AM.getResult<AssumptionAnalysis>(F));
+                 AM.getResult<AssumptionAnalysis>(F),
+                 AM.getResult<TargetIRAnalysis>(F));
 }
 
 /// A legacy pass for the legacy pass manager that wraps the \c SROA pass.
@@ -4735,13 +4750,15 @@
     auto PA = Impl.runImpl(
         F, getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
-        getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F));
+        getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F),
+        getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F));
     return !PA.areAllPreserved();
   }
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<AssumptionCacheTracker>();
     AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addRequired<TargetTransformInfoWrapperPass>();
     AU.addPreserved<GlobalsAAWrapperPass>();
     AU.setPreservesCFG();
   }
@@ -4757,5 +4774,6 @@
                       "Scalar Replacement Of Aggregates", false, false)
 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
 INITIALIZE_PASS_END(SROALegacyPass, "sroa", "Scalar Replacement Of Aggregates",
                     false, false)
diff --git a/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll
--- a/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll
@@ -123,15 +123,15 @@
 ; CHECK-O-NEXT: Running pass: CGSCCToFunctionPassAdaptor<{{.*}}PassManager{{.*}}>
 ; CHECK-O-NEXT: Starting {{.*}}Function pass manager run.
 ; CHECK-O-NEXT: Running pass: SROA
-; These next two can appear in any order since they are accessed as parameters
+; These next three can appear in any order since they are accessed as parameters
 ; on the same call to SROA::runImpl
+; CHECK-O1-DAG: Running analysis: TargetIRAnalysis on foo
+; CHECK-O2-DAG: Running analysis: TargetIRAnalysis on foo
+; CHECK-Os-DAG: Running analysis: TargetIRAnalysis on foo
+; CHECK-Oz-DAG: Running analysis: TargetIRAnalysis on foo
 ; CHECK-O-DAG: Running analysis: DominatorTreeAnalysis on foo
 ; CHECK-O-DAG: Running analysis: AssumptionAnalysis on foo
 ; CHECK-O-NEXT: Running pass: EarlyCSEPass
-; CHECK-O1-NEXT: Running analysis: TargetIRAnalysis on foo
-; CHECK-O2-NEXT: Running analysis: TargetIRAnalysis on foo
-; CHECK-Os-NEXT: Running analysis: TargetIRAnalysis on foo
-; CHECK-Oz-NEXT: Running analysis: TargetIRAnalysis on foo
 ; CHECK-O-NEXT: Running analysis: MemorySSAAnalysis
 ; CHECK-O23SZ-NEXT: Running pass: SpeculativeExecutionPass
 ; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass
diff --git a/llvm/test/Transforms/SROA/noop-addrspacecast.ll b/llvm/test/Transforms/SROA/noop-addrspacecast.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/SROA/noop-addrspacecast.ll
@@ -0,0 +1,19 @@
+; RUN: opt -S -o - -sroa %s | FileCheck %s
+; RUN: opt -S -o - -passes=sroa %s | FileCheck %s
+
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+; CHECK-LABEL: @noop_addrspacecast(
+; CHECK-NEXT: = addrspacecast i32 addrspace(1)* %{{.*}} to i32*
+; CHECK-NEXT: store i32 0, i32* %{{.*}}
+; CHECK-NEXT: ret void
+define void @noop_addrspacecast(i32 addrspace(1)* %x.coerce) {
+  %x = alloca i32*, align 8, addrspace(5)
+  %x1 = addrspacecast i32* addrspace(5)* %x to i32**
+  %x2 = bitcast i32** %x1 to i32 addrspace(1)**
+  store i32 addrspace(1)* %x.coerce, i32 addrspace(1)** %x2
+  %x3 = load i32*, i32** %x1
+  store i32 0, i32* %x3
+  ret void
+}
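
For context, and not as part of the patch itself: the new TTI.isNoopAddrSpaceCast(FromAS, ToAS) query that SROA now consults in canConvertValue is ultimately answered by the target. TargetTransformInfo forwards it to its implementation, and BasicTTIImplBase forwards it to TargetLowering::isNoopAddrSpaceCast, as the BasicTTIImpl.h hunk above shows. The standalone C++ sketch below only illustrates the kind of policy such a hook typically encodes, namely that a cast is a no-op when both address spaces use the same pointer representation; the address-space numbers, pointer widths, and the function name are hypothetical and not taken from any in-tree target.

// Minimal sketch (hypothetical target policy, not LLVM API): a cast between
// two address spaces is a "no-op" when both use the same pointer width and
// representation, so no instruction is needed to convert the pointer value.
#include <cassert>
#include <map>

// Hypothetical address-space table: AS number -> pointer width in bits.
// 0 = flat, 1 = global, 3 = a 32-bit space (illustrative only).
static const std::map<unsigned, unsigned> PointerWidthBits = {
    {0, 64},
    {1, 64},
    {3, 32},
};

static bool isNoopAddrSpaceCastSketch(unsigned FromAS, unsigned ToAS) {
  auto From = PointerWidthBits.find(FromAS);
  auto To = PointerWidthBits.find(ToAS);
  // Unknown address spaces: answer conservatively, as the default
  // TargetTransformInfoImplBase::isNoopAddrSpaceCast above does.
  if (From == PointerWidthBits.end() || To == PointerWidthBits.end())
    return false;
  return From->second == To->second;
}

int main() {
  assert(isNoopAddrSpaceCastSketch(1, 0));  // global -> flat: same width.
  assert(!isNoopAddrSpaceCastSketch(3, 0)); // 32-bit -> 64-bit: needs code.
  return 0;
}

On amdgcn, a cast from the global to the flat (generic) address space is representation-preserving in this sense, which is what lets the new noop-addrspacecast.ll test expect SROA to fold the alloca round-trip into a single addrspacecast of the incoming addrspace(1) pointer.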