diff --git a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
--- a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+++ b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -413,6 +413,18 @@
   /// offset.
   int64_t accumulateByteOffset(GetElementPtrInst *GEP, bool &NeedsExtraction);
 
+  // Before finalizing the GEP with a constant offset, we need to ensure that
+  // the offset is valid in every addressing mode in which it will be used. We
+  // cannot assume that the addressing mode of the GEP itself matches the
+  // addressing modes of all its users, so this function walks the transitive
+  // uses of the GEP and checks that the new offset is legal in each of them.
+  bool traceAndCheckGEPUses(Value *V, int64_t AccumulativeByteOffset,
+                            SmallVectorImpl<Instruction *> &Visited);
+
+  // Helper function to trace ptrtoint uses of the GEP.
+  bool resolvePtrToInt(Value *V, int64_t AccumulativeByteOffset,
+                       SmallVectorImpl<Instruction *> &Visited);
+
   /// Canonicalize array indices to pointer-size integers. This helps to
   /// simplify the logic of splitting a GEP. For example, if a + b is a
   /// pointer-size integer, we have
@@ -842,6 +854,191 @@
   return AccumulativeByteOffset;
 }
 
+bool SeparateConstOffsetFromGEP::resolvePtrToInt(
+    Value *V, int64_t AccumulativeByteOffset,
+    SmallVectorImpl<Instruction *> &Visited) {
+  typedef function_ref<bool(const Instruction *, const Value *)> SpecialCase;
+  SmallVector<SpecialCase> UnableToTrace;
+  SmallVector<SpecialCase> ShouldNotTrace;
+
+  // If it is used as an argument in a function call, we cannot be sure how it
+  // will be used, so we conservatively do not allow this.
+  const SpecialCase isArgInCall = [](const Instruction *Inst, const Value *V) {
+    if (const CallBase *CB = dyn_cast<CallBase>(Inst)) {
+      // We are able to reason about intrinsics
+      return CB->getIntrinsicID() == Intrinsic::not_intrinsic;
+    }
+    return false;
+  };
+
+  UnableToTrace.push_back(isArgInCall);
+
+  // We don't need to trace the bool from comparison instructions.
+  const SpecialCase isCmpInst = [](const Instruction *Inst, const Value *V) {
+    const CmpInst *CI = dyn_cast<CmpInst>(Inst);
+    return CI;
+  };
+
+  ShouldNotTrace.push_back(isCmpInst);
+
+  bool AllUsesValid = true;
+  auto I = V->use_begin();
+  auto E = V->use_end();
+
+  for (; I != E; I++) {
+    bool InterestingCase = false;
+    Instruction *Inst = dyn_cast<Instruction>(I->getUser());
+    if (std::find(Visited.begin(), Visited.end(), Inst) != Visited.end())
+      continue;
+
+    Visited.push_back(Inst);
+
+    for (auto &F : UnableToTrace) {
+      if (F(Inst, V)) {
+        AllUsesValid = false;
+        InterestingCase = true;
+      }
+    }
+
+    for (auto &F : ShouldNotTrace) {
+      if (F(Inst, V))
+        InterestingCase = true;
+    }
+
+    if (IntToPtrInst *PTI = dyn_cast<IntToPtrInst>(Inst)) {
+      InterestingCase = true;
+      // Trace the pointer.
+      AllUsesValid &=
+          traceAndCheckGEPUses(PTI, AccumulativeByteOffset, Visited);
+    }
+
+    if (!InterestingCase)
+      AllUsesValid &= resolvePtrToInt(Inst, AccumulativeByteOffset, Visited);
+
+    if (!AllUsesValid)
+      break;
+  }
+  return AllUsesValid;
+}
+
+bool SeparateConstOffsetFromGEP::traceAndCheckGEPUses(
+    Value *V, int64_t AccumulativeByteOffset,
+    SmallVectorImpl<Instruction *> &Visited) {
+  typedef function_ref<bool(const Instruction *, const Value *)> SpecialCase;
+  SmallVector<SpecialCase> UnableToTrace;
+
+  const SpecialCase isArgInCall = [](const Instruction *Inst, const Value *V) {
+    if (const CallBase *CB = dyn_cast<CallBase>(Inst)) {
+      bool ret = CB->getIntrinsicID() == Intrinsic::not_intrinsic;
+      if (ret) {
+        // If it is not an indirect call, then the ptr must be used as
+        // a function argument.
+        if (!CB->isIndirectCall())
+          return true;
+        // Check that the ptr isn't being used as the fptr that is indirectly
+        // called.
+        auto ArgMatch = std::find(CB->arg_begin(), CB->arg_end(), V);
+        if (ArgMatch != CB->arg_end() && *ArgMatch != CB->getCalledOperand()) {
+          // The GEP ptr is passed as an arg to the function.
+          return true;
+        }
+      }
+    }
+    return false;
+  };
+
+  UnableToTrace.push_back(isArgInCall);
+
+  SmallVector<SpecialCase> MemoryAccessFromGEP;
+
+  const SpecialCase isReadOrWrite = [](const Instruction *Inst,
+                                       const Value *V) {
+    return Inst->mayReadOrWriteMemory();
+  };
+
+  const SpecialCase isFuncPtrInCall = [](const Instruction *Inst,
+                                         const Value *V) {
+    if (const CallBase *CB = dyn_cast<CallBase>(Inst)) {
+      // Direct calls do not use fptrs.
+      if (!CB->isIndirectCall())
+        return false;
+      // Is the value the fptr in the indirect call?
+      return V == CB->getCalledOperand();
+    }
+    return false;
+  };
+
+  const SpecialCase isPtrInTerminator = [](const Instruction *Inst,
+                                           const Value *V) {
+    return Inst->isTerminator();
+  };
+
+  MemoryAccessFromGEP.push_back(isReadOrWrite);
+  MemoryAccessFromGEP.push_back(isFuncPtrInCall);
+  MemoryAccessFromGEP.push_back(isPtrInTerminator);
+
+  SmallVector<SpecialCase> ShouldNotTrace;
+
+  const SpecialCase isCmpInst = [](const Instruction *Inst, const Value *V) {
+    const CmpInst *CI = dyn_cast<CmpInst>(Inst);
+    return CI;
+  };
+
+  ShouldNotTrace.push_back(isCmpInst);
+
+  bool AllUsesValid = true;
+  auto I = V->use_begin();
+  auto E = V->use_end();
+  for (; I != E; I++) {
+    Instruction *Inst = dyn_cast<Instruction>(I->getUser());
+    // Avoid infinite loops by not exploring already encountered instructions.
+    if (std::find(Visited.begin(), Visited.end(), Inst) != Visited.end())
+      continue;
+    Visited.push_back(Inst);
+    bool InterestingCase = false;
+
+    for (auto &F : UnableToTrace) {
+      if (F(Inst, V)) {
+        InterestingCase = true;
+        AllUsesValid = false;
+      }
+    }
+
+    for (auto &F : ShouldNotTrace) {
+      if (F(Inst, V))
+        InterestingCase = true;
+    }
+
+    for (auto &F : MemoryAccessFromGEP) {
+      // If we don't already have a reason to exit and this use accesses memory
+      // using the GEP address, check the legality of the new offset.
+      if (AllUsesValid && F(Inst, V)) {
+        InterestingCase = true;
+        unsigned AddrSpace = V->getType()->getPointerAddressSpace();
+        TargetTransformInfo &TTI = GetTTI(*Inst->getFunction());
+        bool IsValid = TTI.isLegalAddressingMode(
+            V->getType(),
+            /*BaseGV=*/nullptr, AccumulativeByteOffset,
+            /*HasBaseReg=*/true, /*Scale=*/0, AddrSpace);
+        AllUsesValid &= IsValid;
+      }
+    }
+
+    if (PtrToIntInst *PTI = dyn_cast<PtrToIntInst>(Inst)) {
+      InterestingCase = true;
+      AllUsesValid &= resolvePtrToInt(PTI, AccumulativeByteOffset, Visited);
+    }
+
+    if (!InterestingCase)
+      AllUsesValid &=
+          traceAndCheckGEPUses(Inst, AccumulativeByteOffset, Visited);
+
+    if (!AllUsesValid)
+      break;
+  }
+  return AllUsesValid;
+}
+
 void SeparateConstOffsetFromGEP::lowerToSingleIndexGEPs(
     GetElementPtrInst *Variadic, int64_t AccumulativeByteOffset) {
   IRBuilder<> Builder(Variadic);
@@ -994,6 +1191,11 @@
                                     AddrSpace)) {
       return Changed;
     }
+
+    SmallVector<Instruction *> UseChain;
+    if (!traceAndCheckGEPUses(GEP, AccumulativeByteOffset, UseChain)) {
+      return Changed;
+    }
   }
 
   // Remove the constant offset in each sequential index. The resultant GEP
diff --git a/llvm/test/CodeGen/AMDGPU/gep-const-address-space.ll b/llvm/test/CodeGen/AMDGPU/gep-const-address-space.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/gep-const-address-space.ll
@@ -0,0 +1,256 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx90a -O3 < %s | FileCheck %s
+
+declare double @llvm.amdgcn.flat.atomic.fadd.f64.p0f64.f64(double* nocapture, double) #8
+
+define protected amdgpu_kernel void @IllegalGEPConst(i32 %a, double addrspace(1)* %b, double %c) {
+; CHECK-LABEL: IllegalGEPConst:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
+; CHECK-NEXT:    s_load_dword s2, s[0:1], 0x24
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v0, s6
+; CHECK-NEXT:    s_add_i32 s0, s2, -1
+; CHECK-NEXT:    s_ashr_i32 s1, s0, 31
+; CHECK-NEXT:    s_lshl_b64 s[0:1], s[0:1], 3
+; CHECK-NEXT:    s_add_u32 s0, s4, s0
+; CHECK-NEXT:    s_addc_u32 s1, s5, s1
+; CHECK-NEXT:    v_mov_b32_e32 v1, s7
+; CHECK-NEXT:    v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1]
+; CHECK-NEXT:    flat_atomic_add_f64 v[2:3], v[0:1]
+; CHECK-NEXT:    s_endpgm
+entry:
+  %i = add nsw i32 %a, -1
+  %i.2 = sext i32 %i to i64
+  %i.3 = getelementptr inbounds double, double addrspace(1)* %b, i64 %i.2
+  %i.4 = addrspacecast double addrspace(1)* %i.3 to double*
+  %i.5 = tail call contract double @llvm.amdgcn.flat.atomic.fadd.f64.p0f64.f64(double* %i.4, double %c) #23
+  ret void
+}
+
+define protected amdgpu_kernel void @MixedGEP(i32 %a, double addrspace(1)* %b, double %c, double* %d) {
+; CHECK-LABEL: MixedGEP:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x3c
+; CHECK-NEXT:    s_load_dword s8, s[0:1], 0x24
+; CHECK-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v0, s2
+; CHECK-NEXT:    s_add_i32 s0, s8, -1
+; CHECK-NEXT:    s_ashr_i32 s1, s0, 31
+; CHECK-NEXT:    s_lshl_b64 s[0:1], s[0:1], 3
+; CHECK-NEXT:    s_add_u32 s0, s4, s0
+; CHECK-NEXT:    v_mov_b32_e32 v1, s3
+; CHECK-NEXT:    s_addc_u32 s1, s5, s1
+; CHECK-NEXT:    v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; CHECK-NEXT:    flat_atomic_add_f64 v[0:1], v[2:3]
+; CHECK-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; CHECK-NEXT:    flat_atomic_add_f64 v[0:1], v[2:3] offset:1
+; CHECK-NEXT:    s_endpgm
+entry:
+  %i = add nsw i32 %a, -1
+  %i.2 = sext i32 %i to i64
+  %i.3 = getelementptr inbounds double, double addrspace(1)* %b, i64 %i.2
+  br label %bb1
+
+bb1:
+  %i.7 = ptrtoint double addrspace(1)* %i.3 to i64
+  %i.8 = add nsw i64 %i.7, 1
+  %i.9 = inttoptr i64 %i.8 to double addrspace(1)*
+  %i.10 = tail call contract double @llvm.amdgcn.flat.atomic.fadd.f64.p0f64.f64(double * %d, double %c) #23
+  %i.11 = addrspacecast double addrspace(1)* %i.9 to double*
+  %i.12 = tail call contract double @llvm.amdgcn.flat.atomic.fadd.f64.p0f64.f64(double* %i.11, double %c) #23
+  ret void
+}
+
+
+declare double @foo(double addrspace(1) *)
+
+define protected amdgpu_kernel void @GEPAsFnArg(i32 %a, double addrspace(1)* %b) {
+; CHECK-LABEL: GEPAsFnArg:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; CHECK-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; CHECK-NEXT:    s_mov_b32 s38, -1
+; CHECK-NEXT:    s_mov_b32 s39, 0xe00000
+; CHECK-NEXT:    s_add_u32 s36, s36, s11
+; CHECK-NEXT:    s_mov_b32 s14, s10
+; CHECK-NEXT:    s_mov_b32 s12, s8
+; CHECK-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; CHECK-NEXT:    s_load_dword s8, s[4:5], 0x24
+; CHECK-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x2c
+; CHECK-NEXT:    s_addc_u32 s37, s37, 0
+; CHECK-NEXT:    s_mov_b32 s13, s9
+; CHECK-NEXT:    v_mov_b32_e32 v31, v0
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_add_i32 s8, s8, -1
+; CHECK-NEXT:    s_ashr_i32 s9, s8, 31
+; CHECK-NEXT:    s_lshl_b64 s[8:9], s[8:9], 3
+; CHECK-NEXT:    s_add_u32 s15, s6, s8
+; CHECK-NEXT:    s_addc_u32 s18, s7, s9
+; CHECK-NEXT:    s_add_u32 s8, s4, 52
+; CHECK-NEXT:    s_addc_u32 s9, s5, 0
+; CHECK-NEXT:    s_getpc_b64 s[4:5]
+; CHECK-NEXT:    s_add_u32 s4, s4, foo@gotpcrel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s5, s5, foo@gotpcrel32@hi+12
+; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; CHECK-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; CHECK-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; CHECK-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; CHECK-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; CHECK-NEXT:    v_mov_b32_e32 v0, s15
+; CHECK-NEXT:    v_mov_b32_e32 v1, s18
+; CHECK-NEXT:    s_mov_b32 s32, 0
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; CHECK-NEXT:    s_endpgm
+entry:
+  %i = add nsw i32 %a, -1
+  %i.2 = sext i32 %i to i64
+  %i.3 = getelementptr inbounds double, double addrspace(1)* %b, i64 %i.2
+  %i.4 = call double @foo(double addrspace(1)* %i.3)
+  ret void
+}
+
+declare double @bar(i64)
+
+define protected amdgpu_kernel void @GEPAsIntFnArg(i32 %a, double addrspace(1)* %b) {
+; CHECK-LABEL: GEPAsIntFnArg:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; CHECK-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; CHECK-NEXT:    s_mov_b32 s38, -1
+; CHECK-NEXT:    s_mov_b32 s39, 0xe00000
+; CHECK-NEXT:    s_add_u32 s36, s36, s11
+; CHECK-NEXT:    s_mov_b32 s14, s10
+; CHECK-NEXT:    s_mov_b32 s12, s8
+; CHECK-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; CHECK-NEXT:    s_load_dword s8, s[4:5], 0x24
+; CHECK-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x2c
+; CHECK-NEXT:    s_addc_u32 s37, s37, 0
+; CHECK-NEXT:    s_mov_b32 s13, s9
+; CHECK-NEXT:    v_mov_b32_e32 v31, v0
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_add_i32 s8, s8, -1
+; CHECK-NEXT:    s_ashr_i32 s9, s8, 31
+; CHECK-NEXT:    s_lshl_b64 s[8:9], s[8:9], 3
+; CHECK-NEXT:    s_add_u32 s6, s6, s8
+; CHECK-NEXT:    s_addc_u32 s7, s7, s9
+; CHECK-NEXT:    s_lshr_b32 s15, s7, 1
+; CHECK-NEXT:    s_add_u32 s8, s4, 52
+; CHECK-NEXT:    s_addc_u32 s9, s5, 0
+; CHECK-NEXT:    s_getpc_b64 s[4:5]
+; CHECK-NEXT:    s_add_u32 s4, s4, bar@gotpcrel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s5, s5, bar@gotpcrel32@hi+12
+; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; CHECK-NEXT:    v_mov_b32_e32 v1, s6
+; CHECK-NEXT:    v_alignbit_b32 v1, s7, v1, 1
+; CHECK-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; CHECK-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; CHECK-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; CHECK-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; CHECK-NEXT:    v_mov_b32_e32 v0, v1
+; CHECK-NEXT:    v_mov_b32_e32 v1, s15
+; CHECK-NEXT:    s_mov_b32 s32, 0
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; CHECK-NEXT:    s_endpgm
+entry:
+  %i = add nsw i32 %a, -1
+  %i.2 = sext i32 %i to i64
+  %i.3 = getelementptr inbounds double, double addrspace(1)* %b, i64 %i.2
+  %i.4 = ptrtoint double addrspace(1)* %i.3 to i64
+  %i.5 = udiv i64 %i.4, 2
+  %i.6 = call double @bar(i64 %i.5)
+  %i.9 = inttoptr i64 %i.5 to double addrspace(1)*
+  ret void
+}
+
+define protected amdgpu_kernel void @IllegalGEPConstAsFptr(i32 %a, i8 addrspace(1)* %b, i64 %c) {
+; CHECK-LABEL: IllegalGEPConstAsFptr:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; CHECK-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; CHECK-NEXT:    s_mov_b32 s38, -1
+; CHECK-NEXT:    s_mov_b32 s39, 0xe00000
+; CHECK-NEXT:    s_add_u32 s36, s36, s11
+; CHECK-NEXT:    s_mov_b32 s14, s10
+; CHECK-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; CHECK-NEXT:    s_load_dword s6, s[4:5], 0x24
+; CHECK-NEXT:    s_load_dwordx4 s[16:19], s[4:5], 0x2c
+; CHECK-NEXT:    s_addc_u32 s37, s37, 0
+; CHECK-NEXT:    s_mov_b32 s12, s8
+; CHECK-NEXT:    s_mov_b32 s13, s9
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_add_i32 s6, s6, -1
+; CHECK-NEXT:    s_ashr_i32 s7, s6, 31
+; CHECK-NEXT:    s_add_u32 s16, s16, s6
+; CHECK-NEXT:    s_addc_u32 s17, s17, s7
+; CHECK-NEXT:    s_add_u32 s8, s4, 60
+; CHECK-NEXT:    s_addc_u32 s9, s5, 0
+; CHECK-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; CHECK-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; CHECK-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; CHECK-NEXT:    v_mov_b32_e32 v31, v0
+; CHECK-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; CHECK-NEXT:    v_mov_b32_e32 v0, s18
+; CHECK-NEXT:    v_mov_b32_e32 v1, s19
+; CHECK-NEXT:    s_mov_b32 s32, 0
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; CHECK-NEXT:    s_endpgm
+entry:
+  %i = add nsw i32 %a, -1
+  %i.2 = sext i32 %i to i64
+  %i.3 = getelementptr inbounds i8, i8 addrspace(1)* %b, i64 %i.2
+  %i.4 = addrspacecast i8 addrspace(1)* %i.3 to i8*
+  %fct_ptr = bitcast i8* %i.4 to i64 (i64 )*
+  %res = call i64 %fct_ptr(i64 %c)
+  ret void
+}
+
+define protected amdgpu_kernel void @NoInfiniteGEPTracing(i32 %a, double addrspace(1)* %b, double %c) {
+; CHECK-LABEL: NoInfiniteGEPTracing:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_load_dword s2, s[0:1], 0x24
+; CHECK-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_ashr_i32 s3, s2, 31
+; CHECK-NEXT:    s_lshl_b64 s[0:1], s[2:3], 3
+; CHECK-NEXT:    s_add_u32 s0, s4, s0
+; CHECK-NEXT:    s_addc_u32 s1, s5, s1
+; CHECK-NEXT:    s_add_u32 s0, s0, 8
+; CHECK-NEXT:    s_addc_u32 s1, s1, 0
+; CHECK-NEXT:  .LBB5_1: ; %bb0
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    s_cmp_lg_u64 s[0:1], 1
+; CHECK-NEXT:    s_cbranch_scc1 .LBB5_1
+; CHECK-NEXT:    ; %bb.2: ; %bb1
+; CHECK-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; CHECK-NEXT:    v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; CHECK-NEXT:    flat_atomic_add_f64 v[0:1], v[2:3]
+; CHECK-NEXT:    s_endpgm
+entry:
+  %i = add nsw i32 %a, 1
+  %i.2 = sext i32 %i to i64
+  %i.3 = getelementptr inbounds double, double addrspace(1)* %b, i64 %i.2
+  %i.4 = ptrtoint double addrspace(1)* %i.3 to i64
+  br label %bb0
+
+bb0:
+  %phi = phi double addrspace(1)* [ %i.3, %entry ], [ %i.9, %bb0 ]
+  %i.7 = ptrtoint double addrspace(1)* %phi to i64
+  %i.8 = sub nsw i64 %i.7, 1
+  %cmp2 = icmp eq i64 %i.8, 0
+  %i.9 = inttoptr i64 %i.7 to double addrspace(1)*
+  br i1 %cmp2, label %bb1, label %bb0
+
+bb1:
+  %i.10 = addrspacecast double addrspace(1)* %i.9 to double*
+  %i.11 = tail call contract double @llvm.amdgcn.flat.atomic.fadd.f64.p0f64.f64(double* %i.10, double %c) #23
+  ret void
+}
+
+
+attributes #8 = { argmemonly mustprogress nounwind willreturn "target-cpu"="gfx90a" }
+attributes #23 = { nounwind }
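The new check can also be exercised at the IR level. The snippet below is a minimal sketch distilled from the IllegalGEPConst test above; the opt invocation and the separate-const-offset-from-gep pass name are assumptions about how to run the pass standalone and are not part of this patch.

; Assumed standalone invocation (not added by this patch):
;   opt -mtriple=amdgcn-- -mcpu=gfx90a -passes=separate-const-offset-from-gep -S < reduced.ll
; The GEP below is in addrspace(1), but its only memory use is through a flat
; (addrspace 0) pointer, so the constant offset split off by the pass must also
; be a legal flat addressing-mode offset; that is what traceAndCheckGEPUses
; verifies before the GEP is rewritten.
declare double @llvm.amdgcn.flat.atomic.fadd.f64.p0f64.f64(double* nocapture, double)

define void @reduced(i32 %a, double addrspace(1)* %b, double %c) {
entry:
  %i = add nsw i32 %a, -1
  %idx = sext i32 %i to i64
  %gep = getelementptr inbounds double, double addrspace(1)* %b, i64 %idx
  %flat = addrspacecast double addrspace(1)* %gep to double*
  %old = tail call contract double @llvm.amdgcn.flat.atomic.fadd.f64.p0f64.f64(double* %flat, double %c)
  ret void
}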