Summary (descriptive text only; patch tools skip everything before the first
"Index:" header):

  * AMDGPU TTI: add AMDGPUTTIImpl::getTgtMemIntrinsic, which reports
    llvm.amdgcn.atomic.inc / llvm.amdgcn.atomic.dec as memory intrinsics that
    both read and write through their first argument.
  * LoopStrengthReduce: isAddressUse now takes the TargetTransformInfo and
    treats the pointer reported by getTgtMemIntrinsic as an address use;
    getAccessType records the pointer address space for prefetch and for the
    two AMDGPU atomic intrinsics.
  * Tests: a no-CSE codegen test for two identical LDS atomic.inc calls, and
    LSR addressing tests for atomic.inc / atomic.dec in address space 3.

NOTE(review): the template arguments on dyn_cast<> in the context lines were
restored from the declared type of the variable being initialised. The
argument of isa<LoadInst> is assumed -- confirm against LoopStrengthReduce.cpp.

Index: lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -91,6 +91,8 @@
 
   unsigned getMaxInterleaveFactor(unsigned VF);
 
+  bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const;
+
   int getArithmeticInstrCost(
     unsigned Opcode, Type *Ty,
     TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
Index: lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -163,6 +163,26 @@
   return 64;
 }
 
+bool AMDGPUTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
+                                       MemIntrinsicInfo &Info) const {
+  switch (Inst->getIntrinsicID()) {
+  case Intrinsic::amdgcn_atomic_inc:
+  case Intrinsic::amdgcn_atomic_dec: {
+    Info.ReadMem = true;
+    Info.WriteMem = true;
+
+    // FIXME: Should this be false since atomic? EarlyCSE asserts if this is
+    // ever set.
+    Info.IsSimple = true;
+    Info.NumMemRefs = 1;
+    Info.PtrVal = Inst->getArgOperand(0);
+    return true;
+  }
+  default:
+    return false;
+  }
+}
+
 int AMDGPUTTIImpl::getArithmeticInstrCost(
     unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info,
     TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
Index: lib/Transforms/Scalar/LoopStrengthReduce.cpp
===================================================================
--- lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -766,7 +766,8 @@
 
 /// Returns true if the specified instruction is using the specified value as an
 /// address.
-static bool isAddressUse(Instruction *Inst, Value *OperandVal) {
+static bool isAddressUse(const TargetTransformInfo &TTI,
+                         Instruction *Inst, Value *OperandVal) {
   bool isAddress = isa<LoadInst>(Inst);
   if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
     if (SI->getPointerOperand() == OperandVal)
@@ -775,11 +776,17 @@
     // Addressing modes can also be folded into prefetches and a variety
     // of intrinsics.
     switch (II->getIntrinsicID()) {
-      default: break;
-      case Intrinsic::prefetch:
-        if (II->getArgOperand(0) == OperandVal)
+    case Intrinsic::prefetch:
+      if (II->getArgOperand(0) == OperandVal)
+        isAddress = true;
+      break;
+    default: {
+      MemIntrinsicInfo IntrInfo;
+      if (TTI.getTgtMemIntrinsic(II, IntrInfo)) {
+        if (IntrInfo.PtrVal == OperandVal)
           isAddress = true;
-        break;
+      }
+    }
     }
   } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Inst)) {
     if (RMW->getPointerOperand() == OperandVal)
@@ -803,6 +810,16 @@
     AccessTy.AddrSpace = RMW->getPointerAddressSpace();
   } else if (const AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(Inst)) {
     AccessTy.AddrSpace = CmpX->getPointerAddressSpace();
+  } else if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
+    switch (II->getIntrinsicID()) {
+    case Intrinsic::prefetch:
+    case Intrinsic::amdgcn_atomic_inc:
+    case Intrinsic::amdgcn_atomic_dec:
+      AccessTy.AddrSpace = II->getArgOperand(0)->getType()->getPointerAddressSpace();
+      break;
+    default:
+      break;
+    }
   }
 
   // All pointers have the same requirements, so canonicalize them to an
@@ -2996,7 +3013,7 @@
 static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst,
                              Value *Operand, const TargetTransformInfo &TTI) {
   const SCEVConstant *IncConst = dyn_cast<SCEVConstant>(IncExpr);
-  if (!IncConst || !isAddressUse(UserInst, Operand))
+  if (!IncConst || !isAddressUse(TTI, UserInst, Operand))
     return false;
 
   if (IncConst->getAPInt().getMinSignedBits() > 64)
@@ -3129,7 +3146,7 @@
 
     LSRUse::KindType Kind = LSRUse::Basic;
     MemAccessTy AccessTy;
-    if (isAddressUse(UserInst, U.getOperandValToReplace())) {
+    if (isAddressUse(TTI, UserInst, U.getOperandValToReplace())) {
       Kind = LSRUse::Address;
       AccessTy = getAccessType(UserInst);
     }
Index: test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll
@@ -307,15 +307,6 @@
   ret void
 }
 
-attributes #0 = { nounwind }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind argmemonly }
-
-
-
-
-
-
 ; GCN-LABEL: {{^}}flat_atomic_inc_ret_i64:
 ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
@@ -381,3 +372,20 @@
   %result = call i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* %gep, i64 42)
   ret void
 }
+
+; GCN-LABEL: {{^}}nocse_lds_atomic_inc_ret_i32:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
+; GCN: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[K]]
+; GCN: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[K]]
+define void @nocse_lds_atomic_inc_ret_i32(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(3)* %ptr) #0 {
+  %result0 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %ptr, i32 42)
+  %result1 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %ptr, i32 42)
+
+  store i32 %result0, i32 addrspace(1)* %out0
+  store i32 %result1, i32 addrspace(1)* %out1
+  ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind argmemonly }
Index: test/Transforms/LoopStrengthReduce/AMDGPU/atomics.ll
===================================================================
--- test/Transforms/LoopStrengthReduce/AMDGPU/atomics.ll
+++ test/Transforms/LoopStrengthReduce/AMDGPU/atomics.ll
@@ -84,4 +84,84 @@
   br i1 %exitcond, label %._crit_edge.loopexit, label %.lr.ph
 }
 
-attributes #0 = { nounwind }
\ No newline at end of file
+; OPT-LABEL: @test_local_atomicinc_addressing_loop_uniform_index_max_offset_i32(
+; OPT-NOT: getelementptr
+
+; OPT: .lr.ph:
+; OPT: %lsr.iv2 = phi i32 addrspace(3)* [ %scevgep3, %.lr.ph ], [ %arg1, %.lr.ph.preheader ]
+; OPT: %lsr.iv1 = phi i32 addrspace(3)* [ %scevgep, %.lr.ph ], [ %arg0, %.lr.ph.preheader ]
+; OPT: %lsr.iv = phi i32 [ %lsr.iv.next, %.lr.ph ], [ %n, %.lr.ph.preheader ]
+; OPT: %scevgep4 = getelementptr i32, i32 addrspace(3)* %lsr.iv2, i32 16383
+; OPT: %tmp4 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %scevgep4, i32 undef)
+; OPT: %tmp7 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %lsr.iv1, i32 undef)
+define void @test_local_atomicinc_addressing_loop_uniform_index_max_offset_i32(i32 addrspace(3)* noalias nocapture %arg0, i32 addrspace(3)* noalias nocapture readonly %arg1, i32 %n) #0 {
+bb:
+  %tmp = icmp sgt i32 %n, 0
+  br i1 %tmp, label %.lr.ph.preheader, label %._crit_edge
+
+.lr.ph.preheader:                                 ; preds = %bb
+  br label %.lr.ph
+
+._crit_edge.loopexit:                             ; preds = %.lr.ph
+  br label %._crit_edge
+
+._crit_edge:                                      ; preds = %._crit_edge.loopexit, %bb
+  ret void
+
+.lr.ph:                                           ; preds = %.lr.ph, %.lr.ph.preheader
+  %indvars.iv = phi i32 [ %indvars.iv.next, %.lr.ph ], [ 0, %.lr.ph.preheader ]
+  %tmp1 = add nuw nsw i32 %indvars.iv, 16383
+  %tmp3 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 %tmp1
+  %tmp4 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %tmp3, i32 undef)
+  %tmp6 = getelementptr inbounds i32, i32 addrspace(3)* %arg0, i32 %indvars.iv
+  %tmp7 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %tmp6, i32 undef)
+  %tmp8 = add nsw i32 %tmp7, %tmp4
+  atomicrmw add i32 addrspace(3)* %tmp6, i32 %tmp8 seq_cst
+  %indvars.iv.next = add nuw nsw i32 %indvars.iv, 1
+  %exitcond = icmp eq i32 %indvars.iv.next, %n
+  br i1 %exitcond, label %._crit_edge.loopexit, label %.lr.ph
+}
+
+; OPT-LABEL: @test_local_atomicdec_addressing_loop_uniform_index_max_offset_i32(
+; OPT-NOT: getelementptr
+
+; OPT: .lr.ph:
+; OPT: %lsr.iv2 = phi i32 addrspace(3)* [ %scevgep3, %.lr.ph ], [ %arg1, %.lr.ph.preheader ]
+; OPT: %lsr.iv1 = phi i32 addrspace(3)* [ %scevgep, %.lr.ph ], [ %arg0, %.lr.ph.preheader ]
+; OPT: %lsr.iv = phi i32 [ %lsr.iv.next, %.lr.ph ], [ %n, %.lr.ph.preheader ]
+; OPT: %scevgep4 = getelementptr i32, i32 addrspace(3)* %lsr.iv2, i32 16383
+; OPT: %tmp4 = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %scevgep4, i32 undef)
+; OPT: %tmp7 = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %lsr.iv1, i32 undef)
+define void @test_local_atomicdec_addressing_loop_uniform_index_max_offset_i32(i32 addrspace(3)* noalias nocapture %arg0, i32 addrspace(3)* noalias nocapture readonly %arg1, i32 %n) #0 {
+bb:
+  %tmp = icmp sgt i32 %n, 0
+  br i1 %tmp, label %.lr.ph.preheader, label %._crit_edge
+
+.lr.ph.preheader:                                 ; preds = %bb
+  br label %.lr.ph
+
+._crit_edge.loopexit:                             ; preds = %.lr.ph
+  br label %._crit_edge
+
+._crit_edge:                                      ; preds = %._crit_edge.loopexit, %bb
+  ret void
+
+.lr.ph:                                           ; preds = %.lr.ph, %.lr.ph.preheader
+  %indvars.iv = phi i32 [ %indvars.iv.next, %.lr.ph ], [ 0, %.lr.ph.preheader ]
+  %tmp1 = add nuw nsw i32 %indvars.iv, 16383
+  %tmp3 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 %tmp1
+  %tmp4 = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %tmp3, i32 undef)
+  %tmp6 = getelementptr inbounds i32, i32 addrspace(3)* %arg0, i32 %indvars.iv
+  %tmp7 = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %tmp6, i32 undef)
+  %tmp8 = add nsw i32 %tmp7, %tmp4
+  atomicrmw add i32 addrspace(3)* %tmp6, i32 %tmp8 seq_cst
+  %indvars.iv.next = add nuw nsw i32 %indvars.iv, 1
+  %exitcond = icmp eq i32 %indvars.iv.next, %n
+  br i1 %exitcond, label %._crit_edge.loopexit, label %.lr.ph
+}
+
+declare i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* nocapture, i32) #1
+declare i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* nocapture, i32) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind argmemonly }