Index: lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h =================================================================== --- lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -91,6 +91,8 @@ unsigned getMaxInterleaveFactor(unsigned VF); + bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const; + int getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, Index: lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -163,6 +163,33 @@ return 64; } +bool AMDGPUTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst, + MemIntrinsicInfo &Info) const { + switch (Inst->getIntrinsicID()) { + case Intrinsic::amdgcn_atomic_inc: + case Intrinsic::amdgcn_atomic_dec: { + auto *Ordering = dyn_cast(Inst->getArgOperand(1)); + auto *Volatile = dyn_cast(Inst->getArgOperand(4)); + if (!Ordering || !Volatile) + return false; // Invalid. + + unsigned OrderingVal = Ordering->getZExtValue(); + if (OrderingVal > static_cast(AtomicOrdering::SequentiallyConsistent)) + return false; + + Info.PtrVal = Inst->getArgOperand(0); + Info.Ordering = static_cast(OrderingVal); + Info.NumMemRefs = 1; + Info.ReadMem = true; + Info.WriteMem = true; + Info.IsVolatile = !Volatile->isNullValue(); + return true; + } + default: + return false; + } +} + int AMDGPUTTIImpl::getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo, Index: lib/Transforms/Scalar/LoopStrengthReduce.cpp =================================================================== --- lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -766,7 +766,8 @@ /// Returns true if the specified instruction is using the specified value as an /// address. -static bool isAddressUse(Instruction *Inst, Value *OperandVal) { +static bool isAddressUse(const TargetTransformInfo &TTI, + Instruction *Inst, Value *OperandVal) { bool isAddress = isa(Inst); if (StoreInst *SI = dyn_cast(Inst)) { if (SI->getPointerOperand() == OperandVal) @@ -775,11 +776,17 @@ // Addressing modes can also be folded into prefetches and a variety // of intrinsics. switch (II->getIntrinsicID()) { - default: break; - case Intrinsic::prefetch: - if (II->getArgOperand(0) == OperandVal) + case Intrinsic::prefetch: + if (II->getArgOperand(0) == OperandVal) + isAddress = true; + break; + default: { + MemIntrinsicInfo IntrInfo; + if (TTI.getTgtMemIntrinsic(II, IntrInfo)) { + if (IntrInfo.PtrVal == OperandVal) isAddress = true; - break; + } + } } } else if (AtomicRMWInst *RMW = dyn_cast(Inst)) { if (RMW->getPointerOperand() == OperandVal) @@ -792,7 +799,8 @@ } /// Return the type of the memory being accessed. -static MemAccessTy getAccessType(const Instruction *Inst) { +static MemAccessTy getAccessType(const TargetTransformInfo &TTI, + Instruction *Inst) { MemAccessTy AccessTy(Inst->getType(), MemAccessTy::UnknownAddressSpace); if (const StoreInst *SI = dyn_cast(Inst)) { AccessTy.MemTy = SI->getOperand(0)->getType(); @@ -803,6 +811,21 @@ AccessTy.AddrSpace = RMW->getPointerAddressSpace(); } else if (const AtomicCmpXchgInst *CmpX = dyn_cast(Inst)) { AccessTy.AddrSpace = CmpX->getPointerAddressSpace(); + } else if (IntrinsicInst *II = dyn_cast(Inst)) { + switch (II->getIntrinsicID()) { + case Intrinsic::prefetch: + AccessTy.AddrSpace = II->getArgOperand(0)->getType()->getPointerAddressSpace(); + break; + default: { + MemIntrinsicInfo IntrInfo; + if (TTI.getTgtMemIntrinsic(II, IntrInfo) && IntrInfo.PtrVal) { + AccessTy.AddrSpace + = IntrInfo.PtrVal->getType()->getPointerAddressSpace(); + } + + break; + } + } } // All pointers have the same requirements, so canonicalize them to an @@ -2326,7 +2349,7 @@ C->getValue().isMinSignedValue()) goto decline_post_inc; // Check for possible scaled-address reuse. - MemAccessTy AccessTy = getAccessType(UI->getUser()); + MemAccessTy AccessTy = getAccessType(TTI, UI->getUser()); int64_t Scale = C->getSExtValue(); if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr, /*BaseOffset=*/0, @@ -2996,13 +3019,13 @@ static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst, Value *Operand, const TargetTransformInfo &TTI) { const SCEVConstant *IncConst = dyn_cast(IncExpr); - if (!IncConst || !isAddressUse(UserInst, Operand)) + if (!IncConst || !isAddressUse(TTI, UserInst, Operand)) return false; if (IncConst->getAPInt().getMinSignedBits() > 64) return false; - MemAccessTy AccessTy = getAccessType(UserInst); + MemAccessTy AccessTy = getAccessType(TTI, UserInst); int64_t IncOffset = IncConst->getValue()->getSExtValue(); if (!isAlwaysFoldable(TTI, LSRUse::Address, AccessTy, /*BaseGV=*/nullptr, IncOffset, /*HaseBaseReg=*/false)) @@ -3129,9 +3152,9 @@ LSRUse::KindType Kind = LSRUse::Basic; MemAccessTy AccessTy; - if (isAddressUse(UserInst, U.getOperandValToReplace())) { + if (isAddressUse(TTI, UserInst, U.getOperandValToReplace())) { Kind = LSRUse::Address; - AccessTy = getAccessType(UserInst); + AccessTy = getAccessType(TTI, UserInst); } const SCEV *S = IU.getExpr(U); Index: test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll @@ -373,6 +373,19 @@ ret void } +; GCN-LABEL: {{^}}nocse_lds_atomic_inc_ret_i32: +; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 +; GCN: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[K]] +; GCN: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[K]] +define void @nocse_lds_atomic_inc_ret_i32(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(3)* %ptr) #0 { + %result0 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 0, i32 0, i1 false) + %result1 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 0, i32 0, i1 false) + + store i32 %result0, i32 addrspace(1)* %out0 + store i32 %result1, i32 addrspace(1)* %out1 + ret void +} + attributes #0 = { nounwind } attributes #1 = { nounwind readnone } attributes #2 = { nounwind argmemonly } Index: test/Transforms/LoopStrengthReduce/AMDGPU/atomics.ll =================================================================== --- test/Transforms/LoopStrengthReduce/AMDGPU/atomics.ll +++ test/Transforms/LoopStrengthReduce/AMDGPU/atomics.ll @@ -84,4 +84,84 @@ br i1 %exitcond, label %._crit_edge.loopexit, label %.lr.ph } -attributes #0 = { nounwind } \ No newline at end of file +; OPT-LABEL: @test_local_atomicinc_addressing_loop_uniform_index_max_offset_i32( +; OPT-NOT: getelementptr + +; OPT: .lr.ph: +; OPT: %lsr.iv2 = phi i32 addrspace(3)* [ %scevgep3, %.lr.ph ], [ %arg1, %.lr.ph.preheader ] +; OPT: %lsr.iv1 = phi i32 addrspace(3)* [ %scevgep, %.lr.ph ], [ %arg0, %.lr.ph.preheader ] +; OPT: %lsr.iv = phi i32 [ %lsr.iv.next, %.lr.ph ], [ %n, %.lr.ph.preheader ] +; OPT: %scevgep4 = getelementptr i32, i32 addrspace(3)* %lsr.iv2, i32 16383 +; OPT: %tmp4 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %scevgep4, i32 undef) +; OPT: %tmp7 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %lsr.iv1, i32 undef) +define void @test_local_atomicinc_addressing_loop_uniform_index_max_offset_i32(i32 addrspace(3)* noalias nocapture %arg0, i32 addrspace(3)* noalias nocapture readonly %arg1, i32 %n) #0 { +bb: + %tmp = icmp sgt i32 %n, 0 + br i1 %tmp, label %.lr.ph.preheader, label %._crit_edge + +.lr.ph.preheader: ; preds = %bb + br label %.lr.ph + +._crit_edge.loopexit: ; preds = %.lr.ph + br label %._crit_edge + +._crit_edge: ; preds = %._crit_edge.loopexit, %bb + ret void + +.lr.ph: ; preds = %.lr.ph, %.lr.ph.preheader + %indvars.iv = phi i32 [ %indvars.iv.next, %.lr.ph ], [ 0, %.lr.ph.preheader ] + %tmp1 = add nuw nsw i32 %indvars.iv, 16383 + %tmp3 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 %tmp1 + %tmp4 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %tmp3, i32 undef) + %tmp6 = getelementptr inbounds i32, i32 addrspace(3)* %arg0, i32 %indvars.iv + %tmp7 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %tmp6, i32 undef) + %tmp8 = add nsw i32 %tmp7, %tmp4 + atomicrmw add i32 addrspace(3)* %tmp6, i32 %tmp8 seq_cst + %indvars.iv.next = add nuw nsw i32 %indvars.iv, 1 + %exitcond = icmp eq i32 %indvars.iv.next, %n + br i1 %exitcond, label %._crit_edge.loopexit, label %.lr.ph +} + +; OPT-LABEL: @test_local_atomicdec_addressing_loop_uniform_index_max_offset_i32( +; OPT-NOT: getelementptr + +; OPT: .lr.ph: +; OPT: %lsr.iv2 = phi i32 addrspace(3)* [ %scevgep3, %.lr.ph ], [ %arg1, %.lr.ph.preheader ] +; OPT: %lsr.iv1 = phi i32 addrspace(3)* [ %scevgep, %.lr.ph ], [ %arg0, %.lr.ph.preheader ] +; OPT: %lsr.iv = phi i32 [ %lsr.iv.next, %.lr.ph ], [ %n, %.lr.ph.preheader ] +; OPT: %scevgep4 = getelementptr i32, i32 addrspace(3)* %lsr.iv2, i32 16383 +; OPT: %tmp4 = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %scevgep4, i32 undef) +; OPT: %tmp7 = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %lsr.iv1, i32 undef) +define void @test_local_atomicdec_addressing_loop_uniform_index_max_offset_i32(i32 addrspace(3)* noalias nocapture %arg0, i32 addrspace(3)* noalias nocapture readonly %arg1, i32 %n) #0 { +bb: + %tmp = icmp sgt i32 %n, 0 + br i1 %tmp, label %.lr.ph.preheader, label %._crit_edge + +.lr.ph.preheader: ; preds = %bb + br label %.lr.ph + +._crit_edge.loopexit: ; preds = %.lr.ph + br label %._crit_edge + +._crit_edge: ; preds = %._crit_edge.loopexit, %bb + ret void + +.lr.ph: ; preds = %.lr.ph, %.lr.ph.preheader + %indvars.iv = phi i32 [ %indvars.iv.next, %.lr.ph ], [ 0, %.lr.ph.preheader ] + %tmp1 = add nuw nsw i32 %indvars.iv, 16383 + %tmp3 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 %tmp1 + %tmp4 = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %tmp3, i32 undef) + %tmp6 = getelementptr inbounds i32, i32 addrspace(3)* %arg0, i32 %indvars.iv + %tmp7 = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %tmp6, i32 undef) + %tmp8 = add nsw i32 %tmp7, %tmp4 + atomicrmw add i32 addrspace(3)* %tmp6, i32 %tmp8 seq_cst + %indvars.iv.next = add nuw nsw i32 %indvars.iv, 1 + %exitcond = icmp eq i32 %indvars.iv.next, %n + br i1 %exitcond, label %._crit_edge.loopexit, label %.lr.ph +} + +declare i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* nocapture, i32) #1 +declare i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* nocapture, i32) #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind argmemonly }