Index: lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -132,6 +132,8 @@
 
   unsigned getMaxInterleaveFactor(unsigned VF);
 
+  bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const;
+
   int getArithmeticInstrCost(
       unsigned Opcode, Type *Ty,
       TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
Index: lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -288,6 +288,32 @@
   return 8;
 }
 
+bool AMDGPUTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
+                                       MemIntrinsicInfo &Info) const {
+  switch (Inst->getIntrinsicID()) {
+  case Intrinsic::amdgcn_atomic_inc:
+  case Intrinsic::amdgcn_atomic_dec: {
+    auto *Ordering = dyn_cast<ConstantInt>(Inst->getArgOperand(2));
+    auto *Volatile = dyn_cast<ConstantInt>(Inst->getArgOperand(4));
+    if (!Ordering || !Volatile)
+      return false; // Invalid.
+
+    unsigned OrderingVal = Ordering->getZExtValue();
+    if (OrderingVal > static_cast<unsigned>(AtomicOrdering::SequentiallyConsistent))
+      return false;
+
+    Info.PtrVal = Inst->getArgOperand(0);
+    Info.Ordering = static_cast<AtomicOrdering>(OrderingVal);
+    Info.ReadMem = true;
+    Info.WriteMem = true;
+    Info.IsVolatile = !Volatile->isNullValue();
+    return true;
+  }
+  default:
+    return false;
+  }
+}
+
 int AMDGPUTTIImpl::getArithmeticInstrCost(
     unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info,
     TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
Index: lib/Transforms/Scalar/LoopStrengthReduce.cpp
===================================================================
--- lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -777,7 +777,8 @@
 
 /// Returns true if the specified instruction is using the specified value as an
 /// address.
-static bool isAddressUse(Instruction *Inst, Value *OperandVal) {
+static bool isAddressUse(const TargetTransformInfo &TTI,
+                         Instruction *Inst, Value *OperandVal) {
   bool isAddress = isa<LoadInst>(Inst);
   if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
     if (SI->getPointerOperand() == OperandVal)
@@ -786,18 +787,24 @@
     // Addressing modes can also be folded into prefetches and a variety
     // of intrinsics.
     switch (II->getIntrinsicID()) {
-      default: break;
-      case Intrinsic::memset:
-      case Intrinsic::prefetch:
-        if (II->getArgOperand(0) == OperandVal)
-          isAddress = true;
-        break;
-      case Intrinsic::memmove:
-      case Intrinsic::memcpy:
-        if (II->getArgOperand(0) == OperandVal ||
-            II->getArgOperand(1) == OperandVal)
+    case Intrinsic::memset:
+    case Intrinsic::prefetch:
+      if (II->getArgOperand(0) == OperandVal)
+        isAddress = true;
+      break;
+    case Intrinsic::memmove:
+    case Intrinsic::memcpy:
+      if (II->getArgOperand(0) == OperandVal ||
+          II->getArgOperand(1) == OperandVal)
+        isAddress = true;
+      break;
+    default: {
+      MemIntrinsicInfo IntrInfo;
+      if (TTI.getTgtMemIntrinsic(II, IntrInfo)) {
+        if (IntrInfo.PtrVal == OperandVal)
           isAddress = true;
-        break;
+      }
+    }
     }
   } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Inst)) {
     if (RMW->getPointerOperand() == OperandVal)
@@ -810,7 +817,8 @@
 }
 
 /// Return the type of the memory being accessed.
-static MemAccessTy getAccessType(const Instruction *Inst) {
+static MemAccessTy getAccessType(const TargetTransformInfo &TTI,
+                                 Instruction *Inst) {
   MemAccessTy AccessTy(Inst->getType(), MemAccessTy::UnknownAddressSpace);
   if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
     AccessTy.MemTy = SI->getOperand(0)->getType();
@@ -821,6 +829,21 @@
     AccessTy.AddrSpace = RMW->getPointerAddressSpace();
   } else if (const AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(Inst)) {
     AccessTy.AddrSpace = CmpX->getPointerAddressSpace();
+  } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
+    switch (II->getIntrinsicID()) {
+    case Intrinsic::prefetch:
+      AccessTy.AddrSpace = II->getArgOperand(0)->getType()->getPointerAddressSpace();
+      break;
+    default: {
+      MemIntrinsicInfo IntrInfo;
+      if (TTI.getTgtMemIntrinsic(II, IntrInfo) && IntrInfo.PtrVal) {
+        AccessTy.AddrSpace
+          = IntrInfo.PtrVal->getType()->getPointerAddressSpace();
+      }
+
+      break;
+    }
+    }
   }
 
   // All pointers have the same requirements, so canonicalize them to an
@@ -1025,7 +1048,7 @@
                    ScalarEvolution &SE, DominatorTree &DT,
                    SmallPtrSetImpl<const SCEV *> *LoserRegs);
 };
-  
+
 /// An operand value in an instruction which is to be replaced with some
 /// equivalent, possibly strength-reduced, replacement.
 struct LSRFixup {
@@ -1149,7 +1172,7 @@
     if (f.Offset < MinOffset)
       MinOffset = f.Offset;
   }
-  
+
   bool HasFormulaWithSameRegs(const Formula &F) const;
   float getNotSelectedProbability(const SCEV *Reg) const;
   bool InsertFormula(const Formula &F, const Loop &L);
@@ -2362,7 +2385,7 @@
           C->getValue().isMinSignedValue())
         goto decline_post_inc;
       // Check for possible scaled-address reuse.
-      MemAccessTy AccessTy = getAccessType(UI->getUser());
+      MemAccessTy AccessTy = getAccessType(TTI, UI->getUser());
       int64_t Scale = C->getSExtValue();
       if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr,
                                     /*BaseOffset=*/0,
@@ -3032,13 +3055,13 @@
 static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst,
                              Value *Operand, const TargetTransformInfo &TTI) {
   const SCEVConstant *IncConst = dyn_cast<SCEVConstant>(IncExpr);
-  if (!IncConst || !isAddressUse(UserInst, Operand))
+  if (!IncConst || !isAddressUse(TTI, UserInst, Operand))
     return false;
 
   if (IncConst->getAPInt().getMinSignedBits() > 64)
     return false;
 
-  MemAccessTy AccessTy = getAccessType(UserInst);
+  MemAccessTy AccessTy = getAccessType(TTI, UserInst);
   int64_t IncOffset = IncConst->getValue()->getSExtValue();
   if (!isAlwaysFoldable(TTI, LSRUse::Address, AccessTy, /*BaseGV=*/nullptr,
                         IncOffset, /*HaseBaseReg=*/false))
@@ -3165,14 +3188,14 @@
 
     LSRUse::KindType Kind = LSRUse::Basic;
     MemAccessTy AccessTy;
-    if (isAddressUse(UserInst, U.getOperandValToReplace())) {
+    if (isAddressUse(TTI, UserInst, U.getOperandValToReplace())) {
       Kind = LSRUse::Address;
-      AccessTy = getAccessType(UserInst);
+      AccessTy = getAccessType(TTI, UserInst);
     }
 
     const SCEV *S = IU.getExpr(U);
     PostIncLoopSet TmpPostIncLoops = U.getPostIncLoops();
-    
+
     // Equality (== and !=) ICmps are special. We can rewrite (i == N) as
     // (N - i == 0), and this allows (N - i) to be the expression that we work
     // with rather than just N or i, so we can consider the register
@@ -4304,7 +4327,7 @@
         LUThatHas->pushFixup(Fixup);
        DEBUG(dbgs() << "New fixup has offset " << Fixup.Offset << '\n');
       }
-      
+
       // Delete formulae from the new use which are no longer legal.
       bool Any = false;
       for (size_t i = 0, e = LUThatHas->Formulae.size(); i != e; ++i) {
Index: test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll
@@ -407,6 +407,19 @@
   ret void
 }
 
+; GCN-LABEL: {{^}}nocse_lds_atomic_inc_ret_i32:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
+; GCN: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[K]]
+; GCN: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[K]]
+define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(3)* %ptr) #0 {
+  %result0 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 0, i32 0, i1 false)
+  %result1 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 0, i32 0, i1 false)
+
+  store i32 %result0, i32 addrspace(1)* %out0
+  store i32 %result1, i32 addrspace(1)* %out1
+  ret void
+}
+
 attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone }
 attributes #2 = { nounwind argmemonly }
Index: test/Transforms/LoopStrengthReduce/AMDGPU/atomics.ll
===================================================================
--- test/Transforms/LoopStrengthReduce/AMDGPU/atomics.ll
+++ test/Transforms/LoopStrengthReduce/AMDGPU/atomics.ll
@@ -84,4 +84,84 @@
   br i1 %exitcond, label %._crit_edge.loopexit, label %.lr.ph
 }
 
-attributes #0 = { nounwind }
\ No newline at end of file
+; OPT-LABEL: @test_local_atomicinc_addressing_loop_uniform_index_max_offset_i32(
+; OPT-NOT: getelementptr
+
+; OPT: .lr.ph:
+; OPT: %lsr.iv2 = phi i32 addrspace(3)* [ %scevgep3, %.lr.ph ], [ %arg1, %.lr.ph.preheader ]
+; OPT: %lsr.iv1 = phi i32 addrspace(3)* [ %scevgep, %.lr.ph ], [ %arg0, %.lr.ph.preheader ]
+; OPT: %lsr.iv = phi i32 [ %lsr.iv.next, %.lr.ph ], [ %n, %.lr.ph.preheader ]
+; OPT: %scevgep4 = getelementptr i32, i32 addrspace(3)* %lsr.iv2, i32 16383
+; OPT: %tmp4 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %scevgep4, i32 undef, i32 0, i32 0, i1 false)
+; OPT: %tmp7 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %lsr.iv1, i32 undef, i32 0, i32 0, i1 false)
+define amdgpu_kernel void @test_local_atomicinc_addressing_loop_uniform_index_max_offset_i32(i32 addrspace(3)* noalias nocapture %arg0, i32 addrspace(3)* noalias nocapture readonly %arg1, i32 %n) #0 {
+bb:
+  %tmp = icmp sgt i32 %n, 0
+  br i1 %tmp, label %.lr.ph.preheader, label %._crit_edge
+
+.lr.ph.preheader:                                 ; preds = %bb
+  br label %.lr.ph
+
+._crit_edge.loopexit:                             ; preds = %.lr.ph
+  br label %._crit_edge
+
+._crit_edge:                                      ; preds = %._crit_edge.loopexit, %bb
+  ret void
+
+.lr.ph:                                           ; preds = %.lr.ph, %.lr.ph.preheader
+  %indvars.iv = phi i32 [ %indvars.iv.next, %.lr.ph ], [ 0, %.lr.ph.preheader ]
+  %tmp1 = add nuw nsw i32 %indvars.iv, 16383
+  %tmp3 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 %tmp1
+  %tmp4 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %tmp3, i32 undef, i32 0, i32 0, i1 false)
+  %tmp6 = getelementptr inbounds i32, i32 addrspace(3)* %arg0, i32 %indvars.iv
+  %tmp7 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %tmp6, i32 undef, i32 0, i32 0, i1 false)
+  %tmp8 = add nsw i32 %tmp7, %tmp4
+  atomicrmw add i32 addrspace(3)* %tmp6, i32 %tmp8 seq_cst
+  %indvars.iv.next = add nuw nsw i32 %indvars.iv, 1
+  %exitcond = icmp eq i32 %indvars.iv.next, %n
+  br i1 %exitcond, label %._crit_edge.loopexit, label %.lr.ph
+}
+
+; OPT-LABEL: @test_local_atomicdec_addressing_loop_uniform_index_max_offset_i32(
+; OPT-NOT: getelementptr
+
+; OPT: .lr.ph:
+; OPT: %lsr.iv2 = phi i32 addrspace(3)* [ %scevgep3, %.lr.ph ], [ %arg1, %.lr.ph.preheader ]
+; OPT: %lsr.iv1 = phi i32 addrspace(3)* [ %scevgep, %.lr.ph ], [ %arg0, %.lr.ph.preheader ]
+; OPT: %lsr.iv = phi i32 [ %lsr.iv.next, %.lr.ph ], [ %n, %.lr.ph.preheader ]
+; OPT: %scevgep4 = getelementptr i32, i32 addrspace(3)* %lsr.iv2, i32 16383
+; OPT: %tmp4 = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %scevgep4, i32 undef, i32 0, i32 0, i1 false)
+; OPT: %tmp7 = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %lsr.iv1, i32 undef, i32 0, i32 0, i1 false)
+define amdgpu_kernel void @test_local_atomicdec_addressing_loop_uniform_index_max_offset_i32(i32 addrspace(3)* noalias nocapture %arg0, i32 addrspace(3)* noalias nocapture readonly %arg1, i32 %n) #0 {
+bb:
+  %tmp = icmp sgt i32 %n, 0
+  br i1 %tmp, label %.lr.ph.preheader, label %._crit_edge
+
+.lr.ph.preheader:                                 ; preds = %bb
+  br label %.lr.ph
+
+._crit_edge.loopexit:                             ; preds = %.lr.ph
+  br label %._crit_edge
+
+._crit_edge:                                      ; preds = %._crit_edge.loopexit, %bb
+  ret void
+
+.lr.ph:                                           ; preds = %.lr.ph, %.lr.ph.preheader
+  %indvars.iv = phi i32 [ %indvars.iv.next, %.lr.ph ], [ 0, %.lr.ph.preheader ]
+  %tmp1 = add nuw nsw i32 %indvars.iv, 16383
+  %tmp3 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 %tmp1
+  %tmp4 = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %tmp3, i32 undef, i32 0, i32 0, i1 false)
+  %tmp6 = getelementptr inbounds i32, i32 addrspace(3)* %arg0, i32 %indvars.iv
+  %tmp7 = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %tmp6, i32 undef, i32 0, i32 0, i1 false)
+  %tmp8 = add nsw i32 %tmp7, %tmp4
+  atomicrmw add i32 addrspace(3)* %tmp6, i32 %tmp8 seq_cst
+  %indvars.iv.next = add nuw nsw i32 %indvars.iv, 1
+  %exitcond = icmp eq i32 %indvars.iv.next, %n
+  br i1 %exitcond, label %._crit_edge.loopexit, label %.lr.ph
+}
+
+declare i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* nocapture, i32, i32, i32, i1) #1
+declare i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* nocapture, i32, i32, i32, i1) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind argmemonly }