Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -870,6 +870,8 @@
   setTargetDAGCombine(ISD::ATOMIC_LOAD_UMIN);
   setTargetDAGCombine(ISD::ATOMIC_LOAD_UMAX);
   setTargetDAGCombine(ISD::ATOMIC_LOAD_FADD);
+  setTargetDAGCombine(ISD::INTRINSIC_VOID);
+  setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
 
   // FIXME: In other contexts we pretend this is a per-function property.
   setStackPointerRegisterToSaveRestore(AMDGPU::SGPR32);
@@ -8563,14 +8565,28 @@
   return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags);
 }
 
+/// MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset
+/// by the chain and intrinsic ID. Theoretically we would also need to check the
+/// specific intrinsic.
+static unsigned getBasePtrIndex(const MemSDNode *N) {
+  switch (N->getOpcode()) {
+  case ISD::STORE:
+  case ISD::INTRINSIC_W_CHAIN:
+  case ISD::INTRINSIC_VOID:
+    return 2;
+  default:
+    return 1;
+  }
+}
+
 SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
                                                   DAGCombinerInfo &DCI) const {
-  // FIXME: getBasePtr does not work correctly for intrinsic nodes and will find
-  // the intrinsic ID, not the pointer.
-  SDValue Ptr = N->getBasePtr();
   SelectionDAG &DAG = DCI.DAG;
   SDLoc SL(N);
 
+  unsigned PtrIdx = getBasePtrIndex(N);
+  SDValue Ptr = N->getOperand(PtrIdx);
+
   // TODO: We could also do this for multiplies.
   if (Ptr.getOpcode() == ISD::SHL) {
     SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
@@ -8578,7 +8594,7 @@
     if (NewPtr) {
       SmallVector<SDValue, 8> NewOps(N->op_begin(), N->op_end());
 
-      NewOps[N->getOpcode() == ISD::STORE ? 2 : 1] = NewPtr;
+      NewOps[PtrIdx] = NewPtr;
       return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
     }
   }
Index: llvm/test/CodeGen/AMDGPU/shl_add_ptr_csub.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/shl_add_ptr_csub.ll
@@ -0,0 +1,22 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10 %s
+
+; GCN-LABEL: {{^}}shl_base_atomicrmw_global_atomic_csub_ptr:
+; GCN-DAG: v_lshlrev_b64 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, 2, v[4:5]
+; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 43
+; GCN: v_add_co_u32_e64 v[[EXTRA_LO:[0-9]+]], vcc_lo, 0x80, v4
+; GCN: v_add_co_ci_u32_e32 v[[EXTRA_HI:[0-9]+]], vcc_lo, 0, v5, vcc_lo
+; GCN: global_atomic_csub v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}, [[K]], off offset:512 glc
+; GCN: global_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[EXTRA_LO]]:[[EXTRA_HI]]{{\]}}
+define i32 @shl_base_atomicrmw_global_atomic_csub_ptr(i32 addrspace(1)* %out, i64 addrspace(1)* %extra.use, [512 x i32] addrspace(1)* %ptr) #0 {
+  %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(1)* %ptr, i64 0, i64 32
+  %cast = ptrtoint i32 addrspace(1)* %arrayidx0 to i64
+  %shl = shl i64 %cast, 2
+  %castback = inttoptr i64 %shl to i32 addrspace(1)*
+  %val = call i32 @llvm.amdgcn.global.atomic.csub.p1i32(i32 addrspace(1)* %castback, i32 43)
+  store volatile i64 %cast, i64 addrspace(1)* %extra.use, align 4
+  ret i32 %val
+}
+
+declare i32 @llvm.amdgcn.global.atomic.csub.p1i32(i32 addrspace(1)* nocapture, i32) #0
+
+attributes #0 = { argmemonly nounwind }
Index: llvm/test/CodeGen/AMDGPU/shl_add_ptr_global.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/shl_add_ptr_global.ll
+++ llvm/test/CodeGen/AMDGPU/shl_add_ptr_global.ll
@@ -1,13 +1,13 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
 
-; GCN-LABEL: {{^}}shl_base_global_ptr:
+; GCN-LABEL: {{^}}shl_base_atomicrmw_global_ptr:
 ; GCN: v_add_co_u32_e32 v[[EXTRA_LO:[0-9]+]], vcc, 0x80, v4
 ; GCN: v_addc_co_u32_e32 v[[EXTRA_HI:[0-9]+]], vcc, 0, v5, vcc
 ; GCN: v_lshlrev_b64 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, 2, v[4:5]
 ; GCN: v_mov_b32_e32 [[THREE:v[0-9]+]], 3
 ; GCN: global_atomic_and v{{\[}}[[LO]]:[[HI]]{{\]}}, [[THREE]], off offset:512
 ; GCN: global_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[EXTRA_LO]]:[[EXTRA_HI]]{{\]}}
-define void @shl_base_global_ptr(i32 addrspace(1)* %out, i64 addrspace(1)* %extra.use, [512 x i32] addrspace(1)* %ptr) #0 {
+define void @shl_base_atomicrmw_global_ptr(i32 addrspace(1)* %out, i64 addrspace(1)* %extra.use, [512 x i32] addrspace(1)* %ptr) #0 {
   %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(1)* %ptr, i64 0, i64 32
   %cast = ptrtoint i32 addrspace(1)* %arrayidx0 to i64
   %shl = shl i64 %cast, 2
@@ -17,5 +17,24 @@
   ret void
 }
 
+; GCN-LABEL: {{^}}shl_base_global_ptr_global_atomic_fadd:
+; GCN: v_add_co_u32_e32 v[[EXTRA_LO:[0-9]+]], vcc, 0x80, v4
+; GCN: v_addc_co_u32_e32 v[[EXTRA_HI:[0-9]+]], vcc, 0, v5, vcc
+; GCN: v_lshlrev_b64 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, 2, v[4:5]
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x42c80000
+; GCN: global_atomic_add_f32 v{{\[}}[[LO]]:[[HI]]{{\]}}, [[K]], off offset:512
+; GCN: global_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[EXTRA_LO]]:[[EXTRA_HI]]{{\]}}
+define void @shl_base_global_ptr_global_atomic_fadd(i32 addrspace(1)* %out, i64 addrspace(1)* %extra.use, [512 x i32] addrspace(1)* %ptr) #0 {
+  %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(1)* %ptr, i64 0, i64 32
+  %cast = ptrtoint i32 addrspace(1)* %arrayidx0 to i64
+  %shl = shl i64 %cast, 2
+  %castback = inttoptr i64 %shl to float addrspace(1)*
+  call void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)* %castback, float 100.0)
+  store volatile i64 %cast, i64 addrspace(1)* %extra.use, align 4
+  ret void
+}
+
+declare void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)* nocapture, float) #1
+
 attributes #0 = { nounwind }
-attributes #1 = { nounwind readnone }
+attributes #1 = { argmemonly nounwind willreturn }