diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -9550,6 +9550,8 @@ } // (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2) +// (shl (or x, c1), c2) -> add (shl x, c2), (shl c1, c2) iff x and c1 share no +// bits // This is a variant of // (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2), @@ -9572,7 +9574,7 @@ // We only do this to handle cases where it's profitable when there are // multiple uses of the add, so defer to the standard combine. - if ((N0.getOpcode() != ISD::ADD && N0.getOpcode() != ISD::OR) || + if ((N0.getOpcode() != ISD::ADD && N0.getOpcode() != ISD::OR) || N0->hasOneUse()) return SDValue(); @@ -9584,8 +9586,14 @@ if (!CAdd) return SDValue(); - // If the resulting offset is too large, we can't fold it into the addressing - // mode offset. + SelectionDAG &DAG = DCI.DAG; + + if (N0.getOpcode() == ISD::OR && + !DAG.haveNoCommonBitsSet(N0.getOperand(0), N0.getOperand(1))) + return SDValue(); + + // If the resulting offset is too large, we can't fold it into the + // addressing mode offset. 
APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue(); Type *Ty = MemVT.getTypeForEVT(*DCI.DAG.getContext()); @@ -9595,7 +9603,6 @@ if (!isLegalAddressingMode(DCI.DAG.getDataLayout(), AM, Ty, AddrSpace)) return SDValue(); - SelectionDAG &DAG = DCI.DAG; SDLoc SL(N); EVT VT = N->getValueType(0); diff --git a/llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll b/llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll --- a/llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll +++ b/llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll @@ -410,15 +410,12 @@ ret void } -; FIXME: This or should fold into an offset on the write ; GCN-LABEL: {{^}}shl_or_ptr_combine_2use_lds: -; GCN: v_lshlrev_b32_e32 [[SCALE0:v[0-9]+]], 3, v0 -; GCN: v_or_b32_e32 [[SCALE1:v[0-9]+]], 32, [[SCALE0]] -; GCN: v_lshlrev_b32_e32 [[SCALE2:v[0-9]+]], 4, v0 -; GCN: ds_write_b32 [[SCALE1]], v{{[0-9]+}} -; GCN: ds_write_b32 [[SCALE2]], v{{[0-9]+}} offset:64 +; GCN-DAG: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:8 +; GCN-DAG: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 define void @shl_or_ptr_combine_2use_lds(i32 %idx) #0 { - %idx.add = or i32 %idx, 4 + %idx.shl = shl i32 %idx, 1 + %idx.add = or i32 %idx.shl, 1 %shl0 = shl i32 %idx.add, 3 %shl1 = shl i32 %idx.add, 4 %ptr0 = inttoptr i32 %shl0 to ptr addrspace(3) @@ -427,15 +424,13 @@ store volatile i32 10, ptr addrspace(3) %ptr1 ret void } - -; GCN-LABEL: {{^}}shl_or_ptr_combine_2use_max_lds_offset: +; GCN-LABEL: {{^}}shl_or_ptr_not_combine_2use_lds: ; GCN-DAG: v_lshlrev_b32_e32 [[SCALE0:v[0-9]+]], 3, v0 ; GCN-DAG: v_lshlrev_b32_e32 [[SCALE1:v[0-9]+]], 4, v0 -; GCN-DAG: ds_write_b32 [[SCALE0]], v{{[0-9]+}} offset:65528 -; GCN-DAG: v_or_b32_e32 [[ADD1:v[0-9]+]], 0x1fff0, [[SCALE1]] -; GCN: ds_write_b32 [[ADD1]], v{{[0-9]+$}} -define void @shl_or_ptr_combine_2use_max_lds_offset(i32 %idx) #0 { - %idx.add = or i32 %idx, 8191 +; GCN-DAG: ds_write_b32 [[SCALE0]], v{{[0-9]+}} +; GCN-DAG: ds_write_b32 [[SCALE1]], v{{[0-9]+}} +define void @shl_or_ptr_not_combine_2use_lds(i32 %idx) #0 { + %idx.add = or 
i32 %idx, 1 %shl0 = shl i32 %idx.add, 3 %shl1 = shl i32 %idx.add, 4 %ptr0 = inttoptr i32 %shl0 to ptr addrspace(3)