DAGCombiner: extend the (shl (add x, c1), c2) fold to (shl (or x, c1), c2).

A left shift distributes over bitwise OR exactly as it does over ADD:
  (x | c1) << c2  ==  (x << c2) | (c1 << c2)
so the existing combine is reused for ISD::OR and the rebuilt node keeps the
opcode of the original inner node (N0.getOpcode()). The one-use restriction on
the inner node is unchanged; the shl_or_k_two_uses test pins that the fold does
not fire when the OR result has a second user.

The AMDGPU tests check that the shifted constant ends up where it is useful:
as the immediate of s_or_b32 (shl.ll) and as the offset:4 field of
ds_bpermute_b32 (llvm.amdgcn.ds.bpermute.ll).

Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -4539,15 +4539,17 @@
   }
 
   // fold (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2)
+  // fold (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2)
   // Variant of version done on multiply, except mul by a power of 2 is turned
   // into a shift.
   APInt Val;
-  if (N1C && N0.getOpcode() == ISD::ADD && N0.getNode()->hasOneUse() &&
+  if (N1C && (N0.getOpcode() == ISD::ADD || N0.getOpcode() == ISD::OR) &&
+      N0.getNode()->hasOneUse() &&
       (isa<ConstantSDNode>(N0.getOperand(1)) ||
        isConstantSplatVector(N0.getOperand(1).getNode(), Val))) {
     SDValue Shl0 = DAG.getNode(ISD::SHL, SDLoc(N0), VT, N0.getOperand(0), N1);
     SDValue Shl1 = DAG.getNode(ISD::SHL, SDLoc(N1), VT, N0.getOperand(1), N1);
-    return DAG.getNode(ISD::ADD, SDLoc(N), VT, Shl0, Shl1);
+    return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, Shl0, Shl1);
   }
 
   // fold (shl (mul x, c1), c2) -> (mul x, c1 << c2)
Index: test/CodeGen/AMDGPU/llvm.amdgcn.ds.bpermute.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.ds.bpermute.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.ds.bpermute.ll
@@ -21,4 +21,27 @@
   ret void
 }
 
+; CHECK-LABEL: {{^}}ds_bpermute_add_shl:
+; CHECK: ds_bpermute_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:4
+; CHECK: s_waitcnt lgkmcnt
+define void @ds_bpermute_add_shl(i32 addrspace(1)* %out, i32 %base_index, i32 %src) nounwind {
+  %index = add i32 %base_index, 1
+  %byte_index = shl i32 %index, 2
+  %bpermute = call i32 @llvm.amdgcn.ds.bpermute(i32 %byte_index, i32 %src) #0
+  store i32 %bpermute, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; CHECK-LABEL: {{^}}ds_bpermute_or_shl:
+; CHECK: ds_bpermute_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:4
+; CHECK: s_waitcnt lgkmcnt
+define void @ds_bpermute_or_shl(i32 addrspace(1)* %out, i32 %base_index, i32 %src) nounwind {
+  %masked = and i32 %base_index, 62
+  %index = or i32 %masked, 1
+  %byte_index = shl i32 %index, 2
+  %bpermute = call i32 @llvm.amdgcn.ds.bpermute(i32 %byte_index, i32 %src) #0
+  store i32 %bpermute, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
 attributes #0 = { nounwind readnone convergent }
Index: test/CodeGen/AMDGPU/shl.ll
===================================================================
--- test/CodeGen/AMDGPU/shl.ll
+++ test/CodeGen/AMDGPU/shl.ll
@@ -377,4 +377,31 @@
   ret void
 }
 
+; FUNC-LABEL: {{^}}shl_or_k:
+; SI: s_lshl_b32 [[SHL:s[0-9]+]], s{{[0-9]+}}, 2
+; SI: s_or_b32 [[OR:s[0-9]+]], [[SHL]], 4
+; SI: v_mov_b32_e32 [[VOR:v[0-9]+]], [[OR]]
+; SI: buffer_store_dword [[VOR]]
+define void @shl_or_k(i32 addrspace(1)* %out, i32 %in) {
+  %tmp0 = or i32 %in, 1
+  %tmp2 = shl i32 %tmp0, 2
+  store i32 %tmp2, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}shl_or_k_two_uses:
+; SI: s_or_b32 [[OR:s[0-9]+]], s{{[0-9]+}}, 1
+; SI: s_lshl_b32 [[SHL:s[0-9]+]], [[OR]], 2
+; SI-DAG: v_mov_b32_e32 [[VOR:v[0-9]+]], [[OR]]
+; SI-DAG: v_mov_b32_e32 [[VSHL:v[0-9]+]], [[SHL]]
+; SI-DAG: buffer_store_dword [[VOR]]
+; SI-DAG: buffer_store_dword [[VSHL]]
+define void @shl_or_k_two_uses(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %in) {
+  %tmp0 = or i32 %in, 1
+  %tmp2 = shl i32 %tmp0, 2
+  store i32 %tmp2, i32 addrspace(1)* %out0
+  store i32 %tmp0, i32 addrspace(1)* %out1
+  ret void
+}
+
 attributes #0 = { nounwind readnone }