Index: llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -5595,16 +5595,18 @@
   }
 
   // fold (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2)
+  // fold (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2)
   // Variant of version done on multiply, except mul by a power of 2 is turned
   // into a shift.
-  if (N0.getOpcode() == ISD::ADD && N0.getNode()->hasOneUse() &&
+  if ((N0.getOpcode() == ISD::ADD || N0.getOpcode() == ISD::OR) &&
+      N0.getNode()->hasOneUse() &&
       isConstantOrConstantVector(N1, /* No Opaques */ true) &&
       isConstantOrConstantVector(N0.getOperand(1), /* No Opaques */ true)) {
     SDValue Shl0 = DAG.getNode(ISD::SHL, SDLoc(N0), VT, N0.getOperand(0), N1);
     SDValue Shl1 = DAG.getNode(ISD::SHL, SDLoc(N1), VT, N0.getOperand(1), N1);
     AddToWorklist(Shl0.getNode());
     AddToWorklist(Shl1.getNode());
-    return DAG.getNode(ISD::ADD, SDLoc(N), VT, Shl0, Shl1);
+    return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, Shl0, Shl1);
   }
 
   // fold (shl (mul x, c1), c2) -> (mul x, c1 << c2)
Index: llvm/trunk/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
@@ -70,13 +70,13 @@
 
 ; FIXME: single bit op
 ; GCN-LABEL: {{^}}s_fneg_fabs_v2f16:
-; CIVI: s_mov_b32 [[MASK:s[0-9]+]], 0x8000{{$}}
-; CI: v_or_b32_e32 [[OR:v[0-9]+]], [[MASK]], v{{[0-9]+}}
-; CI: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 16, [[OR]]
-; CI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[SHL]]
+; CI: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 16, v{{[0-9]+}}
+; CI: v_or_b32_e32 [[OR:v[0-9]+]], v{{[0-9]+}}, [[SHL]]
+; CI: v_or_b32_e32 v{{[0-9]+}}, 0x80008000, [[OR]]
+; VI: s_mov_b32 [[MASK:s[0-9]+]], 0x8000{{$}}
 ; VI: v_mov_b32_e32 [[VMASK:v[0-9]+]], [[MASK]]
 ; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[VMASK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; CIVI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
+; VI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
 ; CIVI: flat_store_dword
 
 ; GFX9: s_or_b32 s{{[0-9]+}}, 0x80008000, s{{[0-9]+}}
@@ -88,15 +88,14 @@
 }
 
 ; GCN-LABEL: {{^}}fneg_fabs_v4f16:
-; CIVI: s_mov_b32 [[MASK:s[0-9]+]], 0x8000{{$}}
-; CI: v_or_b32_e32 [[OR00:v[0-9]+]], [[MASK]], v{{[0-9]+}}
-; CI: v_lshlrev_b32_e32 [[SHL0:v[0-9]+]], 16, [[OR00]]
-; CI: v_or_b32_e32 [[OR01:v[0-9]+]], v{{[0-9]+}}, [[SHL0]]
-; CI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]], [[OR01]]
-; CI: v_or_b32_e32 [[OR10:v[0-9]+]], [[MASK]], v{{[0-9]+}}
-; CI: v_lshlrev_b32_e32 [[SHL1:v[0-9]+]], 16, [[OR10]]
-; CI: v_or_b32_e32 [[OR11:v[0-9]+]], v{{[0-9]+}}, [[SHL1]]
-; CI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]], [[OR11]]
+; CI: s_mov_b32 [[MASK:s[0-9]+]], 0x80008000
+; CI: v_lshlrev_b32_e32 [[SHL0:v[0-9]+]], 16, v{{[0-9]+}}
+; CI: v_or_b32_e32 [[OR0:v[0-9]+]], v{{[0-9]+}}, [[SHL0]]
+; CI: v_lshlrev_b32_e32 [[SHL1:v[0-9]+]], 16, v{{[0-9]+}}
+; CI: v_or_b32_e32 [[OR1:v[0-9]+]], v{{[0-9]+}}, [[SHL1]]
+; CI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]], [[OR0]]
+; CI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]], [[OR1]]
+; VI: s_mov_b32 [[MASK:s[0-9]+]], 0x8000{{$}}
 ; VI: v_mov_b32_e32 [[VMASK:v[0-9]+]], [[MASK]]
 ; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[VMASK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
Index: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bpermute.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bpermute.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bpermute.ll
@@ -27,4 +27,27 @@
   ret void
 }
 
+; CHECK-LABEL: {{^}}ds_bpermute_add_shl:
+; CHECK: ds_bpermute_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:4
+; CHECK: s_waitcnt lgkmcnt
+define void @ds_bpermute_add_shl(i32 addrspace(1)* %out, i32 %base_index, i32 %src) nounwind {
+  %index = add i32 %base_index, 1
+  %byte_index = shl i32 %index, 2
+  %bpermute = call i32 @llvm.amdgcn.ds.bpermute(i32 %byte_index, i32 %src) #0
+  store i32 %bpermute, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; CHECK-LABEL: {{^}}ds_bpermute_or_shl:
+; CHECK: ds_bpermute_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:4
+; CHECK: s_waitcnt lgkmcnt
+define void @ds_bpermute_or_shl(i32 addrspace(1)* %out, i32 %base_index, i32 %src) nounwind {
+  %masked = and i32 %base_index, 62
+  %index = or i32 %masked, 1
+  %byte_index = shl i32 %index, 2
+  %bpermute = call i32 @llvm.amdgcn.ds.bpermute(i32 %byte_index, i32 %src) #0
+  store i32 %bpermute, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
 attributes #0 = { nounwind readnone convergent }
Index: llvm/trunk/test/CodeGen/AMDGPU/shl.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/shl.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/shl.ll
@@ -476,4 +476,28 @@
   ret void
 }
 
+; FUNC-LABEL: {{^}}shl_or_k:
+; SI: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 2, v{{[0-9]+}}
+; SI: v_or_b32_e32 [[OR:v[0-9]+]], 4, [[SHL]]
+; SI: buffer_store_dword [[OR]]
+define void @shl_or_k(i32 addrspace(1)* %out, i32 %in) {
+  %tmp0 = or i32 %in, 1
+  %tmp2 = shl i32 %tmp0, 2
+  store i32 %tmp2, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}shl_or_k_two_uses:
+; SI: v_or_b32_e32 [[OR:v[0-9]+]], 1, v{{[0-9]+}}
+; SI: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 2, [[OR]]
+; SI-DAG: buffer_store_dword [[OR]]
+; SI-DAG: buffer_store_dword [[SHL]]
+define void @shl_or_k_two_uses(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %in) {
+  %tmp0 = or i32 %in, 1
+  %tmp2 = shl i32 %tmp0, 2
+  store i32 %tmp2, i32 addrspace(1)* %out0
+  store i32 %tmp0, i32 addrspace(1)* %out1
+  ret void
+}
+
 attributes #0 = { nounwind readnone }
Index: llvm/trunk/test/CodeGen/X86/combine-shl.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/combine-shl.ll
+++ llvm/trunk/test/CodeGen/X86/combine-shl.ll
@@ -537,19 +537,19 @@
   ret <4 x i32> %2
 }
 
-; FIXME: fold (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2)
+; fold (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2)
 define <4 x i32> @combine_vec_shl_or0(<4 x i32> %x) {
 ; SSE-LABEL: combine_vec_shl_or0:
 ; SSE: # BB#0:
-; SSE-NEXT: por {{.*}}(%rip), %xmm0
 ; SSE-NEXT: pslld $2, %xmm0
+; SSE-NEXT: por {{.*}}(%rip), %xmm0
 ; SSE-NEXT: retq
 ;
 ; AVX-LABEL: combine_vec_shl_or0:
 ; AVX: # BB#0:
-; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [5,5,5,5]
-; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
 ; AVX-NEXT: vpslld $2, %xmm0, %xmm0
+; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [20,20,20,20]
+; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
 ; AVX-NEXT: retq
   %1 = or <4 x i32> %x, <i32 5, i32 5, i32 5, i32 5>
   %2 = shl <4 x i32> %1, <i32 2, i32 2, i32 2, i32 2>
@@ -559,14 +559,14 @@
 define <4 x i32> @combine_vec_shl_or1(<4 x i32> %x) {
 ; SSE-LABEL: combine_vec_shl_or1:
 ; SSE: # BB#0:
-; SSE-NEXT: por {{.*}}(%rip), %xmm0
 ; SSE-NEXT: pmulld {{.*}}(%rip), %xmm0
+; SSE-NEXT: por {{.*}}(%rip), %xmm0
 ; SSE-NEXT: retq
 ;
 ; AVX-LABEL: combine_vec_shl_or1:
 ; AVX: # BB#0:
-; AVX-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
 ; AVX-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
 ; AVX-NEXT: retq
   %1 = or <4 x i32> %x,
   %2 = shl <4 x i32> %1,
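Note (not part of the patch): the new combine is just the distributive identity (x | c1) << c2 == (x << c2) | (c1 << c2), applied only when both c1 and c2 are non-opaque constants so that c1 << c2 folds to a new constant. The sketch below is a minimal standalone C++ check of that identity using the same constants as the new shl_or_k test (c1 = 1, c2 = 2); the file name and the sample values of x are illustrative assumptions, not taken from the commit.

// check_shl_or.cpp (illustrative name, not part of the commit).
// Verifies the identity the combine relies on, with the shl_or_k constants:
//   (x | 1) << 2  ==  (x << 2) | (1 << 2)  ==  (x << 2) | 4
#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t x : {0u, 1u, 7u, 0x12345678u, 0xFFFFFFFFu}) {
    uint32_t before = (x | 1u) << 2;        // shape before the combine
    uint32_t after = (x << 2) | (1u << 2);  // shape after the combine
    assert(before == after);
  }
  return 0;
}

The shl_or_k_two_uses test covers the guard in the code: because the transform requires N0.getNode()->hasOneUse(), it stays off when the OR result is also stored separately, so both the OR and the shift survive there.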