Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp =================================================================== --- lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -1901,8 +1901,19 @@ return SDValue(); // Bail out if any constants are opaque because we can't constant fold those. + // The exception is "and" and "or" with either 0 or -1 in which case we can + // propagate non constant operands into select. I.e.: + // and (select Cond, 0, -1), X --> select Cond, 0, X + // or X, (select Cond, -1, 0) --> select Cond, -1, X + bool CanFoldNonConst = (BinOpcode == ISD::AND || BinOpcode == ISD::OR) && + (isNullConstantOrNullSplatConstant(CT) || + isAllOnesConstantOrAllOnesSplatConstant(CT)) && + (isNullConstantOrNullSplatConstant(CF) || + isAllOnesConstantOrAllOnesSplatConstant(CF)); + SDValue CBO = BO->getOperand(SelOpNo ^ 1); - if (!isConstantOrConstantVector(CBO, true) && + if (!CanFoldNonConst && + !isConstantOrConstantVector(CBO, true) && !isConstantFPBuildVectorOrConstantFP(CBO)) return SDValue(); @@ -1923,14 +1934,14 @@ SDLoc DL(Sel); SDValue NewCT = SelOpNo ? DAG.getNode(BinOpcode, DL, VT, CBO, CT) : DAG.getNode(BinOpcode, DL, VT, CT, CBO); - if (!NewCT.isUndef() && + if (!CanFoldNonConst && !NewCT.isUndef() && !isConstantOrConstantVector(NewCT, true) && !isConstantFPBuildVectorOrConstantFP(NewCT)) return SDValue(); SDValue NewCF = SelOpNo ? 
DAG.getNode(BinOpcode, DL, VT, CBO, CF) : DAG.getNode(BinOpcode, DL, VT, CF, CBO); - if (!NewCF.isUndef() && + if (!CanFoldNonConst && !NewCF.isUndef() && !isConstantOrConstantVector(NewCF, true) && !isConstantFPBuildVectorOrConstantFP(NewCF)) return SDValue(); Index: test/CodeGen/AMDGPU/dagcombine-select.ll =================================================================== --- test/CodeGen/AMDGPU/dagcombine-select.ll +++ test/CodeGen/AMDGPU/dagcombine-select.ll @@ -1,5 +1,107 @@ ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s +; GCN-LABEL: {{^}}select_and1: +; GCN: v_cndmask_b32_e32 [[SEL:v[0-9]+]], 0, v{{[0-9]+}}, +; GCN-NOT: v_and_b32 +; GCN: store_dword v[{{[0-9:]+}}], [[SEL]], +define amdgpu_kernel void @select_and1(i32 addrspace(1)* %p, i32 %x, i32 %y) { + %c = icmp slt i32 %x, 11 + %s = select i1 %c, i32 0, i32 -1 + %a = and i32 %y, %s + store i32 %a, i32 addrspace(1)* %p, align 4 + ret void +} + +; GCN-LABEL: {{^}}select_and2: +; GCN: v_cndmask_b32_e32 [[SEL:v[0-9]+]], 0, v{{[0-9]+}}, +; GCN-NOT: v_and_b32 +; GCN: store_dword v[{{[0-9:]+}}], [[SEL]], +define amdgpu_kernel void @select_and2(i32 addrspace(1)* %p, i32 %x, i32 %y) { + %c = icmp slt i32 %x, 11 + %s = select i1 %c, i32 0, i32 -1 + %a = and i32 %s, %y + store i32 %a, i32 addrspace(1)* %p, align 4 + ret void +} + +; GCN-LABEL: {{^}}select_and3: +; GCN: v_cndmask_b32_e32 [[SEL:v[0-9]+]], 0, v{{[0-9]+}}, +; GCN-NOT: v_and_b32 +; GCN: store_dword v[{{[0-9:]+}}], [[SEL]], +define amdgpu_kernel void @select_and3(i32 addrspace(1)* %p, i32 %x, i32 %y) { + %c = icmp slt i32 %x, 11 + %s = select i1 %c, i32 -1, i32 0 + %a = and i32 %y, %s + store i32 %a, i32 addrspace(1)* %p, align 4 + ret void +} + +; GCN-LABEL: {{^}}select_and_v4: +; GCN: v_cndmask_b32_e32 [[SEL:v[0-9]+]], 0, v{{[0-9]+}}, +; GCN: v_cndmask_b32_e32 [[SEL:v[0-9]+]], 0, v{{[0-9]+}}, +; GCN: v_cndmask_b32_e32 [[SEL:v[0-9]+]], 0, v{{[0-9]+}}, +; GCN: v_cndmask_b32_e32 
[[SEL:v[0-9]+]], 0, v{{[0-9]+}}, +; GCN-NOT: v_and_b32 +; GCN: store_dword +define amdgpu_kernel void @select_and_v4(<4 x i32> addrspace(1)* %p, i32 %x, <4 x i32> %y) { + %c = icmp slt i32 %x, 11 + %s = select i1 %c, <4 x i32> zeroinitializer, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1> + %a = and <4 x i32> %s, %y + store <4 x i32> %a, <4 x i32> addrspace(1)* %p, align 32 + ret void +} + +; GCN-LABEL: {{^}}select_or1: +; GCN: v_cndmask_b32_e32 [[SEL:v[0-9]+]], -1, v{{[0-9]+}}, +; GCN-NOT: v_or_b32 +; GCN: store_dword v[{{[0-9:]+}}], [[SEL]], +define amdgpu_kernel void @select_or1(i32 addrspace(1)* %p, i32 %x, i32 %y) { + %c = icmp slt i32 %x, 11 + %s = select i1 %c, i32 0, i32 -1 + %a = or i32 %y, %s + store i32 %a, i32 addrspace(1)* %p, align 4 + ret void +} + +; GCN-LABEL: {{^}}select_or2: +; GCN: v_cndmask_b32_e32 [[SEL:v[0-9]+]], -1, v{{[0-9]+}}, +; GCN-NOT: v_or_b32 +; GCN: store_dword v[{{[0-9:]+}}], [[SEL]], +define amdgpu_kernel void @select_or2(i32 addrspace(1)* %p, i32 %x, i32 %y) { + %c = icmp slt i32 %x, 11 + %s = select i1 %c, i32 0, i32 -1 + %a = or i32 %s, %y + store i32 %a, i32 addrspace(1)* %p, align 4 + ret void +} + +; GCN-LABEL: {{^}}select_or3: +; GCN: v_cndmask_b32_e32 [[SEL:v[0-9]+]], -1, v{{[0-9]+}}, +; GCN-NOT: v_or_b32 +; GCN: store_dword v[{{[0-9:]+}}], [[SEL]], +define amdgpu_kernel void @select_or3(i32 addrspace(1)* %p, i32 %x, i32 %y) { + %c = icmp slt i32 %x, 11 + %s = select i1 %c, i32 -1, i32 0 + %a = or i32 %y, %s + store i32 %a, i32 addrspace(1)* %p, align 4 + ret void +} + +; GCN-LABEL: {{^}}select_or_v4: +; GCN: v_cndmask_b32_e32 [[SEL:v[0-9]+]], -1, v{{[0-9]+}}, +; GCN: v_cndmask_b32_e32 [[SEL:v[0-9]+]], -1, v{{[0-9]+}}, +; GCN: v_cndmask_b32_e32 [[SEL:v[0-9]+]], -1, v{{[0-9]+}}, +; GCN: v_cndmask_b32_e32 [[SEL:v[0-9]+]], -1, v{{[0-9]+}}, +; GCN-NOT: v_or_b32 +; GCN: store_dword +define amdgpu_kernel void @select_or_v4(<4 x i32> addrspace(1)* %p, i32 %x, <4 x i32> %y) { + %c = icmp slt i32 %x, 11 + %s = select i1 %c, <4 x i32> zeroinitializer, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1> + %a = 
or <4 x i32> %s, %y + store <4 x i32> %a, <4 x i32> addrspace(1)* %p, align 32 + ret void +} + ; GCN-LABEL: {{^}}sel_constants_sub_constant_sel_constants: ; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 2, 9, define amdgpu_kernel void @sel_constants_sub_constant_sel_constants(i32 addrspace(1)* %p, i1 %cond) { Index: test/CodeGen/AMDGPU/udivrem.ll =================================================================== --- test/CodeGen/AMDGPU/udivrem.ll +++ test/CodeGen/AMDGPU/udivrem.ll @@ -31,25 +31,25 @@ ; SI-DAG: v_mul_hi_u32 [[RCP_HI:v[0-9]+]], [[RCP]] ; SI-DAG: v_mul_lo_i32 [[RCP_LO:v[0-9]+]], [[RCP]] ; SI-DAG: v_sub_{{[iu]}}32_e32 [[NEG_RCP_LO:v[0-9]+]], vcc, 0, [[RCP_LO]] -; SI: v_cndmask_b32_e64 -; SI: v_mul_hi_u32 [[E:v[0-9]+]], {{v[0-9]+}}, [[RCP]] +; SI: v_cmp_eq_u32_e64 [[CC1:s\[[0-9:]+\]]], 0, [[RCP_HI]] +; SI: v_cndmask_b32_e64 [[CND1:v[0-9]+]], [[RCP_LO]], [[NEG_RCP_LO]], [[CC1]] +; SI: v_mul_hi_u32 [[E:v[0-9]+]], [[CND1]], [[RCP]] ; SI-DAG: v_add_{{[iu]}}32_e32 [[RCP_A_E:v[0-9]+]], vcc, [[E]], [[RCP]] ; SI-DAG: v_subrev_{{[iu]}}32_e32 [[RCP_S_E:v[0-9]+]], vcc, [[E]], [[RCP]] -; SI: v_cndmask_b32_e64 -; SI: v_mul_hi_u32 [[Quotient:v[0-9]+]] -; SI: v_mul_lo_i32 [[Num_S_Remainder:v[0-9]+]] +; SI: v_cndmask_b32_e64 [[CND2:v[0-9]+]], [[RCP_S_E]], [[RCP_A_E]], [[CC1]] +; SI: v_mul_hi_u32 [[Quotient:v[0-9]+]], [[CND2]], +; SI: v_mul_lo_i32 [[Num_S_Remainder:v[0-9]+]], [[CND2]] ; SI-DAG: v_add_{{[iu]}}32_e32 [[Quotient_A_One:v[0-9]+]], vcc, 1, [[Quotient]] ; SI-DAG: v_sub_{{[iu]}}32_e32 [[Remainder:v[0-9]+]], vcc, {{[vs][0-9]+}}, [[Num_S_Remainder]] ; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_cndmask_b32_e64 ; SI-DAG: v_subrev_{{[iu]}}32_e32 [[Quotient_S_One:v[0-9]+]], ; SI-DAG: v_subrev_{{[iu]}}32_e32 [[Remainder_S_Den:v[0-9]+]], -; SI: v_and_b32_e32 [[Tmp1:v[0-9]+]] ; SI-DAG: v_cndmask_b32_e64 ; SI-DAG: v_cndmask_b32_e64 ; SI-DAG: v_add_{{[iu]}}32_e32 [[Remainder_A_Den:v[0-9]+]], ; SI-DAG: v_cndmask_b32_e64 ; SI-DAG: v_cndmask_b32_e64 +; SI-NOT: v_and_b32 ; SI: s_endpgm 
define amdgpu_kernel void @test_udivrem(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %x, i32 %y) { %result0 = udiv i32 %x, %y @@ -124,8 +124,6 @@ ; SI-DAG: v_mul_lo_i32 ; SI-DAG: v_subrev_{{[iu]}}32_e32 ; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_and_b32_e32 ; SI-DAG: v_add_{{[iu]}}32_e32 ; SI-DAG: v_subrev_{{[iu]}}32_e32 ; SI-DAG: v_cndmask_b32_e64 @@ -147,8 +145,6 @@ ; SI-DAG: v_mul_lo_i32 ; SI-DAG: v_subrev_{{[iu]}}32_e32 ; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_and_b32_e32 ; SI-DAG: v_add_{{[iu]}}32_e32 ; SI-DAG: v_subrev_{{[iu]}}32_e32 ; SI-DAG: v_cndmask_b32_e64 @@ -157,6 +153,7 @@ ; SI-DAG: v_subrev_{{[iu]}}32_e32 ; SI-DAG: v_cndmask_b32_e64 ; SI-DAG: v_cndmask_b32_e64 +; SI-NOT: v_and_b32 ; SI: s_endpgm define amdgpu_kernel void @test_udivrem_v2(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) { %result0 = udiv <2 x i32> %x, %y @@ -274,8 +271,6 @@ ; SI-DAG: v_mul_lo_i32 ; SI-DAG: v_subrev_{{[iu]}}32_e32 ; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_and_b32_e32 ; SI-DAG: v_add_{{[iu]}}32_e32 ; SI-DAG: v_subrev_{{[iu]}}32_e32 ; SI-DAG: v_cndmask_b32_e64 @@ -297,8 +292,6 @@ ; SI-DAG: v_mul_lo_i32 ; SI-DAG: v_subrev_{{[iu]}}32_e32 ; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_and_b32_e32 ; SI-DAG: v_add_{{[iu]}}32_e32 ; SI-DAG: v_subrev_{{[iu]}}32_e32 ; SI-DAG: v_cndmask_b32_e64 @@ -320,8 +313,6 @@ ; SI-DAG: v_mul_lo_i32 ; SI-DAG: v_subrev_{{[iu]}}32_e32 ; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_and_b32_e32 ; SI-DAG: v_add_{{[iu]}}32_e32 ; SI-DAG: v_subrev_{{[iu]}}32_e32 ; SI-DAG: v_cndmask_b32_e64 @@ -339,6 +330,7 @@ ; SI-DAG: v_add_{{[iu]}}32_e32 ; SI-DAG: v_subrev_{{[iu]}}32_e32 ; SI-DAG: v_cndmask_b32_e64 +; SI-NOT: v_and_b32 ; SI: s_endpgm define amdgpu_kernel void @test_udivrem_v4(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) { %result0 = udiv <4 x i32> %x, %y Index: 
test/CodeGen/X86/dagcombine-select.ll =================================================================== --- test/CodeGen/X86/dagcombine-select.ll +++ test/CodeGen/X86/dagcombine-select.ll @@ -6,9 +6,7 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: cmpl $11, %edi -; CHECK-NEXT: setl %al -; CHECK-NEXT: decl %eax -; CHECK-NEXT: andl %esi, %eax +; CHECK-NEXT: cmovgel %esi, %eax ; CHECK-NEXT: retq %c = icmp slt i32 %x, 11 %s = select i1 %c, i32 0, i32 -1 @@ -21,9 +19,7 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: cmpl $11, %edi -; CHECK-NEXT: setl %al -; CHECK-NEXT: decl %eax -; CHECK-NEXT: andl %esi, %eax +; CHECK-NEXT: cmovgel %esi, %eax ; CHECK-NEXT: retq %c = icmp slt i32 %x, 11 %s = select i1 %c, i32 0, i32 -1 @@ -31,14 +27,42 @@ ret i32 %a } +define i32 @select_and3(i32 %x, i32 %y) { +; CHECK-LABEL: select_and3: +; CHECK: # %bb.0: +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: cmpl $11, %edi +; CHECK-NEXT: cmovll %esi, %eax +; CHECK-NEXT: retq + %c = icmp slt i32 %x, 11 + %s = select i1 %c, i32 -1, i32 0 + %a = and i32 %y, %s + ret i32 %a +} + +define <4 x i32> @select_and_v4(i32 %x, <4 x i32> %y) { +; CHECK-LABEL: select_and_v4: +; CHECK: # %bb.0: +; CHECK-NEXT: cmpl $11, %edi +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: jl .LBB3_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: movaps %xmm0, %xmm1 +; CHECK-NEXT: .LBB3_2: +; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: retq + %c = icmp slt i32 %x, 11 + %s = select i1 %c, <4 x i32> zeroinitializer, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1> + %a = and <4 x i32> %s, %y + ret <4 x i32> %a +} + define i32 @select_or1(i32 %x, i32 %y) { ; CHECK-LABEL: select_or1: ; CHECK: # %bb.0: -; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: cmpl $11, %edi -; CHECK-NEXT: setl %al -; CHECK-NEXT: decl %eax -; CHECK-NEXT: orl %esi, %eax +; CHECK-NEXT: movl $-1, %eax +; CHECK-NEXT: cmovll %esi, %eax ; CHECK-NEXT: retq %c = icmp slt i32 %x, 11 %s = select i1 %c, i32 0, i32 -1 @@ -49,11 +73,9 @@ define i32 @select_or2(i32 %x, i32 %y) { ; 
CHECK-LABEL: select_or2: ; CHECK: # %bb.0: -; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: cmpl $11, %edi -; CHECK-NEXT: setl %al -; CHECK-NEXT: decl %eax -; CHECK-NEXT: orl %esi, %eax +; CHECK-NEXT: movl $-1, %eax +; CHECK-NEXT: cmovll %esi, %eax ; CHECK-NEXT: retq %c = icmp slt i32 %x, 11 %s = select i1 %c, i32 0, i32 -1 @@ -61,6 +83,34 @@ ret i32 %a } +define i32 @select_or3(i32 %x, i32 %y) { +; CHECK-LABEL: select_or3: +; CHECK: # %bb.0: +; CHECK-NEXT: cmpl $11, %edi +; CHECK-NEXT: movl $-1, %eax +; CHECK-NEXT: cmovgel %esi, %eax +; CHECK-NEXT: retq + %c = icmp slt i32 %x, 11 + %s = select i1 %c, i32 -1, i32 0 + %a = or i32 %y, %s + ret i32 %a +} + +define <4 x i32> @select_or_v4(i32 %x, <4 x i32> %y) { +; CHECK-LABEL: select_or_v4: +; CHECK: # %bb.0: +; CHECK-NEXT: cmpl $11, %edi +; CHECK-NEXT: jl .LBB7_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: pcmpeqd %xmm0, %xmm0 +; CHECK-NEXT: .LBB7_2: +; CHECK-NEXT: retq + %c = icmp slt i32 %x, 11 + %s = select i1 %c, <4 x i32> zeroinitializer, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1> + %a = or <4 x i32> %s, %y + ret <4 x i32> %a +} + define i32 @sel_constants_sub_constant_sel_constants(i1 %cond) { ; CHECK-LABEL: sel_constants_sub_constant_sel_constants: ; CHECK: # %bb.0: @@ -186,11 +236,11 @@ ; CHECK-LABEL: fsub_constant_sel_constants: ; CHECK: # %bb.0: ; CHECK-NEXT: testb $1, %dil -; CHECK-NEXT: jne .LBB13_1 +; CHECK-NEXT: jne .LBB17_1 ; CHECK-NEXT: # %bb.2: ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: retq -; CHECK-NEXT: .LBB13_1: +; CHECK-NEXT: .LBB17_1: ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: retq %sel = select i1 %cond, double -4.0, double 23.3 @@ -202,11 +252,11 @@ ; CHECK-LABEL: fdiv_constant_sel_constants: ; CHECK: # %bb.0: ; CHECK-NEXT: testb $1, %dil -; CHECK-NEXT: jne .LBB14_1 +; CHECK-NEXT: jne .LBB18_1 ; CHECK-NEXT: # %bb.2: ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: retq -; CHECK-NEXT: .LBB14_1: +; CHECK-NEXT: .LBB18_1: ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: retq 
%sel = select i1 %cond, double -4.0, double 23.3 @@ -218,11 +268,11 @@ ; CHECK-LABEL: frem_constant_sel_constants: ; CHECK: # %bb.0: ; CHECK-NEXT: testb $1, %dil -; CHECK-NEXT: jne .LBB15_1 +; CHECK-NEXT: jne .LBB19_1 ; CHECK-NEXT: # %bb.2: ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: retq -; CHECK-NEXT: .LBB15_1: +; CHECK-NEXT: .LBB19_1: ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: retq %sel = select i1 %cond, double -4.0, double 23.3