diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -4593,9 +4593,12 @@
       AddToWorklist(Add.getNode());
       return DAG.getNode(ISD::AND, DL, VT, N0, Add);
     }
-    if (N1.getOpcode() == ISD::SHL &&
+    // fold (urem x, (shl pow2, y)) -> (and x, (add (shl pow2, y), -1))
+    // fold (urem x, (lshr pow2, y)) -> (and x, (add (lshr pow2, y), -1))
+    // TODO: We should sink the following into isKnownToBePowerOfTwo
+    // using an OrZero parameter analogous to our handling in ValueTracking.
+    if ((N1.getOpcode() == ISD::SHL || N1.getOpcode() == ISD::SRL) &&
         DAG.isKnownToBeAPowerOfTwo(N1.getOperand(0))) {
-      // fold (urem x, (shl pow2, y)) -> (and x, (add (shl pow2, y), -1))
       SDValue NegOne = DAG.getAllOnesConstant(DL, VT);
       SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N1, NegOne);
       AddToWorklist(Add.getNode());
diff --git a/llvm/test/CodeGen/X86/combine-urem.ll b/llvm/test/CodeGen/X86/combine-urem.ll
--- a/llvm/test/CodeGen/X86/combine-urem.ll
+++ b/llvm/test/CodeGen/X86/combine-urem.ll
@@ -361,41 +361,25 @@
 define <4 x i32> @combine_vec_urem_by_lshr_pow2a(<4 x i32> %x, <4 x i32> %y) {
 ; SSE-LABEL: combine_vec_urem_by_lshr_pow2a:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
-; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm3[2,3,3,3,4,5,6,7]
-; SSE-NEXT:    movdqa {{.*#+}} xmm5 = [4,4,4,4]
-; SSE-NEXT:    movdqa %xmm5, %xmm2
-; SSE-NEXT:    psrld %xmm4, %xmm2
-; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,1,4,5,6,7]
-; SSE-NEXT:    movdqa %xmm5, %xmm4
-; SSE-NEXT:    psrld %xmm3, %xmm4
-; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm1[0,1,1,1,4,5,6,7]
-; SSE-NEXT:    movdqa %xmm5, %xmm6
-; SSE-NEXT:    psrld %xmm3, %xmm6
-; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[2,3,3,3,4,5,6,7]
-; SSE-NEXT:    psrld %xmm1, %xmm5
-; SSE-NEXT:    pextrd $1, %xmm5, %ecx
-; SSE-NEXT:    pextrd $1, %xmm0, %eax
-; SSE-NEXT:    xorl %edx, %edx
-; SSE-NEXT:    divl %ecx
-; SSE-NEXT:    movl %edx, %ecx
-; SSE-NEXT:    movd %xmm6, %esi
-; SSE-NEXT:    movd %xmm0, %eax
-; SSE-NEXT:    xorl %edx, %edx
-; SSE-NEXT:    divl %esi
-; SSE-NEXT:    movd %edx, %xmm1
-; SSE-NEXT:    pinsrd $1, %ecx, %xmm1
-; SSE-NEXT:    pextrd $2, %xmm4, %ecx
-; SSE-NEXT:    pextrd $2, %xmm0, %eax
-; SSE-NEXT:    xorl %edx, %edx
-; SSE-NEXT:    divl %ecx
-; SSE-NEXT:    pinsrd $2, %edx, %xmm1
-; SSE-NEXT:    pextrd $3, %xmm2, %ecx
-; SSE-NEXT:    pextrd $3, %xmm0, %eax
-; SSE-NEXT:    xorl %edx, %edx
-; SSE-NEXT:    divl %ecx
-; SSE-NEXT:    pinsrd $3, %edx, %xmm1
-; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
+; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [4,4,4,4]
+; SSE-NEXT:    movdqa %xmm3, %xmm4
+; SSE-NEXT:    psrld %xmm2, %xmm4
+; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; SSE-NEXT:    pshuflw {{.*#+}} xmm5 = xmm2[2,3,3,3,4,5,6,7]
+; SSE-NEXT:    movdqa %xmm3, %xmm6
+; SSE-NEXT:    psrld %xmm5, %xmm6
+; SSE-NEXT:    pblendw {{.*#+}} xmm6 = xmm4[0,1,2,3],xmm6[4,5,6,7]
+; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
+; SSE-NEXT:    movdqa %xmm3, %xmm4
+; SSE-NEXT:    psrld %xmm1, %xmm4
+; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7]
+; SSE-NEXT:    psrld %xmm1, %xmm3
+; SSE-NEXT:    pblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
+; SSE-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm6[2,3],xmm3[4,5],xmm6[6,7]
+; SSE-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE-NEXT:    paddd %xmm3, %xmm1
+; SSE-NEXT:    pand %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: combine_vec_urem_by_lshr_pow2a:
@@ -403,61 +387,28 @@
 ; AVX1-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,4,4,4]
 ; AVX1-NEXT:    vpsrld %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm4
+; AVX1-NEXT:    vpsrld %xmm4, %xmm3, %xmm4
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7]
 ; AVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
 ; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm4 = xmm1[2],xmm4[2],xmm1[3],xmm4[3]
 ; AVX1-NEXT:    vpsrld %xmm4, %xmm3, %xmm4
-; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm1[0],zero,xmm1[1],zero
-; AVX1-NEXT:    vpsrld %xmm5, %xmm3, %xmm5
-; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm1
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
 ; AVX1-NEXT:    vpsrld %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vpextrd $1, %xmm1, %ecx
-; AVX1-NEXT:    vpextrd $1, %xmm0, %eax
-; AVX1-NEXT:    xorl %edx, %edx
-; AVX1-NEXT:    divl %ecx
-; AVX1-NEXT:    movl %edx, %ecx
-; AVX1-NEXT:    vmovd %xmm5, %esi
-; AVX1-NEXT:    vmovd %xmm0, %eax
-; AVX1-NEXT:    xorl %edx, %edx
-; AVX1-NEXT:    divl %esi
-; AVX1-NEXT:    vmovd %edx, %xmm1
-; AVX1-NEXT:    vpinsrd $1, %ecx, %xmm1, %xmm1
-; AVX1-NEXT:    vpextrd $2, %xmm4, %ecx
-; AVX1-NEXT:    vpextrd $2, %xmm0, %eax
-; AVX1-NEXT:    xorl %edx, %edx
-; AVX1-NEXT:    divl %ecx
-; AVX1-NEXT:    vpinsrd $2, %edx, %xmm1, %xmm1
-; AVX1-NEXT:    vpextrd $3, %xmm2, %ecx
-; AVX1-NEXT:    vpextrd $3, %xmm0, %eax
-; AVX1-NEXT:    xorl %edx, %edx
-; AVX1-NEXT:    divl %ecx
-; AVX1-NEXT:    vpinsrd $3, %edx, %xmm1, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4,5,6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: combine_vec_urem_by_lshr_pow2a:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [4,4,4,4]
 ; AVX2-NEXT:    vpsrlvd %xmm1, %xmm2, %xmm1
-; AVX2-NEXT:    vpextrd $1, %xmm1, %ecx
-; AVX2-NEXT:    vpextrd $1, %xmm0, %eax
-; AVX2-NEXT:    xorl %edx, %edx
-; AVX2-NEXT:    divl %ecx
-; AVX2-NEXT:    movl %edx, %ecx
-; AVX2-NEXT:    vmovd %xmm1, %esi
-; AVX2-NEXT:    vmovd %xmm0, %eax
-; AVX2-NEXT:    xorl %edx, %edx
-; AVX2-NEXT:    divl %esi
-; AVX2-NEXT:    vmovd %edx, %xmm2
-; AVX2-NEXT:    vpinsrd $1, %ecx, %xmm2, %xmm2
-; AVX2-NEXT:    vpextrd $2, %xmm1, %ecx
-; AVX2-NEXT:    vpextrd $2, %xmm0, %eax
-; AVX2-NEXT:    xorl %edx, %edx
-; AVX2-NEXT:    divl %ecx
-; AVX2-NEXT:    vpinsrd $2, %edx, %xmm2, %xmm2
-; AVX2-NEXT:    vpextrd $3, %xmm1, %ecx
-; AVX2-NEXT:    vpextrd $3, %xmm0, %eax
-; AVX2-NEXT:    xorl %edx, %edx
-; AVX2-NEXT:    divl %ecx
-; AVX2-NEXT:    vpinsrd $3, %edx, %xmm2, %xmm0
+; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    retq
   %1 = lshr <4 x i32> <i32 4, i32 4, i32 4, i32 4>, %y
   %2 = urem <4 x i32> %x, %1
@@ -467,41 +418,25 @@
 define <4 x i32> @combine_vec_urem_by_lshr_pow2b(<4 x i32> %x, <4 x i32> %y) {
 ; SSE-LABEL: combine_vec_urem_by_lshr_pow2b:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
-; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm3[2,3,3,3,4,5,6,7]
-; SSE-NEXT:    movdqa {{.*#+}} xmm5 = [1,4,8,16]
-; SSE-NEXT:    movdqa %xmm5, %xmm2
-; SSE-NEXT:    psrld %xmm4, %xmm2
-; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,1,4,5,6,7]
-; SSE-NEXT:    movdqa %xmm5, %xmm4
-; SSE-NEXT:    psrld %xmm3, %xmm4
-; SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm1[0,1,1,1,4,5,6,7]
-; SSE-NEXT:    movdqa %xmm5, %xmm6
-; SSE-NEXT:    psrld %xmm3, %xmm6
-; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[2,3,3,3,4,5,6,7]
-; SSE-NEXT:    psrld %xmm1, %xmm5
-; SSE-NEXT:    pextrd $1, %xmm5, %ecx
-; SSE-NEXT:    pextrd $1, %xmm0, %eax
-; SSE-NEXT:    xorl %edx, %edx
-; SSE-NEXT:    divl %ecx
-; SSE-NEXT:    movl %edx, %ecx
-; SSE-NEXT:    movd %xmm6, %esi
-; SSE-NEXT:    movd %xmm0, %eax
-; SSE-NEXT:    xorl %edx, %edx
-; SSE-NEXT:    divl %esi
-; SSE-NEXT:    movd %edx, %xmm1
-; SSE-NEXT:    pinsrd $1, %ecx, %xmm1
-; SSE-NEXT:    pextrd $2, %xmm4, %ecx
-; SSE-NEXT:    pextrd $2, %xmm0, %eax
-; SSE-NEXT:    xorl %edx, %edx
-; SSE-NEXT:    divl %ecx
-; SSE-NEXT:    pinsrd $2, %edx, %xmm1
-; SSE-NEXT:    pextrd $3, %xmm2, %ecx
-; SSE-NEXT:    pextrd $3, %xmm0, %eax
-; SSE-NEXT:    xorl %edx, %edx
-; SSE-NEXT:    divl %ecx
-; SSE-NEXT:    pinsrd $3, %edx, %xmm1
-; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
+; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [1,4,8,16]
+; SSE-NEXT:    movdqa %xmm3, %xmm4
+; SSE-NEXT:    psrld %xmm2, %xmm4
+; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; SSE-NEXT:    pshuflw {{.*#+}} xmm5 = xmm2[2,3,3,3,4,5,6,7]
+; SSE-NEXT:    movdqa %xmm3, %xmm6
+; SSE-NEXT:    psrld %xmm5, %xmm6
+; SSE-NEXT:    pblendw {{.*#+}} xmm6 = xmm4[0,1,2,3],xmm6[4,5,6,7]
+; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
+; SSE-NEXT:    movdqa %xmm3, %xmm4
+; SSE-NEXT:    psrld %xmm1, %xmm4
+; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7]
+; SSE-NEXT:    psrld %xmm1, %xmm3
+; SSE-NEXT:    pblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
+; SSE-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm6[2,3],xmm3[4,5],xmm6[6,7]
+; SSE-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE-NEXT:    paddd %xmm3, %xmm1
+; SSE-NEXT:    pand %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: combine_vec_urem_by_lshr_pow2b:
@@ -509,61 +444,28 @@
 ; AVX1-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,4,8,16]
 ; AVX1-NEXT:    vpsrld %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm4
+; AVX1-NEXT:    vpsrld %xmm4, %xmm3, %xmm4
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7]
 ; AVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
 ; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm4 = xmm1[2],xmm4[2],xmm1[3],xmm4[3]
 ; AVX1-NEXT:    vpsrld %xmm4, %xmm3, %xmm4
-; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm1[0],zero,xmm1[1],zero
-; AVX1-NEXT:    vpsrld %xmm5, %xmm3, %xmm5
-; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm1
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
 ; AVX1-NEXT:    vpsrld %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vpextrd $1, %xmm1, %ecx
-; AVX1-NEXT:    vpextrd $1, %xmm0, %eax
-; AVX1-NEXT:    xorl %edx, %edx
-; AVX1-NEXT:    divl %ecx
-; AVX1-NEXT:    movl %edx, %ecx
-; AVX1-NEXT:    vmovd %xmm5, %esi
-; AVX1-NEXT:    vmovd %xmm0, %eax
-; AVX1-NEXT:    xorl %edx, %edx
-; AVX1-NEXT:    divl %esi
-; AVX1-NEXT:    vmovd %edx, %xmm1
-; AVX1-NEXT:    vpinsrd $1, %ecx, %xmm1, %xmm1
-; AVX1-NEXT:    vpextrd $2, %xmm4, %ecx
-; AVX1-NEXT:    vpextrd $2, %xmm0, %eax
-; AVX1-NEXT:    xorl %edx, %edx
-; AVX1-NEXT:    divl %ecx
-; AVX1-NEXT:    vpinsrd $2, %edx, %xmm1, %xmm1
-; AVX1-NEXT:    vpextrd $3, %xmm2, %ecx
-; AVX1-NEXT:    vpextrd $3, %xmm0, %eax
-; AVX1-NEXT:    xorl %edx, %edx
-; AVX1-NEXT:    divl %ecx
-; AVX1-NEXT:    vpinsrd $3, %edx, %xmm1, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4,5,6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: combine_vec_urem_by_lshr_pow2b:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,4,8,16]
 ; AVX2-NEXT:    vpsrlvd %xmm1, %xmm2, %xmm1
-; AVX2-NEXT:    vpextrd $1, %xmm1, %ecx
-; AVX2-NEXT:    vpextrd $1, %xmm0, %eax
-; AVX2-NEXT:    xorl %edx, %edx
-; AVX2-NEXT:    divl %ecx
-; AVX2-NEXT:    movl %edx, %ecx
-; AVX2-NEXT:    vmovd %xmm1, %esi
-; AVX2-NEXT:    vmovd %xmm0, %eax
-; AVX2-NEXT:    xorl %edx, %edx
-; AVX2-NEXT:    divl %esi
-; AVX2-NEXT:    vmovd %edx, %xmm2
-; AVX2-NEXT:    vpinsrd $1, %ecx, %xmm2, %xmm2
-; AVX2-NEXT:    vpextrd $2, %xmm1, %ecx
-; AVX2-NEXT:    vpextrd $2, %xmm0, %eax
-; AVX2-NEXT:    xorl %edx, %edx
-; AVX2-NEXT:    divl %ecx
-; AVX2-NEXT:    vpinsrd $2, %edx, %xmm2, %xmm2
-; AVX2-NEXT:    vpextrd $3, %xmm1, %ecx
-; AVX2-NEXT:    vpextrd $3, %xmm0, %eax
-; AVX2-NEXT:    xorl %edx, %edx
-; AVX2-NEXT:    divl %ecx
-; AVX2-NEXT:    vpinsrd $3, %edx, %xmm2, %xmm0
+; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    retq
   %1 = lshr <4 x i32> <i32 1, i32 4, i32 8, i32 16>, %y
  %2 = urem <4 x i32> %x, %1
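
Illustrative note (not part of the patch): the DAGCombiner change simply extends the existing ISD::SHL handling to ISD::SRL, so a urem whose divisor is a known power of two shifted right by a variable amount is rewritten as a mask instead of a real remainder. A minimal scalar sketch of the intent, using a hypothetical function and value names:

  define i32 @urem_by_lshr_pow2(i32 %x, i32 %y) {
    %m = lshr i32 16, %y   ; 16 is a power of two, so %m is a power of two (or zero)
    %r = urem i32 %x, %m   ; urem by zero is undefined, so the fold below stays valid
    ret i32 %r
  }

  ; conceptually rewritten by the combine as:
  ;   %m1 = add i32 %m, -1
  ;   %r  = and i32 %x, %m1

This is why the scalarized divl sequences disappear from the vector tests above: the divisor is still materialized with psrld/vpsrlvd, and the remainder then reduces to an add of all-ones (paddd/vpaddd with -1) followed by pand/vpand.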