Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -3076,12 +3076,15 @@
   EVT VT = N->getValueType(0);
   SDLoc DL(N);
 
+  unsigned Opc = N->getOpcode();
+  bool IsDiv = (ISD::SDIV == Opc) || (ISD::UDIV == Opc);
+
   // X / undef -> undef
   // X % undef -> undef
   // X / 0 -> undef
   // X % 0 -> undef
   // NOTE: This includes vectors where any divisor element is zero/undef.
-  if (DAG.isUndef(N->getOpcode(), {N0, N1}))
+  if (DAG.isUndef(Opc, {N0, N1}))
     return DAG.getUNDEF(VT);
 
   // undef / X -> 0
@@ -3089,6 +3092,20 @@
   if (N0.isUndef())
     return DAG.getConstant(0, DL, VT);
 
+  // TODO: 0 / X -> 0
+  // TODO: 0 % X -> 0
+
+  // X / X -> 1
+  // X % X -> 0
+  if (N0 == N1)
+    return DAG.getConstant(IsDiv ? 1 : 0, DL, VT);
+
+  // TODO: X / 1 -> X
+  // TODO: X % 1 -> 0
+  // If this is a boolean op (single-bit element type), we can't have
+  // division-by-zero or remainder-by-zero, so assume the divisor is 1.
+  // Similarly, if we're zero-extending a boolean divisor, then assume it's a 1.
+
   return SDValue();
 }
 
Index: test/CodeGen/MSP430/libcalls.ll
===================================================================
--- test/CodeGen/MSP430/libcalls.ll
+++ test/CodeGen/MSP430/libcalls.ll
@@ -433,9 +433,10 @@
 
 ; CHECK: call #__mspabi_divi
   %0 = load volatile i16, i16* @g_i16, align 8
-  %1 = sdiv i16 %0, %0
+  %1 = load volatile i16, i16* @g_i16, align 8
+  %2 = sdiv i16 %0, %1
 
-  ret i16 %1
+  ret i16 %2
 }
 
 define i32 @divli() #0 {
@@ -444,9 +445,10 @@
 
 ; CHECK: call #__mspabi_divli
   %0 = load volatile i32, i32* @g_i32, align 8
-  %1 = sdiv i32 %0, %0
+  %1 = load volatile i32, i32* @g_i32, align 8
+  %2 = sdiv i32 %0, %1
 
-  ret i32 %1
+  ret i32 %2
 }
 
 define i64 @divlli() #0 {
@@ -455,9 +457,10 @@
 
 ; CHECK: call #__mspabi_divlli
   %0 = load volatile i64, i64* @g_i64, align 8
-  %1 = sdiv i64 %0, %0
+  %1 = load volatile i64, i64* @g_i64, align 8
+  %2 = sdiv i64 %0, %1
 
-  ret i64 %1
+  ret i64 %2
 }
 
 define i16 @divu() #0 {
@@ -466,9 +469,10 @@
 
 ; CHECK: call #__mspabi_divu
   %0 = load volatile i16, i16* @g_i16, align 8
-  %1 = udiv i16 %0, %0
+  %1 = load volatile i16, i16* @g_i16, align 8
+  %2 = udiv i16 %0, %1
 
-  ret i16 %1
+  ret i16 %2
 }
 
 define i32 @divul() #0 {
@@ -477,9 +481,10 @@
 
 ; CHECK: call #__mspabi_divul
   %0 = load volatile i32, i32* @g_i32, align 8
-  %1 = udiv i32 %0, %0
+  %1 = load volatile i32, i32* @g_i32, align 8
+  %2 = udiv i32 %0, %1
 
-  ret i32 %1
+  ret i32 %2
 }
 
 define i64 @divull() #0 {
@@ -488,9 +493,10 @@
 
 ; CHECK: call #__mspabi_divull
   %0 = load volatile i64, i64* @g_i64, align 8
-  %1 = udiv i64 %0, %0
+  %1 = load volatile i64, i64* @g_i64, align 8
+  %2 = udiv i64 %0, %1
 
-  ret i64 %1
+  ret i64 %2
 }
 
 define i16 @remi() #0 {
@@ -499,9 +505,10 @@
 
 ; CHECK: call #__mspabi_remi
   %0 = load volatile i16, i16* @g_i16, align 8
-  %1 = srem i16 %0, %0
+  %1 = load volatile i16, i16* @g_i16, align 8
+  %2 = srem i16 %0, %1
 
-  ret i16 %1
+  ret i16 %2
 }
 
 define i32 @remli() #0 {
@@ -510,9 +517,10 @@
 
 ; CHECK: call #__mspabi_remli
   %0 = load volatile i32, i32* @g_i32, align 8
-  %1 = srem i32 %0, %0
+  %1 = load volatile i32, i32* @g_i32, align 8
+  %2 = srem i32 %0, %1
 
-  ret i32 %1
+  ret i32 %2
 }
 
 define i64 @remlli() #0 {
@@ -521,9 +529,10 @@
 
 ; CHECK: call #__mspabi_remlli
   %0 = load volatile i64, i64* @g_i64, align 8
-  %1 = srem i64 %0, %0
+  %1 = load volatile i64, i64* @g_i64, align 8
+  %2 = srem i64 %0, %1
 
-  ret i64 %1
+  ret i64 %2
 }
 
 define i16 @remu() #0 {
@@ -532,9 +541,10 @@
 
 ; CHECK: call #__mspabi_remu
   %0 = load volatile i16, i16* @g_i16, align 8
-  %1 = urem i16 %0, %0
+  %1 = load volatile i16, i16* @g_i16, align 8
+  %2 = urem i16 %0, %1
 
-  ret i16 %1
+  ret i16 %2
 }
 
 define i32 @remul() #0 {
@@ -543,9 +553,10 @@
 
 ; CHECK: call #__mspabi_remul
   %0 = load volatile i32, i32* @g_i32, align 8
-  %1 = urem i32 %0, %0
+  %1 = load volatile i32, i32* @g_i32, align 8
+  %2 = urem i32 %0, %1
 
-  ret i32 %1
+  ret i32 %2
 }
 
 define i64 @remull() #0 {
@@ -554,9 +565,10 @@
 
 ; CHECK: call #__mspabi_remull
   %0 = load volatile i64, i64* @g_i64, align 8
-  %1 = urem i64 %0, %0
+  %1 = load volatile i64, i64* @g_i64, align 8
+  %2 = urem i64 %0, %1
 
-  ret i64 %1
+  ret i64 %2
 }
 
 define i16 @mpyi() #0 {
Index: test/CodeGen/SystemZ/pr32372.ll
===================================================================
--- test/CodeGen/SystemZ/pr32372.ll
+++ test/CodeGen/SystemZ/pr32372.ll
@@ -4,17 +4,18 @@
 define void @pr32372(i8*) {
 ; CHECK-LABEL: pr32372:
 ; CHECK: # %bb.0: # %BB
-; CHECK-NEXT: llc %r1, 0(%r2)
+; CHECK-NEXT: llc %r0, 0(%r2)
 ; CHECK-NEXT: mvhhi 0(%r1), -3825
-; CHECK-NEXT: llill %r0, 0
-; CHECK-NEXT: dlr %r0, %r1
+; CHECK-NEXT: llc %r3, 0(%r2)
+; CHECK-NEXT: llill %r2, 0
+; CHECK-NEXT: dlr %r2, %r0
 ; CHECK-NEXT: .LBB0_1: # %CF251
 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: j .LBB0_1
 BB:
-  %L = load i8, i8* %0
+  %L = load volatile i8, i8* %0
   store i16 -3825, i16* undef
-  %L5 = load i8, i8* %0
+  %L5 = load volatile i8, i8* %0
   %B9 = urem i8 %L5, %L
   %I107 = insertelement <8 x i8> zeroinitializer, i8 %B9, i32 7
   %ZE141 = zext i8 %L5 to i16
Index: test/CodeGen/X86/2006-11-17-IllegalMove.ll
===================================================================
--- test/CodeGen/X86/2006-11-17-IllegalMove.ll
+++ test/CodeGen/X86/2006-11-17-IllegalMove.ll
@@ -12,7 +12,7 @@
 ; CHECK-NEXT: movb 0, %al
 ; CHECK-NEXT: movzbl %al, %eax
 ; CHECK-NEXT: # kill: def $eax killed $eax def $ax
-; CHECK-NEXT: divb %al
+; CHECK-NEXT: divb 0
 ; CHECK-NEXT: movzbl %al, %eax
 ; CHECK-NEXT: cmpq %rax, %rax
 ; CHECK-NEXT: .LBB0_2: # %bb84
@@ -26,7 +26,7 @@
 
 bb77: ; preds = %entry, %entry
   %tmp99 = udiv i64 0, 0 ; [#uses=1]
-  %tmp = load i8, i8* null ; [#uses=1]
+  %tmp = load volatile i8, i8* null ; [#uses=1]
   %tmp114 = icmp eq i64 0, 0 ; [#uses=1]
   br label %cond_true115
 
@@ -34,7 +34,7 @@
   ret void
 
 cond_true115: ; preds = %bb77
-  %tmp118 = load i8, i8* null ; [#uses=1]
+  %tmp118 = load volatile i8, i8* null ; [#uses=1]
   br label %cond_true120
 
 cond_true120: ; preds = %cond_true115
Index: test/CodeGen/X86/combine-sdiv.ll
===================================================================
--- test/CodeGen/X86/combine-sdiv.ll
+++ test/CodeGen/X86/combine-sdiv.ll
@@ -173,13 +173,11 @@
   ret <4 x i32> %1
 }
 
-; TODO fold (sdiv x, x) -> 1
+; fold (sdiv x, x) -> 1
 define i32 @combine_sdiv_dupe(i32 %x) {
 ; CHECK-LABEL: combine_sdiv_dupe:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: movl %edi, %eax
-; CHECK-NEXT: cltd
-; CHECK-NEXT: idivl %edi
+; CHECK-NEXT: movl $1, %eax
 ; CHECK-NEXT: retq
   %1 = sdiv i32 %x, %x
   ret i32 %1
@@ -188,54 +186,23 @@
 define <4 x i32> @combine_vec_sdiv_dupe(<4 x i32> %x) {
 ; SSE-LABEL: combine_vec_sdiv_dupe:
 ; SSE: # %bb.0:
-; SSE-NEXT: pextrd $1, %xmm0, %ecx
-; SSE-NEXT: movl %ecx, %eax
-; SSE-NEXT: cltd
-; SSE-NEXT: idivl %ecx
-; SSE-NEXT: movl %eax, %ecx
-; SSE-NEXT: movd %xmm0, %esi
-; SSE-NEXT: movl %esi, %eax
-; SSE-NEXT: cltd
-; SSE-NEXT: idivl %esi
-; SSE-NEXT: movd %eax, %xmm1
-; SSE-NEXT: pinsrd $1, %ecx, %xmm1
-; SSE-NEXT: pextrd $2, %xmm0, %ecx
-; SSE-NEXT: movl %ecx, %eax
-; SSE-NEXT: cltd
-; SSE-NEXT: idivl %ecx
-; SSE-NEXT: pinsrd $2, %eax, %xmm1
-; SSE-NEXT: pextrd $3, %xmm0, %ecx
-; SSE-NEXT: movl %ecx, %eax
-; SSE-NEXT: cltd
-; SSE-NEXT: idivl %ecx
-; SSE-NEXT: pinsrd $3, %eax, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,1,1,1]
 ; SSE-NEXT: retq
 ;
-; AVX-LABEL: combine_vec_sdiv_dupe:
-; AVX: # %bb.0:
-; AVX-NEXT: vpextrd $1, %xmm0, %ecx
-; AVX-NEXT: movl %ecx, %eax
-; AVX-NEXT: cltd
-; AVX-NEXT: idivl %ecx
-; AVX-NEXT: movl %eax, %ecx
-; AVX-NEXT: vmovd %xmm0, %esi
-; AVX-NEXT: movl %esi, %eax
-; AVX-NEXT: cltd
-; AVX-NEXT: idivl %esi
-; AVX-NEXT: vmovd %eax, %xmm1
-; AVX-NEXT: vpinsrd $1, %ecx, %xmm1, %xmm1
-; AVX-NEXT: vpextrd $2, %xmm0, %ecx
-; AVX-NEXT: movl %ecx, %eax
-; AVX-NEXT: cltd
-; AVX-NEXT: idivl %ecx
-; AVX-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpextrd $3, %xmm0, %ecx
-; AVX-NEXT: movl %ecx, %eax
-; AVX-NEXT: cltd
-; AVX-NEXT: idivl %ecx
-; AVX-NEXT: vpinsrd $3, %eax, %xmm1, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: combine_vec_sdiv_dupe:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [1,1,1,1]
+; AVX1-NEXT: retq
+;
+; AVX2ORLATER-LABEL: combine_vec_sdiv_dupe:
+; AVX2ORLATER: # %bb.0:
+; AVX2ORLATER-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1]
+; AVX2ORLATER-NEXT: retq
+;
+; XOP-LABEL: combine_vec_sdiv_dupe:
+; XOP: # %bb.0:
+; XOP-NEXT: vmovaps {{.*#+}} xmm0 = [1,1,1,1]
+; XOP-NEXT: retq
   %1 = sdiv <4 x i32> %x, %x
   ret <4 x i32> %1
 }
Index: test/CodeGen/X86/combine-srem.ll
===================================================================
--- test/CodeGen/X86/combine-srem.ll
+++ test/CodeGen/X86/combine-srem.ll
@@ -168,14 +168,11 @@
   ret <4 x i32> %1
 }
 
-; TODO fold (srem x, x) -> 0
+; fold (srem x, x) -> 0
 define i32 @combine_srem_dupe(i32 %x) {
 ; CHECK-LABEL: combine_srem_dupe:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: movl %edi, %eax
-; CHECK-NEXT: cltd
-; CHECK-NEXT: idivl %edi
-; CHECK-NEXT: movl %edx, %eax
+; CHECK-NEXT: xorl %eax, %eax
 ; CHECK-NEXT: retq
   %1 = srem i32 %x, %x
   ret i32 %1
@@ -184,53 +181,12 @@
 define <4 x i32> @combine_vec_srem_dupe(<4 x i32> %x) {
 ; SSE-LABEL: combine_vec_srem_dupe:
 ; SSE: # %bb.0:
-; SSE-NEXT: pextrd $1, %xmm0, %ecx
-; SSE-NEXT: movl %ecx, %eax
-; SSE-NEXT: cltd
-; SSE-NEXT: idivl %ecx
-; SSE-NEXT: movl %edx, %ecx
-; SSE-NEXT: movd %xmm0, %esi
-; SSE-NEXT: movl %esi, %eax
-; SSE-NEXT: cltd
-; SSE-NEXT: idivl %esi
-; SSE-NEXT: movd %edx, %xmm1
-; SSE-NEXT: pinsrd $1, %ecx, %xmm1
-; SSE-NEXT: pextrd $2, %xmm0, %ecx
-; SSE-NEXT: movl %ecx, %eax
-; SSE-NEXT: cltd
-; SSE-NEXT: idivl %ecx
-; SSE-NEXT: pinsrd $2, %edx, %xmm1
-; SSE-NEXT: pextrd $3, %xmm0, %ecx
-; SSE-NEXT: movl %ecx, %eax
-; SSE-NEXT: cltd
-; SSE-NEXT: idivl %ecx
-; SSE-NEXT: pinsrd $3, %edx, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: xorps %xmm0, %xmm0
 ; SSE-NEXT: retq
 ;
 ; AVX-LABEL: combine_vec_srem_dupe:
 ; AVX: # %bb.0:
-; AVX-NEXT: vpextrd $1, %xmm0, %ecx
-; AVX-NEXT: movl %ecx, %eax
-; AVX-NEXT: cltd
-; AVX-NEXT: idivl %ecx
-; AVX-NEXT: movl %edx, %ecx
-; AVX-NEXT: vmovd %xmm0, %esi
-; AVX-NEXT: movl %esi, %eax
-; AVX-NEXT: cltd
-; AVX-NEXT: idivl %esi
-; AVX-NEXT: vmovd %edx, %xmm1
-; AVX-NEXT: vpinsrd $1, %ecx, %xmm1, %xmm1
-; AVX-NEXT: vpextrd $2, %xmm0, %ecx
-; AVX-NEXT: movl %ecx, %eax
-; AVX-NEXT: cltd
-; AVX-NEXT: idivl %ecx
-; AVX-NEXT: vpinsrd $2, %edx, %xmm1, %xmm1
-; AVX-NEXT: vpextrd $3, %xmm0, %ecx
-; AVX-NEXT: movl %ecx, %eax
-; AVX-NEXT: cltd
-; AVX-NEXT: idivl %ecx
-; AVX-NEXT: vpinsrd $3, %edx, %xmm1, %xmm0
+; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
 ; AVX-NEXT: retq
   %1 = srem <4 x i32> %x, %x
   ret <4 x i32> %1
@@ -473,25 +429,33 @@
 ; CHECK-LABEL: ossfuzz6883:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: movl (%rax), %ecx
-; CHECK-NEXT: movl %ecx, %eax
-; CHECK-NEXT: cltd
+; CHECK-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF
+; CHECK-NEXT: xorl %edx, %edx
 ; CHECK-NEXT: idivl %ecx
-; CHECK-NEXT: movl %edx, %esi
-; CHECK-NEXT: movl $1, %edi
+; CHECK-NEXT: movl %eax, %esi
+; CHECK-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF
+; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: divl %ecx
+; CHECK-NEXT: movl %eax, %edi
+; CHECK-NEXT: movl %esi, %eax
 ; CHECK-NEXT: cltd
 ; CHECK-NEXT: idivl %edi
+; CHECK-NEXT: movl %edx, %esi
+; CHECK-NEXT: movl %ecx, %eax
+; CHECK-NEXT: cltd
+; CHECK-NEXT: idivl %esi
 ; CHECK-NEXT: movl %edx, %edi
 ; CHECK-NEXT: xorl %edx, %edx
 ; CHECK-NEXT: movl %ecx, %eax
-; CHECK-NEXT: divl %edi
-; CHECK-NEXT: andl %esi, %eax
+; CHECK-NEXT: divl %esi
+; CHECK-NEXT: andl %edi, %eax
 ; CHECK-NEXT: retq
   %B17 = or i32 0, 2147483647
   %L6 = load i32, i32* undef
-  %B11 = sdiv i32 %L6, %L6
-  %B13 = udiv i32 %B17, %B17
+  %B11 = sdiv i32 %B17, %L6
+  %B13 = udiv i32 %B17, %L6
   %B14 = srem i32 %B11, %B13
-  %B16 = srem i32 %L6, %L6
+  %B16 = srem i32 %L6, %B14
   %B10 = udiv i32 %L6, %B14
   %B6 = and i32 %B16, %B10
   ret i32 %B6
Index: test/CodeGen/X86/combine-udiv.ll
===================================================================
--- test/CodeGen/X86/combine-udiv.ll
+++ test/CodeGen/X86/combine-udiv.ll
@@ -143,13 +143,11 @@
   ret <4 x i32> %1
 }
 
-; TODO fold (udiv x, x) -> 1
+; fold (udiv x, x) -> 1
 define i32 @combine_udiv_dupe(i32 %x) {
 ; CHECK-LABEL: combine_udiv_dupe:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: xorl %edx, %edx
-; CHECK-NEXT: movl %edi, %eax
-; CHECK-NEXT: divl %edi
+; CHECK-NEXT: movl $1, %eax
 ; CHECK-NEXT: retq
   %1 = udiv i32 %x, %x
   ret i32 %1
@@ -158,46 +156,18 @@
 define <4 x i32> @combine_vec_udiv_dupe(<4 x i32> %x) {
 ; SSE-LABEL: combine_vec_udiv_dupe:
 ; SSE: # %bb.0:
-; SSE-NEXT: pextrd $1, %xmm0, %eax
-; SSE-NEXT: xorl %edx, %edx
-; SSE-NEXT: divl %eax
-; SSE-NEXT: movl %eax, %ecx
-; SSE-NEXT: movd %xmm0, %eax
-; SSE-NEXT: xorl %edx, %edx
-; SSE-NEXT: divl %eax
-; SSE-NEXT: movd %eax, %xmm1
-; SSE-NEXT: pinsrd $1, %ecx, %xmm1
-; SSE-NEXT: pextrd $2, %xmm0, %eax
-; SSE-NEXT: xorl %edx, %edx
-; SSE-NEXT: divl %eax
-; SSE-NEXT: pinsrd $2, %eax, %xmm1
-; SSE-NEXT: pextrd $3, %xmm0, %eax
-; SSE-NEXT: xorl %edx, %edx
-; SSE-NEXT: divl %eax
-; SSE-NEXT: pinsrd $3, %eax, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,1,1,1]
 ; SSE-NEXT: retq
 ;
-; AVX-LABEL: combine_vec_udiv_dupe:
-; AVX: # %bb.0:
-; AVX-NEXT: vpextrd $1, %xmm0, %eax
-; AVX-NEXT: xorl %edx, %edx
-; AVX-NEXT: divl %eax
-; AVX-NEXT: movl %eax, %ecx
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: xorl %edx, %edx
-; AVX-NEXT: divl %eax
-; AVX-NEXT: vmovd %eax, %xmm1
-; AVX-NEXT: vpinsrd $1, %ecx, %xmm1, %xmm1
-; AVX-NEXT: vpextrd $2, %xmm0, %eax
-; AVX-NEXT: xorl %edx, %edx
-; AVX-NEXT: divl %eax
-; AVX-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpextrd $3, %xmm0, %eax
-; AVX-NEXT: xorl %edx, %edx
-; AVX-NEXT: divl %eax
-; AVX-NEXT: vpinsrd $3, %eax, %xmm1, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: combine_vec_udiv_dupe:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [1,1,1,1]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: combine_vec_udiv_dupe:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1]
+; AVX2-NEXT: retq
   %1 = udiv <4 x i32> %x, %x
   ret <4 x i32> %1
 }
Index: test/CodeGen/X86/combine-urem.ll
===================================================================
--- test/CodeGen/X86/combine-urem.ll
+++ test/CodeGen/X86/combine-urem.ll
@@ -157,14 +157,11 @@
   ret <4 x i32> %1
 }
 
-; TODO fold (urem x, x) -> 0
+; fold (urem x, x) -> 0
 define i32 @combine_urem_dupe(i32 %x) {
 ; CHECK-LABEL: combine_urem_dupe:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: xorl %edx, %edx
-; CHECK-NEXT: movl %edi, %eax
-; CHECK-NEXT: divl %edi
-; CHECK-NEXT: movl %edx, %eax
+; CHECK-NEXT: xorl %eax, %eax
 ; CHECK-NEXT: retq
   %1 = urem i32 %x, %x
   ret i32 %1
@@ -173,45 +170,12 @@
 define <4 x i32> @combine_vec_urem_dupe(<4 x i32> %x) {
 ; SSE-LABEL: combine_vec_urem_dupe:
 ; SSE: # %bb.0:
-; SSE-NEXT: pextrd $1, %xmm0, %eax
-; SSE-NEXT: xorl %edx, %edx
-; SSE-NEXT: divl %eax
-; SSE-NEXT: movl %edx, %ecx
-; SSE-NEXT: movd %xmm0, %eax
-; SSE-NEXT: xorl %edx, %edx
-; SSE-NEXT: divl %eax
-; SSE-NEXT: movd %edx, %xmm1
-; SSE-NEXT: pinsrd $1, %ecx, %xmm1
-; SSE-NEXT: pextrd $2, %xmm0, %eax
-; SSE-NEXT: xorl %edx, %edx
-; SSE-NEXT: divl %eax
-; SSE-NEXT: pinsrd $2, %edx, %xmm1
-; SSE-NEXT: pextrd $3, %xmm0, %eax
-; SSE-NEXT: xorl %edx, %edx
-; SSE-NEXT: divl %eax
-; SSE-NEXT: pinsrd $3, %edx, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: xorps %xmm0, %xmm0
 ; SSE-NEXT: retq
 ;
 ; AVX-LABEL: combine_vec_urem_dupe:
 ; AVX: # %bb.0:
-; AVX-NEXT: vpextrd $1, %xmm0, %eax
-; AVX-NEXT: xorl %edx, %edx
-; AVX-NEXT: divl %eax
-; AVX-NEXT: movl %edx, %ecx
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: xorl %edx, %edx
-; AVX-NEXT: divl %eax
-; AVX-NEXT: vmovd %edx, %xmm1
-; AVX-NEXT: vpinsrd $1, %ecx, %xmm1, %xmm1
-; AVX-NEXT: vpextrd $2, %xmm0, %eax
-; AVX-NEXT: xorl %edx, %edx
-; AVX-NEXT: divl %eax
-; AVX-NEXT: vpinsrd $2, %edx, %xmm1, %xmm1
-; AVX-NEXT: vpextrd $3, %xmm0, %eax
-; AVX-NEXT: xorl %edx, %edx
-; AVX-NEXT: divl %eax
-; AVX-NEXT: vpinsrd $3, %edx, %xmm1, %xmm0
+; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
 ; AVX-NEXT: retq
   %1 = urem <4 x i32> %x, %x
   ret <4 x i32> %1
Index: test/CodeGen/X86/known-bits.ll
===================================================================
--- test/CodeGen/X86/known-bits.ll
+++ test/CodeGen/X86/known-bits.ll
@@ -9,34 +9,46 @@
 ; X32-NEXT: pushl %ebx
 ; X32-NEXT: pushl %edi
 ; X32-NEXT: pushl %esi
+; X32-NEXT: subl $16, %esp
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movzbl (%eax), %eax
-; X32-NEXT: imull $101, %eax, %eax
+; X32-NEXT: movzbl (%eax), %ecx
+; X32-NEXT: imull $101, %ecx, %eax
 ; X32-NEXT: shrl $14, %eax
+; X32-NEXT: imull $177, %ecx, %ecx
+; X32-NEXT: shrl $14, %ecx
 ; X32-NEXT: movzbl %al, %eax
-; X32-NEXT: vmovd %eax, %xmm0
-; X32-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X32-NEXT: vpextrd $1, %xmm0, %ebp
+; X32-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; X32-NEXT: vpinsrd $1, %eax, %xmm0, %xmm1
+; X32-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; X32-NEXT: vpand %xmm2, %xmm1, %xmm1
+; X32-NEXT: movzbl %cl, %eax
+; X32-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
+; X32-NEXT: vpand %xmm2, %xmm0, %xmm0
+; X32-NEXT: vpextrd $1, %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT: vpextrd $1, %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X32-NEXT: xorl %ecx, %ecx
-; X32-NEXT: vmovd %xmm0, %esi
-; X32-NEXT: vpextrd $2, %xmm0, %edi
-; X32-NEXT: vpextrd $3, %xmm0, %ebx
+; X32-NEXT: vmovd %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X32-NEXT: vmovd %xmm0, (%esp) # 4-byte Folded Spill
+; X32-NEXT: vpextrd $2, %xmm1, %edi
+; X32-NEXT: vpextrd $2, %xmm0, %esi
+; X32-NEXT: vpextrd $3, %xmm1, %ebx
+; X32-NEXT: vpextrd $3, %xmm0, %ebp
 ; X32-NEXT: .p2align 4, 0x90
 ; X32-NEXT: .LBB0_1: # %CF
 ; X32-NEXT: # =>This Loop Header: Depth=1
 ; X32-NEXT: # Child Loop BB0_2 Depth 2
 ; X32-NEXT: xorl %edx, %edx
-; X32-NEXT: movl %ebp, %eax
-; X32-NEXT: divl %ebp
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT: divl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X32-NEXT: xorl %edx, %edx
-; X32-NEXT: movl %esi, %eax
-; X32-NEXT: divl %esi
+; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT: divl (%esp) # 4-byte Folded Reload
 ; X32-NEXT: xorl %edx, %edx
 ; X32-NEXT: movl %edi, %eax
-; X32-NEXT: divl %edi
+; X32-NEXT: divl %esi
 ; X32-NEXT: xorl %edx, %edx
 ; X32-NEXT: movl %ebx, %eax
-; X32-NEXT: divl %ebx
+; X32-NEXT: divl %ebp
 ; X32-NEXT: .p2align 4, 0x90
 ; X32-NEXT: .LBB0_2: # %CF237
 ; X32-NEXT: # Parent Loop BB0_1 Depth=1
@@ -47,33 +59,46 @@
 ;
 ; X64-LABEL: knownbits_zext_in_reg:
 ; X64: # %bb.0: # %BB
+; X64-NEXT: pushq %rbp
+; X64-NEXT: pushq %rbx
 ; X64-NEXT: movzbl (%rdi), %eax
-; X64-NEXT: imull $101, %eax, %eax
+; X64-NEXT: imull $101, %eax, %ecx
+; X64-NEXT: shrl $14, %ecx
+; X64-NEXT: imull $177, %eax, %eax
 ; X64-NEXT: shrl $14, %eax
+; X64-NEXT: movzbl %cl, %ecx
+; X64-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; X64-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm1
+; X64-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; X64-NEXT: vpand %xmm2, %xmm1, %xmm1
 ; X64-NEXT: movzbl %al, %eax
-; X64-NEXT: vmovd %eax, %xmm0
-; X64-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X64-NEXT: vpextrd $1, %xmm0, %r8d
+; X64-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
+; X64-NEXT: vpand %xmm2, %xmm0, %xmm0
+; X64-NEXT: vpextrd $1, %xmm1, %r8d
+; X64-NEXT: vpextrd $1, %xmm0, %r9d
 ; X64-NEXT: xorl %esi, %esi
-; X64-NEXT: vmovd %xmm0, %r9d
-; X64-NEXT: vpextrd $2, %xmm0, %edi
-; X64-NEXT: vpextrd $3, %xmm0, %ecx
+; X64-NEXT: vmovd %xmm1, %r10d
+; X64-NEXT: vmovd %xmm0, %r11d
+; X64-NEXT: vpextrd $2, %xmm1, %edi
+; X64-NEXT: vpextrd $2, %xmm0, %ebx
+; X64-NEXT: vpextrd $3, %xmm1, %ecx
+; X64-NEXT: vpextrd $3, %xmm0, %ebp
 ; X64-NEXT: .p2align 4, 0x90
 ; X64-NEXT: .LBB0_1: # %CF
 ; X64-NEXT: # =>This Loop Header: Depth=1
 ; X64-NEXT: # Child Loop BB0_2 Depth 2
 ; X64-NEXT: xorl %edx, %edx
 ; X64-NEXT: movl %r8d, %eax
-; X64-NEXT: divl %r8d
-; X64-NEXT: xorl %edx, %edx
-; X64-NEXT: movl %r9d, %eax
 ; X64-NEXT: divl %r9d
 ; X64-NEXT: xorl %edx, %edx
+; X64-NEXT: movl %r10d, %eax
+; X64-NEXT: divl %r11d
+; X64-NEXT: xorl %edx, %edx
 ; X64-NEXT: movl %edi, %eax
-; X64-NEXT: divl %edi
+; X64-NEXT: divl %ebx
 ; X64-NEXT: xorl %edx, %edx
 ; X64-NEXT: movl %ecx, %eax
-; X64-NEXT: divl %ecx
+; X64-NEXT: divl %ebp
 ; X64-NEXT: .p2align 4, 0x90
 ; X64-NEXT: .LBB0_2: # %CF237
 ; X64-NEXT: # Parent Loop BB0_1 Depth=1
@@ -85,11 +110,13 @@
   %L5 = load i8, i8* %0
   %Sl9 = select i1 true, i8 %L5, i8 undef
   %B21 = udiv i8 %Sl9, -93
+  %B22 = udiv i8 %Sl9, 93
   br label %CF
 
 CF: ; preds = %CF246, %BB
   %I40 = insertelement <4 x i8> zeroinitializer, i8 %B21, i32 1
-  %B41 = srem <4 x i8> %I40, %I40
+  %I41 = insertelement <4 x i8> zeroinitializer, i8 %B22, i32 1
+  %B41 = srem <4 x i8> %I40, %I41
   br label %CF237
 
 CF237: ; preds = %CF237, %CF
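; Illustrative sketch only, not part of the patch above: with the new
; DAGCombiner fold, IR that divides (or takes a remainder of) a value by
; itself is simplified to a constant during instruction selection, so no
; hardware divide or libcall is emitted. The function names below are
; hypothetical, written in the style of the combine-* tests.

define i32 @sdiv_self(i32 %x) {
  %d = sdiv i32 %x, %x        ; folds to 1 (X / X -> 1)
  ret i32 %d
}

define i32 @urem_self(i32 %x) {
  %r = urem i32 %x, %x        ; folds to 0 (X % X -> 0)
  ret i32 %r
}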