Index: lib/CodeGen/SelectionDAG/SelectionDAG.cpp =================================================================== --- lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -1974,9 +1974,11 @@ break; } - // We can always fold X == X for integer setcc's. - if (N1 == N2 && OpVT.isInteger()) - return getBoolConstant(ISD::isTrueWhenEqual(Cond), dl, VT, OpVT); + // icmp X, X -> true/false + // icmp X, undef -> true/false because undef could be X. + if (OpVT.isInteger()) + if (N1 == N2 || N1.isUndef() || N2.isUndef()) + return getBoolConstant(ISD::isTrueWhenEqual(Cond), dl, VT, OpVT); if (ConstantSDNode *N2C = dyn_cast(N2)) { const APInt &C2 = N2C->getAPIntValue(); Index: test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll =================================================================== --- test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll +++ test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll @@ -172,9 +172,7 @@ define <4 x i32> @test_urem_div_undef(<4 x i32> %X) nounwind readnone { ; CHECK-LABEL: test_urem_div_undef: ; CHECK: // %bb.0: -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 -; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: movi v0.4s, #1 ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, @@ -207,9 +205,7 @@ define <4 x i32> @test_urem_both_undef(<4 x i32> %X) nounwind readnone { ; CHECK-LABEL: test_urem_both_undef: ; CHECK: // %bb.0: -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 -; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: movi v0.4s, #1 ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, Index: test/CodeGen/SPARC/missinglabel.ll =================================================================== --- test/CodeGen/SPARC/missinglabel.ll +++ test/CodeGen/SPARC/missinglabel.ll @@ -4,7 +4,7 @@ target datalayout = "E-m:e-i64:64-n32:64-S128" target triple = "sparc64-unknown-linux-gnu" -define void @f() align 2 { +define void @f(i64 %a0) align 2 { ; CHECK-LABEL: f: ; CHECK: .cfi_startproc ; CHECK-NEXT: ! %bb.0: ! %entry @@ -22,7 +22,7 @@ ; CHECK-NEXT: .LBB0_1: ! %cond.false ; CHECK-NEXT: .LBB0_4: ! %exit.i85 entry: - %cmp = icmp eq i64 undef, 0 + %cmp = icmp eq i64 %a0, 0 br i1 %cmp, label %targetblock, label %cond.false cond.false: Index: test/CodeGen/SystemZ/buildvector-00.ll =================================================================== --- test/CodeGen/SystemZ/buildvector-00.ll +++ test/CodeGen/SystemZ/buildvector-00.ll @@ -4,12 +4,12 @@ ; Test that the dag combiner can understand that some vector operands are ; all-zeros and then optimize the logical operations. -define void @f1() { +define void @f1(<2 x i64> %a0) { ; CHECK-LABEL: f1: ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: vlrepg %v0, 0(%r1) ; CHECK-NEXT: vgbm %v1, 0 -; CHECK-NEXT: vceqg %v2, %v0, %v1 +; CHECK-NEXT: vceqg %v2, %v24, %v1 ; CHECK-NEXT: vn %v0, %v0, %v0 ; CHECK-NEXT: vno %v2, %v2, %v2 ; CHECK-NEXT: vceqg %v0, %v0, %v1 @@ -26,13 +26,13 @@ bb1: ; preds = %bb %tmp2 = load i64, i64* undef, align 8 %tmp3 = insertelement <2 x i64> undef, i64 %tmp2, i32 1 - %tmp4 = icmp ne <2 x i64> undef, zeroinitializer + %tmp4 = icmp ne <2 x i64> %a0, zeroinitializer %tmp5 = xor <2 x i1> %tmp4, zeroinitializer %tmp6 = xor <2 x i1> zeroinitializer, %tmp5 %tmp7 = and <2 x i64> %tmp3, %tmp %tmp8 = icmp ne <2 x i64> %tmp7, zeroinitializer %tmp9 = xor <2 x i1> zeroinitializer, %tmp8 - %tmp10 = icmp ne <2 x i64> undef, zeroinitializer + %tmp10 = icmp ne <2 x i64> %a0, zeroinitializer %tmp11 = xor <2 x i1> %tmp10, %tmp9 %tmp12 = and <2 x i1> %tmp6, %tmp11 %tmp13 = extractelement <2 x i1> %tmp12, i32 0 Index: test/CodeGen/SystemZ/dag-combine-03.ll =================================================================== --- test/CodeGen/SystemZ/dag-combine-03.ll +++ test/CodeGen/SystemZ/dag-combine-03.ll @@ -10,7 +10,7 @@ ; handled so that the AND is removed. If this succeeds, this results in a CHI ; instead of TMLL. -define void @fun() { +define void @fun(i64 %a0) { ; CHECK-LABEL: fun: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: lghi %r1, 0 @@ -18,13 +18,13 @@ ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: la %r0, 2(%r1) ; CHECK-NEXT: la %r1, 1(%r1) -; CHECK-NEXT: cgr %r1, %r0 -; CHECK-NEXT: lhi %r2, 0 -; CHECK-NEXT: lochie %r2, 1 -; CHECK-NEXT: cgr %r0, %r0 +; CHECK-NEXT: cgr %r1, %r2 +; CHECK-NEXT: lhi %r3, 0 +; CHECK-NEXT: lochie %r3, 1 +; CHECK-NEXT: cgr %r0, %r2 ; CHECK-NEXT: lhi %r0, 0 ; CHECK-NEXT: lochie %r0, 1 -; CHECK-NEXT: vlvgp %v0, %r2, %r2 +; CHECK-NEXT: vlvgp %v0, %r3, %r3 ; CHECK-NEXT: vlvgp %v1, %r0, %r0 ; CHECK-NEXT: vx %v0, %v0, %v1 ; CHECK-NEXT: vlgvf %r0, %v0, 1 @@ -38,8 +38,8 @@ %phi = phi i64 [ %sel, %lab0 ], [ 0, %entry ] %add = add nuw nsw i64 %phi, 1 %add2 = add nuw nsw i64 %phi, 2 - %cmp = icmp eq i64 %add, undef - %cmp2 = icmp eq i64 %add2, undef + %cmp = icmp eq i64 %add, %a0 + %cmp2 = icmp eq i64 %add2, %a0 %ins = insertelement <2 x i1> undef, i1 %cmp, i32 0 %ins2 = insertelement <2 x i1> undef, i1 %cmp2, i32 0 %xor = xor <2 x i1> %ins, %ins2 Index: test/CodeGen/X86/2006-11-17-IllegalMove.ll =================================================================== --- test/CodeGen/X86/2006-11-17-IllegalMove.ll +++ test/CodeGen/X86/2006-11-17-IllegalMove.ll @@ -10,11 +10,9 @@ ; CHECK-NEXT: ja .LBB0_2 ; CHECK-NEXT: # %bb.1: # %bb77 ; CHECK-NEXT: movb 0, %al -; CHECK-NEXT: movzbl %al, %eax -; CHECK-NEXT: # kill: def $eax killed $eax def $ax -; CHECK-NEXT: divb 0 -; CHECK-NEXT: movzbl %al, %eax -; CHECK-NEXT: cmpq %rax, %rax +; CHECK-NEXT: movb 0, %al +; CHECK-NEXT: movb $1, %al +; CHECK-NEXT: testb %al, %al ; CHECK-NEXT: .LBB0_2: # %bb84 ; CHECK-NEXT: retq entry: Index: test/CodeGen/X86/hoist-spill.ll =================================================================== --- test/CodeGen/X86/hoist-spill.ll +++ test/CodeGen/X86/hoist-spill.ll @@ -14,7 +14,7 @@ @d = external global i32*, align 8 ; Function Attrs: norecurse noreturn nounwind uwtable -define void @fn1(i32 %p1) { +define void @fn1(i32 %p1, i32 %p2, i64 %p3) { entry: %tmp = load i32*, i32** @d, align 8 %tmp1 = load i32*, i32** @a, align 8 @@ -26,10 +26,10 @@ %indvars.iv30.in = phi i32 [ %indvars.iv30, %for.inc14 ], [ %p1, %entry ] %c.0 = phi i32 [ %inc15, %for.inc14 ], [ 1, %entry ] %k.0 = phi i32 [ %k.1.lcssa, %for.inc14 ], [ undef, %entry ] - %tmp3 = icmp sgt i32 undef, 0 + %tmp3 = icmp sgt i32 %p2, 0 %smax52 = select i1 %tmp3, i32 %c.0, i32 0 %tmp4 = zext i32 %smax52 to i64 - %tmp5 = icmp sgt i64 undef, %tmp4 + %tmp5 = icmp sgt i64 %p3, %tmp4 %smax53 = select i1 %tmp5, i64 %tmp2, i64 %tmp4 %tmp6 = add nsw i64 %smax53, 1 %tmp7 = sub nsw i64 %tmp6, %tmp4 Index: test/CodeGen/X86/select.ll =================================================================== --- test/CodeGen/X86/select.ll +++ test/CodeGen/X86/select.ll @@ -1261,106 +1261,6 @@ ret void } -define void @test19() { -; This is a massive reduction of an llvm-stress test case that generates -; interesting chains feeding setcc and eventually a f32 select operation. This -; is intended to exercise the SELECT formation in the DAG combine simplifying -; a simplified select_cc node. If it it regresses and is no longer triggering -; that code path, it can be deleted. -; -; CHECK-LABEL: test19: -; CHECK: ## %bb.0: ## %BB -; CHECK-NEXT: movl $-1, %eax -; CHECK-NEXT: movb $1, %cl -; CHECK-NEXT: .p2align 4, 0x90 -; CHECK-NEXT: LBB23_1: ## %CF -; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: testb %cl, %cl -; CHECK-NEXT: jne LBB23_1 -; CHECK-NEXT: ## %bb.2: ## %CF250 -; CHECK-NEXT: ## in Loop: Header=BB23_1 Depth=1 -; CHECK-NEXT: jne LBB23_1 -; CHECK-NEXT: .p2align 4, 0x90 -; CHECK-NEXT: LBB23_3: ## %CF242 -; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: cmpl %eax, %eax -; CHECK-NEXT: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: jp LBB23_3 -; CHECK-NEXT: ## %bb.4: ## %CF244 -; CHECK-NEXT: retq -; -; ATHLON-LABEL: test19: -; ATHLON: ## %bb.0: ## %BB -; ATHLON-NEXT: movb $1, %al -; ATHLON-NEXT: .p2align 4, 0x90 -; ATHLON-NEXT: LBB23_1: ## %CF -; ATHLON-NEXT: ## =>This Inner Loop Header: Depth=1 -; ATHLON-NEXT: testb %al, %al -; ATHLON-NEXT: jne LBB23_1 -; ATHLON-NEXT: ## %bb.2: ## %CF250 -; ATHLON-NEXT: ## in Loop: Header=BB23_1 Depth=1 -; ATHLON-NEXT: jne LBB23_1 -; ATHLON-NEXT: ## %bb.3: ## %CF242.preheader -; ATHLON-NEXT: fldz -; ATHLON-NEXT: .p2align 4, 0x90 -; ATHLON-NEXT: LBB23_4: ## %CF242 -; ATHLON-NEXT: ## =>This Inner Loop Header: Depth=1 -; ATHLON-NEXT: fucomi %st(0), %st -; ATHLON-NEXT: jp LBB23_4 -; ATHLON-NEXT: ## %bb.5: ## %CF244 -; ATHLON-NEXT: fstp %st(0) -; ATHLON-NEXT: retl -; -; MCU-LABEL: test19: -; MCU: # %bb.0: # %BB -; MCU-NEXT: movl $-1, %ecx -; MCU-NEXT: movb $1, %al -; MCU-NEXT: .p2align 4, 0x90 -; MCU-NEXT: .LBB23_1: # %CF -; MCU-NEXT: # =>This Inner Loop Header: Depth=1 -; MCU-NEXT: testb %al, %al -; MCU-NEXT: jne .LBB23_1 -; MCU-NEXT: # %bb.2: # %CF250 -; MCU-NEXT: # in Loop: Header=BB23_1 Depth=1 -; MCU-NEXT: jne .LBB23_1 -; MCU-NEXT: # %bb.3: # %CF242.preheader -; MCU-NEXT: fldz -; MCU-NEXT: .p2align 4, 0x90 -; MCU-NEXT: .LBB23_4: # %CF242 -; MCU-NEXT: # =>This Inner Loop Header: Depth=1 -; MCU-NEXT: cmpl %eax, %ecx -; MCU-NEXT: fucom %st(0) -; MCU-NEXT: fnstsw %ax -; MCU-NEXT: # kill: def $ah killed $ah killed $ax -; MCU-NEXT: sahf -; MCU-NEXT: jp .LBB23_4 -; MCU-NEXT: # %bb.5: # %CF244 -; MCU-NEXT: fstp %st(0) -; MCU-NEXT: retl -BB: - br label %CF - -CF: - %Cmp10 = icmp ule i8 undef, undef - br i1 %Cmp10, label %CF, label %CF250 - -CF250: - %E12 = extractelement <4 x i32> , i32 2 - %Cmp32 = icmp ugt i1 %Cmp10, false - br i1 %Cmp32, label %CF, label %CF242 - -CF242: - %Cmp38 = icmp uge i32 %E12, undef - %FC = uitofp i1 %Cmp38 to float - %Sl59 = select i1 %Cmp32, float %FC, float undef - %Cmp60 = fcmp ugt float undef, undef - br i1 %Cmp60, label %CF242, label %CF244 - -CF244: - %B122 = fadd float %Sl59, undef - ret void -} - define i16 @select_xor_1(i16 %A, i8 %cond) { ; CHECK-LABEL: select_xor_1: ; CHECK: ## %bb.0: ## %entry @@ -1422,10 +1322,10 @@ ; MCU-LABEL: select_xor_1b: ; MCU: # %bb.0: # %entry ; MCU-NEXT: testb $1, %dl -; MCU-NEXT: je .LBB25_2 +; MCU-NEXT: je .LBB24_2 ; MCU-NEXT: # %bb.1: ; MCU-NEXT: xorl $43, %eax -; MCU-NEXT: .LBB25_2: # %entry +; MCU-NEXT: .LBB24_2: # %entry ; MCU-NEXT: # kill: def $ax killed $ax killed $eax ; MCU-NEXT: retl entry: @@ -1492,10 +1392,10 @@ ; MCU-LABEL: select_xor_2b: ; MCU: # %bb.0: # %entry ; MCU-NEXT: testb $1, %cl -; MCU-NEXT: je .LBB27_2 +; MCU-NEXT: je .LBB26_2 ; MCU-NEXT: # %bb.1: ; MCU-NEXT: xorl %edx, %eax -; MCU-NEXT: .LBB27_2: # %entry +; MCU-NEXT: .LBB26_2: # %entry ; MCU-NEXT: retl entry: %and = and i8 %cond, 1 @@ -1561,10 +1461,10 @@ ; MCU-LABEL: select_or_b: ; MCU: # %bb.0: # %entry ; MCU-NEXT: testb $1, %cl -; MCU-NEXT: je .LBB29_2 +; MCU-NEXT: je .LBB28_2 ; MCU-NEXT: # %bb.1: ; MCU-NEXT: orl %edx, %eax -; MCU-NEXT: .LBB29_2: # %entry +; MCU-NEXT: .LBB28_2: # %entry ; MCU-NEXT: retl entry: %and = and i8 %cond, 1 @@ -1630,10 +1530,10 @@ ; MCU-LABEL: select_or_1b: ; MCU: # %bb.0: # %entry ; MCU-NEXT: testb $1, %cl -; MCU-NEXT: je .LBB31_2 +; MCU-NEXT: je .LBB30_2 ; MCU-NEXT: # %bb.1: ; MCU-NEXT: orl %edx, %eax -; MCU-NEXT: .LBB31_2: # %entry +; MCU-NEXT: .LBB30_2: # %entry ; MCU-NEXT: retl entry: %and = and i32 %cond, 1 Index: test/CodeGen/X86/tail-dup-merge-loop-headers.ll =================================================================== --- test/CodeGen/X86/tail-dup-merge-loop-headers.ll +++ test/CodeGen/X86/tail-dup-merge-loop-headers.ll @@ -91,116 +91,107 @@ ; CHECK-NEXT: pushq %r12 ; CHECK-NEXT: pushq %rbx ; CHECK-NEXT: pushq %rax -; CHECK-NEXT: movl $1, %ebx +; CHECK-NEXT: movl $1, %r14d ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: testb %al, %al -; CHECK-NEXT: jne .LBB1_26 +; CHECK-NEXT: jne .LBB1_15 ; CHECK-NEXT: # %bb.1: # %if.end19 ; CHECK-NEXT: movl %esi, %ebp ; CHECK-NEXT: movq %rdi, %r12 -; CHECK-NEXT: movl (%rax), %r14d -; CHECK-NEXT: leal (,%r14,4), %r13d +; CHECK-NEXT: movl (%rax), %ebx +; CHECK-NEXT: leal (,%rbx,4), %r13d ; CHECK-NEXT: movl %r13d, %r15d ; CHECK-NEXT: movl $1, %esi ; CHECK-NEXT: movq %r15, %rdi ; CHECK-NEXT: callq cli_calloc ; CHECK-NEXT: testl %ebp, %ebp -; CHECK-NEXT: je .LBB1_25 +; CHECK-NEXT: je .LBB1_15 ; CHECK-NEXT: # %bb.2: # %if.end19 -; CHECK-NEXT: testl %r14d, %r14d -; CHECK-NEXT: je .LBB1_25 +; CHECK-NEXT: testl %ebx, %ebx +; CHECK-NEXT: je .LBB1_15 ; CHECK-NEXT: # %bb.3: # %if.end19 -; CHECK-NEXT: movq %rax, %rbx -; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: testb %al, %al -; CHECK-NEXT: jne .LBB1_25 +; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: testb %cl, %cl +; CHECK-NEXT: jne .LBB1_15 ; CHECK-NEXT: # %bb.4: # %if.end19 ; CHECK-NEXT: cmpq %r12, %rax -; CHECK-NEXT: jb .LBB1_25 +; CHECK-NEXT: jb .LBB1_15 ; CHECK-NEXT: # %bb.5: # %if.end50 -; CHECK-NEXT: movq %rbx, %rdi +; CHECK-NEXT: movq %rax, %rdi ; CHECK-NEXT: movq %r15, %rdx ; CHECK-NEXT: callq memcpy ; CHECK-NEXT: cmpl $4, %r13d -; CHECK-NEXT: jb .LBB1_28 +; CHECK-NEXT: jb .LBB1_23 ; CHECK-NEXT: # %bb.6: # %shared_preheader -; CHECK-NEXT: movb $32, %dl +; CHECK-NEXT: movb $32, %sil ; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: # implicit-def: $rcx -; CHECK-NEXT: testl %eax, %eax -; CHECK-NEXT: je .LBB1_18 +; CHECK-NEXT: movb $1, %cl +; CHECK-NEXT: # implicit-def: $rdx +; CHECK-NEXT: testl %ebx, %ebx +; CHECK-NEXT: jne .LBB1_11 +; CHECK-NEXT: jmp .LBB1_8 +; CHECK-NEXT: .LBB1_21: # %outer_loop_latch +; CHECK-NEXT: # in Loop: Header=BB1_11 Depth=1 +; CHECK-NEXT: movzwl %di, %edi +; CHECK-NEXT: decl %edi +; CHECK-NEXT: movzwl %di, %edi +; CHECK-NEXT: leaq 1(%rdx,%rdi), %rdx +; CHECK-NEXT: testl %ebx, %ebx +; CHECK-NEXT: je .LBB1_8 ; CHECK-NEXT: .p2align 4, 0x90 -; CHECK-NEXT: .LBB1_8: # %shared_loop_header +; CHECK-NEXT: .LBB1_11: # %shared_loop_header ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: cmpq %rbx, %rax -; CHECK-NEXT: jb .LBB1_27 -; CHECK-NEXT: # %bb.9: # %inner_loop_body -; CHECK-NEXT: # in Loop: Header=BB1_8 Depth=1 -; CHECK-NEXT: testl %eax, %eax -; CHECK-NEXT: jns .LBB1_8 -; CHECK-NEXT: # %bb.10: # %if.end96.i -; CHECK-NEXT: # in Loop: Header=BB1_8 Depth=1 -; CHECK-NEXT: cmpl $3, %eax -; CHECK-NEXT: jae .LBB1_22 -; CHECK-NEXT: # %bb.11: # %if.end287.i -; CHECK-NEXT: # in Loop: Header=BB1_8 Depth=1 -; CHECK-NEXT: xorl %esi, %esi -; CHECK-NEXT: cmpl $1, %eax -; CHECK-NEXT: setne %dl ; CHECK-NEXT: testb %al, %al -; CHECK-NEXT: jne .LBB1_15 -; CHECK-NEXT: # %bb.12: # %if.end308.i -; CHECK-NEXT: # in Loop: Header=BB1_8 Depth=1 +; CHECK-NEXT: jne .LBB1_22 +; CHECK-NEXT: # %bb.12: # %inner_loop_body +; CHECK-NEXT: # in Loop: Header=BB1_11 Depth=1 ; CHECK-NEXT: testb %al, %al -; CHECK-NEXT: je .LBB1_17 -; CHECK-NEXT: # %bb.13: # %if.end335.i -; CHECK-NEXT: # in Loop: Header=BB1_8 Depth=1 -; CHECK-NEXT: xorl %edx, %edx -; CHECK-NEXT: testb %dl, %dl -; CHECK-NEXT: movl $0, %esi -; CHECK-NEXT: jne .LBB1_7 -; CHECK-NEXT: # %bb.14: # %merge_other -; CHECK-NEXT: # in Loop: Header=BB1_8 Depth=1 -; CHECK-NEXT: xorl %esi, %esi -; CHECK-NEXT: jmp .LBB1_16 -; CHECK-NEXT: .LBB1_15: # in Loop: Header=BB1_8 Depth=1 -; CHECK-NEXT: movb %dl, %sil -; CHECK-NEXT: addl $3, %esi -; CHECK-NEXT: .LBB1_16: # %outer_loop_latch -; CHECK-NEXT: # in Loop: Header=BB1_8 Depth=1 -; CHECK-NEXT: # implicit-def: $dl -; CHECK-NEXT: jmp .LBB1_7 -; CHECK-NEXT: .LBB1_17: # %merge_predecessor_split -; CHECK-NEXT: # in Loop: Header=BB1_8 Depth=1 -; CHECK-NEXT: movb $32, %dl +; CHECK-NEXT: je .LBB1_11 +; CHECK-NEXT: # %bb.13: # %if.end96.i +; CHECK-NEXT: # in Loop: Header=BB1_11 Depth=1 +; CHECK-NEXT: testb %cl, %cl +; CHECK-NEXT: je .LBB1_14 +; CHECK-NEXT: # %bb.16: # %if.end287.i +; CHECK-NEXT: # in Loop: Header=BB1_11 Depth=1 +; CHECK-NEXT: movw $3, %di +; CHECK-NEXT: testb %al, %al +; CHECK-NEXT: # implicit-def: $sil +; CHECK-NEXT: jne .LBB1_21 +; CHECK-NEXT: # %bb.17: # %if.end308.i +; CHECK-NEXT: # in Loop: Header=BB1_11 Depth=1 +; CHECK-NEXT: testb %al, %al +; CHECK-NEXT: je .LBB1_18 +; CHECK-NEXT: # %bb.19: # %if.end335.i +; CHECK-NEXT: # in Loop: Header=BB1_11 Depth=1 ; CHECK-NEXT: xorl %esi, %esi -; CHECK-NEXT: .LBB1_7: # %outer_loop_latch -; CHECK-NEXT: # in Loop: Header=BB1_8 Depth=1 -; CHECK-NEXT: movzwl %si, %esi -; CHECK-NEXT: decl %esi -; CHECK-NEXT: movzwl %si, %esi -; CHECK-NEXT: leaq 1(%rcx,%rsi), %rcx -; CHECK-NEXT: testl %eax, %eax -; CHECK-NEXT: jne .LBB1_8 -; CHECK-NEXT: .LBB1_18: # %while.cond.us1412.i +; CHECK-NEXT: testb %sil, %sil +; CHECK-NEXT: movl $0, %edi +; CHECK-NEXT: jne .LBB1_21 +; CHECK-NEXT: # %bb.20: # %merge_other +; CHECK-NEXT: # in Loop: Header=BB1_11 Depth=1 +; CHECK-NEXT: xorl %edi, %edi +; CHECK-NEXT: # implicit-def: $sil +; CHECK-NEXT: jmp .LBB1_21 +; CHECK-NEXT: .LBB1_18: # %merge_predecessor_split +; CHECK-NEXT: # in Loop: Header=BB1_11 Depth=1 +; CHECK-NEXT: movb $32, %sil +; CHECK-NEXT: xorl %edi, %edi +; CHECK-NEXT: jmp .LBB1_21 +; CHECK-NEXT: .LBB1_8: # %while.cond.us1412.i ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: testb %al, %al -; CHECK-NEXT: movl $1, %ebx -; CHECK-NEXT: jne .LBB1_20 -; CHECK-NEXT: # %bb.19: # %while.cond.us1412.i -; CHECK-NEXT: decb %dl -; CHECK-NEXT: jne .LBB1_26 -; CHECK-NEXT: .LBB1_20: # %if.end41.us1436.i -; CHECK-NEXT: .LBB1_25: -; CHECK-NEXT: movl $1, %ebx -; CHECK-NEXT: jmp .LBB1_26 -; CHECK-NEXT: .LBB1_22: # %if.then99.i -; CHECK-NEXT: xorl %ebx, %ebx +; CHECK-NEXT: jne .LBB1_10 +; CHECK-NEXT: # %bb.9: # %while.cond.us1412.i +; CHECK-NEXT: decb %sil +; CHECK-NEXT: jne .LBB1_15 +; CHECK-NEXT: .LBB1_10: # %if.end41.us1436.i +; CHECK-NEXT: .LBB1_14: # %if.then99.i +; CHECK-NEXT: xorl %r14d, %r14d ; CHECK-NEXT: movl $.str.6, %edi ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: callq cli_dbgmsg -; CHECK-NEXT: .LBB1_26: # %cleanup -; CHECK-NEXT: movl %ebx, %eax +; CHECK-NEXT: .LBB1_15: # %cleanup +; CHECK-NEXT: movl %r14d, %eax ; CHECK-NEXT: addq $8, %rsp ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: popq %r12 @@ -209,8 +200,8 @@ ; CHECK-NEXT: popq %r15 ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: retq -; CHECK-NEXT: .LBB1_27: # %wunpsect.exit.thread.loopexit389 -; CHECK-NEXT: .LBB1_28: # %wunpsect.exit.thread.loopexit391 +; CHECK-NEXT: .LBB1_22: # %wunpsect.exit.thread.loopexit389 +; CHECK-NEXT: .LBB1_23: # %wunpsect.exit.thread.loopexit391 entry: %0 = load i32, i32* undef, align 4 %mul = shl nsw i32 %0, 2 @@ -223,7 +214,7 @@ %notrhs = icmp eq i32 %0, 0 %or.cond117.not = or i1 %1, %notrhs %or.cond202 = or i1 %or.cond117.not, undef - %cmp35 = icmp ult i8* undef, %exe + %cmp35 = icmp ult i8* %call, %exe %or.cond203 = or i1 %or.cond202, %cmp35 br i1 %or.cond203, label %cleanup, label %if.end50 @@ -238,7 +229,7 @@ outer_loop_header: ; preds = %outer_loop_latch, %shared_preheader %bits.1.i = phi i8 [ 32, %shared_preheader ], [ %bits.43.i, %outer_loop_latch ] %dst.0.ph.i = phi i8* [ undef, %shared_preheader ], [ %scevgep679.i, %outer_loop_latch ] - %2 = icmp eq i32 undef, 0 + %2 = icmp eq i32 %0, 0 br i1 %2, label %while.cond.us1412.i, label %shared_loop_header while.cond.us1412.i: ; preds = %outer_loop_header Index: test/CodeGen/X86/tail-dup-repeat.ll =================================================================== --- test/CodeGen/X86/tail-dup-repeat.ll +++ test/CodeGen/X86/tail-dup-repeat.ll @@ -7,7 +7,7 @@ ; and if.then64, and then the block dup2 gets duplicated into land.lhs.true ; and if.end70 -define void @repeated_tail_dup(i1 %a1, i1 %a2, i32* %a4, i32* %a5, i8* %a6) #0 align 2 { +define void @repeated_tail_dup(i1 %a1, i1 %a2, i32* %a4, i32* %a5, i8* %a6, i32 %a7) #0 align 2 { ; CHECK-LABEL: repeated_tail_dup: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: .p2align 4, 0x90 @@ -27,7 +27,7 @@ ; CHECK-NEXT: # %bb.4: # %if.then64 ; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1 ; CHECK-NEXT: movb $1, (%r8) -; CHECK-NEXT: testl %eax, %eax +; CHECK-NEXT: testl %r9d, %r9d ; CHECK-NEXT: je .LBB0_1 ; CHECK-NEXT: jmp .LBB0_8 ; CHECK-NEXT: .p2align 4, 0x90 @@ -37,7 +37,7 @@ ; CHECK-NEXT: .LBB0_6: # %dup2 ; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1 ; CHECK-NEXT: movl $2, (%rcx) -; CHECK-NEXT: testl %eax, %eax +; CHECK-NEXT: testl %r9d, %r9d ; CHECK-NEXT: je .LBB0_1 ; CHECK-NEXT: .LBB0_8: # %for.end ; CHECK-NEXT: retq @@ -68,7 +68,7 @@ dup1: ; preds = %dup2, %if.then64 %val = load i32, i32* %a4, align 8 - %switch = icmp ult i32 undef, 1 + %switch = icmp ult i32 %a7, 1 br i1 %switch, label %for.cond, label %for.end for.end: ; preds = %dup1 Index: test/CodeGen/X86/undef-ops.ll =================================================================== --- test/CodeGen/X86/undef-ops.ll +++ test/CodeGen/X86/undef-ops.ll @@ -450,8 +450,7 @@ define i1 @undef_operand_size_not_same_as_result() { ; CHECK-LABEL: undef_operand_size_not_same_as_result: ; CHECK: # %bb.0: -; CHECK-NEXT: testl %eax, %eax -; CHECK-NEXT: sete %al +; CHECK-NEXT: movb $1, %al ; CHECK-NEXT: retq %sh = shl i32 7, undef %cmp = icmp eq i32 0, %sh Index: test/CodeGen/X86/urem-seteq-vec-nonsplat.ll =================================================================== --- test/CodeGen/X86/urem-seteq-vec-nonsplat.ll +++ test/CodeGen/X86/urem-seteq-vec-nonsplat.ll @@ -683,17 +683,23 @@ define <4 x i32> @test_urem_div_undef(<4 x i32> %X) nounwind readnone { ; CHECK-SSE-LABEL: test_urem_div_undef: ; CHECK-SSE: # %bb.0: -; CHECK-SSE-NEXT: pxor %xmm0, %xmm0 -; CHECK-SSE-NEXT: pcmpeqd %xmm0, %xmm0 -; CHECK-SSE-NEXT: psrld $31, %xmm0 +; CHECK-SSE-NEXT: movaps {{.*#+}} xmm0 = [1,1,1,1] ; CHECK-SSE-NEXT: retq ; -; CHECK-AVX-LABEL: test_urem_div_undef: -; CHECK-AVX: # %bb.0: -; CHECK-AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; CHECK-AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; CHECK-AVX-NEXT: vpsrld $31, %xmm0, %xmm0 -; CHECK-AVX-NEXT: retq +; CHECK-AVX1-LABEL: test_urem_div_undef: +; CHECK-AVX1: # %bb.0: +; CHECK-AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [1,1,1,1] +; CHECK-AVX1-NEXT: retq +; +; CHECK-AVX2-LABEL: test_urem_div_undef: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1] +; CHECK-AVX2-NEXT: retq +; +; CHECK-AVX512VL-LABEL: test_urem_div_undef: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1] +; CHECK-AVX512VL-NEXT: retq %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, %ret = zext <4 x i1> %cmp to <4 x i32> @@ -794,17 +800,23 @@ define <4 x i32> @test_urem_both_undef(<4 x i32> %X) nounwind readnone { ; CHECK-SSE-LABEL: test_urem_both_undef: ; CHECK-SSE: # %bb.0: -; CHECK-SSE-NEXT: pxor %xmm0, %xmm0 -; CHECK-SSE-NEXT: pcmpeqd %xmm0, %xmm0 -; CHECK-SSE-NEXT: psrld $31, %xmm0 +; CHECK-SSE-NEXT: movaps {{.*#+}} xmm0 = [1,1,1,1] ; CHECK-SSE-NEXT: retq ; -; CHECK-AVX-LABEL: test_urem_both_undef: -; CHECK-AVX: # %bb.0: -; CHECK-AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; CHECK-AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; CHECK-AVX-NEXT: vpsrld $31, %xmm0, %xmm0 -; CHECK-AVX-NEXT: retq +; CHECK-AVX1-LABEL: test_urem_both_undef: +; CHECK-AVX1: # %bb.0: +; CHECK-AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [1,1,1,1] +; CHECK-AVX1-NEXT: retq +; +; CHECK-AVX2-LABEL: test_urem_both_undef: +; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1] +; CHECK-AVX2-NEXT: retq +; +; CHECK-AVX512VL-LABEL: test_urem_both_undef: +; CHECK-AVX512VL: # %bb.0: +; CHECK-AVX512VL-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1] +; CHECK-AVX512VL-NEXT: retq %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, %ret = zext <4 x i1> %cmp to <4 x i32> Index: test/CodeGen/X86/vec_int_to_fp-widen.ll =================================================================== --- test/CodeGen/X86/vec_int_to_fp-widen.ll +++ test/CodeGen/X86/vec_int_to_fp-widen.ll @@ -1928,12 +1928,7 @@ ; VEX-NEXT: vaddss %xmm0, %xmm0, %xmm0 ; VEX-NEXT: .LBB39_6: ; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] -; VEX-NEXT: testq %rax, %rax -; VEX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; VEX-NEXT: js .LBB39_8 -; VEX-NEXT: # %bb.7: ; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1 -; VEX-NEXT: .LBB39_8: ; VEX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0] ; VEX-NEXT: retq ; @@ -2161,13 +2156,8 @@ ; SSE2-NEXT: addss %xmm1, %xmm1 ; SSE2-NEXT: .LBB41_6: ; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: testq %rax, %rax -; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: js .LBB41_8 -; SSE2-NEXT: # %bb.7: ; SSE2-NEXT: xorps %xmm1, %xmm1 ; SSE2-NEXT: cvtsi2ssq %rax, %xmm1 -; SSE2-NEXT: .LBB41_8: ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0] ; SSE2-NEXT: retq ; @@ -2204,13 +2194,8 @@ ; SSE41-NEXT: addss %xmm0, %xmm0 ; SSE41-NEXT: .LBB41_6: ; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] -; SSE41-NEXT: testq %rax, %rax -; SSE41-NEXT: xorps %xmm1, %xmm1 -; SSE41-NEXT: js .LBB41_8 -; SSE41-NEXT: # %bb.7: ; SSE41-NEXT: xorps %xmm1, %xmm1 ; SSE41-NEXT: cvtsi2ssq %rax, %xmm1 -; SSE41-NEXT: .LBB41_8: ; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0] ; SSE41-NEXT: retq ; @@ -2245,12 +2230,7 @@ ; VEX-NEXT: vaddss %xmm0, %xmm0, %xmm0 ; VEX-NEXT: .LBB41_6: ; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] -; VEX-NEXT: testq %rax, %rax -; VEX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; VEX-NEXT: js .LBB41_8 -; VEX-NEXT: # %bb.7: ; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1 -; VEX-NEXT: .LBB41_8: ; VEX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0] ; VEX-NEXT: retq ; Index: test/CodeGen/X86/vec_int_to_fp.ll =================================================================== --- test/CodeGen/X86/vec_int_to_fp.ll +++ test/CodeGen/X86/vec_int_to_fp.ll @@ -1928,12 +1928,7 @@ ; VEX-NEXT: vaddss %xmm0, %xmm0, %xmm0 ; VEX-NEXT: .LBB39_6: ; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] -; VEX-NEXT: testq %rax, %rax -; VEX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; VEX-NEXT: js .LBB39_8 -; VEX-NEXT: # %bb.7: ; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1 -; VEX-NEXT: .LBB39_8: ; VEX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0] ; VEX-NEXT: retq ; @@ -2161,13 +2156,8 @@ ; SSE2-NEXT: addss %xmm1, %xmm1 ; SSE2-NEXT: .LBB41_6: ; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: testq %rax, %rax -; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: js .LBB41_8 -; SSE2-NEXT: # %bb.7: ; SSE2-NEXT: xorps %xmm1, %xmm1 ; SSE2-NEXT: cvtsi2ssq %rax, %xmm1 -; SSE2-NEXT: .LBB41_8: ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0] ; SSE2-NEXT: retq ; @@ -2204,13 +2194,8 @@ ; SSE41-NEXT: addss %xmm0, %xmm0 ; SSE41-NEXT: .LBB41_6: ; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] -; SSE41-NEXT: testq %rax, %rax -; SSE41-NEXT: xorps %xmm1, %xmm1 -; SSE41-NEXT: js .LBB41_8 -; SSE41-NEXT: # %bb.7: ; SSE41-NEXT: xorps %xmm1, %xmm1 ; SSE41-NEXT: cvtsi2ssq %rax, %xmm1 -; SSE41-NEXT: .LBB41_8: ; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0] ; SSE41-NEXT: retq ; @@ -2245,12 +2230,7 @@ ; VEX-NEXT: vaddss %xmm0, %xmm0, %xmm0 ; VEX-NEXT: .LBB41_6: ; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] -; VEX-NEXT: testq %rax, %rax -; VEX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; VEX-NEXT: js .LBB41_8 -; VEX-NEXT: # %bb.7: ; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1 -; VEX-NEXT: .LBB41_8: ; VEX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0] ; VEX-NEXT: retq ; Index: test/CodeGen/X86/vector-shift-ashr-sub128-widen.ll =================================================================== --- test/CodeGen/X86/vector-shift-ashr-sub128-widen.ll +++ test/CodeGen/X86/vector-shift-ashr-sub128-widen.ll @@ -1806,44 +1806,20 @@ ; ; SSE41-LABEL: constant_shift_v4i16: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: pxor %xmm0, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = <0,1,2,3,u,u,u,u> -; SSE41-NEXT: pcmpeqw %xmm2, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = -; SSE41-NEXT: pmulhw %xmm1, %xmm3 -; SSE41-NEXT: pblendvb %xmm0, %xmm1, %xmm3 -; SSE41-NEXT: pcmpeqw {{.*}}(%rip), %xmm2 -; SSE41-NEXT: psraw $1, %xmm1 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: pblendvb %xmm0, %xmm1, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = +; SSE41-NEXT: pmulhw %xmm0, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4,5,6,7] +; SSE41-NEXT: psraw $1, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3],xmm0[4,5,6,7] ; SSE41-NEXT: retq ; -; AVX1-LABEL: constant_shift_v4i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [4.1720559249406128E-309,4.1720559249406128E-309] -; AVX1-NEXT: # xmm2 = mem[0,0] -; AVX1-NEXT: vpcmpeqw %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm3 -; AVX1-NEXT: vpblendvb %xmm1, %xmm0, %xmm3, %xmm1 -; AVX1-NEXT: vpcmpeqw {{.*}}(%rip), %xmm2, %xmm2 -; AVX1-NEXT: vpsraw $1, %xmm0, %xmm0 -; AVX1-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: constant_shift_v4i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [844433520132096,844433520132096] -; AVX2-NEXT: vpcmpeqw %xmm1, %xmm2, %xmm1 -; AVX2-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm3 -; AVX2-NEXT: vpblendvb %xmm1, %xmm0, %xmm3, %xmm1 -; AVX2-NEXT: vpcmpeqw {{.*}}(%rip), %xmm2, %xmm2 -; AVX2-NEXT: vpsraw $1, %xmm0, %xmm0 -; AVX2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: retq +; AVX-LABEL: constant_shift_v4i16: +; AVX: # %bb.0: +; AVX-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm1 +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4,5,6,7] +; AVX-NEXT: vpsraw $1, %xmm0, %xmm0 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3],xmm0[4,5,6,7] +; AVX-NEXT: retq ; ; XOP-LABEL: constant_shift_v4i16: ; XOP: # %bb.0: Index: test/CodeGen/X86/vector-shift-lshr-sub128-widen.ll =================================================================== --- test/CodeGen/X86/vector-shift-lshr-sub128-widen.ll +++ test/CodeGen/X86/vector-shift-lshr-sub128-widen.ll @@ -1500,32 +1500,26 @@ define <4 x i16> @constant_shift_v4i16(<4 x i16> %a) nounwind { ; SSE2-LABEL: constant_shift_v4i16: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = -; SSE2-NEXT: pmulhuw %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pcmpeqw {{.*}}(%rip), %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pandn %xmm1, %xmm2 -; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,0,0,65535,65535,65535,65535] +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pmulhuw {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pandn %xmm0, %xmm2 +; SSE2-NEXT: por %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: constant_shift_v4i16: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: pxor %xmm0, %xmm0 -; SSE41-NEXT: pcmpeqw {{.*}}(%rip), %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = -; SSE41-NEXT: pmulhuw %xmm1, %xmm2 -; SSE41-NEXT: pblendvb %xmm0, %xmm1, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = +; SSE41-NEXT: pmulhuw %xmm0, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4,5,6,7] ; SSE41-NEXT: retq ; ; AVX-LABEL: constant_shift_v4i16: ; AVX: # %bb.0: -; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpcmpeqw {{.*}}(%rip), %xmm1, %xmm1 -; AVX-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm2 -; AVX-NEXT: vpblendvb %xmm1, %xmm0, %xmm2, %xmm0 +; AVX-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm1 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4,5,6,7] ; AVX-NEXT: retq ; ; XOP-LABEL: constant_shift_v4i16: @@ -1535,10 +1529,8 @@ ; ; AVX512DQ-LABEL: constant_shift_v4i16: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512DQ-NEXT: vpcmpeqw {{.*}}(%rip), %xmm1, %xmm1 -; AVX512DQ-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm2 -; AVX512DQ-NEXT: vpblendvb %xmm1, %xmm0, %xmm2, %xmm0 +; AVX512DQ-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm1 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4,5,6,7] ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: constant_shift_v4i16: @@ -1552,10 +1544,8 @@ ; ; AVX512DQVL-LABEL: constant_shift_v4i16: ; AVX512DQVL: # %bb.0: -; AVX512DQVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512DQVL-NEXT: vpcmpeqw {{.*}}(%rip), %xmm1, %xmm1 -; AVX512DQVL-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm2 -; AVX512DQVL-NEXT: vpblendvb %xmm1, %xmm0, %xmm2, %xmm0 +; AVX512DQVL-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm1 +; AVX512DQVL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4,5,6,7] ; AVX512DQVL-NEXT: retq ; ; AVX512BWVL-LABEL: constant_shift_v4i16: @@ -1565,13 +1555,13 @@ ; ; X32-SSE-LABEL: constant_shift_v4i16: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = -; X32-SSE-NEXT: pmulhuw %xmm0, %xmm1 -; X32-SSE-NEXT: pxor %xmm2, %xmm2 -; X32-SSE-NEXT: pcmpeqw {{\.LCPI.*}}, %xmm2 -; X32-SSE-NEXT: pand %xmm2, %xmm0 -; X32-SSE-NEXT: pandn %xmm1, %xmm2 -; X32-SSE-NEXT: por %xmm2, %xmm0 +; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,0,0,65535,65535,65535,65535] +; X32-SSE-NEXT: movdqa %xmm0, %xmm1 +; X32-SSE-NEXT: pand %xmm2, %xmm1 +; X32-SSE-NEXT: pmulhuw {{\.LCPI.*}}, %xmm0 +; X32-SSE-NEXT: pandn %xmm0, %xmm2 +; X32-SSE-NEXT: por %xmm2, %xmm1 +; X32-SSE-NEXT: movdqa %xmm1, %xmm0 ; X32-SSE-NEXT: retl %shift = lshr <4 x i16> %a, ret <4 x i16> %shift