Index: lib/CodeGen/ExecutionDepsFix.cpp =================================================================== --- lib/CodeGen/ExecutionDepsFix.cpp +++ lib/CodeGen/ExecutionDepsFix.cpp @@ -403,13 +403,14 @@ // This is the entry block. if (MBB->pred_empty()) { - for (const auto &LI : MBB->liveins()) { - for (int rx : regIndices(LI.PhysReg)) { - // Treat function live-ins as if they were defined just before the first - // instruction. Usually, function arguments are set up immediately - // before the call. - LiveRegs[rx].Def = -1; - } + // Treat all registers as being defined just before the first instruction. + // We used to only do this for live-ins, but that's a bit of a gamble. + // If our caller does arithmetic with these registers, it is quite likely + // that it will have used registers beyond the ones that are live here. + // Given the immense penalty for getting this wrong, being conservative + // here seems worth it. + for (unsigned rx = 0; rx != NumRegs; ++rx) { + LiveRegs[rx].Def = -1; } DEBUG(dbgs() << "BB#" << MBB->getNumber() << ": entry\n"); return; @@ -627,6 +628,11 @@ // issues, reset the counter. LiveRegs[rx].Def = -(1 << 20); } + } else if (MI->isCall()) { + // If this is a call, pretend all registers we are considering are def'd + // here. We have no idea which registers the callee may use. + for (unsigned i = 0, e = NumRegs; i != e; ++i) + LiveRegs[i].Def = CurInstr; } ++CurInstr; } Index: test/CodeGen/X86/avx-cvt.ll =================================================================== --- test/CodeGen/X86/avx-cvt.ll +++ test/CodeGen/X86/avx-cvt.ll @@ -74,6 +74,7 @@ define double @funcA(i64* nocapture %e) nounwind uwtable readonly ssp { ; CHECK-LABEL: funcA: ; CHECK: # BB#0: +; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vcvtsi2sdq (%rdi), %xmm0, %xmm0 ; CHECK-NEXT: retq %tmp1 = load i64, i64* %e, align 8 @@ -84,6 +85,7 @@ define double @funcB(i32* nocapture %e) nounwind uwtable readonly ssp { ; CHECK-LABEL: funcB: ; CHECK: # BB#0: +; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vcvtsi2sdl (%rdi), %xmm0, %xmm0 ; CHECK-NEXT: retq %tmp1 = load i32, i32* %e, align 4 @@ -94,6 +96,7 @@ define float @funcC(i32* nocapture %e) nounwind uwtable readonly ssp { ; CHECK-LABEL: funcC: ; CHECK: # BB#0: +; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vcvtsi2ssl (%rdi), %xmm0, %xmm0 ; CHECK-NEXT: retq %tmp1 = load i32, i32* %e, align 4 @@ -104,6 +107,7 @@ define float @funcD(i64* nocapture %e) nounwind uwtable readonly ssp { ; CHECK-LABEL: funcD: ; CHECK: # BB#0: +; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vcvtsi2ssq (%rdi), %xmm0, %xmm0 ; CHECK-NEXT: retq %tmp1 = load i64, i64* %e, align 8 @@ -114,6 +118,7 @@ define void @fpext() nounwind uwtable { ; CHECK-LABEL: fpext: ; CHECK: # BB#0: +; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vcvtss2sd -{{[0-9]+}}(%rsp), %xmm0, %xmm0 ; CHECK-NEXT: vmovsd %xmm0, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: retq @@ -144,5 +149,3 @@ ret float %res } declare float @llvm.floor.f32(float %p) - - Index: test/CodeGen/X86/avx512-cvt.ll =================================================================== --- test/CodeGen/X86/avx512-cvt.ll +++ test/CodeGen/X86/avx512-cvt.ll @@ -16,13 +16,14 @@ ; KNL: ## BB#0: ; KNL-NEXT: vextracti32x4 $3, %zmm0, %xmm1 ; KNL-NEXT: vpextrq $1, %xmm1, %rax -; KNL-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2 +; KNL-NEXT: vxorps %xmm4, %xmm4, %xmm4 +; KNL-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm2 ; KNL-NEXT: vmovq %xmm1, %rax -; KNL-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1 +; KNL-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm1 ;
KNL-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; KNL-NEXT: vextracti32x4 $2, %zmm0, %xmm2 ; KNL-NEXT: vpextrq $1, %xmm2, %rax -; KNL-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm3 +; KNL-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm3 ; KNL-NEXT: vmovq %xmm2, %rax ; KNL-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm2 ; KNL-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0] @@ -55,7 +56,8 @@ ; KNL: ## BB#0: ; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1 ; KNL-NEXT: vpextrq $1, %xmm1, %rax -; KNL-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2 +; KNL-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; KNL-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm2 ; KNL-NEXT: vmovq %xmm1, %rax ; KNL-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1 ; KNL-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] @@ -79,7 +81,8 @@ ; KNL-LABEL: sltof2f32: ; KNL: ## BB#0: ; KNL-NEXT: vpextrq $1, %xmm0, %rax -; KNL-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 +; KNL-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; KNL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1 ; KNL-NEXT: vmovq %xmm0, %rax ; KNL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0 ; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] @@ -100,9 +103,10 @@ ; KNL: ## BB#0: ; KNL-NEXT: vmovdqu (%rdi), %ymm0 ; KNL-NEXT: vpextrq $1, %xmm0, %rax -; KNL-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 +; KNL-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; KNL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm1 ; KNL-NEXT: vmovq %xmm0, %rax -; KNL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 +; KNL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 ; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0 ; KNL-NEXT: vmovq %xmm0, %rax @@ -180,9 +184,10 @@ ; KNL-LABEL: sltof432: ; KNL: ## BB#0: ; KNL-NEXT: vpextrq $1, %xmm0, %rax -; KNL-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 +; KNL-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; KNL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm1 ; KNL-NEXT: vmovq %xmm0, %rax -; KNL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 +; KNL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 ; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0 ; KNL-NEXT: vmovq %xmm0, %rax @@ -205,9 +210,10 @@ ; KNL-LABEL: ultof432: ; KNL: ## BB#0: ; KNL-NEXT: vpextrq $1, %xmm0, %rax -; KNL-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1 +; KNL-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; KNL-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm1 ; KNL-NEXT: vmovq %xmm0, %rax -; KNL-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm2 +; KNL-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm2 ; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0 ; KNL-NEXT: vmovq %xmm0, %rax @@ -231,13 +237,14 @@ ; KNL: ## BB#0: ; KNL-NEXT: vextracti32x4 $3, %zmm0, %xmm1 ; KNL-NEXT: vpextrq $1, %xmm1, %rax -; KNL-NEXT: vcvtusi2sdq %rax, %xmm2, %xmm2 +; KNL-NEXT: vxorps %xmm4, %xmm4, %xmm4 +; KNL-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm2 ; KNL-NEXT: vmovq %xmm1, %rax -; KNL-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm1 +; KNL-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm1 ; KNL-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; KNL-NEXT: vextracti32x4 $2, %zmm0, %xmm2 ; KNL-NEXT: vpextrq $1, %xmm2, %rax -; KNL-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm3 +; KNL-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm3 ; KNL-NEXT: vmovq %xmm2, %rax ; KNL-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm2 ; KNL-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0] @@ -479,6 +486,7 @@ define double @funcA(i64* nocapture %e) { ; ALL-LABEL: funcA: ; ALL: ## BB#0: ## %entry +; ALL-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; ALL-NEXT: vcvtsi2sdq (%rdi), %xmm0, %xmm0 ; ALL-NEXT: retq entry: @@ -490,6 +498,7 @@ define double @funcB(i32* %e) { ; ALL-LABEL: funcB: ; ALL: ## BB#0: ## %entry +; ALL-NEXT: vxorps %xmm0, 
%xmm0, %xmm0 ; ALL-NEXT: vcvtsi2sdl (%rdi), %xmm0, %xmm0 ; ALL-NEXT: retq entry: @@ -501,6 +510,7 @@ define float @funcC(i32* %e) { ; ALL-LABEL: funcC: ; ALL: ## BB#0: ## %entry +; ALL-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; ALL-NEXT: vcvtsi2ssl (%rdi), %xmm0, %xmm0 ; ALL-NEXT: retq entry: @@ -512,6 +522,7 @@ define float @i64tof32(i64* %e) { ; ALL-LABEL: i64tof32: ; ALL: ## BB#0: ## %entry +; ALL-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; ALL-NEXT: vcvtsi2ssq (%rdi), %xmm0, %xmm0 ; ALL-NEXT: retq entry: @@ -720,6 +731,7 @@ define float @uitofp02(i32 %a) nounwind { ; ALL-LABEL: uitofp02: ; ALL: ## BB#0: +; ALL-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; ALL-NEXT: vcvtusi2ssl %edi, %xmm0, %xmm0 ; ALL-NEXT: retq %b = uitofp i32 %a to float @@ -729,6 +741,7 @@ define double @uitofp03(i32 %a) nounwind { ; ALL-LABEL: uitofp03: ; ALL: ## BB#0: +; ALL-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; ALL-NEXT: vcvtusi2sdl %edi, %xmm0, %xmm0 ; ALL-NEXT: retq %b = uitofp i32 %a to double @@ -1122,6 +1135,7 @@ ; KNL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; KNL-NEXT: vpextrq $1, %xmm0, %rax ; KNL-NEXT: andl $1, %eax +; KNL-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; KNL-NEXT: vcvtsi2ssl %eax, %xmm2, %xmm1 ; KNL-NEXT: vmovq %xmm0, %rax ; KNL-NEXT: andl $1, %eax Index: test/CodeGen/X86/break-false-dep.ll =================================================================== --- test/CodeGen/X86/break-false-dep.ll +++ test/CodeGen/X86/break-false-dep.ll @@ -260,15 +260,26 @@ ; avoid a cyclic dependence on a write to the same register in a previous ; iteration, especially when we cannot zero out the undef register because it ; is alive. -define i64 @loopclearence(i64* nocapture %x, double* nocapture %y) nounwind { +define i64 @loopclearence(float %z, double %a, double %b, double %c, i64* nocapture %x, double* nocapture %y) nounwind { entry: %vx = load i64, i64* %x - br label %loop +;AVX-LABEL:@loopclearence +;AVX: vxorps [[XMM4_7:%xmm[4-7]]], [[XMM4_7]], [[XMM4_7]] +;AVX-NEXT: vucomiss [[XMM4_7]], %xmm0 + %0 = fcmp ult float %z, 0.0 + br i1 %0, label %loop, label %fake + loop: %i = phi i64 [ 1, %entry ], [ %inc, %loop ] %s1 = phi i64 [ %vx, %entry ], [ %s2, %loop ] + store volatile double %a, double *%y + store volatile double %b, double *%y + store volatile double %c, double *%y + ; AVX-NOT: {{%xmm[4-7]}} + ; This register was forced to have an xorps above, so it should simply be re-used + ; AVX-NOT: vxorps + ; AVX: vcvtsi2sdq {{.*}}, [[XMM4_7]], {{%xmm[0-9]+}} %fi = sitofp i64 %i to double - tail call void asm sideeffect "", "~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{dirflag},~{fpsr},~{flags}"() tail call void asm sideeffect "", "~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{dirflag},~{fpsr},~{flags}"() tail call void asm sideeffect "", "~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{dirflag},~{fpsr},~{flags}"() %vy = load double, double* %y @@ -280,23 +291,26 @@ br i1 %exitcond, label %ret, label %loop ret: ret i64 %s2 -;AVX-LABEL:@loopclearence -;Registers 4-7 are not used and therefore one of them should be chosen -;AVX-NOT: {{%xmm[4-7]}} -;AVX: vcvtsi2sdq {{.*}}, [[XMM4_7:%xmm[4-7]]], {{%xmm[0-9]+}} -;AVX-NOT: [[XMM4_7]] +fake: + ret i64 0 } ; Make sure we are making a smart choice regarding undef registers even for more ; complicated loop structures.
This example is the inner loop from ; julia> a = falses(10000); a[1:4:end] = true ; julia> linspace(1.0,2.0,10000)[a] -define void @loopclearance2(double* nocapture %y, i64* %x, double %c1, double %c2, double %c3, double %c4, i64 %size) { +define double @loopclearance2(double %z, double %c1, double %c2, double %c3, double %c4, double %c5, + double* nocapture %y, i64* %x, i64 %size) { entry: - tail call void asm sideeffect "", "~{xmm7},~{dirflag},~{fpsr},~{flags}"() - tail call void asm sideeffect "", "~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{dirflag},~{fpsr},~{flags}"() - tail call void asm sideeffect "", "~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{dirflag},~{fpsr},~{flags}"() - br label %loop + %fadd = fadd double %c4, %c5 + ;AVX-LABEL:@loopclearance2 +; AVX: vxorps [[XMM6:%xmm6]], [[XMM6]], [[XMM6]] +; AVX-NEXT: vucomisd [[XMM6]], %xmm + %cmp1 = fcmp ult double %fadd, 0.0 + br i1 %cmp1, label %loop, label %fake + +fake: + ret double %z loop: %phi_i = phi i64 [ 1, %entry ], [ %nexti, %loop_end ] @@ -323,13 +337,14 @@ ; the only reasonable choice. The primary thing we care about is that it's ; not one of the registers used in the loop (e.g. not the output reg here) ;AVX-NOT: %xmm6 -;AVX: vcvtsi2sdq {{.*}}, %xmm6, {{%xmm[0-9]+}} +;AVX-NOT: vxorps +;AVX-NOT: vxorpd +;AVX: vcvtsi2sdq {{.*}}, [[XMM6]], {{%xmm[0-9]+}} ;AVX-NOT: %xmm6 %nexti_f = sitofp i64 %nexti to double %sub = fsub double %c1, %nexti_f %mul = fmul double %sub, %c2 -;AVX: vcvtsi2sdq {{.*}}, %xmm6, {{%xmm[0-9]+}} -;AVX-NOT: %xmm6 +;AVX: vcvtsi2sdq {{.*}}, [[XMM6]], {{%xmm[0-9]+}} %phi_f = sitofp i64 %phi to double %mul2 = fmul double %phi_f, %c3 %add2 = fadd double %mul, %mul2 @@ -341,6 +356,38 @@ br i1 %done, label %loopdone, label %loop loopdone: + ret double 0.0 +} + +; Make sure that calls kill register clearance and that we don't insert +; an extra dependency-breaking instruction if one suffices.
+declare double @sin(double %x) +define void @callclearance(double *%x, i64 *%y, i64 *%z) { +entry: + br label %loop + +loop: + %idx = phi i32 [0, %entry], [%idx, %loop] + %valptr = getelementptr i64, i64* %y, i32 %idx + %valptr2 = getelementptr i64, i64* %z, i32 %idx + %outptr = getelementptr double, double* %x, i32 %idx +;AVX-LABEL:@callclearance +;AVX: vxorps [[THEXMM:%xmm[0-9]+]], [[THEXMM]], [[THEXMM]] +;AVX: vcvtsi2sdq {{.*}}, [[THEXMM]], {{%xmm[0-9]+}} +;AVX-NOT: vxorps +;AVX: vcvtsi2sdq {{.*}}, [[THEXMM]], {{%xmm[0-9]+}} + %val = load i64, i64 *%valptr + %val_f = sitofp i64 %val to double + %val2 = load i64, i64 *%valptr2 + %val2_f = sitofp i64 %val2 to double + %sined = call double @sin(double %val_f) + %sined2 = call double @sin(double %val2_f) + %sum = fadd double %sined, %sined2 + store double %sum, double *%x + %done = icmp sgt i32 %idx, 10000 + br i1 %done, label %end, label %loop + +end: ret void } Index: test/CodeGen/X86/combine-fcopysign.ll =================================================================== --- test/CodeGen/X86/combine-fcopysign.ll +++ test/CodeGen/X86/combine-fcopysign.ll @@ -228,6 +228,7 @@ ; SSE-LABEL: combine_vec_fcopysign_fpext_sgn: ; SSE: # BB#0: ; SSE-NEXT: movaps %xmm2, %xmm3 +; SSE-NEXT: xorps %xmm4, %xmm4 ; SSE-NEXT: cvtss2sd %xmm2, %xmm4 ; SSE-NEXT: movshdup {{.*#+}} xmm5 = xmm2[1,1,3,3] ; SSE-NEXT: movaps %xmm2, %xmm6 @@ -282,6 +283,7 @@ ; SSE-NEXT: movaps %xmm0, %xmm3 ; SSE-NEXT: movaps {{.*#+}} xmm5 ; SSE-NEXT: andps %xmm5, %xmm0 +; SSE-NEXT: xorps %xmm6, %xmm6 ; SSE-NEXT: cvtsd2ss %xmm1, %xmm6 ; SSE-NEXT: movaps {{.*#+}} xmm4 = [-0.000000e+00,-0.000000e+00,-0.000000e+00,-0.000000e+00] ; SSE-NEXT: andps %xmm4, %xmm6 Index: test/CodeGen/X86/fold-load-unops.ll =================================================================== --- test/CodeGen/X86/fold-load-unops.ll +++ test/CodeGen/X86/fold-load-unops.ll @@ -88,6 +88,7 @@ ; ; AVX-LABEL: rcpss_size: ; AVX: # BB#0: +; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vrcpss (%rdi), %xmm0, %xmm0 ; AVX-NEXT: retq %ld = load float, float* %a @@ -105,6 +106,7 @@ ; ; AVX-LABEL: rsqrtss_size: ; AVX: # BB#0: +; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vrsqrtss (%rdi), %xmm0, %xmm0 ; AVX-NEXT: retq %ld = load float, float* %a @@ -122,6 +124,7 @@ ; ; AVX-LABEL: sqrtss_size: ; AVX: # BB#0: +; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vsqrtss (%rdi), %xmm0, %xmm0 ; AVX-NEXT: retq %ld = load float, float* %a @@ -139,6 +142,7 @@ ; ; AVX-LABEL: sqrtsd_size: ; AVX: # BB#0: +; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vsqrtsd (%rdi), %xmm0, %xmm0 ; AVX-NEXT: retq %ld = load double, double* %a Index: test/CodeGen/X86/half.ll =================================================================== --- test/CodeGen/X86/half.ll +++ test/CodeGen/X86/half.ll @@ -101,13 +101,15 @@ ; CHECK-LIBCALL-NEXT: pushq [[ADDR:%[a-z]+]] ; CHECK-LIBCALL-NEXT: movq %rsi, [[ADDR]] +; CHECK-LIBCALL-NEXT: xorps ; CHECK-LIBCALL-NEXT: cvtsi2ssq %rdi, %xmm0 ; CHECK-LIBCALL-NEXT: callq __gnu_f2h_ieee ; CHECK-LIBCALL-NEXT: movw %ax, ([[ADDR]]) ; CHECK_LIBCALL-NEXT: popq [[ADDR]] ; CHECK_LIBCALL-NEXT: retq -; CHECK-F16C-NEXT: vcvtsi2ssq %rdi, [[REG0:%[a-z0-9]+]], [[REG0]] +; CHECK-F16C-NEXT: vxorps [[REG0:%[a-z0-9]+]], [[REG0]], [[REG0]] +; CHECK-F16C-NEXT: vcvtsi2ssq %rdi, [[REG0]], [[REG0]] ; CHECK-F16C-NEXT: vcvtps2ph $4, [[REG0]], [[REG0]] ; CHECK-F16C-NEXT: vmovd [[REG0]], %eax ; CHECK-F16C-NEXT: movw %ax, (%rsi) @@ -161,7 +163,9 @@ ; CHECK-NEXT: js [[LABEL1:.LBB[0-9_]+]] ; simple conversion to float if non-negative +; 
CHECK-LIBCALL-NEXT: xorps ; CHECK-LIBCALL-NEXT: cvtsi2ssq %rdi, [[REG1:%[a-z0-9]+]] +; CHECK-F16C-NEXT: vxorps ; CHECK-F16C-NEXT: vcvtsi2ssq %rdi, [[REG1:%[a-z0-9]+]], [[REG1]] ; CHECK-NEXT: jmp [[LABEL2:.LBB[0-9_]+]] @@ -171,8 +175,10 @@ ; CHECK-NEXT: shrq %rax ; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: orq %rax, [[REG2:%[a-z0-9]+]] +; CHECK-LIBCALL-NEXT: xorps ; CHECK-LIBCALL-NEXT: cvtsi2ssq [[REG2]], [[REG3:%[a-z0-9]+]] ; CHECK-LIBCALL-NEXT: addss [[REG3]], [[REG1]] +; CHECK-F16C-NEXT: vxorps ; CHECK-F16C-NEXT: vcvtsi2ssq [[REG2]], [[REG3:%[a-z0-9]+]], [[REG3]] ; CHECK-F16C-NEXT: vaddss [[REG3]], [[REG3]], [[REG1:[%a-z0-9]+]] @@ -287,6 +293,7 @@ ; CHECK-LIBCALL-NEXT: movzwl (%rsi), %edi ; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee ; CHECK-LIBCALL-NEXT: movss %xmm0, 12(%rsp) +; CHECK-LIBCALL-NEXT: xorps %xmm0, %xmm0 ; CHECK-LIBCALL-NEXT: cvtsi2ssl %ebx, %xmm0 ; CHECK-LIBCALL-NEXT: callq __gnu_f2h_ieee ; CHECK-LIBCALL-NEXT: movzwl %ax, %edi @@ -299,6 +306,7 @@ ; CHECK-F16C-NEXT: movswl (%rsi), %eax ; CHECK-F16C-NEXT: vmovd %eax, %xmm0 ; CHECK-F16C-NEXT: vcvtph2ps %xmm0, %xmm0 +; CHECK-F16C-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; CHECK-F16C-NEXT: vcvtsi2ssl %edi, %xmm1, %xmm1 ; CHECK-F16C-NEXT: vcvtps2ph $4, %xmm1, %xmm1 ; CHECK-F16C-NEXT: vcvtph2ps %xmm1, %xmm1 Index: test/CodeGen/X86/i64-to-float.ll =================================================================== --- test/CodeGen/X86/i64-to-float.ll +++ test/CodeGen/X86/i64-to-float.ll @@ -278,9 +278,10 @@ ; X64-AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; X64-AVX-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 ; X64-AVX-NEXT: vpextrq $1, %xmm0, %rax -; X64-AVX-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1 +; X64-AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; X64-AVX-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm1 ; X64-AVX-NEXT: vmovq %xmm0, %rax -; X64-AVX-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm0 +; X64-AVX-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm0 ; X64-AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; X64-AVX-NEXT: retq %clo = icmp slt <2 x i64> %a, Index: test/CodeGen/X86/isint.ll =================================================================== --- test/CodeGen/X86/isint.ll +++ test/CodeGen/X86/isint.ll @@ -9,6 +9,7 @@ ; CHECK-NOT: xor ; CHECK: cvt %i = fptosi double %d to i32 +; CHECK-NEXT: xor ; CHECK-NEXT: cvt %e = sitofp i32 %i to double ; CHECK: cmpeqsd @@ -26,6 +27,7 @@ ; CHECK-NOT: xor ; CHECK: cvt %i = fptosi float %f to i32 +; CHECK-NEXT: xor ; CHECK-NEXT: cvt %g = sitofp i32 %i to float ; CHECK: cmpeqss @@ -43,6 +45,7 @@ ; CHECK-LABEL: isint_branch: ; CHECK: cvt %i = fptosi double %d to i32 +; CHECK-NEXT: xor ; CHECK-NEXT: cvt %e = sitofp i32 %i to double ; CHECK: ucomisd Index: test/CodeGen/X86/recip-fastmath.ll =================================================================== --- test/CodeGen/X86/recip-fastmath.ll +++ test/CodeGen/X86/recip-fastmath.ll @@ -30,6 +30,7 @@ define float @f32_one_step(float %x) #1 { ; SSE-LABEL: f32_one_step: ; SSE: # BB#0: +; SSE-NEXT: xorps %xmm2, %xmm2 ; SSE-NEXT: rcpss %xmm0, %xmm2 ; SSE-NEXT: mulss %xmm2, %xmm0 ; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero @@ -55,6 +56,7 @@ define float @f32_two_step(float %x) #2 { ; SSE-LABEL: f32_two_step: ; SSE: # BB#0: +; SSE-NEXT: xorps %xmm2, %xmm2 ; SSE-NEXT: rcpss %xmm0, %xmm2 ; SSE-NEXT: movaps %xmm0, %xmm3 ; SSE-NEXT: mulss %xmm2, %xmm3 @@ -267,4 +269,3 @@ attributes #0 = { "unsafe-fp-math"="true" "reciprocal-estimates"="!divf,!vec-divf" } attributes #1 = { "unsafe-fp-math"="true" "reciprocal-estimates"="divf,vec-divf" } attributes #2 = { "unsafe-fp-math"="true" 
"reciprocal-estimates"="divf:2,vec-divf:2" } - Index: test/CodeGen/X86/recip-fastmath2.ll =================================================================== --- test/CodeGen/X86/recip-fastmath2.ll +++ test/CodeGen/X86/recip-fastmath2.ll @@ -23,6 +23,7 @@ define float @f32_one_step_2(float %x) #1 { ; SSE-LABEL: f32_one_step_2: ; SSE: # BB#0: +; SSE-NEXT: xorps %xmm2, %xmm2 ; SSE-NEXT: rcpss %xmm0, %xmm2 ; SSE-NEXT: mulss %xmm2, %xmm0 ; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero @@ -50,6 +51,7 @@ define float @f32_two_step_2(float %x) #2 { ; SSE-LABEL: f32_two_step_2: ; SSE: # BB#0: +; SSE-NEXT: xorps %xmm2, %xmm2 ; SSE-NEXT: rcpss %xmm0, %xmm2 ; SSE-NEXT: movaps %xmm0, %xmm3 ; SSE-NEXT: mulss %xmm2, %xmm3 @@ -271,4 +273,3 @@ attributes #1 = { "unsafe-fp-math"="true" "reciprocal-estimates"="divf,vec-divf" } attributes #2 = { "unsafe-fp-math"="true" "reciprocal-estimates"="divf:2,vec-divf:2" } attributes #3 = { "unsafe-fp-math"="true" "reciprocal-estimates"="divf:0,vec-divf:0" } - Index: test/CodeGen/X86/sqrt-fastmath-tune.ll =================================================================== --- test/CodeGen/X86/sqrt-fastmath-tune.ll +++ test/CodeGen/X86/sqrt-fastmath-tune.ll @@ -13,7 +13,8 @@ define float @foo_x1(float %f) #0 { ; SCALAR-EST-LABEL: foo_x1: ; SCALAR-EST: # BB#0: -; SCALAR-EST-NEXT: rsqrtss %xmm0 +; SCALAR-EST-NEXT: xorps %xmm1 +; SCALAR-EST-NEXT: rsqrtss %xmm0, %xmm1 ; SCALAR-EST: retq ; ; SCALAR-ACC-LABEL: foo_x1: Index: test/CodeGen/X86/sqrt-fastmath.ll =================================================================== --- test/CodeGen/X86/sqrt-fastmath.ll +++ test/CodeGen/X86/sqrt-fastmath.ll @@ -57,6 +57,7 @@ define float @finite_f32_estimate(float %f) #1 { ; SSE-LABEL: finite_f32_estimate: ; SSE: # BB#0: +; SSE-NEXT: xorps %xmm1, %xmm1 ; SSE-NEXT: rsqrtss %xmm0, %xmm1 ; SSE-NEXT: movaps %xmm0, %xmm2 ; SSE-NEXT: mulss %xmm1, %xmm2 @@ -111,6 +112,7 @@ define float @f32_no_estimate(float %x) #0 { ; SSE-LABEL: f32_no_estimate: ; SSE: # BB#0: +; SSE-NEXT: xorps %xmm1, %xmm1 ; SSE-NEXT: sqrtss %xmm0, %xmm1 ; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE-NEXT: divss %xmm1, %xmm0 @@ -130,6 +132,7 @@ define float @f32_estimate(float %x) #1 { ; SSE-LABEL: f32_estimate: ; SSE: # BB#0: +; SSE-NEXT: xorps %xmm1, %xmm1 ; SSE-NEXT: rsqrtss %xmm0, %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: mulss %xmm2, %xmm2 @@ -263,4 +266,3 @@ attributes #0 = { "unsafe-fp-math"="true" "reciprocal-estimates"="!sqrtf,!vec-sqrtf,!divf,!vec-divf" } attributes #1 = { "unsafe-fp-math"="true" "reciprocal-estimates"="sqrt,vec-sqrt" } attributes #2 = { nounwind readnone } - Index: test/CodeGen/X86/sse-scalar-fp-arith.ll =================================================================== --- test/CodeGen/X86/sse-scalar-fp-arith.ll +++ test/CodeGen/X86/sse-scalar-fp-arith.ll @@ -81,12 +81,14 @@ define <4 x float> @test_sqrt_ss(<4 x float> %a) { ; SSE2-LABEL: test_sqrt_ss: ; SSE2: # BB#0: +; SSE2-NEXT: xorps %xmm1, %xmm1 ; SSE2-NEXT: sqrtss %xmm0, %xmm1 ; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_sqrt_ss: ; SSE41: # BB#0: +; SSE41-NEXT: xorps %xmm1, %xmm1 ; SSE41-NEXT: sqrtss %xmm0, %xmm1 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] ; SSE41-NEXT: retq @@ -180,12 +182,14 @@ define <2 x double> @test_sqrt_sd(<2 x double> %a) { ; SSE2-LABEL: test_sqrt_sd: ; SSE2: # BB#0: +; SSE2-NEXT: xorps %xmm1, %xmm1 ; SSE2-NEXT: sqrtsd %xmm0, %xmm1 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE2-NEXT: retq ; ; SSE41-LABEL: 
test_sqrt_sd: ; SSE41: # BB#0: +; SSE41-NEXT: xorps %xmm1, %xmm1 ; SSE41-NEXT: sqrtsd %xmm0, %xmm1 ; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE41-NEXT: retq Index: test/CodeGen/X86/sse2-intrinsics-fast-isel.ll =================================================================== --- test/CodeGen/X86/sse2-intrinsics-fast-isel.ll +++ test/CodeGen/X86/sse2-intrinsics-fast-isel.ll @@ -1258,12 +1258,14 @@ ; X32-LABEL: test_mm_cvtsi32_sd: ; X32: # BB#0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: xorps %xmm1, %xmm1 ; X32-NEXT: cvtsi2sdl %eax, %xmm1 ; X32-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; X32-NEXT: retl ; ; X64-LABEL: test_mm_cvtsi32_sd: ; X64: # BB#0: +; X64-NEXT: xorps %xmm1, %xmm1 ; X64-NEXT: cvtsi2sdl %edi, %xmm1 ; X64-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; X64-NEXT: retq @@ -3878,4 +3880,3 @@ } !0 = !{i32 1} - Index: test/CodeGen/X86/sse_partial_update.ll =================================================================== --- test/CodeGen/X86/sse_partial_update.ll +++ test/CodeGen/X86/sse_partial_update.ll @@ -12,8 +12,10 @@ ; CHECK-LABEL: rsqrtss: ; CHECK: ## BB#0: ## %entry ; CHECK-NEXT: rsqrtss %xmm0, %xmm0 +; CHECK-NEXT: xorps %xmm2, %xmm2 ; CHECK-NEXT: cvtss2sd %xmm0, %xmm2 ; CHECK-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; CHECK-NEXT: xorps %xmm1, %xmm1 ; CHECK-NEXT: cvtss2sd %xmm0, %xmm1 ; CHECK-NEXT: movaps %xmm2, %xmm0 ; CHECK-NEXT: jmp _callee ## TAILCALL @@ -34,8 +36,10 @@ ; CHECK-LABEL: rcpss: ; CHECK: ## BB#0: ## %entry ; CHECK-NEXT: rcpss %xmm0, %xmm0 +; CHECK-NEXT: xorps %xmm2, %xmm2 ; CHECK-NEXT: cvtss2sd %xmm0, %xmm2 ; CHECK-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; CHECK-NEXT: xorps %xmm1, %xmm1 ; CHECK-NEXT: cvtss2sd %xmm0, %xmm1 ; CHECK-NEXT: movaps %xmm2, %xmm0 ; CHECK-NEXT: jmp _callee ## TAILCALL @@ -55,8 +59,10 @@ ; CHECK-LABEL: sqrtss: ; CHECK: ## BB#0: ## %entry ; CHECK-NEXT: sqrtss %xmm0, %xmm0 +; CHECK-NEXT: xorps %xmm2, %xmm2 ; CHECK-NEXT: cvtss2sd %xmm0, %xmm2 ; CHECK-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; CHECK-NEXT: xorps %xmm1, %xmm1 ; CHECK-NEXT: cvtss2sd %xmm0, %xmm1 ; CHECK-NEXT: movaps %xmm2, %xmm0 ; CHECK-NEXT: jmp _callee ## TAILCALL @@ -76,8 +82,10 @@ ; CHECK-LABEL: sqrtsd: ; CHECK: ## BB#0: ## %entry ; CHECK-NEXT: sqrtsd %xmm0, %xmm0 +; CHECK-NEXT: xorps %xmm2, %xmm2 ; CHECK-NEXT: cvtsd2ss %xmm0, %xmm2 ; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; CHECK-NEXT: xorps %xmm1, %xmm1 ; CHECK-NEXT: cvtsd2ss %xmm0, %xmm1 ; CHECK-NEXT: movaps %xmm2, %xmm0 ; CHECK-NEXT: jmp _callee2 ## TAILCALL @@ -129,4 +137,3 @@ } declare <2 x double> @llvm.x86.sse2.cvtss2sd(<2 x double>, <4 x float>) nounwind readnone - Index: test/CodeGen/X86/uint64-to-float.ll =================================================================== --- test/CodeGen/X86/uint64-to-float.ll +++ test/CodeGen/X86/uint64-to-float.ll @@ -33,6 +33,7 @@ ; X64-NEXT: testq %rdi, %rdi ; X64-NEXT: js .LBB0_1 ; X64-NEXT: # BB#2: # %entry +; X64-NEXT: xorps %xmm0, %xmm0 ; X64-NEXT: cvtsi2ssq %rdi, %xmm0 ; X64-NEXT: retq ; X64-NEXT: .LBB0_1: @@ -40,6 +41,7 @@ ; X64-NEXT: shrq %rax ; X64-NEXT: andl $1, %edi ; X64-NEXT: orq %rax, %rdi +; X64-NEXT: xorps %xmm0, %xmm0 ; X64-NEXT: cvtsi2ssq %rdi, %xmm0 ; X64-NEXT: addss %xmm0, %xmm0 ; X64-NEXT: retq Index: test/CodeGen/X86/uint_to_fp.ll =================================================================== --- test/CodeGen/X86/uint_to_fp.ll +++ test/CodeGen/X86/uint_to_fp.ll @@ -9,6 +9,7 @@ ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: shrl $23, %ecx +; 
X32-NEXT: xorps %xmm0, %xmm0 ; X32-NEXT: cvtsi2ssl %ecx, %xmm0 ; X32-NEXT: movss %xmm0, (%eax) ; X32-NEXT: retl @@ -16,6 +17,7 @@ ; X64-LABEL: test: ; X64: ## BB#0: ## %entry ; X64-NEXT: shrl $23, %edi +; X64-NEXT: xorps %xmm0, %xmm0 ; X64-NEXT: cvtsi2ssl %edi, %xmm0 ; X64-NEXT: movss %xmm0, (%rsi) ; X64-NEXT: retq Index: test/CodeGen/X86/vec_int_to_fp.ll =================================================================== --- test/CodeGen/X86/vec_int_to_fp.ll +++ test/CodeGen/X86/vec_int_to_fp.ll @@ -20,6 +20,7 @@ ; SSE-LABEL: sitofp_2i64_to_2f64: ; SSE: # BB#0: ; SSE-NEXT: movd %xmm0, %rax +; SSE-NEXT: xorps %xmm1, %xmm1 ; SSE-NEXT: cvtsi2sdq %rax, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] ; SSE-NEXT: movd %xmm0, %rax @@ -32,7 +33,8 @@ ; VEX-LABEL: sitofp_2i64_to_2f64: ; VEX: # BB#0: ; VEX-NEXT: vpextrq $1, %xmm0, %rax -; VEX-NEXT: vcvtsi2sdq %rax, %xmm1, %xmm1 +; VEX-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; VEX-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm1 ; VEX-NEXT: vmovq %xmm0, %rax ; VEX-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm0 ; VEX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] @@ -41,7 +43,8 @@ ; AVX512F-LABEL: sitofp_2i64_to_2f64: ; AVX512F: # BB#0: ; AVX512F-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-NEXT: vcvtsi2sdq %rax, %xmm1, %xmm1 +; AVX512F-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX512F-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm1 ; AVX512F-NEXT: vmovq %xmm0, %rax ; AVX512F-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm0 ; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] @@ -50,7 +53,8 @@ ; AVX512VL-LABEL: sitofp_2i64_to_2f64: ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax -; AVX512VL-NEXT: vcvtsi2sdq %rax, %xmm1, %xmm1 +; AVX512VL-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX512VL-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm1 ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm0 ; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] @@ -221,6 +225,7 @@ ; SSE-LABEL: sitofp_4i64_to_4f64: ; SSE: # BB#0: ; SSE-NEXT: movd %xmm0, %rax +; SSE-NEXT: xorps %xmm2, %xmm2 ; SSE-NEXT: cvtsi2sdq %rax, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] ; SSE-NEXT: movd %xmm0, %rax @@ -228,6 +233,7 @@ ; SSE-NEXT: cvtsi2sdq %rax, %xmm0 ; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; SSE-NEXT: movd %xmm1, %rax +; SSE-NEXT: xorps %xmm3, %xmm3 ; SSE-NEXT: cvtsi2sdq %rax, %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] ; SSE-NEXT: movd %xmm0, %rax @@ -242,7 +248,8 @@ ; AVX1: # BB#0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpextrq $1, %xmm1, %rax -; AVX1-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2 +; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm2 ; AVX1-NEXT: vmovq %xmm1, %rax ; AVX1-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1 ; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] @@ -258,7 +265,8 @@ ; AVX2: # BB#0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpextrq $1, %xmm1, %rax -; AVX2-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2 +; AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; AVX2-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm2 ; AVX2-NEXT: vmovq %xmm1, %rax ; AVX2-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1 ; AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] @@ -274,7 +282,8 @@ ; AVX512F: # BB#0: ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512F-NEXT: vpextrq $1, %xmm1, %rax -; AVX512F-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2 +; AVX512F-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; AVX512F-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm2 ; AVX512F-NEXT: vmovq %xmm1, %rax ; AVX512F-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1 ; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm1 = 
xmm1[0],xmm2[0] @@ -290,7 +299,8 @@ ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1 ; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax -; AVX512VL-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2 +; AVX512VL-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; AVX512VL-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm2 ; AVX512VL-NEXT: vmovq %xmm1, %rax ; AVX512VL-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1 ; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] @@ -478,7 +488,8 @@ ; AVX512F-LABEL: uitofp_2i64_to_2f64: ; AVX512F: # BB#0: ; AVX512F-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-NEXT: vcvtusi2sdq %rax, %xmm1, %xmm1 +; AVX512F-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX512F-NEXT: vcvtusi2sdq %rax, %xmm2, %xmm1 ; AVX512F-NEXT: vmovq %xmm0, %rax ; AVX512F-NEXT: vcvtusi2sdq %rax, %xmm2, %xmm0 ; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] @@ -487,7 +498,8 @@ ; AVX512VL-LABEL: uitofp_2i64_to_2f64: ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax -; AVX512VL-NEXT: vcvtusi2sdq %rax, %xmm1, %xmm1 +; AVX512VL-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX512VL-NEXT: vcvtusi2sdq %rax, %xmm2, %xmm1 ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vcvtusi2sdq %rax, %xmm2, %xmm0 ; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] @@ -807,7 +819,8 @@ ; AVX512F: # BB#0: ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512F-NEXT: vpextrq $1, %xmm1, %rax -; AVX512F-NEXT: vcvtusi2sdq %rax, %xmm2, %xmm2 +; AVX512F-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; AVX512F-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm2 ; AVX512F-NEXT: vmovq %xmm1, %rax ; AVX512F-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm1 ; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] @@ -823,7 +836,8 @@ ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1 ; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax -; AVX512VL-NEXT: vcvtusi2sdq %rax, %xmm2, %xmm2 +; AVX512VL-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; AVX512VL-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm2 ; AVX512VL-NEXT: vmovq %xmm1, %rax ; AVX512VL-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm1 ; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] @@ -1042,6 +1056,7 @@ ; SSE-LABEL: sitofp_2i64_to_4f32: ; SSE: # BB#0: ; SSE-NEXT: movd %xmm0, %rax +; SSE-NEXT: xorps %xmm1, %xmm1 ; SSE-NEXT: cvtsi2ssq %rax, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] ; SSE-NEXT: movd %xmm0, %rax @@ -1054,7 +1069,8 @@ ; VEX-LABEL: sitofp_2i64_to_4f32: ; VEX: # BB#0: ; VEX-NEXT: vpextrq $1, %xmm0, %rax -; VEX-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 +; VEX-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1 ; VEX-NEXT: vmovq %xmm0, %rax ; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0 ; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] @@ -1065,7 +1081,8 @@ ; AVX512F-LABEL: sitofp_2i64_to_4f32: ; AVX512F: # BB#0: ; AVX512F-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 +; AVX512F-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1 ; AVX512F-NEXT: vmovq %xmm0, %rax ; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] @@ -1076,7 +1093,8 @@ ; AVX512VL-LABEL: sitofp_2i64_to_4f32: ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax -; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 +; AVX512VL-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1 ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] @@ -1117,7 +1135,8 @@ ; VEX-LABEL: sitofp_2i64_to_4f32_zero: ; VEX: # BB#0: 
; VEX-NEXT: vpextrq $1, %xmm0, %rax -; VEX-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 +; VEX-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1 ; VEX-NEXT: vmovq %xmm0, %rax ; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0 ; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero @@ -1126,7 +1145,8 @@ ; AVX512F-LABEL: sitofp_2i64_to_4f32_zero: ; AVX512F: # BB#0: ; AVX512F-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 +; AVX512F-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1 ; AVX512F-NEXT: vmovq %xmm0, %rax ; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero @@ -1135,7 +1155,8 @@ ; AVX512VL-LABEL: sitofp_2i64_to_4f32_zero: ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax -; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 +; AVX512VL-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1 ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] @@ -1161,8 +1182,10 @@ define <4 x float> @sitofp_4i64_to_4f32_undef(<2 x i64> %a) { ; SSE-LABEL: sitofp_4i64_to_4f32_undef: ; SSE: # BB#0: +; SSE-NEXT: xorps %xmm2, %xmm2 ; SSE-NEXT: cvtsi2ssq %rax, %xmm2 ; SSE-NEXT: movd %xmm0, %rax +; SSE-NEXT: xorps %xmm1, %xmm1 ; SSE-NEXT: cvtsi2ssq %rax, %xmm1 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] @@ -1177,7 +1200,8 @@ ; VEX-LABEL: sitofp_4i64_to_4f32_undef: ; VEX: # BB#0: ; VEX-NEXT: vpextrq $1, %xmm0, %rax -; VEX-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 +; VEX-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1 ; VEX-NEXT: vmovq %xmm0, %rax ; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0 ; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] @@ -1188,7 +1212,8 @@ ; AVX512F-LABEL: sitofp_4i64_to_4f32_undef: ; AVX512F: # BB#0: ; AVX512F-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 +; AVX512F-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1 ; AVX512F-NEXT: vmovq %xmm0, %rax ; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] @@ -1199,7 +1224,8 @@ ; AVX512VL-LABEL: sitofp_4i64_to_4f32_undef: ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax -; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 +; AVX512VL-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1 ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] @@ -1356,8 +1382,10 @@ ; SSE-LABEL: sitofp_4i64_to_4f32: ; SSE: # BB#0: ; SSE-NEXT: movd %xmm1, %rax +; SSE-NEXT: xorps %xmm3, %xmm3 ; SSE-NEXT: cvtsi2ssq %rax, %xmm3 ; SSE-NEXT: movd %xmm0, %rax +; SSE-NEXT: xorps %xmm2, %xmm2 ; SSE-NEXT: cvtsi2ssq %rax, %xmm2 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] @@ -1376,9 +1404,10 @@ ; AVX1-LABEL: sitofp_4i64_to_4f32: ; AVX1: # BB#0: ; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 +; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm1 ; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = 
xmm2[0],xmm1[0],xmm2[2,3] ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax @@ -1393,9 +1422,10 @@ ; AVX2-LABEL: sitofp_4i64_to_4f32: ; AVX2: # BB#0: ; AVX2-NEXT: vpextrq $1, %xmm0, %rax -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 +; AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm1 ; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax @@ -1410,9 +1440,10 @@ ; AVX512F-LABEL: sitofp_4i64_to_4f32: ; AVX512F: # BB#0: ; AVX512F-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 +; AVX512F-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm1 ; AVX512F-NEXT: vmovq %xmm0, %rax -; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 +; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512F-NEXT: vmovq %xmm0, %rax @@ -1426,9 +1457,10 @@ ; AVX512VL-LABEL: sitofp_4i64_to_4f32: ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax -; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 +; AVX512VL-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm1 ; AVX512VL-NEXT: vmovq %xmm0, %rax -; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 +; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm0 ; AVX512VL-NEXT: vmovq %xmm0, %rax @@ -1636,6 +1668,7 @@ ; VEX-NEXT: testq %rax, %rax ; VEX-NEXT: js .LBB39_1 ; VEX-NEXT: # BB#2: +; VEX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; VEX-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 ; VEX-NEXT: jmp .LBB39_3 ; VEX-NEXT: .LBB39_1: @@ -1643,6 +1676,7 @@ ; VEX-NEXT: shrq %rcx ; VEX-NEXT: andl $1, %eax ; VEX-NEXT: orq %rcx, %rax +; VEX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; VEX-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 ; VEX-NEXT: vaddss %xmm1, %xmm1, %xmm1 ; VEX-NEXT: .LBB39_3: @@ -1650,14 +1684,16 @@ ; VEX-NEXT: testq %rax, %rax ; VEX-NEXT: js .LBB39_4 ; VEX-NEXT: # BB#5: -; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0 +; VEX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; VEX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0 ; VEX-NEXT: jmp .LBB39_6 ; VEX-NEXT: .LBB39_4: ; VEX-NEXT: movq %rax, %rcx ; VEX-NEXT: shrq %rcx ; VEX-NEXT: andl $1, %eax ; VEX-NEXT: orq %rcx, %rax -; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0 +; VEX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; VEX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0 ; VEX-NEXT: vaddss %xmm0, %xmm0, %xmm0 ; VEX-NEXT: .LBB39_6: ; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] @@ -1673,7 +1709,8 @@ ; AVX512F-LABEL: uitofp_2i64_to_4f32: ; AVX512F: # BB#0: ; AVX512F-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1 +; AVX512F-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm1 ; AVX512F-NEXT: vmovq %xmm0, %rax ; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm0 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] @@ -1684,7 +1721,8 @@ ; AVX512VL-LABEL: uitofp_2i64_to_4f32: ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax -; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1 +; AVX512VL-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm1 ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm0 ; AVX512VL-NEXT: vinsertps 
{{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] @@ -1754,6 +1792,7 @@ ; VEX-NEXT: testq %rax, %rax ; VEX-NEXT: js .LBB40_1 ; VEX-NEXT: # BB#2: +; VEX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; VEX-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 ; VEX-NEXT: jmp .LBB40_3 ; VEX-NEXT: .LBB40_1: @@ -1761,6 +1800,7 @@ ; VEX-NEXT: shrq %rcx ; VEX-NEXT: andl $1, %eax ; VEX-NEXT: orq %rcx, %rax +; VEX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; VEX-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 ; VEX-NEXT: vaddss %xmm1, %xmm1, %xmm1 ; VEX-NEXT: .LBB40_3: @@ -1768,7 +1808,8 @@ ; VEX-NEXT: testq %rax, %rax ; VEX-NEXT: js .LBB40_4 ; VEX-NEXT: # BB#5: -; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0 +; VEX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; VEX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0 ; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero ; VEX-NEXT: retq ; VEX-NEXT: .LBB40_4: @@ -1776,7 +1817,8 @@ ; VEX-NEXT: shrq %rcx ; VEX-NEXT: andl $1, %eax ; VEX-NEXT: orq %rcx, %rax -; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0 +; VEX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; VEX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0 ; VEX-NEXT: vaddss %xmm0, %xmm0, %xmm0 ; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero ; VEX-NEXT: retq @@ -1784,7 +1826,8 @@ ; AVX512F-LABEL: uitofp_2i64_to_2f32: ; AVX512F: # BB#0: ; AVX512F-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1 +; AVX512F-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm1 ; AVX512F-NEXT: vmovq %xmm0, %rax ; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm0 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero @@ -1793,7 +1836,8 @@ ; AVX512VL-LABEL: uitofp_2i64_to_2f32: ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax -; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1 +; AVX512VL-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm1 ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm0 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] @@ -1870,6 +1914,7 @@ ; VEX-NEXT: testq %rax, %rax ; VEX-NEXT: js .LBB41_1 ; VEX-NEXT: # BB#2: +; VEX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; VEX-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 ; VEX-NEXT: jmp .LBB41_3 ; VEX-NEXT: .LBB41_1: @@ -1877,6 +1922,7 @@ ; VEX-NEXT: shrq %rcx ; VEX-NEXT: andl $1, %eax ; VEX-NEXT: orq %rcx, %rax +; VEX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; VEX-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 ; VEX-NEXT: vaddss %xmm1, %xmm1, %xmm1 ; VEX-NEXT: .LBB41_3: @@ -1884,14 +1930,16 @@ ; VEX-NEXT: testq %rax, %rax ; VEX-NEXT: js .LBB41_4 ; VEX-NEXT: # BB#5: -; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0 +; VEX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; VEX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0 ; VEX-NEXT: jmp .LBB41_6 ; VEX-NEXT: .LBB41_4: ; VEX-NEXT: movq %rax, %rcx ; VEX-NEXT: shrq %rcx ; VEX-NEXT: andl $1, %eax ; VEX-NEXT: orq %rcx, %rax -; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0 +; VEX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; VEX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0 ; VEX-NEXT: vaddss %xmm0, %xmm0, %xmm0 ; VEX-NEXT: .LBB41_6: ; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] @@ -1907,7 +1955,8 @@ ; AVX512F-LABEL: uitofp_4i64_to_4f32_undef: ; AVX512F: # BB#0: ; AVX512F-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1 +; AVX512F-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm1 ; AVX512F-NEXT: vmovq %xmm0, %rax ; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm0 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] @@ -1918,7 +1967,8 @@ ; AVX512VL-LABEL: 
uitofp_4i64_to_4f32_undef: ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax -; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1 +; AVX512VL-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm1 ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm0 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] @@ -2124,6 +2174,7 @@ ; SSE-NEXT: testq %rax, %rax ; SSE-NEXT: js .LBB47_1 ; SSE-NEXT: # BB#2: +; SSE-NEXT: xorps %xmm3, %xmm3 ; SSE-NEXT: cvtsi2ssq %rax, %xmm3 ; SSE-NEXT: jmp .LBB47_3 ; SSE-NEXT: .LBB47_1: @@ -2131,6 +2182,7 @@ ; SSE-NEXT: shrq %rcx ; SSE-NEXT: andl $1, %eax ; SSE-NEXT: orq %rcx, %rax +; SSE-NEXT: xorps %xmm3, %xmm3 ; SSE-NEXT: cvtsi2ssq %rax, %xmm3 ; SSE-NEXT: addss %xmm3, %xmm3 ; SSE-NEXT: .LBB47_3: @@ -2138,6 +2190,7 @@ ; SSE-NEXT: testq %rax, %rax ; SSE-NEXT: js .LBB47_4 ; SSE-NEXT: # BB#5: +; SSE-NEXT: xorps %xmm2, %xmm2 ; SSE-NEXT: cvtsi2ssq %rax, %xmm2 ; SSE-NEXT: jmp .LBB47_6 ; SSE-NEXT: .LBB47_4: @@ -2145,6 +2198,7 @@ ; SSE-NEXT: shrq %rcx ; SSE-NEXT: andl $1, %eax ; SSE-NEXT: orq %rcx, %rax +; SSE-NEXT: xorps %xmm2, %xmm2 ; SSE-NEXT: cvtsi2ssq %rax, %xmm2 ; SSE-NEXT: addss %xmm2, %xmm2 ; SSE-NEXT: .LBB47_6: @@ -2194,6 +2248,7 @@ ; AVX1-NEXT: testq %rax, %rax ; AVX1-NEXT: js .LBB47_1 ; AVX1-NEXT: # BB#2: +; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 ; AVX1-NEXT: jmp .LBB47_3 ; AVX1-NEXT: .LBB47_1: @@ -2201,6 +2256,7 @@ ; AVX1-NEXT: shrq %rcx ; AVX1-NEXT: andl $1, %eax ; AVX1-NEXT: orq %rcx, %rax +; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 ; AVX1-NEXT: vaddss %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: .LBB47_3: @@ -2208,6 +2264,7 @@ ; AVX1-NEXT: testq %rax, %rax ; AVX1-NEXT: js .LBB47_4 ; AVX1-NEXT: # BB#5: +; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 ; AVX1-NEXT: jmp .LBB47_6 ; AVX1-NEXT: .LBB47_4: @@ -2215,6 +2272,7 @@ ; AVX1-NEXT: shrq %rcx ; AVX1-NEXT: andl $1, %eax ; AVX1-NEXT: orq %rcx, %rax +; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 ; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: .LBB47_6: @@ -2224,14 +2282,16 @@ ; AVX1-NEXT: testq %rax, %rax ; AVX1-NEXT: js .LBB47_7 ; AVX1-NEXT: # BB#8: -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 +; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 ; AVX1-NEXT: jmp .LBB47_9 ; AVX1-NEXT: .LBB47_7: ; AVX1-NEXT: movq %rax, %rcx ; AVX1-NEXT: shrq %rcx ; AVX1-NEXT: andl $1, %eax ; AVX1-NEXT: orq %rcx, %rax -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 +; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 ; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: .LBB47_9: ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] @@ -2239,7 +2299,8 @@ ; AVX1-NEXT: testq %rax, %rax ; AVX1-NEXT: js .LBB47_10 ; AVX1-NEXT: # BB#11: -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 +; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -2248,7 +2309,8 @@ ; AVX1-NEXT: shrq %rcx ; AVX1-NEXT: andl $1, %eax ; AVX1-NEXT: orq %rcx, %rax -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 +; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0 ; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; AVX1-NEXT: vzeroupper @@ -2260,6 +2322,7 @@ ; AVX2-NEXT: testq 
%rax, %rax ; AVX2-NEXT: js .LBB47_1 ; AVX2-NEXT: # BB#2: +; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 ; AVX2-NEXT: jmp .LBB47_3 ; AVX2-NEXT: .LBB47_1: @@ -2267,6 +2330,7 @@ ; AVX2-NEXT: shrq %rcx ; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: orq %rcx, %rax +; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 ; AVX2-NEXT: vaddss %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: .LBB47_3: @@ -2274,6 +2338,7 @@ ; AVX2-NEXT: testq %rax, %rax ; AVX2-NEXT: js .LBB47_4 ; AVX2-NEXT: # BB#5: +; AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 ; AVX2-NEXT: jmp .LBB47_6 ; AVX2-NEXT: .LBB47_4: @@ -2281,6 +2346,7 @@ ; AVX2-NEXT: shrq %rcx ; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: orq %rcx, %rax +; AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 ; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: .LBB47_6: @@ -2290,14 +2356,16 @@ ; AVX2-NEXT: testq %rax, %rax ; AVX2-NEXT: js .LBB47_7 ; AVX2-NEXT: # BB#8: -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 +; AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 ; AVX2-NEXT: jmp .LBB47_9 ; AVX2-NEXT: .LBB47_7: ; AVX2-NEXT: movq %rax, %rcx ; AVX2-NEXT: shrq %rcx ; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: orq %rcx, %rax -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 +; AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 ; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: .LBB47_9: ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] @@ -2305,7 +2373,8 @@ ; AVX2-NEXT: testq %rax, %rax ; AVX2-NEXT: js .LBB47_10 ; AVX2-NEXT: # BB#11: -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 +; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0 ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -2314,7 +2383,8 @@ ; AVX2-NEXT: shrq %rcx ; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: orq %rcx, %rax -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 +; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0 ; AVX2-NEXT: vaddss %xmm0, %xmm0, %xmm0 ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; AVX2-NEXT: vzeroupper @@ -2323,9 +2393,10 @@ ; AVX512F-LABEL: uitofp_4i64_to_4f32: ; AVX512F: # BB#0: ; AVX512F-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1 +; AVX512F-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm1 ; AVX512F-NEXT: vmovq %xmm0, %rax -; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm2 +; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm2 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512F-NEXT: vmovq %xmm0, %rax @@ -2339,9 +2410,10 @@ ; AVX512VL-LABEL: uitofp_4i64_to_4f32: ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax -; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1 +; AVX512VL-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm1 ; AVX512VL-NEXT: vmovq %xmm0, %rax -; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm2 +; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm2 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm0 ; AVX512VL-NEXT: vmovq %xmm0, %rax @@ -2563,6 +2635,7 @@ ; SSE: # BB#0: ; SSE-NEXT: movdqa (%rdi), %xmm1 ; SSE-NEXT: movd %xmm1, %rax +; SSE-NEXT: xorps %xmm0, %xmm0 ; SSE-NEXT: cvtsi2sdq %rax, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] ; SSE-NEXT: 
movd %xmm1, %rax
@@ -2575,7 +2648,8 @@
; VEX: # BB#0:
; VEX-NEXT: vmovdqa (%rdi), %xmm0
; VEX-NEXT: vpextrq $1, %xmm0, %rax
-; VEX-NEXT: vcvtsi2sdq %rax, %xmm1, %xmm1
+; VEX-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; VEX-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm1
; VEX-NEXT: vmovq %xmm0, %rax
; VEX-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm0
; VEX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
@@ -2585,7 +2659,8 @@
; AVX512F: # BB#0:
; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT: vcvtsi2sdq %rax, %xmm1, %xmm1
+; AVX512F-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; AVX512F-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm1
; AVX512F-NEXT: vmovq %xmm0, %rax
; AVX512F-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm0
; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
@@ -2595,7 +2670,8 @@
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
-; AVX512VL-NEXT: vcvtsi2sdq %rax, %xmm1, %xmm1
+; AVX512VL-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; AVX512VL-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm1
; AVX512VL-NEXT: vmovq %xmm0, %rax
; AVX512VL-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm0
; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
@@ -2704,6 +2780,7 @@
; SSE-NEXT: movdqa (%rdi), %xmm1
; SSE-NEXT: movdqa 16(%rdi), %xmm2
; SSE-NEXT: movd %xmm1, %rax
+; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: cvtsi2sdq %rax, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE-NEXT: movd %xmm1, %rax
@@ -2725,7 +2802,8 @@
; AVX1-NEXT: vmovaps (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpextrq $1, %xmm1, %rax
-; AVX1-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2
+; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm2
; AVX1-NEXT: vmovq %xmm1, %rax
; AVX1-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1
; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
@@ -2742,7 +2820,8 @@
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpextrq $1, %xmm1, %rax
-; AVX2-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2
+; AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; AVX2-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm2
; AVX2-NEXT: vmovq %xmm1, %rax
; AVX2-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1
; AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
@@ -2759,7 +2838,8 @@
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vpextrq $1, %xmm1, %rax
-; AVX512F-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2
+; AVX512F-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; AVX512F-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm2
; AVX512F-NEXT: vmovq %xmm1, %rax
; AVX512F-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1
; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
@@ -2776,7 +2856,8 @@
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax
-; AVX512VL-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2
+; AVX512VL-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; AVX512VL-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm2
; AVX512VL-NEXT: vmovq %xmm1, %rax
; AVX512VL-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1
; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
@@ -2904,7 +2985,8 @@
; AVX512F: # BB#0:
; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT: vcvtusi2sdq %rax, %xmm1, %xmm1
+; AVX512F-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; AVX512F-NEXT: vcvtusi2sdq %rax, %xmm2, %xmm1
; AVX512F-NEXT: vmovq %xmm0, %rax
; AVX512F-NEXT: vcvtusi2sdq %rax, %xmm2, %xmm0
; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
@@ -2914,7 +2996,8 @@
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
-; AVX512VL-NEXT: vcvtusi2sdq %rax, %xmm1, %xmm1
+; AVX512VL-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; AVX512VL-NEXT: vcvtusi2sdq %rax, %xmm2, %xmm1
; AVX512VL-NEXT: vmovq %xmm0, %rax
; AVX512VL-NEXT: vcvtusi2sdq %rax, %xmm2, %xmm0
; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
@@ -3174,7 +3257,8 @@
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vpextrq $1, %xmm1, %rax
-; AVX512F-NEXT: vcvtusi2sdq %rax, %xmm2, %xmm2
+; AVX512F-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; AVX512F-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm2
; AVX512F-NEXT: vmovq %xmm1, %rax
; AVX512F-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm1
; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
@@ -3191,7 +3275,8 @@
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax
-; AVX512VL-NEXT: vcvtusi2sdq %rax, %xmm2, %xmm2
+; AVX512VL-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; AVX512VL-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm2
; AVX512VL-NEXT: vmovq %xmm1, %rax
; AVX512VL-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm1
; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
@@ -3348,8 +3433,10 @@
; SSE-NEXT: movdqa (%rdi), %xmm1
; SSE-NEXT: movdqa 16(%rdi), %xmm2
; SSE-NEXT: movd %xmm2, %rax
+; SSE-NEXT: xorps %xmm3, %xmm3
; SSE-NEXT: cvtsi2ssq %rax, %xmm3
; SSE-NEXT: movd %xmm1, %rax
+; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: cvtsi2ssq %rax, %xmm0
; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
@@ -3368,9 +3455,10 @@
; AVX1: # BB#0:
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vpextrq $1, %xmm0, %rax
-; AVX1-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
+; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm1
; AVX1-NEXT: vmovq %xmm0, %rax
-; AVX1-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rax
@@ -3386,9 +3474,10 @@
; AVX2: # BB#0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpextrq $1, %xmm0, %rax
-; AVX2-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
+; AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm1
; AVX2-NEXT: vmovq %xmm0, %rax
-; AVX2-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: vmovq %xmm0, %rax
@@ -3404,9 +3493,10 @@
; AVX512F: # BB#0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
+; AVX512F-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm1
; AVX512F-NEXT: vmovq %xmm0, %rax
-; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
+; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512F-NEXT: vmovq %xmm0, %rax
@@ -3421,9 +3511,10 @@
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
-; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
+; AVX512VL-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm1
; AVX512VL-NEXT: vmovq %xmm0, %rax
-; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
+; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm0
; AVX512VL-NEXT: vmovq %xmm0, %rax
@@ -3512,8 +3603,10 @@
; SSE-NEXT: movdqa 32(%rdi), %xmm3
; SSE-NEXT: movdqa 48(%rdi), %xmm4
; SSE-NEXT: movd %xmm2, %rax
+; SSE-NEXT: xorps %xmm5, %xmm5
; SSE-NEXT: cvtsi2ssq %rax, %xmm5
; SSE-NEXT: movd %xmm1, %rax
+; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: cvtsi2ssq %rax, %xmm0
; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
@@ -3550,9 +3643,10 @@
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX1-NEXT: vpextrq $1, %xmm1, %rax
-; AVX1-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
+; AVX1-NEXT: vxorps %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm2
; AVX1-NEXT: vmovq %xmm1, %rax
-; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3
; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vmovq %xmm1, %rax
@@ -3581,9 +3675,10 @@
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX2-NEXT: vpextrq $1, %xmm1, %rax
-; AVX2-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
+; AVX2-NEXT: vxorps %xmm4, %xmm4, %xmm4
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm2
; AVX2-NEXT: vmovq %xmm1, %rax
-; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3
; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
; AVX2-NEXT: vmovq %xmm1, %rax
@@ -3612,13 +3707,14 @@
; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512F-NEXT: vextracti32x4 $2, %zmm0, %xmm1
; AVX512F-NEXT: vpextrq $1, %xmm1, %rax
-; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
+; AVX512F-NEXT: vxorps %xmm4, %xmm4, %xmm4
+; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm2
; AVX512F-NEXT: vmovq %xmm1, %rax
-; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm1
+; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm1
; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
; AVX512F-NEXT: vextracti32x4 $3, %zmm0, %xmm2
; AVX512F-NEXT: vmovq %xmm2, %rax
-; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3
+; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3
; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
; AVX512F-NEXT: vpextrq $1, %xmm2, %rax
; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm2
@@ -3643,13 +3739,14 @@
; AVX512VL-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512VL-NEXT: vextracti32x4 $2, %zmm0, %xmm1
; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax
-; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
+; AVX512VL-NEXT: vxorps %xmm4, %xmm4, %xmm4
+; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm2
; AVX512VL-NEXT: vmovq %xmm1, %rax
-; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm1
+; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm1
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
; AVX512VL-NEXT: vextracti32x4 $3, %zmm0, %xmm2
; AVX512VL-NEXT: vmovq %xmm2, %rax
-; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3
+; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
; AVX512VL-NEXT: vpextrq $1, %xmm2, %rax
; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm2
@@ -3790,6 +3887,7 @@
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB76_1
; SSE-NEXT: # BB#2:
+; SSE-NEXT: xorps %xmm2, %xmm2
; SSE-NEXT: cvtsi2ssq %rax, %xmm2
; SSE-NEXT: jmp .LBB76_3
; SSE-NEXT: .LBB76_1:
@@ -3797,6 +3895,7 @@
; SSE-NEXT: shrq %rcx
; SSE-NEXT: andl $1, %eax
; SSE-NEXT: orq %rcx, %rax
+; SSE-NEXT: xorps %xmm2, %xmm2
; SSE-NEXT: cvtsi2ssq %rax, %xmm2
; SSE-NEXT: addss %xmm2, %xmm2
; SSE-NEXT: .LBB76_3:
@@ -3804,6 +3903,7 @@
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB76_4
; SSE-NEXT: # BB#5:
+; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: cvtsi2ssq %rax, %xmm0
; SSE-NEXT: jmp .LBB76_6
; SSE-NEXT: .LBB76_4:
@@ -3811,6 +3911,7 @@
; SSE-NEXT: shrq %rcx
; SSE-NEXT: andl $1, %eax
; SSE-NEXT: orq %rcx, %rax
+; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: cvtsi2ssq %rax, %xmm0
; SSE-NEXT: addss %xmm0, %xmm0
; SSE-NEXT: .LBB76_6:
@@ -3860,6 +3961,7 @@
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB76_1
; AVX1-NEXT: # BB#2:
+; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; AVX1-NEXT: jmp .LBB76_3
; AVX1-NEXT: .LBB76_1:
@@ -3867,6 +3969,7 @@
; AVX1-NEXT: shrq %rcx
; AVX1-NEXT: andl $1, %eax
; AVX1-NEXT: orq %rcx, %rax
+; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; AVX1-NEXT: vaddss %xmm1, %xmm1, %xmm1
; AVX1-NEXT: .LBB76_3:
@@ -3874,6 +3977,7 @@
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB76_4
; AVX1-NEXT: # BB#5:
+; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
; AVX1-NEXT: jmp .LBB76_6
; AVX1-NEXT: .LBB76_4:
@@ -3881,6 +3985,7 @@
; AVX1-NEXT: shrq %rcx
; AVX1-NEXT: andl $1, %eax
; AVX1-NEXT: orq %rcx, %rax
+; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2
; AVX1-NEXT: .LBB76_6:
@@ -3890,14 +3995,16 @@
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB76_7
; AVX1-NEXT: # BB#8:
-; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
+; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
; AVX1-NEXT: jmp .LBB76_9
; AVX1-NEXT: .LBB76_7:
; AVX1-NEXT: movq %rax, %rcx
; AVX1-NEXT: shrq %rcx
; AVX1-NEXT: andl $1, %eax
; AVX1-NEXT: orq %rcx, %rax
-; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
+; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2
; AVX1-NEXT: .LBB76_9:
; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
@@ -3905,7 +4012,8 @@
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB76_10
; AVX1-NEXT: # BB#11:
-; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
+; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
@@ -3914,7 +4022,8 @@
; AVX1-NEXT: shrq %rcx
; AVX1-NEXT: andl $1, %eax
; AVX1-NEXT: orq %rcx, %rax
-; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
+; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT: vzeroupper
@@ -3927,6 +4036,7 @@
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB76_1
; AVX2-NEXT: # BB#2:
+; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; AVX2-NEXT: jmp .LBB76_3
; AVX2-NEXT: .LBB76_1:
@@ -3934,6 +4044,7 @@
; AVX2-NEXT: shrq %rcx
; AVX2-NEXT: andl $1, %eax
; AVX2-NEXT: orq %rcx, %rax
+; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; AVX2-NEXT: vaddss %xmm1, %xmm1, %xmm1
; AVX2-NEXT: .LBB76_3:
@@ -3941,6 +4052,7 @@
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB76_4
; AVX2-NEXT: # BB#5:
+; AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
; AVX2-NEXT: jmp .LBB76_6
; AVX2-NEXT: .LBB76_4:
@@ -3948,6 +4060,7 @@
; AVX2-NEXT: shrq %rcx
; AVX2-NEXT: andl $1, %eax
; AVX2-NEXT: orq %rcx, %rax
+; AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2
; AVX2-NEXT: .LBB76_6:
@@ -3957,14 +4070,16 @@
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB76_7
; AVX2-NEXT: # BB#8:
-; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
+; AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
; AVX2-NEXT: jmp .LBB76_9
; AVX2-NEXT: .LBB76_7:
; AVX2-NEXT: movq %rax, %rcx
; AVX2-NEXT: shrq %rcx
; AVX2-NEXT: andl $1, %eax
; AVX2-NEXT: orq %rcx, %rax
-; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
+; AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2
; AVX2-NEXT: .LBB76_9:
; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
@@ -3972,7 +4087,8 @@
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB76_10
; AVX2-NEXT: # BB#11:
-; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
+; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
@@ -3981,7 +4097,8 @@
; AVX2-NEXT: shrq %rcx
; AVX2-NEXT: andl $1, %eax
; AVX2-NEXT: orq %rcx, %rax
-; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
+; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
; AVX2-NEXT: vaddss %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX2-NEXT: vzeroupper
@@ -3991,9 +4108,10 @@
; AVX512F: # BB#0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1
+; AVX512F-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm1
; AVX512F-NEXT: vmovq %xmm0, %rax
-; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm2
+; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm2
; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512F-NEXT: vmovq %xmm0, %rax
@@ -4008,9 +4126,10 @@
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
-; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1
+; AVX512VL-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm1
; AVX512VL-NEXT: vmovq %xmm0, %rax
-; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm2
+; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm2
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm0
; AVX512VL-NEXT: vmovq %xmm0, %rax
@@ -4151,6 +4270,7 @@
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB80_1
; SSE-NEXT: # BB#2:
+; SSE-NEXT: xorps %xmm4, %xmm4
; SSE-NEXT: cvtsi2ssq %rax, %xmm4
; SSE-NEXT: jmp .LBB80_3
; SSE-NEXT: .LBB80_1:
@@ -4158,6 +4278,7 @@
; SSE-NEXT: shrq %rcx
; SSE-NEXT: andl $1, %eax
; SSE-NEXT: orq %rcx, %rax
+; SSE-NEXT: xorps %xmm4, %xmm4
; SSE-NEXT: cvtsi2ssq %rax, %xmm4
; SSE-NEXT: addss %xmm4, %xmm4
; SSE-NEXT: .LBB80_3:
@@ -4165,6 +4286,7 @@
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB80_4
; SSE-NEXT: # BB#5:
+; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: cvtsi2ssq %rax, %xmm0
; SSE-NEXT: jmp .LBB80_6
; SSE-NEXT: .LBB80_4:
@@ -4172,6 +4294,7 @@
; SSE-NEXT: shrq %rcx
; SSE-NEXT: andl $1, %eax
; SSE-NEXT: orq %rcx, %rax
+; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: cvtsi2ssq %rax, %xmm0
; SSE-NEXT: addss %xmm0, %xmm0
; SSE-NEXT: .LBB80_6:
@@ -4180,6 +4303,7 @@
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB80_7
; SSE-NEXT: # BB#8:
+; SSE-NEXT: xorps %xmm6, %xmm6
; SSE-NEXT: cvtsi2ssq %rax, %xmm6
; SSE-NEXT: jmp .LBB80_9
; SSE-NEXT: .LBB80_7:
@@ -4187,6 +4311,7 @@
; SSE-NEXT: shrq %rcx
; SSE-NEXT: andl $1, %eax
; SSE-NEXT: orq %rcx, %rax
+; SSE-NEXT: xorps %xmm6, %xmm6
; SSE-NEXT: cvtsi2ssq %rax, %xmm6
; SSE-NEXT: addss %xmm6, %xmm6
; SSE-NEXT: .LBB80_9:
@@ -4211,6 +4336,7 @@
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB80_13
; SSE-NEXT: # BB#14:
+; SSE-NEXT: xorps %xmm7, %xmm7
; SSE-NEXT: cvtsi2ssq %rax, %xmm7
; SSE-NEXT: jmp .LBB80_15
; SSE-NEXT: .LBB80_13:
@@ -4218,6 +4344,7 @@
; SSE-NEXT: shrq %rcx
; SSE-NEXT: andl $1, %eax
; SSE-NEXT: orq %rcx, %rax
+; SSE-NEXT: xorps %xmm7, %xmm7
; SSE-NEXT: cvtsi2ssq %rax, %xmm7
; SSE-NEXT: addss %xmm7, %xmm7
; SSE-NEXT: .LBB80_15:
@@ -4287,6 +4414,7 @@
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB80_1
; AVX1-NEXT: # BB#2:
+; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; AVX1-NEXT: jmp .LBB80_3
; AVX1-NEXT: .LBB80_1:
@@ -4294,6 +4422,7 @@
; AVX1-NEXT: shrq %rcx
; AVX1-NEXT: andl $1, %eax
; AVX1-NEXT: orq %rcx, %rax
+; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; AVX1-NEXT: vaddss %xmm1, %xmm1, %xmm1
; AVX1-NEXT: .LBB80_3:
@@ -4301,6 +4430,7 @@
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB80_4
; AVX1-NEXT: # BB#5:
+; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3
; AVX1-NEXT: jmp .LBB80_6
; AVX1-NEXT: .LBB80_4:
@@ -4308,6 +4438,7 @@
; AVX1-NEXT: shrq %rcx
; AVX1-NEXT: andl $1, %eax
; AVX1-NEXT: orq %rcx, %rax
+; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3
; AVX1-NEXT: vaddss %xmm3, %xmm3, %xmm3
; AVX1-NEXT: .LBB80_6:
@@ -4316,6 +4447,7 @@
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB80_7
; AVX1-NEXT: # BB#8:
+; AVX1-NEXT: vxorps %xmm4, %xmm4, %xmm4
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm4
; AVX1-NEXT: jmp .LBB80_9
; AVX1-NEXT: .LBB80_7:
@@ -4323,6 +4455,7 @@
; AVX1-NEXT: shrq %rcx
; AVX1-NEXT: andl $1, %eax
; AVX1-NEXT: orq %rcx, %rax
+; AVX1-NEXT: vxorps %xmm4, %xmm4, %xmm4
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm4
; AVX1-NEXT: vaddss %xmm4, %xmm4, %xmm4
; AVX1-NEXT: .LBB80_9:
@@ -4330,20 +4463,23 @@
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB80_10
; AVX1-NEXT: # BB#11:
-; AVX1-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm2
+; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
; AVX1-NEXT: jmp .LBB80_12
; AVX1-NEXT: .LBB80_10:
; AVX1-NEXT: movq %rax, %rcx
; AVX1-NEXT: shrq %rcx
; AVX1-NEXT: andl $1, %eax
; AVX1-NEXT: orq %rcx, %rax
-; AVX1-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm2
+; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2
; AVX1-NEXT: .LBB80_12:
; AVX1-NEXT: vpextrq $1, %xmm0, %rax
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB80_13
; AVX1-NEXT: # BB#14:
+; AVX1-NEXT: vxorps %xmm5, %xmm5, %xmm5
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm5
; AVX1-NEXT: jmp .LBB80_15
; AVX1-NEXT: .LBB80_13:
@@ -4351,6 +4487,7 @@
; AVX1-NEXT: shrq %rcx
; AVX1-NEXT: andl $1, %eax
; AVX1-NEXT: orq %rcx, %rax
+; AVX1-NEXT: vxorps %xmm5, %xmm5, %xmm5
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm5
; AVX1-NEXT: vaddss %xmm5, %xmm5, %xmm5
; AVX1-NEXT: .LBB80_15:
@@ -4359,14 +4496,16 @@
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB80_16
; AVX1-NEXT: # BB#17:
-; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm3
+; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3
; AVX1-NEXT: jmp .LBB80_18
; AVX1-NEXT: .LBB80_16:
; AVX1-NEXT: movq %rax, %rcx
; AVX1-NEXT: shrq %rcx
; AVX1-NEXT: andl $1, %eax
; AVX1-NEXT: orq %rcx, %rax
-; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm3
+; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3
; AVX1-NEXT: vaddss %xmm3, %xmm3, %xmm3
; AVX1-NEXT: .LBB80_18:
; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0],xmm1[3]
@@ -4376,14 +4515,16 @@
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB80_19
; AVX1-NEXT: # BB#20:
-; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm5
+; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm5
; AVX1-NEXT: jmp .LBB80_21
; AVX1-NEXT: .LBB80_19:
; AVX1-NEXT: movq %rax, %rcx
; AVX1-NEXT: shrq %rcx
; AVX1-NEXT: andl $1, %eax
; AVX1-NEXT: orq %rcx, %rax
-; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm0
+; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm5
; AVX1-NEXT: .LBB80_21:
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm2[0]
@@ -4392,14 +4533,16 @@
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB80_22
; AVX1-NEXT: # BB#23:
-; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm2
+; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
; AVX1-NEXT: jmp .LBB80_24
; AVX1-NEXT: .LBB80_22:
; AVX1-NEXT: movq %rax, %rcx
; AVX1-NEXT: shrq %rcx
; AVX1-NEXT: andl $1, %eax
; AVX1-NEXT: orq %rcx, %rax
-; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm2
+; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2
; AVX1-NEXT: .LBB80_24:
; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
@@ -4414,6 +4557,7 @@
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB80_1
; AVX2-NEXT: # BB#2:
+; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; AVX2-NEXT: jmp .LBB80_3
; AVX2-NEXT: .LBB80_1:
@@ -4421,6 +4565,7 @@
; AVX2-NEXT: shrq %rcx
; AVX2-NEXT: andl $1, %eax
; AVX2-NEXT: orq %rcx, %rax
+; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; AVX2-NEXT: vaddss %xmm1, %xmm1, %xmm1
; AVX2-NEXT: .LBB80_3:
@@ -4428,6 +4573,7 @@
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB80_4
; AVX2-NEXT: # BB#5:
+; AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3
; AVX2-NEXT: jmp .LBB80_6
; AVX2-NEXT: .LBB80_4:
@@ -4435,6 +4581,7 @@
; AVX2-NEXT: shrq %rcx
; AVX2-NEXT: andl $1, %eax
; AVX2-NEXT: orq %rcx, %rax
+; AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3
; AVX2-NEXT: vaddss %xmm3, %xmm3, %xmm3
; AVX2-NEXT: .LBB80_6:
@@ -4443,6 +4590,7 @@
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB80_7
; AVX2-NEXT: # BB#8:
+; AVX2-NEXT: vxorps %xmm4, %xmm4, %xmm4
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm4
; AVX2-NEXT: jmp .LBB80_9
; AVX2-NEXT: .LBB80_7:
@@ -4450,6 +4598,7 @@
; AVX2-NEXT: shrq %rcx
; AVX2-NEXT: andl $1, %eax
; AVX2-NEXT: orq %rcx, %rax
+; AVX2-NEXT: vxorps %xmm4, %xmm4, %xmm4
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm4
; AVX2-NEXT: vaddss %xmm4, %xmm4, %xmm4
; AVX2-NEXT: .LBB80_9:
@@ -4457,20 +4606,23 @@
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB80_10
; AVX2-NEXT: # BB#11:
-; AVX2-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm2
+; AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
; AVX2-NEXT: jmp .LBB80_12
; AVX2-NEXT: .LBB80_10:
; AVX2-NEXT: movq %rax, %rcx
; AVX2-NEXT: shrq %rcx
; AVX2-NEXT: andl $1, %eax
; AVX2-NEXT: orq %rcx, %rax
-; AVX2-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm2
+; AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2
; AVX2-NEXT: .LBB80_12:
; AVX2-NEXT: vpextrq $1, %xmm0, %rax
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB80_13
; AVX2-NEXT: # BB#14:
+; AVX2-NEXT: vxorps %xmm5, %xmm5, %xmm5
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm5
; AVX2-NEXT: jmp .LBB80_15
; AVX2-NEXT: .LBB80_13:
@@ -4478,6 +4630,7 @@
; AVX2-NEXT: shrq %rcx
; AVX2-NEXT: andl $1, %eax
; AVX2-NEXT: orq %rcx, %rax
+; AVX2-NEXT: vxorps %xmm5, %xmm5, %xmm5
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm5
; AVX2-NEXT: vaddss %xmm5, %xmm5, %xmm5
; AVX2-NEXT: .LBB80_15:
@@ -4486,14 +4639,16 @@
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB80_16
; AVX2-NEXT: # BB#17:
-; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm3
+; AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3
; AVX2-NEXT: jmp .LBB80_18
; AVX2-NEXT: .LBB80_16:
; AVX2-NEXT: movq %rax, %rcx
; AVX2-NEXT: shrq %rcx
; AVX2-NEXT: andl $1, %eax
; AVX2-NEXT: orq %rcx, %rax
-; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm3
+; AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3
; AVX2-NEXT: vaddss %xmm3, %xmm3, %xmm3
; AVX2-NEXT: .LBB80_18:
; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0],xmm1[3]
@@ -4503,14 +4658,16 @@
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB80_19
; AVX2-NEXT: # BB#20:
-; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm5
+; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm5
; AVX2-NEXT: jmp .LBB80_21
; AVX2-NEXT: .LBB80_19:
; AVX2-NEXT: movq %rax, %rcx
; AVX2-NEXT: shrq %rcx
; AVX2-NEXT: andl $1, %eax
; AVX2-NEXT: orq %rcx, %rax
-; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm0
+; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
; AVX2-NEXT: vaddss %xmm0, %xmm0, %xmm5
; AVX2-NEXT: .LBB80_21:
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm2[0]
@@ -4519,14 +4676,16 @@
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB80_22
; AVX2-NEXT: # BB#23:
-; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm2
+; AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
; AVX2-NEXT: jmp .LBB80_24
; AVX2-NEXT: .LBB80_22:
; AVX2-NEXT: movq %rax, %rcx
; AVX2-NEXT: shrq %rcx
; AVX2-NEXT: andl $1, %eax
; AVX2-NEXT: orq %rcx, %rax
-; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm2
+; AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2
; AVX2-NEXT: .LBB80_24:
; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
@@ -4538,13 +4697,14 @@
; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512F-NEXT: vextracti32x4 $2, %zmm0, %xmm1
; AVX512F-NEXT: vpextrq $1, %xmm1, %rax
-; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm2
+; AVX512F-NEXT: vxorps %xmm4, %xmm4, %xmm4
+; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm2
; AVX512F-NEXT: vmovq %xmm1, %rax
-; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm1
+; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm1
; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
; AVX512F-NEXT: vextracti32x4 $3, %zmm0, %xmm2
; AVX512F-NEXT: vmovq %xmm2, %rax
-; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm3
+; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm3
; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
; AVX512F-NEXT: vpextrq $1, %xmm2, %rax
; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm2
@@ -4569,13 +4729,14 @@
; AVX512VL-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512VL-NEXT: vextracti32x4 $2, %zmm0, %xmm1
; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax
-; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm2
+; AVX512VL-NEXT: vxorps %xmm4, %xmm4, %xmm4
+; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm2
; AVX512VL-NEXT: vmovq %xmm1, %rax
-; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm1
+; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm1
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
; AVX512VL-NEXT: vextracti32x4 $3, %zmm0, %xmm2
; AVX512VL-NEXT: vmovq %xmm2, %rax
-; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm3
+; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm3
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
; AVX512VL-NEXT: vpextrq $1, %xmm2, %rax
; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm2
Index: test/CodeGen/X86/vector-sqrt.ll
===================================================================
--- test/CodeGen/X86/vector-sqrt.ll
+++ test/CodeGen/X86/vector-sqrt.ll
@@ -4,7 +4,7 @@
; Function Attrs: nounwind readonly uwtable
define <2 x double> @sqrtd2(double* nocapture readonly %v) local_unnamed_addr #0 {
; CHECK-LABEL: sqrtd2:
-; CHECK: vsqrtsd (%rdi), %xmm0, %xmm0
+; CHECK: vsqrtsd (%rdi), %xmm1, %xmm0
; CHECK-NEXT: vsqrtsd 8(%rdi), %xmm1, %xmm1
; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; CHECK-NEXT: retq
@@ -25,9 +25,9 @@
; Function Attrs: nounwind readonly uwtable
define <4 x float> @sqrtf4(float* nocapture readonly %v) local_unnamed_addr #0 {
; CHECK-LABEL: sqrtf4:
-; CHECK: vsqrtss (%rdi), %xmm0, %xmm0
-; CHECK-NEXT: vsqrtss 4(%rdi), %xmm1, %xmm1
-; CHECK-NEXT: vsqrtss 8(%rdi), %xmm2, %xmm2
+; CHECK: vsqrtss (%rdi), %xmm3, %xmm0
+; CHECK-NEXT: vsqrtss 4(%rdi), %xmm3, %xmm1
+; CHECK-NEXT: vsqrtss 8(%rdi), %xmm3, %xmm2
; CHECK-NEXT: vsqrtss 12(%rdi), %xmm3, %xmm3
; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]