Index: llvm/trunk/lib/CodeGen/ExecutionDepsFix.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/ExecutionDepsFix.cpp
+++ llvm/trunk/lib/CodeGen/ExecutionDepsFix.cpp
@@ -203,6 +203,8 @@
   void processDefs(MachineInstr*, bool Kill);
   void visitSoftInstr(MachineInstr*, unsigned mask);
   void visitHardInstr(MachineInstr*, unsigned domain);
+  void pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx,
+                                unsigned Pref);
   bool shouldBreakDependence(MachineInstr*, unsigned OpIdx, unsigned Pref);
   void processUndefReads(MachineBasicBlock*);
 };
@@ -473,6 +475,56 @@
   processDefs(MI, !DomP.first);
 }
 
+/// \brief Helps avoid false dependencies on undef registers by updating the
+/// machine instructions' undef operand to use a register that the instruction
+/// is truly dependent on, or use a register with clearance higher than Pref.
+void ExeDepsFix::pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx,
+                                          unsigned Pref) {
+  MachineOperand &MO = MI->getOperand(OpIdx);
+  assert(MO.isUndef() && "Expected undef machine operand");
+
+  unsigned OriginalReg = MO.getReg();
+
+  // Update only undef operands that are mapped to one register.
+  if (AliasMap[OriginalReg].size() != 1)
+    return;
+
+  // Get the undef operand's register class.
+  const TargetRegisterClass *OpRC =
+      TII->getRegClass(MI->getDesc(), OpIdx, TRI, *MF);
+
+  // If the instruction has a true dependency, we can hide the false dependency
+  // behind it.
+  for (MachineOperand &CurrMO : MI->operands()) {
+    if (!CurrMO.isReg() || CurrMO.isDef() || CurrMO.isUndef() ||
+        !OpRC->contains(CurrMO.getReg()))
+      continue;
+    // We found a true dependency - replace the undef register with the true
+    // dependency.
+    MO.setReg(CurrMO.getReg());
+    return;
+  }
+
+  // Go over all registers in the register class and find the register with
+  // max clearance or clearance higher than Pref.
+  unsigned MaxClearance = 0;
+  unsigned MaxClearanceReg = OriginalReg;
+  for (unsigned rx = 0; rx < OpRC->getNumRegs(); ++rx) {
+    unsigned Clearance = CurInstr - LiveRegs[rx].Def;
+    if (Clearance <= MaxClearance)
+      continue;
+    MaxClearance = Clearance;
+    MaxClearanceReg = OpRC->getRegister(rx);
+
+    if (MaxClearance > Pref)
+      break;
+  }
+
+  // Update the operand if we found a register with better clearance.
+  if (MaxClearanceReg != OriginalReg)
+    MO.setReg(MaxClearanceReg);
+}
+
 /// \brief Return true if it makes sense to break dependence on a partial def
 /// or undef use.
 bool ExeDepsFix::shouldBreakDependence(MachineInstr *MI, unsigned OpIdx,
@@ -510,6 +562,7 @@
   unsigned OpNum;
   unsigned Pref = TII->getUndefRegClearance(*MI, OpNum, TRI);
   if (Pref) {
+    pickBestRegisterForUndef(MI, OpNum, Pref);
     if (shouldBreakDependence(MI, OpNum, Pref))
       UndefReads.push_back(std::make_pair(MI, OpNum));
   }
Index: llvm/trunk/lib/Target/X86/X86InstrInfo.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86InstrInfo.cpp
+++ llvm/trunk/lib/Target/X86/X86InstrInfo.cpp
@@ -68,7 +68,7 @@
 UndefRegClearance("undef-reg-clearance",
                   cl::desc("How many idle instructions we would like before "
                            "certain undef register reads"),
-                  cl::init(64), cl::Hidden);
+                  cl::init(128), cl::Hidden);
 
 enum {
   // Select which memory operand is being unfolded.
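The new hook boils down to a two-step policy: reuse a register the instruction already truly reads, otherwise pick the register in the class whose last def is furthest away, stopping as soon as the clearance beats the preferred distance (the undef-reg-clearance value, which this patch bumps to 128 on X86). The following is a minimal standalone C++ sketch of that policy only; CandidateInfo, pickUndefReg, LastDef and CurInstr here are invented stand-ins for the pass's MachineOperand/LiveRegs machinery, not LLVM API.

// Standalone sketch of the selection policy, with plain integers standing in
// for registers; not the data structures used by ExecutionDepsFix itself.
#include <vector>

struct CandidateInfo {
  unsigned Reg;  // register number within the undef operand's register class
  int LastDef;   // instruction number of the most recent def of that register
};

// Picks the register to use for an undef read: a true-dependency register if
// one exists, otherwise the candidate with maximum clearance, accepting the
// first one whose clearance already exceeds the preferred threshold Pref.
unsigned pickUndefReg(unsigned OriginalReg, int CurInstr, unsigned Pref,
                      const std::vector<unsigned> &TrueReads,
                      const std::vector<CandidateInfo> &Candidates) {
  // Step 1: hide the false dependency behind an existing true dependency.
  if (!TrueReads.empty())
    return TrueReads.front();

  // Step 2: scan the class for the register with the best clearance.
  unsigned MaxClearance = 0;
  unsigned MaxClearanceReg = OriginalReg;
  for (const CandidateInfo &C : Candidates) {
    unsigned Clearance = unsigned(CurInstr - C.LastDef);
    if (Clearance <= MaxClearance)
      continue;
    MaxClearance = Clearance;
    MaxClearanceReg = C.Reg;
    if (MaxClearance > Pref) // good enough, stop scanning
      break;
  }
  return MaxClearanceReg;
}

The early break mirrors the loop in the patch: once a candidate's clearance exceeds Pref there is no benefit in scanning further, so the raised undef-reg-clearance default directly controls how far the pass searches for a quiet register. The test updates below reflect the result: the converts now read a register with no recent def (e.g. %xmm2, %xmm3) instead of %xmm0, and many of the vxorps dependency-breaking instructions disappear.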
Index: llvm/trunk/test/CodeGen/X86/avx512-cvt.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/avx512-cvt.ll +++ llvm/trunk/test/CodeGen/X86/avx512-cvt.ll @@ -16,28 +16,27 @@ ; KNL: ## BB#0: ; KNL-NEXT: vextracti32x4 $3, %zmm0, %xmm1 ; KNL-NEXT: vpextrq $1, %xmm1, %rax -; KNL-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2 +; KNL-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2 ; KNL-NEXT: vmovq %xmm1, %rax -; KNL-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1 +; KNL-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1 ; KNL-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; KNL-NEXT: vextracti32x4 $2, %zmm0, %xmm2 ; KNL-NEXT: vpextrq $1, %xmm2, %rax -; KNL-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm3 +; KNL-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm3 ; KNL-NEXT: vmovq %xmm2, %rax -; KNL-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2 +; KNL-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm2 ; KNL-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; KNL-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; KNL-NEXT: vextracti32x4 $1, %zmm0, %xmm2 ; KNL-NEXT: vpextrq $1, %xmm2, %rax -; KNL-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm3 +; KNL-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm3 ; KNL-NEXT: vmovq %xmm2, %rax -; KNL-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2 +; KNL-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm2 ; KNL-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; KNL-NEXT: vpextrq $1, %xmm0, %rax -; KNL-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm3 +; KNL-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm3 ; KNL-NEXT: vmovq %xmm0, %rax -; KNL-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; KNL-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm0 +; KNL-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm0 ; KNL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; KNL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; KNL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 @@ -56,15 +55,14 @@ ; KNL: ## BB#0: ; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1 ; KNL-NEXT: vpextrq $1, %xmm1, %rax -; KNL-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2 +; KNL-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2 ; KNL-NEXT: vmovq %xmm1, %rax -; KNL-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1 +; KNL-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1 ; KNL-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; KNL-NEXT: vpextrq $1, %xmm0, %rax -; KNL-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2 +; KNL-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm2 ; KNL-NEXT: vmovq %xmm0, %rax -; KNL-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; KNL-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm0 +; KNL-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm0 ; KNL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; KNL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; KNL-NEXT: retq @@ -81,12 +79,11 @@ ; KNL-LABEL: sltof2f32: ; KNL: ## BB#0: ; KNL-NEXT: vpextrq $1, %xmm0, %rax -; KNL-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1 +; KNL-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 ; KNL-NEXT: vmovq %xmm0, %rax -; KNL-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; KNL-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0 +; KNL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0 ; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] -; KNL-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1 +; KNL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1 ; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] ; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] ; KNL-NEXT: retq @@ -105,17 +102,16 @@ ; KNL: ## BB#0: ; KNL-NEXT: vmovdqu (%rdi), %ymm0 ; KNL-NEXT: vpextrq $1, %xmm0, %rax -; KNL-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1 +; KNL-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 ; KNL-NEXT: vmovq %xmm0, %rax -; KNL-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 +; KNL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 ; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0 ; KNL-NEXT: vmovq 
%xmm0, %rax -; KNL-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 +; KNL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 ; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] ; KNL-NEXT: vpextrq $1, %xmm0, %rax -; KNL-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; KNL-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0 +; KNL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 ; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; KNL-NEXT: retq ; @@ -186,17 +182,16 @@ ; KNL-LABEL: sltof432: ; KNL: ## BB#0: ; KNL-NEXT: vpextrq $1, %xmm0, %rax -; KNL-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1 +; KNL-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 ; KNL-NEXT: vmovq %xmm0, %rax -; KNL-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 +; KNL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 ; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0 ; KNL-NEXT: vmovq %xmm0, %rax -; KNL-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 +; KNL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 ; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] ; KNL-NEXT: vpextrq $1, %xmm0, %rax -; KNL-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; KNL-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0 +; KNL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 ; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; KNL-NEXT: retq ; @@ -884,12 +879,11 @@ ; KNL-NEXT: movl $-1, %eax ; KNL-NEXT: movl $0, %edx ; KNL-NEXT: cmovnel %eax, %edx -; KNL-NEXT: vcvtsi2ssl %edx, %xmm0, %xmm1 +; KNL-NEXT: vcvtsi2ssl %edx, %xmm2, %xmm1 ; KNL-NEXT: vmovq %xmm0, %rdx ; KNL-NEXT: testb $1, %dl ; KNL-NEXT: cmovnel %eax, %ecx -; KNL-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; KNL-NEXT: vcvtsi2ssl %ecx, %xmm0, %xmm0 +; KNL-NEXT: vcvtsi2ssl %ecx, %xmm2, %xmm0 ; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] ; KNL-NEXT: retq ; @@ -1091,11 +1085,10 @@ ; KNL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; KNL-NEXT: vpextrq $1, %xmm0, %rax ; KNL-NEXT: andl $1, %eax -; KNL-NEXT: vcvtsi2ssl %eax, %xmm0, %xmm1 +; KNL-NEXT: vcvtsi2ssl %eax, %xmm2, %xmm1 ; KNL-NEXT: vmovq %xmm0, %rax ; KNL-NEXT: andl $1, %eax -; KNL-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; KNL-NEXT: vcvtsi2ssl %eax, %xmm0, %xmm0 +; KNL-NEXT: vcvtsi2ssl %eax, %xmm2, %xmm0 ; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] ; KNL-NEXT: retq ; Index: llvm/trunk/test/CodeGen/X86/break-false-dep.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/break-false-dep.ll +++ llvm/trunk/test/CodeGen/X86/break-false-dep.ll @@ -126,6 +126,7 @@ %i = phi i64 [ 1, %entry ], [ %inc, %loop ] %s1 = phi i64 [ %vx, %entry ], [ %s2, %loop ] %fi = sitofp i64 %i to double + tail call void asm sideeffect "", "~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{dirflag},~{fpsr},~{flags}"() %vy = load double, double* %y %fipy = fadd double %fi, %vy %iipy = fptosi double %fipy to i64 @@ -174,6 +175,7 @@ store double %mul11, double* %arrayidx13, align 8 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 %exitcond = icmp eq i64 %indvars.iv.next, 1024 + tail call void asm sideeffect "", "~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{dirflag},~{fpsr},~{flags}"() br i1 %exitcond, label %for.inc14, label %for.body3 for.inc14: ; preds = %for.body3 @@ -193,7 +195,7 @@ ;SSE-NEXT: movsd [[XMM0]], ;AVX-LABEL:@loopdep3 ;AVX: vxorps [[XMM0:%xmm[0-9]+]], [[XMM0]] -;AVX-NEXT: vcvtsi2sdl {{.*}}, [[XMM0]], [[XMM0]] +;AVX-NEXT: vcvtsi2sdl {{.*}}, [[XMM0]], {{%xmm[0-9]+}} ;AVX-NEXT: vmulsd {{.*}}, 
[[XMM0]], [[XMM0]] ;AVX-NEXT: vmulsd {{.*}}, [[XMM0]], [[XMM0]] ;AVX-NEXT: vmulsd {{.*}}, [[XMM0]], [[XMM0]] @@ -202,10 +204,76 @@ define double @inlineasmdep(i64 %arg) { top: - tail call void asm sideeffect "", "~{xmm0},~{dirflag},~{fpsr},~{flags}"() + tail call void asm sideeffect "", "~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{dirflag},~{fpsr},~{flags}"() + tail call void asm sideeffect "", "~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{dirflag},~{fpsr},~{flags}"() + tail call void asm sideeffect "", "~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{dirflag},~{fpsr},~{flags}"() + tail call void asm sideeffect "", "~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{dirflag},~{fpsr},~{flags}"() %tmp1 = sitofp i64 %arg to double ret double %tmp1 ;AVX-LABEL:@inlineasmdep ;AVX: vxorps [[XMM0:%xmm[0-9]+]], [[XMM0]], [[XMM0]] ;AVX-NEXT: vcvtsi2sdq {{.*}}, [[XMM0]], {{%xmm[0-9]+}} } + +; Make sure we are making a smart choice regarding undef registers and +; hiding the false dependency behind a true dependency +define double @truedeps(float %arg) { +top: + tail call void asm sideeffect "", "~{xmm6},~{dirflag},~{fpsr},~{flags}"() + tail call void asm sideeffect "", "~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{dirflag},~{fpsr},~{flags}"() + tail call void asm sideeffect "", "~{xmm4},~{xmm5},~{xmm7},~{dirflag},~{fpsr},~{flags}"() + tail call void asm sideeffect "", "~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{dirflag},~{fpsr},~{flags}"() + tail call void asm sideeffect "", "~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{dirflag},~{fpsr},~{flags}"() + %tmp1 = fpext float %arg to double + ret double %tmp1 +;AVX-LABEL:@truedeps +;AVX-NOT: vxorps +;AVX: vcvtss2sd [[XMM0:%xmm[0-9]+]], [[XMM0]], {{%xmm[0-9]+}} +} + +; Make sure we are making a smart choice regarding undef registers and +; choosing the register with the highest clearence +define double @clearence(i64 %arg) { +top: + tail call void asm sideeffect "", "~{xmm6},~{dirflag},~{fpsr},~{flags}"() + tail call void asm sideeffect "", "~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{dirflag},~{fpsr},~{flags}"() + tail call void asm sideeffect "", "~{xmm4},~{xmm5},~{xmm7},~{dirflag},~{fpsr},~{flags}"() + tail call void asm sideeffect "", "~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{dirflag},~{fpsr},~{flags}"() + tail call void asm sideeffect "", "~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{dirflag},~{fpsr},~{flags}"() + %tmp1 = sitofp i64 %arg to double + ret double %tmp1 +;AVX-LABEL:@clearence +;AVX: vxorps [[XMM6:%xmm6]], [[XMM6]], [[XMM6]] +;AVX-NEXT: vcvtsi2sdq {{.*}}, [[XMM6]], {{%xmm[0-9]+}} +} + +; Make sure we are making a smart choice regarding undef registers in order to +; avoid a cyclic dependence on a write to the same register in a previous +; iteration, especially when we cannot zero out the undef register because it +; is alive. 
+define i64 @loopclearence(i64* nocapture %x, double* nocapture %y) nounwind { +entry: + %vx = load i64, i64* %x + br label %loop +loop: + %i = phi i64 [ 1, %entry ], [ %inc, %loop ] + %s1 = phi i64 [ %vx, %entry ], [ %s2, %loop ] + %fi = sitofp i64 %i to double + tail call void asm sideeffect "", "~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{dirflag},~{fpsr},~{flags}"() + tail call void asm sideeffect "", "~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{dirflag},~{fpsr},~{flags}"() + tail call void asm sideeffect "", "~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{dirflag},~{fpsr},~{flags}"() + %vy = load double, double* %y + %fipy = fadd double %fi, %vy + %iipy = fptosi double %fipy to i64 + %s2 = add i64 %s1, %iipy + %inc = add nsw i64 %i, 1 + %exitcond = icmp eq i64 %inc, 156250000 + br i1 %exitcond, label %ret, label %loop +ret: + ret i64 %s2 +;AVX-LABEL:@loopclearence +;Registers 4-7 are not used and therefore one of them should be chosen +;AVX-NOT: {{%xmm[4-7]}} +;AVX: vcvtsi2sdq {{.*}}, [[XMM4_7:%xmm[4-7]]], {{%xmm[0-9]+}} +;AVX-NOT: [[XMM4_7]] +} Index: llvm/trunk/test/CodeGen/X86/copy-propagation.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/copy-propagation.ll +++ llvm/trunk/test/CodeGen/X86/copy-propagation.ll @@ -26,7 +26,7 @@ ; Copy the result in a temporary. ; Note: Technically the regalloc could have been smarter and this move not required, ; which would have hidden the bug. -; CHECK-NEXT: vmovapd %xmm0, [[TMP:%xmm[0-9]+]] +; CHECK: vmovapd %xmm0, [[TMP:%xmm[0-9]+]] ; Crush xmm0. ; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; CHECK: movl $339772768, %e[[INDIRECT_CALL2:[a-z]+]] @@ -37,6 +37,7 @@ define double @foo(i64 %arg) { top: %tmp = call double inttoptr (i64 339752784 to double (double, double)*)(double 1.000000e+00, double 0.000000e+00) + tail call void asm sideeffect "", "x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{dirflag},~{fpsr},~{flags}"(double %tmp) %tmp1 = sitofp i64 %arg to double call void inttoptr (i64 339772768 to void (double, double)*)(double %tmp, double %tmp1) %tmp3 = fadd double %tmp1, %tmp Index: llvm/trunk/test/CodeGen/X86/half.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/half.ll +++ llvm/trunk/test/CodeGen/X86/half.ll @@ -299,7 +299,7 @@ ; CHECK-F16C-NEXT: movswl (%rsi), %eax ; CHECK-F16C-NEXT: vmovd %eax, %xmm0 ; CHECK-F16C-NEXT: vcvtph2ps %xmm0, %xmm0 -; CHECK-F16C-NEXT: vcvtsi2ssl %edi, %xmm0, %xmm1 +; CHECK-F16C-NEXT: vcvtsi2ssl %edi, %xmm1, %xmm1 ; CHECK-F16C-NEXT: vcvtps2ph $4, %xmm1, %xmm1 ; CHECK-F16C-NEXT: vcvtph2ps %xmm1, %xmm1 ; CHECK-F16C-NEXT: vaddss %xmm1, %xmm0, %xmm0 Index: llvm/trunk/test/CodeGen/X86/sse-fsignum.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/sse-fsignum.ll +++ llvm/trunk/test/CodeGen/X86/sse-fsignum.ll @@ -39,16 +39,15 @@ ; AVX1-NEXT: vxorpd %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vcmpltpd %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vpextrq $1, %xmm2, %rax -; AVX1-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm3 +; AVX1-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm3 ; AVX1-NEXT: vmovq %xmm2, %rax -; AVX1-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2 +; AVX1-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm2 ; AVX1-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; AVX1-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1 +; AVX1-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm1 ; AVX1-NEXT: vmovq %xmm0, %rax -; 
AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm0 +; AVX1-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm0 ; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX1-NEXT: vsubpd %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: vmovapd %xmm0, (%rdi) @@ -60,16 +59,15 @@ ; AVX2-NEXT: vxorpd %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vcmpltpd %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vpextrq $1, %xmm2, %rax -; AVX2-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm3 +; AVX2-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm3 ; AVX2-NEXT: vmovq %xmm2, %rax -; AVX2-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2 +; AVX2-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm2 ; AVX2-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; AVX2-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpextrq $1, %xmm0, %rax -; AVX2-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1 +; AVX2-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm1 ; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm0 +; AVX2-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm0 ; AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX2-NEXT: vsubpd %xmm0, %xmm2, %xmm0 ; AVX2-NEXT: vmovapd %xmm0, (%rdi) Index: llvm/trunk/test/CodeGen/X86/vec_int_to_fp.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/vec_int_to_fp.ll +++ llvm/trunk/test/CodeGen/X86/vec_int_to_fp.ll @@ -28,10 +28,9 @@ ; AVX-LABEL: sitofp_2i64_to_2f64: ; AVX: # BB#0: ; AVX-NEXT: vpextrq $1, %xmm0, %rax -; AVX-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1 +; AVX-NEXT: vcvtsi2sdq %rax, %xmm1, %xmm1 ; AVX-NEXT: vmovq %xmm0, %rax -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm0 +; AVX-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm0 ; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX-NEXT: retq %cvt = sitofp <2 x i64> %a to <2 x double> @@ -209,15 +208,14 @@ ; AVX1: # BB#0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpextrq $1, %xmm1, %rax -; AVX1-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2 +; AVX1-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2 ; AVX1-NEXT: vmovq %xmm1, %rax -; AVX1-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1 +; AVX1-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1 ; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2 +; AVX1-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm2 ; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm0 +; AVX1-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm0 ; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq @@ -226,15 +224,14 @@ ; AVX2: # BB#0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpextrq $1, %xmm1, %rax -; AVX2-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2 +; AVX2-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2 ; AVX2-NEXT: vmovq %xmm1, %rax -; AVX2-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1 +; AVX2-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1 ; AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX2-NEXT: vpextrq $1, %xmm0, %rax -; AVX2-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2 +; AVX2-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm2 ; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm0 +; AVX2-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm0 ; AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: retq @@ -243,15 +240,14 @@ ; AVX512: # BB#0: ; AVX512-NEXT: vextracti32x4 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpextrq $1, %xmm1, %rax -; AVX512-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2 +; AVX512-NEXT: 
vcvtsi2sdq %rax, %xmm2, %xmm2 ; AVX512-NEXT: vmovq %xmm1, %rax -; AVX512-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1 +; AVX512-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1 ; AVX512-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX512-NEXT: vpextrq $1, %xmm0, %rax -; AVX512-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2 +; AVX512-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm2 ; AVX512-NEXT: vmovq %xmm0, %rax -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm0 +; AVX512-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm0 ; AVX512-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; AVX512-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm0 ; AVX512-NEXT: retq @@ -941,12 +937,11 @@ ; AVX-LABEL: sitofp_2i64_to_4f32: ; AVX: # BB#0: ; AVX-NEXT: vpextrq $1, %xmm0, %rax -; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1 +; AVX-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 ; AVX-NEXT: vmovq %xmm0, %rax -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0 +; AVX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] -; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1 +; AVX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] ; AVX-NEXT: retq @@ -974,12 +969,11 @@ ; AVX-LABEL: sitofp_4i64_to_4f32_undef: ; AVX: # BB#0: ; AVX-NEXT: vpextrq $1, %xmm0, %rax -; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1 +; AVX-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 ; AVX-NEXT: vmovq %xmm0, %rax -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0 +; AVX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] -; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1 +; AVX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] ; AVX-NEXT: retq @@ -1140,17 +1134,16 @@ ; AVX1-LABEL: sitofp_4i64_to_4f32: ; AVX1: # BB#0: ; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 ; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] ; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -1158,17 +1151,16 @@ ; AVX2-LABEL: sitofp_4i64_to_4f32: ; AVX2: # BB#0: ; AVX2-NEXT: vpextrq $1, %xmm0, %rax -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 ; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] ; AVX2-NEXT: vpextrq $1, %xmm0, %rax -; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0 +; AVX2-NEXT: 
vcvtsi2ssq %rax, %xmm3, %xmm0 ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -1176,17 +1168,16 @@ ; AVX512-LABEL: sitofp_4i64_to_4f32: ; AVX512: # BB#0: ; AVX512-NEXT: vpextrq $1, %xmm0, %rax -; AVX512-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1 +; AVX512-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 ; AVX512-NEXT: vmovq %xmm0, %rax -; AVX512-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 +; AVX512-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 ; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; AVX512-NEXT: vextracti32x4 $1, %ymm0, %xmm0 ; AVX512-NEXT: vmovq %xmm0, %rax -; AVX512-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 +; AVX512-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 ; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] ; AVX512-NEXT: vpextrq $1, %xmm0, %rax -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0 +; AVX512-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 ; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; AVX512-NEXT: retq %cvt = sitofp <4 x i64> %a to <4 x float> @@ -1377,12 +1368,12 @@ ; VEX-NEXT: testq %rax, %rax ; VEX-NEXT: js .LBB38_1 ; VEX-NEXT: # BB#2: -; VEX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1 +; VEX-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 ; VEX-NEXT: jmp .LBB38_3 ; VEX-NEXT: .LBB38_1: ; VEX-NEXT: shrq %rax ; VEX-NEXT: orq %rax, %rcx -; VEX-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm1 +; VEX-NEXT: vcvtsi2ssq %rcx, %xmm1, %xmm1 ; VEX-NEXT: vaddss %xmm1, %xmm1, %xmm1 ; VEX-NEXT: .LBB38_3: ; VEX-NEXT: vmovq %xmm0, %rax @@ -1391,14 +1382,12 @@ ; VEX-NEXT: testq %rax, %rax ; VEX-NEXT: js .LBB38_4 ; VEX-NEXT: # BB#5: -; VEX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; VEX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0 +; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0 ; VEX-NEXT: jmp .LBB38_6 ; VEX-NEXT: .LBB38_4: ; VEX-NEXT: shrq %rax ; VEX-NEXT: orq %rax, %rcx -; VEX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; VEX-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm0 +; VEX-NEXT: vcvtsi2ssq %rcx, %xmm2, %xmm0 ; VEX-NEXT: vaddss %xmm0, %xmm0, %xmm0 ; VEX-NEXT: .LBB38_6: ; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] @@ -1406,7 +1395,7 @@ ; VEX-NEXT: testq %rax, %rax ; VEX-NEXT: js .LBB38_8 ; VEX-NEXT: # BB#7: -; VEX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1 +; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1 ; VEX-NEXT: .LBB38_8: ; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] ; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] @@ -1485,12 +1474,12 @@ ; VEX-NEXT: testq %rax, %rax ; VEX-NEXT: js .LBB39_1 ; VEX-NEXT: # BB#2: -; VEX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1 +; VEX-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 ; VEX-NEXT: jmp .LBB39_3 ; VEX-NEXT: .LBB39_1: ; VEX-NEXT: shrq %rax ; VEX-NEXT: orq %rax, %rcx -; VEX-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm1 +; VEX-NEXT: vcvtsi2ssq %rcx, %xmm1, %xmm1 ; VEX-NEXT: vaddss %xmm1, %xmm1, %xmm1 ; VEX-NEXT: .LBB39_3: ; VEX-NEXT: vmovq %xmm0, %rax @@ -1499,14 +1488,12 @@ ; VEX-NEXT: testq %rax, %rax ; VEX-NEXT: js .LBB39_4 ; VEX-NEXT: # BB#5: -; VEX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; VEX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0 +; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0 ; VEX-NEXT: jmp .LBB39_6 ; VEX-NEXT: .LBB39_4: ; VEX-NEXT: shrq %rax ; VEX-NEXT: orq %rax, %rcx -; VEX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; VEX-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm0 +; VEX-NEXT: vcvtsi2ssq %rcx, %xmm2, %xmm0 ; VEX-NEXT: vaddss %xmm0, %xmm0, %xmm0 ; VEX-NEXT: .LBB39_6: ; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] @@ -1514,7 +1501,7 @@ ; VEX-NEXT: testq %rax, %rax ; VEX-NEXT: js .LBB39_8 ; VEX-NEXT: # BB#7: -; VEX-NEXT: 
vcvtsi2ssq %rax, %xmm0, %xmm1 +; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1 ; VEX-NEXT: .LBB39_8: ; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] ; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] @@ -1782,12 +1769,12 @@ ; AVX1-NEXT: testq %rax, %rax ; AVX1-NEXT: js .LBB45_1 ; AVX1-NEXT: # BB#2: -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 ; AVX1-NEXT: jmp .LBB45_3 ; AVX1-NEXT: .LBB45_1: ; AVX1-NEXT: shrq %rax ; AVX1-NEXT: orq %rax, %rcx -; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm1 +; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm1, %xmm1 ; AVX1-NEXT: vaddss %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: .LBB45_3: ; AVX1-NEXT: vmovq %xmm0, %rax @@ -1796,12 +1783,12 @@ ; AVX1-NEXT: testq %rax, %rax ; AVX1-NEXT: js .LBB45_4 ; AVX1-NEXT: # BB#5: -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 ; AVX1-NEXT: jmp .LBB45_6 ; AVX1-NEXT: .LBB45_4: ; AVX1-NEXT: shrq %rax ; AVX1-NEXT: orq %rax, %rcx -; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2 +; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm2, %xmm2 ; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: .LBB45_6: ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] @@ -1812,12 +1799,12 @@ ; AVX1-NEXT: testq %rax, %rax ; AVX1-NEXT: js .LBB45_7 ; AVX1-NEXT: # BB#8: -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 ; AVX1-NEXT: jmp .LBB45_9 ; AVX1-NEXT: .LBB45_7: ; AVX1-NEXT: shrq %rax ; AVX1-NEXT: orq %rax, %rcx -; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2 +; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm3, %xmm2 ; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: .LBB45_9: ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] @@ -1827,16 +1814,14 @@ ; AVX1-NEXT: testq %rax, %rax ; AVX1-NEXT: js .LBB45_10 ; AVX1-NEXT: # BB#11: -; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; AVX1-NEXT: .LBB45_10: ; AVX1-NEXT: shrq %rax ; AVX1-NEXT: orq %rax, %rcx -; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm0 +; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm3, %xmm0 ; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; AVX1-NEXT: vzeroupper @@ -1850,12 +1835,12 @@ ; AVX2-NEXT: testq %rax, %rax ; AVX2-NEXT: js .LBB45_1 ; AVX2-NEXT: # BB#2: -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 ; AVX2-NEXT: jmp .LBB45_3 ; AVX2-NEXT: .LBB45_1: ; AVX2-NEXT: shrq %rax ; AVX2-NEXT: orq %rax, %rcx -; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm1 +; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm1, %xmm1 ; AVX2-NEXT: vaddss %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: .LBB45_3: ; AVX2-NEXT: vmovq %xmm0, %rax @@ -1864,12 +1849,12 @@ ; AVX2-NEXT: testq %rax, %rax ; AVX2-NEXT: js .LBB45_4 ; AVX2-NEXT: # BB#5: -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 ; AVX2-NEXT: jmp .LBB45_6 ; AVX2-NEXT: .LBB45_4: ; AVX2-NEXT: shrq %rax ; AVX2-NEXT: orq %rax, %rcx -; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2 +; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm2, %xmm2 ; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: .LBB45_6: ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] @@ -1880,12 +1865,12 @@ ; AVX2-NEXT: testq %rax, %rax ; AVX2-NEXT: js .LBB45_7 ; AVX2-NEXT: # BB#8: -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 ; AVX2-NEXT: jmp .LBB45_9 ; AVX2-NEXT: 
.LBB45_7: ; AVX2-NEXT: shrq %rax ; AVX2-NEXT: orq %rax, %rcx -; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2 +; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm3, %xmm2 ; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: .LBB45_9: ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] @@ -1895,16 +1880,14 @@ ; AVX2-NEXT: testq %rax, %rax ; AVX2-NEXT: js .LBB45_10 ; AVX2-NEXT: # BB#11: -; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; AVX2-NEXT: .LBB45_10: ; AVX2-NEXT: shrq %rax ; AVX2-NEXT: orq %rax, %rcx -; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm0 +; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm3, %xmm0 ; AVX2-NEXT: vaddss %xmm0, %xmm0, %xmm0 ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; AVX2-NEXT: vzeroupper @@ -2118,10 +2101,9 @@ ; VEX: # BB#0: ; VEX-NEXT: vmovdqa (%rdi), %xmm0 ; VEX-NEXT: vpextrq $1, %xmm0, %rax -; VEX-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1 +; VEX-NEXT: vcvtsi2sdq %rax, %xmm1, %xmm1 ; VEX-NEXT: vmovq %xmm0, %rax -; VEX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; VEX-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm0 +; VEX-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm0 ; VEX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; VEX-NEXT: retq ; @@ -2129,10 +2111,9 @@ ; AVX512: # BB#0: ; AVX512-NEXT: vmovdqa64 (%rdi), %xmm0 ; AVX512-NEXT: vpextrq $1, %xmm0, %rax -; AVX512-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1 +; AVX512-NEXT: vcvtsi2sdq %rax, %xmm1, %xmm1 ; AVX512-NEXT: vmovq %xmm0, %rax -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm0 +; AVX512-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm0 ; AVX512-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512-NEXT: retq %ld = load <2 x i64>, <2 x i64> *%a @@ -2231,15 +2212,14 @@ ; AVX1-NEXT: vmovaps (%rdi), %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpextrq $1, %xmm1, %rax -; AVX1-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2 +; AVX1-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2 ; AVX1-NEXT: vmovq %xmm1, %rax -; AVX1-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1 +; AVX1-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1 ; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2 +; AVX1-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm2 ; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm0 +; AVX1-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm0 ; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq @@ -2249,15 +2229,14 @@ ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpextrq $1, %xmm1, %rax -; AVX2-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2 +; AVX2-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2 ; AVX2-NEXT: vmovq %xmm1, %rax -; AVX2-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1 +; AVX2-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1 ; AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX2-NEXT: vpextrq $1, %xmm0, %rax -; AVX2-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2 +; AVX2-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm2 ; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm0 +; AVX2-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm0 ; AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: retq @@ -2267,15 +2246,14 @@ ; AVX512-NEXT: vmovdqa64 (%rdi), %ymm0 ; AVX512-NEXT: 
vextracti32x4 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpextrq $1, %xmm1, %rax -; AVX512-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2 +; AVX512-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2 ; AVX512-NEXT: vmovq %xmm1, %rax -; AVX512-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1 +; AVX512-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1 ; AVX512-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX512-NEXT: vpextrq $1, %xmm0, %rax -; AVX512-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2 +; AVX512-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm2 ; AVX512-NEXT: vmovq %xmm0, %rax -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm0 +; AVX512-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm0 ; AVX512-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; AVX512-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm0 ; AVX512-NEXT: retq @@ -2756,17 +2734,16 @@ ; AVX1: # BB#0: ; AVX1-NEXT: vmovdqa (%rdi), %ymm0 ; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 ; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] ; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -2775,17 +2752,16 @@ ; AVX2: # BB#0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vpextrq $1, %xmm0, %rax -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 ; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] ; AVX2-NEXT: vpextrq $1, %xmm0, %rax -; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -2794,17 +2770,16 @@ ; AVX512: # BB#0: ; AVX512-NEXT: vmovdqa64 (%rdi), %ymm0 ; AVX512-NEXT: vpextrq $1, %xmm0, %rax -; AVX512-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1 +; AVX512-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 ; AVX512-NEXT: vmovq %xmm0, %rax -; AVX512-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 +; AVX512-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 ; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; AVX512-NEXT: vextracti32x4 $1, %ymm0, %xmm0 ; AVX512-NEXT: vmovq %xmm0, %rax -; AVX512-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 +; AVX512-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 ; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] ; AVX512-NEXT: vpextrq $1, %xmm0, %rax -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0 +; AVX512-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 ; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; AVX512-NEXT: retq %ld = load <4 x i64>, <4 x i64> *%a @@ -2912,29 +2887,28 @@ ; AVX1-NEXT: vmovdqa (%rdi), %ymm0 ; AVX1-NEXT: vmovdqa 32(%rdi), 
%ymm1 ; AVX1-NEXT: vpextrq $1, %xmm1, %rax -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 ; AVX1-NEXT: vmovq %xmm1, %rax -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3 ; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vmovq %xmm1, %rax -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3 ; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] ; AVX1-NEXT: vpextrq $1, %xmm1, %rax -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm1 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0] ; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm2 ; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3 ; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3 ; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] ; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm0 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq @@ -2944,29 +2918,28 @@ ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX2-NEXT: vpextrq $1, %xmm1, %rax -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 ; AVX2-NEXT: vmovq %xmm1, %rax -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3 ; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX2-NEXT: vmovq %xmm1, %rax -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3 ; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] ; AVX2-NEXT: vpextrq $1, %xmm1, %rax -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm1 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0] ; AVX2-NEXT: vpextrq $1, %xmm0, %rax -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm2 ; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3 ; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3 ; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] ; AVX2-NEXT: vpextrq $1, %xmm0, %rax -; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm0 ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: retq @@ -2976,29 +2949,28 @@ ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm1 ; AVX512-NEXT: vpextrq $1, %xmm1, %rax -; AVX512-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 +; AVX512-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 ; AVX512-NEXT: vmovq %xmm1, %rax -; AVX512-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1 +; AVX512-NEXT: vcvtsi2ssq %rax, 
%xmm3, %xmm1 ; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3] ; AVX512-NEXT: vextracti32x4 $3, %zmm0, %xmm2 ; AVX512-NEXT: vmovq %xmm2, %rax -; AVX512-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3 +; AVX512-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3 ; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] ; AVX512-NEXT: vpextrq $1, %xmm2, %rax -; AVX512-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 +; AVX512-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm2 ; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] ; AVX512-NEXT: vpextrq $1, %xmm0, %rax -; AVX512-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 +; AVX512-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm2 ; AVX512-NEXT: vmovq %xmm0, %rax -; AVX512-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3 +; AVX512-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3 ; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] ; AVX512-NEXT: vextracti32x4 $1, %zmm0, %xmm0 ; AVX512-NEXT: vmovq %xmm0, %rax -; AVX512-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3 +; AVX512-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3 ; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] ; AVX512-NEXT: vpextrq $1, %xmm0, %rax -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0 +; AVX512-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm0 ; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] ; AVX512-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm0 ; AVX512-NEXT: retq @@ -3186,12 +3158,12 @@ ; AVX1-NEXT: testq %rax, %rax ; AVX1-NEXT: js .LBB74_1 ; AVX1-NEXT: # BB#2: -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 ; AVX1-NEXT: jmp .LBB74_3 ; AVX1-NEXT: .LBB74_1: ; AVX1-NEXT: shrq %rax ; AVX1-NEXT: orq %rax, %rcx -; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm1 +; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm1, %xmm1 ; AVX1-NEXT: vaddss %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: .LBB74_3: ; AVX1-NEXT: vmovq %xmm0, %rax @@ -3200,12 +3172,12 @@ ; AVX1-NEXT: testq %rax, %rax ; AVX1-NEXT: js .LBB74_4 ; AVX1-NEXT: # BB#5: -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 ; AVX1-NEXT: jmp .LBB74_6 ; AVX1-NEXT: .LBB74_4: ; AVX1-NEXT: shrq %rax ; AVX1-NEXT: orq %rax, %rcx -; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2 +; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm2, %xmm2 ; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: .LBB74_6: ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] @@ -3216,12 +3188,12 @@ ; AVX1-NEXT: testq %rax, %rax ; AVX1-NEXT: js .LBB74_7 ; AVX1-NEXT: # BB#8: -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 ; AVX1-NEXT: jmp .LBB74_9 ; AVX1-NEXT: .LBB74_7: ; AVX1-NEXT: shrq %rax ; AVX1-NEXT: orq %rax, %rcx -; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2 +; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm3, %xmm2 ; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: .LBB74_9: ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] @@ -3231,16 +3203,14 @@ ; AVX1-NEXT: testq %rax, %rax ; AVX1-NEXT: js .LBB74_10 ; AVX1-NEXT: # BB#11: -; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; AVX1-NEXT: .LBB74_10: ; AVX1-NEXT: shrq %rax ; AVX1-NEXT: orq %rax, %rcx -; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm0 +; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm3, %xmm0 ; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; AVX1-NEXT: vzeroupper @@ -3255,12 +3225,12 @@ ; AVX2-NEXT: 
testq %rax, %rax ; AVX2-NEXT: js .LBB74_1 ; AVX2-NEXT: # BB#2: -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 ; AVX2-NEXT: jmp .LBB74_3 ; AVX2-NEXT: .LBB74_1: ; AVX2-NEXT: shrq %rax ; AVX2-NEXT: orq %rax, %rcx -; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm1 +; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm1, %xmm1 ; AVX2-NEXT: vaddss %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: .LBB74_3: ; AVX2-NEXT: vmovq %xmm0, %rax @@ -3269,12 +3239,12 @@ ; AVX2-NEXT: testq %rax, %rax ; AVX2-NEXT: js .LBB74_4 ; AVX2-NEXT: # BB#5: -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 ; AVX2-NEXT: jmp .LBB74_6 ; AVX2-NEXT: .LBB74_4: ; AVX2-NEXT: shrq %rax ; AVX2-NEXT: orq %rax, %rcx -; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2 +; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm2, %xmm2 ; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: .LBB74_6: ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] @@ -3285,12 +3255,12 @@ ; AVX2-NEXT: testq %rax, %rax ; AVX2-NEXT: js .LBB74_7 ; AVX2-NEXT: # BB#8: -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 ; AVX2-NEXT: jmp .LBB74_9 ; AVX2-NEXT: .LBB74_7: ; AVX2-NEXT: shrq %rax ; AVX2-NEXT: orq %rax, %rcx -; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2 +; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm3, %xmm2 ; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: .LBB74_9: ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] @@ -3300,16 +3270,14 @@ ; AVX2-NEXT: testq %rax, %rax ; AVX2-NEXT: js .LBB74_10 ; AVX2-NEXT: # BB#11: -; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; AVX2-NEXT: .LBB74_10: ; AVX2-NEXT: shrq %rax ; AVX2-NEXT: orq %rax, %rcx -; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm0 +; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm3, %xmm0 ; AVX2-NEXT: vaddss %xmm0, %xmm0, %xmm0 ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; AVX2-NEXT: vzeroupper @@ -3581,12 +3549,12 @@ ; AVX1-NEXT: testq %rax, %rax ; AVX1-NEXT: js .LBB78_1 ; AVX1-NEXT: # BB#2: -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 ; AVX1-NEXT: jmp .LBB78_3 ; AVX1-NEXT: .LBB78_1: ; AVX1-NEXT: shrq %rax ; AVX1-NEXT: orq %rax, %rcx -; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm1 +; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm1, %xmm1 ; AVX1-NEXT: vaddss %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: .LBB78_3: ; AVX1-NEXT: vmovq %xmm2, %rax @@ -3595,12 +3563,12 @@ ; AVX1-NEXT: testq %rax, %rax ; AVX1-NEXT: js .LBB78_4 ; AVX1-NEXT: # BB#5: -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3 ; AVX1-NEXT: jmp .LBB78_6 ; AVX1-NEXT: .LBB78_4: ; AVX1-NEXT: shrq %rax ; AVX1-NEXT: orq %rax, %rcx -; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm3 +; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm3, %xmm3 ; AVX1-NEXT: vaddss %xmm3, %xmm3, %xmm3 ; AVX1-NEXT: .LBB78_6: ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 @@ -3610,12 +3578,12 @@ ; AVX1-NEXT: testq %rax, %rax ; AVX1-NEXT: js .LBB78_7 ; AVX1-NEXT: # BB#8: -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm4 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm4 ; AVX1-NEXT: jmp .LBB78_9 ; AVX1-NEXT: .LBB78_7: ; AVX1-NEXT: shrq %rax ; AVX1-NEXT: orq %rax, %rcx -; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm4 +; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm4, %xmm4 ; AVX1-NEXT: vaddss %xmm4, %xmm4, %xmm4 ; AVX1-NEXT: .LBB78_9: ; AVX1-NEXT: vpextrq $1, %xmm2, %rax @@ -3624,12 +3592,12 @@ ; 
AVX1-NEXT: testq %rax, %rax ; AVX1-NEXT: js .LBB78_10 ; AVX1-NEXT: # BB#11: -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm2 ; AVX1-NEXT: jmp .LBB78_12 ; AVX1-NEXT: .LBB78_10: ; AVX1-NEXT: shrq %rax ; AVX1-NEXT: orq %rax, %rcx -; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2 +; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm5, %xmm2 ; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: .LBB78_12: ; AVX1-NEXT: vpextrq $1, %xmm0, %rax @@ -3638,12 +3606,12 @@ ; AVX1-NEXT: testq %rax, %rax ; AVX1-NEXT: js .LBB78_13 ; AVX1-NEXT: # BB#14: -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm5 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm5 ; AVX1-NEXT: jmp .LBB78_15 ; AVX1-NEXT: .LBB78_13: ; AVX1-NEXT: shrq %rax ; AVX1-NEXT: orq %rax, %rcx -; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm5 +; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm5, %xmm5 ; AVX1-NEXT: vaddss %xmm5, %xmm5, %xmm5 ; AVX1-NEXT: .LBB78_15: ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[2,3] @@ -3653,12 +3621,12 @@ ; AVX1-NEXT: testq %rax, %rax ; AVX1-NEXT: js .LBB78_16 ; AVX1-NEXT: # BB#17: -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm3 ; AVX1-NEXT: jmp .LBB78_18 ; AVX1-NEXT: .LBB78_16: ; AVX1-NEXT: shrq %rax ; AVX1-NEXT: orq %rax, %rcx -; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm3 +; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm6, %xmm3 ; AVX1-NEXT: vaddss %xmm3, %xmm3, %xmm3 ; AVX1-NEXT: .LBB78_18: ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0],xmm1[3] @@ -3670,14 +3638,12 @@ ; AVX1-NEXT: testq %rax, %rax ; AVX1-NEXT: js .LBB78_19 ; AVX1-NEXT: # BB#20: -; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm5 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm5 ; AVX1-NEXT: jmp .LBB78_21 ; AVX1-NEXT: .LBB78_19: ; AVX1-NEXT: shrq %rax ; AVX1-NEXT: orq %rax, %rcx -; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm0 +; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm6, %xmm0 ; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm5 ; AVX1-NEXT: .LBB78_21: ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm2[0] @@ -3688,12 +3654,12 @@ ; AVX1-NEXT: testq %rax, %rax ; AVX1-NEXT: js .LBB78_22 ; AVX1-NEXT: # BB#23: -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm2 ; AVX1-NEXT: jmp .LBB78_24 ; AVX1-NEXT: .LBB78_22: ; AVX1-NEXT: shrq %rax ; AVX1-NEXT: orq %rax, %rcx -; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2 +; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm6, %xmm2 ; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: .LBB78_24: ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] @@ -3710,12 +3676,12 @@ ; AVX2-NEXT: testq %rax, %rax ; AVX2-NEXT: js .LBB78_1 ; AVX2-NEXT: # BB#2: -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 ; AVX2-NEXT: jmp .LBB78_3 ; AVX2-NEXT: .LBB78_1: ; AVX2-NEXT: shrq %rax ; AVX2-NEXT: orq %rax, %rcx -; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm1 +; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm1, %xmm1 ; AVX2-NEXT: vaddss %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: .LBB78_3: ; AVX2-NEXT: vmovq %xmm2, %rax @@ -3724,12 +3690,12 @@ ; AVX2-NEXT: testq %rax, %rax ; AVX2-NEXT: js .LBB78_4 ; AVX2-NEXT: # BB#5: -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3 ; AVX2-NEXT: jmp .LBB78_6 ; AVX2-NEXT: .LBB78_4: ; AVX2-NEXT: shrq %rax ; AVX2-NEXT: orq %rax, %rcx -; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm3 +; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm3, %xmm3 ; AVX2-NEXT: vaddss %xmm3, %xmm3, %xmm3 ; AVX2-NEXT: .LBB78_6: ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2 @@ -3739,12 +3705,12 @@ ; AVX2-NEXT: 
testq %rax, %rax ; AVX2-NEXT: js .LBB78_7 ; AVX2-NEXT: # BB#8: -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm4 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm4 ; AVX2-NEXT: jmp .LBB78_9 ; AVX2-NEXT: .LBB78_7: ; AVX2-NEXT: shrq %rax ; AVX2-NEXT: orq %rax, %rcx -; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm4 +; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm4, %xmm4 ; AVX2-NEXT: vaddss %xmm4, %xmm4, %xmm4 ; AVX2-NEXT: .LBB78_9: ; AVX2-NEXT: vpextrq $1, %xmm2, %rax @@ -3753,12 +3719,12 @@ ; AVX2-NEXT: testq %rax, %rax ; AVX2-NEXT: js .LBB78_10 ; AVX2-NEXT: # BB#11: -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm2 ; AVX2-NEXT: jmp .LBB78_12 ; AVX2-NEXT: .LBB78_10: ; AVX2-NEXT: shrq %rax ; AVX2-NEXT: orq %rax, %rcx -; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2 +; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm5, %xmm2 ; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: .LBB78_12: ; AVX2-NEXT: vpextrq $1, %xmm0, %rax @@ -3767,12 +3733,12 @@ ; AVX2-NEXT: testq %rax, %rax ; AVX2-NEXT: js .LBB78_13 ; AVX2-NEXT: # BB#14: -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm5 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm5 ; AVX2-NEXT: jmp .LBB78_15 ; AVX2-NEXT: .LBB78_13: ; AVX2-NEXT: shrq %rax ; AVX2-NEXT: orq %rax, %rcx -; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm5 +; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm5, %xmm5 ; AVX2-NEXT: vaddss %xmm5, %xmm5, %xmm5 ; AVX2-NEXT: .LBB78_15: ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[2,3] @@ -3782,12 +3748,12 @@ ; AVX2-NEXT: testq %rax, %rax ; AVX2-NEXT: js .LBB78_16 ; AVX2-NEXT: # BB#17: -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm3 ; AVX2-NEXT: jmp .LBB78_18 ; AVX2-NEXT: .LBB78_16: ; AVX2-NEXT: shrq %rax ; AVX2-NEXT: orq %rax, %rcx -; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm3 +; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm6, %xmm3 ; AVX2-NEXT: vaddss %xmm3, %xmm3, %xmm3 ; AVX2-NEXT: .LBB78_18: ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0],xmm1[3] @@ -3799,14 +3765,12 @@ ; AVX2-NEXT: testq %rax, %rax ; AVX2-NEXT: js .LBB78_19 ; AVX2-NEXT: # BB#20: -; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm5 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm5 ; AVX2-NEXT: jmp .LBB78_21 ; AVX2-NEXT: .LBB78_19: ; AVX2-NEXT: shrq %rax ; AVX2-NEXT: orq %rax, %rcx -; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm0 +; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm6, %xmm0 ; AVX2-NEXT: vaddss %xmm0, %xmm0, %xmm5 ; AVX2-NEXT: .LBB78_21: ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm2[0] @@ -3817,12 +3781,12 @@ ; AVX2-NEXT: testq %rax, %rax ; AVX2-NEXT: js .LBB78_22 ; AVX2-NEXT: # BB#23: -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm2 ; AVX2-NEXT: jmp .LBB78_24 ; AVX2-NEXT: .LBB78_22: ; AVX2-NEXT: shrq %rax ; AVX2-NEXT: orq %rax, %rcx -; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2 +; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm6, %xmm2 ; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: .LBB78_24: ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]