diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -32734,8 +32734,29 @@ N->getOpcode() == ISD::STRICT_FP_TO_SINT; EVT VT = N->getValueType(0); SDValue Src = N->getOperand(IsStrict ? 1 : 0); + SDValue Chain = IsStrict ? N->getOperand(0) : SDValue(); EVT SrcVT = Src.getValueType(); + SDValue Res; + if (isSoftFP16(SrcVT)) { + EVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32; + if (IsStrict) { + Res = + DAG.getNode(N->getOpcode(), dl, {VT, MVT::Other}, + {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, dl, + {NVT, MVT::Other}, {Chain, Src})}); + Chain = Res.getValue(1); + } else { + Res = DAG.getNode(N->getOpcode(), dl, VT, + DAG.getNode(ISD::FP_EXTEND, dl, NVT, Src)); + } + Results.push_back(Res); + if (IsStrict) + Results.push_back(Chain); + + return; + } + if (VT.isVector() && Subtarget.hasFP16() && SrcVT.getVectorElementType() == MVT::f16) { EVT EleVT = VT.getVectorElementType(); @@ -32749,7 +32770,6 @@ Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops); } - SDValue Res, Chain; if (IsStrict) { unsigned Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI; @@ -32941,7 +32961,6 @@ return; } - SDValue Chain; if (SDValue V = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, Chain)) { Results.push_back(V); if (IsStrict) diff --git a/llvm/test/CodeGen/X86/fpclamptosat_vec.ll b/llvm/test/CodeGen/X86/fpclamptosat_vec.ll --- a/llvm/test/CodeGen/X86/fpclamptosat_vec.ll +++ b/llvm/test/CodeGen/X86/fpclamptosat_vec.ll @@ -863,59 +863,47 @@ ; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %eax -; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %eax -; CHECK-NEXT: movd %eax, %xmm0 -; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: cvttps2dq %xmm0, %xmm0 +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %eax -; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %eax -; CHECK-NEXT: movd %eax, %xmm0 -; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: cvttps2dq %xmm0, %xmm0 +; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] -; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: psrlq $48, %xmm0 ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %eax -; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %eax -; CHECK-NEXT: movd %eax, %xmm0 -; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: cvttps2dq %xmm0, %xmm0 +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %eax -; CHECK-NEXT: movd %eax, %xmm0 -; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: psrld $16, %xmm0 ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %eax -; CHECK-NEXT: movd %eax, %xmm1 -; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; CHECK-NEXT: cvttps2dq %xmm1, %xmm0 ; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] ; CHECK-NEXT: packssdw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -940,82 +928,94 @@ ; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %rax -; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %rax -; CHECK-NEXT: movd %eax, %xmm0 -; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; CHECK-NEXT: cvttps2dq %xmm0, %xmm1 +; CHECK-NEXT: movdqa %xmm1, %xmm2 +; CHECK-NEXT: psrad $31, %xmm2 +; CHECK-NEXT: subps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cvttps2dq %xmm0, %xmm0 +; CHECK-NEXT: pand %xmm2, %xmm0 +; CHECK-NEXT: por %xmm1, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %rax -; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %rax -; CHECK-NEXT: movd %eax, %xmm0 -; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; CHECK-NEXT: cvttps2dq %xmm0, %xmm1 +; CHECK-NEXT: movdqa %xmm1, %xmm2 +; CHECK-NEXT: psrad $31, %xmm2 +; CHECK-NEXT: subps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cvttps2dq %xmm0, %xmm0 +; CHECK-NEXT: pand %xmm2, %xmm0 +; CHECK-NEXT: por %xmm1, %xmm0 ; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: psrlq $48, %xmm0 ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %rax -; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %rax -; CHECK-NEXT: movd %eax, %xmm0 -; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; CHECK-NEXT: cvttps2dq %xmm0, %xmm1 +; CHECK-NEXT: movdqa %xmm1, %xmm2 +; CHECK-NEXT: psrad $31, %xmm2 +; CHECK-NEXT: subps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cvttps2dq %xmm0, %xmm0 +; CHECK-NEXT: pand %xmm2, %xmm0 +; CHECK-NEXT: por %xmm1, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %rax -; CHECK-NEXT: movd %eax, %xmm0 -; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: psrld $16, %xmm0 ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %rax -; CHECK-NEXT: movd %eax, %xmm0 -; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; CHECK-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; CHECK-NEXT: # xmm4 = xmm4[0],mem[0] +; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; CHECK-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; CHECK-NEXT: cvttps2dq %xmm2, %xmm0 +; CHECK-NEXT: movdqa %xmm0, %xmm1 +; CHECK-NEXT: psrad $31, %xmm1 +; CHECK-NEXT: subps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-NEXT: cvttps2dq %xmm2, %xmm2 +; CHECK-NEXT: pand %xmm1, %xmm2 +; CHECK-NEXT: por %xmm0, %xmm2 +; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; CHECK-NEXT: # xmm2 = xmm2[0],mem[0] ; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648] -; CHECK-NEXT: movdqa %xmm4, %xmm2 -; CHECK-NEXT: pxor %xmm1, %xmm2 -; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [2147549183,2147549183,2147549183,2147549183] -; CHECK-NEXT: movdqa %xmm3, %xmm0 -; CHECK-NEXT: pcmpgtd %xmm2, %xmm0 -; CHECK-NEXT: pand %xmm0, %xmm4 -; CHECK-NEXT: pcmpeqd %xmm2, %xmm2 -; CHECK-NEXT: pxor %xmm2, %xmm0 -; CHECK-NEXT: por %xmm4, %xmm0 -; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; CHECK-NEXT: pxor %xmm4, %xmm1 -; CHECK-NEXT: pcmpgtd %xmm1, %xmm3 -; CHECK-NEXT: pand %xmm3, %xmm4 -; CHECK-NEXT: pxor %xmm2, %xmm3 -; CHECK-NEXT: por %xmm4, %xmm3 -; CHECK-NEXT: pslld $16, %xmm3 -; CHECK-NEXT: psrad $16, %xmm3 +; CHECK-NEXT: movdqa %xmm2, %xmm3 +; CHECK-NEXT: pxor %xmm1, %xmm3 +; CHECK-NEXT: movdqa {{.*#+}} xmm4 = [2147549183,2147549183,2147549183,2147549183] +; CHECK-NEXT: movdqa %xmm4, %xmm0 +; CHECK-NEXT: pcmpgtd %xmm3, %xmm0 +; CHECK-NEXT: pand %xmm0, %xmm2 +; CHECK-NEXT: pcmpeqd %xmm3, %xmm3 +; CHECK-NEXT: pxor %xmm3, %xmm0 +; CHECK-NEXT: por %xmm2, %xmm0 +; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; CHECK-NEXT: pxor %xmm2, %xmm1 +; CHECK-NEXT: pcmpgtd %xmm1, %xmm4 +; CHECK-NEXT: pand %xmm4, %xmm2 +; CHECK-NEXT: pxor %xmm3, %xmm4 +; CHECK-NEXT: por %xmm2, %xmm4 +; CHECK-NEXT: pslld $16, %xmm4 +; CHECK-NEXT: psrad $16, %xmm4 ; CHECK-NEXT: pslld $16, %xmm0 ; CHECK-NEXT: psrad $16, %xmm0 -; CHECK-NEXT: packssdw %xmm3, %xmm0 +; CHECK-NEXT: packssdw %xmm4, %xmm0 ; CHECK-NEXT: addq $72, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq @@ -1035,59 +1035,47 @@ ; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: psrlq $48, %xmm0 ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %eax -; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %eax -; CHECK-NEXT: movd %eax, %xmm0 -; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: cvttps2dq %xmm0, %xmm0 +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %eax -; CHECK-NEXT: movd %eax, %xmm0 -; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: psrld $16, %xmm0 ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %eax -; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; CHECK-NEXT: # xmm1 = xmm1[0],mem[0] -; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: cvttps2dq %xmm1, %xmm0 +; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %eax -; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %eax -; CHECK-NEXT: movd %eax, %xmm0 -; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: cvttps2dq %xmm0, %xmm0 +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %eax -; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %eax -; CHECK-NEXT: movd %eax, %xmm0 -; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; CHECK-NEXT: cvttps2dq %xmm0, %xmm0 ; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] ; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535] @@ -2437,59 +2425,47 @@ ; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %eax -; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %eax -; CHECK-NEXT: movd %eax, %xmm0 -; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: cvttps2dq %xmm0, %xmm0 +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %eax -; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %eax -; CHECK-NEXT: movd %eax, %xmm0 -; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: cvttps2dq %xmm0, %xmm0 +; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] -; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: psrlq $48, %xmm0 ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %eax -; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %eax -; CHECK-NEXT: movd %eax, %xmm0 -; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: cvttps2dq %xmm0, %xmm0 +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %eax -; CHECK-NEXT: movd %eax, %xmm0 -; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: psrld $16, %xmm0 ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %eax -; CHECK-NEXT: movd %eax, %xmm1 -; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; CHECK-NEXT: cvttps2dq %xmm1, %xmm0 ; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] ; CHECK-NEXT: packssdw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -2512,82 +2488,94 @@ ; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %rax -; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %rax -; CHECK-NEXT: movd %eax, %xmm0 -; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; CHECK-NEXT: cvttps2dq %xmm0, %xmm1 +; CHECK-NEXT: movdqa %xmm1, %xmm2 +; CHECK-NEXT: psrad $31, %xmm2 +; CHECK-NEXT: subps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cvttps2dq %xmm0, %xmm0 +; CHECK-NEXT: pand %xmm2, %xmm0 +; CHECK-NEXT: por %xmm1, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %rax -; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %rax -; CHECK-NEXT: movd %eax, %xmm0 -; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; CHECK-NEXT: cvttps2dq %xmm0, %xmm1 +; CHECK-NEXT: movdqa %xmm1, %xmm2 +; CHECK-NEXT: psrad $31, %xmm2 +; CHECK-NEXT: subps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cvttps2dq %xmm0, %xmm0 +; CHECK-NEXT: pand %xmm2, %xmm0 +; CHECK-NEXT: por %xmm1, %xmm0 ; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: psrlq $48, %xmm0 ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %rax -; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %rax -; CHECK-NEXT: movd %eax, %xmm0 -; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; CHECK-NEXT: cvttps2dq %xmm0, %xmm1 +; CHECK-NEXT: movdqa %xmm1, %xmm2 +; CHECK-NEXT: psrad $31, %xmm2 +; CHECK-NEXT: subps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cvttps2dq %xmm0, %xmm0 +; CHECK-NEXT: pand %xmm2, %xmm0 +; CHECK-NEXT: por %xmm1, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %rax -; CHECK-NEXT: movd %eax, %xmm0 -; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: psrld $16, %xmm0 ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %rax -; CHECK-NEXT: movd %eax, %xmm0 -; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; CHECK-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; CHECK-NEXT: # xmm4 = xmm4[0],mem[0] +; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; CHECK-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; CHECK-NEXT: cvttps2dq %xmm2, %xmm0 +; CHECK-NEXT: movdqa %xmm0, %xmm1 +; CHECK-NEXT: psrad $31, %xmm1 +; CHECK-NEXT: subps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-NEXT: cvttps2dq %xmm2, %xmm2 +; CHECK-NEXT: pand %xmm1, %xmm2 +; CHECK-NEXT: por %xmm0, %xmm2 +; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; CHECK-NEXT: # xmm2 = xmm2[0],mem[0] ; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648] -; CHECK-NEXT: movdqa %xmm4, %xmm2 -; CHECK-NEXT: pxor %xmm1, %xmm2 -; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [2147549183,2147549183,2147549183,2147549183] -; CHECK-NEXT: movdqa %xmm3, %xmm0 -; CHECK-NEXT: pcmpgtd %xmm2, %xmm0 -; CHECK-NEXT: pand %xmm0, %xmm4 -; CHECK-NEXT: pcmpeqd %xmm2, %xmm2 -; CHECK-NEXT: pxor %xmm2, %xmm0 -; CHECK-NEXT: por %xmm4, %xmm0 -; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; CHECK-NEXT: pxor %xmm4, %xmm1 -; CHECK-NEXT: pcmpgtd %xmm1, %xmm3 -; CHECK-NEXT: pand %xmm3, %xmm4 -; CHECK-NEXT: pxor %xmm2, %xmm3 -; CHECK-NEXT: por %xmm4, %xmm3 -; CHECK-NEXT: pslld $16, %xmm3 -; CHECK-NEXT: psrad $16, %xmm3 +; CHECK-NEXT: movdqa %xmm2, %xmm3 +; CHECK-NEXT: pxor %xmm1, %xmm3 +; CHECK-NEXT: movdqa {{.*#+}} xmm4 = [2147549183,2147549183,2147549183,2147549183] +; CHECK-NEXT: movdqa %xmm4, %xmm0 +; CHECK-NEXT: pcmpgtd %xmm3, %xmm0 +; CHECK-NEXT: pand %xmm0, %xmm2 +; CHECK-NEXT: pcmpeqd %xmm3, %xmm3 +; CHECK-NEXT: pxor %xmm3, %xmm0 +; CHECK-NEXT: por %xmm2, %xmm0 +; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; CHECK-NEXT: pxor %xmm2, %xmm1 +; CHECK-NEXT: pcmpgtd %xmm1, %xmm4 +; CHECK-NEXT: pand %xmm4, %xmm2 +; CHECK-NEXT: pxor %xmm3, %xmm4 +; CHECK-NEXT: por %xmm2, %xmm4 +; CHECK-NEXT: pslld $16, %xmm4 +; CHECK-NEXT: psrad $16, %xmm4 ; CHECK-NEXT: pslld $16, %xmm0 ; CHECK-NEXT: psrad $16, %xmm0 -; CHECK-NEXT: packssdw %xmm3, %xmm0 +; CHECK-NEXT: packssdw %xmm4, %xmm0 ; CHECK-NEXT: addq $72, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq @@ -2606,59 +2594,47 @@ ; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: psrlq $48, %xmm0 ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %eax -; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %eax -; CHECK-NEXT: movd %eax, %xmm0 -; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: cvttps2dq %xmm0, %xmm0 +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %eax -; CHECK-NEXT: movd %eax, %xmm0 -; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: psrld $16, %xmm0 ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %eax -; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; CHECK-NEXT: # xmm1 = xmm1[0],mem[0] -; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: cvttps2dq %xmm1, %xmm0 +; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %eax -; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %eax -; CHECK-NEXT: movd %eax, %xmm0 -; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: cvttps2dq %xmm0, %xmm0 +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %eax -; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %eax -; CHECK-NEXT: movd %eax, %xmm0 -; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; CHECK-NEXT: cvttps2dq %xmm0, %xmm0 ; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] ; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535] diff --git a/llvm/test/CodeGen/X86/half.ll b/llvm/test/CodeGen/X86/half.ll --- a/llvm/test/CodeGen/X86/half.ll +++ b/llvm/test/CodeGen/X86/half.ll @@ -228,13 +228,24 @@ ; ; CHECK-I686-LABEL: test_fptosi_i64: ; CHECK-I686: # %bb.0: -; CHECK-I686-NEXT: subl $12, %esp +; CHECK-I686-NEXT: subl $28, %esp ; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-I686-NEXT: pinsrw $0, (%eax), %xmm0 ; CHECK-I686-NEXT: pextrw $0, %xmm0, %eax ; CHECK-I686-NEXT: movw %ax, (%esp) -; CHECK-I686-NEXT: calll __fixhfdi -; CHECK-I686-NEXT: addl $12, %esp +; CHECK-I686-NEXT: calll __extendhfsf2 +; CHECK-I686-NEXT: fstps {{[0-9]+}}(%esp) +; CHECK-I686-NEXT: flds {{[0-9]+}}(%esp) +; CHECK-I686-NEXT: fnstcw {{[0-9]+}}(%esp) +; CHECK-I686-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; CHECK-I686-NEXT: orl $3072, %eax # imm = 0xC00 +; CHECK-I686-NEXT: movw %ax, {{[0-9]+}}(%esp) +; CHECK-I686-NEXT: fldcw {{[0-9]+}}(%esp) +; CHECK-I686-NEXT: fistpll {{[0-9]+}}(%esp) +; CHECK-I686-NEXT: fldcw {{[0-9]+}}(%esp) +; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %edx +; CHECK-I686-NEXT: addl $28, %esp ; CHECK-I686-NEXT: retl %a = load half, ptr %p, align 2 %r = fptosi half %a to i64 @@ -315,13 +326,36 @@ ; ; CHECK-I686-LABEL: test_fptoui_i64: ; CHECK-I686: # %bb.0: -; CHECK-I686-NEXT: subl $12, %esp +; CHECK-I686-NEXT: subl $28, %esp ; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-I686-NEXT: pinsrw $0, (%eax), %xmm0 ; CHECK-I686-NEXT: pextrw $0, %xmm0, %eax ; CHECK-I686-NEXT: movw %ax, (%esp) -; CHECK-I686-NEXT: calll __fixunshfdi -; CHECK-I686-NEXT: addl $12, %esp +; CHECK-I686-NEXT: calll __extendhfsf2 +; CHECK-I686-NEXT: fstps {{[0-9]+}}(%esp) +; CHECK-I686-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-I686-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-I686-NEXT: ucomiss %xmm1, %xmm0 +; CHECK-I686-NEXT: jae .LBB9_2 +; CHECK-I686-NEXT: # %bb.1: +; CHECK-I686-NEXT: xorps %xmm1, %xmm1 +; CHECK-I686-NEXT: .LBB9_2: +; CHECK-I686-NEXT: subss %xmm1, %xmm0 +; CHECK-I686-NEXT: movss %xmm0, {{[0-9]+}}(%esp) +; CHECK-I686-NEXT: setae %al +; CHECK-I686-NEXT: flds {{[0-9]+}}(%esp) +; CHECK-I686-NEXT: fnstcw {{[0-9]+}}(%esp) +; CHECK-I686-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; CHECK-I686-NEXT: orl $3072, %ecx # imm = 0xC00 +; CHECK-I686-NEXT: movw %cx, {{[0-9]+}}(%esp) +; CHECK-I686-NEXT: fldcw {{[0-9]+}}(%esp) +; CHECK-I686-NEXT: fistpll {{[0-9]+}}(%esp) +; CHECK-I686-NEXT: fldcw {{[0-9]+}}(%esp) +; CHECK-I686-NEXT: movzbl %al, %edx +; CHECK-I686-NEXT: shll $31, %edx +; CHECK-I686-NEXT: xorl {{[0-9]+}}(%esp), %edx +; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-I686-NEXT: addl $28, %esp ; CHECK-I686-NEXT: retl %a = load half, ptr %p, align 2 %r = fptoui half %a to i64 diff --git a/llvm/test/CodeGen/X86/vector-half-conversions.ll b/llvm/test/CodeGen/X86/vector-half-conversions.ll --- a/llvm/test/CodeGen/X86/vector-half-conversions.ll +++ b/llvm/test/CodeGen/X86/vector-half-conversions.ll @@ -4870,58 +4870,49 @@ define <4 x i32> @fptosi_2f16_to_4i32(<2 x half> %a) nounwind { ; AVX-LABEL: fptosi_2f16_to_4i32: ; AVX: # %bb.0: -; AVX-NEXT: pushq %rbx -; AVX-NEXT: subq $16, %rsp -; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX-NEXT: subq $40, %rsp +; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: callq __extendhfsf2@PLT -; AVX-NEXT: vcvttss2si %xmm0, %ebx -; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX-NEXT: callq __extendhfsf2@PLT -; AVX-NEXT: vcvttss2si %xmm0, %eax -; AVX-NEXT: vmovd %eax, %xmm0 -; AVX-NEXT: vmovd %ebx, %xmm1 -; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; AVX-NEXT: vcvttps2dq %xmm0, %xmm0 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero -; AVX-NEXT: addq $16, %rsp -; AVX-NEXT: popq %rbx +; AVX-NEXT: addq $40, %rsp ; AVX-NEXT: retq ; ; F16C-LABEL: fptosi_2f16_to_4i32: ; F16C: # %bb.0: -; F16C-NEXT: vpsrld $16, %xmm0, %xmm1 -; F16C-NEXT: vpextrw $0, %xmm1, %eax +; F16C-NEXT: vpextrw $0, %xmm0, %eax ; F16C-NEXT: movzwl %ax, %eax ; F16C-NEXT: vmovd %eax, %xmm1 ; F16C-NEXT: vcvtph2ps %xmm1, %xmm1 -; F16C-NEXT: vcvttss2si %xmm1, %eax -; F16C-NEXT: vpextrw $0, %xmm0, %ecx -; F16C-NEXT: movzwl %cx, %ecx -; F16C-NEXT: vmovd %ecx, %xmm0 +; F16C-NEXT: vpsrld $16, %xmm0, %xmm0 +; F16C-NEXT: vpextrw $0, %xmm0, %eax +; F16C-NEXT: movzwl %ax, %eax +; F16C-NEXT: vmovd %eax, %xmm0 ; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 -; F16C-NEXT: vcvttss2si %xmm0, %ecx -; F16C-NEXT: vmovd %ecx, %xmm0 -; F16C-NEXT: vmovd %eax, %xmm1 -; F16C-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; F16C-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; F16C-NEXT: vcvttps2dq %xmm0, %xmm0 ; F16C-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; F16C-NEXT: retq ; ; AVX512-LABEL: fptosi_2f16_to_4i32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpextrw $0, %xmm1, %eax +; AVX512-NEXT: vpextrw $0, %xmm0, %eax ; AVX512-NEXT: movzwl %ax, %eax ; AVX512-NEXT: vmovd %eax, %xmm1 ; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX512-NEXT: vcvttss2si %xmm1, %eax -; AVX512-NEXT: vpextrw $0, %xmm0, %ecx -; AVX512-NEXT: movzwl %cx, %ecx -; AVX512-NEXT: vmovd %ecx, %xmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX512-NEXT: vpextrw $0, %xmm0, %eax +; AVX512-NEXT: movzwl %ax, %eax +; AVX512-NEXT: vmovd %eax, %xmm0 ; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: vcvttss2si %xmm0, %ecx -; AVX512-NEXT: vmovd %ecx, %xmm0 -; AVX512-NEXT: vmovd %eax, %xmm1 -; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX512-NEXT: vcvttps2dq %xmm0, %xmm0 ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX512-NEXT: retq %cvt = fptosi <2 x half> %a to <2 x i32>