Index: include/llvm/CodeGen/TargetLowering.h
===================================================================
--- include/llvm/CodeGen/TargetLowering.h
+++ include/llvm/CodeGen/TargetLowering.h
@@ -1746,6 +1746,16 @@
     return false;
   }
 
+  /// Return true if it is more correct/profitable to use strict FP_TO_INT
+  /// conversion operations - canonicalizing the FP source value instead of
+  /// converting all cases and then selecting based on the value.
+  /// This may be true if the target throws exceptions for out of bounds
+  /// conversions or has fast FP CMOV.
+  virtual bool shouldUseStrictFP_TO_INT(EVT FpVT, EVT IntVT,
+                                        bool IsSigned) const {
+    return false;
+  }
+
   //===--------------------------------------------------------------------===//
   // TargetLowering Configuration Methods - These methods should be invoked by
   // the derived class constructor to configure this object for the target.
Index: lib/CodeGen/SelectionDAG/TargetLowering.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -4183,20 +4183,39 @@
     return true;
   }
 
-  // Expand based on maximum range of FP_TO_SINT:
-  // True = fp_to_sint(Src)
-  // False = 0x8000000000000000 + fp_to_sint(Src - 0x8000000000000000)
-  // Result = select (Src < 0x8000000000000000), True, False
 
   SDValue Cst = DAG.getConstantFP(APF, dl, SrcVT);
   SDValue Sel = DAG.getSetCC(dl, SetCCVT, Src, Cst, ISD::SETLT);
-  SDValue True = DAG.getNode(ISD::FP_TO_SINT, dl, DstVT, Src);
-  // TODO: Should any fast-math-flags be set for the FSUB?
-  SDValue False = DAG.getNode(ISD::FP_TO_SINT, dl, DstVT,
-                              DAG.getNode(ISD::FSUB, dl, SrcVT, Src, Cst));
-  False = DAG.getNode(ISD::XOR, dl, DstVT, False,
-                      DAG.getConstant(SignMask, dl, DstVT));
-  Result = DAG.getSelect(dl, DstVT, Sel, True, False);
+  bool Strict = shouldUseStrictFP_TO_INT(SrcVT, DstVT, /*IsSigned*/ false);
+  if (Strict) {
+    // Expand based on maximum range of FP_TO_SINT: if the value exceeds the
+    // signmask then offset it first (the result is then fully representable).
+    // Sel = Src < 0x8000000000000000
+    // Val = select Sel, Src, Src - 0x8000000000000000
+    // Ofs = select Sel, 0, 0x8000000000000000
+    // Result = fp_to_sint(Val) ^ Ofs
+
+    // TODO: Should any fast-math-flags be set for the FSUB?
+    SDValue Val = DAG.getSelect(dl, SrcVT, Sel, Src,
+                                DAG.getNode(ISD::FSUB, dl, SrcVT, Src, Cst));
+    SDValue Ofs = DAG.getSelect(dl, DstVT, Sel, DAG.getConstant(0, dl, DstVT),
+                                DAG.getConstant(SignMask, dl, DstVT));
+    Result = DAG.getNode(ISD::XOR, dl, DstVT,
+                         DAG.getNode(ISD::FP_TO_SINT, dl, DstVT, Val), Ofs);
+  } else {
+    // Expand based on maximum range of FP_TO_SINT:
+    // True = fp_to_sint(Src)
+    // False = 0x8000000000000000 + fp_to_sint(Src - 0x8000000000000000)
+    // Result = select (Src < 0x8000000000000000), True, False
+
+    SDValue True = DAG.getNode(ISD::FP_TO_SINT, dl, DstVT, Src);
+    // TODO: Should any fast-math-flags be set for the FSUB?
+    SDValue False = DAG.getNode(ISD::FP_TO_SINT, dl, DstVT,
+                                DAG.getNode(ISD::FSUB, dl, SrcVT, Src, Cst));
+    False = DAG.getNode(ISD::XOR, dl, DstVT, False,
+                        DAG.getConstant(SignMask, dl, DstVT));
+    Result = DAG.getSelect(dl, DstVT, Sel, True, False);
+  }
   return true;
 }
 
Index: lib/Target/X86/X86ISelLowering.h
===================================================================
--- lib/Target/X86/X86ISelLowering.h
+++ lib/Target/X86/X86ISelLowering.h
@@ -1047,6 +1047,9 @@
 
     bool decomposeMulByConstant(EVT VT, SDValue C) const override;
 
+    bool shouldUseStrictFP_TO_INT(EVT FpVT, EVT IntVT,
+                                  bool IsSigned) const override;
+
     /// Return true if EXTRACT_SUBVECTOR is cheap for this result type
     /// with this index.
     bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -4814,6 +4814,12 @@
          (1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2();
 }
 
+bool X86TargetLowering::shouldUseStrictFP_TO_INT(EVT FpVT, EVT IntVT,
+                                                 bool IsSigned) const {
+  // f80 FP_TO_UINT is more efficient using Strict code if FCMOV is available.
+  return !IsSigned && FpVT == MVT::f80 && Subtarget.hasCMov();
+}
+
 bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
                                                 unsigned Index) const {
   if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
Index: test/CodeGen/X86/fp-cvt.ll
===================================================================
--- test/CodeGen/X86/fp-cvt.ll
+++ test/CodeGen/X86/fp-cvt.ll
@@ -483,29 +483,20 @@
 ; X64-X87-NEXT:    flds {{.*}}(%rip)
 ; X64-X87-NEXT:    fld %st(1)
 ; X64-X87-NEXT:    fsub %st(1)
+; X64-X87-NEXT:    xorl %eax, %eax
+; X64-X87-NEXT:    fxch %st(1)
+; X64-X87-NEXT:    fucompi %st(2)
+; X64-X87-NEXT:    fcmovnbe %st(1), %st(0)
+; X64-X87-NEXT:    fstp %st(1)
 ; X64-X87-NEXT:    fnstcw -{{[0-9]+}}(%rsp)
-; X64-X87-NEXT:    movzwl -{{[0-9]+}}(%rsp), %eax
-; X64-X87-NEXT:    movw $3199, -{{[0-9]+}}(%rsp) # imm = 0xC7F
-; X64-X87-NEXT:    fldcw -{{[0-9]+}}(%rsp)
-; X64-X87-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
-; X64-X87-NEXT:    fistpll -{{[0-9]+}}(%rsp)
-; X64-X87-NEXT:    fldcw -{{[0-9]+}}(%rsp)
-; X64-X87-NEXT:    fnstcw -{{[0-9]+}}(%rsp)
-; X64-X87-NEXT:    movzwl -{{[0-9]+}}(%rsp), %eax
+; X64-X87-NEXT:    movzwl -{{[0-9]+}}(%rsp), %ecx
 ; X64-X87-NEXT:    movw $3199, -{{[0-9]+}}(%rsp) # imm = 0xC7F
 ; X64-X87-NEXT:    fldcw -{{[0-9]+}}(%rsp)
-; X64-X87-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
-; X64-X87-NEXT:    fld %st(1)
+; X64-X87-NEXT:    movw %cx, -{{[0-9]+}}(%rsp)
 ; X64-X87-NEXT:    fistpll -{{[0-9]+}}(%rsp)
 ; X64-X87-NEXT:    fldcw -{{[0-9]+}}(%rsp)
-; X64-X87-NEXT:    fucompi %st(1)
-; X64-X87-NEXT:    fstp %st(0)
-; X64-X87-NEXT:    jbe .LBB10_1
-; X64-X87-NEXT:    # %bb.2:
-; X64-X87-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
-; X64-X87-NEXT:    retq
-; X64-X87-NEXT:    .LBB10_1:
-; X64-X87-NEXT:    movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
+; X64-X87-NEXT:    setbe %al
+; X64-X87-NEXT:    shlq $63, %rax
 ; X64-X87-NEXT:    xorq -{{[0-9]+}}(%rsp), %rax
 ; X64-X87-NEXT:    retq
 ;
@@ -515,17 +506,14 @@
 ; X64-SSSE3-NEXT:    flds {{.*}}(%rip)
 ; X64-SSSE3-NEXT:    fld %st(1)
 ; X64-SSSE3-NEXT:    fsub %st(1)
+; X64-SSSE3-NEXT:    xorl %eax, %eax
+; X64-SSSE3-NEXT:    fxch %st(1)
+; X64-SSSE3-NEXT:    fucompi %st(2)
+; X64-SSSE3-NEXT:    fcmovnbe %st(1), %st(0)
+; X64-SSSE3-NEXT:    fstp %st(1)
 ; X64-SSSE3-NEXT:    fisttpll -{{[0-9]+}}(%rsp)
-; X64-SSSE3-NEXT:    fld %st(1)
-; X64-SSSE3-NEXT:    fisttpll -{{[0-9]+}}(%rsp)
-; X64-SSSE3-NEXT:    fucompi %st(1)
-; X64-SSSE3-NEXT:    fstp %st(0)
-; X64-SSSE3-NEXT:    jbe .LBB10_1
-; X64-SSSE3-NEXT:    # %bb.2:
-; X64-SSSE3-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
-; X64-SSSE3-NEXT:    retq
-; X64-SSSE3-NEXT:    .LBB10_1:
-; X64-SSSE3-NEXT:    movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
+; X64-SSSE3-NEXT:    setbe %al
+; X64-SSSE3-NEXT:    shlq $63, %rax
 ; X64-SSSE3-NEXT:    xorq -{{[0-9]+}}(%rsp), %rax
 ; X64-SSSE3-NEXT:    retq
   %1 = fptoui x86_fp80 %a0 to i64
@@ -577,29 +565,20 @@
 ; X64-X87-NEXT:    flds {{.*}}(%rip)
 ; X64-X87-NEXT:    fld %st(1)
 ; X64-X87-NEXT:    fsub %st(1)
+; X64-X87-NEXT:    xorl %eax, %eax
+; X64-X87-NEXT:    fxch %st(1)
+; X64-X87-NEXT:    fucompi %st(2)
+; X64-X87-NEXT:    fcmovnbe %st(1), %st(0)
+; X64-X87-NEXT:    fstp %st(1)
 ; X64-X87-NEXT:    fnstcw -{{[0-9]+}}(%rsp)
-; X64-X87-NEXT:    movzwl -{{[0-9]+}}(%rsp), %eax
-; X64-X87-NEXT:    movw $3199, -{{[0-9]+}}(%rsp) # imm = 0xC7F
-; X64-X87-NEXT:    fldcw -{{[0-9]+}}(%rsp)
-; X64-X87-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
-; X64-X87-NEXT:    fistpll -{{[0-9]+}}(%rsp)
-; X64-X87-NEXT:    fldcw -{{[0-9]+}}(%rsp)
-; X64-X87-NEXT:    fnstcw -{{[0-9]+}}(%rsp)
-; X64-X87-NEXT:    movzwl -{{[0-9]+}}(%rsp), %eax
+; X64-X87-NEXT:    movzwl -{{[0-9]+}}(%rsp), %ecx
 ; X64-X87-NEXT:    movw $3199, -{{[0-9]+}}(%rsp) # imm = 0xC7F
 ; X64-X87-NEXT:    fldcw -{{[0-9]+}}(%rsp)
-; X64-X87-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
-; X64-X87-NEXT:    fld %st(1)
+; X64-X87-NEXT:    movw %cx, -{{[0-9]+}}(%rsp)
 ; X64-X87-NEXT:    fistpll -{{[0-9]+}}(%rsp)
 ; X64-X87-NEXT:    fldcw -{{[0-9]+}}(%rsp)
-; X64-X87-NEXT:    fucompi %st(1)
-; X64-X87-NEXT:    fstp %st(0)
-; X64-X87-NEXT:    jbe .LBB11_1
-; X64-X87-NEXT:    # %bb.2:
-; X64-X87-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
-; X64-X87-NEXT:    retq
-; X64-X87-NEXT:    .LBB11_1:
-; X64-X87-NEXT:    movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
+; X64-X87-NEXT:    setbe %al
+; X64-X87-NEXT:    shlq $63, %rax
 ; X64-X87-NEXT:    xorq -{{[0-9]+}}(%rsp), %rax
 ; X64-X87-NEXT:    retq
 ;
@@ -609,17 +588,14 @@
 ; X64-SSSE3-NEXT:    flds {{.*}}(%rip)
 ; X64-SSSE3-NEXT:    fld %st(1)
 ; X64-SSSE3-NEXT:    fsub %st(1)
+; X64-SSSE3-NEXT:    xorl %eax, %eax
+; X64-SSSE3-NEXT:    fxch %st(1)
+; X64-SSSE3-NEXT:    fucompi %st(2)
+; X64-SSSE3-NEXT:    fcmovnbe %st(1), %st(0)
+; X64-SSSE3-NEXT:    fstp %st(1)
 ; X64-SSSE3-NEXT:    fisttpll -{{[0-9]+}}(%rsp)
-; X64-SSSE3-NEXT:    fld %st(1)
-; X64-SSSE3-NEXT:    fisttpll -{{[0-9]+}}(%rsp)
-; X64-SSSE3-NEXT:    fucompi %st(1)
-; X64-SSSE3-NEXT:    fstp %st(0)
-; X64-SSSE3-NEXT:    jbe .LBB11_1
-; X64-SSSE3-NEXT:    # %bb.2:
-; X64-SSSE3-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
-; X64-SSSE3-NEXT:    retq
-; X64-SSSE3-NEXT:    .LBB11_1:
-; X64-SSSE3-NEXT:    movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
+; X64-SSSE3-NEXT:    setbe %al
+; X64-SSSE3-NEXT:    shlq $63, %rax
 ; X64-SSSE3-NEXT:    xorq -{{[0-9]+}}(%rsp), %rax
 ; X64-SSSE3-NEXT:    retq
   %1 = load x86_fp80, x86_fp80 *%a0
Index: test/CodeGen/X86/scalar-fp-to-i64.ll
===================================================================
--- test/CodeGen/X86/scalar-fp-to-i64.ll
+++ test/CodeGen/X86/scalar-fp-to-i64.ll
@@ -1147,25 +1147,21 @@
 ;
 ; SSE3_64_WIN-LABEL: x_to_u64:
 ; SSE3_64_WIN:       # %bb.0:
-; SSE3_64_WIN-NEXT:    subq $16, %rsp
+; SSE3_64_WIN-NEXT:    pushq %rax
 ; SSE3_64_WIN-NEXT:    fldt (%rcx)
 ; SSE3_64_WIN-NEXT:    flds __real@{{.*}}(%rip)
 ; SSE3_64_WIN-NEXT:    fld %st(1)
 ; SSE3_64_WIN-NEXT:    fsub %st(1)
-; SSE3_64_WIN-NEXT:    fisttpll {{[0-9]+}}(%rsp)
-; SSE3_64_WIN-NEXT:    fld %st(1)
+; SSE3_64_WIN-NEXT:    xorl %eax, %eax
+; SSE3_64_WIN-NEXT:    fxch %st(1)
+; SSE3_64_WIN-NEXT:    fucompi %st(2)
+; SSE3_64_WIN-NEXT:    fcmovnbe %st(1), %st(0)
+; SSE3_64_WIN-NEXT:    fstp %st(1)
 ; SSE3_64_WIN-NEXT:    fisttpll (%rsp)
-; SSE3_64_WIN-NEXT:    fucompi %st(1)
-; SSE3_64_WIN-NEXT:    fstp %st(0)
-; SSE3_64_WIN-NEXT:    jbe .LBB4_1
-; SSE3_64_WIN-NEXT:    # %bb.2:
-; SSE3_64_WIN-NEXT:    movq (%rsp), %rax
-; SSE3_64_WIN-NEXT:    addq $16, %rsp
-; SSE3_64_WIN-NEXT:    retq
-; SSE3_64_WIN-NEXT:    .LBB4_1:
-; SSE3_64_WIN-NEXT:    movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
-; SSE3_64_WIN-NEXT:    xorq {{[0-9]+}}(%rsp), %rax
-; SSE3_64_WIN-NEXT:    addq $16, %rsp
+; SSE3_64_WIN-NEXT:    setbe %al
+; SSE3_64_WIN-NEXT:    shlq $63, %rax
+; SSE3_64_WIN-NEXT:    xorq (%rsp), %rax
+; SSE3_64_WIN-NEXT:    popq %rcx
 ; SSE3_64_WIN-NEXT:    retq
 ;
 ; SSE3_64_LIN-LABEL: x_to_u64:
@@ -1174,17 +1170,14 @@
 ; SSE3_64_LIN-NEXT:    flds {{.*}}(%rip)
 ; SSE3_64_LIN-NEXT:    fld %st(1)
 ; SSE3_64_LIN-NEXT:    fsub %st(1)
+; SSE3_64_LIN-NEXT:    xorl %eax, %eax
+; SSE3_64_LIN-NEXT:    fxch %st(1)
+; SSE3_64_LIN-NEXT:    fucompi %st(2)
+; SSE3_64_LIN-NEXT:    fcmovnbe %st(1), %st(0)
+; SSE3_64_LIN-NEXT:    fstp %st(1)
 ; SSE3_64_LIN-NEXT:    fisttpll -{{[0-9]+}}(%rsp)
-; SSE3_64_LIN-NEXT:    fld %st(1)
-; SSE3_64_LIN-NEXT:    fisttpll -{{[0-9]+}}(%rsp)
-; SSE3_64_LIN-NEXT:    fucompi %st(1)
-; SSE3_64_LIN-NEXT:    fstp %st(0)
-; SSE3_64_LIN-NEXT:    jbe .LBB4_1
-; SSE3_64_LIN-NEXT:    # %bb.2:
-; SSE3_64_LIN-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
-; SSE3_64_LIN-NEXT:    retq
-; SSE3_64_LIN-NEXT:    .LBB4_1:
-; SSE3_64_LIN-NEXT:    movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
+; SSE3_64_LIN-NEXT:    setbe %al
+; SSE3_64_LIN-NEXT:    shlq $63, %rax
 ; SSE3_64_LIN-NEXT:    xorq -{{[0-9]+}}(%rsp), %rax
 ; SSE3_64_LIN-NEXT:    retq
 ;
@@ -1246,37 +1239,27 @@
 ;
 ; SSE2_64_WIN-LABEL: x_to_u64:
 ; SSE2_64_WIN:       # %bb.0:
-; SSE2_64_WIN-NEXT:    subq $24, %rsp
+; SSE2_64_WIN-NEXT:    subq $16, %rsp
 ; SSE2_64_WIN-NEXT:    fldt (%rcx)
 ; SSE2_64_WIN-NEXT:    flds __real@{{.*}}(%rip)
 ; SSE2_64_WIN-NEXT:    fld %st(1)
 ; SSE2_64_WIN-NEXT:    fsub %st(1)
+; SSE2_64_WIN-NEXT:    xorl %eax, %eax
+; SSE2_64_WIN-NEXT:    fxch %st(1)
+; SSE2_64_WIN-NEXT:    fucompi %st(2)
+; SSE2_64_WIN-NEXT:    fcmovnbe %st(1), %st(0)
+; SSE2_64_WIN-NEXT:    fstp %st(1)
 ; SSE2_64_WIN-NEXT:    fnstcw {{[0-9]+}}(%rsp)
-; SSE2_64_WIN-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
+; SSE2_64_WIN-NEXT:    movzwl {{[0-9]+}}(%rsp), %ecx
 ; SSE2_64_WIN-NEXT:    movw $3199, {{[0-9]+}}(%rsp) # imm = 0xC7F
 ; SSE2_64_WIN-NEXT:    fldcw {{[0-9]+}}(%rsp)
-; SSE2_64_WIN-NEXT:    movw %ax, {{[0-9]+}}(%rsp)
+; SSE2_64_WIN-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
 ; SSE2_64_WIN-NEXT:    fistpll {{[0-9]+}}(%rsp)
 ; SSE2_64_WIN-NEXT:    fldcw {{[0-9]+}}(%rsp)
-; SSE2_64_WIN-NEXT:    fnstcw {{[0-9]+}}(%rsp)
-; SSE2_64_WIN-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
-; SSE2_64_WIN-NEXT:    movw $3199, {{[0-9]+}}(%rsp) # imm = 0xC7F
-; SSE2_64_WIN-NEXT:    fldcw {{[0-9]+}}(%rsp)
-; SSE2_64_WIN-NEXT:    movw %ax, {{[0-9]+}}(%rsp)
-; SSE2_64_WIN-NEXT:    fld %st(1)
-; SSE2_64_WIN-NEXT:    fistpll {{[0-9]+}}(%rsp)
-; SSE2_64_WIN-NEXT:    fldcw {{[0-9]+}}(%rsp)
-; SSE2_64_WIN-NEXT:    fucompi %st(1)
-; SSE2_64_WIN-NEXT:    fstp %st(0)
-; SSE2_64_WIN-NEXT:    jbe .LBB4_1
-; SSE2_64_WIN-NEXT:    # %bb.2:
-; SSE2_64_WIN-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2_64_WIN-NEXT:    addq $24, %rsp
-; SSE2_64_WIN-NEXT:    retq
-; SSE2_64_WIN-NEXT:    .LBB4_1:
-; SSE2_64_WIN-NEXT:    movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
+; SSE2_64_WIN-NEXT:    setbe %al
+; SSE2_64_WIN-NEXT:    shlq $63, %rax
 ; SSE2_64_WIN-NEXT:    xorq {{[0-9]+}}(%rsp), %rax
-; SSE2_64_WIN-NEXT:    addq $24, %rsp
+; SSE2_64_WIN-NEXT:    addq $16, %rsp
 ; SSE2_64_WIN-NEXT:    retq
 ;
 ; SSE2_64_LIN-LABEL: x_to_u64:
@@ -1285,29 +1268,20 @@
 ; SSE2_64_LIN-NEXT:    flds {{.*}}(%rip)
 ; SSE2_64_LIN-NEXT:    fld %st(1)
 ; SSE2_64_LIN-NEXT:    fsub %st(1)
+; SSE2_64_LIN-NEXT:    xorl %eax, %eax
+; SSE2_64_LIN-NEXT:    fxch %st(1)
+; SSE2_64_LIN-NEXT:    fucompi %st(2)
+; SSE2_64_LIN-NEXT:    fcmovnbe %st(1), %st(0)
+; SSE2_64_LIN-NEXT:    fstp %st(1)
 ; SSE2_64_LIN-NEXT:    fnstcw -{{[0-9]+}}(%rsp)
-; SSE2_64_LIN-NEXT:    movzwl -{{[0-9]+}}(%rsp), %eax
+; SSE2_64_LIN-NEXT:    movzwl -{{[0-9]+}}(%rsp), %ecx
 ; SSE2_64_LIN-NEXT:    movw $3199, -{{[0-9]+}}(%rsp) # imm = 0xC7F
 ; SSE2_64_LIN-NEXT:    fldcw -{{[0-9]+}}(%rsp)
-; SSE2_64_LIN-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
+; SSE2_64_LIN-NEXT:    movw %cx, -{{[0-9]+}}(%rsp)
 ; SSE2_64_LIN-NEXT:    fistpll -{{[0-9]+}}(%rsp)
 ; SSE2_64_LIN-NEXT:    fldcw -{{[0-9]+}}(%rsp)
-; SSE2_64_LIN-NEXT:    fnstcw -{{[0-9]+}}(%rsp)
-; SSE2_64_LIN-NEXT:    movzwl -{{[0-9]+}}(%rsp), %eax
-; SSE2_64_LIN-NEXT:    movw $3199, -{{[0-9]+}}(%rsp) # imm = 0xC7F
-; SSE2_64_LIN-NEXT:    fldcw -{{[0-9]+}}(%rsp)
-; SSE2_64_LIN-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
-; SSE2_64_LIN-NEXT:    fld %st(1)
-; SSE2_64_LIN-NEXT:    fistpll -{{[0-9]+}}(%rsp)
-; SSE2_64_LIN-NEXT:    fldcw -{{[0-9]+}}(%rsp)
-; SSE2_64_LIN-NEXT:    fucompi %st(1)
-; SSE2_64_LIN-NEXT:    fstp %st(0)
-; SSE2_64_LIN-NEXT:    jbe .LBB4_1
-; SSE2_64_LIN-NEXT:    # %bb.2:
-; SSE2_64_LIN-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
-; SSE2_64_LIN-NEXT:    retq
-; SSE2_64_LIN-NEXT:    .LBB4_1:
-; SSE2_64_LIN-NEXT:    movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
+; SSE2_64_LIN-NEXT:    setbe %al
+; SSE2_64_LIN-NEXT:    shlq $63, %rax
 ; SSE2_64_LIN-NEXT:    xorq -{{[0-9]+}}(%rsp), %rax
 ; SSE2_64_LIN-NEXT:    retq
 ;
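
Reviewer note, not part of the patch: the two FP_TO_UINT expansions above can be summarised as scalar C++, following the Sel/Val/Ofs comments in the TargetLowering.cpp hunk. The function names (fptoui_select, fptoui_strict) and the use of double instead of x86_fp80 are invented here purely for illustration; this is a sketch of the DAG-level logic, not the generated code.

#include <stdint.h>

// Existing expansion: perform fp_to_sint on both candidates, then select
// between the two integer results. For inputs >= 2^63 the "True" path does an
// out-of-range fp_to_sint whose result is simply discarded; that is formally
// undefined in ISO C++ and can trap or raise exceptions on some targets,
// which is what the new hook lets a target avoid.
uint64_t fptoui_select(double Src) {
  const double Cst = 9223372036854775808.0;  // 2^63, the destination signmask
  uint64_t True = (uint64_t)(int64_t)Src;
  uint64_t False = (uint64_t)(int64_t)(Src - Cst) ^ 0x8000000000000000ULL;
  return Src < Cst ? True : False;
}

// Strict expansion: canonicalize the FP input first so the single fp_to_sint
// always sees an in-range value, then XOR the sign bit back in afterwards.
uint64_t fptoui_strict(double Src) {
  const double Cst = 9223372036854775808.0;  // 2^63, the destination signmask
  bool Sel = Src < Cst;
  double Val = Sel ? Src : Src - Cst;        // now representable as int64_t
  uint64_t Ofs = Sel ? 0 : 0x8000000000000000ULL;
  return (uint64_t)(int64_t)Val ^ Ofs;
}

The strict form trades the integer select for a floating-point select, which is why the X86 hook only returns true when FCMOV is available (see the fcmovnbe in the updated CHECK lines); without a fast FP conditional move the FP select would itself lower to a branch and nothing would be gained.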