This is an archive of the discontinued LLVM Phabricator instance.

[X86] Custom lower ISD::FROUND with SSE4.1 to avoid a libcall.
ClosedPublic

Authored by craig.topper on Jan 29 2020, 12:01 AM.

Download Raw Diff

Details

Reviewers

spatel
RKSimon
andrew.w.kaylor
efriedma
uweigand
kpn
cameron.mcinally

Commits

rG90c31b0f428f: [X86] Custom lower ISD::FROUND with SSE4.1 to avoid a libcall.

Summary

ISD::FROUND is defined to round to nearest with ties rounding
away from 0. This mode isn't supported in hardware on X86.

But as long as we aren't compiling with trapping math, we can
emulate this with trunc(X + copysign(nextafter(0.5, 0.0), X)).

We have to use nextafter to avoid some corner cases that adding
0.5 would have. For example, if X is nextafter(0.5, 0.0) it should
round to 0.0, but adding 0.5 would need one more bit of mantissa
than can be stored, so the sum rounds to 1.0. Adding nextafter(0.5, 0.0)
instead will just increase the exponent by 1 and leave the mantissa
as all 1s. This would be nextafter(1.0, 0.0), which will truncate to 0.0.

Technically this requires -fno-trapping-math, which isn't our default.
But if we care about exceptions we should be using constrained
intrinsics. Constrained intrinsics would use STRICT_FROUND which
won't go through this code.

Fixes PR42195.

Diff Detail

Repository: rG LLVM Github Monorepo

Event Timeline

craig.topper created this revision.Jan 29 2020, 12:01 AM

Herald added a project: Restricted Project. · View Herald TranscriptJan 29 2020, 12:01 AM

Herald added a subscriber: hiraditya. · View Herald Transcript

craig.topper added reviewers: andrew.w.kaylor, efriedma, uweigand, kpn, cameron.mcinally.Jan 29 2020, 12:02 AM

Harbormaster completed remote builds in B45208: Diff 241058.Jan 29 2020, 12:05 AM

LGTM

llvm/lib/Target/X86/X86ISelLowering.cpp
20460	Include at least part of the text from this patch description as a block comment for this function, so: /// ISD::FROUND is defined to round to nearest with ties rounding /// away from 0. This mode isn't supported in hardware on X86. /// But as long as we aren't compiling with trapping math, we can /// emulate this with floor(X + copysign(nextafter(0.5, 0.0), X)). /// ...
llvm/test/CodeGen/X86/vec_round.ll
0	This test/file was added with: rL188048 ...to show we wouldn't crash? But it doesn't add value now that we have more thorough tests. I'd delete it.

This revision is now accepted and ready to land.Jan 29 2020, 6:53 AM

Closed by commit rG90c31b0f428f: [X86] Custom lower ISD::FROUND with SSE4.1 to avoid a libcall. (authored by craig.topper). · Explain WhyJan 29 2020, 9:12 AM

This revision was automatically updated to reflect the committed changes.

nickdesaulniers added a subscriber: nickdesaulniers.Apr 28 2020, 5:00 PM

I ran across a llvm_unreachable that points to this commit. Repro instructions below:

test.ii

double compare1(double x, double y) { return ((int)x< y) ? x : y; }
double compare2(double x, double y) { return y != 0.0 ? y : x; }

int compareint(int x, int y, int z) { return (x < y) ? y : (z < x) ? z : x; }

class C {
public:
  C(double arg) { 
    constexpr int k1 = -(1 << 23); 
    constexpr int k2 = (1 << 23) - 1;
    array[0] = compareint(arg, k1, k2);
  }
  
  char array[3];
};
extern "C" double round(double);
constexpr double kEight = 8;

C create(int b) {
  double d1 = b * kEight;
  double d2 = round(d1);
  double d3 = compare1(0.0, d2);
  double d4 = compare2(0.0, d3);
  return C(d4);
}

void loop(int* b, C *ptr, long j) {
  for (int i = 0; i < j; ++i)
    ptr[i] = create(b[i]);
}

clang command:

clang \
"-cc1" \
"-triple" "x86_64-unknown-linux-gnu" \
"-emit-obj" \
"-target-cpu" "x86-64" \
"-target-feature" "+avx" \
"-target-feature" "+avx2" \
"-target-feature" "+avx512f" \
"-O3" \
"-vectorize-loops" \
"-x" "c++" "test.ii"

output:

PromoteIntegerResult #0: t177: v16i24 = X86ISD::VTRUNCS t216

Do not know how to promote this operator!
UNREACHABLE executed at llvm-project/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp:54!

0.	Program arguments: clang -cc1 -triple x86_64-unknown-linux-gnu -emit-obj -target-cpu x86-64 -target-feature +avx -target-feature +avx2 -target-feature +avx512f -O3 -vectorize-loops -x c++ test.ii
1.	<eof> parser at end of file
2.	Code generation
3.	Running pass 'Function Pass Manager' on module 'test.ii'.
4.	Running pass 'X86 DAG->DAG Instruction Selection' on function '@_Z4loopPiP1Cl'

Herald added a subscriber: pengfei. · View Herald TranscriptJun 8 2021, 7:07 PM

Looks like some combine is incorrectly producing X86ISD::VTRUNCS? Probably just exposed by this patch.

In D73607#2806917, @rtrieu wrote:

I ran across a llvm_unreachable that points to this commit. Repro instructions below:

test.ii

double compare1(double x, double y) { return ((int)x< y) ? x : y; }
double compare2(double x, double y) { return y != 0.0 ? y : x; }

int compareint(int x, int y, int z) { return (x < y) ? y : (z < x) ? z : x; }

class C {
public:
  C(double arg) { 
    constexpr int k1 = -(1 << 23); 
    constexpr int k2 = (1 << 23) - 1;
    array[0] = compareint(arg, k1, k2);
  }
  
  char array[3];
};
extern "C" double round(double);
constexpr double kEight = 8;

C create(int b) {
  double d1 = b * kEight;
  double d2 = round(d1);
  double d3 = compare1(0.0, d2);
  double d4 = compare2(0.0, d3);
  return C(d4);
}

void loop(int* b, C *ptr, long j) {
  for (int i = 0; i < j; ++i)
    ptr[i] = create(b[i]);
}

clang command:

clang \
"-cc1" \
"-triple" "x86_64-unknown-linux-gnu" \
"-emit-obj" \
"-target-cpu" "x86-64" \
"-target-feature" "+avx" \
"-target-feature" "+avx2" \
"-target-feature" "+avx512f" \
"-O3" \
"-vectorize-loops" \
"-x" "c++" "test.ii"

output:

PromoteIntegerResult #0: t177: v16i24 = X86ISD::VTRUNCS t216

Do not know how to promote this operator!
UNREACHABLE executed at llvm-project/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp:54!

0.	Program arguments: clang -cc1 -triple x86_64-unknown-linux-gnu -emit-obj -target-cpu x86-64 -target-feature +avx -target-feature +avx2 -target-feature +avx512f -O3 -vectorize-loops -x c++ test.ii
1.	<eof> parser at end of file
2.	Code generation
3.	Running pass 'Function Pass Manager' on module 'test.ii'.
4.	Running pass 'X86 DAG->DAG Instruction Selection' on function '@_Z4loopPiP1Cl'

This should fix it

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index b89e1674..90babf3 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -45273,7 +45273,8 @@ static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
 
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   if (TLI.isTypeLegal(InVT) && InVT.isVector() && SVT != MVT::i1 &&
-      Subtarget.hasAVX512() && (InSVT != MVT::i16 || Subtarget.hasBWI())) {
+      Subtarget.hasAVX512() && (InSVT != MVT::i16 || Subtarget.hasBWI()) &&
+      (SVT == MVT::i32 || SVT == MVT::i16 || SVT == MVT::i8)) {
     unsigned TruncOpc = 0;
     SDValue SatVal;
     if (auto SSatVal = detectSSatPattern(In, VT)) {

Posted here https://reviews.llvm.org/D103940

craig.topper mentioned this in D103940: [X86] Check destination element type before forming VTRUNCS/VTRUNCUS in combineTruncateWithSat..Jun 8 2021, 10:41 PM

craig.topper mentioned this in rG765ef4bb2af6: [X86] Check destination element type before forming VTRUNCS/VTRUNCUS in….Jun 9 2021, 7:16 AM

Thanks for the quick fix. I've verified that it fixes my full test case.

FreddyYe mentioned this in D110312: [X86][ISel] Lowering FROUND(f16) and FROUNDEVEN(f16).Sep 26 2021, 1:11 AM

Revision Contents

Path

Size

llvm/

lib/

Target/

X86/

X86ISelLowering.cpp

32 lines

test/

CodeGen/

X86/

extractelement-fp.ll

41 lines

vec-libcalls.ll

14 lines

vec_round.ll

Diff 241191

llvm/lib/Target/X86/X86ISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 1,072 Lines • ▼ Show 20 Lines	for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
setOperationAction(ISD::FCEIL, RoundedTy, Legal);		setOperationAction(ISD::FCEIL, RoundedTy, Legal);
setOperationAction(ISD::STRICT_FCEIL, RoundedTy, Legal);		setOperationAction(ISD::STRICT_FCEIL, RoundedTy, Legal);
setOperationAction(ISD::FTRUNC, RoundedTy, Legal);		setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
setOperationAction(ISD::STRICT_FTRUNC, RoundedTy, Legal);		setOperationAction(ISD::STRICT_FTRUNC, RoundedTy, Legal);
setOperationAction(ISD::FRINT, RoundedTy, Legal);		setOperationAction(ISD::FRINT, RoundedTy, Legal);
setOperationAction(ISD::STRICT_FRINT, RoundedTy, Legal);		setOperationAction(ISD::STRICT_FRINT, RoundedTy, Legal);
setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);		setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
setOperationAction(ISD::STRICT_FNEARBYINT, RoundedTy, Legal);		setOperationAction(ISD::STRICT_FNEARBYINT, RoundedTy, Legal);

		setOperationAction(ISD::FROUND, RoundedTy, Custom);
}		}

setOperationAction(ISD::SMAX, MVT::v16i8, Legal);		setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
setOperationAction(ISD::SMAX, MVT::v4i32, Legal);		setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
setOperationAction(ISD::UMAX, MVT::v8i16, Legal);		setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
setOperationAction(ISD::UMAX, MVT::v4i32, Legal);		setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
setOperationAction(ISD::SMIN, MVT::v16i8, Legal);		setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
setOperationAction(ISD::SMIN, MVT::v4i32, Legal);		setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
▲ Show 20 Lines • Show All 76 Lines • ▼ Show 20 Lines	for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
setOperationAction(ISD::FCEIL, VT, Legal);		setOperationAction(ISD::FCEIL, VT, Legal);
setOperationAction(ISD::STRICT_FCEIL, VT, Legal);		setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
setOperationAction(ISD::FTRUNC, VT, Legal);		setOperationAction(ISD::FTRUNC, VT, Legal);
setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);		setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
setOperationAction(ISD::FRINT, VT, Legal);		setOperationAction(ISD::FRINT, VT, Legal);
setOperationAction(ISD::STRICT_FRINT, VT, Legal);		setOperationAction(ISD::STRICT_FRINT, VT, Legal);
setOperationAction(ISD::FNEARBYINT, VT, Legal);		setOperationAction(ISD::FNEARBYINT, VT, Legal);
setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);		setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);

		setOperationAction(ISD::FROUND, VT, Custom);

setOperationAction(ISD::FNEG, VT, Custom);		setOperationAction(ISD::FNEG, VT, Custom);
setOperationAction(ISD::FABS, VT, Custom);		setOperationAction(ISD::FABS, VT, Custom);
setOperationAction(ISD::FCOPYSIGN, VT, Custom);		setOperationAction(ISD::FCOPYSIGN, VT, Custom);
}		}

// (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted		// (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
// even though v8i16 is a legal type.		// even though v8i16 is a legal type.
setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);		setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
▲ Show 20 Lines • Show All 349 Lines • ▼ Show 20 Lines	for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
setOperationAction(ISD::STRICT_FCEIL, VT, Legal);		setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
setOperationAction(ISD::FTRUNC, VT, Legal);		setOperationAction(ISD::FTRUNC, VT, Legal);
setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);		setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
setOperationAction(ISD::FRINT, VT, Legal);		setOperationAction(ISD::FRINT, VT, Legal);
setOperationAction(ISD::STRICT_FRINT, VT, Legal);		setOperationAction(ISD::STRICT_FRINT, VT, Legal);
setOperationAction(ISD::FNEARBYINT, VT, Legal);		setOperationAction(ISD::FNEARBYINT, VT, Legal);
setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);		setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);

		setOperationAction(ISD::FROUND, VT, Custom);

setOperationAction(ISD::SELECT, VT, Custom);		setOperationAction(ISD::SELECT, VT, Custom);
}		}

// Without BWI we need to use custom lowering to handle MVT::v64i8 input.		// Without BWI we need to use custom lowering to handle MVT::v64i8 input.
for (auto VT : {MVT::v16i32, MVT::v8i64, MVT::v64i8}) {		for (auto VT : {MVT::v16i32, MVT::v8i64, MVT::v64i8}) {
setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);		setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);		setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
}		}
▲ Show 20 Lines • Show All 18,899 Lines • ▼ Show 20 Lines
/// Depending on uarch and/or optimizing for size, we might prefer to use a		/// Depending on uarch and/or optimizing for size, we might prefer to use a
/// vector operation in place of the typical scalar operation.		/// vector operation in place of the typical scalar operation.
SDValue X86TargetLowering::lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const {		SDValue X86TargetLowering::lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const {
assert((Op.getValueType() == MVT::f32 \|\| Op.getValueType() == MVT::f64) &&		assert((Op.getValueType() == MVT::f32 \|\| Op.getValueType() == MVT::f64) &&
"Only expecting float/double");		"Only expecting float/double");
return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);		return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);
}		}

		/// ISD::FROUND is defined to round to nearest with ties rounding away from 0.
		spatelUnsubmitted Not Done Reply Inline Actions Include at least part of the text from this patch description as a block comment for this function, so: /// ISD::FROUND is defined to round to nearest with ties rounding /// away from 0. This mode isn't supported in hardware on X86. /// But as long as we aren't compiling with trapping math, we can /// emulate this with floor(X + copysign(nextafter(0.5, 0.0), X)). /// ... spatel: Include at least part of the text from this patch description as a block comment for this…
		/// This mode isn't supported in hardware on X86. But as long as we aren't
		/// compiling with trapping math, we can emulate this with
		/// trunc(X + copysign(nextafter(0.5, 0.0), X)).
		static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) {
		SDValue N0 = Op.getOperand(0);
		SDLoc dl(Op);
		MVT VT = Op.getSimpleValueType();

		// N0 += copysign(nextafter(0.5, 0.0), N0)
		const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
		bool Ignored;
		APFloat Point5Pred = APFloat(0.5f);
		Point5Pred.convert(Sem, APFloat::rmNearestTiesToEven, &Ignored);
		Point5Pred.next(/*nextDown*/true);

		SDValue Adder = DAG.getNode(ISD::FCOPYSIGN, dl, VT,
		DAG.getConstantFP(Point5Pred, dl, VT), N0);
		N0 = DAG.getNode(ISD::FADD, dl, VT, N0, Adder);

		// Truncate the result to remove fraction.
		return DAG.getNode(ISD::FTRUNC, dl, VT, N0);
		}

/// The only differences between FABS and FNEG are the mask and the logic op.		/// The only differences between FABS and FNEG are the mask and the logic op.
/// FNEG also has a folding opportunity for FNEG(FABS(x)).		/// FNEG also has a folding opportunity for FNEG(FABS(x)).
static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {		static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
assert((Op.getOpcode() == ISD::FABS \|\| Op.getOpcode() == ISD::FNEG) &&		assert((Op.getOpcode() == ISD::FABS \|\| Op.getOpcode() == ISD::FNEG) &&
"Wrong opcode for lowering FABS or FNEG.");		"Wrong opcode for lowering FABS or FNEG.");

bool IsFABS = (Op.getOpcode() == ISD::FABS);		bool IsFABS = (Op.getOpcode() == ISD::FABS);

▲ Show 20 Lines • Show All 8,157 Lines • ▼ Show 20 Lines	SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::FP_EXTEND:		case ISD::FP_EXTEND:
case ISD::STRICT_FP_EXTEND: return LowerFP_EXTEND(Op, DAG);		case ISD::STRICT_FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
case ISD::FP_ROUND:		case ISD::FP_ROUND:
case ISD::STRICT_FP_ROUND: return LowerFP_ROUND(Op, DAG);		case ISD::STRICT_FP_ROUND: return LowerFP_ROUND(Op, DAG);
case ISD::LOAD: return LowerLoad(Op, Subtarget, DAG);		case ISD::LOAD: return LowerLoad(Op, Subtarget, DAG);
case ISD::STORE: return LowerStore(Op, Subtarget, DAG);		case ISD::STORE: return LowerStore(Op, Subtarget, DAG);
case ISD::FADD:		case ISD::FADD:
case ISD::FSUB: return lowerFaddFsub(Op, DAG);		case ISD::FSUB: return lowerFaddFsub(Op, DAG);
		case ISD::FROUND: return LowerFROUND(Op, DAG);
case ISD::FABS:		case ISD::FABS:
case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);		case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);		case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);		case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
case ISD::SETCC:		case ISD::SETCC:
case ISD::STRICT_FSETCC:		case ISD::STRICT_FSETCC:
case ISD::STRICT_FSETCCS: return LowerSETCC(Op, DAG);		case ISD::STRICT_FSETCCS: return LowerSETCC(Op, DAG);
case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);		case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
▲ Show 20 Lines • Show All 18,896 Lines • Show Last 20 Lines

llvm/test/CodeGen/X86/extractelement-fp.ll

Show First 20 Lines • Show All 1,061 Lines • ▼ Show 20 Lines	; X86-NEXT: retl
%v = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> %x)		%v = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> %x)
%r = extractelement <4 x double> %v, i32 0		%r = extractelement <4 x double> %v, i32 0
ret double %r		ret double %r
}		}

define float @round_v4f32(<4 x float> %x) nounwind {		define float @round_v4f32(<4 x float> %x) nounwind {
; X64-LABEL: round_v4f32:		; X64-LABEL: round_v4f32:
; X64: # %bb.0:		; X64: # %bb.0:
; X64-NEXT: jmp roundf # TAILCALL		; X64-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
		; X64-NEXT: vandps %xmm1, %xmm0, %xmm1
		; X64-NEXT: vbroadcastss {{.*#+}} xmm2 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
		; X64-NEXT: vorps %xmm1, %xmm2, %xmm1
		; X64-NEXT: vaddss %xmm1, %xmm0, %xmm0
		; X64-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0
		; X64-NEXT: retq
;		;
; X86-LABEL: round_v4f32:		; X86-LABEL: round_v4f32:
; X86: # %bb.0:		; X86: # %bb.0:
; X86-NEXT: pushl %eax		; X86-NEXT: pushl %eax
		; X86-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
		; X86-NEXT: vandps %xmm1, %xmm0, %xmm1
		; X86-NEXT: vbroadcastss {{.*#+}} xmm2 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
		; X86-NEXT: vorps %xmm1, %xmm2, %xmm1
		; X86-NEXT: vaddss %xmm1, %xmm0, %xmm0
		; X86-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)		; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: calll roundf		; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax		; X86-NEXT: popl %eax
; X86-NEXT: retl		; X86-NEXT: retl
%v = call <4 x float> @llvm.round.v4f32(<4 x float> %x)		%v = call <4 x float> @llvm.round.v4f32(<4 x float> %x)
%r = extractelement <4 x float> %v, i32 0		%r = extractelement <4 x float> %v, i32 0
ret float %r		ret float %r
}		}

define double @round_v4f64(<4 x double> %x) nounwind {		define double @round_v4f64(<4 x double> %x) nounwind {
; X64-LABEL: round_v4f64:		; X64-LABEL: round_v4f64:
; X64: # %bb.0:		; X64: # %bb.0:
; X64-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0		; X64-NEXT: vandpd {{.*}}(%rip), %xmm0, %xmm1
		; X64-NEXT: vmovddup {{.*#+}} xmm2 = [4.9999999999999994E-1,4.9999999999999994E-1]
		; X64-NEXT: # xmm2 = mem[0,0]
		; X64-NEXT: vorpd %xmm1, %xmm2, %xmm1
		; X64-NEXT: vaddsd %xmm1, %xmm0, %xmm0
		; X64-NEXT: vroundsd $11, %xmm0, %xmm0, %xmm0
; X64-NEXT: vzeroupper		; X64-NEXT: vzeroupper
; X64-NEXT: jmp round # TAILCALL		; X64-NEXT: retq
;		;
; X86-LABEL: round_v4f64:		; X86-LABEL: round_v4f64:
; X86: # %bb.0:		; X86: # %bb.0:
		; X86-NEXT: pushl %ebp
		; X86-NEXT: movl %esp, %ebp
		; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp		; X86-NEXT: subl $8, %esp
; X86-NEXT: vmovlps %xmm0, (%esp)		; X86-NEXT: vandpd {{\.LCPI.*}}, %xmm0, %xmm1
		; X86-NEXT: vmovddup {{.*#+}} xmm2 = [4.9999999999999994E-1,4.9999999999999994E-1]
		; X86-NEXT: # xmm2 = mem[0,0]
		; X86-NEXT: vorpd %xmm1, %xmm2, %xmm1
		; X86-NEXT: vaddsd %xmm1, %xmm0, %xmm0
		; X86-NEXT: vroundsd $11, %xmm0, %xmm0, %xmm0
		; X86-NEXT: vmovsd %xmm0, (%esp)
		; X86-NEXT: fldl (%esp)
		; X86-NEXT: movl %ebp, %esp
		; X86-NEXT: popl %ebp
; X86-NEXT: vzeroupper		; X86-NEXT: vzeroupper
; X86-NEXT: calll round
; X86-NEXT: addl $8, %esp
; X86-NEXT: retl		; X86-NEXT: retl
%v = call <4 x double> @llvm.round.v4f64(<4 x double> %x)		%v = call <4 x double> @llvm.round.v4f64(<4 x double> %x)
%r = extractelement <4 x double> %v, i32 0		%r = extractelement <4 x double> %v, i32 0
ret double %r		ret double %r
}		}

define float @rcp_v4f32(<4 x float> %x) nounwind {		define float @rcp_v4f32(<4 x float> %x) nounwind {
; X64-LABEL: rcp_v4f32:		; X64-LABEL: rcp_v4f32:
▲ Show 20 Lines • Show All 115 Lines • Show Last 20 Lines

llvm/test/CodeGen/X86/vec-libcalls.ll

	Show First 20 Lines • Show All 380 Lines • ▼ Show 20 Lines
	; CHECK-NEXT: retq			; CHECK-NEXT: retq
	%r = call <2 x float> @llvm.rint.v2f32(<2 x float> %x)			%r = call <2 x float> @llvm.rint.v2f32(<2 x float> %x)
	ret <2 x float> %r			ret <2 x float> %r
	}			}

	define <2 x float> @round_v2f32(<2 x float> %x) nounwind {			define <2 x float> @round_v2f32(<2 x float> %x) nounwind {
	; CHECK-LABEL: round_v2f32:			; CHECK-LABEL: round_v2f32:
	; CHECK: # %bb.0:			; CHECK: # %bb.0:
	; CHECK-NEXT: subq $40, %rsp			; CHECK-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm1
	; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill			; CHECK-NEXT: vorps {{.*}}(%rip), %xmm1, %xmm1
	; CHECK-NEXT: callq roundf			; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0
	; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill			; CHECK-NEXT: vroundps $11, %xmm0, %xmm0
	; CHECK-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload
	; CHECK-NEXT: # xmm0 = mem[1,1,3,3]
	; CHECK-NEXT: callq roundf
	; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
	; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
	; CHECK-NEXT: addq $40, %rsp
	; CHECK-NEXT: retq			; CHECK-NEXT: retq
	%r = call <2 x float> @llvm.round.v2f32(<2 x float> %x)			%r = call <2 x float> @llvm.round.v2f32(<2 x float> %x)
	ret <2 x float> %r			ret <2 x float> %r
	}			}

	define <2 x float> @sqrt_v2f32(<2 x float> %x) nounwind {			define <2 x float> @sqrt_v2f32(<2 x float> %x) nounwind {
	; CHECK-LABEL: sqrt_v2f32:			; CHECK-LABEL: sqrt_v2f32:
	; CHECK: # %bb.0:			; CHECK: # %bb.0:
	Show All 15 Lines

llvm/test/CodeGen/X86/vec_round.ll

This file was deleted.

	; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
	; RUN: llc -mcpu=nehalem -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
	target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
	target triple = "x86_64-unknown-linux-gnu"

	declare void @use(<2 x double>)

	; Function Attrs: nounwind uwtable
	define void @test() {
	; CHECK-LABEL: test:
	; CHECK: # %bb.0: # %entry
	; CHECK-NEXT: pushq %rax
	; CHECK-NEXT: .cfi_def_cfa_offset 16
	; CHECK-NEXT: callq round
	; CHECK-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0]
	; CHECK-NEXT: callq use
	; CHECK-NEXT: popq %rax
	; CHECK-NEXT: .cfi_def_cfa_offset 8
	; CHECK-NEXT: retq
	entry:
	%tmp = call <2 x double> @llvm.round.v2f64(<2 x double> undef)
	call void @use(<2 x double> %tmp)
	ret void
	}

	; Function Attrs: nounwind readonly
	declare <2 x double> @llvm.round.v2f64(<2 x double>) #0

	attributes #0 = { nounwind readonly }