Index: llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h =================================================================== --- llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h +++ llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h @@ -228,6 +228,8 @@ ArrayRef Src1Regs, ArrayRef Src2Regs, LLT NarrowTy); + void changeOpcode(MachineInstr &MI, unsigned NewOpcode); + public: /// Return the alignment to use for a stack temporary object with the given /// type. Index: llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h =================================================================== --- llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h +++ llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h @@ -667,6 +667,15 @@ Types2); } + /// The instruction is emitted as a library call. + LegalizeRuleSet &libcall() { + using namespace LegalizeMutations; + // We have no choice but conservatively assume that predicate-less lowering + // properly handles all type indices by design: + markAllIdxsAsCovered(); + return actionIf(LegalizeAction::Libcall, always); + } + /// Like legalIf, but for the Libcall action. LegalizeRuleSet &libcallIf(LegalityPredicate Predicate) { // We have no choice but conservatively assume that a libcall with a Index: llvm/include/llvm/Support/TargetOpcodes.def =================================================================== --- llvm/include/llvm/Support/TargetOpcodes.def +++ llvm/include/llvm/Support/TargetOpcodes.def @@ -297,6 +297,9 @@ /// INTRINSIC round to integer intrinsic. HANDLE_TARGET_OPCODE(G_INTRINSIC_LRINT) +/// INTRINSIC roundeven intrinsic. +HANDLE_TARGET_OPCODE(G_INTRINSIC_ROUNDEVEN) + /// INTRINSIC readcyclecounter HANDLE_TARGET_OPCODE(G_READCYCLECOUNTER) Index: llvm/include/llvm/Target/GenericOpcodes.td =================================================================== --- llvm/include/llvm/Target/GenericOpcodes.td +++ llvm/include/llvm/Target/GenericOpcodes.td @@ -918,6 +918,12 @@ let hasSideEffects = 0; } +def G_INTRINSIC_ROUNDEVEN : GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src1); + let hasSideEffects = 0; +} + def G_READCYCLECOUNTER : GenericInstruction { let OutOperandList = (outs type0:$dst); let InOperandList = (ins); Index: llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp =================================================================== --- llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -1280,6 +1280,8 @@ return TargetOpcode::G_FRINT; case Intrinsic::round: return TargetOpcode::G_INTRINSIC_ROUND; + case Intrinsic::roundeven: + return TargetOpcode::G_INTRINSIC_ROUNDEVEN; case Intrinsic::sin: return TargetOpcode::G_FSIN; case Intrinsic::sqrt: Index: llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp =================================================================== --- llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -77,6 +77,8 @@ return Type::getFloatTy(Ctx); case 64: return Type::getDoubleTy(Ctx); + case 80: + return Type::getX86_FP80Ty(Ctx); case 128: return Type::getFP128Ty(Ctx); default: @@ -386,7 +388,7 @@ } static RTLIB::Libcall getRTLibDesc(unsigned Opcode, unsigned Size) { -#define RTLIBCASE(LibcallPrefix) \ +#define RTLIBCASE_INT(LibcallPrefix) \ do { \ switch (Size) { \ case 32: \ @@ -400,19 +402,33 @@ } \ } while (0) - assert((Size == 32 || Size == 64 || Size == 128) && "Unsupported size"); +#define RTLIBCASE(LibcallPrefix) \ + do { \ + switch (Size) { \ + case 32: \ + 
return RTLIB::LibcallPrefix##32; \ + case 64: \ + return RTLIB::LibcallPrefix##64; \ + case 80: \ + return RTLIB::LibcallPrefix##80; \ + case 128: \ + return RTLIB::LibcallPrefix##128; \ + default: \ + llvm_unreachable("unexpected size"); \ + } \ + } while (0) switch (Opcode) { case TargetOpcode::G_SDIV: - RTLIBCASE(SDIV_I); + RTLIBCASE_INT(SDIV_I); case TargetOpcode::G_UDIV: - RTLIBCASE(UDIV_I); + RTLIBCASE_INT(UDIV_I); case TargetOpcode::G_SREM: - RTLIBCASE(SREM_I); + RTLIBCASE_INT(SREM_I); case TargetOpcode::G_UREM: - RTLIBCASE(UREM_I); + RTLIBCASE_INT(UREM_I); case TargetOpcode::G_CTLZ_ZERO_UNDEF: - RTLIBCASE(CTLZ_I); + RTLIBCASE_INT(CTLZ_I); case TargetOpcode::G_FADD: RTLIBCASE(ADD_F); case TargetOpcode::G_FSUB: @@ -455,6 +471,8 @@ RTLIBCASE(RINT_F); case TargetOpcode::G_FNEARBYINT: RTLIBCASE(NEARBYINT_F); + case TargetOpcode::G_INTRINSIC_ROUNDEVEN: + RTLIBCASE(ROUNDEVEN_F); } llvm_unreachable("Unknown libcall function"); } @@ -670,10 +688,11 @@ case TargetOpcode::G_FMAXNUM: case TargetOpcode::G_FSQRT: case TargetOpcode::G_FRINT: - case TargetOpcode::G_FNEARBYINT: { + case TargetOpcode::G_FNEARBYINT: + case TargetOpcode::G_INTRINSIC_ROUNDEVEN: { Type *HLTy = getFloatTypeForLLT(Ctx, LLTy); - if (!HLTy || (Size != 32 && Size != 64 && Size != 128)) { - LLVM_DEBUG(dbgs() << "No libcall available for size " << Size << ".\n"); + if (!HLTy) { + LLVM_DEBUG(dbgs() << "No libcall available for type " << LLTy << ".\n"); return UnableToLegalize; } auto Status = simpleLibcall(MI, MIRBuilder, Size, HLTy); @@ -2163,6 +2182,7 @@ case TargetOpcode::G_FPOW: case TargetOpcode::G_INTRINSIC_TRUNC: case TargetOpcode::G_INTRINSIC_ROUND: + case TargetOpcode::G_INTRINSIC_ROUNDEVEN: assert(TypeIdx == 0); Observer.changingInstr(MI); @@ -2363,6 +2383,13 @@ } } +// Legalize an instruction by changing the opcode in place. +void LegalizerHelper::changeOpcode(MachineInstr &MI, unsigned NewOpcode) { + Observer.changingInstr(MI); + MI.setDesc(MIRBuilder.getTII().get(NewOpcode)); + Observer.changedInstr(MI); +} + LegalizerHelper::LegalizeResult LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT Ty) { using namespace TargetOpcode; @@ -2461,6 +2488,12 @@ return lowerFFloor(MI); case TargetOpcode::G_INTRINSIC_ROUND: return lowerIntrinsicRound(MI); + case TargetOpcode::G_INTRINSIC_ROUNDEVEN: { + // Since round even is the assumed rounding mode for unconstrained FP + // operations, rint and roundeven are the same operation. 
+ changeOpcode(MI, TargetOpcode::G_FRINT); + return Legalized; + } case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: { Register OldValRes = MI.getOperand(0).getReg(); Register SuccessRes = MI.getOperand(1).getReg(); @@ -3574,6 +3607,7 @@ case G_FFLOOR: case G_FRINT: case G_INTRINSIC_ROUND: + case G_INTRINSIC_ROUNDEVEN: case G_INTRINSIC_TRUNC: case G_FCOS: case G_FSIN: Index: llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -721,7 +721,8 @@ .scalarize(0) .lower(); - getActionDefinitionsBuilder(G_INTRINSIC_ROUND) + // Lower roundeven into G_FRINT + getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN}) .scalarize(0) .lower(); Index: llvm/lib/Target/X86/X86LegalizerInfo.cpp =================================================================== --- llvm/lib/Target/X86/X86LegalizerInfo.cpp +++ llvm/lib/Target/X86/X86LegalizerInfo.cpp @@ -70,6 +70,11 @@ setLegalizerInfoAVX512DQ(); setLegalizerInfoAVX512BW(); + getActionDefinitionsBuilder(G_INTRINSIC_ROUNDEVEN) + .scalarize(0) + .minScalar(0, LLT::scalar(32)) + .libcall(); + setLegalizeScalarToDifferentSizeStrategy(G_PHI, 0, widen_1); for (unsigned BinOp : {G_SUB, G_MUL, G_AND, G_OR, G_XOR}) setLegalizeScalarToDifferentSizeStrategy(BinOp, 0, widen_1); Index: llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir =================================================================== --- llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir +++ llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir @@ -132,6 +132,9 @@ # DEBUG-NEXT: G_INTRINSIC_LRINT (opcode {{[0-9]+}}): 2 type indices, 0 imm indices # DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined # DEBUG-NEXT:.. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_INTRINSIC_ROUNDEVEN (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined # DEBUG-NEXT: G_READCYCLECOUNTER (opcode {{[0-9]+}}): 1 type index, 0 imm indices # DEBUG-NEXT: .. 
type index coverage check SKIPPED: no rules defined Index: llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll @@ -0,0 +1,566 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=GFX6 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=hawaii < %s | FileCheck -check-prefix=GFX7 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=fiji < %s | FileCheck -check-prefix=GFX8 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s + +define float @v_roundeven_f32(float %x) { +; GFX6-LABEL: v_roundeven_f32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_rndne_f32_e32 v0, v0 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: v_roundeven_f32: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_rndne_f32_e32 v0, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_roundeven_f32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_rndne_f32_e32 v0, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_roundeven_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_rndne_f32_e32 v0, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %roundeven = call float @llvm.roundeven.f32(float %x) + ret float %roundeven +} + +define <2 x float> @v_roundeven_v2f32(<2 x float> %x) { +; GFX6-LABEL: v_roundeven_v2f32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_rndne_f32_e32 v0, v0 +; GFX6-NEXT: v_rndne_f32_e32 v1, v1 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: v_roundeven_v2f32: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_rndne_f32_e32 v0, v0 +; GFX7-NEXT: v_rndne_f32_e32 v1, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_roundeven_v2f32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_rndne_f32_e32 v0, v0 +; GFX8-NEXT: v_rndne_f32_e32 v1, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_roundeven_v2f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_rndne_f32_e32 v0, v0 +; GFX9-NEXT: v_rndne_f32_e32 v1, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %roundeven = call <2 x float> @llvm.roundeven.v2f32(<2 x float> %x) + ret <2 x float> %roundeven +} + +define <3 x float> @v_roundeven_v3f32(<3 x float> %x) { +; GFX6-LABEL: v_roundeven_v3f32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_rndne_f32_e32 v0, v0 +; GFX6-NEXT: v_rndne_f32_e32 v1, v1 +; GFX6-NEXT: v_rndne_f32_e32 v2, v2 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: v_roundeven_v3f32: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_rndne_f32_e32 v0, v0 +; GFX7-NEXT: v_rndne_f32_e32 v1, v1 +; GFX7-NEXT: v_rndne_f32_e32 v2, v2 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_roundeven_v3f32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_rndne_f32_e32 v0, v0 +; GFX8-NEXT: v_rndne_f32_e32 v1, v1 +; GFX8-NEXT: v_rndne_f32_e32 v2, v2 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_roundeven_v3f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_rndne_f32_e32 v0, v0 +; GFX9-NEXT: 
v_rndne_f32_e32 v1, v1 +; GFX9-NEXT: v_rndne_f32_e32 v2, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %roundeven = call <3 x float> @llvm.roundeven.v3f32(<3 x float> %x) + ret <3 x float> %roundeven +} + +define <4 x float> @v_roundeven_v4f32(<4 x float> %x) { +; GFX6-LABEL: v_roundeven_v4f32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_rndne_f32_e32 v0, v0 +; GFX6-NEXT: v_rndne_f32_e32 v1, v1 +; GFX6-NEXT: v_rndne_f32_e32 v2, v2 +; GFX6-NEXT: v_rndne_f32_e32 v3, v3 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: v_roundeven_v4f32: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_rndne_f32_e32 v0, v0 +; GFX7-NEXT: v_rndne_f32_e32 v1, v1 +; GFX7-NEXT: v_rndne_f32_e32 v2, v2 +; GFX7-NEXT: v_rndne_f32_e32 v3, v3 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_roundeven_v4f32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_rndne_f32_e32 v0, v0 +; GFX8-NEXT: v_rndne_f32_e32 v1, v1 +; GFX8-NEXT: v_rndne_f32_e32 v2, v2 +; GFX8-NEXT: v_rndne_f32_e32 v3, v3 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_roundeven_v4f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_rndne_f32_e32 v0, v0 +; GFX9-NEXT: v_rndne_f32_e32 v1, v1 +; GFX9-NEXT: v_rndne_f32_e32 v2, v2 +; GFX9-NEXT: v_rndne_f32_e32 v3, v3 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %roundeven = call <4 x float> @llvm.roundeven.v4f32(<4 x float> %x) + ret <4 x float> %roundeven +} + +define half @v_roundeven_f16(half %x) { +; GFX6-LABEL: v_roundeven_f16: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_rndne_f32_e32 v0, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: v_roundeven_f16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_rndne_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_roundeven_f16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_rndne_f16_e32 v0, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_roundeven_f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_rndne_f16_e32 v0, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %roundeven = call half @llvm.roundeven.f16(half %x) + ret half %roundeven +} + +define <2 x half> @v_roundeven_v2f16(<2 x half> %x) { +; GFX6-LABEL: v_roundeven_v2f16: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_rndne_f32_e32 v0, v0 +; GFX6-NEXT: v_rndne_f32_e32 v1, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: v_roundeven_v2f16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_rndne_f32_e32 v0, v0 +; GFX7-NEXT: v_rndne_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_roundeven_v2f16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_rndne_f16_e32 v1, v0 +; GFX8-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; 
GFX8-NEXT: v_mov_b32_e32 v2, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_roundeven_v2f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_rndne_f16_e32 v1, v0 +; GFX9-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX9-NEXT: v_and_or_b32 v0, v1, v2, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %roundeven = call <2 x half> @llvm.roundeven.v2f16(<2 x half> %x) + ret <2 x half> %roundeven +} + +define <2 x half> @v_roundeven_v2f16_fneg(<2 x half> %x) { +; GFX6-LABEL: v_roundeven_v2f16_fneg: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX6-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GFX6-NEXT: v_rndne_f32_e32 v0, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_rndne_f32_e32 v1, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: v_roundeven_v2f16_fneg: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GFX7-NEXT: v_rndne_f32_e32 v0, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_rndne_f32_e32 v1, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_roundeven_v2f16_fneg: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 +; GFX8-NEXT: v_rndne_f16_e32 v1, v0 +; GFX8-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX8-NEXT: v_mov_b32_e32 v2, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_roundeven_v2f16_fneg: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 +; GFX9-NEXT: v_rndne_f16_e32 v1, v0 +; GFX9-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX9-NEXT: v_and_or_b32 v0, v1, v2, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %x.fneg = fneg <2 x half> %x + %roundeven = call <2 x half> @llvm.roundeven.v2f16(<2 x half> %x.fneg) + ret <2 x half> %roundeven +} + +define <4 x half> @v_roundeven_v4f16(<4 x half> %x) { +; GFX6-LABEL: v_roundeven_v4f16: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_rndne_f32_e32 v0, v0 +; GFX6-NEXT: v_rndne_f32_e32 v1, v1 +; GFX6-NEXT: v_rndne_f32_e32 v2, v2 +; GFX6-NEXT: 
v_rndne_f32_e32 v3, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: v_roundeven_v4f16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_rndne_f32_e32 v0, v0 +; GFX7-NEXT: v_rndne_f32_e32 v1, v1 +; GFX7-NEXT: v_rndne_f32_e32 v2, v2 +; GFX7-NEXT: v_rndne_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_roundeven_v4f16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_rndne_f16_e32 v2, v0 +; GFX8-NEXT: v_rndne_f16_e32 v3, v1 +; GFX8-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX8-NEXT: v_mov_b32_e32 v4, 16 +; GFX8-NEXT: v_rndne_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_roundeven_v4f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_rndne_f16_e32 v2, v0 +; GFX9-NEXT: v_rndne_f16_e32 v3, v1 +; GFX9-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff +; GFX9-NEXT: v_rndne_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-NEXT: v_and_or_b32 v0, v2, v4, v0 +; GFX9-NEXT: v_and_or_b32 v1, v3, v4, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %roundeven = call <4 x half> @llvm.roundeven.v4f16(<4 x half> %x) + ret <4 x half> %roundeven +} + + +define float @v_roundeven_f32_fabs(float %x) { +; GFX6-LABEL: v_roundeven_f32_fabs: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_rndne_f32_e64 v0, |v0| +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: v_roundeven_f32_fabs: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_rndne_f32_e64 v0, |v0| +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_roundeven_f32_fabs: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_rndne_f32_e64 v0, |v0| +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_roundeven_f32_fabs: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_rndne_f32_e64 v0, |v0| +; GFX9-NEXT: s_setpc_b64 s[30:31] + %fabs.x = call float @llvm.fabs.f32(float %x) + %roundeven = call float @llvm.roundeven.f32(float %fabs.x) + ret float %roundeven +} + +define amdgpu_ps float @s_roundeven_f32(float inreg %x) { +; GFX6-LABEL: s_roundeven_f32: +; GFX6: ; %bb.0: +; GFX6-NEXT: v_rndne_f32_e32 v0, s0 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX7-LABEL: s_roundeven_f32: +; GFX7: ; %bb.0: +; GFX7-NEXT: v_rndne_f32_e32 v0, s0 +; GFX7-NEXT: ; return to shader 
part epilog +; +; GFX8-LABEL: s_roundeven_f32: +; GFX8: ; %bb.0: +; GFX8-NEXT: v_rndne_f32_e32 v0, s0 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_roundeven_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_rndne_f32_e32 v0, s0 +; GFX9-NEXT: ; return to shader part epilog + %roundeven = call float @llvm.roundeven.f32(float %x) + ret float %roundeven +} + +define float @v_roundeven_f32_fneg(float %x) { +; GFX6-LABEL: v_roundeven_f32_fneg: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_rndne_f32_e64 v0, -v0 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: v_roundeven_f32_fneg: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_rndne_f32_e64 v0, -v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_roundeven_f32_fneg: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_rndne_f32_e64 v0, -v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_roundeven_f32_fneg: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_rndne_f32_e64 v0, -v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %neg.x = fneg float %x + %roundeven = call float @llvm.roundeven.f32(float %neg.x) + ret float %roundeven +} + +define double @v_roundeven_f64(double %x) { +; GFX6-LABEL: v_roundeven_f64: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_and_b32_e32 v3, 0x80000000, v1 +; GFX6-NEXT: v_mov_b32_e32 v2, 0 +; GFX6-NEXT: v_or_b32_e32 v3, 0x43300000, v3 +; GFX6-NEXT: v_add_f64 v[4:5], v[0:1], v[2:3] +; GFX6-NEXT: s_mov_b32 s4, -1 +; GFX6-NEXT: s_mov_b32 s5, 0x432fffff +; GFX6-NEXT: v_add_f64 v[2:3], v[4:5], -v[2:3] +; GFX6-NEXT: v_cmp_gt_f64_e64 vcc, |v[0:1]|, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: v_roundeven_f64: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_rndne_f64_e32 v[0:1], v[0:1] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_roundeven_f64: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_rndne_f64_e32 v[0:1], v[0:1] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_roundeven_f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_rndne_f64_e32 v[0:1], v[0:1] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %roundeven = call double @llvm.roundeven.f64(double %x) + ret double %roundeven +} + +define double @v_roundeven_f64_fneg(double %x) { +; GFX6-LABEL: v_roundeven_f64_fneg: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_xor_b32_e32 v2, 0x80000000, v1 +; GFX6-NEXT: v_and_b32_e32 v4, 0x80000000, v2 +; GFX6-NEXT: v_mov_b32_e32 v3, 0 +; GFX6-NEXT: v_or_b32_e32 v4, 0x43300000, v4 +; GFX6-NEXT: v_add_f64 v[5:6], -v[0:1], v[3:4] +; GFX6-NEXT: v_mov_b32_e32 v1, v0 +; GFX6-NEXT: s_mov_b32 s4, -1 +; GFX6-NEXT: s_mov_b32 s5, 0x432fffff +; GFX6-NEXT: v_add_f64 v[3:4], v[5:6], -v[3:4] +; GFX6-NEXT: v_cmp_gt_f64_e64 vcc, |v[1:2]|, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: v_roundeven_f64_fneg: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_rndne_f64_e64 v[0:1], -v[0:1] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_roundeven_f64_fneg: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX8-NEXT: v_rndne_f64_e64 v[0:1], -v[0:1] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_roundeven_f64_fneg: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_rndne_f64_e64 v[0:1], -v[0:1] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %neg.x = fneg double %x + %roundeven = call double @llvm.roundeven.f64(double %neg.x) + ret double %roundeven +} + +define <2 x double> @v_roundeven_v2f64(<2 x double> %x) { +; GFX6-LABEL: v_roundeven_v2f64: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_brev_b32 s6, 1 +; GFX6-NEXT: s_mov_b32 s7, 0x43300000 +; GFX6-NEXT: v_and_b32_e32 v5, s6, v1 +; GFX6-NEXT: v_mov_b32_e32 v4, 0 +; GFX6-NEXT: v_or_b32_e32 v5, s7, v5 +; GFX6-NEXT: v_add_f64 v[6:7], v[0:1], v[4:5] +; GFX6-NEXT: s_mov_b32 s4, -1 +; GFX6-NEXT: s_mov_b32 s5, 0x432fffff +; GFX6-NEXT: v_add_f64 v[5:6], v[6:7], -v[4:5] +; GFX6-NEXT: v_cmp_gt_f64_e64 vcc, |v[0:1]|, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GFX6-NEXT: v_and_b32_e32 v5, s6, v3 +; GFX6-NEXT: v_or_b32_e32 v5, s7, v5 +; GFX6-NEXT: v_add_f64 v[7:8], v[2:3], v[4:5] +; GFX6-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc +; GFX6-NEXT: v_add_f64 v[4:5], v[7:8], -v[4:5] +; GFX6-NEXT: v_cmp_gt_f64_e64 vcc, |v[2:3]|, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: v_roundeven_v2f64: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_rndne_f64_e32 v[0:1], v[0:1] +; GFX7-NEXT: v_rndne_f64_e32 v[2:3], v[2:3] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_roundeven_v2f64: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_rndne_f64_e32 v[0:1], v[0:1] +; GFX8-NEXT: v_rndne_f64_e32 v[2:3], v[2:3] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_roundeven_v2f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_rndne_f64_e32 v[0:1], v[0:1] +; GFX9-NEXT: v_rndne_f64_e32 v[2:3], v[2:3] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %roundeven = call <2 x double> @llvm.roundeven.v2f64(<2 x double> %x) + ret <2 x double> %roundeven +} + +declare half @llvm.roundeven.f16(half) #0 +declare <2 x half> @llvm.roundeven.v2f16(<2 x half>) #0 +declare <4 x half> @llvm.roundeven.v4f16(<4 x half>) #0 + +declare float @llvm.roundeven.f32(float) #0 +declare <2 x float> @llvm.roundeven.v2f32(<2 x float>) #0 +declare <3 x float> @llvm.roundeven.v3f32(<3 x float>) #0 +declare <4 x float> @llvm.roundeven.v4f32(<4 x float>) #0 + +declare double @llvm.roundeven.f64(double) #0 +declare <2 x double> @llvm.roundeven.v2f64(<2 x double>) #0 + +declare half @llvm.fabs.f16(half) #0 +declare float @llvm.fabs.f32(float) #0 + +attributes #0 = { nounwind readnone speculatable willreturn } Index: llvm/test/CodeGen/X86/GlobalISel/roundeven.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/X86/GlobalISel/roundeven.ll @@ -0,0 +1,68 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -mtriple=x86_64-linux-gnu < %s | FileCheck %s + +; FIXME: Calling convention lowering fails +; define half @roundeven_f16(half %x) { +; %roundeven = call half @llvm.roundeven.f16(half %x) +; ret half %roundeven +; } + +define float @roundeven_f32(float %x) { +; CHECK-LABEL: roundeven_f32: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; 
CHECK-NEXT: callq roundevenf +; CHECK-NEXT: popq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq + %roundeven = call float @llvm.roundeven.f32(float %x) + ret float %roundeven +} + +define double @roundeven_f64(double %x) { +; CHECK-LABEL: roundeven_f64: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: callq roundeven +; CHECK-NEXT: popq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq + %roundeven = call double @llvm.roundeven.f64(double %x) + ret double %roundeven +} + +; FIXME: Insert fails +; define x86_fp80 @roundeven_fp80(x86_fp80 %x) { +; %roundeven = call x86_fp80 @llvm.roundeven.f80(x86_fp80 %x) +; ret x86_fp80 %roundeven +; } + +define fp128 @roundeven_f128(fp128 %x) { +; CHECK-LABEL: roundeven_f128: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: callq roundevenl +; CHECK-NEXT: popq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq + %roundeven = call fp128 @llvm.roundeven.f128(fp128 %x) + ret fp128 %roundeven +} + +; FIXME: Fails on build_vector +; define <4 x float> @roundeven_v4f32(<4 x float> %x) { +; %roundeven = call <4 x float> @llvm.roundeven.v4f32(<4 x float> %x) +; ret <4 x float> %roundeven +; } + +declare half @llvm.roundeven.f16(half) #0 +declare float @llvm.roundeven.f32(float) #0 +declare <4 x float> @llvm.roundeven.v4f32(<4 x float>) #0 +declare double @llvm.roundeven.f64(double) #0 +declare x86_fp80 @llvm.roundeven.f80(x86_fp80) #0 +declare fp128 @llvm.roundeven.f128(fp128) #0 + +attributes #0 = { nounwind readnone speculatable willreturn }
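
Note (illustrative usage sketch, not part of the patch): with the new G_INTRINSIC_ROUNDEVEN opcode and the predicate-less libcall() rule in place, a target's legalizer info can describe the operation with the usual rule-set combinators. The snippet below is a hypothetical target's rules, for illustration only; the local LLT aliases and the choice of {s32, s64} as natively supported types are assumptions, not anything this patch adds.

  // Hypothetical target rules, for illustration only.
  const LLT s32 = LLT::scalar(32);
  const LLT s64 = LLT::scalar(64);
  getActionDefinitionsBuilder(G_INTRINSIC_ROUNDEVEN)
      .legalFor({s32, s64})  // assume a native round-to-even instruction
      .scalarize(0)          // split vectors into scalar operations
      .lower();              // otherwise rewrite in place to G_FRINT
                             // via the new changeOpcode() path

The .lower() fallback relies on G_FRINT itself having rules, as in the AMDGPU change above. Targets with no suitable instruction can instead end the chain with .libcall(), as the X86 rules in this patch do, which emits a call to roundevenf/roundeven/roundevenl chosen by size through getRTLibDesc(), matching the calls checked in the X86 test.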