diff --git a/llvm/lib/Target/PowerPC/P10InstrResources.td b/llvm/lib/Target/PowerPC/P10InstrResources.td --- a/llvm/lib/Target/PowerPC/P10InstrResources.td +++ b/llvm/lib/Target/PowerPC/P10InstrResources.td @@ -312,7 +312,7 @@ (instrs BCLR, BCLRn, BDNZLR, BDNZLR8, BDNZLRm, BDNZLRp, BDZLR, BDZLR8, BDZLRm, BDZLRp, gBCLR, BCLRL, BCLRLn, BDNZLRL, BDNZLRLm, BDNZLRLp, BDZLRL, BDZLRLm, BDZLRLp, gBCLRL, - BL, BL8, BL8_NOP, BL8_NOP_TLS, BL8_NOTOC, BL8_NOTOC_TLS, BL8_TLS, BL8_TLS_, BLR, BLR8, BLRL, BL_NOP, BL_TLS + BL, BL8, BL8_NOP, BL8_NOP_RM, BL8_NOP_TLS, BL8_NOTOC, BL8_NOTOC_RM, BL8_NOTOC_TLS, BL8_RM, BL8_TLS, BL8_TLS_, BLR, BLR8, BLRL, BL_NOP, BL_NOP_RM, BL_RM, BL_TLS )>; // 2 Cycles Branch operations, 1 input operands @@ -320,9 +320,9 @@ (instrs B, BCC, BCCA, BCCCTR, BCCCTR8, BCCCTRL, BCCCTRL8, BCCL, BCCLA, BCCLR, BCCLRL, CTRL_DEP, TAILB, TAILB8, BA, TAILBA, TAILBA8, - BC, BCTR, BCTR8, BCTRL, BCTRL8, BCTRL8_LDinto_toc, BCTRL_LWZinto_toc, BCn, BDNZ, BDNZ8, BDNZm, BDNZp, BDZ, BDZ8, BDZm, BDZp, TAILBCTR, TAILBCTR8, gBC, gBCat, + BC, BCTR, BCTR8, BCTRL, BCTRL8, BCTRL8_LDinto_toc, BCTRL8_LDinto_toc_RM, BCTRL8_RM, BCTRL_LWZinto_toc, BCTRL_LWZinto_toc_RM, BCTRL_RM, BCn, BDNZ, BDNZ8, BDNZm, BDNZp, BDZ, BDZ8, BDZm, BDZp, TAILBCTR, TAILBCTR8, gBC, gBCat, BCL, BCLalways, BCLn, BDNZL, BDNZLm, BDNZLp, BDZL, BDZLm, BDZLp, gBCL, gBCLat, - BLA, BLA8, BLA8_NOP + BLA, BLA8, BLA8_NOP, BLA8_NOP_RM, BLA8_RM, BLA_RM )>; // 2 Cycles Branch operations, 3 input operands diff --git a/llvm/lib/Target/PowerPC/P9InstrResources.td b/llvm/lib/Target/PowerPC/P9InstrResources.td --- a/llvm/lib/Target/PowerPC/P9InstrResources.td +++ b/llvm/lib/Target/PowerPC/P9InstrResources.td @@ -1302,15 +1302,15 @@ (instregex "BCCTR(L)?(8)?(n)?$"), (instregex "BD(N)?Z(8|A|Am|Ap|m|p)?$"), (instregex "BD(N)?ZL(A|Am|Ap|R|R8|RL|RLm|RLp|Rm|Rp|m|p)?$"), - (instregex "BL(_TLS|_NOP)?$"), - (instregex "BL8(_TLS|_NOP|_NOP_TLS|_TLS_)?$"), - (instregex "BLA(8|8_NOP)?$"), + (instregex "BL(_TLS|_NOP)?(_RM)?$"), + (instregex "BL8(_TLS|_NOP|_NOP_TLS|_TLS_)?(_RM)?$"), + (instregex "BLA(8|8_NOP)?(_RM)?$"), (instregex "BLR(8|L)?$"), (instregex "TAILB(A)?(8)?$"), (instregex "TAILBCTR(8)?$"), (instregex "gBC(A|Aat|CTR|CTRL|L|LA|LAat|LR|LRL|Lat|at)?$"), (instregex "BCLR(L)?(n)?$"), - (instregex "BCTR(L)?(8)?$"), + (instregex "BCTR(L)?(8)?(_RM)?$"), B, BA, BC, @@ -1321,6 +1321,8 @@ BCLn, BCTRL8_LDinto_toc, BCTRL_LWZinto_toc, + BCTRL8_LDinto_toc_RM, + BCTRL_LWZinto_toc_RM, BCn, CTRL_DEP )>; diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h --- a/llvm/lib/Target/PowerPC/PPCISelLowering.h +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h @@ -200,6 +200,14 @@ /// and 64-bit AIX. BCTRL_LOAD_TOC, + /// The variants that implicitly define rounding mode for calls with + /// strictfp semantics. 
+    CALL_RM,
+    CALL_NOP_RM,
+    CALL_NOTOC_RM,
+    BCTRL_RM,
+    BCTRL_LOAD_TOC_RM,
+
     /// Return with a flag operand, matched by 'blr'
     RET_FLAG,
 
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -1630,9 +1630,19 @@
   case PPCISD::CALL:            return "PPCISD::CALL";
   case PPCISD::CALL_NOP:        return "PPCISD::CALL_NOP";
   case PPCISD::CALL_NOTOC:      return "PPCISD::CALL_NOTOC";
+  case PPCISD::CALL_RM:
+    return "PPCISD::CALL_RM";
+  case PPCISD::CALL_NOP_RM:
+    return "PPCISD::CALL_NOP_RM";
+  case PPCISD::CALL_NOTOC_RM:
+    return "PPCISD::CALL_NOTOC_RM";
   case PPCISD::MTCTR:           return "PPCISD::MTCTR";
   case PPCISD::BCTRL:           return "PPCISD::BCTRL";
   case PPCISD::BCTRL_LOAD_TOC:  return "PPCISD::BCTRL_LOAD_TOC";
+  case PPCISD::BCTRL_RM:
+    return "PPCISD::BCTRL_RM";
+  case PPCISD::BCTRL_LOAD_TOC_RM:
+    return "PPCISD::BCTRL_LOAD_TOC_RM";
   case PPCISD::RET_FLAG:        return "PPCISD::RET_FLAG";
   case PPCISD::READ_TIME_BASE:  return "PPCISD::READ_TIME_BASE";
   case PPCISD::EH_SJLJ_SETJMP:  return "PPCISD::EH_SJLJ_SETJMP";
@@ -5172,13 +5182,14 @@
 }
 
 static unsigned getCallOpcode(PPCTargetLowering::CallFlags CFlags,
-                              const Function &Caller,
-                              const SDValue &Callee,
+                              const Function &Caller, const SDValue &Callee,
                               const PPCSubtarget &Subtarget,
-                              const TargetMachine &TM) {
+                              const TargetMachine &TM,
+                              bool IsStrictFPCall = false) {
   if (CFlags.IsTailCall)
     return PPCISD::TC_RETURN;
 
+  unsigned RetOpc = 0;
   // This is a call through a function pointer.
   if (CFlags.IsIndirect) {
     // AIX and the 64-bit ELF ABIs need to maintain the TOC pointer accross
@@ -5189,28 +5200,46 @@
     // immediately followed by a load of the TOC pointer from the the stack save
     // slot into gpr2. For 64-bit ELFv2 ABI with PCRel, do not restore the TOC
     // as it is not saved or used.
-    return isTOCSaveRestoreRequired(Subtarget) ? PPCISD::BCTRL_LOAD_TOC
-                                               : PPCISD::BCTRL;
-  }
-
-  if (Subtarget.isUsingPCRelativeCalls()) {
+    RetOpc = isTOCSaveRestoreRequired(Subtarget) ? PPCISD::BCTRL_LOAD_TOC
+                                                 : PPCISD::BCTRL;
+  } else if (Subtarget.isUsingPCRelativeCalls()) {
     assert(Subtarget.is64BitELFABI() && "PC Relative is only on ELF ABI.");
-    return PPCISD::CALL_NOTOC;
+    RetOpc = PPCISD::CALL_NOTOC;
+  } else if (Subtarget.isAIXABI() || Subtarget.is64BitELFABI())
+    // The ABIs that maintain a TOC pointer across calls need to have a nop
+    // immediately following the call instruction if the caller and callee may
+    // have different TOC bases. At link time if the linker determines the calls
+    // may not share a TOC base, the call is redirected to a trampoline inserted
+    // by the linker. The trampoline will (among other things) save the caller's
+    // TOC pointer at an ABI designated offset in the linkage area and the
+    // linker will rewrite the nop to be a load of the TOC pointer from the
+    // linkage area into gpr2.
+    RetOpc = callsShareTOCBase(&Caller, Callee, TM) ? PPCISD::CALL
+                                                    : PPCISD::CALL_NOP;
+  else
+    RetOpc = PPCISD::CALL;
+  if (IsStrictFPCall) {
+    switch (RetOpc) {
+    default:
+      llvm_unreachable("Unknown call opcode");
+    case PPCISD::BCTRL_LOAD_TOC:
+      RetOpc = PPCISD::BCTRL_LOAD_TOC_RM;
+      break;
+    case PPCISD::BCTRL:
+      RetOpc = PPCISD::BCTRL_RM;
+      break;
+    case PPCISD::CALL_NOTOC:
+      RetOpc = PPCISD::CALL_NOTOC_RM;
+      break;
+    case PPCISD::CALL:
+      RetOpc = PPCISD::CALL_RM;
+      break;
+    case PPCISD::CALL_NOP:
+      RetOpc = PPCISD::CALL_NOP_RM;
+      break;
+    }
   }
-
-  // The ABIs that maintain a TOC pointer accross calls need to have a nop
-  // immediately following the call instruction if the caller and callee may
-  // have different TOC bases. At link time if the linker determines the calls
-  // may not share a TOC base, the call is redirected to a trampoline inserted
-  // by the linker. The trampoline will (among other things) save the callers
-  // TOC pointer at an ABI designated offset in the linkage area and the linker
-  // will rewrite the nop to be a load of the TOC pointer from the linkage area
-  // into gpr2.
-  if (Subtarget.isAIXABI() || Subtarget.is64BitELFABI())
-    return callsShareTOCBase(&Caller, Callee, TM) ? PPCISD::CALL
-                                                  : PPCISD::CALL_NOP;
-
-  return PPCISD::CALL;
+  return RetOpc;
 }
 
 static SDValue transformCallee(const SDValue &Callee, SelectionDAG &DAG,
@@ -5506,7 +5535,7 @@
 
   unsigned CallOpc =
       getCallOpcode(CFlags, DAG.getMachineFunction().getFunction(), Callee,
-                    Subtarget, DAG.getTarget());
+                    Subtarget, DAG.getTarget(), CB ? CB->isStrictFP() : false);
 
   if (!CFlags.IsIndirect)
     Callee = transformCallee(Callee, DAG, dl, Subtarget);
diff --git a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td
--- a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td
+++ b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td
@@ -178,6 +178,39 @@
   }
 }
 
+let isCall = 1, PPC970_Unit = 7, Defs = [LR8, RM], hasSideEffects = 0,
+    isCodeGenOnly = 1, Uses = [RM] in {
+  // Convenient aliases for call instructions
+  def BL8_RM : IForm<18, 0, 1, (outs), (ins calltarget:$func),
+                     "bl $func", IIC_BrB, []>;  // See Pat patterns below.
+
+  def BLA8_RM : IForm<18, 1, 1, (outs), (ins abscalltarget:$func),
+                      "bla $func", IIC_BrB, [(PPCcall_rm (i64 imm:$func))]>;
+  def BL8_NOP_RM : IForm_and_DForm_4_zero<18, 0, 1, 24,
+                                          (outs), (ins calltarget:$func),
+                                          "bl $func\n\tnop", IIC_BrB, []>;
+
+  def BLA8_NOP_RM : IForm_and_DForm_4_zero<18, 1, 1, 24,
+                                           (outs), (ins abscalltarget:$func),
+                                           "bla $func\n\tnop", IIC_BrB,
+                                           [(PPCcall_nop_rm (i64 imm:$func))]>;
+  let Predicates = [PCRelativeMemops] in {
+    // BL8_NOTOC means that the caller does not use the TOC pointer and if
+    // it does use R2 then it is just a caller saved register. Therefore it is
+    // safe to emit only the bl and not the nop for this instruction. The
+    // linker will not try to restore R2 after the call.
+ def BL8_NOTOC_RM : IForm<18, 0, 1, (outs), + (ins calltarget:$func), + "bl $func", IIC_BrB, []>; + } + let Uses = [CTR8, RM] in { + let isPredicable = 1 in + def BCTRL8_RM : XLForm_2_ext<19, 528, 20, 0, 1, (outs), (ins), + "bctrl", IIC_BrB, [(PPCbctrl_rm)]>, + Requires<[In64BitMode]>; + } +} + let isCall = 1, PPC970_Unit = 7, isCodeGenOnly = 1, Defs = [LR8, X2], Uses = [CTR8, RM], RST = 2 in { def BCTRL8_LDinto_toc : @@ -188,6 +221,16 @@ Requires<[In64BitMode]>; } +let isCall = 1, PPC970_Unit = 7, isCodeGenOnly = 1, + Defs = [LR8, X2, RM], Uses = [CTR8, RM], RST = 2 in { + def BCTRL8_LDinto_toc_RM : + XLForm_2_ext_and_DSForm_1<19, 528, 20, 0, 1, 58, 0, (outs), + (ins memrix:$src), + "bctrl\n\tld 2, $src", IIC_BrB, + [(PPCbctrl_load_toc_rm iaddrX4:$src)]>, + Requires<[In64BitMode]>; +} + } // Interpretation64Bit // FIXME: Duplicating this for the asm parser should be unnecessary, but the @@ -214,12 +257,32 @@ def : Pat<(PPCcall_notoc (i64 texternalsym:$dst)), (BL8_NOTOC texternalsym:$dst)>; +def : Pat<(PPCcall_rm (i64 tglobaladdr:$dst)), + (BL8_RM tglobaladdr:$dst)>; +def : Pat<(PPCcall_nop_rm (i64 tglobaladdr:$dst)), + (BL8_NOP_RM tglobaladdr:$dst)>; + +def : Pat<(PPCcall_rm (i64 texternalsym:$dst)), + (BL8_RM texternalsym:$dst)>; +def : Pat<(PPCcall_nop_rm (i64 texternalsym:$dst)), + (BL8_NOP_RM texternalsym:$dst)>; + +def : Pat<(PPCcall_notoc_rm (i64 tglobaladdr:$dst)), + (BL8_NOTOC_RM tglobaladdr:$dst)>; +def : Pat<(PPCcall_notoc_rm (i64 texternalsym:$dst)), + (BL8_NOTOC_RM texternalsym:$dst)>; + // Calls for AIX def : Pat<(PPCcall (i64 mcsym:$dst)), (BL8 mcsym:$dst)>; def : Pat<(PPCcall_nop (i64 mcsym:$dst)), (BL8_NOP mcsym:$dst)>; +def : Pat<(PPCcall_rm (i64 mcsym:$dst)), + (BL8_RM mcsym:$dst)>; +def : Pat<(PPCcall_nop_rm (i64 mcsym:$dst)), + (BL8_NOP_RM mcsym:$dst)>; + // Atomic operations // FIXME: some of these might be used with constant operands. This will result // in constant materialization instructions that may be redundant. We currently diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -2246,11 +2246,13 @@ return true; } else if (OpC == PPC::BCTR || OpC == PPC::BCTR8 || OpC == PPC::BCTRL || - OpC == PPC::BCTRL8) { + OpC == PPC::BCTRL8 || OpC == PPC::BCTRL_RM || + OpC == PPC::BCTRL8_RM) { if (Pred[1].getReg() == PPC::CTR8 || Pred[1].getReg() == PPC::CTR) llvm_unreachable("Cannot predicate bctr[l] on the ctr register"); - bool setLR = OpC == PPC::BCTRL || OpC == PPC::BCTRL8; + bool setLR = OpC == PPC::BCTRL || OpC == PPC::BCTRL8 || + OpC == PPC::BCTRL_RM || OpC == PPC::BCTRL8_RM; bool isPPC64 = Subtarget.isPPC64(); if (Pred[0].getImm() == PPC::PRED_BIT_SET) { @@ -2274,6 +2276,9 @@ MachineInstrBuilder(*MI.getParent()->getParent(), MI) .addReg(isPPC64 ? PPC::LR8 : PPC::LR, RegState::Implicit) .addReg(isPPC64 ? PPC::LR8 : PPC::LR, RegState::ImplicitDefine); + if (OpC == PPC::BCTRL_RM || OpC == PPC::BCTRL8_RM) + MachineInstrBuilder(*MI.getParent()->getParent(), MI) + .addReg(PPC::RM, RegState::ImplicitDefine); return true; } diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td @@ -316,6 +316,24 @@ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, SDNPVariadic]>; +// Call nodes for strictfp calls (that define RM). 
+def PPCcall_rm : SDNode<"PPCISD::CALL_RM", SDT_PPCCall, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, + SDNPVariadic]>; +def PPCcall_nop_rm : SDNode<"PPCISD::CALL_NOP_RM", SDT_PPCCall, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, + SDNPVariadic]>; +def PPCcall_notoc_rm : SDNode<"PPCISD::CALL_NOTOC_RM", SDT_PPCCall, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, + SDNPVariadic]>; +def PPCbctrl_rm : SDNode<"PPCISD::BCTRL_RM", SDTNone, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, + SDNPVariadic]>; +def PPCbctrl_load_toc_rm : SDNode<"PPCISD::BCTRL_LOAD_TOC_RM", + SDTypeProfile<0, 1, []>, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, + SDNPVariadic]>; + def retflag : SDNode<"PPCISD::RET_FLAG", SDTNone, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; @@ -1892,6 +1910,26 @@ } } +let isCall = 1, PPC970_Unit = 7, Defs = [LR, RM], isCodeGenOnly = 1 in { + // Convenient aliases for call instructions + let Uses = [RM] in { + def BL_RM : IForm<18, 0, 1, (outs), (ins calltarget:$func), + "bl $func", IIC_BrB, []>; // See Pat patterns below. + def BLA_RM : IForm<18, 1, 1, (outs), (ins abscalltarget:$func), + "bla $func", IIC_BrB, [(PPCcall_rm (i32 imm:$func))]>; + + def BL_NOP_RM : IForm_and_DForm_4_zero<18, 0, 1, 24, + (outs), (ins calltarget:$func), + "bl $func\n\tnop", IIC_BrB, []>; + } + let Uses = [CTR, RM] in { + let isPredicable = 1 in + def BCTRL_RM : XLForm_2_ext<19, 528, 20, 0, 1, (outs), (ins), + "bctrl", IIC_BrB, [(PPCbctrl_rm)]>, + Requires<[In32BitMode]>; + } +} + let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [RM] in def TCRETURNdi :PPCEmitTimePseudo< (outs), (ins calltarget:$dst, i32imm:$offset), @@ -1918,6 +1956,14 @@ } +let isCall = 1, PPC970_Unit = 7, isCodeGenOnly = 1, + Defs = [LR, R2, RM], Uses = [CTR, RM], RST = 2 in { + def BCTRL_LWZinto_toc_RM: + XLForm_2_ext_and_DForm_1<19, 528, 20, 0, 1, 32, (outs), + (ins memri:$src), "bctrl\n\tlwz 2, $src", IIC_BrB, + [(PPCbctrl_load_toc_rm iaddr:$src)]>, Requires<[In32BitMode]>; + +} let isCodeGenOnly = 1, hasSideEffects = 0 in { @@ -3435,6 +3481,12 @@ def : Pat<(PPCcall (i32 texternalsym:$dst)), (BL texternalsym:$dst)>; +def : Pat<(PPCcall_rm (i32 tglobaladdr:$dst)), + (BL_RM tglobaladdr:$dst)>; + +def : Pat<(PPCcall_rm (i32 texternalsym:$dst)), + (BL_RM texternalsym:$dst)>; + // Calls for AIX only def : Pat<(PPCcall (i32 mcsym:$dst)), (BL mcsym:$dst)>; @@ -3445,6 +3497,15 @@ def : Pat<(PPCcall_nop (i32 texternalsym:$dst)), (BL_NOP texternalsym:$dst)>; +def : Pat<(PPCcall_rm (i32 mcsym:$dst)), + (BL_RM mcsym:$dst)>; + +def : Pat<(PPCcall_nop_rm (i32 mcsym:$dst)), + (BL_NOP_RM mcsym:$dst)>; + +def : Pat<(PPCcall_nop_rm (i32 texternalsym:$dst)), + (BL_NOP_RM texternalsym:$dst)>; + def : Pat<(PPCtc_return (i32 tglobaladdr:$dst), imm:$imm), (TCRETURNdi tglobaladdr:$dst, imm:$imm)>; diff --git a/llvm/lib/Target/PowerPC/PPCInstrVSX.td b/llvm/lib/Target/PowerPC/PPCInstrVSX.td --- a/llvm/lib/Target/PowerPC/PPCInstrVSX.td +++ b/llvm/lib/Target/PowerPC/PPCInstrVSX.td @@ -906,16 +906,13 @@ // Rounding Instructions respecting current rounding mode def XSRDPIC : XX2Form<60, 107, (outs vsfrc:$XT), (ins vsfrc:$XB), - "xsrdpic $XT, $XB", IIC_VecFP, - [(set f64:$XT, (fnearbyint f64:$XB))]>; + "xsrdpic $XT, $XB", IIC_VecFP, []>; def XVRDPIC : XX2Form<60, 235, (outs vsrc:$XT), (ins vsrc:$XB), - "xvrdpic $XT, $XB", IIC_VecFP, - [(set v2f64:$XT, (fnearbyint v2f64:$XB))]>; + "xvrdpic $XT, $XB", IIC_VecFP, []>; def XVRSPIC : XX2Form<60, 171, (outs vsrc:$XT), (ins vsrc:$XB), - "xvrspic $XT, $XB", IIC_VecFP, - [(set v4f32:$XT, (fnearbyint 
v4f32:$XB))]>; + "xvrspic $XT, $XB", IIC_VecFP, []>; // Max/Min Instructions let isCommutable = 1 in { def XSMAXDP : XX3Form<60, 160, @@ -2783,9 +2780,6 @@ def : Pat<(f32 (any_fround f32:$S)), (f32 (COPY_TO_REGCLASS (XSRDPI (COPY_TO_REGCLASS $S, VSFRC)), VSSRC))>; -def : Pat<(f32 (fnearbyint f32:$S)), - (f32 (COPY_TO_REGCLASS (XSRDPIC - (COPY_TO_REGCLASS $S, VSFRC)), VSSRC))>; def : Pat<(f32 (any_ffloor f32:$S)), (f32 (COPY_TO_REGCLASS (XSRDPIM (COPY_TO_REGCLASS $S, VSFRC)), VSSRC))>; @@ -2804,6 +2798,19 @@ def : Pat<(f64 (any_frint f64:$S)), (f64 (XSRDPIC $S))>; def : Pat<(v2f64 (any_frint v2f64:$S)), (v2f64 (XVRDPIC $S))>; +// Rounding without exceptions (nearbyint). Due to strange tblgen behaviour, +// these need to be defined after the any_frint versions so ISEL will correctly +// add the chain to the strict versions. +def : Pat<(f32 (fnearbyint f32:$S)), + (f32 (COPY_TO_REGCLASS (XSRDPIC + (COPY_TO_REGCLASS $S, VSFRC)), VSSRC))>; +def : Pat<(f64 (fnearbyint f64:$S)), + (f64 (XSRDPIC $S))>; +def : Pat<(v2f64 (fnearbyint v2f64:$S)), + (v2f64 (XVRDPIC $S))>; +def : Pat<(v4f32 (fnearbyint v4f32:$S)), + (v4f32 (XVRSPIC $S))>; + // Materialize a zero-vector of long long def : Pat<(v2i64 immAllZerosV), (v2i64 (XXLXORz))>; diff --git a/llvm/test/CodeGen/PowerPC/cse-despite-rounding-mode.ll b/llvm/test/CodeGen/PowerPC/cse-despite-rounding-mode.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/cse-despite-rounding-mode.ll @@ -0,0 +1,127 @@ +; The non-strictfp version of test/CodeGen/PowerPC/respect-rounding-mode.ll +; Without strictfp, CSE should be free to eliminate the repeated multiply +; and conversion instructions. +; RUN: llc -verify-machineinstrs --mtriple powerpc64le-unknown-linux-gnu \ +; RUN: -mcpu=pwr8 -ppc-asm-full-reg-names < %s | grep 'xvrdpic' | count 2 +; RUN: llc -verify-machineinstrs --mtriple powerpc-unknown-linux-gnu \ +; RUN: -mcpu=pwr9 -ppc-asm-full-reg-names < %s | grep 'xvrdpic' | count 2 +; RUN: llc -verify-machineinstrs --mtriple powerpc64le-unknown-linux-gnu \ +; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names < %s | grep 'xvrdpic' | count 2 + +; RUN: llc -verify-machineinstrs --mtriple powerpc64le-unknown-linux-gnu \ +; RUN: -mcpu=pwr8 -ppc-asm-full-reg-names < %s | grep 'xvmuldp' | count 2 +; RUN: llc -verify-machineinstrs --mtriple powerpc-unknown-linux-gnu \ +; RUN: -mcpu=pwr9 -ppc-asm-full-reg-names < %s | grep 'xvmuldp' | count 2 +; RUN: llc -verify-machineinstrs --mtriple powerpc64le-unknown-linux-gnu \ +; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names < %s | grep 'xvmuldp' | count 2 +@IndirectCallPtr = dso_local local_unnamed_addr global void (...)* null, align 8 + +define dso_local signext i32 @func1() local_unnamed_addr #0 { +entry: + tail call void bitcast (void (...)* @directCall to void ()*)() #0 + %0 = tail call <2 x double> @llvm.experimental.constrained.rint.v2f64(<2 x double> , metadata !"round.dynamic", metadata !"fpexcept.ignore") #0 + %vecext = extractelement <2 x double> %0, i32 0 + %sub = tail call double @llvm.experimental.constrained.fsub.f64(double %vecext, double -9.900000e+01, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0 + %conv = tail call i32 @llvm.experimental.constrained.fptosi.i32.f64(double %sub, metadata !"fpexcept.ignore") #0 + tail call void bitcast (void (...)* @directCall to void ()*)() #0 + %1 = tail call <2 x double> @llvm.experimental.constrained.rint.v2f64(<2 x double> , metadata !"round.dynamic", metadata !"fpexcept.ignore") #0 + %vecext3 = extractelement <2 x double> %1, i32 1 + %cmp = tail call i1 
@llvm.experimental.constrained.fcmp.f64(double %vecext3, double 9.900000e+01, metadata !"une", metadata !"fpexcept.ignore") #0 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + tail call void @exit(i32 signext 2) #0 + unreachable + +if.end: ; preds = %entry + ret i32 %conv +} + +declare void @directCall(...) local_unnamed_addr + +declare double @llvm.experimental.constrained.fsub.f64(double, double, metadata, metadata) + +declare i32 @llvm.experimental.constrained.fptosi.i32.f64(double, metadata) + +declare i1 @llvm.experimental.constrained.fcmp.f64(double, double, metadata, metadata) + +declare void @exit(i32 signext) local_unnamed_addr + +define dso_local signext i32 @func2() local_unnamed_addr #0 { +entry: + %call = tail call <2 x double> bitcast (<2 x double> (...)* @getvector1 to <2 x double> ()*)() #0 + %call1 = tail call <2 x double> bitcast (<2 x double> (...)* @getvector2 to <2 x double> ()*)() #0 + tail call void bitcast (void (...)* @directCall to void ()*)() #0 + %mul = tail call <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double> %call, <2 x double> %call1, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0 + %vecext = extractelement <2 x double> %mul, i32 0 + %cmp = tail call i1 @llvm.experimental.constrained.fcmp.f64(double %vecext, double 4.000000e+00, metadata !"oeq", metadata !"fpexcept.ignore") #0 + br i1 %cmp, label %cleanup, label %if.end + +if.end: ; preds = %entry + tail call void bitcast (void (...)* @directCall to void ()*)() #0 + %mul10 = tail call <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double> %call, <2 x double> %call1, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0 + %0 = tail call i32 @llvm.ppc.vsx.xvcmpeqdp.p(i32 2, <2 x double> %mul, <2 x double> %mul10) #0 + br label %cleanup + +cleanup: ; preds = %entry, %if.end + %retval.0 = phi i32 [ %0, %if.end ], [ 11, %entry ] + ret i32 %retval.0 +} + +declare <2 x double> @getvector1(...) local_unnamed_addr + +declare <2 x double> @getvector2(...) 
local_unnamed_addr + +declare <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double>, <2 x double>, metadata, metadata) + +declare i32 @llvm.ppc.vsx.xvcmpeqdp.p(i32, <2 x double>, <2 x double>) + +define dso_local signext i32 @func3() local_unnamed_addr #0 { +entry: + %0 = load void ()*, void ()** bitcast (void (...)** @IndirectCallPtr to void ()**), align 8 + tail call void %0() #0 + %1 = tail call <2 x double> @llvm.experimental.constrained.rint.v2f64(<2 x double> , metadata !"round.dynamic", metadata !"fpexcept.ignore") #0 + %vecext = extractelement <2 x double> %1, i32 0 + %sub = tail call double @llvm.experimental.constrained.fsub.f64(double %vecext, double -9.900000e+01, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0 + %conv = tail call i32 @llvm.experimental.constrained.fptosi.i32.f64(double %sub, metadata !"fpexcept.ignore") #0 + %2 = load void ()*, void ()** bitcast (void (...)** @IndirectCallPtr to void ()**), align 8 + tail call void %2() #0 + %3 = tail call <2 x double> @llvm.experimental.constrained.rint.v2f64(<2 x double> , metadata !"round.dynamic", metadata !"fpexcept.ignore") #0 + %vecext4 = extractelement <2 x double> %3, i32 1 + %cmp = tail call i1 @llvm.experimental.constrained.fcmp.f64(double %vecext4, double 9.900000e+01, metadata !"une", metadata !"fpexcept.ignore") #0 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + tail call void @exit(i32 signext 2) #0 + unreachable + +if.end: ; preds = %entry + ret i32 %conv +} + +define dso_local signext i32 @func4() local_unnamed_addr #0 { +entry: + %call = tail call <2 x double> bitcast (<2 x double> (...)* @getvector1 to <2 x double> ()*)() #0 + %call1 = tail call <2 x double> bitcast (<2 x double> (...)* @getvector2 to <2 x double> ()*)() #0 + %0 = load void ()*, void ()** bitcast (void (...)** @IndirectCallPtr to void ()**), align 8 + tail call void %0() #0 + %mul = tail call <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double> %call, <2 x double> %call1, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0 + %vecext = extractelement <2 x double> %mul, i32 0 + %cmp = tail call i1 @llvm.experimental.constrained.fcmp.f64(double %vecext, double 4.000000e+00, metadata !"oeq", metadata !"fpexcept.ignore") #0 + br i1 %cmp, label %cleanup, label %if.end + +if.end: ; preds = %entry + %1 = load void ()*, void ()** bitcast (void (...)** @IndirectCallPtr to void ()**), align 8 + tail call void %1() #0 + %mul11 = tail call <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double> %call, <2 x double> %call1, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0 + %2 = tail call i32 @llvm.ppc.vsx.xvcmpeqdp.p(i32 2, <2 x double> %mul, <2 x double> %mul11) #0 + br label %cleanup + +cleanup: ; preds = %entry, %if.end + %retval.0 = phi i32 [ %2, %if.end ], [ 11, %entry ] + ret i32 %retval.0 +} + +declare <2 x double> @llvm.experimental.constrained.rint.v2f64(<2 x double>, metadata, metadata) + +attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/PowerPC/respect-rounding-mode.ll b/llvm/test/CodeGen/PowerPC/respect-rounding-mode.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/respect-rounding-mode.ll @@ -0,0 +1,128 @@ +; The strictfp version of test/CodeGen/PowerPC/cse-despit-rounding-mode.ll +; With strictfp, the MachineIR optimizations need to assume that a call +; can change the rounding mode and must not move/eliminate the repeated +; multiply/convert instructions in this test. 
+; RUN: llc -verify-machineinstrs --mtriple powerpc64le-unknown-linux-gnu \ +; RUN: -mcpu=pwr8 -ppc-asm-full-reg-names < %s | grep 'xvrdpic' | count 4 +; RUN: llc -verify-machineinstrs --mtriple powerpc-unknown-linux-gnu \ +; RUN: -mcpu=pwr9 -ppc-asm-full-reg-names < %s | grep 'xvrdpic' | count 4 +; RUN: llc -verify-machineinstrs --mtriple powerpc64le-unknown-linux-gnu \ +; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names < %s | grep 'xvrdpic' | count 4 + +; RUN: llc -verify-machineinstrs --mtriple powerpc64le-unknown-linux-gnu \ +; RUN: -mcpu=pwr8 -ppc-asm-full-reg-names < %s | grep 'xvmuldp' | count 4 +; RUN: llc -verify-machineinstrs --mtriple powerpc-unknown-linux-gnu \ +; RUN: -mcpu=pwr9 -ppc-asm-full-reg-names < %s | grep 'xvmuldp' | count 4 +; RUN: llc -verify-machineinstrs --mtriple powerpc64le-unknown-linux-gnu \ +; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names < %s | grep 'xvmuldp' | count 4 +@IndirectCallPtr = dso_local local_unnamed_addr global void (...)* null, align 8 + +define dso_local signext i32 @func1() local_unnamed_addr #0 { +entry: + tail call void bitcast (void (...)* @directCall to void ()*)() #0 + %0 = tail call <2 x double> @llvm.experimental.constrained.rint.v2f64(<2 x double> , metadata !"round.dynamic", metadata !"fpexcept.ignore") #0 + %vecext = extractelement <2 x double> %0, i32 0 + %sub = tail call double @llvm.experimental.constrained.fsub.f64(double %vecext, double -9.900000e+01, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0 + %conv = tail call i32 @llvm.experimental.constrained.fptosi.i32.f64(double %sub, metadata !"fpexcept.ignore") #0 + tail call void bitcast (void (...)* @directCall to void ()*)() #0 + %1 = tail call <2 x double> @llvm.experimental.constrained.rint.v2f64(<2 x double> , metadata !"round.dynamic", metadata !"fpexcept.ignore") #0 + %vecext3 = extractelement <2 x double> %1, i32 1 + %cmp = tail call i1 @llvm.experimental.constrained.fcmp.f64(double %vecext3, double 9.900000e+01, metadata !"une", metadata !"fpexcept.ignore") #0 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + tail call void @exit(i32 signext 2) #0 + unreachable + +if.end: ; preds = %entry + ret i32 %conv +} + +declare void @directCall(...) 
local_unnamed_addr + +declare double @llvm.experimental.constrained.fsub.f64(double, double, metadata, metadata) + +declare i32 @llvm.experimental.constrained.fptosi.i32.f64(double, metadata) + +declare i1 @llvm.experimental.constrained.fcmp.f64(double, double, metadata, metadata) + +declare void @exit(i32 signext) local_unnamed_addr + +define dso_local signext i32 @func2() local_unnamed_addr #0 { +entry: + %call = tail call <2 x double> bitcast (<2 x double> (...)* @getvector1 to <2 x double> ()*)() #0 + %call1 = tail call <2 x double> bitcast (<2 x double> (...)* @getvector2 to <2 x double> ()*)() #0 + tail call void bitcast (void (...)* @directCall to void ()*)() #0 + %mul = tail call <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double> %call, <2 x double> %call1, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0 + %vecext = extractelement <2 x double> %mul, i32 0 + %cmp = tail call i1 @llvm.experimental.constrained.fcmp.f64(double %vecext, double 4.000000e+00, metadata !"oeq", metadata !"fpexcept.ignore") #0 + br i1 %cmp, label %cleanup, label %if.end + +if.end: ; preds = %entry + tail call void bitcast (void (...)* @directCall to void ()*)() #0 + %mul10 = tail call <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double> %call, <2 x double> %call1, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0 + %0 = tail call i32 @llvm.ppc.vsx.xvcmpeqdp.p(i32 2, <2 x double> %mul, <2 x double> %mul10) #0 + br label %cleanup + +cleanup: ; preds = %entry, %if.end + %retval.0 = phi i32 [ %0, %if.end ], [ 11, %entry ] + ret i32 %retval.0 +} + +declare <2 x double> @getvector1(...) local_unnamed_addr + +declare <2 x double> @getvector2(...) local_unnamed_addr + +declare <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double>, <2 x double>, metadata, metadata) + +declare i32 @llvm.ppc.vsx.xvcmpeqdp.p(i32, <2 x double>, <2 x double>) + +define dso_local signext i32 @func3() local_unnamed_addr #0 { +entry: + %0 = load void ()*, void ()** bitcast (void (...)** @IndirectCallPtr to void ()**), align 8 + tail call void %0() #0 + %1 = tail call <2 x double> @llvm.experimental.constrained.rint.v2f64(<2 x double> , metadata !"round.dynamic", metadata !"fpexcept.ignore") #0 + %vecext = extractelement <2 x double> %1, i32 0 + %sub = tail call double @llvm.experimental.constrained.fsub.f64(double %vecext, double -9.900000e+01, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0 + %conv = tail call i32 @llvm.experimental.constrained.fptosi.i32.f64(double %sub, metadata !"fpexcept.ignore") #0 + %2 = load void ()*, void ()** bitcast (void (...)** @IndirectCallPtr to void ()**), align 8 + tail call void %2() #0 + %3 = tail call <2 x double> @llvm.experimental.constrained.rint.v2f64(<2 x double> , metadata !"round.dynamic", metadata !"fpexcept.ignore") #0 + %vecext4 = extractelement <2 x double> %3, i32 1 + %cmp = tail call i1 @llvm.experimental.constrained.fcmp.f64(double %vecext4, double 9.900000e+01, metadata !"une", metadata !"fpexcept.ignore") #0 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + tail call void @exit(i32 signext 2) #0 + unreachable + +if.end: ; preds = %entry + ret i32 %conv +} + +define dso_local signext i32 @func4() local_unnamed_addr #0 { +entry: + %call = tail call <2 x double> bitcast (<2 x double> (...)* @getvector1 to <2 x double> ()*)() #0 + %call1 = tail call <2 x double> bitcast (<2 x double> (...)* @getvector2 to <2 x double> ()*)() #0 + %0 = load void ()*, void ()** bitcast (void (...)** 
@IndirectCallPtr to void ()**), align 8 + tail call void %0() #0 + %mul = tail call <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double> %call, <2 x double> %call1, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0 + %vecext = extractelement <2 x double> %mul, i32 0 + %cmp = tail call i1 @llvm.experimental.constrained.fcmp.f64(double %vecext, double 4.000000e+00, metadata !"oeq", metadata !"fpexcept.ignore") #0 + br i1 %cmp, label %cleanup, label %if.end + +if.end: ; preds = %entry + %1 = load void ()*, void ()** bitcast (void (...)** @IndirectCallPtr to void ()**), align 8 + tail call void %1() #0 + %mul11 = tail call <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double> %call, <2 x double> %call1, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0 + %2 = tail call i32 @llvm.ppc.vsx.xvcmpeqdp.p(i32 2, <2 x double> %mul, <2 x double> %mul11) #0 + br label %cleanup + +cleanup: ; preds = %entry, %if.end + %retval.0 = phi i32 [ %2, %if.end ], [ 11, %entry ] + ret i32 %retval.0 +} + +declare <2 x double> @llvm.experimental.constrained.rint.v2f64(<2 x double>, metadata, metadata) + +attributes #0 = { nounwind strictfp } diff --git a/llvm/test/CodeGen/PowerPC/vector-constrained-fp-intrinsics.ll b/llvm/test/CodeGen/PowerPC/vector-constrained-fp-intrinsics.ll --- a/llvm/test/CodeGen/PowerPC/vector-constrained-fp-intrinsics.ll +++ b/llvm/test/CodeGen/PowerPC/vector-constrained-fp-intrinsics.ll @@ -4631,14 +4631,14 @@ define <4 x double> @constrained_vector_rint_v4f64(<4 x double> %x) #0 { ; PC64LE-LABEL: constrained_vector_rint_v4f64: ; PC64LE: # %bb.0: # %entry -; PC64LE-NEXT: xvrdpic 34, 34 ; PC64LE-NEXT: xvrdpic 35, 35 +; PC64LE-NEXT: xvrdpic 34, 34 ; PC64LE-NEXT: blr ; ; PC64LE9-LABEL: constrained_vector_rint_v4f64: ; PC64LE9: # %bb.0: # %entry -; PC64LE9-NEXT: xvrdpic 34, 34 ; PC64LE9-NEXT: xvrdpic 35, 35 +; PC64LE9-NEXT: xvrdpic 34, 34 ; PC64LE9-NEXT: blr entry: %rint = call <4 x double> @llvm.experimental.constrained.rint.v4f64(
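
For reference, the situation the new RM-defining call opcodes protect is a strictfp function in which an opaque call separates two identical constrained rounding operations, as in the patch's test files above. The sketch below is illustrative only and is not part of the patch; the function and callee names are hypothetical. With the patch applied, the direct call should select one of the RM-defining call instructions (for example BL8_RM or BL8_NOP_RM, or BCTRL8_RM for an indirect call), so MachineIR passes treat the call as clobbering the rounding mode and keep both xvrdpic operations.

; Illustrative sketch only (hypothetical names, not taken from the patch).
; Both constrained rint calls must survive because @opaque may change the
; rounding mode between them.
declare void @opaque()
declare <2 x double> @llvm.experimental.constrained.rint.v2f64(<2 x double>, metadata, metadata)

define <2 x double> @keep_both_rints(<2 x double> %v) #0 {
entry:
  %a = tail call <2 x double> @llvm.experimental.constrained.rint.v2f64(<2 x double> %v, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0
  tail call void @opaque() #0
  %b = tail call <2 x double> @llvm.experimental.constrained.rint.v2f64(<2 x double> %v, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0
  ; combine one lane of each result so neither rint is dead
  %r = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %r
}

attributes #0 = { nounwind strictfp }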