diff --git a/llvm/lib/Target/AArch64/AArch64CallingConvention.td b/llvm/lib/Target/AArch64/AArch64CallingConvention.td --- a/llvm/lib/Target/AArch64/AArch64CallingConvention.td +++ b/llvm/lib/Target/AArch64/AArch64CallingConvention.td @@ -451,6 +451,10 @@ (sequence "X%u",19, 28), LR, FP)>; +// The SMSTART/SMSTOP instructions preserve only GPR registers. +def CSR_AArch64_SMStartStop : CalleeSavedRegs<(add (sequence "X%u", 0, 28), + LR, FP)>; + def CSR_AArch64_AAPCS_SwiftTail : CalleeSavedRegs<(sub CSR_AArch64_AAPCS, X20, X22)>; diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -58,6 +58,9 @@ CALL_BTI, // Function call followed by a BTI instruction. + SMSTART, + SMSTOP, + // Produces the full sequence of instructions for getting the thread pointer // offset of a variable into X0, using the TLSDesc model. TLSDESC_CALLSEQ, @@ -872,6 +875,14 @@ bool shouldExpandGetActiveLaneMask(EVT VT, EVT OpVT) const override; + /// If a change in streaming mode is required on entry to/return from a + /// function call it emits and returns the corresponding SMSTART or SMSTOP node. + /// \p Entry tells whether this is before/after the Call, which is necessary + /// because PSTATE.SM is only queried once. + SDValue changeStreamingMode(SelectionDAG &DAG, SDLoc DL, bool Enable, + SDValue Chain, SDValue InFlag, + SDValue PStateSM, bool Entry) const; + private: /// Keep a pointer to the AArch64Subtarget around so that we can /// make the right decision when generating code for different targets. 
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -2058,6 +2058,8 @@ switch ((AArch64ISD::NodeType)Opcode) { case AArch64ISD::FIRST_NUMBER: break; + MAKE_CASE(AArch64ISD::SMSTART) + MAKE_CASE(AArch64ISD::SMSTOP) MAKE_CASE(AArch64ISD::CALL) MAKE_CASE(AArch64ISD::ADRP) MAKE_CASE(AArch64ISD::ADR) @@ -4517,6 +4519,17 @@ Mask); } +static Optional<SMEAttrs> getCalleeAttrsFromExternalFunction(SDValue V) { + if (auto *ES = dyn_cast<ExternalSymbolSDNode>(V)) { + StringRef S(ES->getSymbol()); + if (S == "__arm_sme_state" || S == "__arm_tpidr2_save") + return SMEAttrs(SMEAttrs::SM_Compatible | SMEAttrs::ZA_Preserved); + if (S == "__arm_tpidr2_restore") + return SMEAttrs(SMEAttrs::SM_Compatible | SMEAttrs::ZA_Shared); + } + return None; +} + SDValue AArch64TargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const { unsigned IntNo = Op.getConstantOperandVal(1); @@ -6640,6 +6653,25 @@ return ZExtBool; } +SDValue AArch64TargetLowering::changeStreamingMode( + SelectionDAG &DAG, SDLoc DL, bool Enable, + SDValue Chain, SDValue InFlag, SDValue PStateSM, bool Entry) const { + const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); + SDValue RegMask = DAG.getRegisterMask(TRI->getSMStartStopCallPreservedMask()); + SDValue MSROp = + DAG.getTargetConstant((int32_t)AArch64SVCR::SVCRSM, DL, MVT::i32); + + SDValue ExpectedSMVal = + DAG.getTargetConstant(Entry ? Enable : !Enable, DL, MVT::i64); + SmallVector<SDValue> Ops = {Chain, MSROp, PStateSM, ExpectedSMVal, RegMask}; + + if (InFlag) + Ops.push_back(InFlag); + + unsigned Opcode = Enable ? AArch64ISD::SMSTART : AArch64ISD::SMSTOP; + return DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops); +} + /// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain, /// and add input and output parameter nodes. 
SDValue @@ -6760,6 +6792,19 @@ assert(FPDiff % 16 == 0 && "unaligned stack on tail call"); } + // Determine whether we need any streaming mode changes. + SMEAttrs CalleeAttrs, CallerAttrs(MF.getFunction()); + if (CLI.CB) + CalleeAttrs = SMEAttrs(*CLI.CB); + else if (Optional<SMEAttrs> Attrs = + getCalleeAttrsFromExternalFunction(CLI.Callee)) + CalleeAttrs = *Attrs; + + SDValue InFlag, PStateSM; + Optional<bool> RequiresSMChange = CallerAttrs.requiresSMChange(CalleeAttrs); + if (RequiresSMChange) + PStateSM = getPStateSM(DAG, Chain, CallerAttrs, DL, MVT::i64); + // Adjust the stack pointer for the new arguments... // These operations are automatically eliminated by the prolog/epilog pass if (!IsSibCall) @@ -7011,9 +7056,15 @@ if (!MemOpChains.empty()) Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains); + if (RequiresSMChange) { + SDValue NewChain = changeStreamingMode(DAG, DL, *RequiresSMChange, Chain, + InFlag, PStateSM, true); + Chain = NewChain.getValue(0); + InFlag = NewChain.getValue(1); + } + // Build a sequence of copy-to-reg nodes chained together with token chain // and flag operands which copy the outgoing args into the appropriate regs. - SDValue InFlag; for (auto &RegToPass : RegsToPass) { Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first, RegToPass.second, InFlag); @@ -7143,14 +7194,36 @@ DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0; Chain = DAG.getCALLSEQ_END(Chain, NumBytes, CalleePopBytes, InFlag, DL); - if (!Ins.empty()) - InFlag = Chain.getValue(1); + InFlag = Chain.getValue(1); // Handle result values, copying them out of physregs into vregs that we // return. - return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG, - InVals, IsThisReturn, - IsThisReturn ? OutVals[0] : SDValue()); + SDValue Result = + LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG, InVals, + IsThisReturn, IsThisReturn ? 
OutVals[0] : SDValue()); + + if (!Ins.empty()) + InFlag = Result.getValue(Result->getNumValues() - 1); + + if (RequiresSMChange) { + assert(PStateSM && "Expected a PStateSM to be set"); + Result = changeStreamingMode(DAG, DL, !*RequiresSMChange, Result, InFlag, + PStateSM, false); + for (unsigned I = 0; I < InVals.size(); ++I) { + // The smstart/smstop is chained as part of the call, but when the + // resulting chain is discarded (which happens when the call is not part + // of a chain, e.g. a call to @llvm.cos()), we need to ensure the + // smstart/smstop is chained to the result value. We can do that by doing + // a vreg -> vreg copy. + Register Reg = MF.getRegInfo().createVirtualRegister( + getRegClassFor(InVals[I].getValueType().getSimpleVT())); + SDValue X = DAG.getCopyToReg(Result, DL, Reg, InVals[I]); + InVals[I] = DAG.getCopyFromReg(X, DL, Reg, + InVals[I].getValueType()); + } + } + + return Result; } bool AArch64TargetLowering::CanLowerReturn( diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -1093,6 +1093,9 @@ case AArch64::ISB: // DSB and ISB also are scheduling barriers. return true; + case AArch64::MSRpstatesvcrImm1: + // SMSTART and SMSTOP are also scheduling barriers. + return true; default:; } if (isSEHInstruction(MI)) diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.h b/llvm/lib/Target/AArch64/AArch64RegisterInfo.h --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.h +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.h @@ -68,6 +68,8 @@ // normal calls, so they need a different mask to represent this. const uint32_t *getTLSCallPreservedMask() const; + const uint32_t *getSMStartStopCallPreservedMask() const; + // Funclets on ARM64 Windows don't preserve any registers. 
const uint32_t *getNoPreservedMask() const override; diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -321,6 +321,10 @@ *Mask = UpdatedMask; } +const uint32_t *AArch64RegisterInfo::getSMStartStopCallPreservedMask() const { + return CSR_AArch64_SMStartStop_RegMask; +} + const uint32_t *AArch64RegisterInfo::getNoPreservedMask() const { return CSR_AArch64_NoRegs_RegMask; } diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td @@ -1379,7 +1379,9 @@ let DiagnosticType = "Invalid" # Name; } -def svcr_op : Operand<i32> { +def svcr_op : Operand<i32>, TImmLeaf<i32, [{ + return AArch64SVCR::lookupSVCRByEncoding(Imm) != nullptr; +}]> { let ParserMatchClass = SVCROperand; let PrintMethod = "printSVCROp"; let DecoderMethod = "DecodeSVCROp"; diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td --- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td @@ -10,6 +10,15 @@ // //===----------------------------------------------------------------------===// +def AArch64_smstart : SDNode<"AArch64ISD::SMSTART", SDTypeProfile<0, 3, + [SDTCisInt<0>, SDTCisInt<0>, SDTCisInt<0>]>, + [SDNPHasChain, SDNPSideEffect, SDNPVariadic, + SDNPOptInGlue, SDNPOutGlue]>; +def AArch64_smstop : SDNode<"AArch64ISD::SMSTOP", SDTypeProfile<0, 3, + [SDTCisInt<0>, SDTCisInt<0>, SDTCisInt<0>]>, + [SDNPHasChain, SDNPSideEffect, SDNPVariadic, + SDNPOptInGlue, SDNPOutGlue]>; + //===----------------------------------------------------------------------===// // Add vector elements horizontally or vertically to ZA tile. 
//===----------------------------------------------------------------------===// @@ -117,8 +126,8 @@ // It's tricky to using the existing pstate operand defined in // AArch64SystemOperands.td since it only encodes 5 bits including op1;op2, // when these fields are also encoded in CRm[3:1]. -class MSRpstatesvcrImm0_1 - : PstateWriteSimple<(ins svcr_op:$pstatefield, imm0_1:$imm), "msr", +def MSRpstatesvcrImm1 + : PstateWriteSimple<(ins svcr_op:$pstatefield, timm0_1:$imm), "msr", "\t$pstatefield, $imm">, Sched<[WriteSys]> { bits<3> pstatefield; @@ -129,7 +138,6 @@ let Inst{7-5} = 0b011; // op2 } -def MSRpstatesvcrImm1 : MSRpstatesvcrImm0_1; def : InstAlias<"smstart", (MSRpstatesvcrImm1 0b011, 0b1)>; def : InstAlias<"smstart sm", (MSRpstatesvcrImm1 0b001, 0b1)>; def : InstAlias<"smstart za", (MSRpstatesvcrImm1 0b010, 0b1)>; @@ -138,6 +146,35 @@ def : InstAlias<"smstop sm", (MSRpstatesvcrImm1 0b001, 0b0)>; def : InstAlias<"smstop za", (MSRpstatesvcrImm1 0b010, 0b0)>; + +// Scenario A: +// +// %pstate.before.call = 1 +// if (%pstate.before.call != 0) +// smstop (pstate_za|pstate_sm) +// call fn() +// if (%pstate.before.call != 0) +// smstart (pstate_za|pstate_sm) +// +def : Pat<(AArch64_smstop (i32 svcr_op:$pstate), (i64 1), (i64 0)), // before call + (MSRpstatesvcrImm1 svcr_op:$pstate, 0b0)>; +def : Pat<(AArch64_smstart (i32 svcr_op:$pstate), (i64 1), (i64 0)), // after call + (MSRpstatesvcrImm1 svcr_op:$pstate, 0b1)>; + +// Scenario B: +// +// %pstate.before.call = 0 +// if (%pstate.before.call != 1) +// smstart (pstate_za|pstate_sm) +// call fn() +// if (%pstate.before.call != 1) +// smstop (pstate_za|pstate_sm) +// +def : Pat<(AArch64_smstart (i32 svcr_op:$pstate), (i64 0), (i64 1)), // before call + (MSRpstatesvcrImm1 svcr_op:$pstate, 0b1)>; +def : Pat<(AArch64_smstop (i32 svcr_op:$pstate), (i64 0), (i64 1)), // after call + (MSRpstatesvcrImm1 svcr_op:$pstate, 0b0)>; + // Read and write TPIDR2_EL0 def : Pat<(int_aarch64_sme_set_tpidr2 i64:$val), (MSR 0xde85, 
GPR64:$val)>; diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll @@ -0,0 +1,340 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -verify-machineinstrs < %s | FileCheck %s + +; This file tests the following combinations related to streaming-enabled functions: +; [ ] N -> S (Normal -> Streaming) +; [ ] S -> N (Streaming -> Normal) +; [ ] S -> S (Streaming -> Streaming) +; [ ] S -> SC (Streaming -> Streaming-compatible) +; +; The following combination is tested in sme-streaming-compatible-interface.ll +; [ ] SC -> S (Streaming-compatible -> Streaming) + +declare void @normal_callee() +declare void @streaming_callee() "aarch64_pstate_sm_enabled" +declare void @streaming_compatible_callee() "aarch64_pstate_sm_compatible" + +; [x] N -> S +; [ ] S -> N +; [ ] S -> S +; [ ] S -> SC +define void @normal_caller_streaming_callee() nounwind { +; CHECK-LABEL: normal_caller_streaming_callee: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-80]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: bl streaming_callee +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload +; CHECK-NEXT: ret + call void @streaming_callee() + ret void; +} + +; [ ] N -> S +; [x] S -> N +; [ ] S -> S +; [ ] S -> SC +define void @streaming_caller_normal_callee() nounwind "aarch64_pstate_sm_enabled" { +; CHECK-LABEL: streaming_caller_normal_callee: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl normal_callee +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload +; CHECK-NEXT: ret + call void @normal_callee() + ret void; +} + +; [ ] N -> S +; [ ] S -> N +; [x] S -> S +; [ ] S -> SC +define void @streaming_caller_streaming_callee() nounwind "aarch64_pstate_sm_enabled" { +; CHECK-LABEL: streaming_caller_streaming_callee: +; CHECK: // %bb.0: +; CHECK-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: bl streaming_callee +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + call void @streaming_callee() + ret void; +} + +; [ ] N -> S +; [ ] S -> N +; [ ] S -> S +; [x] S -> SC +define void @streaming_caller_streaming_compatible_callee() nounwind "aarch64_pstate_sm_enabled" { +; CHECK-LABEL: streaming_caller_streaming_compatible_callee: +; CHECK: // %bb.0: +; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: bl streaming_compatible_callee +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + call void @streaming_compatible_callee() + ret void; +} + +; +; Handle special cases here. +; + +; Call to function-pointer (with attribute) +define void @call_to_function_pointer_streaming_enabled(ptr %p) nounwind { +; CHECK-LABEL: call_to_function_pointer_streaming_enabled: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: blr x0 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload +; CHECK-NEXT: ret + call void %p() "aarch64_pstate_sm_enabled" + ret void +} + +; Ensure NEON registers are preserved correctly. 
+define <4 x i32> @smstart_clobber_simdfp(<4 x i32> %x) nounwind { +; CHECK-LABEL: smstart_clobber_simdfp: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: str x30, [sp, #80] // 8-byte Folded Spill +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: bl streaming_callee +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: ret + call void @streaming_callee() + ret <4 x i32> %x; +} + +; Ensure SVE registers are preserved correctly. +define <vscale x 4 x i32> @smstart_clobber_sve(<vscale x 4 x i32> %x) #0 { +; CHECK-LABEL: smstart_clobber_sve: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-18 +; CHECK-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: 
bl streaming_callee +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: addvl sp, 
sp, #18 +; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: ret + call void @streaming_callee() + ret <vscale x 4 x i32> %x; +} + +; Call streaming callee twice; there should be no spills/fills between the two +; calls since the registers should have already been clobbered. +define <vscale x 4 x i32> @smstart_clobber_sve_duplicate(<vscale x 4 x i32> %x) #0 { +; CHECK-LABEL: smstart_clobber_sve_duplicate: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-18 +; CHECK-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: 
str z12, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: bl streaming_callee +; CHECK-NEXT: smstop sm +; CHECK-NEXT: smstart sm +; CHECK-NEXT: bl streaming_callee +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z15, [sp, #10, 
mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #18 +; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: ret + call void @streaming_callee() + call void @streaming_callee() + ret <vscale x 4 x i32> %x; +} + +; Ensure smstart is not removed, because call to llvm.cos is not part of a chain. +define double @call_to_intrinsic_without_chain(double %x) nounwind "aarch64_pstate_sm_enabled" { +; CHECK-LABEL: call_to_intrinsic_without_chain: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: str x30, [sp, #80] // 8-byte Folded Spill +; CHECK-NEXT: str d0, [sp, #88] // 8-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr d0, [sp, #88] // 8-byte Folded Reload +; CHECK-NEXT: bl cos +; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr d0, [sp, #88] // 8-byte Folded Reload +; CHECK-NEXT: ldr d1, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload +; CHECK-NEXT: fadd d0, d1, d0 +; 
CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: ret +entry: + %res = call fast double @llvm.cos.f64(double %x) + %res.fadd = fadd fast double %res, %x + ret double %res.fadd +} + +declare double @llvm.cos.f64(double) + +attributes #0 = { nounwind "target-features"="+sve" }