diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h --- a/llvm/include/llvm/CodeGen/SelectionDAG.h +++ b/llvm/include/llvm/CodeGen/SelectionDAG.h @@ -237,6 +237,12 @@ ProfileSummaryInfo *PSI = nullptr; BlockFrequencyInfo *BFI = nullptr; + /// List of non-single value types. + FoldingSet VTListMap; + + /// Pool allocation for misc. objects that are created once per SelectionDAG. + BumpPtrAllocator Allocator; + /// The starting token. SDNode EntryNode; @@ -263,9 +269,6 @@ BumpPtrAllocator OperandAllocator; ArrayRecycler OperandRecycler; - /// Pool allocation for misc. objects that are created once per SelectionDAG. - BumpPtrAllocator Allocator; - /// Tracks dbg_value and dbg_label information through SDISel. SDDbgInfo *DbgInfo; @@ -2281,9 +2284,6 @@ SDNode *FindNodeOrInsertPos(const FoldingSetNodeID &ID, const SDLoc &DL, void *&InsertPos); - /// List of non-single value types. - FoldingSet VTListMap; - /// Maps to auto-CSE operations. std::vector CondCodeNodes; diff --git a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp --- a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp @@ -1162,7 +1162,6 @@ #endif llvm_unreachable("This target-independent node should have been selected!"); case ISD::EntryToken: - llvm_unreachable("EntryToken should have been excluded from the schedule!"); case ISD::MERGE_VALUES: case ISD::TokenFactor: // fall thru break; diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -1275,7 +1275,7 @@ // EntryNode could meaningfully have debug info if we can find it... 
SelectionDAG::SelectionDAG(const TargetMachine &tm, CodeGenOpt::Level OL) : TM(tm), OptLevel(OL), - EntryNode(ISD::EntryToken, 0, DebugLoc(), getVTList(MVT::Other)), + EntryNode(ISD::EntryToken, 0, DebugLoc(), getVTList(MVT::Other, MVT::Glue)), Root(getEntryNode()) { InsertNode(&EntryNode); DbgInfo = new SDDbgInfo(); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -6037,6 +6037,13 @@ (void)Res; } + SMEAttrs Attrs(MF.getFunction()); + bool IsLocallyStreaming = + !Attrs.hasStreamingInterface() && Attrs.hasStreamingBody(); + assert(Chain.getOpcode() == ISD::EntryToken && "Unexpected Chain value"); + SDValue Glue = Chain.getValue(1); + + SmallVector ArgValues; unsigned ExtraArgLocs = 0; for (unsigned i = 0, e = Ins.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i - ExtraArgLocs]; @@ -6091,7 +6098,22 @@ // Transform the arguments in physical registers into virtual ones. Register Reg = MF.addLiveIn(VA.getLocReg(), RC); - ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT); + + if (IsLocallyStreaming) { + // LocallyStreamingFunctions must insert the SMSTART in the correct + // position, so we use Glue to ensure no instructions can be scheduled + // between the chain of: + // t0: ch,glue = EntryNode + // t1: res,ch,glue = CopyFromReg + // ... + // tn: res,ch,glue = CopyFromReg t(n-1), .. + // t(n+1): ch, glue = SMSTART t0:0, ...., tn:2 + // ^^^^^^ + // This will be the new Chain/Root node. + ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT, Glue); + Glue = ArgValue.getValue(2); + } else + ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT); // If this is an 8, 16 or 32-bit value, it is really passed promoted // to 64 bits. 
Insert an assert[sz]ext to capture this, then @@ -6245,6 +6267,27 @@ } assert((ArgLocs.size() + ExtraArgLocs) == Ins.size()); + // Insert the SMSTART if this is a locally streaming function and + // make sure it is Glued to the last CopyFromReg value. + if (IsLocallyStreaming) { + const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); + Chain = DAG.getNode( + AArch64ISD::SMSTART, DL, DAG.getVTList(MVT::Other, MVT::Glue), + {DAG.getRoot(), + DAG.getTargetConstant((int32_t)AArch64SVCR::SVCRSM, DL, MVT::i32), + DAG.getConstant(0, DL, MVT::i64), DAG.getConstant(1, DL, MVT::i64), + DAG.getRegisterMask(TRI->getSMStartStopCallPreservedMask()), Glue}); + // Ensure that the SMSTART happens after the CopyWithChain such that its + // chain result is used. + for (unsigned I=0; IisTargetDarwin() || IsWin64) { @@ -7485,6 +7528,19 @@ } } + const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); + + // Emit SMSTOP before returning from a locally streaming function + SMEAttrs FuncAttrs(MF.getFunction()); + if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface()) { + Chain = DAG.getNode( + AArch64ISD::SMSTOP, DL, DAG.getVTList(MVT::Other, MVT::Glue), Chain, + DAG.getTargetConstant((int32_t)AArch64SVCR::SVCRSM, DL, MVT::i32), + DAG.getConstant(1, DL, MVT::i64), DAG.getConstant(0, DL, MVT::i64), + DAG.getRegisterMask(TRI->getSMStartStopCallPreservedMask())); + Flag = Chain.getValue(1); + } + SmallVector RetOps(1, Chain); for (auto &RetVal : RetVals) { Chain = DAG.getCopyToReg(Chain, DL, RetVal.first, RetVal.second, Flag); @@ -7509,7 +7565,6 @@ DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout()))); } - const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); const MCPhysReg *I = TRI->getCalleeSavedRegsViaCopy(&MF); if (I) { for (; *I; ++I) { diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ 
b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -4256,6 +4256,8 @@ break; case AArch64::ADDVL_XXI: case AArch64::ADDPL_XXI: + case AArch64::ADDSVL_XXI: + case AArch64::ADDSPL_XXI: MaxEncoding = 31; ShiftSize = 0; if (Offset < 0) { @@ -4270,9 +4272,9 @@ // `Offset` can be in bytes or in "scalable bytes". int VScale = 1; - if (Opc == AArch64::ADDVL_XXI) + if (Opc == AArch64::ADDVL_XXI || Opc == AArch64::ADDSVL_XXI) VScale = 16; - else if (Opc == AArch64::ADDPL_XXI) + else if (Opc == AArch64::ADDPL_XXI || Opc == AArch64::ADDSPL_XXI) VScale = 2; // FIXME: If the offset won't fit in 24-bits, compute the offset into a @@ -4369,6 +4371,14 @@ bool NeedsWinCFI, bool *HasWinCFI, bool EmitCFAOffset, StackOffset CFAOffset, unsigned FrameReg) { + // If a function is marked as arm_locally_streaming, then the runtime value of + // vscale in the prologue/epilogue differs from the runtime value of vscale + // in the function's body. To avoid having to consider multiple vscales, + // we can use `addsvl` to allocate any scalable stack-slots, which under + // most circumstances will be only locals, not callee-save slots. + const Function &F = MBB.getParent()->getFunction(); + bool UseSVL = F.hasFnAttribute("aarch64_pstate_sm_body"); + int64_t Bytes, NumPredicateVectors, NumDataVectors; AArch64InstrInfo::decomposeStackOffsetForFrameOffsets( Offset, Bytes, NumPredicateVectors, NumDataVectors); @@ -4399,8 +4409,9 @@ if (NumDataVectors) { emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumDataVectors, - AArch64::ADDVL_XXI, TII, Flag, NeedsWinCFI, nullptr, - EmitCFAOffset, CFAOffset, FrameReg); + UseSVL ? 
AArch64::ADDSVL_XXI : AArch64::ADDVL_XXI, + TII, Flag, NeedsWinCFI, nullptr, EmitCFAOffset, + CFAOffset, FrameReg); CFAOffset += StackOffset::getScalable(-NumDataVectors * 16); SrcReg = DestReg; } @@ -4408,8 +4419,9 @@ if (NumPredicateVectors) { assert(DestReg != AArch64::SP && "Unaligned access to SP"); emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumPredicateVectors, - AArch64::ADDPL_XXI, TII, Flag, NeedsWinCFI, nullptr, - EmitCFAOffset, CFAOffset, FrameReg); + UseSVL ? AArch64::ADDSPL_XXI : AArch64::ADDPL_XXI, + TII, Flag, NeedsWinCFI, nullptr, EmitCFAOffset, + CFAOffset, FrameReg); } } diff --git a/llvm/test/CodeGen/AArch64/sme-get-pstatesm.ll b/llvm/test/CodeGen/AArch64/sme-get-pstatesm.ll --- a/llvm/test/CodeGen/AArch64/sme-get-pstatesm.ll +++ b/llvm/test/CodeGen/AArch64/sme-get-pstatesm.ll @@ -22,7 +22,17 @@ define i64 @get_pstatesm_locally_streaming() nounwind "aarch64_pstate_sm_body" { ; CHECK-LABEL: get_pstatesm_locally_streaming: ; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-64]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: mov w0, #1 +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #64 // 16-byte Folded Reload ; CHECK-NEXT: ret %pstate = call i64 @llvm.aarch64.sme.get.pstatesm() ret i64 %pstate diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-body.ll b/llvm/test/CodeGen/AArch64/sme-streaming-body.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme-streaming-body.ll @@ -0,0 +1,265 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -start-after=simplifycfg -enable-tail-merge=false -verify-machineinstrs < %s | FileCheck %s + +declare void @normal_callee(); +declare void @streaming_callee() "aarch64_pstate_sm_enabled"; +declare void @streaming_compatible_callee() "aarch64_pstate_sm_compatible"; + +define void @locally_streaming_caller_streaming_callee() "aarch64_pstate_sm_body" nounwind { +; CHECK-LABEL: locally_streaming_caller_streaming_callee: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-80]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: bl streaming_compatible_callee +; CHECK-NEXT: bl streaming_compatible_callee +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload +; CHECK-NEXT: ret + + call void @streaming_compatible_callee(); + call void @streaming_compatible_callee(); + ret void; +} + +; Test that with a streaming body and streaming interface, no smstart/smstop are emitted, +; because the function already is in streaming mode upon entry. +define void @streaming_and_locally_streaming_caller_streaming_callee() "aarch64_pstate_sm_enabled" "aarch64_pstate_sm_body" nounwind { +; CHECK-LABEL: streaming_and_locally_streaming_caller_streaming_callee: +; CHECK: // %bb.0: +; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: bl streaming_callee +; CHECK-NEXT: bl streaming_callee +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + call void @streaming_callee(); + call void @streaming_callee(); + ret void; +} + +define void @locally_streaming_multiple_exit(i64 %cond) "aarch64_pstate_sm_body" nounwind { +; CHECK-LABEL: locally_streaming_multiple_exit: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: stp d15, d14, [sp, #-64]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: cmp x0, #1 +; CHECK-NEXT: b.ne .LBB2_2 +; CHECK-NEXT: // %bb.1: // %if.else +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #64 // 16-byte Folded Reload +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB2_2: // %if.end +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #64 // 16-byte Folded Reload +; CHECK-NEXT: ret + +entry: + %tobool = icmp eq i64 %cond, 1 + br i1 %tobool, label %if.else, label %if.end + +if.else: + ret void; + +if.end: + ret void; +} + +; Do a fixed-width vector add on a NEON vector. +; This tests that: +; * Incoming vector in v0.d isn't clobbered by the change in streaming mode. +; * Result vector is correctly preserved after smstop. 
+define <2 x i64> @locally_streaming_caller_no_callee(<2 x i64> %a) "aarch64_pstate_sm_body" nounwind { +; CHECK-LABEL: locally_streaming_caller_no_callee: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #80 +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: adrp x8, .LCPI3_0 +; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI3_0] +; CHECK-NEXT: add v0.2d, v1.2d, v0.2d +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #80 +; CHECK-NEXT: ret + + %add = add <2 x i64> %a, ; + ret <2 x i64> %add; +} + +; Test that we use the interface (not the function's body) to determine what +; streaming-mode to enter the callee. In this case the interface is normal, so +; pstate.sm must be 0 on entry and is 0 upon return from the callee. +define void @locally_streaming_caller_locally_streaming_callee() "aarch64_pstate_sm_body" nounwind { +; CHECK-LABEL: locally_streaming_caller_locally_streaming_callee: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-80]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl locally_streaming_caller_streaming_callee +; CHECK-NEXT: smstart sm +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload +; CHECK-NEXT: ret + + call void @locally_streaming_caller_streaming_callee(); + ret void; +} + +; +; Test that a locally streaming function correctly retains the +; argument/result registers, because smstart/smstop instructions that are +; inserted to implement the arm_locally_streaming attribute thrash the +; vector register contents. 
+; + +define <2 x i64> @locally_streaming_caller_compatible_callee_vec_args_ret(<2 x i64> %a) "aarch64_pstate_sm_body" nounwind { +; CHECK-LABEL: locally_streaming_caller_compatible_callee_vec_args_ret: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: str x30, [sp, #80] // 8-byte Folded Spill +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: bl streaming_compatible_callee_vec_args_ret +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: ret + %res = call <2 x i64> @streaming_compatible_callee_vec_args_ret(<2 x i64> %a) "aarch64_pstate_sm_compatible" + ret <2 x i64> %res; +} + +declare <2 x i64> @streaming_compatible_callee_vec_args_ret(<2 x i64>) "aarch64_pstate_sm_compatible" + +define {<2 x i64>, <2 x i64>} @locally_streaming_caller_compatible_callee_struct_arg_ret({<2 x i64>, <2 x i64>} %arg) "aarch64_pstate_sm_body" nounwind { +; CHECK-LABEL: locally_streaming_caller_compatible_callee_struct_arg_ret: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #112 +; CHECK-NEXT: stp d15, d14, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #80] // 
16-byte Folded Spill +; CHECK-NEXT: str x30, [sp, #96] // 8-byte Folded Spill +; CHECK-NEXT: str q1, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: bl streaming_compatible_callee_vec_arg_struct_ret +; CHECK-NEXT: stp q1, q0, [sp] // 32-byte Folded Spill +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldp q1, q0, [sp] // 32-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #96] // 8-byte Folded Reload +; CHECK-NEXT: add sp, sp, #112 +; CHECK-NEXT: ret + %v1.arg = extractvalue {<2 x i64>, <2 x i64>} %arg, 1 + %res = call {<2 x i64>, <2 x i64>} @streaming_compatible_callee_vec_arg_struct_ret(<2 x i64> %v1.arg) "aarch64_pstate_sm_compatible" + ret {<2 x i64>, <2 x i64>} %res; +} + +declare {<2 x i64>, <2 x i64>} @streaming_compatible_callee_vec_arg_struct_ret(<2 x i64>) "aarch64_pstate_sm_compatible" + +; Test that we use `addsvl` for allocating any stack space for locals before `smstart`, +; such that the correct amount of stack space is allocated. +define void @locally_streaming_caller_alloca() nounwind "aarch64_pstate_sm_body" { +; CHECK-LABEL: locally_streaming_caller_alloca: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-80]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: addsvl sp, sp, #-1 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: mov x0, sp +; CHECK-NEXT: bl use_ptr +; CHECK-NEXT: smstop sm +; CHECK-NEXT: addsvl sp, sp, #1 +; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload +; CHECK-NEXT: ret + %alloca = alloca + call void @use_ptr(ptr %alloca) "aarch64_pstate_sm_compatible" + ret void +} + +declare void @use_ptr(ptr) "aarch64_pstate_sm_compatible" + +define double @call_to_intrinsic_without_chain(double %x) nounwind "aarch64_pstate_sm_body" { +; CHECK-LABEL: call_to_intrinsic_without_chain: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: stp d15, d14, [sp, #-80]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill +; CHECK-NEXT: str d0, [sp, #72] // 8-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr d0, [sp, #72] // 8-byte Folded Reload +; CHECK-NEXT: bl cos +; CHECK-NEXT: str d0, [sp, #72] // 8-byte Folded Spill +; CHECK-NEXT: smstart sm +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr d0, [sp, #72] // 8-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload +; CHECK-NEXT: ret +entry: + %0 = call fast double @llvm.cos.f64(double %x) + ret double %0 +} + +declare double @llvm.cos.f64(double) diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-frame-offests.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-frame-offests.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-frame-offests.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-frame-offests.ll @@ -10,7 +10,7 @@ define void @foo(<8 x i64>* %a) #0 { ; CHECK-LABEL: foo: ; CHECK: SelectionDAG has 14 nodes: -; CHECK-NEXT: t0: ch = EntryToken +; CHECK-NEXT: t0: ch,glue = EntryToken ; CHECK-NEXT: t12: nxv2i1 = PTRUE_D TargetConstant:i32<31> ; CHECK-NEXT: t2: i64,ch = CopyFromReg t0, Register:i64 %0 ; CHECK-NEXT: t18: nxv2i64,ch = LD1D_IMM t12, t2, TargetConstant:i64<0>, t0 diff --git a/llvm/test/CodeGen/AMDGPU/sdag-print-divergence.ll b/llvm/test/CodeGen/AMDGPU/sdag-print-divergence.ll --- a/llvm/test/CodeGen/AMDGPU/sdag-print-divergence.ll +++ b/llvm/test/CodeGen/AMDGPU/sdag-print-divergence.ll @@ -7,7 +7,7 @@ ; GCN: Initial selection DAG: %bb.0 
'test_sdag_dump:entry' ; GCN: SelectionDAG has 10 nodes: -; GCN-DEFAULT: t0: ch = EntryToken +; GCN-DEFAULT: t0: ch,glue = EntryToken ; GCN-DEFAULT: t2: f32,ch = CopyFromReg t0, Register:f32 %0 ; GCN-DEFAULT: t5: f32 = fadd t2, t2 ; GCN-DEFAULT: t4: f32,ch = CopyFromReg # D:1 t0, Register:f32 %1 @@ -15,7 +15,7 @@ ; GCN-DEFAULT: t8: ch,glue = CopyToReg # D:1 t0, Register:f32 $vgpr0, t6 ; GCN-DEFAULT: t9: ch = RETURN_TO_EPILOG # D:1 t8, Register:f32 $vgpr0, t8:1 -; GCN-VERBOSE: t0: ch = EntryToken # D:0 +; GCN-VERBOSE: t0: ch,glue = EntryToken # D:0 ; GCN-VERBOSE: t2: f32,ch = CopyFromReg [ORD=1] # D:0 t0, Register:f32 %0 # D:0 ; GCN-VERBOSE: t5: f32 = fadd [ORD=2] # D:0 t2, t2 ; GCN-VERBOSE: t4: f32,ch = CopyFromReg [ORD=1] # D:1 t0, Register:f32 %1 # D:0 diff --git a/llvm/test/CodeGen/X86/callbr-asm-bb-exports.ll b/llvm/test/CodeGen/X86/callbr-asm-bb-exports.ll --- a/llvm/test/CodeGen/X86/callbr-asm-bb-exports.ll +++ b/llvm/test/CodeGen/X86/callbr-asm-bb-exports.ll @@ -5,7 +5,7 @@ ; inlineasm_br. Not sure how to get a MachineIR change so this reads the debug ; output from SelectionDAG. 
-; CHECK: t0: ch = EntryToken +; CHECK: t0: ch,glue = EntryToken ; CHECK-NEXT: t4: i32,ch = CopyFromReg t0, Register:i32 %3 ; CHECK-NEXT: t10: i32 = add t4, Constant:i32<1> ; CHECK-NEXT: t12: ch = CopyToReg t0, Register:i32 %0, t10 diff --git a/llvm/test/CodeGen/X86/merge-store-partially-alias-loads.ll b/llvm/test/CodeGen/X86/merge-store-partially-alias-loads.ll --- a/llvm/test/CodeGen/X86/merge-store-partially-alias-loads.ll +++ b/llvm/test/CodeGen/X86/merge-store-partially-alias-loads.ll @@ -14,7 +14,7 @@ ; X86-NEXT: retq ; DBGDAG-LABEL: Optimized legalized selection DAG: %bb.0 'merge_store_partial_overlap_load:' -; DBGDAG: [[ENTRYTOKEN:t[0-9]+]]: ch = EntryToken +; DBGDAG: [[ENTRYTOKEN:t[0-9]+]]: ch,glue = EntryToken ; DBGDAG-DAG: [[BASEPTR:t[0-9]+]]: i64,ch = CopyFromReg [[ENTRYTOKEN]], ; DBGDAG-DAG: [[ADDPTR:t[0-9]+]]: i64 = add {{(nuw )?}}[[BASEPTR]], Constant:i64<2> diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll.expected --- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll.expected +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_isel.ll.expected @@ -4,7 +4,7 @@ define i64 @i64_test(i64 %i) nounwind readnone { ; CHECK-LABEL: i64_test: ; CHECK: SelectionDAG has 9 nodes: -; CHECK-NEXT: t0: ch = EntryToken +; CHECK-NEXT: t0: ch,glue = EntryToken ; CHECK-NEXT: t11: ch,glue = CopyToReg t0, Register:i32 $vgpr0, IMPLICIT_DEF:i32 ; CHECK-NEXT: t17: i32 = V_MOV_B32_e32 TargetConstant:i32<0> ; CHECK-NEXT: t13: ch,glue = CopyToReg t11, Register:i32 $vgpr1, t17, t11:1 @@ -20,7 +20,7 @@ ; CHECK-LABEL: i32_test: ; CHECK: SelectionDAG has 8 nodes: ; CHECK-NEXT: t5: i32 = V_MOV_B32_e32 TargetConstant:i32<0> -; CHECK-NEXT: t0: ch = EntryToken +; CHECK-NEXT: t0: ch,glue = EntryToken ; CHECK-NEXT: t7: ch,glue = CopyToReg t0, Register:i32 $vgpr0, t5 ; CHECK-NEXT: t9: ch,glue = CopyToReg t7, 
Register:i32 $vgpr1, t5, t7:1 ; CHECK-NEXT: t10: ch = SI_RETURN Register:i32 $vgpr0, Register:i32 $vgpr1, t9, t9:1 @@ -36,7 +36,7 @@ ; CHECK-LABEL: i16_test: ; CHECK: SelectionDAG has 8 nodes: ; CHECK-NEXT: t5: i32 = V_MOV_B32_e32 TargetConstant:i32<0> -; CHECK-NEXT: t0: ch = EntryToken +; CHECK-NEXT: t0: ch,glue = EntryToken ; CHECK-NEXT: t7: ch,glue = CopyToReg t0, Register:i32 $vgpr0, t5 ; CHECK-NEXT: t9: ch,glue = CopyToReg t7, Register:i32 $vgpr1, t5, t7:1 ; CHECK-NEXT: t10: ch = SI_RETURN Register:i32 $vgpr0, Register:i32 $vgpr1, t9, t9:1 @@ -52,7 +52,7 @@ ; CHECK-LABEL: i8_test: ; CHECK: SelectionDAG has 8 nodes: ; CHECK-NEXT: t5: i32 = V_MOV_B32_e32 TargetConstant:i32<0> -; CHECK-NEXT: t0: ch = EntryToken +; CHECK-NEXT: t0: ch,glue = EntryToken ; CHECK-NEXT: t7: ch,glue = CopyToReg t0, Register:i32 $vgpr0, t5 ; CHECK-NEXT: t9: ch,glue = CopyToReg t7, Register:i32 $vgpr1, t5, t7:1 ; CHECK-NEXT: t10: ch = SI_RETURN Register:i32 $vgpr0, Register:i32 $vgpr1, t9, t9:1 diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/lanai_isel.ll.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/lanai_isel.ll.expected --- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/lanai_isel.ll.expected +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/lanai_isel.ll.expected @@ -4,7 +4,7 @@ define i64 @i64_test(i64 %i) nounwind readnone { ; CHECK-LABEL: i64_test: ; CHECK: SelectionDAG has 22 nodes: -; CHECK-NEXT: t0: ch = EntryToken +; CHECK-NEXT: t0: ch,glue = EntryToken ; CHECK-NEXT: t5: i32,ch = LDW_RI TargetFrameIndex:i32<-2>, TargetConstant:i32<0>, TargetConstant:i32<0>, t0 ; CHECK-NEXT: t7: i32 = ADD_I_LO TargetFrameIndex:i32<0>, TargetConstant:i32<0> ; CHECK-NEXT: t29: i32 = OR_I_LO t7, TargetConstant:i32<4> @@ -29,7 +29,7 @@ define i64 @i32_test(i32 %i) nounwind readnone { ; CHECK-LABEL: i32_test: ; CHECK: SelectionDAG has 14 nodes: -; CHECK-NEXT: t0: ch = EntryToken +; CHECK-NEXT: t0: ch,glue = 
EntryToken ; CHECK-NEXT: t21: i32,ch = CopyFromReg t0, Register:i32 $r0 ; CHECK-NEXT: t13: ch,glue = CopyToReg t0, Register:i32 $rv, t21 ; CHECK-NEXT: t3: i32,ch = LDW_RI TargetFrameIndex:i32<-1>, TargetConstant:i32<0>, TargetConstant:i32<0>, t0 @@ -48,7 +48,7 @@ define i64 @i16_test(i16 %i) nounwind readnone { ; CHECK-LABEL: i16_test: ; CHECK: SelectionDAG has 19 nodes: -; CHECK-NEXT: t0: ch = EntryToken +; CHECK-NEXT: t0: ch,glue = EntryToken ; CHECK-NEXT: t33: i32,ch = CopyFromReg t0, Register:i32 $r0 ; CHECK-NEXT: t14: ch,glue = CopyToReg t0, Register:i32 $rv, t33 ; CHECK-NEXT: t1: i32 = ADD_I_LO TargetFrameIndex:i32<-1>, TargetConstant:i32<0> @@ -71,7 +71,7 @@ define i64 @i8_test(i8 %i) nounwind readnone { ; CHECK-LABEL: i8_test: ; CHECK: SelectionDAG has 20 nodes: -; CHECK-NEXT: t0: ch = EntryToken +; CHECK-NEXT: t0: ch,glue = EntryToken ; CHECK-NEXT: t33: i32,ch = CopyFromReg t0, Register:i32 $r0 ; CHECK-NEXT: t14: ch,glue = CopyToReg t0, Register:i32 $rv, t33 ; CHECK-NEXT: t1: i32 = ADD_I_LO TargetFrameIndex:i32<-1>, TargetConstant:i32<0> diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_isel.ll.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_isel.ll.expected --- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_isel.ll.expected +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_isel.ll.expected @@ -5,7 +5,7 @@ define i64 @i64_test(i64 %i) nounwind readnone { ; PIC-LABEL: i64_test: ; PIC: SelectionDAG has 12 nodes: -; PIC-NEXT: t0: ch = EntryToken +; PIC-NEXT: t0: ch,glue = EntryToken ; PIC-NEXT: t2: i64,ch = CopyFromReg t0, Register:i64 %0 ; PIC-NEXT: t7: i64,i32,ch = ADD64rm t2, TargetFrameIndex:i64<0>, TargetConstant:i8<1>, Register:i64 $noreg, TargetConstant:i32<0>, Register:i16 $noreg, t0 ; PIC-NEXT: t10: ch,glue = CopyToReg t0, Register:i64 $rax, t7 @@ -14,7 +14,7 @@ ; ; WIN-LABEL: i64_test: ; WIN: SelectionDAG has 12 nodes: -; WIN-NEXT: t0: ch = 
EntryToken +; WIN-NEXT: t0: ch,glue = EntryToken ; WIN-NEXT: t2: i64,ch = CopyFromReg t0, Register:i64 %0 ; WIN-NEXT: t7: i64,i32,ch = ADD64rm t2, TargetFrameIndex:i64<0>, TargetConstant:i8<1>, Register:i64 $noreg, TargetConstant:i32<0>, Register:i16 $noreg, t0 ; WIN-NEXT: t10: ch,glue = CopyToReg t0, Register:i64 $rax, t7 @@ -29,7 +29,7 @@ define i64 @i32_test(i32 %i) nounwind readnone { ; PIC-LABEL: i32_test: ; PIC: SelectionDAG has 15 nodes: -; PIC-NEXT: t0: ch = EntryToken +; PIC-NEXT: t0: ch,glue = EntryToken ; PIC-NEXT: t2: i32,ch = CopyFromReg t0, Register:i32 %0 ; PIC-NEXT: t7: i32,i32,ch = ADD32rm t2, TargetFrameIndex:i64<0>, TargetConstant:i8<1>, Register:i64 $noreg, TargetConstant:i32<0>, Register:i16 $noreg, t0 ; PIC-NEXT: t8: i64 = SUBREG_TO_REG TargetConstant:i64<0>, t7, TargetConstant:i32<6> @@ -39,7 +39,7 @@ ; ; WIN-LABEL: i32_test: ; WIN: SelectionDAG has 15 nodes: -; WIN-NEXT: t0: ch = EntryToken +; WIN-NEXT: t0: ch,glue = EntryToken ; WIN-NEXT: t2: i32,ch = CopyFromReg t0, Register:i32 %0 ; WIN-NEXT: t7: i32,i32,ch = ADD32rm t2, TargetFrameIndex:i64<0>, TargetConstant:i8<1>, Register:i64 $noreg, TargetConstant:i32<0>, Register:i16 $noreg, t0 ; WIN-NEXT: t8: i64 = SUBREG_TO_REG TargetConstant:i64<0>, t7, TargetConstant:i32<6> @@ -56,7 +56,7 @@ define i64 @i16_test(i16 %i) nounwind readnone { ; PIC-LABEL: i16_test: ; PIC: SelectionDAG has 18 nodes: -; PIC-NEXT: t0: ch = EntryToken +; PIC-NEXT: t0: ch,glue = EntryToken ; PIC-NEXT: t2: i32,ch = CopyFromReg t0, Register:i32 %0 ; PIC-NEXT: t3: i16 = EXTRACT_SUBREG t2, TargetConstant:i32<4> ; PIC-NEXT: t8: i16,i32,ch = ADD16rm t3, TargetFrameIndex:i64<0>, TargetConstant:i8<1>, Register:i64 $noreg, TargetConstant:i32<0>, Register:i16 $noreg, t0 @@ -68,7 +68,7 @@ ; ; WIN-LABEL: i16_test: ; WIN: SelectionDAG has 16 nodes: -; WIN-NEXT: t0: ch = EntryToken +; WIN-NEXT: t0: ch,glue = EntryToken ; WIN-NEXT: t2: i16,ch = CopyFromReg t0, Register:i16 %0 ; WIN-NEXT: t7: i16,i32,ch = ADD16rm t2, 
TargetFrameIndex:i64<0>, TargetConstant:i8<1>, Register:i64 $noreg, TargetConstant:i32<0>, Register:i16 $noreg, t0 ; WIN-NEXT: t14: i32 = MOVZX32rr16 t7 @@ -86,7 +86,7 @@ define i64 @i8_test(i8 %i) nounwind readnone { ; PIC-LABEL: i8_test: ; PIC: SelectionDAG has 18 nodes: -; PIC-NEXT: t0: ch = EntryToken +; PIC-NEXT: t0: ch,glue = EntryToken ; PIC-NEXT: t2: i32,ch = CopyFromReg t0, Register:i32 %0 ; PIC-NEXT: t3: i8 = EXTRACT_SUBREG t2, TargetConstant:i32<1> ; PIC-NEXT: t8: i8,i32,ch = ADD8rm t3, TargetFrameIndex:i64<0>, TargetConstant:i8<1>, Register:i64 $noreg, TargetConstant:i32<0>, Register:i16 $noreg, t0 @@ -98,7 +98,7 @@ ; ; WIN-LABEL: i8_test: ; WIN: SelectionDAG has 16 nodes: -; WIN-NEXT: t0: ch = EntryToken +; WIN-NEXT: t0: ch,glue = EntryToken ; WIN-NEXT: t2: i8,ch = CopyFromReg t0, Register:i8 %0 ; WIN-NEXT: t7: i8,i32,ch = ADD8rm t2, TargetFrameIndex:i64<0>, TargetConstant:i8<1>, Register:i64 $noreg, TargetConstant:i32<0>, Register:i16 $noreg, t0 ; WIN-NEXT: t14: i32 = MOVZX32rr8 t7