Index: include/llvm/CodeGen/MachineScheduler.h =================================================================== --- include/llvm/CodeGen/MachineScheduler.h +++ include/llvm/CodeGen/MachineScheduler.h @@ -763,7 +763,7 @@ public: /// Represent the type of SchedCandidate found within a single queue. /// pickNodeBidirectional depends on these listed by decreasing priority. - enum CandReason { + enum CandReason : uint8_t { NoCand, Only1, PhysRegCopy, RegExcess, RegCritical, Stall, Cluster, Weak, RegMax, ResourceReduce, ResourceDemand, BotHeightReduce, BotPathReduce, TopDepthReduce, TopPathReduce, NextDefUse, NodeOrder}; @@ -811,6 +811,9 @@ // The reason for this candidate. CandReason Reason; + // Whether this candidate should be scheduled at top/bottom. + bool AtTop; + // Set of reasons that apply to multiple candidates. uint32_t RepeatReasonSet; @@ -821,7 +824,8 @@ SchedResourceDelta ResDelta; SchedCandidate(const CandPolicy &policy) - : Policy(policy), SU(nullptr), Reason(NoCand), RepeatReasonSet(0) {} + : Policy(policy), SU(nullptr), Reason(NoCand), AtTop(false), + RepeatReasonSet(0) {} bool isValid() const { return SU; } @@ -830,6 +834,7 @@ assert(Best.Reason != NoCand && "uninitialized Sched candidate"); SU = Best.SU; Reason = Best.Reason; + AtTop = Best.AtTop; RPDelta = Best.RPDelta; ResDelta = Best.ResDelta; } @@ -913,11 +918,12 @@ void tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand, - SchedBoundary &Zone); + SchedBoundary *Zone); SUnit *pickNodeBidirectional(bool &IsTopNode); void pickNodeFromQueue(SchedBoundary &Zone, + const CandPolicy &ZonePolicy, const RegPressureTracker &RPTracker, SchedCandidate &Candidate); Index: lib/CodeGen/MachineScheduler.cpp =================================================================== --- lib/CodeGen/MachineScheduler.cpp +++ lib/CodeGen/MachineScheduler.cpp @@ -2526,9 +2526,8 @@ << GenericSchedulerBase::getReasonStr(Reason) << '\n'); } -static void tracePick(const GenericSchedulerBase::SchedCandidate &Cand, - bool IsTop) { - tracePick(Cand.Reason, IsTop); +static void tracePick(const GenericSchedulerBase::SchedCandidate &Cand) { + tracePick(Cand.Reason, Cand.AtTop); } void GenericScheduler::initialize(ScheduleDAGMI *dag) { @@ -2681,17 +2680,24 @@ const MachineFunction &MF) { unsigned TryPSet = TryP.getPSetOrMax(); unsigned CandPSet = CandP.getPSetOrMax(); - // If both candidates affect the same set, go with the smallest increase. - if (TryPSet == CandPSet) { - return tryLess(TryP.getUnitInc(), CandP.getUnitInc(), TryCand, Cand, - Reason); - } + // If one candidate decreases and the other increases, go with it. // Invalid candidates have UnitInc==0. if (tryGreater(TryP.getUnitInc() < 0, CandP.getUnitInc() < 0, TryCand, Cand, Reason)) { return true; } + // Do not compare the magnitude of pressure changes between top and bottom + // boundary. + if (Cand.AtTop != TryCand.AtTop) + return false; + + // If both candidates affect the same set in the same boundary, go with the + // smallest increase. + if (TryPSet == CandPSet) { + return tryLess(TryP.getUnitInc(), CandP.getUnitInc(), TryCand, Cand, + Reason); + } int TryRank = TryP.isValid() ? TRI->getRegPressureSetScore(MF, TryPSet) : std::numeric_limits::max(); @@ -2742,6 +2748,7 @@ const RegPressureTracker &RPTracker, RegPressureTracker &TempTracker) { Cand.SU = SU; + Cand.AtTop = AtTop; if (DAG->isTrackingPressure()) { if (AtTop) { TempTracker.getMaxDownwardPressureDelta( @@ -2781,18 +2788,19 @@ /// /// \param Cand provides the policy and current best candidate. /// \param TryCand refers to the next SUnit candidate, otherwise uninitialized. -/// \param Zone describes the scheduled zone that we are extending. +/// \param Zone describes the scheduled zone that we are extending, or nullptr +// if Cand is from a different zone than TryCand. void GenericScheduler::tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand, - SchedBoundary &Zone) { + SchedBoundary *Zone) { // Initialize the candidate if needed. if (!Cand.isValid()) { TryCand.Reason = NodeOrder; return; } - if (tryGreater(biasPhysRegCopy(TryCand.SU, Zone.isTop()), - biasPhysRegCopy(Cand.SU, Zone.isTop()), + if (tryGreater(biasPhysRegCopy(TryCand.SU, TryCand.AtTop), + biasPhysRegCopy(Cand.SU, Cand.AtTop), TryCand, Cand, PhysRegCopy)) return; @@ -2810,17 +2818,21 @@ DAG->MF)) return; - // For loops that are acyclic path limited, aggressively schedule for latency. - // This can result in very long dependence chains scheduled in sequence, so - // once every cycle (when CurrMOps == 0), switch to normal heuristics. - if (Rem.IsAcyclicLatencyLimited && !Zone.getCurrMOps() - && tryLatency(TryCand, Cand, Zone)) - return; + // Some properties can only be compared for nodes in the same boundary. + if (Zone) { + // For loops that are acyclic path limited, aggressively schedule for + // latency. This can result in very long dependence chains scheduled in + // sequence, so once every cycle (when CurrMOps == 0), switch to normal + // heuristics. + if (Rem.IsAcyclicLatencyLimited && !Zone->getCurrMOps() && + tryLatency(TryCand, Cand, *Zone)) + return; - // Prioritize instructions that read unbuffered resources by stall cycles. - if (tryLess(Zone.getLatencyStallCycles(TryCand.SU), - Zone.getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall)) - return; + // Prioritize instructions that read unbuffered resources by stall cycles. + if (tryLess(Zone->getLatencyStallCycles(TryCand.SU), + Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall)) + return; + } // Keep clustered nodes together to encourage downstream peephole // optimizations which may reduce resource requirements. @@ -2828,18 +2840,21 @@ // This is a best effort to set things up for a post-RA pass. Optimizations // like generating loads of multiple registers should ideally be done within // the scheduler pass by combining the loads during DAG postprocessing. - const SUnit *NextClusterSU = - Zone.isTop() ? DAG->getNextClusterSucc() : DAG->getNextClusterPred(); - if (tryGreater(TryCand.SU == NextClusterSU, Cand.SU == NextClusterSU, + const SUnit *CandNextClusterSU = + Cand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred(); + const SUnit *TryCandNextClusterSU = + TryCand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred(); + if (tryGreater(TryCand.SU == TryCandNextClusterSU, + Cand.SU == CandNextClusterSU, TryCand, Cand, Cluster)) return; // Weak edges are for clustering and other constraints. - if (tryLess(getWeakLeft(TryCand.SU, Zone.isTop()), - getWeakLeft(Cand.SU, Zone.isTop()), - TryCand, Cand, Weak)) { + if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop), + getWeakLeft(Cand.SU, Cand.AtTop), + TryCand, Cand, Weak)) return; - } + // Avoid increasing the max pressure of the entire region. if (DAG->isTrackingPressure() && tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax, @@ -2857,24 +2872,25 @@ TryCand, Cand, ResourceDemand)) return; - // Avoid serializing long latency dependence chains. - // For acyclic path limited loops, latency was already checked above. - if (!RegionPolicy.DisableLatencyHeuristic && Cand.Policy.ReduceLatency && - !Rem.IsAcyclicLatencyLimited && tryLatency(TryCand, Cand, Zone)) { - return; - } + if (Zone) { + // Avoid serializing long latency dependence chains. + // For acyclic path limited loops, latency was already checked above. + if (!RegionPolicy.DisableLatencyHeuristic && TryCand.Policy.ReduceLatency && + !Rem.IsAcyclicLatencyLimited && tryLatency(TryCand, Cand, *Zone)) + return; - // Prefer immediate defs/users of the last scheduled instruction. This is a - // local pressure avoidance strategy that also makes the machine code - // readable. - if (tryGreater(Zone.isNextSU(TryCand.SU), Zone.isNextSU(Cand.SU), - TryCand, Cand, NextDefUse)) - return; + // Prefer immediate defs/users of the last scheduled instruction. This is a + // local pressure avoidance strategy that also makes the machine code + // readable. + if (tryGreater(Zone->isNextSU(TryCand.SU), Zone->isNextSU(Cand.SU), + TryCand, Cand, NextDefUse)) + return; - // Fall through to original instruction order. - if ((Zone.isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) - || (!Zone.isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) { - TryCand.Reason = NodeOrder; + // Fall through to original instruction order. + if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) + || (!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) { + TryCand.Reason = NodeOrder; + } } } @@ -2884,6 +2900,7 @@ /// DAG building. To adjust for the current scheduling location we need to /// maintain the number of vreg uses remaining to be top-scheduled. void GenericScheduler::pickNodeFromQueue(SchedBoundary &Zone, + const CandPolicy &ZonePolicy, const RegPressureTracker &RPTracker, SchedCandidate &Cand) { ReadyQueue &Q = Zone.Available; @@ -2895,9 +2912,11 @@ for (ReadyQueue::iterator I = Q.begin(), E = Q.end(); I != E; ++I) { - SchedCandidate TryCand(Cand.Policy); + SchedCandidate TryCand(ZonePolicy); initCandidate(TryCand, *I, Zone.isTop(), RPTracker, TempTracker); - tryCandidate(Cand, TryCand, Zone); + // Pass SchedBoundary only when comparing nodes from the same boundary. + SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr; + tryCandidate(Cand, TryCand, ZoneArg); if (TryCand.Reason != NoCand) { // Initialize resource delta if needed in case future heuristics query it. if (TryCand.ResDelta == SchedResourceDelta()) @@ -2922,19 +2941,20 @@ tracePick(Only1, true); return SU; } - CandPolicy NoPolicy; - SchedCandidate BotCand(NoPolicy); - SchedCandidate TopCand(NoPolicy); // Set the bottom-up policy based on the state of the current bottom zone and // the instructions outside the zone, including the top zone. - setPolicy(BotCand.Policy, /*IsPostRA=*/false, Bot, &Top); + CandPolicy BotPolicy; + setPolicy(BotPolicy, /*IsPostRA=*/false, Bot, &Top); // Set the top-down policy based on the state of the current top zone and // the instructions outside the zone, including the bottom zone. - setPolicy(TopCand.Policy, /*IsPostRA=*/false, Top, &Bot); + CandPolicy TopPolicy; + setPolicy(TopPolicy, /*IsPostRA=*/false, Top, &Bot); // Prefer bottom scheduling when heuristics are silent. - pickNodeFromQueue(Bot, DAG->getBotRPTracker(), BotCand); - assert(BotCand.Reason != NoCand && "failed to find the first candidate"); + CandPolicy NoPolicy; + SchedCandidate Cand(NoPolicy); + pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), Cand); + assert(Cand.Reason != NoCand && "failed to find the first candidate"); // If either Q has a single candidate that provides the least increase in // Excess pressure, we can immediately schedule from that Q. @@ -2943,27 +2963,20 @@ // affects picking from either Q. If scheduling in one direction must // increase pressure for one of the excess PSets, then schedule in that // direction first to provide more freedom in the other direction. - if ((BotCand.Reason == RegExcess && !BotCand.isRepeat(RegExcess)) - || (BotCand.Reason == RegCritical && !BotCand.isRepeat(RegCritical))) + if ((Cand.Reason == RegExcess && !Cand.isRepeat(RegExcess)) + || (Cand.Reason == RegCritical && !Cand.isRepeat(RegCritical))) { - IsTopNode = false; - tracePick(BotCand, IsTopNode); - return BotCand.SU; + IsTopNode = Cand.AtTop; + tracePick(Cand); + return Cand.SU; } // Check if the top Q has a better candidate. - pickNodeFromQueue(Top, DAG->getTopRPTracker(), TopCand); - assert(TopCand.Reason != NoCand && "failed to find the first candidate"); + pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), Cand); + assert(Cand.Reason != NoCand && "failed to find the first candidate"); - // Choose the queue with the most important (lowest enum) reason. - if (TopCand.Reason < BotCand.Reason) { - IsTopNode = true; - tracePick(TopCand, IsTopNode); - return TopCand.SU; - } - // Otherwise prefer the bottom candidate, in node order if all else failed. - IsTopNode = false; - tracePick(BotCand, IsTopNode); - return BotCand.SU; + IsTopNode = Cand.AtTop; + tracePick(Cand); + return Cand.SU; } /// Pick the best node to balance the schedule. Implements MachineSchedStrategy. @@ -2980,9 +2993,9 @@ if (!SU) { CandPolicy NoPolicy; SchedCandidate TopCand(NoPolicy); - pickNodeFromQueue(Top, DAG->getTopRPTracker(), TopCand); + pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand); assert(TopCand.Reason != NoCand && "failed to find a candidate"); - tracePick(TopCand, true); + tracePick(TopCand); SU = TopCand.SU; } IsTopNode = true; @@ -2991,9 +3004,9 @@ if (!SU) { CandPolicy NoPolicy; SchedCandidate BotCand(NoPolicy); - pickNodeFromQueue(Bot, DAG->getBotRPTracker(), BotCand); + pickNodeFromQueue(Bot, NoPolicy, DAG->getBotRPTracker(), BotCand); assert(BotCand.Reason != NoCand && "failed to find a candidate"); - tracePick(BotCand, false); + tracePick(BotCand); SU = BotCand.SU; } IsTopNode = false; @@ -3166,6 +3179,7 @@ for (ReadyQueue::iterator I = Q.begin(), E = Q.end(); I != E; ++I) { SchedCandidate TryCand(Cand.Policy); TryCand.SU = *I; + TryCand.AtTop = true; TryCand.initResourceDelta(DAG, SchedModel); tryCandidate(Cand, TryCand); if (TryCand.Reason != NoCand) { @@ -3194,7 +3208,7 @@ setPolicy(TopCand.Policy, /*IsPostRA=*/true, Top, nullptr); pickNodeFromQueue(TopCand); assert(TopCand.Reason != NoCand && "failed to find a candidate"); - tracePick(TopCand, true); + tracePick(TopCand); SU = TopCand.SU; } } while (SU->isScheduled); Index: test/CodeGen/AArch64/arm64-convert-v4f64.ll =================================================================== --- test/CodeGen/AArch64/arm64-convert-v4f64.ll +++ test/CodeGen/AArch64/arm64-convert-v4f64.ll @@ -23,8 +23,8 @@ ; CHECK-DAG: xtn2 v[[NA2]].4s, v[[CONV3]].2d ; CHECK-DAG: xtn v[[NA0:[0-9]+]].2s, v[[CONV0]].2d ; CHECK-DAG: xtn2 v[[NA0]].4s, v[[CONV1]].2d -; CHECK-DAG: xtn v[[TMP1:[0-9]+]].4h, v[[NA0]].4s -; CHECK-DAG: xtn2 v[[TMP1]].8h, v[[NA2]].4s +; CHECK-DAG: xtn v[[TMP1:[0-9]+]].4h, v[[NA2]].4s +; CHECK-DAG: xtn2 v[[TMP1]].8h, v[[NA0]].4s ; CHECK: xtn v0.8b, v[[TMP1]].8h %tmp1 = load <8 x double>, <8 x double>* %ptr %tmp2 = fptosi <8 x double> %tmp1 to <8 x i8> Index: test/CodeGen/AArch64/bitreverse.ll =================================================================== --- test/CodeGen/AArch64/bitreverse.ll +++ test/CodeGen/AArch64/bitreverse.ll @@ -52,7 +52,7 @@ ; CHECK-DAG: movi [[M2:v.*]], #64 ; CHECK-DAG: movi [[M3:v.*]], #32 ; CHECK-DAG: movi [[M4:v.*]], #16 -; CHECK-DAG: movi [[M5:v.*]], #8 +; CHECK-DAG: movi [[M5:v.*]], #8{{$}} ; CHECK-DAG: movi [[M6:v.*]], #4{{$}} ; CHECK-DAG: movi [[M7:v.*]], #2{{$}} ; CHECK-DAG: movi [[M8:v.*]], #1{{$}} Index: test/CodeGen/AArch64/cxx-tlscc.ll =================================================================== --- test/CodeGen/AArch64/cxx-tlscc.ll +++ test/CodeGen/AArch64/cxx-tlscc.ll @@ -44,7 +44,9 @@ ; CHECK-NOT: stp d3, d2 ; CHECK-NOT: stp d1, d0 ; CHECK-NOT: stp x20, x19 -; CHECK-NOT: stp x14, x13 +; FIXME: The splitting logic in the register allocator fails to split along +; control flow here, we used to get this right by accident before... +; CHECK-NOTXX: stp x14, x13 ; CHECK-NOT: stp x12, x11 ; CHECK-NOT: stp x10, x9 ; CHECK-NOT: stp x8, x7 @@ -63,7 +65,7 @@ ; CHECK-NOT: ldp x8, x7 ; CHECK-NOT: ldp x10, x9 ; CHECK-NOT: ldp x12, x11 -; CHECK-NOT: ldp x14, x13 +; CHECK-NOTXX: ldp x14, x13 ; CHECK-NOT: ldp x20, x19 ; CHECK-NOT: ldp d1, d0 ; CHECK-NOT: ldp d3, d2 Index: test/CodeGen/AArch64/vcvt-oversize.ll =================================================================== --- test/CodeGen/AArch64/vcvt-oversize.ll +++ test/CodeGen/AArch64/vcvt-oversize.ll @@ -2,8 +2,9 @@ define <8 x i8> @float_to_i8(<8 x float>* %in) { ; CHECK-LABEL: float_to_i8: -; CHECK-DAG: fadd v[[LSB:[0-9]+]].4s, v0.4s, v0.4s -; CHECK-DAG: fadd v[[MSB:[0-9]+]].4s, v1.4s, v1.4s +; CHECK: ldp q1, q0, [x0] +; CHECK-DAG: fadd v[[LSB:[0-9]+]].4s, v1.4s, v1.4s +; CHECK-DAG: fadd v[[MSB:[0-9]+]].4s, v0.4s, v0.4s ; CHECK-DAG: fcvtzu v[[LSB2:[0-9]+]].4s, v[[LSB]].4s ; CHECK-DAG: fcvtzu v[[MSB2:[0-9]+]].4s, v[[MSB]].4s ; CHECK-DAG: xtn v[[TMP:[0-9]+]].4h, v[[LSB]].4s Index: test/CodeGen/AArch64/vector-fcopysign.ll =================================================================== --- test/CodeGen/AArch64/vector-fcopysign.ll +++ test/CodeGen/AArch64/vector-fcopysign.ll @@ -95,21 +95,21 @@ ; CHECK-LABEL: test_copysign_v4f32_v4f64: ; CHECK-NEXT: mov s3, v0[1] ; CHECK-NEXT: mov d4, v1[1] +; CHECK-NEXT: fcvt s4, d4 ; CHECK-NEXT: movi.4s v5, #128, lsl #24 ; CHECK-NEXT: fcvt s1, d1 -; CHECK-NEXT: mov s6, v0[2] -; CHECK-NEXT: mov s7, v0[3] -; CHECK-NEXT: fcvt s16, d2 +; CHECK-NEXT: bit.16b v3, v4, v5 +; CHECK-NEXT: mov s4, v0[2] +; CHECK-NEXT: mov s6, v0[3] ; CHECK-NEXT: bit.16b v0, v1, v5 -; CHECK-NEXT: bit.16b v6, v16, v5 -; CHECK-NEXT: fcvt s1, d4 -; CHECK-NEXT: bit.16b v3, v1, v5 +; CHECK-NEXT: fcvt s1, d2 +; CHECK-NEXT: bit.16b v4, v1, v5 ; CHECK-NEXT: mov d1, v2[1] ; CHECK-NEXT: fcvt s1, d1 ; CHECK-NEXT: ins.s v0[1], v3[0] -; CHECK-NEXT: ins.s v0[2], v6[0] -; CHECK-NEXT: bit.16b v7, v1, v5 -; CHECK-NEXT: ins.s v0[3], v7[0] +; CHECK-NEXT: ins.s v0[2], v4[0] +; CHECK-NEXT: bit.16b v6, v1, v5 +; CHECK-NEXT: ins.s v0[3], v6[0] ; CHECK-NEXT: ret %tmp0 = fptrunc <4 x double> %b to <4 x float> %r = call <4 x float> @llvm.copysign.v4f32(<4 x float> %a, <4 x float> %tmp0) Index: test/CodeGen/AMDGPU/and.ll =================================================================== --- test/CodeGen/AMDGPU/and.ll +++ test/CodeGen/AMDGPU/and.ll @@ -486,8 +486,8 @@ ; low 32-bits, which is not a valid 64-bit inline immmediate. ; FUNC-LABEL: {{^}}s_and_inline_imm_f32_4.0_i64: -; SI: s_load_dwordx2 ; SI: s_load_dword s +; SI: s_load_dwordx2 ; SI-NOT: and ; SI: s_and_b32 s[[K_HI:[0-9]+]], s{{[0-9]+}}, 4.0 ; SI-NOT: and Index: test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll =================================================================== --- test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll +++ test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll @@ -21,8 +21,8 @@ } ; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_ret_i64_offset: -; SICI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; SICI: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd +; SICI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb +; SICI-DAG: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd ; VI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c ; VI-DAG: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x34 ; GCN-DAG: v_mov_b32_e32 v[[LOVCMP:[0-9]+]], 7 Index: test/CodeGen/AMDGPU/ctpop64.ll =================================================================== --- test/CodeGen/AMDGPU/ctpop64.ll +++ test/CodeGen/AMDGPU/ctpop64.ll @@ -155,10 +155,10 @@ } ; FUNC-LABEL: {{^}}s_ctpop_i65: -; GCN: s_bcnt1_i32_b64 ; GCN: s_and_b32 -; GCN: s_bcnt1_i32_b64 -; GCN: s_add_i32 +; GCN: s_bcnt1_i32_b64 [[REG0:s[0-9]+]], +; GCN: s_bcnt1_i32_b64 [[REG1:s[0-9]+]], +; GCN: s_add_i32 {{s[0-9]+}}, [[REG0]], [[REG1]] ; GCN: s_endpgm define void @s_ctpop_i65(i32 addrspace(1)* noalias %out, i65 %val) nounwind { %ctpop = call i65 @llvm.ctpop.i65(i65 %val) nounwind readnone Index: test/CodeGen/AMDGPU/ds_read2_offset_order.ll =================================================================== --- test/CodeGen/AMDGPU/ds_read2_offset_order.ll +++ test/CodeGen/AMDGPU/ds_read2_offset_order.ll @@ -8,10 +8,9 @@ ; SI-LABEL: {{^}}offset_order: -; SI: ds_read2st64_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:4{{$}} -; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:2 offset1:3 -; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:14 offset1:12 -; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:44 +; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset1:2 +; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:3 offset1:11 +; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:48 define void @offset_order(float addrspace(1)* %out) { entry: Index: test/CodeGen/AMDGPU/ds_read2st64.ll =================================================================== --- test/CodeGen/AMDGPU/ds_read2st64.ll +++ test/CodeGen/AMDGPU/ds_read2st64.ll @@ -197,8 +197,8 @@ ; SI-LABEL: @simple_read2st64_f64_over_max_offset ; SI-NOT: ds_read2st64_b64 -; SI: v_add_i32_e32 [[BIGADD:v[0-9]+]], vcc, 0x10000, {{v[0-9]+}} ; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset:512 +; SI: v_add_i32_e32 [[BIGADD:v[0-9]+]], vcc, 0x10000, {{v[0-9]+}} ; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, [[BIGADD]] ; SI: s_endpgm define void @simple_read2st64_f64_over_max_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { Index: test/CodeGen/AMDGPU/fneg-fabs.f64.ll =================================================================== --- test/CodeGen/AMDGPU/fneg-fabs.f64.ll +++ test/CodeGen/AMDGPU/fneg-fabs.f64.ll @@ -55,7 +55,7 @@ } ; GCN-LABEL: {{^}}fneg_fabs_f64: -; GCN: s_load_dwordx2 +; GCN-DAG: s_load_dwordx2 ; GCN-DAG: v_bfrev_b32_e32 [[IMMREG:v[0-9]+]], 1{{$}} ; SI-DAG: s_load_dwordx2 s{{\[}}[[LO_X:[0-9]+]]:[[HI_X:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0xb ; VI-DAG: s_load_dwordx2 s{{\[}}[[LO_X:[0-9]+]]:[[HI_X:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0x2c Index: test/CodeGen/AMDGPU/insert_vector_elt.ll =================================================================== --- test/CodeGen/AMDGPU/insert_vector_elt.ll +++ test/CodeGen/AMDGPU/insert_vector_elt.ll @@ -10,13 +10,13 @@ ; not just directly into the vector component? ; GCN-LABEL: {{^}}insertelement_v4f32_0: -; GCN: s_load_dwordx4 s{{\[}}[[LOW_REG:[0-9]+]]: +; GCN: s_load_dwordx4 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}} ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}} ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}} ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}} ; GCN-DAG: v_mov_b32_e32 [[CONSTREG:v[0-9]+]], 0x40a00000 -; GCN-DAG: v_mov_b32_e32 v[[LOW_REG]], [[CONSTREG]] +; GCN-DAG: v_mov_b32_e32 v[[LOW_REG:[0-9]+]], [[CONSTREG]] ; GCN: buffer_store_dwordx4 v{{\[}}[[LOW_REG]]: define void @insertelement_v4f32_0(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind { %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 0 Index: test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.clamped.f64.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.clamped.f64.ll +++ test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.clamped.f64.ll @@ -10,10 +10,9 @@ ; TODO: this constant should be folded: ; VI-DAG: s_mov_b32 s[[LOW1:[0-9+]]], -1 ; VI-DAG: s_mov_b32 s[[HIGH1:[0-9+]]], 0x7fefffff -; VI-DAG: s_mov_b32 s[[HIGH2:[0-9+]]], 0xffefffff -; VI-DAG: s_mov_b32 s[[LOW2:[0-9+]]], s[[LOW1]] ; VI-DAG: v_min_f64 v[0:1], [[RSQ]], s{{\[}}[[LOW1]]:[[HIGH1]]] -; VI-DAG: v_max_f64 v[0:1], v[0:1], s{{\[}}[[LOW2]]:[[HIGH2]]] +; VI-DAG: s_mov_b32 s[[HIGH2:[0-9+]]], 0xffefffff +; VI-DAG: v_max_f64 v[0:1], v[0:1], s{{\[}}[[LOW1]]:[[HIGH2]]] define void @rsq_clamped_f64(double addrspace(1)* %out, double %src) nounwind { %rsq_clamped = call double @llvm.AMDGPU.rsq.clamped.f64(double %src) nounwind readnone Index: test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll @@ -29,9 +29,8 @@ ; VI-DAG: s_mov_b32 s[[HIGH1:[0-9+]]], 0x7fefffff ; VI-DAG: s_mov_b32 s[[HIGH2:[0-9+]]], 0xffefffff ; VI-DAG: v_rsq_f64_e32 [[RSQ:v\[[0-9]+:[0-9]+\]]], s[{{[0-9]+:[0-9]+}} -; VI-DAG: s_mov_b32 s[[LOW2:[0-9+]]], s[[LOW1]] ; VI-DAG: v_min_f64 v[0:1], [[RSQ]], s{{\[}}[[LOW1]]:[[HIGH1]]] -; VI-DAG: v_max_f64 v[0:1], v[0:1], s{{\[}}[[LOW2]]:[[HIGH2]]] +; VI-DAG: v_max_f64 v[0:1], v[0:1], s{{\[}}[[LOW1]]:[[HIGH2]]] define void @rsq_clamp_f64(double addrspace(1)* %out, double %src) #0 { %rsq_clamp = call double @llvm.amdgcn.rsq.clamp.f64(double %src) store double %rsq_clamp, double addrspace(1)* %out Index: test/CodeGen/AMDGPU/local-memory-two-objects.ll =================================================================== --- test/CodeGen/AMDGPU/local-memory-two-objects.ll +++ test/CodeGen/AMDGPU/local-memory-two-objects.ll @@ -32,7 +32,8 @@ ; GCN: v_lshlrev_b32_e32 [[ADDRW:v[0-9]+]], 2, v0 -; CI-DAG: ds_write2_b32 [[ADDRW]], {{v[0-9]*}}, {{v[0-9]+}} offset0:4 +; CI-DAG: ds_write_b32 [[ADDRW]], {{v[0-9]*}} offset:16 +; CI-DAG: ds_write_b32 [[ADDRW]], {{v[0-9]*$}} ; SI: v_add_i32_e32 [[ADDRW_OFF:v[0-9]+]], vcc, 16, [[ADDRW]] Index: test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll =================================================================== --- test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll +++ test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll @@ -10,10 +10,10 @@ ; GCN-DAG: buffer_load_dwordx2 v{{\[}}[[LDPTRLO:[0-9]+]]:[[LDPTRHI:[0-9]+]]{{\]}} ; GCN-NOT: v_mov_b32 -; GCN: v_mov_b32_e32 v[[VARG1LO:[0-9]+]], s[[ARG1LO]] -; GCN-NOT: v_mov_b32 ; GCN: v_mov_b32_e32 v[[VARG1HI:[0-9]+]], s[[ARG1HI]] ; GCN-NOT: v_mov_b32 +; GCN: v_mov_b32_e32 v[[VARG1LO:[0-9]+]], s[[ARG1LO]] +; GCN-NOT: v_mov_b32 ; GCN: v_add_i32_e32 v[[PTRLO:[0-9]+]], vcc, v[[LDPTRLO]], v[[VARG1LO]] ; GCN: v_addc_u32_e32 v[[PTRHI:[0-9]+]], vcc, v[[LDPTRHI]], v[[VARG1HI]] Index: test/CodeGen/AMDGPU/sra.ll =================================================================== --- test/CodeGen/AMDGPU/sra.ll +++ test/CodeGen/AMDGPU/sra.ll @@ -228,9 +228,9 @@ } ; GCN-LABEL: {{^}}s_ashr_63_i64: -; GCN-DAG: s_load_dword s[[HI:[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}} +; GCN: s_load_dword s[[HI:[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}} ; GCN: s_ashr_i32 s[[SHIFT:[0-9]+]], s[[HI]], 31 -; GCN: s_add_u32 {{s[0-9]+}}, s[[HI]], {{s[0-9]+}} +; GCN: s_add_u32 {{s[0-9]+}}, s[[SHIFT]], {{s[0-9]+}} ; GCN: s_addc_u32 {{s[0-9]+}}, s[[SHIFT]], {{s[0-9]+}} define void @s_ashr_63_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { %result = ashr i64 %a, 63 Index: test/CodeGen/PowerPC/ppc-shrink-wrapping.ll =================================================================== --- test/CodeGen/PowerPC/ppc-shrink-wrapping.ll +++ test/CodeGen/PowerPC/ppc-shrink-wrapping.ll @@ -629,10 +629,11 @@ ; CHECK-LABEL: transpose ; ; Store of callee-save register saved by shrink wrapping -; CHECK: std [[CSR:[0-9]+]], -[[STACK_OFFSET:[0-9]+]](1) # 8-byte Folded Spill +; FIXME: Test disabled: Improved scheduling needs no spills/reloads any longer! +; CHECKXX: std [[CSR:[0-9]+]], -[[STACK_OFFSET:[0-9]+]](1) # 8-byte Folded Spill ; ; Reload of callee-save register -; CHECK: ld [[CSR]], -[[STACK_OFFSET]](1) # 8-byte Folded Reload +; CHECKXX: ld [[CSR]], -[[STACK_OFFSET]](1) # 8-byte Folded Reload ; ; Ensure no subsequent uses of callee-save register before end of function ; CHECK-NOT: {{[a-z]+}} [[CSR]] Index: test/CodeGen/PowerPC/ppc64-byval-align.ll =================================================================== --- test/CodeGen/PowerPC/ppc64-byval-align.ll +++ test/CodeGen/PowerPC/ppc64-byval-align.ll @@ -35,8 +35,7 @@ ret i64 %0 } ; CHECK-LABEL: @callee2 -; CHECK: ld [[REG:[0-9]+]], 128(1) -; CHECK: mr 3, [[REG]] +; CHECK: ld 3, 128(1) ; CHECK: blr declare i64 @test2(%struct.pad* byval, i32 signext, %struct.test* byval align 16)