Index: include/llvm/Target/TargetInstrInfo.h
===================================================================
--- include/llvm/Target/TargetInstrInfo.h
+++ include/llvm/Target/TargetInstrInfo.h
@@ -980,6 +980,8 @@
 
   virtual bool enableClusterLoads() const { return false; }
 
+  virtual bool enableClusterStores() const { return false; }
+
   virtual bool shouldClusterLoads(MachineInstr *FirstLdSt,
                                   MachineInstr *SecondLdSt,
                                   unsigned NumLoads) const {
Index: lib/CodeGen/MachineScheduler.cpp
===================================================================
--- lib/CodeGen/MachineScheduler.cpp
+++ lib/CodeGen/MachineScheduler.cpp
@@ -71,8 +71,13 @@
 static cl::opt<bool> EnableCyclicPath("misched-cyclicpath", cl::Hidden,
   cl::desc("Enable cyclic critical path analysis."), cl::init(true));
 
-static cl::opt<bool> EnableLoadCluster("misched-cluster", cl::Hidden,
-  cl::desc("Enable load clustering."), cl::init(true));
+static cl::opt<bool> EnableLoadCluster("misched-cluster-load", cl::Hidden,
+                                       cl::desc("Enable load clustering."),
+                                       cl::init(true));
+
+static cl::opt<bool> EnableStoreCluster("misched-cluster-store", cl::Hidden,
+                                        cl::desc("Enable store clustering."),
+                                        cl::init(true));
 
 // Experimental heuristics
 static cl::opt<bool> EnableMacroFusion("misched-fusion", cl::Hidden,
@@ -1351,64 +1356,81 @@
 }
 
 //===----------------------------------------------------------------------===//
-// LoadClusterMutation - DAG post-processing to cluster loads.
+// BaseLdStClusterMutation - DAG post-processing to cluster loads or stores.
//===----------------------------------------------------------------------===//
 
 namespace {
 /// \brief Post-process the DAG to create cluster edges between neighboring
-/// loads.
-class LoadClusterMutation : public ScheduleDAGMutation {
-  struct LoadInfo {
+/// loads or between neighboring stores.
+class BaseLdStClusterMutation : public ScheduleDAGMutation {
+  struct LoadStoreInfo {
     SUnit *SU;
     unsigned BaseReg;
     int64_t Offset;
-    LoadInfo(SUnit *su, unsigned reg, int64_t ofs)
-      : SU(su), BaseReg(reg), Offset(ofs) {}
+    LoadStoreInfo(SUnit *su, unsigned reg, int64_t ofs)
+        : SU(su), BaseReg(reg), Offset(ofs) {}
 
-    bool operator<(const LoadInfo &RHS) const {
+    bool operator<(const LoadStoreInfo &RHS) const {
       return std::tie(BaseReg, Offset) < std::tie(RHS.BaseReg, RHS.Offset);
     }
   };
 
   const TargetInstrInfo *TII;
   const TargetRegisterInfo *TRI;
+  bool isLoad;
+
 public:
-  LoadClusterMutation(const TargetInstrInfo *tii,
-                      const TargetRegisterInfo *tri)
-    : TII(tii), TRI(tri) {}
+  BaseLdStClusterMutation(const TargetInstrInfo *tii,
+                          const TargetRegisterInfo *tri, bool isLoad)
+      : TII(tii), TRI(tri), isLoad(isLoad) {}
 
   void apply(ScheduleDAGInstrs *DAGInstrs) override;
+
 protected:
-  void clusterNeighboringLoads(ArrayRef<SUnit *> Loads, ScheduleDAGMI *DAG);
+  void clusterNeighboringLdSt(ArrayRef<SUnit *> MemOps, ScheduleDAGMI *DAG);
+};
+
+class StoreClusterMutation : public BaseLdStClusterMutation {
+public:
+  StoreClusterMutation(const TargetInstrInfo *tii,
+                       const TargetRegisterInfo *tri)
+      : BaseLdStClusterMutation(tii, tri, false) {}
+};
+
+class LoadClusterMutation : public BaseLdStClusterMutation {
+public:
+  LoadClusterMutation(const TargetInstrInfo *tii, const TargetRegisterInfo *tri)
+      : BaseLdStClusterMutation(tii, tri, true) {}
 };
 } // anonymous
 
-void LoadClusterMutation::clusterNeighboringLoads(ArrayRef<SUnit *> Loads,
-                                                  ScheduleDAGMI *DAG) {
-  SmallVector<LoadInfo, 4> LoadRecords;
-  for (unsigned Idx = 0, End = Loads.size(); Idx != End; ++Idx) {
-    SUnit *SU = Loads[Idx];
+void BaseLdStClusterMutation::clusterNeighboringLdSt(ArrayRef<SUnit *> MemOps,
+                                                     ScheduleDAGMI *DAG) {
+  SmallVector<LoadStoreInfo, 4> LdStRecords;
+  for (unsigned Idx = 0, End = MemOps.size(); Idx != End; ++Idx) {
+    SUnit *SU = MemOps[Idx];
     unsigned BaseReg;
     int64_t Offset;
     if (TII->getMemOpBaseRegImmOfs(SU->getInstr(), BaseReg, Offset, TRI))
-      LoadRecords.push_back(LoadInfo(SU, BaseReg, Offset));
+      LdStRecords.push_back(LoadStoreInfo(SU, BaseReg, Offset));
   }
-  if (LoadRecords.size() < 2)
+  if (LdStRecords.size() < 2)
     return;
-  std::sort(LoadRecords.begin(), LoadRecords.end());
+
+  std::sort(LdStRecords.begin(), LdStRecords.end());
   unsigned ClusterLength = 1;
-  for (unsigned Idx = 0, End = LoadRecords.size(); Idx < (End - 1); ++Idx) {
-    if (LoadRecords[Idx].BaseReg != LoadRecords[Idx+1].BaseReg) {
+  for (unsigned Idx = 0, End = LdStRecords.size(); Idx < (End - 1); ++Idx) {
+    if (LdStRecords[Idx].BaseReg != LdStRecords[Idx+1].BaseReg) {
       ClusterLength = 1;
       continue;
     }
-    SUnit *SUa = LoadRecords[Idx].SU;
-    SUnit *SUb = LoadRecords[Idx+1].SU;
+
+    SUnit *SUa = LdStRecords[Idx].SU;
+    SUnit *SUb = LdStRecords[Idx+1].SU;
     if (TII->shouldClusterLoads(SUa->getInstr(), SUb->getInstr(), ClusterLength)
         && DAG->addEdge(SUb, SDep(SUa, SDep::Cluster))) {
-
-      DEBUG(dbgs() << "Cluster loads SU(" << SUa->NodeNum << ") - SU("
+      DEBUG(dbgs() << "Cluster ld/st SU(" << SUa->NodeNum << ") - SU("
             << SUb->NodeNum << ")\n");
       // Copy successor edges from SUa to SUb. Interleaving computation
       // dependent on SUa can prevent load combining due to register reuse.
@@ -1429,7 +1451,8 @@
 }
 
 /// \brief Callback from DAG postProcessing to create cluster edges for loads.
-void LoadClusterMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
+void BaseLdStClusterMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
+
   ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI *>(DAGInstrs);
   // Map DAG NodeNum to store chain ID.
   DenseMap<unsigned, unsigned> StoreChainIDs;
@@ -1438,8 +1461,10 @@
   SmallVector<SmallVector<SUnit *, 4>, 32> StoreChainDependents;
   for (unsigned Idx = 0, End = DAG->SUnits.size(); Idx != End; ++Idx) {
     SUnit *SU = &DAG->SUnits[Idx];
-    if (!SU->getInstr()->mayLoad())
+    if ((isLoad && !SU->getInstr()->mayLoad()) ||
+        (!isLoad && !SU->getInstr()->mayStore()))
       continue;
+
     unsigned ChainPredID = DAG->SUnits.size();
     for (SUnit::const_pred_iterator
            PI = SU->Preds.begin(), PE = SU->Preds.end(); PI != PE; ++PI) {
@@ -1457,9 +1482,10 @@
       StoreChainDependents.resize(NumChains + 1);
     StoreChainDependents[Result.first->second].push_back(SU);
   }
+
   // Iterate over the store chains.
   for (unsigned Idx = 0, End = StoreChainDependents.size(); Idx != End; ++Idx)
-    clusterNeighboringLoads(StoreChainDependents[Idx], DAG);
+    clusterNeighboringLdSt(StoreChainDependents[Idx], DAG);
 }
 
 //===----------------------------------------------------------------------===//
@@ -3052,8 +3078,13 @@
   // data and pass it to later mutations. Have a single mutation that gathers
   // the interesting nodes in one pass.
   DAG->addMutation(make_unique<CopyConstrain>(DAG->TII, DAG->TRI));
+
   if (EnableLoadCluster && DAG->TII->enableClusterLoads())
     DAG->addMutation(make_unique<LoadClusterMutation>(DAG->TII, DAG->TRI));
+
+  if (EnableStoreCluster && DAG->TII->enableClusterStores())
+    DAG->addMutation(make_unique<StoreClusterMutation>(DAG->TII, DAG->TRI));
+
   if (EnableMacroFusion)
     DAG->addMutation(make_unique<MacroFusion>(*DAG->TII, *DAG->TRI));
   return DAG;
Index: lib/Target/AArch64/AArch64InstrInfo.h
===================================================================
--- lib/Target/AArch64/AArch64InstrInfo.h
+++ lib/Target/AArch64/AArch64InstrInfo.h
@@ -109,6 +109,8 @@
 
   bool enableClusterLoads() const override { return true; }
 
+  bool enableClusterStores() const override { return true; }
+
   bool shouldClusterLoads(MachineInstr *FirstLdSt, MachineInstr *SecondLdSt,
                           unsigned NumLoads) const override;
 
Index: lib/Target/AArch64/AArch64InstrInfo.cpp
===================================================================
--- lib/Target/AArch64/AArch64InstrInfo.cpp
+++ lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -1388,6 +1388,11 @@
   case AArch64::LDRWui:
   case AArch64::LDRSWui:
   // Unscaled instructions.
+  case AArch64::STURSi:
+  case AArch64::STURDi:
+  case AArch64::STURQi:
+  case AArch64::STURXi:
+  case AArch64::STURWi:
   case AArch64::LDURSi:
   case AArch64::LDURDi:
   case AArch64::LDURQi:
@@ -1496,15 +1501,20 @@
   default:
     return false;
   case AArch64::LDURQi:
+  case AArch64::STURQi:
     OffsetStride = 16;
     break;
   case AArch64::LDURXi:
   case AArch64::LDURDi:
+  case AArch64::STURXi:
+  case AArch64::STURDi:
     OffsetStride = 8;
     break;
   case AArch64::LDURWi:
   case AArch64::LDURSi:
   case AArch64::LDURSWi:
+  case AArch64::STURWi:
+  case AArch64::STURSi:
     OffsetStride = 4;
     break;
   }
Index: test/CodeGen/AArch64/aarch64-stp-cluster.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AArch64/aarch64-stp-cluster.ll
@@ -0,0 +1,149 @@
+; REQUIRES: asserts
+; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a57 -verify-misched -debug-only=misched -aarch64-stp-suppress=false -o - 2>&1 > /dev/null | FileCheck %s
+
+; CHECK: ********** MI Scheduling **********
+; CHECK-LABEL: stp_i64_scale:BB#0
+; CHECK:Cluster ld/st SU(4) - SU(3)
+; CHECK:Cluster ld/st SU(2) - SU(5)
+; CHECK:SU(4): STRXui %vreg1, %vreg0, 1
+; CHECK:SU(3): STRXui %vreg1, %vreg0, 2
+; CHECK:SU(2): STRXui %vreg1, %vreg0, 3
+; CHECK:SU(5): STRXui %vreg1, %vreg0, 4
+define i64 @stp_i64_scale(i64* nocapture %P, i64 %v) {
+entry:
+  %arrayidx = getelementptr inbounds i64, i64* %P, i64 3
+  store i64 %v, i64* %arrayidx
+  %arrayidx1 = getelementptr inbounds i64, i64* %P, i64 2
+  store i64 %v, i64* %arrayidx1
+  %arrayidx2 = getelementptr inbounds i64, i64* %P, i64 1
+  store i64 %v, i64* %arrayidx2
+  %arrayidx3 = getelementptr inbounds i64, i64* %P, i64 4
+  store i64 %v, i64* %arrayidx3
+  ret i64 %v
+}
+
+; CHECK: ********** MI Scheduling **********
+; CHECK-LABEL: stp_i32_scale:BB#0
+; CHECK:Cluster ld/st SU(4) - SU(3)
+; CHECK:Cluster ld/st SU(2) - SU(5)
+; CHECK:SU(4): STRWui %vreg1, %vreg0, 1
+; CHECK:SU(3): STRWui %vreg1, %vreg0, 2
+; CHECK:SU(2): STRWui %vreg1, %vreg0, 3
+; CHECK:SU(5): STRWui %vreg1, %vreg0, 4
+define i32 @stp_i32_scale(i32* nocapture %P, i32 %v) {
+entry:
+  %arrayidx = getelementptr inbounds i32, i32* %P, i32 3
+  store i32 %v, i32* %arrayidx
+  %arrayidx1 = getelementptr inbounds i32, i32* %P, i32 2
+  store i32 %v, i32* %arrayidx1
+  %arrayidx2 = getelementptr inbounds i32, i32* %P, i32 1
+  store i32 %v, i32* %arrayidx2
+  %arrayidx3 = getelementptr inbounds i32, i32* %P, i32 4
+  store i32 %v, i32* %arrayidx3
+  ret i32 %v
+}
+
+; CHECK:********** MI Scheduling **********
+; CHECK-LABEL:stp_i64_unscale:BB#0 entry
+; CHECK:Cluster ld/st SU(5) - SU(2)
+; CHECK:Cluster ld/st SU(4) - SU(3)
+; CHECK:SU(5): STURXi %vreg1, %vreg0, -32
+; CHECK:SU(2): STURXi %vreg1, %vreg0, -24
+; CHECK:SU(4): STURXi %vreg1, %vreg0, -16
+; CHECK:SU(3): STURXi %vreg1, %vreg0, -8
+define void @stp_i64_unscale(i64* nocapture %P, i64 %v) #0 {
+entry:
+  %arrayidx = getelementptr inbounds i64, i64* %P, i64 -3
+  store i64 %v, i64* %arrayidx
+  %arrayidx1 = getelementptr inbounds i64, i64* %P, i64 -1
+  store i64 %v, i64* %arrayidx1
+  %arrayidx2 = getelementptr inbounds i64, i64* %P, i64 -2
+  store i64 %v, i64* %arrayidx2
+  %arrayidx3 = getelementptr inbounds i64, i64* %P, i64 -4
+  store i64 %v, i64* %arrayidx3
+  ret void
+}
+
+; CHECK:********** MI Scheduling **********
+; CHECK-LABEL:stp_i32_unscale:BB#0 entry
+; CHECK:Cluster ld/st SU(5) - SU(2)
+; CHECK:Cluster ld/st SU(4) - SU(3)
+; CHECK:SU(5): STURWi %vreg1, %vreg0, -16
+; CHECK:SU(2): STURWi %vreg1, %vreg0, -12
+; CHECK:SU(4): STURWi %vreg1, %vreg0, -8
+; CHECK:SU(3): STURWi %vreg1, %vreg0, -4
+define void @stp_i32_unscale(i32* nocapture %P, i32 %v) #0 {
+entry:
+  %arrayidx = getelementptr inbounds i32, i32* %P, i32 -3
+  store i32 %v, i32* %arrayidx
+  %arrayidx1 = getelementptr inbounds i32, i32* %P, i32 -1
+  store i32 %v, i32* %arrayidx1
+  %arrayidx2 = getelementptr inbounds i32, i32* %P, i32 -2
+  store i32 %v, i32* %arrayidx2
+  %arrayidx3 = getelementptr inbounds i32, i32* %P, i32 -4
+  store i32 %v, i32* %arrayidx3
+  ret void
+}
+
+; CHECK:********** MI Scheduling **********
+; CHECK-LABEL:stp_double:BB#0
+; CHECK:Cluster ld/st SU(3) - SU(4)
+; CHECK:Cluster ld/st SU(2) - SU(5)
+; CHECK:SU(3): STRDui %vreg1, %vreg0, 1
+; CHECK:SU(4): STRDui %vreg1, %vreg0, 2
+; CHECK:SU(2): STRDui %vreg1, %vreg0, 3
+; CHECK:SU(5): STRDui %vreg1, %vreg0, 4
+define void @stp_double(double* nocapture %P, double %v) {
+entry:
+  %arrayidx = getelementptr inbounds double, double* %P, i64 3
+  store double %v, double* %arrayidx
+  %arrayidx1 = getelementptr inbounds double, double* %P, i64 1
+  store double %v, double* %arrayidx1
+  %arrayidx2 = getelementptr inbounds double, double* %P, i64 2
+  store double %v, double* %arrayidx2
+  %arrayidx3 = getelementptr inbounds double, double* %P, i64 4
+  store double %v, double* %arrayidx3
+  ret void
+}
+
+; CHECK:********** MI Scheduling **********
+; CHECK-LABEL:stp_float:BB#0
+; CHECK:Cluster ld/st SU(3) - SU(4)
+; CHECK:Cluster ld/st SU(2) - SU(5)
+; CHECK:SU(3): STRSui %vreg1, %vreg0, 1
+; CHECK:SU(4): STRSui %vreg1, %vreg0, 2
+; CHECK:SU(2): STRSui %vreg1, %vreg0, 3
+; CHECK:SU(5): STRSui %vreg1, %vreg0, 4
+define void @stp_float(float* nocapture %P, float %v) {
+entry:
+  %arrayidx = getelementptr inbounds float, float* %P, i64 3
+  store float %v, float* %arrayidx
+  %arrayidx1 = getelementptr inbounds float, float* %P, i64 1
+  store float %v, float* %arrayidx1
+  %arrayidx2 = getelementptr inbounds float, float* %P, i64 2
+  store float %v, float* %arrayidx2
+  %arrayidx3 = getelementptr inbounds float, float* %P, i64 4
+  store float %v, float* %arrayidx3
+  ret void
+}
+
+; CHECK: ********** MI Scheduling **********
+; CHECK-LABEL: stp_volatile:BB#0
+; CHECK-NOT: Cluster ld/st
+; CHECK:SU(2): STRXui %vreg1, %vreg0, 3; mem:Volatile
+; CHECK:SU(3): STRXui %vreg1, %vreg0, 2; mem:Volatile
+; CHECK:SU(4): STRXui %vreg1, %vreg0, 1; mem:Volatile
+; CHECK:SU(5): STRXui %vreg1, %vreg0, 4; mem:Volatile
+define i64 @stp_volatile(i64* nocapture %P, i64 %v) {
+entry:
+  %arrayidx = getelementptr inbounds i64, i64* %P, i64 3
+  store volatile i64 %v, i64* %arrayidx
+  %arrayidx1 = getelementptr inbounds i64, i64* %P, i64 2
+  store volatile i64 %v, i64* %arrayidx1
+  %arrayidx2 = getelementptr inbounds i64, i64* %P, i64 1
+  store volatile i64 %v, i64* %arrayidx2
+  %arrayidx3 = getelementptr inbounds i64, i64* %P, i64 4
+  store volatile i64 %v, i64* %arrayidx3
+  ret i64 %v
+}
+
Index: test/CodeGen/AArch64/arm64-ldp-cluster.ll
===================================================================
--- test/CodeGen/AArch64/arm64-ldp-cluster.ll
+++ test/CodeGen/AArch64/arm64-ldp-cluster.ll
@@ -4,7 +4,7 @@
 ; Test ldr clustering.
 ; CHECK: ********** MI Scheduling **********
 ; CHECK-LABEL: ldr_int:BB#0
-; CHECK: Cluster loads SU(1) - SU(2)
+; CHECK: Cluster ld/st SU(1) - SU(2)
 ; CHECK: SU(1): %vreg{{[0-9]+}} = LDRWui
 ; CHECK: SU(2): %vreg{{[0-9]+}} = LDRWui
 define i32 @ldr_int(i32* %a) nounwind {
@@ -19,7 +19,7 @@
 ; Test ldpsw clustering
 ; CHECK: ********** MI Scheduling **********
 ; CHECK-LABEL: ldp_sext_int:BB#0
-; CHECK: Cluster loads SU(1) - SU(2)
+; CHECK: Cluster ld/st SU(1) - SU(2)
 ; CHECK: SU(1): %vreg{{[0-9]+}} = LDRSWui
 ; CHECK: SU(2): %vreg{{[0-9]+}} = LDRSWui
 define i64 @ldp_sext_int(i32* %p) nounwind {
@@ -35,7 +35,7 @@
 ; Test ldur clustering.
 ; CHECK: ********** MI Scheduling **********
 ; CHECK-LABEL: ldur_int:BB#0
-; CHECK: Cluster loads SU(2) - SU(1)
+; CHECK: Cluster ld/st SU(2) - SU(1)
 ; CHECK: SU(1): %vreg{{[0-9]+}} = LDURWi
 ; CHECK: SU(2): %vreg{{[0-9]+}} = LDURWi
 define i32 @ldur_int(i32* %a) nounwind {
@@ -50,7 +50,7 @@
 ; Test sext + zext clustering.
 ; CHECK: ********** MI Scheduling **********
 ; CHECK-LABEL: ldp_half_sext_zext_int:BB#0
-; CHECK: Cluster loads SU(3) - SU(4)
+; CHECK: Cluster ld/st SU(3) - SU(4)
 ; CHECK: SU(3): %vreg{{[0-9]+}} = LDRSWui
 ; CHECK: SU(4): %vreg{{[0-9]+}}:sub_32 = LDRWui
 define i64 @ldp_half_sext_zext_int(i64* %q, i32* %p) nounwind {
@@ -68,7 +68,7 @@
 ; Test zext + sext clustering.
 ; CHECK: ********** MI Scheduling **********
 ; CHECK-LABEL: ldp_half_zext_sext_int:BB#0
-; CHECK: Cluster loads SU(3) - SU(4)
+; CHECK: Cluster ld/st SU(3) - SU(4)
 ; CHECK: SU(3): %vreg{{[0-9]+}}:sub_32 = LDRWui
 ; CHECK: SU(4): %vreg{{[0-9]+}} = LDRSWui
 define i64 @ldp_half_zext_sext_int(i64* %q, i32* %p) nounwind {
@@ -86,7 +86,7 @@
 ; Verify we don't cluster volatile loads.
 ; CHECK: ********** MI Scheduling **********
 ; CHECK-LABEL: ldr_int_volatile:BB#0
-; CHECK-NOT: Cluster loads
+; CHECK-NOT: Cluster ld/st
 ; CHECK: SU(1): %vreg{{[0-9]+}} = LDRWui
 ; CHECK: SU(2): %vreg{{[0-9]+}} = LDRWui
 define i32 @ldr_int_volatile(i32* %a) nounwind {
Index: test/CodeGen/AArch64/arm64-stp.ll
===================================================================
--- test/CodeGen/AArch64/arm64-stp.ll
+++ test/CodeGen/AArch64/arm64-stp.ll
@@ -100,9 +100,9 @@
 
 ; Read of %b to compute %tmp2 shouldn't prevent formation of stp
 ; CHECK-LABEL: stp_int_rar_hazard
-; CHECK: stp w0, w1, [x2]
 ; CHECK: ldr [[REG:w[0-9]+]], [x2, #8]
-; CHECK: add w0, [[REG]], w1
+; CHECK: add w8, [[REG]], w1
+; CHECK: stp w0, w1, [x2]
 ; CHECK: ret
 define i32 @stp_int_rar_hazard(i32 %a, i32 %b, i32* nocapture %p) nounwind {
   store i32 %a, i32* %p, align 4
Index: test/CodeGen/AArch64/global-merge-group-by-use.ll
===================================================================
--- test/CodeGen/AArch64/global-merge-group-by-use.ll
+++ test/CodeGen/AArch64/global-merge-group-by-use.ll
@@ -64,8 +64,8 @@
 define void @f4(i32 %a1, i32 %a2, i32 %a3) #0 {
 ; CHECK-NEXT: adrp x8, [[SET3]]@PAGE
 ; CHECK-NEXT: add x8, x8, [[SET3]]@PAGEOFF
-; CHECK-NEXT: stp w0, w1, [x8, #4]
-; CHECK-NEXT: str w2, [x8]
+; CHECK-NEXT: stp w2, w0, [x8]
+; CHECK-NEXT: str w1, [x8, #8]
 ; CHECK-NEXT: ret
   store i32 %a1, i32* @m4, align 4
   store i32 %a2, i32* @n4, align 4