diff --git a/llvm/lib/Target/PowerPC/PPC.td b/llvm/lib/Target/PowerPC/PPC.td
--- a/llvm/lib/Target/PowerPC/PPC.td
+++ b/llvm/lib/Target/PowerPC/PPC.td
@@ -174,6 +174,9 @@
                                               "HasAddisLoadFusion", "true",
                                               "Power8 Addis-Load fusion",
                                               [FeatureFusion]>;
+def FeatureStoreFusion : SubtargetFeature<"fuse-store", "HasStoreFusion", "true",
+                                          "Target supports store clustering",
+                                          [FeatureFusion]>;
 def FeatureUnalignedFloats :
   SubtargetFeature<"allow-unaligned-fp-access", "AllowsUnalignedFPAccess",
                    "true", "CPU does not trap on unaligned FP access">;
@@ -345,10 +348,12 @@
   // Power10
   // For P10 CPU we assume that all of the existing features from Power9
   // still exist with the exception of those we know are Power9 specific.
+  list<SubtargetFeature> FusionFeatures = [FeatureStoreFusion];
   list<SubtargetFeature> P10AdditionalFeatures =
-    [DirectivePwr10, FeatureISA3_1, FeaturePrefixInstrs,
-     FeaturePCRelativeMemops, FeatureP10Vector, FeatureMMA,
-     FeaturePairedVectorMemops];
+    !listconcat(FusionFeatures, [
+    DirectivePwr10, FeatureISA3_1, FeaturePrefixInstrs,
+    FeaturePCRelativeMemops, FeatureP10Vector, FeatureMMA,
+    FeaturePairedVectorMemops]);
   list<SubtargetFeature> P10SpecificFeatures = [];
   list<SubtargetFeature> P10InheritableFeatures =
     !listconcat(P9InheritableFeatures, P10AdditionalFeatures);
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.h b/llvm/lib/Target/PowerPC/PPCInstrInfo.h
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.h
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.h
@@ -494,6 +494,19 @@
                                     int64_t &Offset, unsigned &Width,
                                     const TargetRegisterInfo *TRI) const;
 
+  /// Get the base operand and byte offset of an instruction that reads/writes
+  /// memory.
+  bool getMemOperandsWithOffsetWidth(
+      const MachineInstr &MI, SmallVectorImpl<const MachineOperand *> &BaseOps,
+      int64_t &Offset, bool &OffsetIsScalable, unsigned &Width,
+      const TargetRegisterInfo *TRI) const override;
+
+  /// Returns true if the two given memory operations should be scheduled
+  /// adjacent.
+  bool shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
+                           ArrayRef<const MachineOperand *> BaseOps2,
+                           unsigned NumLoads, unsigned NumBytes) const override;
+
   /// Return true if two MIs access different memory addresses and false
   /// otherwise
   bool
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
@@ -2222,6 +2222,112 @@
   return true;
 }
 
+bool PPCInstrInfo::getMemOperandsWithOffsetWidth(
+    const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
+    int64_t &Offset, bool &OffsetIsScalable, unsigned &Width,
+    const TargetRegisterInfo *TRI) const {
+  const MachineOperand *BaseOp;
+  if (!getMemOperandWithOffsetWidth(LdSt, BaseOp, Offset, Width, TRI))
+    return false;
+  BaseOps.push_back(BaseOp);
+  return true;
+}
+
+static bool isLdStSafeToCluster(const MachineInstr &LdSt,
+                                const TargetRegisterInfo *TRI) {
+  // If this is a volatile or ordered load/store, don't mess with it.
+  if (LdSt.hasOrderedMemoryRef())
+    return false;
+
+  if (LdSt.getOperand(2).isFI())
+    return true;
+
+  assert(LdSt.getOperand(2).isReg() && "Expected a reg operand.");
+  // Can't cluster if the instruction modifies the base register
+  // or it is an update form, e.g. ld r2, 3(r2).
+  if (LdSt.modifiesRegister(LdSt.getOperand(2).getReg(), TRI))
+    return false;
+
+  return true;
+}
+
+// Only cluster instruction pairs that have the same opcode and are
+// clusterable according to the PowerPC specification.
+static bool isClusterableLdStOpcPair(unsigned FirstOpc, unsigned SecondOpc,
+                                     const PPCSubtarget &Subtarget) {
+  switch (FirstOpc) {
+  default:
+    return false;
+  case PPC::STD:
+  case PPC::STFD:
+  case PPC::STXSD:
+  case PPC::DFSTOREf64:
+    return FirstOpc == SecondOpc;
+  // The PowerPC backend has two opcodes, STW and STW8, for the "stw"
+  // instruction to handle 32-bit and 64-bit instruction selection. They form
+  // a clusterable pair even though the opcodes differ.
+  case PPC::STW:
+  case PPC::STW8:
+    return SecondOpc == PPC::STW || SecondOpc == PPC::STW8;
+  }
+}
+
+bool PPCInstrInfo::shouldClusterMemOps(
+    ArrayRef<const MachineOperand *> BaseOps1,
+    ArrayRef<const MachineOperand *> BaseOps2, unsigned NumLoads,
+    unsigned NumBytes) const {
+
+  assert(BaseOps1.size() == 1 && BaseOps2.size() == 1);
+  const MachineOperand &BaseOp1 = *BaseOps1.front();
+  const MachineOperand &BaseOp2 = *BaseOps2.front();
+  assert((BaseOp1.isReg() ||
+          BaseOp1.isFI()) &&
+         "Only base registers and frame indices are supported.");
+
+  // NumLoads is the number of loads that have already been clustered. Don't
+  // cluster this memory op if at least two ops have already been clustered.
+  if (NumLoads > 2)
+    return false;
+
+  // Only cluster the loads/stores when they have the same base
+  // register or frame index.
+  if ((BaseOp1.isReg() != BaseOp2.isReg()) ||
+      (BaseOp1.isReg() && BaseOp1.getReg() != BaseOp2.getReg()) ||
+      (BaseOp1.isFI() && BaseOp1.getIndex() != BaseOp2.getIndex()))
+    return false;
+
+  // Check if the loads/stores are clusterable according to the PowerPC
+  // specification.
+  const MachineInstr &FirstLdSt = *BaseOp1.getParent();
+  const MachineInstr &SecondLdSt = *BaseOp2.getParent();
+  unsigned FirstOpc = FirstLdSt.getOpcode();
+  unsigned SecondOpc = SecondLdSt.getOpcode();
+  const TargetRegisterInfo *TRI = &getRegisterInfo();
+  // Only cluster the loads/stores when they have the same opcode and that
+  // opcode is clusterable according to the PowerPC specification.
+  if (!isClusterableLdStOpcPair(FirstOpc, SecondOpc, Subtarget))
+    return false;
+
+  // Can't cluster loads/stores that have an ordered or volatile memory reference.
+  if (!isLdStSafeToCluster(FirstLdSt, TRI) ||
+      !isLdStSafeToCluster(SecondLdSt, TRI))
+    return false;
+
+  int64_t Offset1 = 0, Offset2 = 0;
+  unsigned Width1 = 0, Width2 = 0;
+  const MachineOperand *Base1 = nullptr, *Base2 = nullptr;
+  if (!getMemOperandWithOffsetWidth(FirstLdSt, Base1, Offset1, Width1, TRI) ||
+      !getMemOperandWithOffsetWidth(SecondLdSt, Base2, Offset2, Width2, TRI) ||
+      Width1 != Width2)
+    return false;
+
+  assert(Base1 == &BaseOp1 && Base2 == &BaseOp2 &&
+         "getMemOperandWithOffsetWidth returned an incorrect base op");
+  // The caller should already have ordered FirstMemOp/SecondMemOp by offset.
+  assert(Offset1 <= Offset2 && "Caller should have ordered offsets.");
+  return Offset1 + Width1 == Offset2;
+}
+
 /// GetInstSize - Return the number of bytes of code the specified
 /// instruction may be.  This returns the maximum number of bytes.
 ///
@@ -4664,7 +4770,8 @@
     return false;
 
   // Handle only loads/stores with base register followed by immediate offset.
-  if (LdSt.getNumExplicitOperands() != 3)
+  if (!LdSt.getOperand(1).isImm() ||
+      (!LdSt.getOperand(2).isReg() && !LdSt.getOperand(2).isFI()))
     return false;
   if (!LdSt.getOperand(1).isImm() ||
       (!LdSt.getOperand(2).isReg() && !LdSt.getOperand(2).isFI()))
diff --git a/llvm/lib/Target/PowerPC/PPCSubtarget.h b/llvm/lib/Target/PowerPC/PPCSubtarget.h
--- a/llvm/lib/Target/PowerPC/PPCSubtarget.h
+++ b/llvm/lib/Target/PowerPC/PPCSubtarget.h
@@ -137,6 +137,7 @@
   bool HasHTM;
   bool HasFloat128;
   bool HasFusion;
+  bool HasStoreFusion;
   bool HasAddiLoadFusion;
   bool HasAddisLoadFusion;
   bool IsISA3_0;
@@ -308,6 +309,7 @@
   bool isISA3_1() const { return IsISA3_1; }
   bool useLongCalls() const { return UseLongCalls; }
   bool hasFusion() const { return HasFusion; }
+  bool hasStoreFusion() const { return HasStoreFusion; }
   bool hasAddiLoadFusion() const { return HasAddiLoadFusion; }
   bool hasAddisLoadFusion() const { return HasAddisLoadFusion; }
   bool needsSwapsForVSXMemOps() const {
diff --git a/llvm/lib/Target/PowerPC/PPCSubtarget.cpp b/llvm/lib/Target/PowerPC/PPCSubtarget.cpp
--- a/llvm/lib/Target/PowerPC/PPCSubtarget.cpp
+++ b/llvm/lib/Target/PowerPC/PPCSubtarget.cpp
@@ -108,6 +108,7 @@
   HasHTM = false;
   HasFloat128 = false;
   HasFusion = false;
+  HasStoreFusion = false;
   HasAddiLoadFusion = false;
   HasAddisLoadFusion = false;
   IsISA3_0 = false;
diff --git a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
--- a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
+++ b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
@@ -271,6 +271,8 @@
                           std::make_unique<GenericScheduler>(C));
   // add DAG Mutations here.
   DAG->addMutation(createCopyConstrainDAGMutation(DAG->TII, DAG->TRI));
+  if (ST.hasStoreFusion())
+    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
   if (ST.hasFusion())
     DAG->addMutation(createPowerPCMacroFusionDAGMutation());
 
@@ -285,6 +287,8 @@
                       std::make_unique<PPCPostRASchedStrategy>(C) :
                       std::make_unique<PostGenericScheduler>(C), true);
   // add DAG Mutations here.
+  if (ST.hasStoreFusion())
+    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
   if (ST.hasFusion())
     DAG->addMutation(createPowerPCMacroFusionDAGMutation());
   return DAG;
diff --git a/llvm/test/CodeGen/PowerPC/fusion-load-store.ll b/llvm/test/CodeGen/PowerPC/fusion-load-store.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/fusion-load-store.ll
@@ -0,0 +1,268 @@
+; Test whether several consecutive loads/stores can be clustered (fused) by the scheduler.
+; The scheduler will print "Cluster ld/st SU(x) - SU(y)" if SU(x) and SU(y) are fused.
+
+; REQUIRES: asserts
+; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr10 \
+; RUN:   -mattr=-paired-vector-memops,-pcrelative-memops -verify-misched \
+; RUN:   -debug-only=machine-scheduler 2>&1 | FileCheck %s
+
+define i64 @store_i64(i64* nocapture %P, i64 %v) {
+entry:
+; CHECK: ********** MI Scheduling **********
+; CHECK-LABEL: store_i64:%bb.0
+; CHECK: Cluster ld/st SU([[SU3:[0-9]+]]) - SU([[SU4:[0-9]+]])
+; CHECK: Cluster ld/st SU([[SU2:[0-9]+]]) - SU([[SU5:[0-9]+]])
+; CHECK: SU([[SU2]]): STD %[[REG:[0-9]+]]:g8rc, 24
+; CHECK: SU([[SU3]]): STD %[[REG]]:g8rc, 16
+; CHECK: SU([[SU4]]): STD %[[REG]]:g8rc, 8
+; CHECK: SU([[SU5]]): STD %[[REG]]:g8rc, 32
+; CHECK: ********** MI Scheduling **********
+; CHECK-LABEL: store_i64:%bb.0
+; CHECK: Cluster ld/st SU([[SU0:[0-9]+]]) - SU([[SU1:[0-9]+]])
+; CHECK: Cluster ld/st SU([[SU2:[0-9]+]]) - SU([[SU3:[0-9]+]])
+; CHECK: SU([[SU0]]): STD renamable $x[[REG:[0-9]+]], 16
+; CHECK: SU([[SU1]]): STD renamable $x[[REG]], 8
+; CHECK: SU([[SU2]]): STD renamable $x[[REG]], 24
+; CHECK: SU([[SU3]]): STD renamable $x[[REG]], 32
+  %arrayidx = getelementptr inbounds i64, i64* %P, i64 3
+  store i64 %v, i64* %arrayidx
+  %arrayidx1 = getelementptr inbounds i64, i64* %P, i64 2
+  store i64 %v, i64* %arrayidx1
+  %arrayidx2 = getelementptr inbounds i64, i64* %P, i64 1
+  store i64 %v, i64* %arrayidx2
+  %arrayidx3 = getelementptr inbounds i64, i64* %P, i64 4
+  store i64 %v, i64* %arrayidx3
+  ret i64 %v
+}
+
+define i32 @store_i32(i32* nocapture %P, i32 %v) {
+entry:
+; CHECK: ********** MI Scheduling **********
+; CHECK-LABEL: store_i32:%bb.0
+; CHECK: Cluster ld/st SU([[SU3:[0-9]+]]) - SU([[SU4:[0-9]+]])
+; CHECK: Cluster ld/st SU([[SU2:[0-9]+]]) - SU([[SU5:[0-9]+]])
+; CHECK: SU([[SU2]]): STW %[[REG:[0-9]+]].sub_32:g8rc, 52
+; CHECK: SU([[SU3]]): STW %[[REG]].sub_32:g8rc, 48
+; CHECK: SU([[SU4]]): STW %[[REG]].sub_32:g8rc, 44
+; CHECK: SU([[SU5]]): STW %[[REG]].sub_32:g8rc, 56
+; CHECK: ********** MI Scheduling **********
+; CHECK-LABEL: store_i32:%bb.0
+; CHECK: Cluster ld/st SU([[SU0:[0-9]+]]) - SU([[SU1:[0-9]+]])
+; CHECK: Cluster ld/st SU([[SU2:[0-9]+]]) - SU([[SU3:[0-9]+]])
+; CHECK: SU([[SU0]]): STW renamable $r[[REG:[0-9]+]], 48
+; CHECK: SU([[SU1]]): STW renamable $r[[REG]], 44
+; CHECK: SU([[SU2]]): STW renamable $r[[REG]], 52
+; CHECK: SU([[SU3]]): STW renamable $r[[REG]], 56
+  %arrayidx = getelementptr inbounds i32, i32* %P, i32 13
+  store i32 %v, i32* %arrayidx
+  %arrayidx1 = getelementptr inbounds i32, i32* %P, i32 12
+  store i32 %v, i32* %arrayidx1
+  %arrayidx2 = getelementptr inbounds i32, i32* %P, i32 11
+  store i32 %v, i32* %arrayidx2
+  %arrayidx3 = getelementptr inbounds i32, i32* %P, i32 14
+  store i32 %v, i32* %arrayidx3
+  ret i32 %v
+}
+
+define void @store_i64_neg(i64* nocapture %P, i64 %v) #0 {
+entry:
+; CHECK: ********** MI Scheduling **********
+; CHECK-LABEL: store_i64_neg:%bb.0
+; CHECK: Cluster ld/st SU([[SU2:[0-9]+]]) - SU([[SU5:[0-9]+]])
+; CHECK: Cluster ld/st SU([[SU3:[0-9]+]]) - SU([[SU4:[0-9]+]])
+; CHECK: SU([[SU2]]): STD %[[REG:[0-9]+]]:g8rc, -24
+; CHECK: SU([[SU3]]): STD %[[REG]]:g8rc, -8
+; CHECK: SU([[SU4]]): STD %[[REG]]:g8rc, -16
+; CHECK: SU([[SU5]]): STD %[[REG]]:g8rc, -32
+; CHECK: ********** MI Scheduling **********
+; CHECK-LABEL: store_i64_neg:%bb.0
+; CHECK: Cluster ld/st SU([[SU2:[0-9]+]]) - SU([[SU3:[0-9]+]])
+; CHECK: Cluster ld/st SU([[SU0:[0-9]+]]) - SU([[SU1:[0-9]+]])
+; CHECK: SU([[SU0]]): STD renamable $x[[REG:[0-9]+]], -8
+; CHECK: SU([[SU1]]): STD renamable $x[[REG]], -16
+; CHECK: SU([[SU2]]): STD renamable $x[[REG]], -24
+; CHECK: SU([[SU3]]): STD renamable $x[[REG]], -32
+  %arrayidx = getelementptr inbounds i64, i64* %P, i64 -3
+  store i64 %v, i64* %arrayidx
+  %arrayidx1 = getelementptr inbounds i64, i64* %P, i64 -1
+  store i64 %v, i64* %arrayidx1
+  %arrayidx2 = getelementptr inbounds i64, i64* %P, i64 -2
+  store i64 %v, i64* %arrayidx2
+  %arrayidx3 = getelementptr inbounds i64, i64* %P, i64 -4
+  store i64 %v, i64* %arrayidx3
+  ret void
+}
+
+define void @store_i32_neg(i32* nocapture %P, i32 %v) #0 {
+entry:
+; CHECK: ********** MI Scheduling **********
+; CHECK-LABEL: store_i32_neg:%bb.0
+; CHECK: Cluster ld/st SU([[SU2:[0-9]+]]) - SU([[SU5:[0-9]+]])
+; CHECK: Cluster ld/st SU([[SU3:[0-9]+]]) - SU([[SU4:[0-9]+]])
+; CHECK: SU([[SU2]]): STW %[[REG:[0-9]+]].sub_32:g8rc, -12
+; CHECK: SU([[SU3]]): STW %[[REG]].sub_32:g8rc, -4
+; CHECK: SU([[SU4]]): STW %[[REG]].sub_32:g8rc, -8
+; CHECK: SU([[SU5]]): STW %[[REG]].sub_32:g8rc, -16
+; CHECK: ********** MI Scheduling **********
+; CHECK-LABEL: store_i32_neg:%bb.0
+; CHECK: Cluster ld/st SU([[SU2:[0-9]+]]) - SU([[SU3:[0-9]+]])
+; CHECK: Cluster ld/st SU([[SU0:[0-9]+]]) - SU([[SU1:[0-9]+]])
+; CHECK: SU([[SU0]]): STW renamable $r[[REG:[0-9]+]], -4
+; CHECK: SU([[SU1]]): STW renamable $r[[REG]], -8
+; CHECK: SU([[SU2]]): STW renamable $r[[REG]], -12
+; CHECK: SU([[SU3]]): STW renamable $r[[REG]], -16
+  %arrayidx = getelementptr inbounds i32, i32* %P, i32 -3
+  store i32 %v, i32* %arrayidx
+  %arrayidx1 = getelementptr inbounds i32, i32* %P, i32 -1
+  store i32 %v, i32* %arrayidx1
+  %arrayidx2 = getelementptr inbounds i32, i32* %P, i32 -2
+  store i32 %v, i32* %arrayidx2
+  %arrayidx3 = getelementptr inbounds i32, i32* %P, i32 -4
+  store i32 %v, i32* %arrayidx3
+  ret void
+}
+
+define void @store_double(double* nocapture %P, double %v) {
+entry:
+; CHECK: ********** MI Scheduling **********
+; CHECK-LABEL: store_double:%bb.0
+; CHECK: Cluster ld/st SU([[SU3:[0-9]+]]) - SU([[SU4:[0-9]+]])
+; CHECK: Cluster ld/st SU([[SU2:[0-9]+]]) - SU([[SU5:[0-9]+]])
+; CHECK: SU([[SU2]]): DFSTOREf64 %[[REG:[0-9]+]]:vsfrc, 24
+; CHECK: SU([[SU3]]): DFSTOREf64 %[[REG]]:vsfrc, 8
+; CHECK: SU([[SU4]]): DFSTOREf64 %[[REG]]:vsfrc, 16
+; CHECK: SU([[SU5]]): DFSTOREf64 %[[REG]]:vsfrc, 32
+; CHECK: ********** MI Scheduling **********
+; CHECK-LABEL: store_double:%bb.0
+; CHECK: Cluster ld/st SU([[SU0:[0-9]+]]) - SU([[SU1:[0-9]+]])
+; CHECK: Cluster ld/st SU([[SU2:[0-9]+]]) - SU([[SU3:[0-9]+]])
+; CHECK: SU([[SU0]]): STFD renamable $f[[REG:[0-9]+]], 8
+; CHECK: SU([[SU1]]): STFD renamable $f[[REG]], 16
+; CHECK: SU([[SU2]]): STFD renamable $f[[REG]], 24
+; CHECK: SU([[SU3]]): STFD renamable $f[[REG]], 32
+  %arrayidx = getelementptr inbounds double, double* %P, i64 3
+  store double %v, double* %arrayidx
+  %arrayidx1 = getelementptr inbounds double, double* %P, i64 1
+  store double %v, double* %arrayidx1
+  %arrayidx2 = getelementptr inbounds double, double* %P, i64 2
+  store double %v, double* %arrayidx2
+  %arrayidx3 = getelementptr inbounds double, double* %P, i64 4
+  store double %v, double* %arrayidx3
+  ret void
+}
+
+define void @store_float(float* nocapture %P, float %v) {
+entry:
+; CHECK: ********** MI Scheduling **********
+; CHECK-LABEL: store_float:%bb.0
+; CHECK-NOT: Cluster ld/st
+; CHECK-NOT: Cluster ld/st
+; CHECK: SU([[SU2:[0-9]+]]): DFSTOREf32 %[[REG:[0-9]+]]:vssrc, 12
+; CHECK: SU([[SU3:[0-9]+]]): DFSTOREf32 %[[REG]]:vssrc, 4
+; CHECK: SU([[SU4:[0-9]+]]): DFSTOREf32 %[[REG]]:vssrc, 8
+; CHECK: SU([[SU5:[0-9]+]]): DFSTOREf32 %[[REG]]:vssrc, 16
+; CHECK: ********** MI Scheduling **********
+; CHECK-LABEL: store_float:%bb.0
+; CHECK-NOT: Cluster ld/st
+; CHECK-NOT: Cluster ld/st
+; CHECK: SU([[SU0:[0-9]+]]): STFS renamable $f[[REG:[0-9]+]], 12
+; CHECK: SU([[SU1:[0-9]+]]): STFS renamable $f[[REG]], 4
+; CHECK: SU([[SU2:[0-9]+]]): STFS renamable $f[[REG]], 8
+; CHECK: SU([[SU3:[0-9]+]]): STFS renamable $f[[REG]], 16
+  %arrayidx = getelementptr inbounds float, float* %P, i64 3
+  store float %v, float* %arrayidx
+  %arrayidx1 = getelementptr inbounds float, float* %P, i64 1
+  store float %v, float* %arrayidx1
+  %arrayidx2 = getelementptr inbounds float, float* %P, i64 2
+  store float %v, float* %arrayidx2
+  %arrayidx3 = getelementptr inbounds float, float* %P, i64 4
+  store float %v, float* %arrayidx3
+  ret void
+}
+
+; Cannot fuse the stores/loads if they are volatile.
+define i64 @store_volatile(i64* nocapture %P, i64 %v) {
+entry:
+; CHECK: ********** MI Scheduling **********
+; CHECK-LABEL: store_volatile:%bb.0
+; CHECK-NOT: Cluster ld/st
+; CHECK: SU([[SU2:[0-9]+]]): STD %[[REG:[0-9]+]]:g8rc, 24
+; CHECK: SU([[SU3:[0-9]+]]): STD %[[REG]]:g8rc, 16
+; CHECK: SU([[SU4:[0-9]+]]): STD %[[REG]]:g8rc, 8
+; CHECK: SU([[SU5:[0-9]+]]): STD %[[REG]]:g8rc, 32
+; CHECK: ********** MI Scheduling **********
+; CHECK-LABEL: store_volatile:%bb.0
+; CHECK-NOT: Cluster ld/st
+; CHECK: SU([[SU0:[0-9]+]]): STD renamable $x[[REG:[0-9]+]], 24
+; CHECK: SU([[SU1:[0-9]+]]): STD renamable $x[[REG]], 16
+; CHECK: SU([[SU2:[0-9]+]]): STD renamable $x[[REG]], 8
+; CHECK: SU([[SU3:[0-9]+]]): STD renamable $x[[REG]], 32
+  %arrayidx = getelementptr inbounds i64, i64* %P, i64 3
+  store volatile i64 %v, i64* %arrayidx
+  %arrayidx1 = getelementptr inbounds i64, i64* %P, i64 2
+  store volatile i64 %v, i64* %arrayidx1
+  %arrayidx2 = getelementptr inbounds i64, i64* %P, i64 1
+  store volatile i64 %v, i64* %arrayidx2
+  %arrayidx3 = getelementptr inbounds i64, i64* %P, i64 4
+  store volatile i64 %v, i64* %arrayidx3
+  ret i64 %v
+}
+
+@p = common local_unnamed_addr global [100 x i32] zeroinitializer, align 4
+
+define void @store_i32_stw_stw8(i32 signext %m, i32 signext %n) {
+entry:
+; CHECK: ********** MI Scheduling **********
+; CHECK-LABEL: store_i32_stw_stw8:%bb.0
+; CHECK: Cluster ld/st SU([[SU5:[0-9]+]]) - SU([[SU8:[0-9]+]])
+; CHECK: SU([[SU5]]): STW8 %{{[0-9]+}}:g8rc, 24
+; CHECK: SU([[SU8]]): STW %{{[0-9]+}}:gprc, 20
+; CHECK: ********** MI Scheduling **********
+; CHECK-LABEL: store_i32_stw_stw8:%bb.0
+; CHECK: Cluster ld/st SU([[SU5:[0-9]+]]) - SU([[SU6:[0-9]+]])
+; CHECK: SU([[SU5]]): STW8 renamable $x{{[0-9]+}}, 24
+; CHECK: SU([[SU6]]): STW renamable $r{{[0-9]+}}, 20
+  store i32 9, i32* getelementptr inbounds ([100 x i32], [100 x i32]* @p, i64 0, i64 6), align 4
+  store i32 %n, i32* getelementptr inbounds ([100 x i32], [100 x i32]* @p, i64 0, i64 7), align 4
+  %add = add nsw i32 %n, %m
+  store i32 %add, i32* getelementptr inbounds ([100 x i32], [100 x i32]* @p, i64 0, i64 5), align 4
+  ret void
+}
+
+define void @store_i32_stw8(i32 signext %m, i32 signext %n) {
+entry:
+; CHECK: ********** MI Scheduling **********
+; CHECK-LABEL: store_i32_stw8:%bb.0
+; CHECK: Cluster ld/st SU([[SU4:[0-9]+]]) - SU([[SU5:[0-9]+]])
+; CHECK: SU([[SU4]]): STW8 %{{[0-9]+}}:g8rc, 24
+; CHECK: SU([[SU5]]): STW8 %{{[0-9]+}}:g8rc, 28
+; CHECK: ********** MI Scheduling **********
+; CHECK-LABEL: store_i32_stw8:%bb.0
+; CHECK: Cluster ld/st SU([[SU3:[0-9]+]]) - SU([[SU4:[0-9]+]])
+; CHECK: SU([[SU3]]): STW8 renamable $x{{[0-9]+}}, 24
+; CHECK: SU([[SU4]]): STW8 renamable $x{{[0-9]+}}, 28
+  store i32 9, i32* getelementptr inbounds ([100 x i32], [100 x i32]* @p, i64 0, i64 6), align 4
+  store i32 %n, i32* getelementptr inbounds ([100 x i32], [100 x i32]* @p, i64 0, i64 7), align 4
+  ret void
+}
+
+declare void @bar(i64*)
+
+define void @store_frame_index(i32 %a, i32 %b) {
+entry:
+; CHECK: ********** MI Scheduling **********
+; CHECK-LABEL: store_frame_index:%bb.0
+; CHECK: Cluster ld/st SU([[SU2:[0-9]+]]) - SU([[SU3:[0-9]+]])
+; CHECK: SU([[SU2]]): STD %{{[0-9]+}}:g8rc, 0, %stack.0.buf
+; CHECK: SU([[SU3]]): STD %{{[0-9]+}}:g8rc, 8, %stack.0.buf
+  %buf = alloca [8 x i64], align 8
+  %0 = bitcast [8 x i64]* %buf to i8*
+  %conv = zext i32 %a to i64
+  %arrayidx = getelementptr inbounds [8 x i64], [8 x i64]* %buf, i64 0, i64 0
+  store i64 %conv, i64* %arrayidx, align 8
+  %conv1 = zext i32 %b to i64
+  %arrayidx2 = getelementptr inbounds [8 x i64], [8 x i64]* %buf, i64 0, i64 1
+  store i64 %conv1, i64* %arrayidx2, align 8
+  call void @bar(i64* nonnull %arrayidx)
+  ret void
+}
diff --git a/llvm/test/CodeGen/PowerPC/pcrel-call-linkage-leaf.ll b/llvm/test/CodeGen/PowerPC/pcrel-call-linkage-leaf.ll
--- a/llvm/test/CodeGen/PowerPC/pcrel-call-linkage-leaf.ll
+++ b/llvm/test/CodeGen/PowerPC/pcrel-call-linkage-leaf.ll
@@ -104,6 +104,7 @@
 ; CHECK-P9-NOT: .localentry
 ; CHECK-ALL: # %bb.0: # %entry
 ; CHECK-S-NEXT: std r29, -24(r1) # 8-byte Folded Spill
+; CHECK-S-NEXT: std r30, -16(r1) # 8-byte Folded Spill
 ; CHECK-S-NEXT: add r11, r4, r3
 ; CHECK-S-NEXT: sub r29, r8, r9
 ; CHECK-S-NEXT: add r9, r10, r9
@@ -119,7 +120,6 @@
 ; CHECK-S-NEXT: mullw r3, r3, r7
 ; CHECK-S-NEXT: sub r2, r6, r7
 ; CHECK-S-NEXT: mullw r3, r3, r8
-; CHECK-S-NEXT: std r30, -16(r1) # 8-byte Folded Spill
 ; CHECK-S-NEXT: add r30, r8, r7
 ; CHECK-S-NEXT: mullw r3, r3, r2
 ; CHECK-S-NEXT: mullw r3, r3, r30