diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -131,6 +131,11 @@
     cl::desc("disable vector permute decomposition"),
     cl::init(true), cl::Hidden);
 
+cl::opt<bool> DisableAutoPairedVecSt(
+    "disable-auto-paired-vec-st",
+    cl::desc("disable automatically generated 32byte paired vector stores"),
+    cl::init(true), cl::Hidden);
+
 STATISTIC(NumTailCalls, "Number of tail calls");
 STATISTIC(NumSiblingCalls, "Number of sibling calls");
 STATISTIC(ShufflesHandledWithVPERM, "Number of shuffles lowered to a VPERM");
diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.h b/llvm/lib/Target/PowerPC/PPCRegisterInfo.h
--- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.h
+++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.h
@@ -130,6 +130,8 @@
   void lowerCRBitRestore(MachineBasicBlock::iterator II,
                          unsigned FrameIndex) const;
 
+  void lowerOctWordSpilling(MachineBasicBlock::iterator II,
+                            unsigned FrameIndex) const;
   void lowerACCSpilling(MachineBasicBlock::iterator II,
                         unsigned FrameIndex) const;
   void lowerACCRestore(MachineBasicBlock::iterator II,
diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
--- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
@@ -91,6 +91,8 @@
                cl::Hidden, cl::init(false));
 #endif
 
+extern cl::opt<bool> DisableAutoPairedVecSt;
+
 static unsigned offsetMinAlignForOpcode(unsigned OpC);
 
 PPCRegisterInfo::PPCRegisterInfo(const PPCTargetMachine &TM)
@@ -1199,6 +1201,57 @@
 #endif
 }
 
+/// Spill the registers of a VSR pair (two consecutive pairs when \p TwoPairs
+/// is set) to \p FrameIndex with one STXV per register rather than STXVP.
+///
+/// \p SrcReg is the first VSRp register to store. Registers are stored in
+/// ascending order at offsets 16 bytes apart: starting at 0 and ascending on
+/// big-endian targets, starting at the highest offset (16 or 48) and
+/// descending on little-endian targets. Every store carries the kill state
+/// given by \p IsKilled.
+static void spillRegPairs(MachineBasicBlock &MBB,
+                          MachineBasicBlock::iterator II, DebugLoc DL,
+                          const TargetInstrInfo &TII, Register SrcReg,
+                          unsigned FrameIndex, bool IsLittleEndian,
+                          bool IsKilled, bool TwoPairs) {
+  // One pair is two vector registers, two pairs are four.
+  const unsigned NumRegs = TwoPairs ? 4 : 2;
+  // Signed on purpose: the little-endian walk moves the offset downwards.
+  const int Step = IsLittleEndian ? -16 : 16;
+  int Offset = IsLittleEndian ? (TwoPairs ? 48 : 16) : 0;
+  // Map the pair number onto the first of its two single registers.
+  Register Reg = (SrcReg > PPC::VSRp15) ? PPC::V0 + (SrcReg - PPC::VSRp16) * 2
+                                        : PPC::VSL0 + (SrcReg - PPC::VSRp0) * 2;
+  for (unsigned I = 0; I != NumRegs; ++I, Offset += Step)
+    addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::STXV))
+                          .addReg(Reg + I, getKillRegState(IsKilled)),
+                      FrameIndex, Offset);
+}
+
+/// Replace the STXVP at \p II, which stores a VSR pair to \p FrameIndex, with
+/// a pair of STXV instructions. Only used while paired vector stores are
+/// disabled via -disable-auto-paired-vec-st (the option defaults to true).
+void PPCRegisterInfo::lowerOctWordSpilling(MachineBasicBlock::iterator II,
+                                           unsigned FrameIndex) const {
+  assert(DisableAutoPairedVecSt &&
+         "Expecting to do this only if paired vector stores are disabled.");
+  MachineInstr &MI = *II; // STXVP <xp>, <offset>
+  MachineBasicBlock &MBB = *MI.getParent();
+  MachineFunction &MF = *MBB.getParent();
+  const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>();
+  const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
+  DebugLoc DL = MI.getDebugLoc();
+  Register SrcReg = MI.getOperand(0).getReg();
+  bool IsLittleEndian = Subtarget.isLittleEndian();
+  bool IsKilled = MI.getOperand(0).isKill();
+  // NOTE(review): only operand 0 of the STXVP is consulted; any displacement
+  // on the original instruction is not carried over -- confirm it is always 0.
+  spillRegPairs(MBB, II, DL, TII, SrcReg, FrameIndex, IsLittleEndian, IsKilled,
+                /* TwoPairs */ false);
+  // Discard the original instruction.
+  MBB.erase(II);
+}
+
 /// lowerACCSpilling - Generate the code for spilling the accumulator register.
 /// Similarly to other spills/reloads that use pseudo-ops, we do not actually
 /// eliminate the FrameIndex here nor compute the stack offset. We simply
@@ -1228,12 +1281,17 @@
   // adjust the offset of the store that is within the 64-byte stack slot.
   if (IsPrimed)
     BuildMI(MBB, II, DL, TII.get(PPC::XXMFACC), SrcReg).addReg(SrcReg);
-  addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::STXVP))
-                        .addReg(Reg, getKillRegState(IsKilled)),
-                    FrameIndex, IsLittleEndian ? 32 : 0);
-  addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::STXVP))
-                        .addReg(Reg + 1, getKillRegState(IsKilled)),
-                    FrameIndex, IsLittleEndian ? 0 : 32);
+  if (DisableAutoPairedVecSt) {
+    spillRegPairs(MBB, II, DL, TII, Reg, FrameIndex, IsLittleEndian, IsKilled,
+                  /* TwoPairs */ true);
+  } else {
+    addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::STXVP))
+                          .addReg(Reg, getKillRegState(IsKilled)),
+                      FrameIndex, IsLittleEndian ? 32 : 0);
+    addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::STXVP))
+                          .addReg(Reg + 1, getKillRegState(IsKilled)),
+                      FrameIndex, IsLittleEndian ? 0 : 32);
+  }
   if (IsPrimed && !IsKilled)
     BuildMI(MBB, II, DL, TII.get(PPC::XXMTACC), SrcReg).addReg(SrcReg);
 
@@ -1469,6 +1527,9 @@
   } else if (OpC == PPC::RESTORE_ACC || OpC == PPC::RESTORE_UACC) {
     lowerACCRestore(II, FrameIndex);
     return;
+  } else if (OpC == PPC::STXVP && DisableAutoPairedVecSt) {
+    lowerOctWordSpilling(II, FrameIndex);
+    return;
   } else if (OpC == PPC::SPILL_QUADWORD) {
     lowerQuadwordSpilling(II, FrameIndex);
     return;
diff --git a/llvm/test/CodeGen/PowerPC/mma-acc-memops.ll b/llvm/test/CodeGen/PowerPC/mma-acc-memops.ll
--- a/llvm/test/CodeGen/PowerPC/mma-acc-memops.ll
+++ b/llvm/test/CodeGen/PowerPC/mma-acc-memops.ll
@@ -1,10 +1,12 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
-; RUN:   -mcpu=pwr10 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
-; RUN:   FileCheck %s --check-prefix=LE-PAIRED
+; RUN:   -mcpu=pwr10 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \
+; RUN:   -disable-auto-paired-vec-st=false < %s | FileCheck %s \
+; RUN:   --check-prefix=LE-PAIRED
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
 ; RUN:   -mcpu=pwr10 -ppc-asm-full-reg-names \
-; RUN:   -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=BE-PAIRED
+; RUN:   -ppc-vsr-nums-as-vr -disable-auto-paired-vec-st=false < %s | \
+; RUN:   FileCheck %s --check-prefix=BE-PAIRED
 ; RUN: llc -verify-machineinstrs -mcpu=pwr9 -ppc-vsr-nums-as-vr \
 ; RUN:   -ppc-asm-full-reg-names -mtriple=powerpc64le-unknown-linux-gnu < %s \
 ; RUN:   | FileCheck %s --check-prefix=LE-PWR9
diff --git a/llvm/test/CodeGen/PowerPC/mma-acc-spill.ll b/llvm/test/CodeGen/PowerPC/mma-acc-spill.ll
--- a/llvm/test/CodeGen/PowerPC/mma-acc-spill.ll
+++ b/llvm/test/CodeGen/PowerPC/mma-acc-spill.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
-; RUN:   -mcpu=pwr10 -ppc-asm-full-reg-names \
+; RUN:   -mcpu=pwr10 -ppc-asm-full-reg-names -disable-auto-paired-vec-st=false \
 ; RUN:   -ppc-vsr-nums-as-vr < %s | FileCheck %s
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
-; RUN:   -mcpu=pwr10 -ppc-asm-full-reg-names \
+; RUN:   -mcpu=pwr10 -ppc-asm-full-reg-names -disable-auto-paired-vec-st=false \
 ; RUN:   -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE
 
 declare <512 x i1> @llvm.ppc.mma.xvf16ger2pp(<512 x i1>, <16 x i8>, <16 x i8>)
diff --git a/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll b/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll
--- a/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll
+++ b/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll
@@ -559,8 +559,10 @@
 ; CHECK-NEXT:    stxv vs3, 0(r3)
 ; CHECK-NEXT:    xvf32gerpp acc1, v2, v2
 ; CHECK-NEXT:    xxmfacc acc1
-; CHECK-NEXT:    stxvp vsp4, 64(r1)
-; CHECK-NEXT:    stxvp vsp6, 32(r1)
+; CHECK-NEXT:    stxv vs4, 80(r1)
+; CHECK-NEXT:    stxv vs5, 64(r1)
+; CHECK-NEXT:    stxv vs6, 48(r1)
+; CHECK-NEXT:    stxv vs7, 32(r1)
 ; CHECK-NEXT:    bl testRedundantPrimeUnprimeF@notoc
 ; CHECK-NEXT:    lxvp vsp0, 64(r1)
 ; CHECK-NEXT:    lxvp vsp2, 32(r1)
@@ -590,8 +592,10 @@
 ; CHECK-BE-NEXT:    stxv vs2, 32(r3)
 ; CHECK-BE-NEXT:    xvf32gerpp acc1, v2, v2
 ; CHECK-BE-NEXT:    xxmfacc acc1
-; CHECK-BE-NEXT:    stxvp vsp4, 112(r1)
-; CHECK-BE-NEXT:    stxvp vsp6, 144(r1)
+; CHECK-BE-NEXT:    stxv vs4, 112(r1)
+; CHECK-BE-NEXT:    stxv vs5, 128(r1)
+; CHECK-BE-NEXT:    stxv vs6, 144(r1)
+; CHECK-BE-NEXT:    stxv vs7, 160(r1)
 ; CHECK-BE-NEXT:    bl testRedundantPrimeUnprimeF
 ; CHECK-BE-NEXT:    nop
 ; CHECK-BE-NEXT:    lxvp vsp0, 112(r1)
diff --git a/llvm/test/CodeGen/PowerPC/spill-vec-pair.ll b/llvm/test/CodeGen/PowerPC/spill-vec-pair.ll
--- a/llvm/test/CodeGen/PowerPC/spill-vec-pair.ll
+++ b/llvm/test/CodeGen/PowerPC/spill-vec-pair.ll
@@ -1,10 +1,11 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O3 \
 ; RUN:   -mcpu=pwr10 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \
-; RUN:   < %s | FileCheck %s
+; RUN:   -disable-auto-paired-vec-st=false < %s | FileCheck %s
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O3 \
 ; RUN:   -mcpu=pwr10 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \
-; RUN:   < %s | FileCheck %s --check-prefix=CHECK-BE
+; RUN:   -disable-auto-paired-vec-st=false < %s | FileCheck %s \
+; RUN:   --check-prefix=CHECK-BE
 define dso_local void @test(<256 x i1>* %vpp, <256 x i1>* %vp2) local_unnamed_addr #0 {
 ; CHECK-LABEL: test:
 ; CHECK:       # %bb.0: # %entry