diff --git a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp
--- a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp
@@ -2241,8 +2241,15 @@
       // Use !IsLiveIn for the kill flag.
       // We do not want to kill registers that are live in this function
      // before their use because they will become undefined registers.
-      TII.storeRegToStackSlot(MBB, MI, Reg, !IsLiveIn,
-                              CSI[i].getFrameIdx(), RC, TRI);
+      // Functions without NoUnwind need to preserve the order of elements in
+      // saved vector registers.
+      if (Subtarget.needsSwapsForVSXMemOps() &&
+          !MF->getFunction().hasFnAttribute(Attribute::NoUnwind))
+        TII.storeRegToStackSlotNoUpd(MBB, MI, Reg, !IsLiveIn,
+                                     CSI[i].getFrameIdx(), RC, TRI);
+      else
+        TII.storeRegToStackSlot(MBB, MI, Reg, !IsLiveIn, CSI[i].getFrameIdx(),
+                                RC, TRI);
     }
   }
 }
@@ -2394,7 +2401,16 @@
       } else {
         // Default behavior for non-CR saves.
         const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
-        TII.loadRegFromStackSlot(MBB, I, Reg, CSI[i].getFrameIdx(), RC, TRI);
+
+        // Functions without NoUnwind need to preserve the order of elements in
+        // saved vector registers.
+        if (Subtarget.needsSwapsForVSXMemOps() &&
+            !MF->getFunction().hasFnAttribute(Attribute::NoUnwind))
+          TII.loadRegFromStackSlotNoUpd(MBB, I, Reg, CSI[i].getFrameIdx(), RC,
+                                        TRI);
+        else
+          TII.loadRegFromStackSlot(MBB, I, Reg, CSI[i].getFrameIdx(), RC, TRI);
+
         assert(I != MBB.begin() &&
                "loadRegFromStackSlot didn't insert any code!");
       }
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.h b/llvm/lib/Target/PowerPC/PPCInstrInfo.h
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.h
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.h
@@ -296,12 +296,30 @@
                            const TargetRegisterClass *RC,
                            const TargetRegisterInfo *TRI) const override;

+  // Emits a register spill without updating the register class for vector
+  // registers. This ensures that when we spill a vector register the
+  // element order in the register is the same as it was in memory.
+  void storeRegToStackSlotNoUpd(MachineBasicBlock &MBB,
+                                MachineBasicBlock::iterator MBBI,
+                                unsigned SrcReg, bool isKill, int FrameIndex,
+                                const TargetRegisterClass *RC,
+                                const TargetRegisterInfo *TRI) const;
+
   void loadRegFromStackSlot(MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator MBBI, Register DestReg,
                             int FrameIndex, const TargetRegisterClass *RC,
                             const TargetRegisterInfo *TRI) const override;

+  // Emits a register reload without updating the register class for vector
+  // registers. This ensures that when we reload a vector register the
+  // element order in the register is the same as it was in memory.
+  void loadRegFromStackSlotNoUpd(MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator MBBI,
+                                 unsigned DestReg, int FrameIndex,
+                                 const TargetRegisterClass *RC,
+                                 const TargetRegisterInfo *TRI) const;
+
   unsigned getStoreOpcodeForSpill(unsigned Reg,
                                   const TargetRegisterClass *RC = nullptr) const;
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
@@ -1222,24 +1222,13 @@
   FuncInfo->setHasNonRISpills();
 }

-void PPCInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
-                                       MachineBasicBlock::iterator MI,
-                                       Register SrcReg, bool isKill,
-                                       int FrameIdx,
-                                       const TargetRegisterClass *RC,
-                                       const TargetRegisterInfo *TRI) const {
+void PPCInstrInfo::storeRegToStackSlotNoUpd(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned SrcReg,
+    bool isKill, int FrameIdx, const TargetRegisterClass *RC,
+    const TargetRegisterInfo *TRI) const {
   MachineFunction &MF = *MBB.getParent();
   SmallVector<MachineInstr *, 4> NewMIs;

-  // We need to avoid a situation in which the value from a VRRC register is
-  // spilled using an Altivec instruction and reloaded into a VSRC register
-  // using a VSX instruction. The issue with this is that the VSX
-  // load/store instructions swap the doublewords in the vector and the Altivec
-  // ones don't. The register classes on the spill/reload may be different if
-  // the register is defined using an Altivec instruction and is then used by a
-  // VSX instruction.
-  RC = updatedRC(RC);
-
   StoreRegToStackSlot(MF, SrcReg, isKill, FrameIdx, RC, NewMIs);

   for (unsigned i = 0, e = NewMIs.size(); i != e; ++i)
@@ -1253,6 +1242,23 @@
   NewMIs.back()->addMemOperand(MF, MMO);
 }

+void PPCInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
+                                       MachineBasicBlock::iterator MI,
+                                       Register SrcReg, bool isKill,
+                                       int FrameIdx,
+                                       const TargetRegisterClass *RC,
+                                       const TargetRegisterInfo *TRI) const {
+  // We need to avoid a situation in which the value from a VRRC register is
+  // spilled using an Altivec instruction and reloaded into a VSRC register
+  // using a VSX instruction. The issue with this is that the VSX
+  // load/store instructions swap the doublewords in the vector and the Altivec
+  // ones don't. The register classes on the spill/reload may be different if
+  // the register is defined using an Altivec instruction and is then used by a
+  // VSX instruction.
+  RC = updatedRC(RC);
+  storeRegToStackSlotNoUpd(MBB, MI, SrcReg, isKill, FrameIdx, RC, TRI);
+}
+
 void PPCInstrInfo::LoadRegFromStackSlot(MachineFunction &MF, const DebugLoc &DL,
                                         unsigned DestReg, int FrameIdx,
                                         const TargetRegisterClass *RC,
@@ -1274,12 +1280,10 @@
   FuncInfo->setHasNonRISpills();
 }

-void
-PPCInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
-                                   MachineBasicBlock::iterator MI,
-                                   Register DestReg, int FrameIdx,
-                                   const TargetRegisterClass *RC,
-                                   const TargetRegisterInfo *TRI) const {
+void PPCInstrInfo::loadRegFromStackSlotNoUpd(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned DestReg,
+    int FrameIdx, const TargetRegisterClass *RC,
+    const TargetRegisterInfo *TRI) const {
   MachineFunction &MF = *MBB.getParent();
   SmallVector<MachineInstr *, 4> NewMIs;
   DebugLoc DL;
@@ -1288,16 +1292,6 @@
   PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
   FuncInfo->setHasSpills();

-  // We need to avoid a situation in which the value from a VRRC register is
-  // spilled using an Altivec instruction and reloaded into a VSRC register
-  // using a VSX instruction. The issue with this is that the VSX
-  // load/store instructions swap the doublewords in the vector and the Altivec
-  // ones don't. The register classes on the spill/reload may be different if
-  // the register is defined using an Altivec instruction and is then used by a
-  // VSX instruction.
-  if (Subtarget.hasVSX() && RC == &PPC::VRRCRegClass)
-    RC = &PPC::VSRCRegClass;
-
   LoadRegFromStackSlot(MF, DL, DestReg, FrameIdx, RC, NewMIs);

   for (unsigned i = 0, e = NewMIs.size(); i != e; ++i)
@@ -1311,6 +1305,23 @@
   NewMIs.back()->addMemOperand(MF, MMO);
 }

+void PPCInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
+                                        MachineBasicBlock::iterator MI,
+                                        Register DestReg, int FrameIdx,
+                                        const TargetRegisterClass *RC,
+                                        const TargetRegisterInfo *TRI) const {
+  // We need to avoid a situation in which the value from a VRRC register is
+  // spilled using an Altivec instruction and reloaded into a VSRC register
+  // using a VSX instruction. The issue with this is that the VSX
+  // load/store instructions swap the doublewords in the vector and the Altivec
+  // ones don't. The register classes on the spill/reload may be different if
+  // the register is defined using an Altivec instruction and is then used by a
+  // VSX instruction.
+  RC = updatedRC(RC);
+
+  loadRegFromStackSlotNoUpd(MBB, MI, DestReg, FrameIdx, RC, TRI);
+}
+
 bool PPCInstrInfo::
 reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
   assert(Cond.size() == 2 && "Invalid PPC branch opcode!");
diff --git a/llvm/test/CodeGen/PowerPC/CSR-fit.ll b/llvm/test/CodeGen/PowerPC/CSR-fit.ll
--- a/llvm/test/CodeGen/PowerPC/CSR-fit.ll
+++ b/llvm/test/CodeGen/PowerPC/CSR-fit.ll
@@ -126,9 +126,9 @@
 ; CHECK-PWR8-NEXT:    .cfi_offset v20, -192
 ; CHECK-PWR8-NEXT:    .cfi_offset v21, -176
 ; CHECK-PWR8-NEXT:    li r5, 48
-; CHECK-PWR8-NEXT:    stxvd2x v20, r1, r5 # 16-byte Folded Spill
+; CHECK-PWR8-NEXT:    stvx v20, r1, r5 # 16-byte Folded Spill
 ; CHECK-PWR8-NEXT:    li r5, 64
-; CHECK-PWR8-NEXT:    stxvd2x v21, r1, r5 # 16-byte Folded Spill
+; CHECK-PWR8-NEXT:    stvx v21, r1, r5 # 16-byte Folded Spill
 ; CHECK-PWR8-NEXT:    #APP
 ; CHECK-PWR8-NEXT:    add r3, r3, r4
 ; CHECK-PWR8-NEXT:    #NO_APP
@@ -136,9 +136,9 @@
 ; CHECK-PWR8-NEXT:    bl callee
 ; CHECK-PWR8-NEXT:    nop
 ; CHECK-PWR8-NEXT:    li r4, 64
-; CHECK-PWR8-NEXT:    lxvd2x v21, r1, r4 # 16-byte Folded Reload
+; CHECK-PWR8-NEXT:    lvx v21, r1, r4 # 16-byte Folded Reload
 ; CHECK-PWR8-NEXT:    li r4, 48
-; CHECK-PWR8-NEXT:    lxvd2x v20, r1, r4 # 16-byte Folded Reload
+; CHECK-PWR8-NEXT:    lvx v20, r1, r4 # 16-byte Folded Reload
 ; CHECK-PWR8-NEXT:    addi r1, r1, 240
 ; CHECK-PWR8-NEXT:    ld r0, 16(r1)
 ; CHECK-PWR8-NEXT:    mtlr r0
@@ -184,9 +184,9 @@
 ; CHECK-PWR8-NEXT:    .cfi_offset v20, -192
 ; CHECK-PWR8-NEXT:    .cfi_offset v21, -176
 ; CHECK-PWR8-NEXT:    li r5, 48
-; CHECK-PWR8-NEXT:    stxvd2x v20, r1, r5 # 16-byte Folded Spill
+; CHECK-PWR8-NEXT:    stvx v20, r1, r5 # 16-byte Folded Spill
 ; CHECK-PWR8-NEXT:    li r5, 64
-; CHECK-PWR8-NEXT:    stxvd2x v21, r1, r5 # 16-byte Folded Spill
+; CHECK-PWR8-NEXT:    stvx v21, r1, r5 # 16-byte Folded Spill
 ; CHECK-PWR8-NEXT:    #APP
 ; CHECK-PWR8-NEXT:    add r3, r3, r4
 ; CHECK-PWR8-NEXT:    #NO_APP
@@ -194,9 +194,9 @@
 ; CHECK-PWR8-NEXT:    bl callee
 ; CHECK-PWR8-NEXT:    nop
 ; CHECK-PWR8-NEXT:    li r4, 64
-; CHECK-PWR8-NEXT:    lxvd2x v21, r1, r4 # 16-byte Folded Reload
+; CHECK-PWR8-NEXT:    lvx v21, r1, r4 # 16-byte Folded Reload
 ; CHECK-PWR8-NEXT:    li r4, 48
-; CHECK-PWR8-NEXT:    lxvd2x v20, r1, r4 # 16-byte Folded Reload
+; CHECK-PWR8-NEXT:    lvx v20, r1, r4 # 16-byte Folded Reload
 ; CHECK-PWR8-NEXT:    addi r1, r1, 240
 ; CHECK-PWR8-NEXT:    ld r0, 16(r1)
 ; CHECK-PWR8-NEXT:    mtlr r0
@@ -246,9 +246,9 @@
 ; CHECK-PWR8-NEXT:    li r5, 48
 ; CHECK-PWR8-NEXT:    std r14, 240(r1) # 8-byte Folded Spill
 ; CHECK-PWR8-NEXT:    stfd f14, 384(r1) # 8-byte Folded Spill
-; CHECK-PWR8-NEXT:    stxvd2x v20, r1, r5 # 16-byte Folded Spill
+; CHECK-PWR8-NEXT:    stvx v20, r1, r5 # 16-byte Folded Spill
 ; CHECK-PWR8-NEXT:    li r5, 64
-; CHECK-PWR8-NEXT:    stxvd2x v21, r1, r5 # 16-byte Folded Spill
+; CHECK-PWR8-NEXT:    stvx v21, r1, r5 # 16-byte Folded Spill
 ; CHECK-PWR8-NEXT:    #APP
 ; CHECK-PWR8-NEXT:    add r3, r3, r4
 ; CHECK-PWR8-NEXT:    #NO_APP
@@ -258,9 +258,9 @@
 ; CHECK-PWR8-NEXT:    li r4, 64
 ; CHECK-PWR8-NEXT:    lfd f14, 384(r1) # 8-byte Folded Reload
 ; CHECK-PWR8-NEXT:    ld r14, 240(r1) # 8-byte Folded Reload
-; CHECK-PWR8-NEXT:    lxvd2x v21, r1, r4 # 16-byte Folded Reload
+; CHECK-PWR8-NEXT:    lvx v21, r1, r4 # 16-byte Folded Reload
 ; CHECK-PWR8-NEXT:    li r4, 48
-; CHECK-PWR8-NEXT:    lxvd2x v20, r1, r4 # 16-byte Folded Reload
+; CHECK-PWR8-NEXT:    lvx v20, r1, r4 # 16-byte Folded Reload
 ; CHECK-PWR8-NEXT:    addi r1, r1, 528
 ; CHECK-PWR8-NEXT:    ld r0, 16(r1)
 ; CHECK-PWR8-NEXT:    mtlr r0
diff --git a/llvm/test/CodeGen/PowerPC/reg-scavenging.ll b/llvm/test/CodeGen/PowerPC/reg-scavenging.ll
--- a/llvm/test/CodeGen/PowerPC/reg-scavenging.ll
+++ b/llvm/test/CodeGen/PowerPC/reg-scavenging.ll
@@ -12,7 +12,7 @@
 ; CHECK-NEXT:    .cfi_offset lr, 16
 ; CHECK-NEXT:    .cfi_offset v20, -192
 ; CHECK-NEXT:    li r5, 48
-; CHECK-NEXT:    stxvd2x v20, r1, r5 # 16-byte Folded Spill
+; CHECK-NEXT:    stvx v20, r1, r5 # 16-byte Folded Spill
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    add r3, r3, r4
 ; CHECK-NEXT:    #NO_APP
@@ -20,7 +20,7 @@
 ; CHECK-NEXT:    bl callee
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    li r4, 48
-; CHECK-NEXT:    lxvd2x v20, r1, r4 # 16-byte Folded Reload
+; CHECK-NEXT:    lvx v20, r1, r4 # 16-byte Folded Reload
 ; CHECK-NEXT:    addi r1, r1, 240
 ; CHECK-NEXT:    ld r0, 16(r1)
 ; CHECK-NEXT:    mtlr r0
diff --git a/llvm/test/CodeGen/PowerPC/vsxD-Form-spills.ll b/llvm/test/CodeGen/PowerPC/vsxD-Form-spills.ll
--- a/llvm/test/CodeGen/PowerPC/vsxD-Form-spills.ll
+++ b/llvm/test/CodeGen/PowerPC/vsxD-Form-spills.ll
@@ -8,19 +8,19 @@
 ; CHECK-DAG: li [[REG64:[0-9]+]], 64
 ; CHECK-DAG: li [[REG80:[0-9]+]], 80
 ; CHECK-DAG: li [[REG96:[0-9]+]], 96
-; CHECK-DAG: stxvd2x 60, 1, [[REG48]] # 16-byte Folded Spill
-; CHECK-DAG: stxvd2x 61, 1, [[REG64]] # 16-byte Folded Spill
-; CHECK-DAG: stxvd2x 62, 1, [[REG80]] # 16-byte Folded Spill
-; CHECK-DAG: stxvd2x 63, 1, [[REG96]] # 16-byte Folded Spill
+; CHECK-DAG: stvx 28, 1, [[REG48]] # 16-byte Folded Spill
+; CHECK-DAG: stvx 29, 1, [[REG64]] # 16-byte Folded Spill
+; CHECK-DAG: stvx 30, 1, [[REG80]] # 16-byte Folded Spill
+; CHECK-DAG: stvx 31, 1, [[REG96]] # 16-byte Folded Spill
 ; CHECK: .LBB0_3
 ; CHECK-DAG: li [[REG96_LD:[0-9]+]], 96
 ; CHECK-DAG: li [[REG80_LD:[0-9]+]], 80
 ; CHECK-DAG: li [[REG64_LD:[0-9]+]], 64
 ; CHECK-DAG: li [[REG48_LD:[0-9]+]], 48
-; CHECK-DAG: lxvd2x 63, 1, [[REG96_LD]] # 16-byte Folded Reload
-; CHECK-DAG: lxvd2x 62, 1, [[REG80_LD]] # 16-byte Folded Reload
-; CHECK-DAG: lxvd2x 61, 1, [[REG64_LD]] # 16-byte Folded Reload
-; CHECK-DAG: lxvd2x 60, 1, [[REG48_LD]] # 16-byte Folded Reload
+; CHECK-DAG: lvx 31, 1, [[REG96_LD]] # 16-byte Folded Reload
+; CHECK-DAG: lvx 30, 1, [[REG80_LD]] # 16-byte Folded Reload
+; CHECK-DAG: lvx 29, 1, [[REG64_LD]] # 16-byte Folded Reload
+; CHECK-DAG: lvx 28, 1, [[REG48_LD]] # 16-byte Folded Reload
 ; CHECK: mtlr 0
 ; CHECK-NEXT: blr
 ;
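
Note: on subtargets where VSX memory accesses swap doublewords (what Subtarget.needsSwapsForVSXMemOps() reports), a folded stxvd2x spill leaves the callee-saved vector's stack slot with its two doublewords swapped relative to ABI element order. The unwinder restores callee-saved registers by copying the slot bytes at the CFI-described offset verbatim, so in a function that may unwind it would reload a swapped value into v20-v31. Spilling with stvx keeps the slot in element order; a function carrying the nounwind attribute never has its save slots read by the unwinder and keeps the VSX form. A minimal reduced test in the spirit of CSR-fit.ll is sketched below; the function names and RUN line are illustrative, not part of this patch:

; RUN: llc -verify-machineinstrs -mcpu=pwr8 -ppc-asm-full-reg-names \
; RUN:   -mtriple=powerpc64le-unknown-unknown < %s | FileCheck %s

declare void @callee()

; v20 is callee-saved; clobbering it across a call forces a CSR save/restore.
; Without nounwind, the save slot must stay in ABI element order.
; CHECK-LABEL: may_unwind:
; CHECK: stvx v20
; CHECK: lvx v20
define void @may_unwind() {
entry:
  call void asm sideeffect "nop", "~{v20}"()
  call void @callee()
  ret void
}

; With nounwind the unwinder never reads the slot, so the doubleword-swapped
; VSX spill remains acceptable.
; CHECK-LABEL: no_unwind:
; CHECK: stxvd2x v20
; CHECK: lxvd2x v20
define void @no_unwind() nounwind {
entry:
  call void asm sideeffect "nop", "~{v20}"()
  call void @callee()
  ret void
}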
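
Design note: rather than threading an extra flag through storeRegToStackSlot/loadRegFromStackSlot, the patch factors out ...NoUpd entry points that skip updatedRC(), so a VRRC spill keeps its Altivec register class and therefore an element-order-preserving Altivec opcode. Frame lowering opts into the NoUpd path only for callee-saved spills in functions that may unwind; ordinary register-allocator spills still go through the updating entry points, presumably because those slots are only ever written and read by the same compiled code, which agrees on the swapped layout.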