Index: lib/Target/X86/X86InstrAVX512.td
===================================================================
--- lib/Target/X86/X86InstrAVX512.td
+++ lib/Target/X86/X86InstrAVX512.td
@@ -2800,6 +2800,32 @@
                  avx512_store_vl<0x7F, "vmovdqu64", avx512vl_i64_info, HasAVX512>,
                  XS, VEX_W, EVEX_CD8<64, CD8VF>;
 
+// Special instructions to help with spilling when we don't have VLX. We need
+// to load or store from a ZMM register instead. These are converted in
+// expandPostRAPseudos.
+let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
+    isPseudo = 1, SchedRW = [WriteLoad], mayLoad = 1, hasSideEffects = 0 in {
+def VMOVAPSZ128rm_NOVLX : I<0, Pseudo, (outs VR128X:$dst), (ins f128mem:$src),
+                            "", []>;
+def VMOVAPSZ256rm_NOVLX : I<0, Pseudo, (outs VR256X:$dst), (ins f256mem:$src),
+                            "", []>;
+def VMOVUPSZ128rm_NOVLX : I<0, Pseudo, (outs VR128X:$dst), (ins f128mem:$src),
+                            "", []>;
+def VMOVUPSZ256rm_NOVLX : I<0, Pseudo, (outs VR256X:$dst), (ins f256mem:$src),
+                            "", []>;
+}
+
+let isPseudo = 1, mayStore = 1, hasSideEffects = 0 in {
+def VMOVAPSZ128mr_NOVLX : I<0, Pseudo, (outs f128mem:$dst), (ins VR128X:$src),
+                            "", []>;
+def VMOVAPSZ256mr_NOVLX : I<0, Pseudo, (outs f256mem:$dst), (ins VR256X:$src),
+                            "", []>;
+def VMOVUPSZ128mr_NOVLX : I<0, Pseudo, (outs f128mem:$dst), (ins VR128X:$src),
+                            "", []>;
+def VMOVUPSZ256mr_NOVLX : I<0, Pseudo, (outs f256mem:$dst), (ins VR256X:$src),
+                            "", []>;
+}
+
 def : Pat<(v8i64 (vselect VK8WM:$mask, (bc_v8i64 (v16i32 immAllZerosV)),
                           (v8i64 VR512:$src))),
    (VMOVDQA64Zrrkz (COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK8:$mask, VK16)),
Index: lib/Target/X86/X86InstrInfo.cpp
===================================================================
--- lib/Target/X86/X86InstrInfo.cpp
+++ lib/Target/X86/X86InstrInfo.cpp
@@ -2645,6 +2645,8 @@
   case X86::VMOVAPDZrm:
   case X86::VMOVAPSZ128rm:
   case X86::VMOVAPSZ256rm:
+  case X86::VMOVAPSZ128rm_NOVLX:
+  case X86::VMOVAPSZ256rm_NOVLX:
   case X86::VMOVAPSZrm:
   case X86::VMOVDQA32Z128rm:
   case X86::VMOVDQA32Z256rm:
@@ -2666,6 +2668,8 @@
   case X86::VMOVDQU8Zrm:
   case X86::VMOVUPSZ128rm:
   case X86::VMOVUPSZ256rm:
+  case X86::VMOVUPSZ128rm_NOVLX:
+  case X86::VMOVUPSZ256rm_NOVLX:
   case X86::VMOVUPSZrm: {
     // Loads from constant pools are trivially rematerializable.
     if (MI.getOperand(1 + X86::AddrBaseReg).isReg() &&
@@ -5013,24 +5017,44 @@
     // If stack is realigned we can use aligned stores.
     if (isStackAligned)
       return load ?
-        (HasVLX ? X86::VMOVAPSZ128rm : HasAVX ? X86::VMOVAPSrm : X86::MOVAPSrm):
-        (HasVLX ? X86::VMOVAPSZ128mr : HasAVX ? X86::VMOVAPSmr : X86::MOVAPSmr);
+        (HasVLX ? X86::VMOVAPSZ128rm :
+         HasAVX512 ? X86::VMOVAPSZ128rm_NOVLX :
+         HasAVX ? X86::VMOVAPSrm :
+         X86::MOVAPSrm):
+        (HasVLX ? X86::VMOVAPSZ128mr :
+         HasAVX512 ? X86::VMOVAPSZ128mr_NOVLX :
+         HasAVX ? X86::VMOVAPSmr :
+         X86::MOVAPSmr);
     else
       return load ?
-        (HasVLX ? X86::VMOVUPSZ128rm : HasAVX ? X86::VMOVUPSrm : X86::MOVUPSrm):
-        (HasVLX ? X86::VMOVUPSZ128mr : HasAVX ? X86::VMOVUPSmr : X86::MOVUPSmr);
+        (HasVLX ? X86::VMOVUPSZ128rm :
+         HasAVX512 ? X86::VMOVUPSZ128rm_NOVLX :
+         HasAVX ? X86::VMOVUPSrm :
+         X86::MOVUPSrm):
+        (HasVLX ? X86::VMOVUPSZ128mr :
+         HasAVX512 ? X86::VMOVUPSZ128mr_NOVLX :
+         HasAVX ? X86::VMOVUPSmr :
+         X86::MOVUPSmr);
   }
   case 32:
     assert(X86::VR256XRegClass.hasSubClassEq(RC) && "Unknown 32-byte regclass");
     // If stack is realigned we can use aligned stores.
     if (isStackAligned)
       return load ?
-        (HasVLX ? X86::VMOVAPSZ256rm : X86::VMOVAPSYrm) :
-        (HasVLX ? X86::VMOVAPSZ256mr : X86::VMOVAPSYmr);
+        (HasVLX ? X86::VMOVAPSZ256rm :
+         HasAVX512 ? X86::VMOVAPSZ256rm_NOVLX :
+         X86::VMOVAPSYrm) :
+        (HasVLX ? X86::VMOVAPSZ256mr :
+         HasAVX512 ? X86::VMOVAPSZ256mr_NOVLX :
+         X86::VMOVAPSYmr);
     else
       return load ?
-        (HasVLX ? X86::VMOVUPSZ256rm : X86::VMOVUPSYrm) :
-        (HasVLX ? X86::VMOVUPSZ256mr : X86::VMOVUPSYmr);
+        (HasVLX ? X86::VMOVUPSZ256rm :
+         HasAVX512 ? X86::VMOVUPSZ256rm_NOVLX :
+         X86::VMOVUPSYrm) :
+        (HasVLX ? X86::VMOVUPSZ256mr :
+         HasAVX512 ? X86::VMOVUPSZ256mr_NOVLX :
+         X86::VMOVUPSYmr);
   case 64:
     assert(X86::VR512RegClass.hasSubClassEq(RC) && "Unknown 64-byte regclass");
     assert(STI.hasAVX512() && "Using 512-bit register requires AVX512");
@@ -5852,6 +5876,53 @@
   MIB.addReg(Reg, RegState::Kill).addImm(1).addReg(0).addImm(0).addReg(0);
 }
 
+// This is used to handle spills for 128/256-bit registers when we have AVX512,
+// but not VLX. If it uses an extended register, we need to use an instruction
+// that loads the lower 128/256 bits, but is available with only AVX512F.
+static bool expandNOVLXLoad(MachineInstrBuilder &MIB,
+                            const TargetRegisterInfo *TRI,
+                            const MCInstrDesc &LoadDesc,
+                            const MCInstrDesc &BroadcastDesc,
+                            unsigned SubIdx) {
+  unsigned DestReg = MIB->getOperand(0).getReg();
+  // Check if DestReg is XMM16-31 or YMM16-31.
+  if (TRI->getEncodingValue(DestReg) < 16) {
+    // We can use a normal VEX encoded load.
+    MIB->setDesc(LoadDesc);
+  } else {
+    // Use a 128/256-bit VBROADCAST instruction.
+    MIB->setDesc(BroadcastDesc);
+    // Change the destination to a 512-bit register.
+    DestReg = TRI->getMatchingSuperReg(DestReg, SubIdx, &X86::VR512RegClass);
+    MIB->getOperand(0).setReg(DestReg);
+  }
+  return true;
+}
+
+// This is used to handle spills for 128/256-bit registers when we have AVX512,
+// but not VLX. If it uses an extended register, we need to use an instruction
+// that stores the lower 128/256 bits, but is available with only AVX512F.
+static bool expandNOVLXStore(MachineInstrBuilder &MIB,
+                             const TargetRegisterInfo *TRI,
+                             const MCInstrDesc &StoreDesc,
+                             const MCInstrDesc &ExtractDesc,
+                             unsigned SubIdx) {
+  unsigned SrcReg = MIB->getOperand(X86::AddrNumOperands).getReg();
+  // Check if SrcReg is XMM16-31 or YMM16-31.
+  if (TRI->getEncodingValue(SrcReg) < 16) {
+    // We can use a normal VEX encoded store.
+    MIB->setDesc(StoreDesc);
+  } else {
+    // Use a VEXTRACTF instruction.
+    MIB->setDesc(ExtractDesc);
+    // Change the source to a 512-bit register.
+    SrcReg = TRI->getMatchingSuperReg(SrcReg, SubIdx, &X86::VR512RegClass);
+    MIB->getOperand(X86::AddrNumOperands).setReg(SrcReg);
+    MIB.addImm(0x0); // Append immediate to extract from the lower bits.
+  }
+
+  return true;
+}
 bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
   bool HasAVX = Subtarget.hasAVX();
   MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI);
@@ -5899,6 +5970,30 @@
       .addReg(Reg, RegState::Undef).addImm(0xff);
     return true;
   }
+  case X86::VMOVAPSZ128rm_NOVLX:
+    return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVAPSrm),
+                           get(X86::VBROADCASTF32X4rm), X86::sub_xmm);
+  case X86::VMOVUPSZ128rm_NOVLX:
+    return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVUPSrm),
+                           get(X86::VBROADCASTF32X4rm), X86::sub_xmm);
+  case X86::VMOVAPSZ256rm_NOVLX:
+    return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVAPSYrm),
+                           get(X86::VBROADCASTF64X4rm), X86::sub_ymm);
+  case X86::VMOVUPSZ256rm_NOVLX:
+    return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVUPSYrm),
+                           get(X86::VBROADCASTF64X4rm), X86::sub_ymm);
+  case X86::VMOVAPSZ128mr_NOVLX:
+    return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVAPSmr),
+                            get(X86::VEXTRACTF32x4Zmr), X86::sub_xmm);
+  case X86::VMOVUPSZ128mr_NOVLX:
+    return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVUPSmr),
+                            get(X86::VEXTRACTF32x4Zmr), X86::sub_xmm);
+  case X86::VMOVAPSZ256mr_NOVLX:
+    return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVAPSYmr),
+                            get(X86::VEXTRACTF64x4Zmr), X86::sub_ymm);
+  case X86::VMOVUPSZ256mr_NOVLX:
+    return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVUPSYmr),
+                            get(X86::VEXTRACTF64x4Zmr), X86::sub_ymm);
   case X86::TEST8ri_NOREX:
     MI.setDesc(get(X86::TEST8ri));
     return true;
@@ -7086,6 +7181,8 @@
   case X86::VMOVSDZrm:
   case X86::VMOVAPSZ128rm:
   case X86::VMOVUPSZ128rm:
+  case X86::VMOVAPSZ128rm_NOVLX:
+  case X86::VMOVUPSZ128rm_NOVLX:
   case X86::VMOVAPDZ128rm:
   case X86::VMOVUPDZ128rm:
   case X86::VMOVDQU8Z128rm:
@@ -7096,6 +7193,8 @@
   case X86::VMOVDQU64Z128rm:
   case X86::VMOVAPSZ256rm:
   case X86::VMOVUPSZ256rm:
+  case X86::VMOVAPSZ256rm_NOVLX:
+  case X86::VMOVUPSZ256rm_NOVLX:
   case X86::VMOVAPDZ256rm:
   case X86::VMOVUPDZ256rm:
   case X86::VMOVDQU8Z256rm:
@@ -7159,6 +7258,8 @@
   case X86::VMOVSDZrm:
   case X86::VMOVAPSZ128rm:
   case X86::VMOVUPSZ128rm:
+  case X86::VMOVAPSZ128rm_NOVLX:
+  case X86::VMOVUPSZ128rm_NOVLX:
   case X86::VMOVAPDZ128rm:
   case X86::VMOVUPDZ128rm:
   case X86::VMOVDQU8Z128rm:
@@ -7169,6 +7270,8 @@
   case X86::VMOVDQU64Z128rm:
   case X86::VMOVAPSZ256rm:
   case X86::VMOVUPSZ256rm:
+  case X86::VMOVAPSZ256rm_NOVLX:
+  case X86::VMOVUPSZ256rm_NOVLX:
   case X86::VMOVAPDZ256rm:
   case X86::VMOVUPDZ256rm:
   case X86::VMOVDQU8Z256rm:
Index: test/CodeGen/X86/pr29112.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/pr29112.ll
@@ -0,0 +1,45 @@
+; RUN: llc < %s -march=x86-64 -mattr=+avx512f | FileCheck %s
+
+declare <4 x float> @foo(<4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float>)
+
+; In AVX512 without VLX we can't spill XMM16-31 with vmovaps as it's not available. Instead we need to use vextractf32x4 to spill, since it can encode the ZMM super register and can store the lower 128 bits.
+
+define <4 x float> @bar(<4 x float>* %a1p, <4 x float>* %a2p, <4 x float> %a3, <4 x float> %a4, <16 x float>%c1, <16 x float>%c2) {
+; CHECK: vextractf32x4 $0, %zmm16, {{[0-9]+}}(%rsp) ## 16-byte Folded Spill
+  %a1 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32>
+
+  %a2 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32>
+  %a5 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32>
+  %a6 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32>
+  %a7 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32>
+  %a8 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32>
+  %a9 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32>
+  %a10 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32>
+  %ax2 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32>
+  %ax5 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32>
+  %ax6 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32>
+  %ax7 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32>
+  %ax8 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32>
+  %ax9 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32>
+  %ax10 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32>
+  %ay2 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32>
+  %ay5 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32>
+  %ay6 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32>
+  %ay7 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32>
+  %ay8 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32>
+  %ay9 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32>
+  %ay10 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32>
+
+  %r1 = fadd <4 x float> %ay10, %ay9
+  %r2 = fadd <4 x float> %ay8, %ay7
+  %r3 = fadd <4 x float> %ay6, %ay5
+  %r4 = fadd <4 x float> %ay2, %ax10
+  %r5 = fadd <4 x float> %ay9, %ax8
+  %r6 = fadd <4 x float> %r5, %r3
+  %r7 = fadd <4 x float> %a9, %r6
+  %a11 = call <4 x float> @foo(<4 x float> %r7, <4 x float> %a10, <4 x float> %r1, <4 x float> %a4, <4 x float> %a5, <4 x float> %a6, <4 x float> %a7, <4 x float> %a8, <4 x float> %r2, <4 x float> %r4)
+  %a12 = fadd <4 x float> %a2, %a1
+  %a13 = fadd <4 x float> %a12, %a11
+
+  ret <4 x float> %a13
+}
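
Note on the expansion (illustrative, not part of the patch): expandNOVLXLoad/expandNOVLXStore choose between a plain VEX move and a ZMM-based instruction purely from the spilled register's encoding value. The standalone C++ sketch below (hypothetical helper name, opcode names as strings) summarizes where a 128-bit spill ends up under these rules; the 256-bit path has the same shape with VBROADCASTF64X4rm / VEXTRACTF64x4Zmr and sub_ymm.

// Standalone sketch, not LLVM code: models the two-step selection the patch
// performs for 128-bit spills when AVX512F is available but VLX is not.
// getLoadStoreRegOpcode first picks a _NOVLX pseudo, and expandPostRAPseudo
// then resolves it based on the register encoding.
#include <cstdio>

static const char *spill128OpcodeNoVLX(bool IsLoad, bool StackAligned,
                                       unsigned EncodingValue) {
  if (EncodingValue < 16) {
    // XMM0-15 can be reached with a plain VEX-encoded move.
    if (IsLoad)
      return StackAligned ? "VMOVAPSrm" : "VMOVUPSrm";
    return StackAligned ? "VMOVAPSmr" : "VMOVUPSmr";
  }
  // XMM16-31 have no VEX encoding, so go through the containing ZMM register:
  // a subvector broadcast load or a vextractf32x4 store of the low 128 bits.
  // Alignment no longer matters on this path, matching the expansion above.
  return IsLoad ? "VBROADCASTF32X4rm" : "VEXTRACTF32x4Zmr";
}

int main() {
  std::printf("xmm3,  aligned store  -> %s\n", spill128OpcodeNoVLX(false, true, 3));
  std::printf("xmm16, aligned store  -> %s\n", spill128OpcodeNoVLX(false, true, 16));
  std::printf("xmm16, unaligned load -> %s\n", spill128OpcodeNoVLX(true, false, 16));
  return 0;
}

The last two cases are what the pr29112.ll test exercises: a spill of %xmm16 without VLX must come out as a vextractf32x4 of %zmm16 rather than a vmovaps of %xmm16.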