Page MenuHomePhabricator

D25002.diff
No OneTemporary

File Metadata

Created
Jan 24 2020, 4:23 PM

D25002.diff

Index: lib/Target/X86/X86InstrAVX512.td
===================================================================
--- lib/Target/X86/X86InstrAVX512.td
+++ lib/Target/X86/X86InstrAVX512.td
@@ -2800,6 +2800,32 @@
avx512_store_vl<0x7F, "vmovdqu64", avx512vl_i64_info,
HasAVX512>, XS, VEX_W, EVEX_CD8<64, CD8VF>;
+// Pseudo instructions to help with spilling 128/256-bit registers when we have
+// AVX512 but not VLX. We need to load or store from a ZMM register instead.
+// These are expanded in X86InstrInfo::expandPostRAPseudo.
+let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
+ isPseudo = 1, SchedRW = [WriteLoad], mayLoad = 1, hasSideEffects = 0 in {
+def VMOVAPSZ128rm_NOVLX : I<0, Pseudo, (outs VR128X:$dst), (ins f128mem:$src),
+ "", []>;
+def VMOVAPSZ256rm_NOVLX : I<0, Pseudo, (outs VR256X:$dst), (ins f256mem:$src),
+ "", []>;
+def VMOVUPSZ128rm_NOVLX : I<0, Pseudo, (outs VR128X:$dst), (ins f128mem:$src),
+ "", []>;
+def VMOVUPSZ256rm_NOVLX : I<0, Pseudo, (outs VR256X:$dst), (ins f256mem:$src),
+ "", []>;
+}
+
+// Store pseudos for spilling without VLX. The memory operand is an address the
+// instruction reads, so it is listed under ins; a store defines no registers.
+// The operand order (address first, then $src) is unchanged.
+let isPseudo = 1, SchedRW = [WriteStore], mayStore = 1, hasSideEffects = 0 in {
+def VMOVAPSZ128mr_NOVLX : I<0, Pseudo, (outs), (ins f128mem:$dst, VR128X:$src),
+ "", []>;
+def VMOVAPSZ256mr_NOVLX : I<0, Pseudo, (outs), (ins f256mem:$dst, VR256X:$src),
+ "", []>;
+def VMOVUPSZ128mr_NOVLX : I<0, Pseudo, (outs), (ins f128mem:$dst, VR128X:$src),
+ "", []>;
+def VMOVUPSZ256mr_NOVLX : I<0, Pseudo, (outs), (ins f256mem:$dst, VR256X:$src),
+ "", []>;
+}
+
def : Pat<(v8i64 (vselect VK8WM:$mask, (bc_v8i64 (v16i32 immAllZerosV)),
(v8i64 VR512:$src))),
(VMOVDQA64Zrrkz (COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK8:$mask, VK16)),
Index: lib/Target/X86/X86InstrInfo.cpp
===================================================================
--- lib/Target/X86/X86InstrInfo.cpp
+++ lib/Target/X86/X86InstrInfo.cpp
@@ -2645,6 +2645,8 @@
case X86::VMOVAPDZrm:
case X86::VMOVAPSZ128rm:
case X86::VMOVAPSZ256rm:
+ case X86::VMOVAPSZ128rm_NOVLX:
+ case X86::VMOVAPSZ256rm_NOVLX:
case X86::VMOVAPSZrm:
case X86::VMOVDQA32Z128rm:
case X86::VMOVDQA32Z256rm:
@@ -2666,6 +2668,8 @@
case X86::VMOVDQU8Zrm:
case X86::VMOVUPSZ128rm:
case X86::VMOVUPSZ256rm:
+ case X86::VMOVUPSZ128rm_NOVLX:
+ case X86::VMOVUPSZ256rm_NOVLX:
case X86::VMOVUPSZrm: {
// Loads from constant pools are trivially rematerializable.
if (MI.getOperand(1 + X86::AddrBaseReg).isReg() &&
@@ -5013,24 +5017,44 @@
// If stack is realigned we can use aligned stores.
if (isStackAligned)
return load ?
- (HasVLX ? X86::VMOVAPSZ128rm : HasAVX ? X86::VMOVAPSrm : X86::MOVAPSrm):
- (HasVLX ? X86::VMOVAPSZ128mr : HasAVX ? X86::VMOVAPSmr : X86::MOVAPSmr);
+ (HasVLX ? X86::VMOVAPSZ128rm :
+ HasAVX512 ? X86::VMOVAPSZ128rm_NOVLX :
+ HasAVX ? X86::VMOVAPSrm :
+ X86::MOVAPSrm):
+ (HasVLX ? X86::VMOVAPSZ128mr :
+ HasAVX512 ? X86::VMOVAPSZ128mr_NOVLX :
+ HasAVX ? X86::VMOVAPSmr :
+ X86::MOVAPSmr);
else
return load ?
- (HasVLX ? X86::VMOVUPSZ128rm : HasAVX ? X86::VMOVUPSrm : X86::MOVUPSrm):
- (HasVLX ? X86::VMOVUPSZ128mr : HasAVX ? X86::VMOVUPSmr : X86::MOVUPSmr);
+ (HasVLX ? X86::VMOVUPSZ128rm :
+ HasAVX512 ? X86::VMOVUPSZ128rm_NOVLX :
+ HasAVX ? X86::VMOVUPSrm :
+ X86::MOVUPSrm):
+ (HasVLX ? X86::VMOVUPSZ128mr :
+ HasAVX512 ? X86::VMOVUPSZ128mr_NOVLX :
+ HasAVX ? X86::VMOVUPSmr :
+ X86::MOVUPSmr);
}
case 32:
assert(X86::VR256XRegClass.hasSubClassEq(RC) && "Unknown 32-byte regclass");
// If stack is realigned we can use aligned stores.
if (isStackAligned)
return load ?
- (HasVLX ? X86::VMOVAPSZ256rm : X86::VMOVAPSYrm) :
- (HasVLX ? X86::VMOVAPSZ256mr : X86::VMOVAPSYmr);
+ (HasVLX ? X86::VMOVAPSZ256rm :
+ HasAVX512 ? X86::VMOVAPSZ256rm_NOVLX :
+ X86::VMOVAPSYrm) :
+ (HasVLX ? X86::VMOVAPSZ256mr :
+ HasAVX512 ? X86::VMOVAPSZ256mr_NOVLX :
+ X86::VMOVAPSYmr);
else
return load ?
- (HasVLX ? X86::VMOVUPSZ256rm : X86::VMOVUPSYrm) :
- (HasVLX ? X86::VMOVUPSZ256mr : X86::VMOVUPSYmr);
+ (HasVLX ? X86::VMOVUPSZ256rm :
+ HasAVX512 ? X86::VMOVUPSZ256rm_NOVLX :
+ X86::VMOVUPSYrm) :
+ (HasVLX ? X86::VMOVUPSZ256mr :
+ HasAVX512 ? X86::VMOVUPSZ256mr_NOVLX :
+ X86::VMOVUPSYmr);
case 64:
assert(X86::VR512RegClass.hasSubClassEq(RC) && "Unknown 64-byte regclass");
assert(STI.hasAVX512() && "Using 512-bit register requires AVX512");
@@ -5852,6 +5876,53 @@
MIB.addReg(Reg, RegState::Kill).addImm(1).addReg(0).addImm(0).addReg(0);
}
+// Expands a *_NOVLX load pseudo in place. This is used to handle spills for
+// 128/256-bit registers when we have AVX512, but not VLX. If it uses an
+// extended register we need to use an instruction that loads the lower
+// 128/256-bit, but is available with only AVX512F.
+//   MIB           - builder wrapping the pseudo; operand 0 is the destination.
+//   TRI           - used for register encodings and super-register lookup.
+//   LoadDesc      - descriptor installed when the destination encodes below 16.
+//   BroadcastDesc - descriptor installed otherwise; the destination is widened
+//                   to its 512-bit super-register.
+//   SubIdx        - sub-register index relating the destination to that
+//                   super-register.
+// Always returns true.
+static bool expandNOVLXLoad(MachineInstrBuilder &MIB,
+ const TargetRegisterInfo *TRI,
+ const MCInstrDesc &LoadDesc,
+ const MCInstrDesc &BroadcastDesc,
+ unsigned SubIdx) {
+ unsigned DestReg = MIB->getOperand(0).getReg();
+ // Encodings below 16 are XMM0-15/YMM0-15; 16 and up are XMM16-31/YMM16-31.
+ if (TRI->getEncodingValue(DestReg) < 16) {
+ // We can use a normal VEX encoded load.
+ MIB->setDesc(LoadDesc);
+ } else {
+ // Use a 128/256-bit VBROADCAST instruction.
+ MIB->setDesc(BroadcastDesc);
+ // Change the destination to the 512-bit register that contains DestReg.
+ DestReg = TRI->getMatchingSuperReg(DestReg, SubIdx, &X86::VR512RegClass);
+ MIB->getOperand(0).setReg(DestReg);
+ }
+ return true;
+}
+
+// Expands a *_NOVLX store pseudo in place. This is used to handle spills for
+// 128/256-bit registers when we have AVX512, but not VLX. If it uses an
+// extended register we need to use an instruction that stores the lower
+// 128/256-bit, but is available with only AVX512F.
+//   MIB         - builder wrapping the pseudo; the stored register is read from
+//                 operand index X86::AddrNumOperands.
+//   TRI         - used for register encodings and super-register lookup.
+//   StoreDesc   - descriptor installed when the source encodes below 16.
+//   ExtractDesc - descriptor installed otherwise; the source is widened to its
+//                 512-bit super-register and an immediate 0 is appended.
+//   SubIdx      - sub-register index relating the source to that super-register.
+// Always returns true.
+static bool expandNOVLXStore(MachineInstrBuilder &MIB,
+ const TargetRegisterInfo *TRI,
+ const MCInstrDesc &StoreDesc,
+ const MCInstrDesc &ExtractDesc,
+ unsigned SubIdx) {
+ unsigned SrcReg = MIB->getOperand(X86::AddrNumOperands).getReg();
+ // Encodings below 16 are XMM0-15/YMM0-15; 16 and up are XMM16-31/YMM16-31.
+ if (TRI->getEncodingValue(SrcReg) < 16) {
+ // We can use a normal VEX encoded store.
+ MIB->setDesc(StoreDesc);
+ } else {
+ // Use a VEXTRACTF instruction.
+ MIB->setDesc(ExtractDesc);
+ // Change the source to the 512-bit register that contains SrcReg.
+ SrcReg = TRI->getMatchingSuperReg(SrcReg, SubIdx, &X86::VR512RegClass);
+ MIB->getOperand(X86::AddrNumOperands).setReg(SrcReg);
+ MIB.addImm(0x0); // Append immediate to extract from the lower bits.
+ }

+ return true;
+}
bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
bool HasAVX = Subtarget.hasAVX();
MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI);
@@ -5899,6 +5970,30 @@
.addReg(Reg, RegState::Undef).addImm(0xff);
return true;
}
+ case X86::VMOVAPSZ128rm_NOVLX:
+ return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVAPSrm),
+ get(X86::VBROADCASTF32X4rm), X86::sub_xmm);
+ case X86::VMOVUPSZ128rm_NOVLX:
+ return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVUPSrm),
+ get(X86::VBROADCASTF32X4rm), X86::sub_xmm);
+ case X86::VMOVAPSZ256rm_NOVLX:
+ return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVAPSYrm),
+ get(X86::VBROADCASTF64X4rm), X86::sub_ymm);
+ case X86::VMOVUPSZ256rm_NOVLX:
+ return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVUPSYrm),
+ get(X86::VBROADCASTF64X4rm), X86::sub_ymm);
+ case X86::VMOVAPSZ128mr_NOVLX:
+ return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVAPSmr),
+ get(X86::VEXTRACTF32x4Zmr), X86::sub_xmm);
+ case X86::VMOVUPSZ128mr_NOVLX:
+ return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVUPSmr),
+ get(X86::VEXTRACTF32x4Zmr), X86::sub_xmm);
+ case X86::VMOVAPSZ256mr_NOVLX:
+ return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVAPSYmr),
+ get(X86::VEXTRACTF64x4Zmr), X86::sub_ymm);
+ case X86::VMOVUPSZ256mr_NOVLX:
+ return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVUPSYmr),
+ get(X86::VEXTRACTF64x4Zmr), X86::sub_ymm);
case X86::TEST8ri_NOREX:
MI.setDesc(get(X86::TEST8ri));
return true;
@@ -7086,6 +7181,8 @@
case X86::VMOVSDZrm:
case X86::VMOVAPSZ128rm:
case X86::VMOVUPSZ128rm:
+ case X86::VMOVAPSZ128rm_NOVLX:
+ case X86::VMOVUPSZ128rm_NOVLX:
case X86::VMOVAPDZ128rm:
case X86::VMOVUPDZ128rm:
case X86::VMOVDQU8Z128rm:
@@ -7096,6 +7193,8 @@
case X86::VMOVDQU64Z128rm:
case X86::VMOVAPSZ256rm:
case X86::VMOVUPSZ256rm:
+ case X86::VMOVAPSZ256rm_NOVLX:
+ case X86::VMOVUPSZ256rm_NOVLX:
case X86::VMOVAPDZ256rm:
case X86::VMOVUPDZ256rm:
case X86::VMOVDQU8Z256rm:
@@ -7159,6 +7258,8 @@
case X86::VMOVSDZrm:
case X86::VMOVAPSZ128rm:
case X86::VMOVUPSZ128rm:
+ case X86::VMOVAPSZ128rm_NOVLX:
+ case X86::VMOVUPSZ128rm_NOVLX:
case X86::VMOVAPDZ128rm:
case X86::VMOVUPDZ128rm:
case X86::VMOVDQU8Z128rm:
@@ -7169,6 +7270,8 @@
case X86::VMOVDQU64Z128rm:
case X86::VMOVAPSZ256rm:
case X86::VMOVUPSZ256rm:
+ case X86::VMOVAPSZ256rm_NOVLX:
+ case X86::VMOVUPSZ256rm_NOVLX:
case X86::VMOVAPDZ256rm:
case X86::VMOVUPDZ256rm:
case X86::VMOVDQU8Z256rm:
Index: test/CodeGen/X86/pr29112.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/pr29112.ll
@@ -0,0 +1,45 @@
+; RUN: llc < %s -march=x86-64 -mattr=+avx512f | FileCheck %s
+
+declare <4 x float> @foo(<4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float>)
+
+; In AVX512 without VLX we can't spill XMM16-31 with vmovaps as it's not available. Instead we need to use vextractf32x4 to spill since it can encode the ZMM super register and can store the lower 128 bits.
+
+define <4 x float> @bar(<4 x float>* %a1p, <4 x float>* %a2p, <4 x float> %a3, <4 x float> %a4, <16 x float>%c1, <16 x float>%c2) {
+; CHECK: vextractf32x4 $0, %zmm16, {{[0-9]+}}(%rsp) ## 16-byte Folded Spill
+ %a1 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 1, i32 17>
+
+ %a2 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 21, i32 1, i32 17>
+ %a5 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 1, i32 27>
+ %a6 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 3, i32 20, i32 1, i32 17>
+ %a7 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 21, i32 1, i32 17>
+ %a8 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 5, i32 20, i32 1, i32 19>
+ %a9 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 1, i32 17>
+ %a10 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 1, i32 17>
+ %ax2 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 1, i32 19>
+ %ax5 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 1, i32 17>
+ %ax6 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 22, i32 1, i32 18>
+ %ax7 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 1, i32 20, i32 1, i32 17>
+ %ax8 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 1, i32 19>
+ %ax9 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 1, i32 17>
+ %ax10 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 1, i32 17>
+ %ay2 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 1, i32 17>
+ %ay5 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 28, i32 1, i32 17>
+ %ay6 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 5, i32 20, i32 1, i32 17>
+ %ay7 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 30, i32 1, i32 22>
+ %ay8 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 1, i32 17>
+ %ay9 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 22, i32 1, i32 17>
+ %ay10 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 3, i32 18>
+
+ %r1 = fadd <4 x float> %ay10, %ay9
+ %r2 = fadd <4 x float> %ay8, %ay7
+ %r3 = fadd <4 x float> %ay6, %ay5
+ %r4 = fadd <4 x float> %ay2, %ax10
+ %r5 = fadd <4 x float> %ay9, %ax8
+ %r6 = fadd <4 x float> %r5, %r3
+ %r7 = fadd <4 x float> %a9, %r6
+ %a11 = call <4 x float> @foo(<4 x float> %r7, <4 x float> %a10, <4 x float> %r1, <4 x float> %a4, <4 x float> %a5, <4 x float> %a6, <4 x float> %a7, <4 x float> %a8, <4 x float> %r2, <4 x float> %r4)
+ %a12 = fadd <4 x float> %a2, %a1
+ %a13 = fadd <4 x float> %a12, %a11
+
+ ret <4 x float> %a13
+}

Event Timeline