diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -9677,7 +9677,8 @@
             (ins _.RC:$src1, MaskRC:$mask, memop:$src2),
             !strconcat(OpcodeStr#_.Suffix,
             "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"),
-            []>, EVEX, EVEX_K, EVEX_CD8<_.EltSize, CD8VT1>, Sched<[WriteLoad]>;
+            []>, EVEX, EVEX_K, EVEX_CD8<_.EltSize, CD8VT1>,
+            Sched<[WriteLoad, WriteVecMaskedGatherWriteback]>;
 }
 
 multiclass avx512_gather_q_pd<bits<8> dopc, bits<8> qopc,
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -7882,12 +7882,12 @@
             (ins VR128:$src1, memop128:$src2, VR128:$mask),
             !strconcat(OpcodeStr,
             "\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
-            []>, VEX, Sched<[WriteLoad]>;
+            []>, VEX, Sched<[WriteLoad, WriteVecMaskedGatherWriteback]>;
   def Yrm : AVX28I<opc, MRMSrcMem4VOp3, (outs RC256:$dst, RC256:$mask_wb),
             (ins RC256:$src1, memop256:$src2, RC256:$mask),
             !strconcat(OpcodeStr,
             "\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
-            []>, VEX, VEX_L, Sched<[WriteLoad]>;
+            []>, VEX, VEX_L, Sched<[WriteLoad, WriteVecMaskedGatherWriteback]>;
   }
 }
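The instruction-side change above is mechanical but worth spelling out: in the MC scheduling model, the i-th entry of an instruction's Sched<[...]> list describes the i-th register definition. A gather defines two registers, the destination and the written-back mask, yet the list previously held only WriteLoad, so the mask writeback had no description of its own. A minimal sketch of the pairing (the instruction name, opcode, and operands here are hypothetical, for illustration only; they are not part of the patch):

    // Two defs, two SchedWrites, paired in order:
    //   $dst     <- WriteLoad
    //   $mask_wb <- WriteVecMaskedGatherWriteback
    def HypotheticalGather : I<0x90, MRMSrcMem4VOp3,
          (outs VR128:$dst, VR128:$mask_wb),
          (ins VR128:$src1, vx128mem:$src2, VR128:$mask),
          "hypothetical_gather\t{$mask, $src2, $dst|$dst, $src2, $mask}", []>,
          Sched<[WriteLoad, WriteVecMaskedGatherWriteback]>;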
diff --git a/llvm/lib/Target/X86/X86SchedBroadwell.td b/llvm/lib/Target/X86/X86SchedBroadwell.td
--- a/llvm/lib/Target/X86/X86SchedBroadwell.td
+++ b/llvm/lib/Target/X86/X86SchedBroadwell.td
@@ -206,6 +206,10 @@
 defm : X86WriteRes;
 defm : X86WriteRes;
 
+// Model the effect of clobbering the read-write mask operand of the GATHER operation.
+// Does not cost anything by itself, only has latency, matching that of the WriteLoad.
+defm : X86WriteRes<WriteVecMaskedGatherWriteback, [], 5, [], 0>;
+
 // Idioms that clear a register, like xorps %xmm0, %xmm0.
 // These can often bypass execution ports completely.
 def : WriteRes<WriteZero, []>;
diff --git a/llvm/lib/Target/X86/X86SchedHaswell.td b/llvm/lib/Target/X86/X86SchedHaswell.td
--- a/llvm/lib/Target/X86/X86SchedHaswell.td
+++ b/llvm/lib/Target/X86/X86SchedHaswell.td
@@ -125,6 +125,10 @@
 defm : X86WriteRes;
 def : WriteRes;
 
+// Model the effect of clobbering the read-write mask operand of the GATHER operation.
+// Does not cost anything by itself, only has latency, matching that of the WriteLoad.
+defm : X86WriteRes<WriteVecMaskedGatherWriteback, [], 5, [], 0>;
+
 // Arithmetic.
 defm : HWWriteResPair;
 defm : HWWriteResPair;
diff --git a/llvm/lib/Target/X86/X86SchedSandyBridge.td b/llvm/lib/Target/X86/X86SchedSandyBridge.td
--- a/llvm/lib/Target/X86/X86SchedSandyBridge.td
+++ b/llvm/lib/Target/X86/X86SchedSandyBridge.td
@@ -112,6 +112,7 @@
 def : WriteRes<WriteLoad, [SBPort23]> { let Latency = 5; }
 def : WriteRes;
 def : WriteRes;
+def : WriteRes<WriteVecMaskedGatherWriteback, []> { let Latency = 5; let NumMicroOps = 0; }
 
 // Arithmetic.
 defm : SBWriteResPair;
diff --git a/llvm/lib/Target/X86/X86SchedSkylakeClient.td b/llvm/lib/Target/X86/X86SchedSkylakeClient.td
--- a/llvm/lib/Target/X86/X86SchedSkylakeClient.td
+++ b/llvm/lib/Target/X86/X86SchedSkylakeClient.td
@@ -203,6 +203,10 @@
 defm : X86WriteRes;
 defm : X86WriteRes;
 
+// Model the effect of clobbering the read-write mask operand of the GATHER operation.
+// Does not cost anything by itself, only has latency, matching that of the WriteLoad.
+defm : X86WriteRes<WriteVecMaskedGatherWriteback, [], 5, [], 0>;
+
 // Idioms that clear a register, like xorps %xmm0, %xmm0.
 // These can often bypass execution ports completely.
 def : WriteRes<WriteZero, []>;
diff --git a/llvm/lib/Target/X86/X86SchedSkylakeServer.td b/llvm/lib/Target/X86/X86SchedSkylakeServer.td
--- a/llvm/lib/Target/X86/X86SchedSkylakeServer.td
+++ b/llvm/lib/Target/X86/X86SchedSkylakeServer.td
@@ -203,6 +203,10 @@
 defm : X86WriteRes;
 defm : X86WriteRes;
 
+// Model the effect of clobbering the read-write mask operand of the GATHER operation.
+// Does not cost anything by itself, only has latency, matching that of the WriteLoad.
+defm : X86WriteRes<WriteVecMaskedGatherWriteback, [], 5, [], 0>;
+
 // Idioms that clear a register, like xorps %xmm0, %xmm0.
 // These can often bypass execution ports completely.
 def : WriteRes<WriteZero, []>;
diff --git a/llvm/lib/Target/X86/X86Schedule.td b/llvm/lib/Target/X86/X86Schedule.td
--- a/llvm/lib/Target/X86/X86Schedule.td
+++ b/llvm/lib/Target/X86/X86Schedule.td
@@ -125,6 +125,7 @@
 def WriteStore : SchedWrite;
 def WriteStoreNT : SchedWrite;
 def WriteMove : SchedWrite;
+def WriteVecMaskedGatherWriteback : SchedWrite;
 def WriteCopy : WriteSequence<[WriteLoad, WriteStore]>; // mem->mem copy
 
 // Arithmetic.
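With the SchedWrite declared, every scheduling model must now provide an entry for it, and the patch uses two idioms. Models of cores that implement the gathers give the writeback a port-free, zero-uop entry whose latency mirrors that model's WriteLoad; models of cores without hardware gathers mark the write unsupported. Schematically (a sketch, assuming a model whose load latency is 5 cycles, as in the Intel models above):

    // CPU with gathers: no execution ports, no uops, load-equivalent latency.
    def : WriteRes<WriteVecMaskedGatherWriteback, []> {
      let Latency = 5;      // keep in lockstep with the model's WriteLoad
      let NumMicroOps = 0;  // the mask writeback consumes no issue slots
    }

    // CPU without gathers: reject any instruction carrying this write.
    defm : X86WriteResUnsupported<WriteVecMaskedGatherWriteback>;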
diff --git a/llvm/lib/Target/X86/X86ScheduleAtom.td b/llvm/lib/Target/X86/X86ScheduleAtom.td
--- a/llvm/lib/Target/X86/X86ScheduleAtom.td
+++ b/llvm/lib/Target/X86/X86ScheduleAtom.td
@@ -165,6 +165,7 @@
 def : WriteRes;
 def : WriteRes;
 def : WriteRes;
+defm : X86WriteResUnsupported<WriteVecMaskedGatherWriteback>;
 
 // Treat misc copies as a move.
 def : InstRW<[WriteMove], (instrs COPY)>;
diff --git a/llvm/lib/Target/X86/X86ScheduleBdVer2.td b/llvm/lib/Target/X86/X86ScheduleBdVer2.td
--- a/llvm/lib/Target/X86/X86ScheduleBdVer2.td
+++ b/llvm/lib/Target/X86/X86ScheduleBdVer2.td
@@ -269,6 +269,7 @@
 def : WriteRes;
 def : WriteRes;
 def : WriteRes { let ResourceCycles = [2]; }
+defm : X86WriteResUnsupported<WriteVecMaskedGatherWriteback>;
 
 // Load/store MXCSR.
 // FIXME: These are copy and pasted from WriteLoad/Store.
diff --git a/llvm/lib/Target/X86/X86ScheduleBtVer2.td b/llvm/lib/Target/X86/X86ScheduleBtVer2.td
--- a/llvm/lib/Target/X86/X86ScheduleBtVer2.td
+++ b/llvm/lib/Target/X86/X86ScheduleBtVer2.td
@@ -273,6 +273,7 @@
 def : WriteRes;
 def : WriteRes;
 def : WriteRes;
+defm : X86WriteResUnsupported<WriteVecMaskedGatherWriteback>;
 
 // Load/store MXCSR.
 def : WriteRes { let Latency = 3; }
diff --git a/llvm/lib/Target/X86/X86ScheduleSLM.td b/llvm/lib/Target/X86/X86ScheduleSLM.td
--- a/llvm/lib/Target/X86/X86ScheduleSLM.td
+++ b/llvm/lib/Target/X86/X86ScheduleSLM.td
@@ -88,6 +88,7 @@
 def : WriteRes<WriteLoad, [SLM_MEC_RSV]> { let Latency = 3; }
 def : WriteRes;
 def : WriteRes;
+defm : X86WriteResUnsupported<WriteVecMaskedGatherWriteback>;
 
 // Load/store MXCSR.
 // FIXME: These are probably wrong. They are copy pasted from WriteStore/Load.
diff --git a/llvm/lib/Target/X86/X86ScheduleZnver1.td b/llvm/lib/Target/X86/X86ScheduleZnver1.td
--- a/llvm/lib/Target/X86/X86ScheduleZnver1.td
+++ b/llvm/lib/Target/X86/X86ScheduleZnver1.td
@@ -179,6 +179,10 @@
 def : WriteRes;
 def : WriteRes<WriteLoad, [ZnAGU]> { let Latency = 8; }
+// Model the effect of clobbering the read-write mask operand of the GATHER operation.
+// Does not cost anything by itself, only has latency, matching that of the WriteLoad.
+def : WriteRes<WriteVecMaskedGatherWriteback, []> { let Latency = 8; let NumMicroOps = 0; }
+
 def : WriteRes;
 def : WriteRes;
 defm : ZnWriteResPair;
diff --git a/llvm/lib/Target/X86/X86ScheduleZnver2.td b/llvm/lib/Target/X86/X86ScheduleZnver2.td
--- a/llvm/lib/Target/X86/X86ScheduleZnver2.td
+++ b/llvm/lib/Target/X86/X86ScheduleZnver2.td
@@ -178,6 +178,10 @@
 def : WriteRes;
 def : WriteRes<WriteLoad, [Zn2AGU]> { let Latency = 8; }
+// Model the effect of clobbering the read-write mask operand of the GATHER operation.
+// Does not cost anything by itself, only has latency, matching that of the WriteLoad.
+def : WriteRes<WriteVecMaskedGatherWriteback, []> { let Latency = 8; let NumMicroOps = 0; }
+
 def : WriteRes;
 def : WriteRes;
 defm : Zn2WriteResPair;
diff --git a/llvm/lib/Target/X86/X86ScheduleZnver3.td b/llvm/lib/Target/X86/X86ScheduleZnver3.td
--- a/llvm/lib/Target/X86/X86ScheduleZnver3.td
+++ b/llvm/lib/Target/X86/X86ScheduleZnver3.td
@@ -495,6 +495,10 @@
 // Loads, stores, and moves, not folded with other operations.
 defm : Zn3WriteResInt<WriteLoad, [Zn3AGU012, Zn3Load], !add(Znver3Model.LoadLatency, 1), [1, 1], 1>;
+// Model the effect of clobbering the read-write mask operand of the GATHER operation.
+// Does not cost anything by itself, only has latency, matching that of the WriteLoad.
+defm : Zn3WriteResInt<WriteVecMaskedGatherWriteback, [], !add(Znver3Model.LoadLatency, 1), [], 0>;
+
 def Zn3WriteMOVSlow : SchedWriteRes<[Zn3AGU012, Zn3Load]> {
   let Latency = !add(Znver3Model.LoadLatency, 1);
   let ResourceCycles = [3, 1];
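Taken together: the Intel out-of-order models (Sandy Bridge through Skylake) give the mask writeback their 5-cycle load latency, Zen 1 and Zen 2 use their 8-cycle load latency, Zen 3 derives it from Znver3Model.LoadLatency, and the models for cores without hardware gathers (Atom, Silvermont, Piledriver, Jaguar) mark it unsupported. The net effect is that an instruction that reads the gather's mask register afterwards is now modeled as waiting on the gather's mask writeback instead of treating the mask as immediately available, which scheduling-driven tools such as llvm-mca reflect in their dependency and latency analyses.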