Index: llvm/trunk/lib/Target/X86/X86InstrSSE.td =================================================================== --- llvm/trunk/lib/Target/X86/X86InstrSSE.td +++ llvm/trunk/lib/Target/X86/X86InstrSSE.td @@ -7061,27 +7061,29 @@ // multiclass avx_movmask_rm opc_rm, bits<8> opc_mr, string OpcodeStr, Intrinsic IntLd, Intrinsic IntLd256, - Intrinsic IntSt, Intrinsic IntSt256> { + Intrinsic IntSt, Intrinsic IntSt256, + X86SchedWriteMaskMove schedX, + X86SchedWriteMaskMove schedY> { def rm : AVX8I, - VEX_4V, Sched<[WriteFMaskedLoad]>; + VEX_4V, Sched<[schedX.RM]>; def Yrm : AVX8I, - VEX_4V, VEX_L, Sched<[WriteFMaskedLoadY]>; + VEX_4V, VEX_L, Sched<[schedY.RM]>; def mr : AVX8I, - VEX_4V, Sched<[WriteFMaskedStore]>; + VEX_4V, Sched<[schedX.MR]>; def Ymr : AVX8I, - VEX_4V, VEX_L, Sched<[WriteFMaskedStoreY]>; + VEX_4V, VEX_L, Sched<[schedY.MR]>; } let ExeDomain = SSEPackedSingle in @@ -7089,13 +7091,15 @@ int_x86_avx_maskload_ps, int_x86_avx_maskload_ps_256, int_x86_avx_maskstore_ps, - int_x86_avx_maskstore_ps_256>; + int_x86_avx_maskstore_ps_256, + WriteFMaskMove32, WriteFMaskMove32Y>; let ExeDomain = SSEPackedDouble in defm VMASKMOVPD : avx_movmask_rm<0x2D, 0x2F, "vmaskmovpd", int_x86_avx_maskload_pd, int_x86_avx_maskload_pd_256, int_x86_avx_maskstore_pd, - int_x86_avx_maskstore_pd_256>; + int_x86_avx_maskstore_pd_256, + WriteFMaskMove64, WriteFMaskMove64Y>; //===----------------------------------------------------------------------===// // VPERMIL - Permute Single and Double Floating-Point Values Index: llvm/trunk/lib/Target/X86/X86SchedBroadwell.td =================================================================== --- llvm/trunk/lib/Target/X86/X86SchedBroadwell.td +++ llvm/trunk/lib/Target/X86/X86SchedBroadwell.td @@ -232,8 +232,12 @@ defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; -defm : X86WriteRes; -defm : X86WriteRes; + +defm : X86WriteRes; +defm : X86WriteRes; +defm : X86WriteRes; +defm : X86WriteRes; + defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; Index: llvm/trunk/lib/Target/X86/X86SchedHaswell.td =================================================================== --- llvm/trunk/lib/Target/X86/X86SchedHaswell.td +++ llvm/trunk/lib/Target/X86/X86SchedHaswell.td @@ -231,8 +231,12 @@ defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; -defm : X86WriteRes; -defm : X86WriteRes; + +defm : X86WriteRes; +defm : X86WriteRes; +defm : X86WriteRes; +defm : X86WriteRes; + defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; Index: llvm/trunk/lib/Target/X86/X86SchedSandyBridge.td =================================================================== --- llvm/trunk/lib/Target/X86/X86SchedSandyBridge.td +++ llvm/trunk/lib/Target/X86/X86SchedSandyBridge.td @@ -208,8 +208,12 @@ defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; -defm : X86WriteRes; -defm : X86WriteRes; + +defm : X86WriteRes; +defm : X86WriteRes; +defm : X86WriteRes; +defm : X86WriteRes; + defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; Index: llvm/trunk/lib/Target/X86/X86SchedSkylakeClient.td =================================================================== --- llvm/trunk/lib/Target/X86/X86SchedSkylakeClient.td +++ llvm/trunk/lib/Target/X86/X86SchedSkylakeClient.td @@ -226,8 +226,12 @@ defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; -defm : X86WriteRes; -defm : X86WriteRes; + +defm : X86WriteRes; +defm : X86WriteRes; +defm : X86WriteRes; +defm : X86WriteRes; + defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; Index: llvm/trunk/lib/Target/X86/X86SchedSkylakeServer.td =================================================================== --- llvm/trunk/lib/Target/X86/X86SchedSkylakeServer.td +++ llvm/trunk/lib/Target/X86/X86SchedSkylakeServer.td @@ -226,8 +226,12 @@ defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; -defm : X86WriteRes; -defm : X86WriteRes; + +defm : X86WriteRes; +defm : X86WriteRes; +defm : X86WriteRes; +defm : X86WriteRes; + defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; Index: llvm/trunk/lib/Target/X86/X86Schedule.td =================================================================== --- llvm/trunk/lib/Target/X86/X86Schedule.td +++ llvm/trunk/lib/Target/X86/X86Schedule.td @@ -102,6 +102,12 @@ SchedWrite MR = StoreMR; } +// Multiclass that wraps masked load/store writes for a vector width. +class X86SchedWriteMaskMove { + SchedWrite RM = LoadRM; + SchedWrite MR = StoreMR; +} + // Multiclass that wraps X86SchedWriteMoveLS for each vector width. class X86SchedWriteMoveLSWidths; +// Conditional SIMD Packed Loads and Stores wrappers. +def WriteFMaskMove32 + : X86SchedWriteMaskMove; +def WriteFMaskMove64 + : X86SchedWriteMaskMove; +def WriteFMaskMove32Y + : X86SchedWriteMaskMove; +def WriteFMaskMove64Y + : X86SchedWriteMaskMove; + // Vector width wrappers. def SchedWriteFAdd : X86SchedWriteWidths; Index: llvm/trunk/lib/Target/X86/X86ScheduleAtom.td =================================================================== --- llvm/trunk/lib/Target/X86/X86ScheduleAtom.td +++ llvm/trunk/lib/Target/X86/X86ScheduleAtom.td @@ -216,8 +216,10 @@ def : WriteRes; def : WriteRes; defm : X86WriteResUnsupported; -defm : X86WriteResUnsupported; -defm : X86WriteResUnsupported; +defm : X86WriteResUnsupported; +defm : X86WriteResUnsupported; +defm : X86WriteResUnsupported; +defm : X86WriteResUnsupported; def : WriteRes; def : WriteRes; Index: llvm/trunk/lib/Target/X86/X86ScheduleBdVer2.td =================================================================== --- llvm/trunk/lib/Target/X86/X86ScheduleBdVer2.td +++ llvm/trunk/lib/Target/X86/X86ScheduleBdVer2.td @@ -726,8 +726,10 @@ defm : PdWriteRes; defm : PdWriteRes; -defm : PdWriteRes; -defm : PdWriteRes; +defm : PdWriteRes; +defm : PdWriteRes; +defm : PdWriteRes; +defm : PdWriteRes; defm : PdWriteRes; defm : PdWriteRes; Index: llvm/trunk/lib/Target/X86/X86ScheduleBtVer2.td =================================================================== --- llvm/trunk/lib/Target/X86/X86ScheduleBtVer2.td +++ llvm/trunk/lib/Target/X86/X86ScheduleBtVer2.td @@ -512,8 +512,11 @@ defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; -defm : X86WriteRes; -defm : X86WriteRes; + +defm : X86WriteRes; +defm : X86WriteRes; +defm : X86WriteRes; +defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; @@ -819,6 +822,18 @@ def : InstRW<[JWriteJVZEROUPPER], (instrs VZEROUPPER)>; /////////////////////////////////////////////////////////////////////////////// +// SSE2/AVX Store Selected Bytes of Double Quadword - (V)MASKMOVDQ +/////////////////////////////////////////////////////////////////////////////// + +def JWriteMASKMOVDQU: SchedWriteRes<[JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01]> { + let Latency = 34; + let ResourceCycles = [1, 1, 2, 2, 2, 16, 42]; + let NumMicroOps = 63; +} +def : InstRW<[JWriteMASKMOVDQU], (instrs MASKMOVDQU, MASKMOVDQU64, + VMASKMOVDQU, VMASKMOVDQU64)>; + +/////////////////////////////////////////////////////////////////////////////// // SchedWriteVariant definitions. /////////////////////////////////////////////////////////////////////////////// Index: llvm/trunk/lib/Target/X86/X86ScheduleSLM.td =================================================================== --- llvm/trunk/lib/Target/X86/X86ScheduleSLM.td +++ llvm/trunk/lib/Target/X86/X86ScheduleSLM.td @@ -186,8 +186,12 @@ def : WriteRes; def : WriteRes; def : WriteRes; -def : WriteRes; -def : WriteRes; + +def : WriteRes; +def : WriteRes; +def : WriteRes; +def : WriteRes; + def : WriteRes; def : WriteRes; def : WriteRes; Index: llvm/trunk/lib/Target/X86/X86ScheduleZnver1.td =================================================================== --- llvm/trunk/lib/Target/X86/X86ScheduleZnver1.td +++ llvm/trunk/lib/Target/X86/X86ScheduleZnver1.td @@ -268,8 +268,12 @@ defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; -defm : X86WriteRes; -defm : X86WriteRes; + +defm : X86WriteRes; +defm : X86WriteRes; +defm : X86WriteRes; +defm : X86WriteRes; + defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; Index: llvm/trunk/test/tools/llvm-mca/X86/BtVer2/resources-avx1.s =================================================================== --- llvm/trunk/test/tools/llvm-mca/X86/BtVer2/resources-avx1.s +++ llvm/trunk/test/tools/llvm-mca/X86/BtVer2/resources-avx1.s @@ -1219,15 +1219,15 @@ # CHECK-NEXT: 1 5 1.00 * vlddqu (%rax), %xmm2 # CHECK-NEXT: 1 5 1.00 * vlddqu (%rax), %ymm2 # CHECK-NEXT: 1 3 1.00 * U vldmxcsr (%rax) -# CHECK-NEXT: 1 1 1.00 * * U vmaskmovdqu %xmm0, %xmm1 +# CHECK-NEXT: 63 34 21.00 * * U vmaskmovdqu %xmm0, %xmm1 # CHECK-NEXT: 1 6 1.00 * vmaskmovpd (%rax), %xmm0, %xmm2 # CHECK-NEXT: 2 6 2.00 * vmaskmovpd (%rax), %ymm0, %ymm2 -# CHECK-NEXT: 1 6 2.00 * * vmaskmovpd %xmm0, %xmm1, (%rax) -# CHECK-NEXT: 2 6 2.00 * * vmaskmovpd %ymm0, %ymm1, (%rax) +# CHECK-NEXT: 10 13 2.00 * * vmaskmovpd %xmm0, %xmm1, (%rax) +# CHECK-NEXT: 18 16 4.00 * * vmaskmovpd %ymm0, %ymm1, (%rax) # CHECK-NEXT: 1 6 1.00 * vmaskmovps (%rax), %xmm0, %xmm2 # CHECK-NEXT: 2 6 2.00 * vmaskmovps (%rax), %ymm0, %ymm2 -# CHECK-NEXT: 1 6 2.00 * * vmaskmovps %xmm0, %xmm1, (%rax) -# CHECK-NEXT: 2 6 2.00 * * vmaskmovps %ymm0, %ymm1, (%rax) +# CHECK-NEXT: 19 16 5.00 * * vmaskmovps %xmm0, %xmm1, (%rax) +# CHECK-NEXT: 36 22 10.00 * * vmaskmovps %ymm0, %ymm1, (%rax) # CHECK-NEXT: 1 2 1.00 vmaxpd %xmm0, %xmm1, %xmm2 # CHECK-NEXT: 1 7 1.00 * vmaxpd (%rax), %xmm1, %xmm2 # CHECK-NEXT: 2 2 2.00 vmaxpd %ymm0, %ymm1, %ymm2 @@ -1740,7 +1740,7 @@ # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] -# CHECK-NEXT: 56.00 - - 365.00 915.00 447.50 461.50 394.00 - 51.00 132.00 135.50 159.50 38.00 +# CHECK-NEXT: 86.00 30.00 - 362.00 907.00 449.50 480.50 414.00 - 78.00 154.00 135.50 159.50 38.00 # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions: @@ -1933,15 +1933,15 @@ # CHECK-NEXT: - - - - - 0.50 0.50 1.00 - - - 0.50 0.50 - vlddqu (%rax), %xmm2 # CHECK-NEXT: - - - - - 0.50 0.50 1.00 - - - 0.50 0.50 - vlddqu (%rax), %ymm2 # CHECK-NEXT: - - - - - - - 1.00 - - - - - - vldmxcsr (%rax) -# CHECK-NEXT: - - - - - - 1.00 - - 1.00 1.00 - - - vmaskmovdqu %xmm0, %xmm1 +# CHECK-NEXT: 21.00 21.00 - 1.00 - 1.00 2.00 2.00 - 16.00 2.00 - - - vmaskmovdqu %xmm0, %xmm1 # CHECK-NEXT: - - - 1.00 1.00 1.00 1.00 1.00 - - - - - - vmaskmovpd (%rax), %xmm0, %xmm2 # CHECK-NEXT: - - - 2.00 2.00 2.00 2.00 2.00 - - - - - - vmaskmovpd (%rax), %ymm0, %ymm2 -# CHECK-NEXT: - - - 2.00 2.00 0.50 0.50 - - 1.00 - - - - vmaskmovpd %xmm0, %xmm1, (%rax) -# CHECK-NEXT: - - - 2.00 2.00 1.00 1.00 - - 2.00 - - - - vmaskmovpd %ymm0, %ymm1, (%rax) +# CHECK-NEXT: 1.00 1.00 - 1.00 - 1.00 2.00 2.00 - 2.00 2.00 - - - vmaskmovpd %xmm0, %xmm1, (%rax) +# CHECK-NEXT: 2.00 2.00 - 1.00 - 1.00 4.00 4.00 - 4.00 4.00 - - - vmaskmovpd %ymm0, %ymm1, (%rax) # CHECK-NEXT: - - - 1.00 1.00 1.00 1.00 1.00 - - - - - - vmaskmovps (%rax), %xmm0, %xmm2 # CHECK-NEXT: - - - 2.00 2.00 2.00 2.00 2.00 - - - - - - vmaskmovps (%rax), %ymm0, %ymm2 -# CHECK-NEXT: - - - 2.00 2.00 0.50 0.50 - - 1.00 - - - - vmaskmovps %xmm0, %xmm1, (%rax) -# CHECK-NEXT: - - - 2.00 2.00 1.00 1.00 - - 2.00 - - - - vmaskmovps %ymm0, %ymm1, (%rax) +# CHECK-NEXT: 2.00 2.00 - 1.00 - 1.00 5.00 4.00 - 4.00 5.00 - - - vmaskmovps %xmm0, %xmm1, (%rax) +# CHECK-NEXT: 4.00 4.00 - 1.00 - 1.00 10.00 8.00 - 8.00 10.00 - - - vmaskmovps %ymm0, %ymm1, (%rax) # CHECK-NEXT: - - - 1.00 - 1.00 - - - - - - - - vmaxpd %xmm0, %xmm1, %xmm2 # CHECK-NEXT: - - - 1.00 - 1.00 - 1.00 - - - - - - vmaxpd (%rax), %xmm1, %xmm2 # CHECK-NEXT: - - - 2.00 - 2.00 - - - - - - - - vmaxpd %ymm0, %ymm1, %ymm2 Index: llvm/trunk/test/tools/llvm-mca/X86/BtVer2/resources-sse2.s =================================================================== --- llvm/trunk/test/tools/llvm-mca/X86/BtVer2/resources-sse2.s +++ llvm/trunk/test/tools/llvm-mca/X86/BtVer2/resources-sse2.s @@ -465,7 +465,7 @@ # CHECK-NEXT: 1 19 19.00 divsd %xmm0, %xmm2 # CHECK-NEXT: 1 24 19.00 * divsd (%rax), %xmm2 # CHECK-NEXT: 1 1 1.00 * * U lfence -# CHECK-NEXT: 1 1 1.00 * * U maskmovdqu %xmm0, %xmm1 +# CHECK-NEXT: 63 34 21.00 * * U maskmovdqu %xmm0, %xmm1 # CHECK-NEXT: 1 2 1.00 maxpd %xmm0, %xmm2 # CHECK-NEXT: 1 7 1.00 * maxpd (%rax), %xmm2 # CHECK-NEXT: 1 2 1.00 maxsd %xmm0, %xmm2 @@ -693,7 +693,7 @@ # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] -# CHECK-NEXT: 17.00 - - 49.00 204.00 128.50 141.50 118.00 - 16.00 54.00 67.50 67.50 12.00 +# CHECK-NEXT: 38.00 21.00 - 50.00 204.00 129.50 142.50 120.00 - 31.00 55.00 67.50 67.50 12.00 # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions: @@ -755,7 +755,7 @@ # CHECK-NEXT: - - - - 19.00 - 1.00 - - - - - - - divsd %xmm0, %xmm2 # CHECK-NEXT: - - - - 19.00 - 1.00 1.00 - - - - - - divsd (%rax), %xmm2 # CHECK-NEXT: - - - - - - - - - 1.00 - - - - lfence -# CHECK-NEXT: - - - - - - 1.00 - - 1.00 1.00 - - - maskmovdqu %xmm0, %xmm1 +# CHECK-NEXT: 21.00 21.00 - 1.00 - 1.00 2.00 2.00 - 16.00 2.00 - - - maskmovdqu %xmm0, %xmm1 # CHECK-NEXT: - - - 1.00 - 1.00 - - - - - - - - maxpd %xmm0, %xmm2 # CHECK-NEXT: - - - 1.00 - 1.00 - 1.00 - - - - - - maxpd (%rax), %xmm2 # CHECK-NEXT: - - - 1.00 - 1.00 - - - - - - - - maxsd %xmm0, %xmm2