Index: lib/Target/X86/X86InstrAVX512.td =================================================================== --- lib/Target/X86/X86InstrAVX512.td +++ lib/Target/X86/X86InstrAVX512.td @@ -4505,7 +4505,7 @@ defm VPSUBUS : avx512_binop_rm_vl_bw<0xD8, 0xD9, "vpsubus", X86subus, SSE_INTALU_ITINS_P, HasBWI, 0>; defm VPMULLD : avx512_binop_rm_vl_d<0x40, "vpmulld", mul, - SSE_INTMUL_ITINS_P, HasAVX512, 1>, T8PD; + SSE_PMULLD_ITINS, HasAVX512, 1>, T8PD; defm VPMULLW : avx512_binop_rm_vl_w<0xD5, "vpmullw", mul, SSE_INTMUL_ITINS_P, HasBWI, 1>; defm VPMULLQ : avx512_binop_rm_vl_q<0x40, "vpmullq", mul, Index: lib/Target/X86/X86InstrSSE.td =================================================================== --- lib/Target/X86/X86InstrSSE.td +++ lib/Target/X86/X86InstrSSE.td @@ -195,7 +195,7 @@ IIC_SSE_MPSADBW_RR, IIC_SSE_MPSADBW_RM >; -let Sched = WriteVecIMul in +let Sched = WritePMULLD in def SSE_PMULLD_ITINS : OpndItins< IIC_SSE_PMULLD_RR, IIC_SSE_PMULLD_RM >; Index: lib/Target/X86/X86SchedBroadwell.td =================================================================== --- lib/Target/X86/X86SchedBroadwell.td +++ lib/Target/X86/X86SchedBroadwell.td @@ -163,6 +163,7 @@ defm : BWWriteResPair; // Vector integer ALU op, no logicals. defm : BWWriteResPair; // Vector integer shifts. defm : BWWriteResPair; // Vector integer multiply. +defm : BWWriteResPair; // PMULLD defm : BWWriteResPair; // Vector shuffles. defm : BWWriteResPair; // Vector blends. defm : BWWriteResPair; // Vector variable blends. @@ -2186,13 +2187,6 @@ def: InstRW<[BWWriteResGroup113], (instregex "LAR(16|32|64)rm", "LSL(16|32|64)rm")>; -def BWWriteResGroup114 : SchedWriteRes<[BWPort0]> { - let Latency = 10; - let NumMicroOps = 2; - let ResourceCycles = [2]; -} -def: InstRW<[BWWriteResGroup114], (instregex "(V?)PMULLD(Y?)rr")>; - def BWWriteResGroup115 : SchedWriteRes<[BWPort0,BWPort23]> { let Latency = 10; let NumMicroOps = 2; Index: lib/Target/X86/X86SchedHaswell.td =================================================================== --- lib/Target/X86/X86SchedHaswell.td +++ lib/Target/X86/X86SchedHaswell.td @@ -159,6 +159,7 @@ defm : HWWriteResPair; defm : HWWriteResPair; defm : HWWriteResPair; +defm : HWWriteResPair; defm : HWWriteResPair; defm : HWWriteResPair; defm : HWWriteResPair; @@ -2686,13 +2687,6 @@ } def: InstRW<[HWWriteResGroup117], (instregex "(V?)DPPDrmi")>; -def HWWriteResGroup118 : SchedWriteRes<[HWPort0]> { - let Latency = 10; - let NumMicroOps = 2; - let ResourceCycles = [2]; -} -def: InstRW<[HWWriteResGroup118], (instregex "(V?)PMULLD(Y?)rr")>; - def HWWriteResGroup119 : SchedWriteRes<[HWPort0,HWPort23]> { let Latency = 16; let NumMicroOps = 3; Index: lib/Target/X86/X86SchedSandyBridge.td =================================================================== --- lib/Target/X86/X86SchedSandyBridge.td +++ lib/Target/X86/X86SchedSandyBridge.td @@ -146,6 +146,7 @@ defm : SBWriteResPair; defm : SBWriteResPair; defm : SBWriteResPair; +defm : SBWriteResPair; // TODO this is probably wrong for 256/512-bit for the "generic" model defm : SBWriteResPair; defm : SBWriteResPair; defm : SBWriteResPair; @@ -667,7 +668,6 @@ "(V?)PMULHRSWrr", "(V?)PMULHUWrr", "(V?)PMULHWrr", - "(V?)PMULLDrr", "(V?)PMULLWrr", "(V?)PMULUDQrr", "(V?)PSADBWrr")>; @@ -1597,7 +1597,6 @@ "(V?)PMULHRSWrm", "(V?)PMULHUWrm", "(V?)PMULHWrm", - "(V?)PMULLDrm", "(V?)PMULLWrm", "(V?)PMULUDQrm", "(V?)PSADBWrm")>; Index: lib/Target/X86/X86SchedSkylakeClient.td =================================================================== --- lib/Target/X86/X86SchedSkylakeClient.td +++ lib/Target/X86/X86SchedSkylakeClient.td @@ -160,6 +160,7 @@ defm : SKLWriteResPair; // Vector integer ALU op, no logicals. defm : SKLWriteResPair; // Vector integer shifts. defm : SKLWriteResPair; // Vector integer multiply. +defm : SKLWriteResPair; defm : SKLWriteResPair; // Vector shuffles. defm : SKLWriteResPair; // Vector blends. defm : SKLWriteResPair; // Vector variable blends. @@ -1855,13 +1856,6 @@ "(V?)ROUNDSDr", "(V?)ROUNDSSr")>; -def SKLWriteResGroup105_2 : SchedWriteRes<[SKLPort01]> { - let Latency = 10; - let NumMicroOps = 2; - let ResourceCycles = [2]; -} -def: InstRW<[SKLWriteResGroup105_2], (instregex "(V?)PMULLD(Y?)rr")>; - def SKLWriteResGroup106 : SchedWriteRes<[SKLPort0,SKLPort23]> { let Latency = 8; let NumMicroOps = 2; Index: lib/Target/X86/X86SchedSkylakeServer.td =================================================================== --- lib/Target/X86/X86SchedSkylakeServer.td +++ lib/Target/X86/X86SchedSkylakeServer.td @@ -160,6 +160,7 @@ defm : SKXWriteResPair; // Vector integer ALU op, no logicals. defm : SKXWriteResPair; // Vector integer shifts. defm : SKXWriteResPair; // Vector integer multiply. +defm : SKXWriteResPair; // Vector integer multiply. defm : SKXWriteResPair; // Vector shuffles. defm : SKXWriteResPair; // Vector blends. defm : SKXWriteResPair; // Vector variable blends. Index: lib/Target/X86/X86Schedule.td =================================================================== --- lib/Target/X86/X86Schedule.td +++ lib/Target/X86/X86Schedule.td @@ -96,6 +96,7 @@ defm WriteVecALU : X86SchedWritePair; // Vector integer ALU op, no logicals. defm WriteVecShift : X86SchedWritePair; // Vector integer shifts. defm WriteVecIMul : X86SchedWritePair; // Vector integer multiply. +defm WritePMULLD : X86SchedWritePair; // PMULLD defm WriteShuffle : X86SchedWritePair; // Vector shuffles. defm WriteBlend : X86SchedWritePair; // Vector blends. defm WriteVarBlend : X86SchedWritePair; // Vector variable blends. Index: lib/Target/X86/X86ScheduleBtVer2.td =================================================================== --- lib/Target/X86/X86ScheduleBtVer2.td +++ lib/Target/X86/X86ScheduleBtVer2.td @@ -341,6 +341,7 @@ defm : JWriteResFpuPair; defm : JWriteResFpuPair; defm : JWriteResFpuPair; +defm : JWriteResFpuPair; defm : JWriteResFpuPair; defm : JWriteResFpuPair; defm : JWriteResFpuPair; Index: lib/Target/X86/X86ScheduleSLM.td =================================================================== --- lib/Target/X86/X86ScheduleSLM.td +++ lib/Target/X86/X86ScheduleSLM.td @@ -133,6 +133,7 @@ defm : SLMWriteResPair; defm : SLMWriteResPair; defm : SLMWriteResPair; +defm : SLMWriteResPair; defm : SLMWriteResPair; defm : SLMWriteResPair; defm : SLMWriteResPair; Index: lib/Target/X86/X86ScheduleZnver1.td =================================================================== --- lib/Target/X86/X86ScheduleZnver1.td +++ lib/Target/X86/X86ScheduleZnver1.td @@ -209,6 +209,7 @@ defm : ZnWriteResFpuPair; defm : ZnWriteResFpuPair; defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; // FIXME defm : ZnWriteResFpuPair; defm : ZnWriteResFpuPair; defm : ZnWriteResFpuPair; Index: test/CodeGen/X86/avx2-schedule.ll =================================================================== --- test/CodeGen/X86/avx2-schedule.ll +++ test/CodeGen/X86/avx2-schedule.ll @@ -4911,7 +4911,7 @@ ; GENERIC-LABEL: test_pmulld: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpmulld %ymm1, %ymm0, %ymm0 # sched: [5:1.00] -; GENERIC-NEXT: vpmulld (%rdi), %ymm0, %ymm0 # sched: [9:1.00] +; GENERIC-NEXT: vpmulld (%rdi), %ymm0, %ymm0 # sched: [11:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pmulld: Index: test/CodeGen/X86/slow-pmulld.ll =================================================================== --- test/CodeGen/X86/slow-pmulld.ll +++ test/CodeGen/X86/slow-pmulld.ll @@ -1215,34 +1215,32 @@ define <16 x i32> @test_mul_v16i32_v16i16_minsize(<16 x i16> %A) minsize { ; SLM32-LABEL: test_mul_v16i32_v16i16_minsize: ; SLM32: # %bb.0: -; SLM32-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] -; SLM32-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; SLM32-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] -; SLM32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; SLM32-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; SLM32-NEXT: movdqa {{.*#+}} xmm5 = [18778,18778,18778,18778] +; SLM32-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] +; SLM32-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1] ; SLM32-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; SLM32-NEXT: movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778] -; SLM32-NEXT: pmulld %xmm1, %xmm4 -; SLM32-NEXT: pmulld %xmm1, %xmm0 -; SLM32-NEXT: pmulld %xmm1, %xmm2 -; SLM32-NEXT: pmulld %xmm1, %xmm3 -; SLM32-NEXT: movdqa %xmm4, %xmm1 +; SLM32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SLM32-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero +; SLM32-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero +; SLM32-NEXT: pmulld %xmm5, %xmm0 +; SLM32-NEXT: pmulld %xmm5, %xmm2 +; SLM32-NEXT: pmulld %xmm5, %xmm1 +; SLM32-NEXT: pmulld %xmm5, %xmm3 ; SLM32-NEXT: retl ; ; SLM64-LABEL: test_mul_v16i32_v16i16_minsize: ; SLM64: # %bb.0: -; SLM64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] -; SLM64-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; SLM64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] -; SLM64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; SLM64-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; SLM64-NEXT: movdqa {{.*#+}} xmm5 = [18778,18778,18778,18778] +; SLM64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] +; SLM64-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1] ; SLM64-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; SLM64-NEXT: movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778] -; SLM64-NEXT: pmulld %xmm1, %xmm4 -; SLM64-NEXT: pmulld %xmm1, %xmm0 -; SLM64-NEXT: pmulld %xmm1, %xmm2 -; SLM64-NEXT: pmulld %xmm1, %xmm3 -; SLM64-NEXT: movdqa %xmm4, %xmm1 +; SLM64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SLM64-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero +; SLM64-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero +; SLM64-NEXT: pmulld %xmm5, %xmm0 +; SLM64-NEXT: pmulld %xmm5, %xmm2 +; SLM64-NEXT: pmulld %xmm5, %xmm1 +; SLM64-NEXT: pmulld %xmm5, %xmm3 ; SLM64-NEXT: retq ; ; SLOW32-LABEL: test_mul_v16i32_v16i16_minsize: Index: test/CodeGen/X86/sse41-schedule.ll =================================================================== --- test/CodeGen/X86/sse41-schedule.ll +++ test/CodeGen/X86/sse41-schedule.ll @@ -4817,8 +4817,8 @@ ; ; SLM-LABEL: test_pmulld: ; SLM: # %bb.0: -; SLM-NEXT: pmulld %xmm1, %xmm0 # sched: [4:1.00] -; SLM-NEXT: pmulld (%rdi), %xmm0 # sched: [7:1.00] +; SLM-NEXT: pmulld %xmm1, %xmm0 # sched: [11:11.00] +; SLM-NEXT: pmulld (%rdi), %xmm0 # sched: [14:11.00] ; SLM-NEXT: retq # sched: [4:1.00] ; ; SANDY-SSE-LABEL: test_pmulld: @@ -4883,14 +4883,14 @@ ; ; BTVER2-SSE-LABEL: test_pmulld: ; BTVER2-SSE: # %bb.0: -; BTVER2-SSE-NEXT: pmulld %xmm1, %xmm0 # sched: [2:1.00] -; BTVER2-SSE-NEXT: pmulld (%rdi), %xmm0 # sched: [7:1.00] +; BTVER2-SSE-NEXT: pmulld %xmm1, %xmm0 # sched: [4:2.00] +; BTVER2-SSE-NEXT: pmulld (%rdi), %xmm0 # sched: [9:2.00] ; BTVER2-SSE-NEXT: retq # sched: [4:1.00] ; ; BTVER2-LABEL: test_pmulld: ; BTVER2: # %bb.0: -; BTVER2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 # sched: [2:1.00] -; BTVER2-NEXT: vpmulld (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; BTVER2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 # sched: [4:2.00] +; BTVER2-NEXT: vpmulld (%rdi), %xmm0, %xmm0 # sched: [9:2.00] ; BTVER2-NEXT: retq # sched: [4:1.00] ; ; ZNVER1-SSE-LABEL: test_pmulld: Index: test/tools/llvm-mca/X86/BtVer2/pipes-fpu.s =================================================================== --- test/tools/llvm-mca/X86/BtVer2/pipes-fpu.s +++ test/tools/llvm-mca/X86/BtVer2/pipes-fpu.s @@ -19,7 +19,7 @@ # CHECK: Iterations: 70 # CHECK-NEXT: Instructions: 560 -# CHECK-NEXT: Total Cycles: 4415 +# CHECK-NEXT: Total Cycles: 4416 # CHECK-NEXT: Dispatch Width: 2 # CHECK-NEXT: IPC: 0.13 @@ -33,7 +33,7 @@ # CHECK-NEXT: [6]: HasSideEffects # CHECK: [1] [2] [3] [4] [5] [6] Instructions: -# CHECK-NEXT: 1 2 1.00 vpmulld %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: 3 4 2.00 vpmulld %xmm0, %xmm1, %xmm2 # CHECK-NEXT: 1 1 0.50 vpand %xmm0, %xmm1, %xmm2 # CHECK-NEXT: 1 3 1.00 vcvttps2dq %xmm0, %xmm2 # CHECK-NEXT: 1 2 1.00 vpclmulqdq $0, %xmm0, %xmm1, %xmm2 @@ -62,12 +62,11 @@ # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] -# CHECK-NEXT: - - - 3.00 63.00 5.00 5.00 - - - 1.00 0.50 0.50 2.00 - +# CHECK-NEXT: - - - 3.00 63.00 6.01 5.99 - - - 1.00 1.00 1.00 3.00 # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions: -# CHECK-NEXT: - - - - - 1.00 - - - - - - - 1.00 vpmulld %xmm0, %xmm1, %xmm2 -# CHECK-NEXT: - - - - - - 1.00 - - - - 0.50 0.50 - vpand %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: - - - - - 2.00 1.00 - - - - 0.03 0.97 2.00 vpmulld %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: - - - - - 0.01 0.99 - - - - 0.97 0.03 - vpand %xmm0, %xmm1, %xmm2 # CHECK-NEXT: - - - - - - 1.00 - - - 1.00 - - - vcvttps2dq %xmm0, %xmm2 # CHECK-NEXT: - - - - - 1.00 - - - - - - - 1.00 vpclmulqdq $0, %xmm0, %xmm1, %xmm2 # CHECK-NEXT: - - - 1.00 - 1.00 - - - - - - - - vaddps %xmm0, %xmm1, %xmm2 @@ -80,20 +79,20 @@ # CHECK-NEXT: 0123456789 0123456789 0123456789 # CHECK-NEXT: Index 0123456789 0123456789 0123456789 01234567 -# CHECK: [0,0] DeeER. . . . . . . . . . . . . . vpmulld %xmm0, %xmm1, %xmm2 -# CHECK-NEXT: [0,1] DeE-R. . . . . . . . . . . . . . vpand %xmm0, %xmm1, %xmm2 -# CHECK-NEXT: [0,2] .DeeeER . . . . . . . . . . . . . vcvttps2dq %xmm0, %xmm2 -# CHECK-NEXT: [0,3] .DeeE-R . . . . . . . . . . . . . vpclmulqdq $0, %xmm0, %xmm1, %xmm2 -# CHECK-NEXT: [0,4] . DeeeER . . . . . . . . . . . . . vaddps %xmm0, %xmm1, %xmm2 -# CHECK-NEXT: [0,5] . DeeeeeeeeeeeeeeeeeeeeeER . . . . . . . . . vsqrtps %xmm0, %xmm2 -# CHECK-NEXT: [0,6] . DeeeE-----------------R . . . . . . . . . vaddps %ymm0, %ymm1, %ymm2 -# CHECK-NEXT: [0,7] . D===================eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeER vsqrtps %ymm0, %ymm2 +# CHECK: [0,0] DeeeeER . . . . . . . . . . . . . vpmulld %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: [0,1] .DeE--R . . . . . . . . . . . . . vpand %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: [0,2] . DeeeER . . . . . . . . . . . . . vcvttps2dq %xmm0, %xmm2 +# CHECK-NEXT: [0,3] . DeeE-R . . . . . . . . . . . . . vpclmulqdq $0, %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: [0,4] . DeeeER . . . . . . . . . . . . . vaddps %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: [0,5] . DeeeeeeeeeeeeeeeeeeeeeER . . . . . . . . . vsqrtps %xmm0, %xmm2 +# CHECK-NEXT: [0,6] . DeeeE-----------------R . . . . . . . . . vaddps %ymm0, %ymm1, %ymm2 +# CHECK-NEXT: [0,7] . D===================eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeER vsqrtps %ymm0, %ymm2 -# CHECK: [1,0] . DeeE----------------------------------------------------------R vpmulld %xmm0, %xmm1, %xmm2 -# CHECK-NEXT: [1,1] . DeE-----------------------------------------------------------R vpand %xmm0, %xmm1, %xmm2 -# CHECK-NEXT: [1,2] . .DeeeE--------------------------------------------------------R vcvttps2dq %xmm0, %xmm2 -# CHECK-NEXT: [1,3] . .DeeE---------------------------------------------------------R vpclmulqdq $0, %xmm0, %xmm1, %xmm2 -# CHECK-NEXT: [1,4] . . DeeeE-------------------------------------------------------R vaddps %xmm0, %xmm1, %xmm2 +# CHECK: [1,0] . .DeeeeE--------------------------------------------------------R vpmulld %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: [1,1] . . DeE----------------------------------------------------------R vpand %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: [1,2] . . DeeeE-------------------------------------------------------R vcvttps2dq %xmm0, %xmm2 +# CHECK-NEXT: [1,3] . . DeeE--------------------------------------------------------R vpclmulqdq $0, %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: [1,4] . . DeeeE------------------------------------------------------R vaddps %xmm0, %xmm1, %xmm2 # CHECK: Average Wait times (based on the timeline view): @@ -103,11 +102,11 @@ # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage # CHECK: [0] [1] [2] [3] -# CHECK-NEXT: 0. 2 1.0 1.0 29.0 vpmulld %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: 0. 2 1.0 1.0 28.0 vpmulld %xmm0, %xmm1, %xmm2 # CHECK-NEXT: 1. 2 1.0 1.0 30.0 vpand %xmm0, %xmm1, %xmm2 -# CHECK-NEXT: 2. 2 1.0 1.0 28.0 vcvttps2dq %xmm0, %xmm2 -# CHECK-NEXT: 3. 2 1.0 1.0 29.0 vpclmulqdq $0, %xmm0, %xmm1, %xmm2 -# CHECK-NEXT: 4. 2 1.0 1.0 27.5 vaddps %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: 2. 2 1.0 1.0 27.5 vcvttps2dq %xmm0, %xmm2 +# CHECK-NEXT: 3. 2 1.0 1.0 28.5 vpclmulqdq $0, %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: 4. 2 1.0 1.0 27.0 vaddps %xmm0, %xmm1, %xmm2 # CHECK-NEXT: 5. 1 1.0 1.0 0.0 vsqrtps %xmm0, %xmm2 # CHECK-NEXT: 6. 1 1.0 1.0 17.0 vaddps %ymm0, %ymm1, %ymm2 # CHECK-NEXT: 7. 1 20.0 20.0 0.0 vsqrtps %ymm0, %ymm2 Index: test/tools/llvm-mca/X86/BtVer2/resources-avx1.s =================================================================== --- test/tools/llvm-mca/X86/BtVer2/resources-avx1.s +++ test/tools/llvm-mca/X86/BtVer2/resources-avx1.s @@ -1518,8 +1518,8 @@ # CHECK-NEXT: 1 7 1.00 * vpmulhuw (%rax), %xmm1, %xmm2 # CHECK-NEXT: 1 2 1.00 vpmulhw %xmm0, %xmm1, %xmm2 # CHECK-NEXT: 1 7 1.00 * vpmulhw (%rax), %xmm1, %xmm2 -# CHECK-NEXT: 1 2 1.00 vpmulld %xmm0, %xmm1, %xmm2 -# CHECK-NEXT: 1 7 1.00 * vpmulld (%rax), %xmm1, %xmm2 +# CHECK-NEXT: 3 4 2.00 vpmulld %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: 3 9 2.00 * vpmulld (%rax), %xmm1, %xmm2 # CHECK-NEXT: 1 2 1.00 vpmullw %xmm0, %xmm1, %xmm2 # CHECK-NEXT: 1 7 1.00 * vpmullw (%rax), %xmm1, %xmm2 # CHECK-NEXT: 1 2 1.00 vpmuludq %xmm0, %xmm1, %xmm2 @@ -2221,8 +2221,8 @@ # CHECK-NEXT: - - - - - 1.00 - 1.00 - - - - - 1.00 vpmulhuw (%rax), %xmm1, %xmm2 # CHECK-NEXT: - - - - - 1.00 - - - - - - - 1.00 vpmulhw %xmm0, %xmm1, %xmm2 # CHECK-NEXT: - - - - - 1.00 - 1.00 - - - - - 1.00 vpmulhw (%rax), %xmm1, %xmm2 -# CHECK-NEXT: - - - - - 1.00 - - - - - - - 1.00 vpmulld %xmm0, %xmm1, %xmm2 -# CHECK-NEXT: - - - - - 1.00 - 1.00 - - - - - 1.00 vpmulld (%rax), %xmm1, %xmm2 +# CHECK-NEXT: - - - - - 2.50 0.50 - - - - 0.50 0.50 2.00 vpmulld %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: - - - - - 2.50 0.50 1.00 - - - 0.50 0.50 2.00 vpmulld (%rax), %xmm1, %xmm2 # CHECK-NEXT: - - - - - 1.00 - - - - - - - 1.00 vpmullw %xmm0, %xmm1, %xmm2 # CHECK-NEXT: - - - - - 1.00 - 1.00 - - - - - 1.00 vpmullw (%rax), %xmm1, %xmm2 # CHECK-NEXT: - - - - - 1.00 - - - - - - - 1.00 vpmuludq %xmm0, %xmm1, %xmm2 Index: test/tools/llvm-mca/X86/BtVer2/resources-sse41.s =================================================================== --- test/tools/llvm-mca/X86/BtVer2/resources-sse41.s +++ test/tools/llvm-mca/X86/BtVer2/resources-sse41.s @@ -247,8 +247,8 @@ # CHECK-NEXT: - - - - - 0.50 0.50 1.00 - - - 0.50 0.50 - pmovzxwq (%rax), %xmm2 # CHECK-NEXT: - - - - - 1.00 - - - - - - - 1.00 pmuldq %xmm0, %xmm2 # CHECK-NEXT: - - - - - 1.00 - 1.00 - - - - - 1.00 pmuldq (%rax), %xmm2 -# CHECK-NEXT: - - - - - 1.00 - - - - - - - 1.00 pmulld %xmm0, %xmm2 -# CHECK-NEXT: - - - - - 1.00 - 1.00 - - - - - 1.00 pmulld (%rax), %xmm2 +# CHECK-NEXT: - - - - - 2.50 0.50 - - - - 0.50 0.50 2.00 pmulld %xmm0, %xmm2 +# CHECK-NEXT: - - - - - 2.50 0.50 1.00 - - - 0.50 0.50 2.00 pmulld (%rax), %xmm2 # CHECK-NEXT: 1.00 - - 1.00 - 1.00 - - - - - - - - ptest %xmm0, %xmm1 # CHECK-NEXT: 1.00 - - 1.00 - 1.00 - 1.00 - - - - - - ptest (%rax), %xmm1 # CHECK-NEXT: - - - 1.00 - 1.00 - - - - - - - - roundpd $1, %xmm0, %xmm2