Index: lib/Target/X86/X86InstrAVX512.td =================================================================== --- lib/Target/X86/X86InstrAVX512.td +++ lib/Target/X86/X86InstrAVX512.td @@ -4505,7 +4505,7 @@ defm VPSUBUS : avx512_binop_rm_vl_bw<0xD8, 0xD9, "vpsubus", X86subus, SSE_INTALU_ITINS_P, HasBWI, 0>; defm VPMULLD : avx512_binop_rm_vl_d<0x40, "vpmulld", mul, - SSE_INTMUL_ITINS_P, HasAVX512, 1>, T8PD; + SSE_PMULLD_ITINS, HasAVX512, 1>, T8PD; defm VPMULLW : avx512_binop_rm_vl_w<0xD5, "vpmullw", mul, SSE_INTMUL_ITINS_P, HasBWI, 1>; defm VPMULLQ : avx512_binop_rm_vl_q<0x40, "vpmullq", mul, Index: lib/Target/X86/X86InstrSSE.td =================================================================== --- lib/Target/X86/X86InstrSSE.td +++ lib/Target/X86/X86InstrSSE.td @@ -195,7 +195,7 @@ IIC_SSE_MPSADBW_RR, IIC_SSE_MPSADBW_RM >; -let Sched = WriteVecIMul in +let Sched = WritePMULLD in def SSE_PMULLD_ITINS : OpndItins< IIC_SSE_PMULLD_RR, IIC_SSE_PMULLD_RM >; Index: lib/Target/X86/X86SchedBroadwell.td =================================================================== --- lib/Target/X86/X86SchedBroadwell.td +++ lib/Target/X86/X86SchedBroadwell.td @@ -163,6 +163,7 @@ defm : BWWriteResPair; // Vector integer ALU op, no logicals. defm : BWWriteResPair; // Vector integer shifts. defm : BWWriteResPair; // Vector integer multiply. +defm : BWWriteResPair; // PMULLD defm : BWWriteResPair; // Vector shuffles. defm : BWWriteResPair; // Vector blends. defm : BWWriteResPair; // Vector variable blends. 
Index: lib/Target/X86/X86SchedHaswell.td =================================================================== --- lib/Target/X86/X86SchedHaswell.td +++ lib/Target/X86/X86SchedHaswell.td @@ -159,6 +159,7 @@ defm : HWWriteResPair; defm : HWWriteResPair; defm : HWWriteResPair; +defm : HWWriteResPair; defm : HWWriteResPair; defm : HWWriteResPair; defm : HWWriteResPair; Index: lib/Target/X86/X86SchedSandyBridge.td =================================================================== --- lib/Target/X86/X86SchedSandyBridge.td +++ lib/Target/X86/X86SchedSandyBridge.td @@ -146,6 +146,7 @@ defm : SBWriteResPair; defm : SBWriteResPair; defm : SBWriteResPair; +defm : SBWriteResPair; // TODO this is probably wrong for 256/512-bit for the "generic" model defm : SBWriteResPair; defm : SBWriteResPair; defm : SBWriteResPair; @@ -667,7 +668,6 @@ "(V?)PMULHRSWrr", "(V?)PMULHUWrr", "(V?)PMULHWrr", - "(V?)PMULLDrr", "(V?)PMULLWrr", "(V?)PMULUDQrr", "(V?)PSADBWrr")>; @@ -1597,7 +1597,6 @@ "(V?)PMULHRSWrm", "(V?)PMULHUWrm", "(V?)PMULHWrm", - "(V?)PMULLDrm", "(V?)PMULLWrm", "(V?)PMULUDQrm", "(V?)PSADBWrm")>; Index: lib/Target/X86/X86SchedSkylakeClient.td =================================================================== --- lib/Target/X86/X86SchedSkylakeClient.td +++ lib/Target/X86/X86SchedSkylakeClient.td @@ -160,6 +160,7 @@ defm : SKLWriteResPair; // Vector integer ALU op, no logicals. defm : SKLWriteResPair; // Vector integer shifts. defm : SKLWriteResPair; // Vector integer multiply. +defm : SKLWriteResPair; defm : SKLWriteResPair; // Vector shuffles. defm : SKLWriteResPair; // Vector blends. defm : SKLWriteResPair; // Vector variable blends. Index: lib/Target/X86/X86SchedSkylakeServer.td =================================================================== --- lib/Target/X86/X86SchedSkylakeServer.td +++ lib/Target/X86/X86SchedSkylakeServer.td @@ -160,6 +160,7 @@ defm : SKXWriteResPair; // Vector integer ALU op, no logicals. defm : SKXWriteResPair; // Vector integer shifts. 
defm : SKXWriteResPair; // Vector integer multiply. +defm : SKXWriteResPair; // PMULLD defm : SKXWriteResPair; // Vector shuffles. defm : SKXWriteResPair; // Vector blends. defm : SKXWriteResPair; // Vector variable blends. Index: lib/Target/X86/X86Schedule.td =================================================================== --- lib/Target/X86/X86Schedule.td +++ lib/Target/X86/X86Schedule.td @@ -96,6 +96,7 @@ defm WriteVecALU : X86SchedWritePair; // Vector integer ALU op, no logicals. defm WriteVecShift : X86SchedWritePair; // Vector integer shifts. defm WriteVecIMul : X86SchedWritePair; // Vector integer multiply. +defm WritePMULLD : X86SchedWritePair; // PMULLD defm WriteShuffle : X86SchedWritePair; // Vector shuffles. defm WriteBlend : X86SchedWritePair; // Vector blends. defm WriteVarBlend : X86SchedWritePair; // Vector variable blends. Index: lib/Target/X86/X86ScheduleBtVer2.td =================================================================== --- lib/Target/X86/X86ScheduleBtVer2.td +++ lib/Target/X86/X86ScheduleBtVer2.td @@ -341,6 +341,7 @@ defm : JWriteResFpuPair; defm : JWriteResFpuPair; defm : JWriteResFpuPair; +defm : JWriteResFpuPair; defm : JWriteResFpuPair; defm : JWriteResFpuPair; defm : JWriteResFpuPair; Index: lib/Target/X86/X86ScheduleSLM.td =================================================================== --- lib/Target/X86/X86ScheduleSLM.td +++ lib/Target/X86/X86ScheduleSLM.td @@ -133,6 +133,7 @@ defm : SLMWriteResPair; defm : SLMWriteResPair; defm : SLMWriteResPair; +defm : SLMWriteResPair; defm : SLMWriteResPair; defm : SLMWriteResPair; defm : SLMWriteResPair; Index: lib/Target/X86/X86ScheduleZnver1.td =================================================================== --- lib/Target/X86/X86ScheduleZnver1.td +++ lib/Target/X86/X86ScheduleZnver1.td @@ -209,6 +209,7 @@ defm : ZnWriteResFpuPair; defm : ZnWriteResFpuPair; defm : ZnWriteResFpuPair; +defm : ZnWriteResFpuPair; // FIXME defm : ZnWriteResFpuPair; defm : 
ZnWriteResFpuPair; defm : ZnWriteResFpuPair; Index: test/CodeGen/X86/avx2-schedule.ll =================================================================== --- test/CodeGen/X86/avx2-schedule.ll +++ test/CodeGen/X86/avx2-schedule.ll @@ -4911,7 +4911,7 @@ ; GENERIC-LABEL: test_pmulld: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpmulld %ymm1, %ymm0, %ymm0 # sched: [5:1.00] -; GENERIC-NEXT: vpmulld (%rdi), %ymm0, %ymm0 # sched: [9:1.00] +; GENERIC-NEXT: vpmulld (%rdi), %ymm0, %ymm0 # sched: [11:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_pmulld: Index: test/CodeGen/X86/slow-pmulld.ll =================================================================== --- test/CodeGen/X86/slow-pmulld.ll +++ test/CodeGen/X86/slow-pmulld.ll @@ -1215,34 +1215,32 @@ define <16 x i32> @test_mul_v16i32_v16i16_minsize(<16 x i16> %A) minsize { ; SLM32-LABEL: test_mul_v16i32_v16i16_minsize: ; SLM32: # %bb.0: -; SLM32-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] -; SLM32-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; SLM32-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] -; SLM32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; SLM32-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; SLM32-NEXT: movdqa {{.*#+}} xmm5 = [18778,18778,18778,18778] +; SLM32-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] +; SLM32-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1] ; SLM32-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; SLM32-NEXT: movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778] -; SLM32-NEXT: pmulld %xmm1, %xmm4 -; SLM32-NEXT: pmulld %xmm1, %xmm0 -; SLM32-NEXT: pmulld %xmm1, %xmm2 -; SLM32-NEXT: pmulld %xmm1, %xmm3 -; SLM32-NEXT: movdqa %xmm4, %xmm1 +; SLM32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SLM32-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero +; SLM32-NEXT: pmovzxwd 
{{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero +; SLM32-NEXT: pmulld %xmm5, %xmm0 +; SLM32-NEXT: pmulld %xmm5, %xmm2 +; SLM32-NEXT: pmulld %xmm5, %xmm1 +; SLM32-NEXT: pmulld %xmm5, %xmm3 ; SLM32-NEXT: retl ; ; SLM64-LABEL: test_mul_v16i32_v16i16_minsize: ; SLM64: # %bb.0: -; SLM64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] -; SLM64-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; SLM64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] -; SLM64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; SLM64-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; SLM64-NEXT: movdqa {{.*#+}} xmm5 = [18778,18778,18778,18778] +; SLM64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] +; SLM64-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1] ; SLM64-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; SLM64-NEXT: movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778] -; SLM64-NEXT: pmulld %xmm1, %xmm4 -; SLM64-NEXT: pmulld %xmm1, %xmm0 -; SLM64-NEXT: pmulld %xmm1, %xmm2 -; SLM64-NEXT: pmulld %xmm1, %xmm3 -; SLM64-NEXT: movdqa %xmm4, %xmm1 +; SLM64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SLM64-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero +; SLM64-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero +; SLM64-NEXT: pmulld %xmm5, %xmm0 +; SLM64-NEXT: pmulld %xmm5, %xmm2 +; SLM64-NEXT: pmulld %xmm5, %xmm1 +; SLM64-NEXT: pmulld %xmm5, %xmm3 ; SLM64-NEXT: retq ; ; SLOW32-LABEL: test_mul_v16i32_v16i16_minsize: Index: test/CodeGen/X86/sse41-schedule.ll =================================================================== --- test/CodeGen/X86/sse41-schedule.ll +++ test/CodeGen/X86/sse41-schedule.ll @@ -4817,8 +4817,8 @@ ; ; SLM-LABEL: test_pmulld: ; SLM: # %bb.0: -; SLM-NEXT: pmulld %xmm1, %xmm0 # sched: [4:1.00] -; SLM-NEXT: pmulld (%rdi), %xmm0 # 
sched: [7:1.00] +; SLM-NEXT: pmulld %xmm1, %xmm0 # sched: [11:11.00] +; SLM-NEXT: pmulld (%rdi), %xmm0 # sched: [14:11.00] ; SLM-NEXT: retq # sched: [4:1.00] ; ; SANDY-SSE-LABEL: test_pmulld: @@ -4883,14 +4883,14 @@ ; ; BTVER2-SSE-LABEL: test_pmulld: ; BTVER2-SSE: # %bb.0: -; BTVER2-SSE-NEXT: pmulld %xmm1, %xmm0 # sched: [2:1.00] -; BTVER2-SSE-NEXT: pmulld (%rdi), %xmm0 # sched: [7:1.00] +; BTVER2-SSE-NEXT: pmulld %xmm1, %xmm0 # sched: [4:2.00] +; BTVER2-SSE-NEXT: pmulld (%rdi), %xmm0 # sched: [9:2.00] ; BTVER2-SSE-NEXT: retq # sched: [4:1.00] ; ; BTVER2-LABEL: test_pmulld: ; BTVER2: # %bb.0: -; BTVER2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 # sched: [2:1.00] -; BTVER2-NEXT: vpmulld (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; BTVER2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 # sched: [4:2.00] +; BTVER2-NEXT: vpmulld (%rdi), %xmm0, %xmm0 # sched: [9:2.00] ; BTVER2-NEXT: retq # sched: [4:1.00] ; ; ZNVER1-SSE-LABEL: test_pmulld: