diff --git a/llvm/lib/Target/X86/X86InstrArithmetic.td b/llvm/lib/Target/X86/X86InstrArithmetic.td --- a/llvm/lib/Target/X86/X86InstrArithmetic.td +++ b/llvm/lib/Target/X86/X86InstrArithmetic.td @@ -1503,7 +1503,7 @@ def rm : I<0xF6, MRMSrcMem, (outs RC:$dst1, RC:$dst2), (ins x86memop:$src), !strconcat(mnemonic, "\t{$src, $dst2, $dst1|$dst1, $dst2, $src}"), []>, T8XD, VEX_4V, - Sched<[sched.Folded, WriteIMulH, + Sched<[sched.Folded, WriteIMulHLd, // Memory operand. ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault, // Implicit read of EDX/RDX diff --git a/llvm/lib/Target/X86/X86SchedBroadwell.td b/llvm/lib/Target/X86/X86SchedBroadwell.td --- a/llvm/lib/Target/X86/X86SchedBroadwell.td +++ b/llvm/lib/Target/X86/X86SchedBroadwell.td @@ -149,7 +149,10 @@ defm : BWWriteResPair; defm : BWWriteResPair; defm : BWWriteResPair; -def : WriteRes { let Latency = 3; } +def BWWriteIMulH : WriteRes { let Latency = 3; } +def : WriteRes { + let Latency = !add(BWWriteIMulH.Latency, BroadwellModel.LoadLatency); +} defm : X86WriteRes; defm : X86WriteRes; diff --git a/llvm/lib/Target/X86/X86SchedHaswell.td b/llvm/lib/Target/X86/X86SchedHaswell.td --- a/llvm/lib/Target/X86/X86SchedHaswell.td +++ b/llvm/lib/Target/X86/X86SchedHaswell.td @@ -151,7 +151,10 @@ defm : HWWriteResPair; defm : HWWriteResPair; defm : HWWriteResPair; -def : WriteRes { let Latency = 3; } +def HWWriteIMulH : WriteRes { let Latency = 3; } +def : WriteRes { + let Latency = !add(HWWriteIMulH.Latency, HaswellModel.LoadLatency); +} defm : X86WriteRes; defm : X86WriteRes; diff --git a/llvm/lib/Target/X86/X86SchedSandyBridge.td b/llvm/lib/Target/X86/X86SchedSandyBridge.td --- a/llvm/lib/Target/X86/X86SchedSandyBridge.td +++ b/llvm/lib/Target/X86/X86SchedSandyBridge.td @@ -131,7 +131,10 @@ defm : SBWriteResPair; defm : SBWriteResPair; defm : SBWriteResPair; -def : WriteRes { let Latency = 3; } +def SBWriteIMulH : WriteRes { let Latency = 3; } +def : WriteRes { + let Latency = !add(SBWriteIMulH.Latency, 
SandyBridgeModel.LoadLatency); +} defm : X86WriteRes; defm : X86WriteRes; diff --git a/llvm/lib/Target/X86/X86SchedSkylakeClient.td b/llvm/lib/Target/X86/X86SchedSkylakeClient.td --- a/llvm/lib/Target/X86/X86SchedSkylakeClient.td +++ b/llvm/lib/Target/X86/X86SchedSkylakeClient.td @@ -129,7 +129,10 @@ defm : SKLWriteResPair; defm : SKLWriteResPair; defm : SKLWriteResPair; -def : WriteRes { let Latency = 3; } +def SKLWriteIMulH : WriteRes { let Latency = 3; } +def : WriteRes { + let Latency = !add(SKLWriteIMulH.Latency, SkylakeClientModel.LoadLatency); +} defm : X86WriteRes; defm : X86WriteRes; diff --git a/llvm/lib/Target/X86/X86SchedSkylakeServer.td b/llvm/lib/Target/X86/X86SchedSkylakeServer.td --- a/llvm/lib/Target/X86/X86SchedSkylakeServer.td +++ b/llvm/lib/Target/X86/X86SchedSkylakeServer.td @@ -130,7 +130,10 @@ defm : SKXWriteResPair; defm : SKXWriteResPair; defm : SKXWriteResPair; -def : WriteRes { let Latency = 3; } +def SKXWriteIMulH : WriteRes { let Latency = 3; } +def : WriteRes { + let Latency = !add(SKXWriteIMulH.Latency, SkylakeServerModel.LoadLatency); +} defm : X86WriteRes; defm : X86WriteRes; diff --git a/llvm/lib/Target/X86/X86Schedule.td b/llvm/lib/Target/X86/X86Schedule.td --- a/llvm/lib/Target/X86/X86Schedule.td +++ b/llvm/lib/Target/X86/X86Schedule.td @@ -150,7 +150,8 @@ defm WriteIMul64Reg : X86SchedWritePair; // Integer 64-bit multiplication by register. defm WriteMULX32 : X86SchedWritePair; // Integer 32-bit Multiplication without affecting flags. defm WriteMULX64 : X86SchedWritePair; // Integer 64-bit Multiplication without affecting flags. -def WriteIMulH : SchedWrite; // Integer multiplication, high part (only used by MULX). +def WriteIMulH : SchedWrite; // Integer multiplication, high part (only used by the RR variant of MULX). +def WriteIMulHLd : SchedWrite; // Integer multiplication, high part (only used by the RM variant of MULX). def WriteBSWAP32 : SchedWrite; // Byte Order (Endianness) 32-bit Swap. 
def WriteBSWAP64 : SchedWrite; // Byte Order (Endianness) 64-bit Swap. diff --git a/llvm/lib/Target/X86/X86ScheduleAtom.td b/llvm/lib/Target/X86/X86ScheduleAtom.td --- a/llvm/lib/Target/X86/X86ScheduleAtom.td +++ b/llvm/lib/Target/X86/X86ScheduleAtom.td @@ -91,6 +91,7 @@ defm : AtomWriteResPair; defm : AtomWriteResPair; defm : X86WriteResUnsupported; +defm : X86WriteResUnsupported; defm : X86WriteResPairUnsupported; defm : X86WriteResPairUnsupported; diff --git a/llvm/lib/Target/X86/X86ScheduleBdVer2.td b/llvm/lib/Target/X86/X86ScheduleBdVer2.td --- a/llvm/lib/Target/X86/X86ScheduleBdVer2.td +++ b/llvm/lib/Target/X86/X86ScheduleBdVer2.td @@ -438,6 +438,7 @@ // BMI2 MULX defm : X86WriteResUnsupported; +defm : X86WriteResUnsupported; defm : X86WriteResPairUnsupported; defm : X86WriteResPairUnsupported; diff --git a/llvm/lib/Target/X86/X86ScheduleBtVer2.td b/llvm/lib/Target/X86/X86ScheduleBtVer2.td --- a/llvm/lib/Target/X86/X86ScheduleBtVer2.td +++ b/llvm/lib/Target/X86/X86ScheduleBtVer2.td @@ -210,6 +210,7 @@ defm : JWriteResIntPair; defm : JWriteResIntPair; defm : X86WriteResUnsupported; +defm : X86WriteResUnsupported; defm : X86WriteResPairUnsupported; defm : X86WriteResPairUnsupported; diff --git a/llvm/lib/Target/X86/X86ScheduleSLM.td b/llvm/lib/Target/X86/X86ScheduleSLM.td --- a/llvm/lib/Target/X86/X86ScheduleSLM.td +++ b/llvm/lib/Target/X86/X86ScheduleSLM.td @@ -112,6 +112,7 @@ defm : SLMWriteResPair; defm : SLMWriteResPair; defm : X86WriteResUnsupported; +defm : X86WriteResUnsupported; defm : X86WriteResPairUnsupported; defm : X86WriteResPairUnsupported; diff --git a/llvm/lib/Target/X86/X86ScheduleZnver1.td b/llvm/lib/Target/X86/X86ScheduleZnver1.td --- a/llvm/lib/Target/X86/X86ScheduleZnver1.td +++ b/llvm/lib/Target/X86/X86ScheduleZnver1.td @@ -256,10 +256,14 @@ defm : ZnWriteResPair; // IMULH -def : WriteRes{ +def ZnWriteIMulH : WriteRes{ let Latency = 3; let NumMicroOps = 0; } +def : WriteRes { + let Latency = !add(ZnWriteIMulH.Latency, 
Znver1Model.LoadLatency); + let NumMicroOps = ZnWriteIMulH.NumMicroOps; +} // Floating point operations defm : X86WriteRes; diff --git a/llvm/lib/Target/X86/X86ScheduleZnver2.td b/llvm/lib/Target/X86/X86ScheduleZnver2.td --- a/llvm/lib/Target/X86/X86ScheduleZnver2.td +++ b/llvm/lib/Target/X86/X86ScheduleZnver2.td @@ -243,11 +243,17 @@ defm : Zn2WriteResPair; // IMULH -def : WriteRes{ +def Zn2WriteIMulH : WriteRes{ let Latency = 3; let NumMicroOps = 0; } +def : WriteRes{ + let Latency = !add(Zn2WriteIMulH.Latency, Znver2Model.LoadLatency); + let NumMicroOps = Zn2WriteIMulH.NumMicroOps; +} + + // Floating point operations defm : X86WriteRes; defm : X86WriteRes; diff --git a/llvm/lib/Target/X86/X86ScheduleZnver3.td b/llvm/lib/Target/X86/X86ScheduleZnver3.td --- a/llvm/lib/Target/X86/X86ScheduleZnver3.td +++ b/llvm/lib/Target/X86/X86ScheduleZnver3.td @@ -631,7 +631,7 @@ let ResourceCycles = [1, 1, 2]; let NumMicroOps = Zn3MULX32rr.NumMicroOps; } -def : InstRW<[Zn3MULX32rm, WriteIMulH, +def : InstRW<[Zn3MULX32rm, WriteIMulHLd, ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadAfterLd], (instrs MULX32rm)>; @@ -652,13 +652,14 @@ let ResourceCycles = [1, 1, 2]; let NumMicroOps = Zn3MULX64rr.NumMicroOps; } -def : InstRW<[Zn3MULX64rm, WriteIMulH, +def : InstRW<[Zn3MULX64rm, WriteIMulHLd, ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadAfterLd], (instrs MULX64rm)>; defm : Zn3WriteResIntPair; // Integer 64-bit multiplication by immediate. defm : Zn3WriteResIntPair; // Integer 64-bit multiplication by register. -defm : Zn3WriteResInt; // Integer multiplication, high part. +defm : Zn3WriteResInt; // Integer multiplication, high part. +defm : Zn3WriteResInt; // Integer multiplication, high part (RM variant of MULX). defm : Zn3WriteResInt; // Byte Order (Endianness) 32-bit Swap. defm : Zn3WriteResInt; // Byte Order (Endianness) 64-bit Swap. 
diff --git a/llvm/test/tools/llvm-mca/X86/Haswell/mulx-hi-read-advance.s b/llvm/test/tools/llvm-mca/X86/Haswell/mulx-hi-read-advance.s --- a/llvm/test/tools/llvm-mca/X86/Haswell/mulx-hi-read-advance.s +++ b/llvm/test/tools/llvm-mca/X86/Haswell/mulx-hi-read-advance.s @@ -63,7 +63,7 @@ # CHECK-NEXT: Index 0123456789 # CHECK: [0,0] DeeeeeeeeeER mulxl (%rdi), %eax, %ecx -# CHECK-NEXT: [0,1] .D==eE-----R addl %eax, %eax +# CHECK-NEXT: [0,1] .D=======eER addl %eax, %eax # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -73,8 +73,8 @@ # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 mulxl (%rdi), %eax, %ecx -# CHECK-NEXT: 1. 1 3.0 0.0 5.0 addl %eax, %eax -# CHECK-NEXT: 1 2.0 0.5 2.5 +# CHECK-NEXT: 1. 1 8.0 0.0 0.0 addl %eax, %eax +# CHECK-NEXT: 1 4.5 0.5 0.0 # CHECK: [1] Code Region @@ -126,7 +126,7 @@ # CHECK-NEXT: Index 0123456789 # CHECK: [0,0] DeeeeeeeeeER mulxq (%rdi), %rax, %rcx -# CHECK-NEXT: [0,1] .D==eE-----R addq %rax, %rax +# CHECK-NEXT: [0,1] .D=======eER addq %rax, %rax # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -136,5 +136,5 @@ # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 mulxq (%rdi), %rax, %rcx -# CHECK-NEXT: 1. 1 3.0 0.0 5.0 addq %rax, %rax -# CHECK-NEXT: 1 2.0 0.5 2.5 +# CHECK-NEXT: 1. 
1 8.0 0.0 0.0 addq %rax, %rax +# CHECK-NEXT: 1 4.5 0.5 0.0 diff --git a/llvm/test/tools/llvm-mca/X86/SkylakeClient/mulx-hi-read-advance.s b/llvm/test/tools/llvm-mca/X86/SkylakeClient/mulx-hi-read-advance.s --- a/llvm/test/tools/llvm-mca/X86/SkylakeClient/mulx-hi-read-advance.s +++ b/llvm/test/tools/llvm-mca/X86/SkylakeClient/mulx-hi-read-advance.s @@ -63,7 +63,7 @@ # CHECK-NEXT: Index 0123456789 # CHECK: [0,0] DeeeeeeeeeER mulxl (%rdi), %eax, %ecx -# CHECK-NEXT: [0,1] D===eE-----R addl %eax, %eax +# CHECK-NEXT: [0,1] D========eER addl %eax, %eax # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -73,8 +73,8 @@ # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 mulxl (%rdi), %eax, %ecx -# CHECK-NEXT: 1. 1 4.0 0.0 5.0 addl %eax, %eax -# CHECK-NEXT: 1 2.5 0.5 2.5 +# CHECK-NEXT: 1. 1 9.0 0.0 0.0 addl %eax, %eax +# CHECK-NEXT: 1 5.0 0.5 0.0 # CHECK: [1] Code Region @@ -126,7 +126,7 @@ # CHECK-NEXT: Index 0123456789 # CHECK: [0,0] DeeeeeeeeeER mulxq (%rdi), %rax, %rcx -# CHECK-NEXT: [0,1] D===eE-----R addq %rax, %rax +# CHECK-NEXT: [0,1] D========eER addq %rax, %rax # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -136,5 +136,5 @@ # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 mulxq (%rdi), %rax, %rcx -# CHECK-NEXT: 1. 1 4.0 0.0 5.0 addq %rax, %rax -# CHECK-NEXT: 1 2.5 0.5 2.5 +# CHECK-NEXT: 1. 
1 9.0 0.0 0.0 addq %rax, %rax +# CHECK-NEXT: 1 5.0 0.5 0.0 diff --git a/llvm/test/tools/llvm-mca/X86/Znver2/mulx-hi-read-advance.s b/llvm/test/tools/llvm-mca/X86/Znver2/mulx-hi-read-advance.s --- a/llvm/test/tools/llvm-mca/X86/Znver2/mulx-hi-read-advance.s +++ b/llvm/test/tools/llvm-mca/X86/Znver2/mulx-hi-read-advance.s @@ -17,12 +17,12 @@ # CHECK: Iterations: 1 # CHECK-NEXT: Instructions: 2 -# CHECK-NEXT: Total Cycles: 10 +# CHECK-NEXT: Total Cycles: 11 # CHECK-NEXT: Total uOps: 2 # CHECK: Dispatch Width: 4 -# CHECK-NEXT: uOps Per Cycle: 0.20 -# CHECK-NEXT: IPC: 0.20 +# CHECK-NEXT: uOps Per Cycle: 0.18 +# CHECK-NEXT: IPC: 0.18 # CHECK-NEXT: Block RThroughput: 2.0 # CHECK: Instruction Info: @@ -62,10 +62,11 @@ # CHECK-NEXT: - - - - - - 1.00 - - - - - - addl %eax, %eax # CHECK: Timeline view: +# CHECK-NEXT: 0 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeeeeER mulxl (%rdi), %eax, %ecx -# CHECK-NEXT: [0,1] D===eE---R addl %eax, %eax +# CHECK: [0,0] DeeeeeeeER. mulxl (%rdi), %eax, %ecx +# CHECK-NEXT: [0,1] D=======eER addl %eax, %eax # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -75,19 +76,19 @@ # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 mulxl (%rdi), %eax, %ecx -# CHECK-NEXT: 1. 1 4.0 0.0 3.0 addl %eax, %eax -# CHECK-NEXT: 1 2.5 0.5 1.5 +# CHECK-NEXT: 1. 
1 8.0 0.0 0.0 addl %eax, %eax +# CHECK-NEXT: 1 4.5 0.5 0.0 # CHECK: [1] Code Region # CHECK: Iterations: 1 # CHECK-NEXT: Instructions: 2 -# CHECK-NEXT: Total Cycles: 10 +# CHECK-NEXT: Total Cycles: 11 # CHECK-NEXT: Total uOps: 2 # CHECK: Dispatch Width: 4 -# CHECK-NEXT: uOps Per Cycle: 0.20 -# CHECK-NEXT: IPC: 0.20 +# CHECK-NEXT: uOps Per Cycle: 0.18 +# CHECK-NEXT: IPC: 0.18 # CHECK-NEXT: Block RThroughput: 2.0 # CHECK: Instruction Info: @@ -127,10 +128,11 @@ # CHECK-NEXT: - - - - - - 1.00 - - - - - - addq %rax, %rax # CHECK: Timeline view: +# CHECK-NEXT: 0 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeeeeER mulxq (%rdi), %rax, %rcx -# CHECK-NEXT: [0,1] D===eE---R addq %rax, %rax +# CHECK: [0,0] DeeeeeeeER. mulxq (%rdi), %rax, %rcx +# CHECK-NEXT: [0,1] D=======eER addq %rax, %rax # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -140,5 +142,5 @@ # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 mulxq (%rdi), %rax, %rcx -# CHECK-NEXT: 1. 1 4.0 0.0 3.0 addq %rax, %rax -# CHECK-NEXT: 1 2.5 0.5 1.5 +# CHECK-NEXT: 1. 
1 8.0 0.0 0.0 addq %rax, %rax +# CHECK-NEXT: 1 4.5 0.5 0.0 diff --git a/llvm/test/tools/llvm-mca/X86/Znver3/mulx-hi-read-advance.s b/llvm/test/tools/llvm-mca/X86/Znver3/mulx-hi-read-advance.s --- a/llvm/test/tools/llvm-mca/X86/Znver3/mulx-hi-read-advance.s +++ b/llvm/test/tools/llvm-mca/X86/Znver3/mulx-hi-read-advance.s @@ -17,12 +17,12 @@ # CHECK: Iterations: 1 # CHECK-NEXT: Instructions: 2 -# CHECK-NEXT: Total Cycles: 11 +# CHECK-NEXT: Total Cycles: 12 # CHECK-NEXT: Total uOps: 3 # CHECK: Dispatch Width: 6 -# CHECK-NEXT: uOps Per Cycle: 0.27 -# CHECK-NEXT: IPC: 0.18 +# CHECK-NEXT: uOps Per Cycle: 0.25 +# CHECK-NEXT: IPC: 0.17 # CHECK-NEXT: Block RThroughput: 2.0 # CHECK: Instruction Info: @@ -72,11 +72,11 @@ # CHECK-NEXT: - - - - - - 1.00 - - - - - - - - - - - - - - - - addl %eax, %eax # CHECK: Timeline view: -# CHECK-NEXT: 0 +# CHECK-NEXT: 01 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeeeeeER mulxl (%rdi), %eax, %ecx -# CHECK-NEXT: [0,1] D====eE---R addl %eax, %eax +# CHECK: [0,0] DeeeeeeeeER. mulxl (%rdi), %eax, %ecx +# CHECK-NEXT: [0,1] D========eER addl %eax, %eax # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -86,19 +86,19 @@ # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 mulxl (%rdi), %eax, %ecx -# CHECK-NEXT: 1. 1 5.0 0.0 3.0 addl %eax, %eax -# CHECK-NEXT: 1 3.0 0.5 1.5 +# CHECK-NEXT: 1. 
1 9.0 0.0 0.0 addl %eax, %eax +# CHECK-NEXT: 1 5.0 0.5 0.0 # CHECK: [1] Code Region # CHECK: Iterations: 1 # CHECK-NEXT: Instructions: 2 -# CHECK-NEXT: Total Cycles: 11 +# CHECK-NEXT: Total Cycles: 12 # CHECK-NEXT: Total uOps: 3 # CHECK: Dispatch Width: 6 -# CHECK-NEXT: uOps Per Cycle: 0.27 -# CHECK-NEXT: IPC: 0.18 +# CHECK-NEXT: uOps Per Cycle: 0.25 +# CHECK-NEXT: IPC: 0.17 # CHECK-NEXT: Block RThroughput: 2.0 # CHECK: Instruction Info: @@ -148,11 +148,11 @@ # CHECK-NEXT: - - - - - - 1.00 - - - - - - - - - - - - - - - - addq %rax, %rax # CHECK: Timeline view: -# CHECK-NEXT: 0 +# CHECK-NEXT: 01 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeeeeeER mulxq (%rdi), %rax, %rcx -# CHECK-NEXT: [0,1] D====eE---R addq %rax, %rax +# CHECK: [0,0] DeeeeeeeeER. mulxq (%rdi), %rax, %rcx +# CHECK-NEXT: [0,1] D========eER addq %rax, %rax # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -162,5 +162,5 @@ # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 mulxq (%rdi), %rax, %rcx -# CHECK-NEXT: 1. 1 5.0 0.0 3.0 addq %rax, %rax -# CHECK-NEXT: 1 3.0 0.5 1.5 +# CHECK-NEXT: 1. 1 9.0 0.0 0.0 addq %rax, %rax +# CHECK-NEXT: 1 5.0 0.5 0.0