diff --git a/llvm/lib/Target/X86/X86InstrArithmetic.td b/llvm/lib/Target/X86/X86InstrArithmetic.td --- a/llvm/lib/Target/X86/X86InstrArithmetic.td +++ b/llvm/lib/Target/X86/X86InstrArithmetic.td @@ -1502,8 +1502,12 @@ let mayLoad = 1 in def rm : I<0xF6, MRMSrcMem, (outs RC:$dst1, RC:$dst2), (ins x86memop:$src), !strconcat(mnemonic, "\t{$src, $dst2, $dst1|$dst1, $dst2, $src}"), - - []>, T8XD, VEX_4V, Sched<[sched.Folded, WriteIMulH]>; + []>, T8XD, VEX_4V, + Sched<[sched.Folded, WriteIMulH, + // Memory operand. + ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault, + // Implicit read of EDX/RDX + sched.ReadAfterFold]>; // Pseudo instructions to be used when the low result isn't used. The // instruction is defined to keep the high if both destinations are the same. @@ -1518,9 +1522,9 @@ let Predicates = [HasBMI2] in { let Uses = [EDX] in - defm MULX32 : bmi_mulx<"mulx{l}", GR32, i32mem, WriteIMul32>; + defm MULX32 : bmi_mulx<"mulx{l}", GR32, i32mem, WriteMULX32>; let Uses = [RDX] in - defm MULX64 : bmi_mulx<"mulx{q}", GR64, i64mem, WriteIMul64>, VEX_W; + defm MULX64 : bmi_mulx<"mulx{q}", GR64, i64mem, WriteMULX64>, VEX_W; } //===----------------------------------------------------------------------===// @@ -1547,7 +1551,12 @@ "adox{q}\t{$src2, $dst|$dst, $src2}", []>, T8XS; } // SchedRW - let mayLoad = 1, SchedRW = [WriteADC.Folded, WriteADC.ReadAfterFold] in { + let mayLoad = 1, + SchedRW = [WriteADC.Folded, WriteADC.ReadAfterFold, + // Memory operand. 
+ ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault, + // Implicit read of EFLAGS + WriteADC.ReadAfterFold] in { def ADCX32rm : I<0xF6, MRMSrcMem, (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2), "adcx{l}\t{$src2, $dst|$dst, $src2}", []>, T8PD; diff --git a/llvm/lib/Target/X86/X86SchedBroadwell.td b/llvm/lib/Target/X86/X86SchedBroadwell.td --- a/llvm/lib/Target/X86/X86SchedBroadwell.td +++ b/llvm/lib/Target/X86/X86SchedBroadwell.td @@ -123,9 +123,11 @@ defm : X86WriteRes; defm : BWWriteResPair; defm : BWWriteResPair; +defm : BWWriteResPair; defm : BWWriteResPair; defm : BWWriteResPair; defm : BWWriteResPair; +defm : BWWriteResPair; defm : BWWriteResPair; defm : BWWriteResPair; def : WriteRes { let Latency = 3; } diff --git a/llvm/lib/Target/X86/X86SchedHaswell.td b/llvm/lib/Target/X86/X86SchedHaswell.td --- a/llvm/lib/Target/X86/X86SchedHaswell.td +++ b/llvm/lib/Target/X86/X86SchedHaswell.td @@ -140,9 +140,11 @@ defm : X86WriteRes; defm : HWWriteResPair; defm : HWWriteResPair; +defm : HWWriteResPair; defm : HWWriteResPair; defm : HWWriteResPair; defm : HWWriteResPair; +defm : HWWriteResPair; defm : HWWriteResPair; defm : HWWriteResPair; def : WriteRes { let Latency = 3; } diff --git a/llvm/lib/Target/X86/X86SchedSandyBridge.td b/llvm/lib/Target/X86/X86SchedSandyBridge.td --- a/llvm/lib/Target/X86/X86SchedSandyBridge.td +++ b/llvm/lib/Target/X86/X86SchedSandyBridge.td @@ -124,9 +124,11 @@ defm : X86WriteRes; defm : SBWriteResPair; defm : SBWriteResPair; +defm : SBWriteResPair; defm : SBWriteResPair; defm : SBWriteResPair; defm : SBWriteResPair; +defm : SBWriteResPair; defm : SBWriteResPair; defm : SBWriteResPair; def : WriteRes { let Latency = 3; } diff --git a/llvm/lib/Target/X86/X86SchedSkylakeClient.td b/llvm/lib/Target/X86/X86SchedSkylakeClient.td --- a/llvm/lib/Target/X86/X86SchedSkylakeClient.td +++ b/llvm/lib/Target/X86/X86SchedSkylakeClient.td @@ -122,9 +122,11 @@ defm : X86WriteRes; defm : SKLWriteResPair; defm : SKLWriteResPair; +defm : 
SKLWriteResPair; defm : SKLWriteResPair; defm : SKLWriteResPair; defm : SKLWriteResPair; +defm : SKLWriteResPair; defm : SKLWriteResPair; defm : SKLWriteResPair; def : WriteRes { let Latency = 3; } diff --git a/llvm/lib/Target/X86/X86SchedSkylakeServer.td b/llvm/lib/Target/X86/X86SchedSkylakeServer.td --- a/llvm/lib/Target/X86/X86SchedSkylakeServer.td +++ b/llvm/lib/Target/X86/X86SchedSkylakeServer.td @@ -123,9 +123,11 @@ defm : X86WriteRes; defm : X86WriteRes; defm : SKXWriteResPair; +defm : SKXWriteResPair; defm : SKXWriteResPair; defm : SKXWriteResPair; defm : SKXWriteResPair; +defm : SKXWriteResPair; defm : SKXWriteResPair; defm : SKXWriteResPair; def : WriteRes { let Latency = 3; } diff --git a/llvm/lib/Target/X86/X86Schedule.td b/llvm/lib/Target/X86/X86Schedule.td --- a/llvm/lib/Target/X86/X86Schedule.td +++ b/llvm/lib/Target/X86/X86Schedule.td @@ -148,7 +148,9 @@ defm WriteIMul64 : X86SchedWritePair; // Integer 64-bit multiplication. defm WriteIMul64Imm : X86SchedWritePair; // Integer 64-bit multiplication by immediate. defm WriteIMul64Reg : X86SchedWritePair; // Integer 64-bit multiplication by register. -def WriteIMulH : SchedWrite; // Integer multiplication, high part. +defm WriteMULX32 : X86SchedWritePair; // Integer 32-bit Multiplication without affecting flags. +defm WriteMULX64 : X86SchedWritePair; // Integer 64-bit Multiplication without affecting flags. +def WriteIMulH : SchedWrite; // Integer multiplication, high part (only used by MULX). def WriteBSWAP32 : SchedWrite; // Byte Order (Endianness) 32-bit Swap. def WriteBSWAP64 : SchedWrite; // Byte Order (Endianness) 64-bit Swap. 
diff --git a/llvm/lib/Target/X86/X86ScheduleAtom.td b/llvm/lib/Target/X86/X86ScheduleAtom.td --- a/llvm/lib/Target/X86/X86ScheduleAtom.td +++ b/llvm/lib/Target/X86/X86ScheduleAtom.td @@ -91,6 +91,8 @@ defm : AtomWriteResPair; defm : AtomWriteResPair; defm : X86WriteResUnsupported; +defm : X86WriteResPairUnsupported; +defm : X86WriteResPairUnsupported; defm : X86WriteRes; defm : X86WriteRes; diff --git a/llvm/lib/Target/X86/X86ScheduleBdVer2.td b/llvm/lib/Target/X86/X86ScheduleBdVer2.td --- a/llvm/lib/Target/X86/X86ScheduleBdVer2.td +++ b/llvm/lib/Target/X86/X86ScheduleBdVer2.td @@ -435,7 +435,11 @@ defm : PdWriteResExPair; defm : PdWriteResExPair; defm : PdWriteResExPair; -defm : X86WriteResUnsupported; // BMI2 MULX + +// BMI2 MULX +defm : X86WriteResUnsupported; +defm : X86WriteResPairUnsupported; +defm : X86WriteResPairUnsupported; defm : PdWriteResExPair; defm : PdWriteResExPair; diff --git a/llvm/lib/Target/X86/X86ScheduleBtVer2.td b/llvm/lib/Target/X86/X86ScheduleBtVer2.td --- a/llvm/lib/Target/X86/X86ScheduleBtVer2.td +++ b/llvm/lib/Target/X86/X86ScheduleBtVer2.td @@ -209,7 +209,9 @@ defm : JWriteResIntPair; defm : JWriteResIntPair; defm : JWriteResIntPair; -defm : X86WriteRes; +defm : X86WriteResUnsupported; +defm : X86WriteResPairUnsupported; +defm : X86WriteResPairUnsupported; defm : JWriteResIntPair; defm : JWriteResIntPair; diff --git a/llvm/lib/Target/X86/X86ScheduleSLM.td b/llvm/lib/Target/X86/X86ScheduleSLM.td --- a/llvm/lib/Target/X86/X86ScheduleSLM.td +++ b/llvm/lib/Target/X86/X86ScheduleSLM.td @@ -111,7 +111,9 @@ defm : SLMWriteResPair; defm : SLMWriteResPair; defm : SLMWriteResPair; -def : WriteRes; +defm : X86WriteResUnsupported; +defm : X86WriteResPairUnsupported; +defm : X86WriteResPairUnsupported; defm : X86WriteRes; defm : X86WriteRes; diff --git a/llvm/lib/Target/X86/X86ScheduleZnver1.td b/llvm/lib/Target/X86/X86ScheduleZnver1.td --- a/llvm/lib/Target/X86/X86ScheduleZnver1.td +++ b/llvm/lib/Target/X86/X86ScheduleZnver1.td @@ -256,8 +256,9 @@ 
defm : ZnWriteResPair; // IMULH -def : WriteRes{ - let Latency = 4; +def : WriteRes{ + let Latency = 3; + let NumMicroOps = 0; } // Floating point operations @@ -659,32 +660,10 @@ } def : SchedAlias; -// MULX. -// r32,r32,r32. -def ZnWriteMulX32 : SchedWriteRes<[ZnALU1, ZnMultiplier]> { - let Latency = 3; - let ResourceCycles = [1, 2]; -} -def : InstRW<[ZnWriteMulX32], (instrs MULX32rr)>; - -// r32,r32,m32. -def ZnWriteMulX32Ld : SchedWriteRes<[ZnAGU, ZnALU1, ZnMultiplier]> { - let Latency = 8; - let ResourceCycles = [1, 2, 2]; -} -def : InstRW<[ZnWriteMulX32Ld, ReadAfterLd], (instrs MULX32rm)>; - -// r64,r64,r64. -def ZnWriteMulX64 : SchedWriteRes<[ZnALU1]> { - let Latency = 3; -} -def : InstRW<[ZnWriteMulX64], (instrs MULX64rr)>; - -// r64,r64,m64. -def ZnWriteMulX64Ld : SchedWriteRes<[ZnAGU, ZnALU1, ZnMultiplier]> { - let Latency = 8; -} -def : InstRW<[ZnWriteMulX64Ld, ReadAfterLd], (instrs MULX64rm)>; +// MULX +// Numbers are based on the AMD SOG for Family 17h - Instruction Latencies. +defm : ZnWriteResPair; +defm : ZnWriteResPair; //-- Control transfer instructions --// diff --git a/llvm/lib/Target/X86/X86ScheduleZnver2.td b/llvm/lib/Target/X86/X86ScheduleZnver2.td --- a/llvm/lib/Target/X86/X86ScheduleZnver2.td +++ b/llvm/lib/Target/X86/X86ScheduleZnver2.td @@ -243,8 +243,9 @@ defm : Zn2WriteResPair; // IMULH -def : WriteRes{ - let Latency = 4; +def : WriteRes{ + let Latency = 3; + let NumMicroOps = 0; } // Floating point operations @@ -658,31 +659,9 @@ def : SchedAlias; // MULX. -// r32,r32,r32. -def Zn2WriteMulX32 : SchedWriteRes<[Zn2ALU1, Zn2Multiplier]> { - let Latency = 3; - let ResourceCycles = [1, 2]; -} -def : InstRW<[Zn2WriteMulX32], (instrs MULX32rr)>; - -// r32,r32,m32. -def Zn2WriteMulX32Ld : SchedWriteRes<[Zn2AGU, Zn2ALU1, Zn2Multiplier]> { - let Latency = 7; - let ResourceCycles = [1, 2, 2]; -} -def : InstRW<[Zn2WriteMulX32Ld, ReadAfterLd], (instrs MULX32rm)>; - -// r64,r64,r64. 
-def Zn2WriteMulX64 : SchedWriteRes<[Zn2ALU1]> { - let Latency = 3; -} -def : InstRW<[Zn2WriteMulX64], (instrs MULX64rr)>; - -// r64,r64,m64. -def Zn2WriteMulX64Ld : SchedWriteRes<[Zn2AGU, Zn2ALU1, Zn2Multiplier]> { - let Latency = 7; -} -def : InstRW<[Zn2WriteMulX64Ld, ReadAfterLd], (instrs MULX64rm)>; +// Numbers are based on the AMD SOG for Family 17h - Instruction Latencies. +defm : Zn2WriteResPair; +defm : Zn2WriteResPair; //-- Control transfer instructions --// diff --git a/llvm/lib/Target/X86/X86ScheduleZnver3.td b/llvm/lib/Target/X86/X86ScheduleZnver3.td --- a/llvm/lib/Target/X86/X86ScheduleZnver3.td +++ b/llvm/lib/Target/X86/X86ScheduleZnver3.td @@ -617,6 +617,7 @@ defm : Zn3WriteResIntPair; // Integer 16-bit multiplication by immediate. defm : Zn3WriteResIntPair; // Integer 16-bit multiplication by register. defm : Zn3WriteResIntPair; // Integer 32-bit multiplication. +defm : Zn3WriteResIntPair; // Integer 32-bit Unsigned Multiply Without Affecting Flags. def Zn3MULX32rr : SchedWriteRes<[Zn3Multiplier]> { let Latency = 4; @@ -630,11 +631,14 @@ let ResourceCycles = [1, 1, 2]; let NumMicroOps = Zn3MULX32rr.NumMicroOps; } -def : InstRW<[Zn3MULX32rm, WriteIMulH], (instrs MULX32rm)>; +def : InstRW<[Zn3MULX32rm, WriteIMulH, + ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault, + ReadAfterLd], (instrs MULX32rm)>; defm : Zn3WriteResIntPair; // Integer 32-bit multiplication by immediate. defm : Zn3WriteResIntPair; // Integer 32-bit multiplication by register. defm : Zn3WriteResIntPair; // Integer 64-bit multiplication. +defm : Zn3WriteResIntPair; // Integer 64-bit Unsigned Multiply Without Affecting Flags. 
def Zn3MULX64rr : SchedWriteRes<[Zn3Multiplier]> { let Latency = 4; @@ -648,7 +652,9 @@ let ResourceCycles = [1, 1, 2]; let NumMicroOps = Zn3MULX64rr.NumMicroOps; } -def : InstRW<[Zn3MULX64rm, WriteIMulH], (instrs MULX64rm)>; +def : InstRW<[Zn3MULX64rm, WriteIMulH, + ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault, + ReadAfterLd], (instrs MULX64rm)>; defm : Zn3WriteResIntPair; // Integer 64-bit multiplication by immediate. defm : Zn3WriteResIntPair; // Integer 64-bit multiplication by register. diff --git a/llvm/test/tools/llvm-mca/X86/Haswell/adcx-adox-read-advance.s b/llvm/test/tools/llvm-mca/X86/Haswell/adcx-adox-read-advance.s --- a/llvm/test/tools/llvm-mca/X86/Haswell/adcx-adox-read-advance.s +++ b/llvm/test/tools/llvm-mca/X86/Haswell/adcx-adox-read-advance.s @@ -15,12 +15,12 @@ # CHECK: Iterations: 2 # CHECK-NEXT: Instructions: 2 -# CHECK-NEXT: Total Cycles: 17 +# CHECK-NEXT: Total Cycles: 12 # CHECK-NEXT: Total uOps: 6 # CHECK: Dispatch Width: 4 -# CHECK-NEXT: uOps Per Cycle: 0.35 -# CHECK-NEXT: IPC: 0.12 +# CHECK-NEXT: uOps Per Cycle: 0.50 +# CHECK-NEXT: IPC: 0.17 # CHECK-NEXT: Block RThroughput: 0.8 # CHECK: Instruction Info: @@ -55,11 +55,11 @@ # CHECK-NEXT: - - 0.50 0.50 0.50 0.50 - 0.50 0.50 - adcxq (%rdi), %rcx # CHECK: Timeline view: -# CHECK-NEXT: 0123456 +# CHECK-NEXT: 01 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeeeeER. .. adcxq (%rdi), %rcx -# CHECK-NEXT: [1,0] .D======eeeeeeeER adcxq (%rdi), %rcx +# CHECK: [0,0] DeeeeeeeER.. adcxq (%rdi), %rcx +# CHECK-NEXT: [1,0] .D=eeeeeeeER adcxq (%rdi), %rcx # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -68,18 +68,18 @@ # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage # CHECK: [0] [1] [2] [3] -# CHECK-NEXT: 0. 2 4.0 0.5 0.0 adcxq (%rdi), %rcx +# CHECK-NEXT: 0. 
2 1.5 0.5 0.0 adcxq (%rdi), %rcx # CHECK: [1] Code Region # CHECK: Iterations: 2 # CHECK-NEXT: Instructions: 2 -# CHECK-NEXT: Total Cycles: 17 +# CHECK-NEXT: Total Cycles: 12 # CHECK-NEXT: Total uOps: 6 # CHECK: Dispatch Width: 4 -# CHECK-NEXT: uOps Per Cycle: 0.35 -# CHECK-NEXT: IPC: 0.12 +# CHECK-NEXT: uOps Per Cycle: 0.50 +# CHECK-NEXT: IPC: 0.17 # CHECK-NEXT: Block RThroughput: 0.8 # CHECK: Instruction Info: @@ -114,11 +114,11 @@ # CHECK-NEXT: - - 0.50 0.50 0.50 0.50 - 0.50 0.50 - adoxq (%rdi), %rcx # CHECK: Timeline view: -# CHECK-NEXT: 0123456 +# CHECK-NEXT: 01 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeeeeER. .. adoxq (%rdi), %rcx -# CHECK-NEXT: [1,0] .D======eeeeeeeER adoxq (%rdi), %rcx +# CHECK: [0,0] DeeeeeeeER.. adoxq (%rdi), %rcx +# CHECK-NEXT: [1,0] .D=eeeeeeeER adoxq (%rdi), %rcx # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -127,4 +127,4 @@ # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage # CHECK: [0] [1] [2] [3] -# CHECK-NEXT: 0. 2 4.0 0.5 0.0 adoxq (%rdi), %rcx +# CHECK-NEXT: 0. 
2 1.5 0.5 0.0 adoxq (%rdi), %rcx diff --git a/llvm/test/tools/llvm-mca/X86/Haswell/mulx-read-advance.s b/llvm/test/tools/llvm-mca/X86/Haswell/mulx-read-advance.s --- a/llvm/test/tools/llvm-mca/X86/Haswell/mulx-read-advance.s +++ b/llvm/test/tools/llvm-mca/X86/Haswell/mulx-read-advance.s @@ -15,12 +15,12 @@ # CHECK: Iterations: 2 # CHECK-NEXT: Instructions: 2 -# CHECK-NEXT: Total Cycles: 21 +# CHECK-NEXT: Total Cycles: 16 # CHECK-NEXT: Total uOps: 10 # CHECK: Dispatch Width: 4 -# CHECK-NEXT: uOps Per Cycle: 0.48 -# CHECK-NEXT: IPC: 0.10 +# CHECK-NEXT: uOps Per Cycle: 0.63 +# CHECK-NEXT: IPC: 0.13 # CHECK-NEXT: Block RThroughput: 1.3 # CHECK: Instruction Info: @@ -55,11 +55,11 @@ # CHECK-NEXT: - - 0.50 1.00 0.50 0.50 - 0.50 1.00 - mulxl (%rdi), %eax, %edx # CHECK: Timeline view: -# CHECK-NEXT: 0123456789 -# CHECK-NEXT: Index 0123456789 0 +# CHECK-NEXT: 012345 +# CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeeeeeeER . . mulxl (%rdi), %eax, %edx -# CHECK-NEXT: [1,0] . D=======eeeeeeeeeER mulxl (%rdi), %eax, %edx +# CHECK: [0,0] DeeeeeeeeeER . mulxl (%rdi), %eax, %edx +# CHECK-NEXT: [1,0] . D==eeeeeeeeeER mulxl (%rdi), %eax, %edx # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -68,18 +68,18 @@ # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage # CHECK: [0] [1] [2] [3] -# CHECK-NEXT: 0. 2 4.5 0.5 0.0 mulxl (%rdi), %eax, %edx +# CHECK-NEXT: 0. 
2 2.0 0.5 0.0 mulxl (%rdi), %eax, %edx # CHECK: [1] Code Region # CHECK: Iterations: 2 # CHECK-NEXT: Instructions: 2 -# CHECK-NEXT: Total Cycles: 21 +# CHECK-NEXT: Total Cycles: 16 # CHECK-NEXT: Total uOps: 8 # CHECK: Dispatch Width: 4 -# CHECK-NEXT: uOps Per Cycle: 0.38 -# CHECK-NEXT: IPC: 0.10 +# CHECK-NEXT: uOps Per Cycle: 0.50 +# CHECK-NEXT: IPC: 0.13 # CHECK-NEXT: Block RThroughput: 1.0 # CHECK: Instruction Info: @@ -114,11 +114,11 @@ # CHECK-NEXT: - - - 1.00 0.50 0.50 - - 1.00 - mulxq (%rdi), %rax, %rdx # CHECK: Timeline view: -# CHECK-NEXT: 0123456789 -# CHECK-NEXT: Index 0123456789 0 +# CHECK-NEXT: 012345 +# CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeeeeeeER . . mulxq (%rdi), %rax, %rdx -# CHECK-NEXT: [1,0] .D========eeeeeeeeeER mulxq (%rdi), %rax, %rdx +# CHECK: [0,0] DeeeeeeeeeER . mulxq (%rdi), %rax, %rdx +# CHECK-NEXT: [1,0] .D===eeeeeeeeeER mulxq (%rdi), %rax, %rdx # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -127,4 +127,4 @@ # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage # CHECK: [0] [1] [2] [3] -# CHECK-NEXT: 0. 2 5.0 0.5 0.0 mulxq (%rdi), %rax, %rdx +# CHECK-NEXT: 0. 
2 2.5 0.5 0.0 mulxq (%rdi), %rax, %rdx diff --git a/llvm/test/tools/llvm-mca/X86/Znver1/resources-bmi2.s b/llvm/test/tools/llvm-mca/X86/Znver1/resources-bmi2.s --- a/llvm/test/tools/llvm-mca/X86/Znver1/resources-bmi2.s +++ b/llvm/test/tools/llvm-mca/X86/Znver1/resources-bmi2.s @@ -64,8 +64,8 @@ # CHECK-NEXT: 2 5 0.50 * bzhiq %rax, (%rbx), %rcx # CHECK-NEXT: 1 3 2.00 mulxl %eax, %ebx, %ecx # CHECK-NEXT: 1 8 2.00 * mulxl (%rax), %ebx, %ecx -# CHECK-NEXT: 1 3 1.00 mulxq %rax, %rbx, %rcx -# CHECK-NEXT: 1 8 1.00 * mulxq (%rax), %rbx, %rcx +# CHECK-NEXT: 1 3 2.00 mulxq %rax, %rbx, %rcx +# CHECK-NEXT: 1 8 2.00 * mulxq (%rax), %rbx, %rcx # CHECK-NEXT: 1 100 0.25 pdepl %eax, %ebx, %ecx # CHECK-NEXT: 1 100 0.25 * pdepl (%rax), %ebx, %ecx # CHECK-NEXT: 1 100 0.25 pdepq %rax, %rbx, %rcx @@ -107,7 +107,7 @@ # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] -# CHECK-NEXT: 6.00 6.00 5.00 10.00 5.00 5.00 - - - - - 5.00 +# CHECK-NEXT: 6.00 6.00 5.00 9.00 5.00 5.00 - - - - - 8.00 # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] Instructions: @@ -116,9 +116,9 @@ # CHECK-NEXT: - - 0.25 0.25 0.25 0.25 - - - - - - bzhiq %rax, %rbx, %rcx # CHECK-NEXT: 0.50 0.50 0.25 0.25 0.25 0.25 - - - - - - bzhiq %rax, (%rbx), %rcx # CHECK-NEXT: - - - 1.00 - - - - - - - 2.00 mulxl %eax, %ebx, %ecx -# CHECK-NEXT: 0.50 0.50 - 2.00 - - - - - - - 2.00 mulxl (%rax), %ebx, %ecx -# CHECK-NEXT: - - - 1.00 - - - - - - - - mulxq %rax, %rbx, %rcx -# CHECK-NEXT: 0.50 0.50 - 1.00 - - - - - - - 1.00 mulxq (%rax), %rbx, %rcx +# CHECK-NEXT: 0.50 0.50 - 1.00 - - - - - - - 2.00 mulxl (%rax), %ebx, %ecx +# CHECK-NEXT: - - - 1.00 - - - - - - - 2.00 mulxq %rax, %rbx, %rcx +# CHECK-NEXT: 0.50 0.50 - 1.00 - - - - - - - 2.00 mulxq (%rax), %rbx, %rcx # CHECK-NEXT: - - - - - - - - - - - - pdepl %eax, %ebx, %ecx # CHECK-NEXT: - - - - - - - - - - - - pdepl (%rax), %ebx, %ecx # CHECK-NEXT: - - - - - - - - - - - - pdepq 
%rax, %rbx, %rcx diff --git a/llvm/test/tools/llvm-mca/X86/Znver2/adcx-adox-read-advance.s b/llvm/test/tools/llvm-mca/X86/Znver2/adcx-adox-read-advance.s --- a/llvm/test/tools/llvm-mca/X86/Znver2/adcx-adox-read-advance.s +++ b/llvm/test/tools/llvm-mca/X86/Znver2/adcx-adox-read-advance.s @@ -15,12 +15,12 @@ # CHECK: Iterations: 2 # CHECK-NEXT: Instructions: 2 -# CHECK-NEXT: Total Cycles: 13 +# CHECK-NEXT: Total Cycles: 9 # CHECK-NEXT: Total uOps: 4 # CHECK: Dispatch Width: 4 -# CHECK-NEXT: uOps Per Cycle: 0.31 -# CHECK-NEXT: IPC: 0.15 +# CHECK-NEXT: uOps Per Cycle: 0.44 +# CHECK-NEXT: IPC: 0.22 # CHECK-NEXT: Block RThroughput: 0.5 # CHECK: Instruction Info: @@ -58,11 +58,10 @@ # CHECK-NEXT: - 0.50 0.50 - - 0.50 0.50 - - - - - - adcxq (%rdi), %rcx # CHECK: Timeline view: -# CHECK-NEXT: 012 -# CHECK-NEXT: Index 0123456789 +# CHECK-NEXT: Index 012345678 -# CHECK: [0,0] DeeeeeER . . adcxq (%rdi), %rcx -# CHECK-NEXT: [1,0] D=====eeeeeER adcxq (%rdi), %rcx +# CHECK: [0,0] DeeeeeER. adcxq (%rdi), %rcx +# CHECK-NEXT: [1,0] D=eeeeeER adcxq (%rdi), %rcx # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -71,18 +70,18 @@ # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage # CHECK: [0] [1] [2] [3] -# CHECK-NEXT: 0. 2 3.5 0.5 0.0 adcxq (%rdi), %rcx +# CHECK-NEXT: 0. 2 1.5 0.5 0.0 adcxq (%rdi), %rcx # CHECK: [1] Code Region # CHECK: Iterations: 2 # CHECK-NEXT: Instructions: 2 -# CHECK-NEXT: Total Cycles: 13 +# CHECK-NEXT: Total Cycles: 9 # CHECK-NEXT: Total uOps: 4 # CHECK: Dispatch Width: 4 -# CHECK-NEXT: uOps Per Cycle: 0.31 -# CHECK-NEXT: IPC: 0.15 +# CHECK-NEXT: uOps Per Cycle: 0.44 +# CHECK-NEXT: IPC: 0.22 # CHECK-NEXT: Block RThroughput: 0.5 # CHECK: Instruction Info: @@ -120,11 +119,10 @@ # CHECK-NEXT: - 0.50 0.50 - - 0.50 0.50 - - - - - - adoxq (%rdi), %rcx # CHECK: Timeline view: -# CHECK-NEXT: 012 -# CHECK-NEXT: Index 0123456789 +# CHECK-NEXT: Index 012345678 -# CHECK: [0,0] DeeeeeER . . 
adoxq (%rdi), %rcx -# CHECK-NEXT: [1,0] D=====eeeeeER adoxq (%rdi), %rcx +# CHECK: [0,0] DeeeeeER. adoxq (%rdi), %rcx +# CHECK-NEXT: [1,0] D=eeeeeER adoxq (%rdi), %rcx # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -133,4 +131,4 @@ # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage # CHECK: [0] [1] [2] [3] -# CHECK-NEXT: 0. 2 3.5 0.5 0.0 adoxq (%rdi), %rcx +# CHECK-NEXT: 0. 2 1.5 0.5 0.0 adoxq (%rdi), %rcx diff --git a/llvm/test/tools/llvm-mca/X86/Znver2/mulx-read-advance.s b/llvm/test/tools/llvm-mca/X86/Znver2/mulx-read-advance.s --- a/llvm/test/tools/llvm-mca/X86/Znver2/mulx-read-advance.s +++ b/llvm/test/tools/llvm-mca/X86/Znver2/mulx-read-advance.s @@ -15,12 +15,12 @@ # CHECK: Iterations: 2 # CHECK-NEXT: Instructions: 2 -# CHECK-NEXT: Total Cycles: 17 +# CHECK-NEXT: Total Cycles: 13 # CHECK-NEXT: Total uOps: 2 # CHECK: Dispatch Width: 4 -# CHECK-NEXT: uOps Per Cycle: 0.12 -# CHECK-NEXT: IPC: 0.12 +# CHECK-NEXT: uOps Per Cycle: 0.15 +# CHECK-NEXT: IPC: 0.15 # CHECK-NEXT: Block RThroughput: 2.0 # CHECK: Instruction Info: @@ -51,18 +51,18 @@ # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] -# CHECK-NEXT: - 0.50 0.50 - 2.00 - - - - - - - 2.00 +# CHECK-NEXT: - 0.50 0.50 - 1.00 - - - - - - - 2.00 # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] Instructions: -# CHECK-NEXT: - 0.50 0.50 - 2.00 - - - - - - - 2.00 mulxl (%rdi), %eax, %edx +# CHECK-NEXT: - 0.50 0.50 - 1.00 - - - - - - - 2.00 mulxl (%rdi), %eax, %edx # CHECK: Timeline view: -# CHECK-NEXT: 0123456 +# CHECK-NEXT: 012 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeeeeER. .. mulxl (%rdi), %eax, %edx -# CHECK-NEXT: [1,0] D=======eeeeeeeER mulxl (%rdi), %eax, %edx +# CHECK: [0,0] DeeeeeeeER. . 
mulxl (%rdi), %eax, %edx +# CHECK-NEXT: [1,0] D===eeeeeeeER mulxl (%rdi), %eax, %edx # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -71,19 +71,19 @@ # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage # CHECK: [0] [1] [2] [3] -# CHECK-NEXT: 0. 2 4.5 0.5 0.0 mulxl (%rdi), %eax, %edx +# CHECK-NEXT: 0. 2 2.5 0.5 0.0 mulxl (%rdi), %eax, %edx # CHECK: [1] Code Region # CHECK: Iterations: 2 # CHECK-NEXT: Instructions: 2 -# CHECK-NEXT: Total Cycles: 17 +# CHECK-NEXT: Total Cycles: 13 # CHECK-NEXT: Total uOps: 2 # CHECK: Dispatch Width: 4 -# CHECK-NEXT: uOps Per Cycle: 0.12 -# CHECK-NEXT: IPC: 0.12 -# CHECK-NEXT: Block RThroughput: 1.0 +# CHECK-NEXT: uOps Per Cycle: 0.15 +# CHECK-NEXT: IPC: 0.15 +# CHECK-NEXT: Block RThroughput: 2.0 # CHECK: Instruction Info: # CHECK-NEXT: [1]: #uOps @@ -94,7 +94,7 @@ # CHECK-NEXT: [6]: HasSideEffects (U) # CHECK: [1] [2] [3] [4] [5] [6] Instructions: -# CHECK-NEXT: 1 7 1.00 * mulxq (%rdi), %rax, %rdx +# CHECK-NEXT: 1 7 2.00 * mulxq (%rdi), %rax, %rdx # CHECK: Resources: # CHECK-NEXT: [0] - Zn2AGU0 @@ -113,18 +113,18 @@ # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] -# CHECK-NEXT: - 0.50 0.50 - 1.00 - - - - - - - 1.00 +# CHECK-NEXT: - 0.50 0.50 - 1.00 - - - - - - - 2.00 # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] Instructions: -# CHECK-NEXT: - 0.50 0.50 - 1.00 - - - - - - - 1.00 mulxq (%rdi), %rax, %rdx +# CHECK-NEXT: - 0.50 0.50 - 1.00 - - - - - - - 2.00 mulxq (%rdi), %rax, %rdx # CHECK: Timeline view: -# CHECK-NEXT: 0123456 +# CHECK-NEXT: 012 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeeeeER. .. mulxq (%rdi), %rax, %rdx -# CHECK-NEXT: [1,0] D=======eeeeeeeER mulxq (%rdi), %rax, %rdx +# CHECK: [0,0] DeeeeeeeER. . 
mulxq (%rdi), %rax, %rdx +# CHECK-NEXT: [1,0] D===eeeeeeeER mulxq (%rdi), %rax, %rdx # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -133,4 +133,4 @@ # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage # CHECK: [0] [1] [2] [3] -# CHECK-NEXT: 0. 2 4.5 0.5 0.0 mulxq (%rdi), %rax, %rdx +# CHECK-NEXT: 0. 2 2.5 0.5 0.0 mulxq (%rdi), %rax, %rdx diff --git a/llvm/test/tools/llvm-mca/X86/Znver2/resources-bmi2.s b/llvm/test/tools/llvm-mca/X86/Znver2/resources-bmi2.s --- a/llvm/test/tools/llvm-mca/X86/Znver2/resources-bmi2.s +++ b/llvm/test/tools/llvm-mca/X86/Znver2/resources-bmi2.s @@ -64,8 +64,8 @@ # CHECK-NEXT: 2 5 0.33 * bzhiq %rax, (%rbx), %rcx # CHECK-NEXT: 1 3 2.00 mulxl %eax, %ebx, %ecx # CHECK-NEXT: 1 7 2.00 * mulxl (%rax), %ebx, %ecx -# CHECK-NEXT: 1 3 1.00 mulxq %rax, %rbx, %rcx -# CHECK-NEXT: 1 7 1.00 * mulxq (%rax), %rbx, %rcx +# CHECK-NEXT: 1 3 2.00 mulxq %rax, %rbx, %rcx +# CHECK-NEXT: 1 7 2.00 * mulxq (%rax), %rbx, %rcx # CHECK-NEXT: 1 100 0.25 pdepl %eax, %ebx, %ecx # CHECK-NEXT: 1 100 0.25 * pdepl (%rax), %ebx, %ecx # CHECK-NEXT: 1 100 0.25 pdepq %rax, %rbx, %rcx @@ -108,7 +108,7 @@ # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] -# CHECK-NEXT: 4.00 4.00 4.00 5.00 10.00 5.00 5.00 - - - - - 5.00 +# CHECK-NEXT: 4.00 4.00 4.00 5.00 9.00 5.00 5.00 - - - - - 8.00 # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] Instructions: @@ -117,9 +117,9 @@ # CHECK-NEXT: - - - 0.25 0.25 0.25 0.25 - - - - - - bzhiq %rax, %rbx, %rcx # CHECK-NEXT: 0.33 0.33 0.33 0.25 0.25 0.25 0.25 - - - - - - bzhiq %rax, (%rbx), %rcx # CHECK-NEXT: - - - - 1.00 - - - - - - - 2.00 mulxl %eax, %ebx, %ecx -# CHECK-NEXT: 0.33 0.33 0.33 - 2.00 - - - - - - - 2.00 mulxl (%rax), %ebx, %ecx -# CHECK-NEXT: - - - - 1.00 - - - - - - - - mulxq %rax, %rbx, %rcx -# CHECK-NEXT: 0.33 0.33 0.33 - 1.00 - - - - - - - 1.00 mulxq 
(%rax), %rbx, %rcx +# CHECK-NEXT: 0.33 0.33 0.33 - 1.00 - - - - - - - 2.00 mulxl (%rax), %ebx, %ecx +# CHECK-NEXT: - - - - 1.00 - - - - - - - 2.00 mulxq %rax, %rbx, %rcx +# CHECK-NEXT: 0.33 0.33 0.33 - 1.00 - - - - - - - 2.00 mulxq (%rax), %rbx, %rcx # CHECK-NEXT: - - - - - - - - - - - - - pdepl %eax, %ebx, %ecx # CHECK-NEXT: - - - - - - - - - - - - - pdepl (%rax), %ebx, %ecx # CHECK-NEXT: - - - - - - - - - - - - - pdepq %rax, %rbx, %rcx