Index: llvm/lib/MCA/HardwareUnits/RegisterFile.cpp =================================================================== --- llvm/lib/MCA/HardwareUnits/RegisterFile.cpp +++ llvm/lib/MCA/HardwareUnits/RegisterFile.cpp @@ -288,6 +288,17 @@ // If this move has been eliminated, then method tryEliminateMoveOrSwap should // have already updated all the register mappings. if (!IsEliminated) { + // Check if this is one of multiple writes performed by this + // instruction to register RegID. + const WriteRef &OtherWrite = RegisterMappings[RegID].first; + const WriteState *OtherWS = OtherWrite.getWriteState(); + if (OtherWS && OtherWrite.getSourceIndex() == Write.getSourceIndex()) { + if (OtherWS->getLatency() > WS.getLatency()) { + // Conservatively keep the slowest write to RegID. + return; + } + } + // Update the mapping for register RegID including its sub-registers. RegisterMappings[RegID].first = Write; RegisterMappings[RegID].second.AliasRegID = 0U; Index: llvm/lib/Target/X86/X86InstrArithmetic.td =================================================================== --- llvm/lib/Target/X86/X86InstrArithmetic.td +++ llvm/lib/Target/X86/X86InstrArithmetic.td @@ -1497,13 +1497,13 @@ let hasSideEffects = 0 in { def rr : I<0xF6, MRMSrcReg, (outs RC:$dst1, RC:$dst2), (ins RC:$src), !strconcat(mnemonic, "\t{$src, $dst2, $dst1|$dst1, $dst2, $src}"), - []>, T8XD, VEX_4V, Sched<[sched, WriteIMulH]>; + []>, T8XD, VEX_4V, Sched<[WriteIMulH, sched]>; let mayLoad = 1 in def rm : I<0xF6, MRMSrcMem, (outs RC:$dst1, RC:$dst2), (ins x86memop:$src), !strconcat(mnemonic, "\t{$src, $dst2, $dst1|$dst1, $dst2, $src}"), []>, T8XD, VEX_4V, - Sched<[sched.Folded, WriteIMulHLd, + Sched<[WriteIMulHLd, sched.Folded, // Memory operand. 
ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault, // Implicit read of EDX/RDX Index: llvm/lib/Target/X86/X86SchedBroadwell.td =================================================================== --- llvm/lib/Target/X86/X86SchedBroadwell.td +++ llvm/lib/Target/X86/X86SchedBroadwell.td @@ -142,14 +142,14 @@ defm : X86WriteRes; defm : BWWriteResPair; defm : BWWriteResPair; -defm : BWWriteResPair; +defm : BWWriteResPair; defm : BWWriteResPair; defm : BWWriteResPair; defm : BWWriteResPair; -defm : BWWriteResPair; +defm : BWWriteResPair; defm : BWWriteResPair; defm : BWWriteResPair; -def BWWriteIMulH : WriteRes { let Latency = 3; } +def BWWriteIMulH : WriteRes { let Latency = 4; } def : WriteRes { let Latency = !add(BWWriteIMulH.Latency, BroadwellModel.LoadLatency); } Index: llvm/lib/Target/X86/X86SchedHaswell.td =================================================================== --- llvm/lib/Target/X86/X86SchedHaswell.td +++ llvm/lib/Target/X86/X86SchedHaswell.td @@ -144,14 +144,14 @@ defm : X86WriteRes; defm : HWWriteResPair; defm : HWWriteResPair; -defm : HWWriteResPair; +defm : HWWriteResPair; defm : HWWriteResPair; defm : HWWriteResPair; defm : HWWriteResPair; -defm : HWWriteResPair; +defm : HWWriteResPair; defm : HWWriteResPair; defm : HWWriteResPair; -def HWWriteIMulH : WriteRes { let Latency = 3; } +def HWWriteIMulH : WriteRes { let Latency = 4; } def : WriteRes { let Latency = !add(HWWriteIMulH.Latency, HaswellModel.LoadLatency); } Index: llvm/lib/Target/X86/X86SchedSandyBridge.td =================================================================== --- llvm/lib/Target/X86/X86SchedSandyBridge.td +++ llvm/lib/Target/X86/X86SchedSandyBridge.td @@ -124,14 +124,14 @@ defm : X86WriteRes; defm : SBWriteResPair; defm : SBWriteResPair; -defm : SBWriteResPair; +defm : SBWriteResPair; defm : SBWriteResPair; defm : SBWriteResPair; defm : SBWriteResPair; -defm : SBWriteResPair; +defm : SBWriteResPair; defm : SBWriteResPair; defm : SBWriteResPair; -def 
SBWriteIMulH : WriteRes { let Latency = 3; } +def SBWriteIMulH : WriteRes { let Latency = 4; } def : WriteRes { let Latency = !add(SBWriteIMulH.Latency, SandyBridgeModel.LoadLatency); } Index: llvm/lib/Target/X86/X86SchedSkylakeClient.td =================================================================== --- llvm/lib/Target/X86/X86SchedSkylakeClient.td +++ llvm/lib/Target/X86/X86SchedSkylakeClient.td @@ -122,14 +122,14 @@ defm : X86WriteRes; defm : SKLWriteResPair; defm : SKLWriteResPair; -defm : SKLWriteResPair; +defm : SKLWriteResPair; defm : SKLWriteResPair; defm : SKLWriteResPair; defm : SKLWriteResPair; -defm : SKLWriteResPair; +defm : SKLWriteResPair; defm : SKLWriteResPair; defm : SKLWriteResPair; -def SKLWriteIMulH : WriteRes { let Latency = 3; } +def SKLWriteIMulH : WriteRes { let Latency = 4; } def : WriteRes { let Latency = !add(SKLWriteIMulH.Latency, SkylakeClientModel.LoadLatency); } Index: llvm/lib/Target/X86/X86SchedSkylakeServer.td =================================================================== --- llvm/lib/Target/X86/X86SchedSkylakeServer.td +++ llvm/lib/Target/X86/X86SchedSkylakeServer.td @@ -123,14 +123,14 @@ defm : X86WriteRes; defm : X86WriteRes; defm : SKXWriteResPair; -defm : SKXWriteResPair; +defm : SKXWriteResPair; defm : SKXWriteResPair; defm : SKXWriteResPair; defm : SKXWriteResPair; -defm : SKXWriteResPair; +defm : SKXWriteResPair; defm : SKXWriteResPair; defm : SKXWriteResPair; -def SKXWriteIMulH : WriteRes { let Latency = 3; } +def SKXWriteIMulH : WriteRes { let Latency = 4; } def : WriteRes { let Latency = !add(SKXWriteIMulH.Latency, SkylakeServerModel.LoadLatency); } Index: llvm/test/tools/llvm-mca/X86/Haswell/mulx-same-regs.s =================================================================== --- llvm/test/tools/llvm-mca/X86/Haswell/mulx-same-regs.s +++ llvm/test/tools/llvm-mca/X86/Haswell/mulx-same-regs.s @@ -16,12 +16,12 @@ # CHECK: Iterations: 2 # CHECK-NEXT: Instructions: 2 -# CHECK-NEXT: Total Cycles: 10 +# CHECK-NEXT: 
Total Cycles: 11 # CHECK-NEXT: Total uOps: 8 # CHECK: Dispatch Width: 4 -# CHECK-NEXT: uOps Per Cycle: 0.80 -# CHECK-NEXT: IPC: 0.20 +# CHECK-NEXT: uOps Per Cycle: 0.73 +# CHECK-NEXT: IPC: 0.18 # CHECK-NEXT: Block RThroughput: 1.0 # CHECK: Instruction Info: @@ -56,10 +56,11 @@ # CHECK-NEXT: - - 0.50 1.00 - - - 0.50 1.00 - mulxl %eax, %eax, %eax # CHECK: Timeline view: +# CHECK-NEXT: 0 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeER . mulxl %eax, %eax, %eax -# CHECK-NEXT: [1,0] .D==eeeeER mulxl %eax, %eax, %eax +# CHECK: [0,0] DeeeeER . mulxl %eax, %eax, %eax +# CHECK-NEXT: [1,0] .D===eeeeER mulxl %eax, %eax, %eax # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -68,18 +69,18 @@ # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage # CHECK: [0] [1] [2] [3] -# CHECK-NEXT: 0. 2 2.0 0.5 0.0 mulxl %eax, %eax, %eax +# CHECK-NEXT: 0. 2 2.5 0.5 0.0 mulxl %eax, %eax, %eax # CHECK: [1] Code Region # CHECK: Iterations: 2 # CHECK-NEXT: Instructions: 2 -# CHECK-NEXT: Total Cycles: 10 +# CHECK-NEXT: Total Cycles: 11 # CHECK-NEXT: Total uOps: 6 # CHECK: Dispatch Width: 4 -# CHECK-NEXT: uOps Per Cycle: 0.60 -# CHECK-NEXT: IPC: 0.20 +# CHECK-NEXT: uOps Per Cycle: 0.55 +# CHECK-NEXT: IPC: 0.18 # CHECK-NEXT: Block RThroughput: 1.0 # CHECK: Instruction Info: @@ -114,10 +115,11 @@ # CHECK-NEXT: - - - 1.00 - - - - 1.00 - mulxq %rax, %rax, %rax # CHECK: Timeline view: +# CHECK-NEXT: 0 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeER . mulxq %rax, %rax, %rax -# CHECK-NEXT: [1,0] .D==eeeeER mulxq %rax, %rax, %rax +# CHECK: [0,0] DeeeeER . mulxq %rax, %rax, %rax +# CHECK-NEXT: [1,0] .D===eeeeER mulxq %rax, %rax, %rax # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -126,4 +128,4 @@ # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage # CHECK: [0] [1] [2] [3] -# CHECK-NEXT: 0. 2 2.0 0.5 0.0 mulxq %rax, %rax, %rax +# CHECK-NEXT: 0. 
2 2.5 0.5 0.0 mulxq %rax, %rax, %rax Index: llvm/test/tools/llvm-mca/X86/SkylakeClient/mulx-same-regs.s =================================================================== --- llvm/test/tools/llvm-mca/X86/SkylakeClient/mulx-same-regs.s +++ llvm/test/tools/llvm-mca/X86/SkylakeClient/mulx-same-regs.s @@ -16,12 +16,12 @@ # CHECK: Iterations: 2 # CHECK-NEXT: Instructions: 2 -# CHECK-NEXT: Total Cycles: 10 +# CHECK-NEXT: Total Cycles: 11 # CHECK-NEXT: Total uOps: 8 # CHECK: Dispatch Width: 6 -# CHECK-NEXT: uOps Per Cycle: 0.80 -# CHECK-NEXT: IPC: 0.20 +# CHECK-NEXT: uOps Per Cycle: 0.73 +# CHECK-NEXT: IPC: 0.18 # CHECK-NEXT: Block RThroughput: 1.0 # CHECK: Instruction Info: @@ -56,10 +56,11 @@ # CHECK-NEXT: - - 0.50 1.00 - - - 0.50 1.00 - mulxl %eax, %eax, %eax # CHECK: Timeline view: +# CHECK-NEXT: 0 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeER . mulxl %eax, %eax, %eax -# CHECK-NEXT: [1,0] .D==eeeeER mulxl %eax, %eax, %eax +# CHECK: [0,0] DeeeeER . mulxl %eax, %eax, %eax +# CHECK-NEXT: [1,0] .D===eeeeER mulxl %eax, %eax, %eax # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -68,18 +69,18 @@ # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage # CHECK: [0] [1] [2] [3] -# CHECK-NEXT: 0. 2 2.0 0.5 0.0 mulxl %eax, %eax, %eax +# CHECK-NEXT: 0. 2 2.5 0.5 0.0 mulxl %eax, %eax, %eax # CHECK: [1] Code Region # CHECK: Iterations: 2 # CHECK-NEXT: Instructions: 2 -# CHECK-NEXT: Total Cycles: 10 +# CHECK-NEXT: Total Cycles: 11 # CHECK-NEXT: Total uOps: 6 # CHECK: Dispatch Width: 6 -# CHECK-NEXT: uOps Per Cycle: 0.60 -# CHECK-NEXT: IPC: 0.20 +# CHECK-NEXT: uOps Per Cycle: 0.55 +# CHECK-NEXT: IPC: 0.18 # CHECK-NEXT: Block RThroughput: 1.0 # CHECK: Instruction Info: @@ -114,10 +115,11 @@ # CHECK-NEXT: - - - 1.00 - - - 1.00 - - mulxq %rax, %rax, %rax # CHECK: Timeline view: +# CHECK-NEXT: 0 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeER . 
mulxq %rax, %rax, %rax -# CHECK-NEXT: [1,0] D===eeeeER mulxq %rax, %rax, %rax +# CHECK: [0,0] DeeeeER . mulxq %rax, %rax, %rax +# CHECK-NEXT: [1,0] D====eeeeER mulxq %rax, %rax, %rax # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -126,4 +128,4 @@ # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage # CHECK: [0] [1] [2] [3] -# CHECK-NEXT: 0. 2 2.5 0.5 0.0 mulxq %rax, %rax, %rax +# CHECK-NEXT: 0. 2 3.0 0.5 0.0 mulxq %rax, %rax, %rax