Index: include/llvm/MC/MCSchedule.h
===================================================================
--- include/llvm/MC/MCSchedule.h
+++ include/llvm/MC/MCSchedule.h
@@ -14,6 +14,7 @@
 #ifndef LLVM_MC_MCSCHEDULE_H
 #define LLVM_MC_MCSCHEDULE_H
 
+#include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/Optional.h"
 #include "llvm/Config/llvm-config.h"
 #include "llvm/Support/DataTypes.h"
@@ -369,6 +370,11 @@
   getReciprocalThroughput(const MCSubtargetInfo &STI, const MCInstrInfo &MCII,
                           const MCInst &Inst) const;
 
+  /// Returns the maximum forwarding delay for register reads dependent on
+  /// writes of scheduling class WriteResourceIdx.
+  static unsigned getForwardingDelayCycles(ArrayRef<MCReadAdvanceEntry> Entries,
+                                           unsigned WriteResourceIdx = 0);
+
   /// Returns the default initialized model.
   static const MCSchedModel &GetDefaultSchedModel() { return Default; }
   static const MCSchedModel Default;
Index: include/llvm/MC/MCSubtargetInfo.h
===================================================================
--- include/llvm/MC/MCSubtargetInfo.h
+++ include/llvm/MC/MCSubtargetInfo.h
@@ -152,6 +152,16 @@
     return 0;
   }
 
+  /// Return the set of ReadAdvance entries declared by the given scheduling
+  /// class descriptor.
+  ArrayRef<MCReadAdvanceEntry>
+  getReadAdvanceEntries(const MCSchedClassDesc &SC) const {
+    if (!SC.NumReadAdvanceEntries)
+      return ArrayRef<MCReadAdvanceEntry>();
+    return ArrayRef<MCReadAdvanceEntry>(&ReadAdvanceTable[SC.ReadAdvanceIdx],
+                                        SC.NumReadAdvanceEntries);
+  }
+
   /// Get scheduling itinerary of a CPU.
   InstrItineraryData getInstrItineraryForCPU(StringRef CPU) const;
Index: include/llvm/MCA/Instruction.h
===================================================================
--- include/llvm/MCA/Instruction.h
+++ include/llvm/MCA/Instruction.h
@@ -332,6 +332,10 @@
   unsigned MaxLatency;
   // Number of MicroOps for this instruction.
   unsigned NumMicroOps;
+  // SchedClassID used to construct this InstrDesc.
+  // This information is currently used by views to do fast queries on the
+  // subtarget when computing the reciprocal throughput.
+  unsigned SchedClassID;
 
   bool MayLoad;
   bool MayStore;
Index: lib/CodeGen/TargetSubtargetInfo.cpp
===================================================================
--- lib/CodeGen/TargetSubtargetInfo.cpp
+++ lib/CodeGen/TargetSubtargetInfo.cpp
@@ -88,6 +88,12 @@
   TargetSchedModel TSchedModel;
   TSchedModel.init(this);
   unsigned Latency = TSchedModel.computeInstrLatency(&MI);
+
+  // Add extra latency due to forwarding delays.
+  const MCSchedClassDesc &SCDesc = *TSchedModel.resolveSchedClass(&MI);
+  Latency +=
+      MCSchedModel::getForwardingDelayCycles(getReadAdvanceEntries(SCDesc));
+
   double RThroughput = TSchedModel.computeReciprocalThroughput(&MI);
   return createSchedInfoStr(Latency, RThroughput);
 }
@@ -99,9 +105,17 @@
   TargetSchedModel TSchedModel;
   TSchedModel.init(this);
   unsigned Latency;
-  if (TSchedModel.hasInstrSchedModel())
+  if (TSchedModel.hasInstrSchedModel()) {
     Latency = TSchedModel.computeInstrLatency(MCI);
-  else if (TSchedModel.hasInstrItineraries()) {
+    // Add extra latency due to forwarding delays.
+    const MCSchedModel &SM = *TSchedModel.getMCSchedModel();
+    unsigned SClassID = getInstrInfo()->get(MCI.getOpcode()).getSchedClass();
+    while (SM.getSchedClassDesc(SClassID)->isVariant())
+      SClassID = resolveVariantSchedClass(SClassID, &MCI, SM.ProcID);
+    const MCSchedClassDesc &SCDesc = *SM.getSchedClassDesc(SClassID);
+    Latency +=
+        MCSchedModel::getForwardingDelayCycles(getReadAdvanceEntries(SCDesc));
+  } else if (TSchedModel.hasInstrItineraries()) {
     auto *ItinData = TSchedModel.getInstrItineraries();
     Latency = ItinData->getStageLatency(
         getInstrInfo()->get(MCI.getOpcode()).getSchedClass());
Index: lib/MC/MCSchedule.cpp
===================================================================
--- lib/MC/MCSchedule.cpp
+++ lib/MC/MCSchedule.cpp
@@ -149,3 +149,19 @@
   // that it can execute at the maximum default issue width.
   return 1.0 / DefaultIssueWidth;
 }
+
+unsigned
+MCSchedModel::getForwardingDelayCycles(ArrayRef<MCReadAdvanceEntry> Entries,
+                                       unsigned WriteResourceID) {
+  if (Entries.empty())
+    return 0;
+
+  int DelayCycles = 0;
+  for (const MCReadAdvanceEntry &E : Entries) {
+    if (E.WriteResourceID != WriteResourceID)
+      continue;
+    DelayCycles = std::min(DelayCycles, E.Cycles);
+  }
+
+  return std::abs(DelayCycles);
+}
Index: lib/MCA/InstrBuilder.cpp
===================================================================
--- lib/MCA/InstrBuilder.cpp
+++ lib/MCA/InstrBuilder.cpp
@@ -532,6 +532,7 @@
   // Create a new empty descriptor.
   std::unique_ptr<InstrDesc> ID = llvm::make_unique<InstrDesc>();
   ID->NumMicroOps = SCDesc.NumMicroOps;
+  ID->SchedClassID = SchedClassID;
 
   if (MCDesc.isCall() && FirstCallInst) {
     // We don't correctly model calls.
Index: lib/Target/X86/X86InstrMMX.td
===================================================================
--- lib/Target/X86/X86InstrMMX.td
+++ lib/Target/X86/X86InstrMMX.td
@@ -543,7 +543,7 @@
                     "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                     [(set VR64:$dst, (int_x86_mmx_pinsr_w VR64:$src1,
                                       GR32orGR64:$src2, imm:$src3))]>,
-                    Sched<[WriteVecInsert]>;
+                    Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
 
 def MMX_PINSRWrm : MMXIi8<0xC4, MRMSrcMem,
                           (outs VR64:$dst),
Index: lib/Target/X86/X86InstrSSE.td
===================================================================
--- lib/Target/X86/X86InstrSSE.td
+++ lib/Target/X86/X86InstrSSE.td
@@ -4122,7 +4122,7 @@
                  "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
       [(set VR128:$dst,
         (X86pinsrw VR128:$src1, GR32orGR64:$src2, imm:$src3))]>,
-      Sched<[WriteVecInsert]>;
+      Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
   def rm : Ii8<0xC4, MRMSrcMem,
                (outs VR128:$dst),
                (ins VR128:$src1, i16mem:$src2, u8imm:$src3),
@@ -5577,7 +5577,7 @@
                  "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
       [(set VR128:$dst,
        (X86pinsrb VR128:$src1, GR32orGR64:$src2, imm:$src3))]>,
-      Sched<[WriteVecInsert]>;
+      Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
   def rm : SS4AIi8,
-      Sched<[WriteVecInsert]>;
+      Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
   def rm : SS4AIi8,
-      Sched<[WriteVecInsert]>;
+      Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
   def rm : SS4AIi8;
Index: lib/Target/X86/X86SchedBroadwell.td
===================================================================
--- lib/Target/X86/X86SchedBroadwell.td
+++ lib/Target/X86/X86SchedBroadwell.td
 def : ReadAdvance;
+def : ReadAdvance<ReadInt2Fpu, 0>;
+
 // Many SchedWrites are defined in pairs with and without a folded load.
 // Instructions with folded loads are usually micro-fused, so they only appear
 // as two micro-ops when queued in the reservation station.
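Note on the helper introduced above: MCSchedModel::getForwardingDelayCycles simply scans the ReadAdvance entries of a scheduling class, keeps the most negative cycle count declared for the requested write resource, and reports its magnitude. A minimal standalone sketch of that logic, using a hypothetical Entry struct rather than the real MCReadAdvanceEntry type, is:

#include <algorithm>
#include <cstdlib>
#include <vector>

struct Entry {
  unsigned WriteResourceID; // write the read depends on (0 = default/any)
  int Cycles;               // negative values model a bypass delay
};

// Forwarding delay = |most negative Cycles| among matching entries.
static unsigned forwardingDelayCycles(const std::vector<Entry> &Entries,
                                      unsigned WriteResourceID = 0) {
  int DelayCycles = 0;
  for (const Entry &E : Entries) {
    if (E.WriteResourceID != WriteResourceID)
      continue;
    DelayCycles = std::min(DelayCycles, E.Cycles);
  }
  return std::abs(DelayCycles);
}

int main() {
  // A ReadAdvance of -6 (as declared for BtVer2 below) yields a 6-cycle
  // forwarding delay; non-negative entries never contribute to the delay.
  std::vector<Entry> Entries = {{0, 0}, {0, -6}};
  return forwardingDelayCycles(Entries) == 6 ? 0 : 1;
}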
Index: lib/Target/X86/X86SchedHaswell.td
===================================================================
--- lib/Target/X86/X86SchedHaswell.td
+++ lib/Target/X86/X86SchedHaswell.td
@@ -86,6 +86,8 @@
 def : ReadAdvance;
 def : ReadAdvance;
+def : ReadAdvance<ReadInt2Fpu, 0>;
+
 // Many SchedWrites are defined in pairs with and without a folded load.
 // Instructions with folded loads are usually micro-fused, so they only appear
 // as two micro-ops when queued in the reservation station.
Index: lib/Target/X86/X86SchedSandyBridge.td
===================================================================
--- lib/Target/X86/X86SchedSandyBridge.td
+++ lib/Target/X86/X86SchedSandyBridge.td
@@ -76,6 +76,8 @@
 def : ReadAdvance;
 def : ReadAdvance;
+def : ReadAdvance<ReadInt2Fpu, 0>;
+
 // Many SchedWrites are defined in pairs with and without a folded load.
 // Instructions with folded loads are usually micro-fused, so they only appear
 // as two micro-ops when queued in the reservation station.
Index: lib/Target/X86/X86SchedSkylakeClient.td
===================================================================
--- lib/Target/X86/X86SchedSkylakeClient.td
+++ lib/Target/X86/X86SchedSkylakeClient.td
@@ -80,6 +80,8 @@
 def : ReadAdvance;
 def : ReadAdvance;
+def : ReadAdvance<ReadInt2Fpu, 0>;
+
 // Many SchedWrites are defined in pairs with and without a folded load.
 // Instructions with folded loads are usually micro-fused, so they only appear
 // as two micro-ops when queued in the reservation station.
Index: lib/Target/X86/X86SchedSkylakeServer.td
===================================================================
--- lib/Target/X86/X86SchedSkylakeServer.td
+++ lib/Target/X86/X86SchedSkylakeServer.td
@@ -80,6 +80,8 @@
 def : ReadAdvance;
 def : ReadAdvance;
+def : ReadAdvance<ReadInt2Fpu, 0>;
+
 // Many SchedWrites are defined in pairs with and without a folded load.
 // Instructions with folded loads are usually micro-fused, so they only appear
 // as two micro-ops when queued in the reservation station.
Index: lib/Target/X86/X86Schedule.td
===================================================================
--- lib/Target/X86/X86Schedule.td
+++ lib/Target/X86/X86Schedule.td
@@ -17,6 +17,12 @@
 def ReadAfterVecXLd : SchedRead;
 def ReadAfterVecYLd : SchedRead;
 
+// Instructions that move data between general purpose registers and vector
+// registers may be subject to extra latency due to data bypass delays.
+// This SchedRead describes a bypass delay caused by data being moved from the
+// integer unit to the floating point unit.
+def ReadInt2Fpu : SchedRead;
+
 // Instructions with both a load and a store folded are modeled as a folded
 // load + WriteRMW.
 def WriteRMW : SchedWrite;
Index: lib/Target/X86/X86ScheduleAtom.td
===================================================================
--- lib/Target/X86/X86ScheduleAtom.td
+++ lib/Target/X86/X86ScheduleAtom.td
@@ -46,6 +46,8 @@
 def : ReadAdvance;
 def : ReadAdvance;
+def : ReadAdvance<ReadInt2Fpu, 0>;
+
 // Many SchedWrites are defined in pairs with and without a folded load.
 // Instructions with folded loads are usually micro-fused, so they only appear
 // as two micro-ops when dispatched by the schedulers.
Index: lib/Target/X86/X86ScheduleBdVer2.td
===================================================================
--- lib/Target/X86/X86ScheduleBdVer2.td
+++ lib/Target/X86/X86ScheduleBdVer2.td
@@ -250,6 +250,8 @@
 def : ReadAdvance;
 def : ReadAdvance;
+def : ReadAdvance<ReadInt2Fpu, 0>;
+
 // A folded store needs a cycle on the PdStore for the store data.
 def : WriteRes;
Index: lib/Target/X86/X86ScheduleBtVer2.td
===================================================================
--- lib/Target/X86/X86ScheduleBtVer2.td
+++ lib/Target/X86/X86ScheduleBtVer2.td
@@ -108,6 +108,11 @@
 def : ReadAdvance;
 def : ReadAdvance;
+/// "Additional 6 cycle transfer operation which moves a floating point
+/// operation input value from the integer unit to the floating point unit."
+/// Reference: AMDfam16h SOG (Appendix A "Instruction Latencies", Section A.2).
+def : ReadAdvance<ReadInt2Fpu, -6>;
+
 // Many SchedWrites are defined in pairs with and without a folded load.
 // Instructions with folded loads are usually micro-fused, so they only appear
 // as two micro-ops when dispatched by the schedulers.
@@ -540,7 +545,7 @@
 // Vector insert/extract operations.
 ////////////////////////////////////////////////////////////////////////////////
 
-defm : X86WriteRes<WriteVecInsert, [JFPU01, JVALU], 7, [1,1], 2>;
+defm : X86WriteRes<WriteVecInsert, [JFPU01, JVALU], 1, [1,1], 2>;
 defm : X86WriteRes;
 defm : X86WriteRes;
 defm : X86WriteRes;
Index: lib/Target/X86/X86ScheduleSLM.td
===================================================================
--- lib/Target/X86/X86ScheduleSLM.td
+++ lib/Target/X86/X86ScheduleSLM.td
@@ -52,6 +52,8 @@
 def : ReadAdvance;
 def : ReadAdvance;
+def : ReadAdvance<ReadInt2Fpu, 0>;
+
 // Many SchedWrites are defined in pairs with and without a folded load.
 // Instructions with folded loads are usually micro-fused, so they only appear
 // as two micro-ops when queued in the reservation station.
Index: lib/Target/X86/X86ScheduleZnver1.td
===================================================================
--- lib/Target/X86/X86ScheduleZnver1.td
+++ lib/Target/X86/X86ScheduleZnver1.td
@@ -94,6 +94,8 @@
 def : ReadAdvance;
 def : ReadAdvance;
+def : ReadAdvance<ReadInt2Fpu, 0>;
+
 // The Integer PRF for Zen is 168 entries, and it holds the architectural and
 // speculative version of the 64-bit integer registers.
// Reference: "Software Optimization Guide for AMD Family 17h Processors" Index: test/CodeGen/X86/mmx-schedule.ll =================================================================== --- test/CodeGen/X86/mmx-schedule.ll +++ test/CodeGen/X86/mmx-schedule.ll @@ -3887,8 +3887,8 @@ ; ; BTVER2-LABEL: test_pinsrw: ; BTVER2: # %bb.0: -; BTVER2-NEXT: pinsrw $0, %edi, %mm0 # sched: [7:0.50] ; BTVER2-NEXT: movswl (%rsi), %eax # sched: [4:1.00] +; BTVER2-NEXT: pinsrw $0, %edi, %mm0 # sched: [7:0.50] ; BTVER2-NEXT: pinsrw $1, %eax, %mm0 # sched: [7:0.50] ; BTVER2-NEXT: movq %mm0, %rax # sched: [4:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] Index: test/CodeGen/X86/sse41-schedule.ll =================================================================== --- test/CodeGen/X86/sse41-schedule.ll +++ test/CodeGen/X86/sse41-schedule.ll @@ -2679,15 +2679,15 @@ ; ; BTVER2-SSE-LABEL: test_pinsrq: ; BTVER2-SSE: # %bb.0: -; BTVER2-SSE-NEXT: pinsrq $1, %rdi, %xmm0 # sched: [7:0.50] ; BTVER2-SSE-NEXT: pinsrq $1, (%rsi), %xmm1 # sched: [4:1.00] +; BTVER2-SSE-NEXT: pinsrq $1, %rdi, %xmm0 # sched: [7:0.50] ; BTVER2-SSE-NEXT: paddq %xmm1, %xmm0 # sched: [1:0.50] ; BTVER2-SSE-NEXT: retq # sched: [4:1.00] ; ; BTVER2-LABEL: test_pinsrq: ; BTVER2: # %bb.0: -; BTVER2-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0 # sched: [7:0.50] ; BTVER2-NEXT: vpinsrq $1, (%rsi), %xmm1, %xmm1 # sched: [4:1.00] +; BTVER2-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0 # sched: [7:0.50] ; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] ; Index: test/tools/llvm-mca/X86/BtVer2/int-to-fpu-forwarding-1.s =================================================================== --- test/tools/llvm-mca/X86/BtVer2/int-to-fpu-forwarding-1.s +++ test/tools/llvm-mca/X86/BtVer2/int-to-fpu-forwarding-1.s @@ -27,12 +27,12 @@ # CHECK: Iterations: 500 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 7003 +# CHECK-NEXT: Total Cycles: 1003 # CHECK-NEXT: Total uOps: 2000 # CHECK: Dispatch Width: 2 -# CHECK-NEXT: uOps Per Cycle: 0.29 -# CHECK-NEXT: IPC: 0.14 +# CHECK-NEXT: uOps Per Cycle: 1.99 +# CHECK-NEXT: IPC: 1.00 # CHECK-NEXT: Block RThroughput: 2.0 # CHECK: Instruction Info: @@ -76,12 +76,12 @@ # CHECK: Iterations: 500 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 7003 +# CHECK-NEXT: Total Cycles: 1003 # CHECK-NEXT: Total uOps: 2000 # CHECK: Dispatch Width: 2 -# CHECK-NEXT: uOps Per Cycle: 0.29 -# CHECK-NEXT: IPC: 0.14 +# CHECK-NEXT: uOps Per Cycle: 1.99 +# CHECK-NEXT: IPC: 1.00 # CHECK-NEXT: Block RThroughput: 2.0 # CHECK: Instruction Info: @@ -125,12 +125,12 @@ # CHECK: Iterations: 500 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 7003 +# CHECK-NEXT: Total Cycles: 1003 # CHECK-NEXT: Total uOps: 2000 # CHECK: Dispatch Width: 2 -# CHECK-NEXT: uOps Per Cycle: 0.29 -# CHECK-NEXT: IPC: 0.14 +# CHECK-NEXT: uOps Per Cycle: 1.99 +# CHECK-NEXT: IPC: 1.00 # CHECK-NEXT: Block RThroughput: 2.0 # CHECK: Instruction Info: @@ -174,12 +174,12 @@ # CHECK: Iterations: 500 # CHECK-NEXT: Instructions: 1000 -# CHECK-NEXT: Total Cycles: 7003 +# CHECK-NEXT: Total Cycles: 1003 # CHECK-NEXT: Total uOps: 2000 # CHECK: Dispatch Width: 2 -# CHECK-NEXT: uOps Per Cycle: 0.29 -# CHECK-NEXT: IPC: 0.14 +# CHECK-NEXT: uOps Per Cycle: 1.99 +# CHECK-NEXT: IPC: 1.00 # CHECK-NEXT: Block RThroughput: 2.0 # CHECK: Instruction Info: Index: test/tools/llvm-mca/X86/BtVer2/int-to-fpu-forwarding-3.s =================================================================== --- test/tools/llvm-mca/X86/BtVer2/int-to-fpu-forwarding-3.s +++ 
@@ -9,12 +9,12 @@
 
 # CHECK: Iterations: 500
 # CHECK-NEXT: Instructions: 1500
-# CHECK-NEXT: Total Cycles: 7004
+# CHECK-NEXT: Total Cycles: 1509
 # CHECK-NEXT: Total uOps: 2500
 
 # CHECK: Dispatch Width: 2
-# CHECK-NEXT: uOps Per Cycle: 0.36
-# CHECK-NEXT: IPC: 0.21
+# CHECK-NEXT: uOps Per Cycle: 1.66
+# CHECK-NEXT: IPC: 0.99
 # CHECK-NEXT: Block RThroughput: 2.5
 
 # CHECK: Instruction Info:
@@ -57,18 +57,18 @@
 # CHECK-NEXT: - - - - - 1.00 - - - - - 1.00 - - vpinsrb $1, %eax, %xmm0, %xmm0
 
 # CHECK: Timeline view:
-# CHECK-NEXT: 0123456789 0123456789
-# CHECK-NEXT: Index 0123456789 0123456789 012345
+# CHECK-NEXT: 01234567
+# CHECK-NEXT: Index 0123456789
 
-# CHECK: [0,0] DeER . . . . . . . . . addl %eax, %eax
-# CHECK-NEXT: [0,1] .DeeeeeeeER . . . . . . . vpinsrb $0, %eax, %xmm0, %xmm0
-# CHECK-NEXT: [0,2] . D======eeeeeeeER . . . . . . vpinsrb $1, %eax, %xmm0, %xmm0
-# CHECK-NEXT: [1,0] . DeE-----------R . . . . . . addl %eax, %eax
-# CHECK-NEXT: [1,1] . D===========eeeeeeeER. . . . . vpinsrb $0, %eax, %xmm0, %xmm0
-# CHECK-NEXT: [1,2] . D=================eeeeeeeER . . . vpinsrb $1, %eax, %xmm0, %xmm0
-# CHECK-NEXT: [2,0] . .DeE----------------------R . . . addl %eax, %eax
-# CHECK-NEXT: [2,1] . . D======================eeeeeeeER . . vpinsrb $0, %eax, %xmm0, %xmm0
-# CHECK-NEXT: [2,2] . . D============================eeeeeeeER vpinsrb $1, %eax, %xmm0, %xmm0
+# CHECK: [0,0] DeER . . . . addl %eax, %eax
+# CHECK-NEXT: [0,1] .D======eER . . vpinsrb $0, %eax, %xmm0, %xmm0
+# CHECK-NEXT: [0,2] . D======eER . . vpinsrb $1, %eax, %xmm0, %xmm0
+# CHECK-NEXT: [1,0] . DeE-----R . . addl %eax, %eax
+# CHECK-NEXT: [1,1] . D======eER . . vpinsrb $0, %eax, %xmm0, %xmm0
+# CHECK-NEXT: [1,2] . D======eER. . vpinsrb $1, %eax, %xmm0, %xmm0
+# CHECK-NEXT: [2,0] . .DeE-----R. . addl %eax, %eax
+# CHECK-NEXT: [2,1] . . D======eER. vpinsrb $0, %eax, %xmm0, %xmm0
+# CHECK-NEXT: [2,2] . . D======eER vpinsrb $1, %eax, %xmm0, %xmm0
 
 # CHECK: Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -77,6 +77,6 @@
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 3 1.0 1.0 11.0 addl %eax, %eax
-# CHECK-NEXT: 1. 3 12.0 0.0 0.0 vpinsrb $0, %eax, %xmm0, %xmm0
-# CHECK-NEXT: 2. 3 18.0 0.0 0.0 vpinsrb $1, %eax, %xmm0, %xmm0
+# CHECK-NEXT: 0. 3 1.0 1.0 3.3 addl %eax, %eax
+# CHECK-NEXT: 1. 3 7.0 0.0 0.0 vpinsrb $0, %eax, %xmm0, %xmm0
+# CHECK-NEXT: 2. 3 7.0 0.0 0.0 vpinsrb $1, %eax, %xmm0, %xmm0
Index: tools/llvm-mca/Views/InstructionInfoView.cpp
===================================================================
--- tools/llvm-mca/Views/InstructionInfoView.cpp
+++ tools/llvm-mca/Views/InstructionInfoView.cpp
@@ -43,6 +43,9 @@
     const MCSchedClassDesc &SCDesc = *SM.getSchedClassDesc(SchedClassID);
     unsigned NumMicroOpcodes = SCDesc.NumMicroOps;
     unsigned Latency = MCSchedModel::computeInstrLatency(STI, SCDesc);
+    // Add extra latency due to delays in the forwarding data paths.
+    Latency += MCSchedModel::getForwardingDelayCycles(
+        STI.getReadAdvanceEntries(SCDesc));
 
     Optional<double> RThroughput =
         MCSchedModel::getReciprocalThroughput(STI, SCDesc);
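As a worked example of what the numbers above mean: LLVM folds a read's ReadAdvance cycles into the def-to-use latency by subtracting them from the producing write's latency, so a negative ReadAdvance lengthens the dependency. The sketch below illustrates that rule with an illustrative helper (not an LLVM API), assuming the BtVer2 values from this patch:

#include <algorithm>
#include <cstdio>

// Effective def-to-use latency: the producer's write latency minus the
// consumer's ReadAdvance cycles (a negative ReadAdvance therefore adds delay).
static unsigned operandLatency(unsigned WriteLatency, int ReadAdvanceCycles) {
  return std::max(static_cast<int>(WriteLatency) - ReadAdvanceCycles, 0);
}

int main() {
  // Assumed BtVer2 values from this patch: WriteVecInsert has latency 1 and
  // ReadInt2Fpu advertises -6 cycles, so a GPR value produced by the ADDL
  // right before a VPINSRB is seen 1 - (-6) = 7 cycles later. That matches
  // the [7:0.50] sched annotations and the 6-cycle stalls ("======") in the
  // llvm-mca timeline above.
  std::printf("int-to-fpu operand latency: %u cycles\n", operandLatency(1, -6));
  return 0;
}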