Index: llvm/trunk/include/llvm/Target/TargetSchedule.td =================================================================== --- llvm/trunk/include/llvm/Target/TargetSchedule.td +++ llvm/trunk/include/llvm/Target/TargetSchedule.td @@ -453,15 +453,60 @@ SchedMachineModel SchedModel = ?; } -// Allow the definition of processor register files. -// Each processor register file declares the number of physical registers, as -// well as a optional register cost information. The cost of a register R is the -// number of physical registers used to rename R (at register renaming stage). -// That value defaults to 1, to all the registers contained in the register -// file. The set of target register files is inferred from the list of register -// classes. Register costs are defined at register class granularity. An empty -// list of register classes means that this register file contains all the -// registers defined by the target. +// Allow the definition of processor register files for register renaming +// purposes. +// +// Each processor register file declares: +// - The set of registers that can be renamed. +// - The number of physical registers which can be used for register renaming +// purpose. +// - The cost of a register rename. +// +// The cost of a rename is the number of physical registers allocated by the +// register alias table to map the new definition. By default, register can be +// renamed at the cost of a single physical register. Note that register costs +// are defined at register class granularity (see field `Costs`). +// +// The set of registers that are subject to register renaming is declared using +// a list of register classes (see field `RegClasses`). An empty list of +// register classes means: all the logical registers defined by the target can +// be fully renamed. +// +// A register R can be renamed if its register class appears in the `RegClasses` +// set. When R is written, a new alias is allocated at the cost of one or more +// physical registers; as a result, false dependencies on R are removed. +// +// A sub-register V of register R is implicitly part of the same register file. +// However, V is only renamed if its register class is part of `RegClasses`. +// Otherwise, the processor keeps it (as well as any other different part +// of R) together with R, and a write of V always causes a compulsory read of R. +// +// This is what happens for example on AMD processors (at least from Bulldozer +// onwards), where AL and AH are not treated as independent from AX, and AX is +// not treated as independent from EAX. A write to AL has an implicity false +// dependency on the last write to EAX (or a portion of EAX). As a consequence, +// a write to AL cannot go in parallel with a write to AH. +// +// There is no false dependency if the partial register write belongs to a +// register class that is in `RegClasses`. +// There is also no penalty for writes that "clear the content a super-register" +// (see MC/MCInstrAnalysis.h - method MCInstrAnalysis::clearsSuperRegisters()). +// On x86-64, 32-bit GPR writes implicitly zero the upper half of the underlying +// physical register, effectively removing any false dependencies with the +// previous register definition. +// +// TODO: This implementation assumes that there is no limit in the number of +// renames per cycle, which might not be true for all hardware or register +// classes. Also, there is no limit to how many times the same logical register +// can be renamed during the same cycle. +// +// TODO: we don't currently model merge penalties for the case where a write to +// a part of a register is followed by a read from a larger part of the same +// register. On some Intel chips, different parts of a GPR can be stored in +// different physical registers. However, there is a cost to pay for when the +// partial write is combined with the previous super-register definition. We +// should add support for these cases, and correctly model merge problems with +// partial register accesses. class RegisterFile Classes = [], list Costs = []> { list RegClasses = Classes; Index: llvm/trunk/lib/Target/X86/X86ScheduleBtVer2.td =================================================================== --- llvm/trunk/lib/Target/X86/X86ScheduleBtVer2.td +++ llvm/trunk/lib/Target/X86/X86ScheduleBtVer2.td @@ -41,7 +41,14 @@ // The Integer PRF for Jaguar is 64 entries, and it holds the architectural and // speculative version of the 64-bit integer registers. // Reference: www.realworldtech.com/jaguar/4/ -def JIntegerPRF : RegisterFile<64, [GR8, GR16, GR32, GR64, CCR]>; +// +// The processor always keeps the different parts of an integer register +// together. An instruction that writes to a part of a register will therefore +// have a false dependence on any previous write to the same register or any +// part of it. +// Reference: Section 21.10 "AMD Bobcat and Jaguar pipeline: Partial register +// access" - Agner Fog's "microarchitecture.pdf". +def JIntegerPRF : RegisterFile<64, [GR64, CCR]>; // The Jaguar FP Retire Queue renames SIMD and FP uOps onto a pool of 72 SSE // registers. Operations on 256-bit data types are cracked into two COPs. Index: llvm/trunk/test/tools/llvm-mca/X86/BtVer2/partial-reg-update-2.s =================================================================== --- llvm/trunk/test/tools/llvm-mca/X86/BtVer2/partial-reg-update-2.s +++ llvm/trunk/test/tools/llvm-mca/X86/BtVer2/partial-reg-update-2.s @@ -7,9 +7,9 @@ # CHECK: Iterations: 1 # CHECK-NEXT: Instructions: 3 -# CHECK-NEXT: Total Cycles: 10 +# CHECK-NEXT: Total Cycles: 11 # CHECK-NEXT: Dispatch Width: 2 -# CHECK-NEXT: IPC: 0.30 +# CHECK-NEXT: IPC: 0.27 # CHECK-NEXT: Block RThroughput: 4.0 # CHECK: Instruction Info: @@ -26,11 +26,12 @@ # CHECK-NEXT: 1 1 0.50 addl %ecx, %ebx # CHECK: Timeline view: +# CHECK-NEXT: 0 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeeeER. imulq %rax, %rbx -# CHECK-NEXT: [0,1] .DeE----R. lzcntw %ax, %bx -# CHECK-NEXT: [0,2] .D=====eER addl %ecx, %ebx +# CHECK: [0,0] DeeeeeeER . imulq %rax, %rbx +# CHECK-NEXT: [0,1] .D=====eER. lzcntw %ax, %bx +# CHECK-NEXT: [0,2] .D======eER addl %ecx, %ebx # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -40,5 +41,5 @@ # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 1 1.0 1.0 0.0 imulq %rax, %rbx -# CHECK-NEXT: 1. 1 1.0 1.0 4.0 lzcntw %ax, %bx -# CHECK-NEXT: 2. 1 6.0 0.0 0.0 addl %ecx, %ebx +# CHECK-NEXT: 1. 1 6.0 0.0 0.0 lzcntw %ax, %bx +# CHECK-NEXT: 2. 1 7.0 0.0 0.0 addl %ecx, %ebx Index: llvm/trunk/test/tools/llvm-mca/X86/BtVer2/partial-reg-update-3.s =================================================================== --- llvm/trunk/test/tools/llvm-mca/X86/BtVer2/partial-reg-update-3.s +++ llvm/trunk/test/tools/llvm-mca/X86/BtVer2/partial-reg-update-3.s @@ -12,9 +12,9 @@ # CHECK: Iterations: 1500 # CHECK-NEXT: Instructions: 4500 -# CHECK-NEXT: Total Cycles: 2254 +# CHECK-NEXT: Total Cycles: 4503 # CHECK-NEXT: Dispatch Width: 2 -# CHECK-NEXT: IPC: 2.00 +# CHECK-NEXT: IPC: 1.00 # CHECK-NEXT: Block RThroughput: 1.5 # CHECK: Instruction Info: @@ -52,22 +52,23 @@ # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions: -# CHECK-NEXT: 1.00 - - - - - - - - - - - - - addw %cx, %dx -# CHECK-NEXT: - 1.00 - - - - - - - - - - - - movw %ax, %dx +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - - addw %cx, %dx +# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - - movw %ax, %dx # CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - - xorw %bx, %dx # CHECK: Timeline view: -# CHECK-NEXT: Index 012345678 +# CHECK-NEXT: 01 +# CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeER . . addw %cx, %dx -# CHECK-NEXT: [0,1] DeER . . movw %ax, %dx -# CHECK-NEXT: [0,2] .DeER. . xorw %bx, %dx -# CHECK-NEXT: [1,0] .D=eER . addw %cx, %dx -# CHECK-NEXT: [1,1] . DeER . movw %ax, %dx -# CHECK-NEXT: [1,2] . D=eER . xorw %bx, %dx -# CHECK-NEXT: [2,0] . D=eER. addw %cx, %dx -# CHECK-NEXT: [2,1] . DeE-R. movw %ax, %dx -# CHECK-NEXT: [2,2] . DeE-R xorw %bx, %dx +# CHECK: [0,0] DeER . .. addw %cx, %dx +# CHECK-NEXT: [0,1] D=eER. .. movw %ax, %dx +# CHECK-NEXT: [0,2] .D=eER .. xorw %bx, %dx +# CHECK-NEXT: [1,0] .D==eER .. addw %cx, %dx +# CHECK-NEXT: [1,1] . D==eER .. movw %ax, %dx +# CHECK-NEXT: [1,2] . D===eER .. xorw %bx, %dx +# CHECK-NEXT: [2,0] . D===eER.. addw %cx, %dx +# CHECK-NEXT: [2,1] . D====eER. movw %ax, %dx +# CHECK-NEXT: [2,2] . D====eER xorw %bx, %dx # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -76,6 +77,6 @@ # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage # CHECK: [0] [1] [2] [3] -# CHECK-NEXT: 0. 3 1.7 0.3 0.0 addw %cx, %dx -# CHECK-NEXT: 1. 3 1.0 1.0 0.3 movw %ax, %dx -# CHECK-NEXT: 2. 3 1.3 0.0 0.3 xorw %bx, %dx +# CHECK-NEXT: 0. 3 2.7 0.3 0.0 addw %cx, %dx +# CHECK-NEXT: 1. 3 3.3 0.0 0.0 movw %ax, %dx +# CHECK-NEXT: 2. 3 3.7 0.0 0.0 xorw %bx, %dx Index: llvm/trunk/test/tools/llvm-mca/X86/BtVer2/partial-reg-update-4.s =================================================================== --- llvm/trunk/test/tools/llvm-mca/X86/BtVer2/partial-reg-update-4.s +++ llvm/trunk/test/tools/llvm-mca/X86/BtVer2/partial-reg-update-4.s @@ -12,9 +12,9 @@ # CHECK: Iterations: 1500 # CHECK-NEXT: Instructions: 4500 -# CHECK-NEXT: Total Cycles: 3006 +# CHECK-NEXT: Total Cycles: 7503 # CHECK-NEXT: Dispatch Width: 2 -# CHECK-NEXT: IPC: 1.50 +# CHECK-NEXT: IPC: 0.60 # CHECK-NEXT: Block RThroughput: 2.0 # CHECK: Instruction Info: @@ -57,18 +57,18 @@ # CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - - addw %cx, %bx # CHECK: Timeline view: -# CHECK-NEXT: 01 +# CHECK-NEXT: 01234567 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeER .. imulw %ax, %bx -# CHECK-NEXT: [0,1] .DeE-R .. lzcntw %ax, %bx -# CHECK-NEXT: [0,2] .D=eE-R .. addw %cx, %bx -# CHECK-NEXT: [1,0] . D=eeeER .. imulw %ax, %bx -# CHECK-NEXT: [1,1] . DeE--R .. lzcntw %ax, %bx -# CHECK-NEXT: [1,2] . D=eE--R.. addw %cx, %bx -# CHECK-NEXT: [2,0] . D=eeeER. imulw %ax, %bx -# CHECK-NEXT: [2,1] . DeE--R. lzcntw %ax, %bx -# CHECK-NEXT: [2,2] . D=eE--R addw %cx, %bx +# CHECK: [0,0] DeeeER . . . imulw %ax, %bx +# CHECK-NEXT: [0,1] .D==eER . . . lzcntw %ax, %bx +# CHECK-NEXT: [0,2] .D===eER . . . addw %cx, %bx +# CHECK-NEXT: [1,0] . D===eeeER . . imulw %ax, %bx +# CHECK-NEXT: [1,1] . D=====eER . . lzcntw %ax, %bx +# CHECK-NEXT: [1,2] . D======eER . . addw %cx, %bx +# CHECK-NEXT: [2,0] . D======eeeER . imulw %ax, %bx +# CHECK-NEXT: [2,1] . D========eER. lzcntw %ax, %bx +# CHECK-NEXT: [2,2] . D=========eER addw %cx, %bx # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -77,6 +77,6 @@ # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage # CHECK: [0] [1] [2] [3] -# CHECK-NEXT: 0. 3 1.7 0.3 0.0 imulw %ax, %bx -# CHECK-NEXT: 1. 3 1.0 1.0 1.7 lzcntw %ax, %bx -# CHECK-NEXT: 2. 3 2.0 0.0 1.7 addw %cx, %bx +# CHECK-NEXT: 0. 3 4.0 0.3 0.0 imulw %ax, %bx +# CHECK-NEXT: 1. 3 6.0 0.0 0.0 lzcntw %ax, %bx +# CHECK-NEXT: 2. 3 7.0 0.0 0.0 addw %cx, %bx Index: llvm/trunk/test/tools/llvm-mca/X86/BtVer2/partial-reg-update-5.s =================================================================== --- llvm/trunk/test/tools/llvm-mca/X86/BtVer2/partial-reg-update-5.s +++ llvm/trunk/test/tools/llvm-mca/X86/BtVer2/partial-reg-update-5.s @@ -7,9 +7,9 @@ # CHECK: Iterations: 1500 # CHECK-NEXT: Instructions: 1500 -# CHECK-NEXT: Total Cycles: 753 +# CHECK-NEXT: Total Cycles: 1503 # CHECK-NEXT: Dispatch Width: 2 -# CHECK-NEXT: IPC: 1.99 +# CHECK-NEXT: IPC: 1.00 # CHECK-NEXT: Block RThroughput: 0.5 # CHECK: Instruction Info: @@ -48,11 +48,11 @@ # CHECK-NEXT: 0.50 0.50 - - - - - - - - - - - - lzcntw %ax, %bx # CHECK: Timeline view: -# CHECK-NEXT: Index 01234 +# CHECK-NEXT: Index 012345 -# CHECK: [0,0] DeER. lzcntw %ax, %bx -# CHECK-NEXT: [1,0] DeER. lzcntw %ax, %bx -# CHECK-NEXT: [2,0] .DeER lzcntw %ax, %bx +# CHECK: [0,0] DeER . lzcntw %ax, %bx +# CHECK-NEXT: [1,0] D=eER. lzcntw %ax, %bx +# CHECK-NEXT: [2,0] .D=eER lzcntw %ax, %bx # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -61,4 +61,4 @@ # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage # CHECK: [0] [1] [2] [3] -# CHECK-NEXT: 0. 3 1.0 1.0 0.0 lzcntw %ax, %bx +# CHECK-NEXT: 0. 3 1.7 0.3 0.0 lzcntw %ax, %bx Index: llvm/trunk/test/tools/llvm-mca/X86/BtVer2/partial-reg-update-6.s =================================================================== --- llvm/trunk/test/tools/llvm-mca/X86/BtVer2/partial-reg-update-6.s +++ llvm/trunk/test/tools/llvm-mca/X86/BtVer2/partial-reg-update-6.s @@ -13,9 +13,9 @@ # CHECK: Iterations: 1500 # CHECK-NEXT: Instructions: 4500 -# CHECK-NEXT: Total Cycles: 4507 +# CHECK-NEXT: Total Cycles: 7504 # CHECK-NEXT: Dispatch Width: 2 -# CHECK-NEXT: IPC: 1.00 +# CHECK-NEXT: IPC: 0.60 # CHECK-NEXT: Block RThroughput: 2.0 # CHECK: Instruction Info: @@ -54,22 +54,22 @@ # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions: # CHECK-NEXT: - 1.00 - - - - - - 1.00 - - - - - imull %edx, %ecx -# CHECK-NEXT: 0.99 0.01 - - - - - 1.00 - - - - - - lzcntw (%rsp), %cx +# CHECK-NEXT: 1.00 - - - - - - 1.00 - - - - - - lzcntw (%rsp), %cx # CHECK-NEXT: 0.50 0.50 - - - - - 1.00 - - - - - - lzcntw 2(%rsp), %cx # CHECK: Timeline view: -# CHECK-NEXT: 012345 +# CHECK-NEXT: 012345678 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeER . . imull %edx, %ecx -# CHECK-NEXT: [0,1] .DeeeeER . . lzcntw (%rsp), %cx -# CHECK-NEXT: [0,2] .D=eeeeER . . lzcntw 2(%rsp), %cx -# CHECK-NEXT: [1,0] . D====eeeER . imull %edx, %ecx -# CHECK-NEXT: [1,1] . DeeeeE--R . lzcntw (%rsp), %cx -# CHECK-NEXT: [1,2] . D=eeeeE--R . lzcntw 2(%rsp), %cx -# CHECK-NEXT: [2,0] . D=====eeeER. imull %edx, %ecx -# CHECK-NEXT: [2,1] . DeeeeE---R. lzcntw (%rsp), %cx -# CHECK-NEXT: [2,2] . D=eeeeE---R lzcntw 2(%rsp), %cx +# CHECK: [0,0] DeeeER . . . imull %edx, %ecx +# CHECK-NEXT: [0,1] .DeeeeER . . . lzcntw (%rsp), %cx +# CHECK-NEXT: [0,2] .D=eeeeER . . . lzcntw 2(%rsp), %cx +# CHECK-NEXT: [1,0] . D====eeeER . . imull %edx, %ecx +# CHECK-NEXT: [1,1] . D===eeeeER . . lzcntw (%rsp), %cx +# CHECK-NEXT: [1,2] . D====eeeeER . . lzcntw 2(%rsp), %cx +# CHECK-NEXT: [2,0] . D=======eeeER . imull %edx, %ecx +# CHECK-NEXT: [2,1] . D======eeeeER. lzcntw (%rsp), %cx +# CHECK-NEXT: [2,2] . D=======eeeeER lzcntw 2(%rsp), %cx # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -78,6 +78,6 @@ # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage # CHECK: [0] [1] [2] [3] -# CHECK-NEXT: 0. 3 4.0 0.3 0.0 imull %edx, %ecx -# CHECK-NEXT: 1. 3 1.0 1.0 1.7 lzcntw (%rsp), %cx -# CHECK-NEXT: 2. 3 2.0 2.0 1.7 lzcntw 2(%rsp), %cx +# CHECK-NEXT: 0. 3 4.7 0.3 0.0 imull %edx, %ecx +# CHECK-NEXT: 1. 3 4.0 0.3 0.0 lzcntw (%rsp), %cx +# CHECK-NEXT: 2. 3 5.0 0.0 0.0 lzcntw 2(%rsp), %cx Index: llvm/trunk/tools/llvm-mca/Instruction.h =================================================================== --- llvm/trunk/tools/llvm-mca/Instruction.h +++ llvm/trunk/tools/llvm-mca/Instruction.h @@ -101,6 +101,12 @@ // super-registers. bool ClearsSuperRegs; + // This field is set if this is a partial register write, and it has a false + // dependency on any previous write of the same register (or a portion of it). + // DependentWrite must be able to complete before this write completes, so + // that we don't break the WAW, and the two writes can be merged together. + const WriteState *DependentWrite; + // A list of dependent reads. Users is a set of dependent // reads. A dependent read is added to the set only if CyclesLeft // is "unknown". As soon as CyclesLeft is 'known', each user in the set @@ -113,7 +119,7 @@ WriteState(const WriteDescriptor &Desc, unsigned RegID, bool clearsSuperRegs = false) : WD(Desc), CyclesLeft(UNKNOWN_CYCLES), RegisterID(RegID), - ClearsSuperRegs(clearsSuperRegs) {} + ClearsSuperRegs(clearsSuperRegs), DependentWrite(nullptr) {} WriteState(const WriteState &Other) = delete; WriteState &operator=(const WriteState &Other) = delete; @@ -126,6 +132,9 @@ unsigned getNumUsers() const { return Users.size(); } bool clearsSuperRegisters() const { return ClearsSuperRegs; } + const WriteState *getDependentWrite() const { return DependentWrite; } + void setDependentWrite(const WriteState *Write) { DependentWrite = Write; } + // On every cycle, update CyclesLeft and notify dependent users. void cycleEvent(); void onInstructionIssued(); @@ -315,6 +324,7 @@ const VecUses &getUses() const { return Uses; } const InstrDesc &getDesc() const { return Desc; } unsigned getRCUTokenID() const { return RCUTokenID; } + int getCyclesLeft() const { return CyclesLeft; } unsigned getNumUsers() const { unsigned NumUsers = 0; Index: llvm/trunk/tools/llvm-mca/Instruction.cpp =================================================================== --- llvm/trunk/tools/llvm-mca/Instruction.cpp +++ llvm/trunk/tools/llvm-mca/Instruction.cpp @@ -132,7 +132,22 @@ void Instruction::update() { assert(isDispatched() && "Unexpected instruction stage found!"); - if (llvm::all_of(Uses, [](const UniqueUse &Use) { return Use->isReady(); })) + + if (!llvm::all_of(Uses, [](const UniqueUse &Use) { return Use->isReady(); })) + return; + + // A partial register write cannot complete before a dependent write. + auto IsDefReady = [&](const UniqueDef &Def) { + if (const WriteState *Write = Def->getDependentWrite()) { + int WriteLatency = Write->getCyclesLeft(); + if (WriteLatency == UNKNOWN_CYCLES) + return false; + return static_cast(WriteLatency) < Desc.MaxLatency; + } + return true; + }; + + if (llvm::all_of(Defs, IsDefReady)) Stage = IS_READY; } @@ -141,14 +156,10 @@ return; if (isDispatched()) { - bool IsReady = true; - for (UniqueUse &Use : Uses) { + for (UniqueUse &Use : Uses) Use->cycleEvent(); - IsReady &= Use->isReady(); - } - if (IsReady) - Stage = IS_READY; + update(); return; } Index: llvm/trunk/tools/llvm-mca/RegisterFile.h =================================================================== --- llvm/trunk/tools/llvm-mca/RegisterFile.h +++ llvm/trunk/tools/llvm-mca/RegisterFile.h @@ -61,24 +61,35 @@ // regsiter file #0 specifying command line flag `-register-file-size=`. llvm::SmallVector RegisterFiles; - // This pair is used to identify the owner of a register, as well as - // the "register cost". Register cost is defined as the number of physical - // registers required to allocate a user register. + // This type is used to propagate information about the owner of a register, + // and the cost of allocating it in the PRF. Register cost is defined as the + // number of physical registers consumed by the PRF to allocate a user + // register. + // // For example: on X86 BtVer2, a YMM register consumes 2 128-bit physical // registers. So, the cost of allocating a YMM register in BtVer2 is 2. using IndexPlusCostPairTy = std::pair; + // Struct RegisterRenamingInfo maps registers to register files. + // There is a RegisterRenamingInfo object for every register defined by + // the target. RegisteRenamingInfo objects are stored into vector + // RegisterMappings, and register IDs can be used to reference them. + struct RegisterRenamingInfo { + IndexPlusCostPairTy IndexPlusCost; + llvm::MCPhysReg RenameAs; + }; + // RegisterMapping objects are mainly used to track physical register // definitions. There is a RegisterMapping for every register defined by the // Target. For each register, a RegisterMapping pair contains a descriptor of // the last register write (in the form of a WriteRef object), as well as a - // IndexPlusCostPairTy to quickly identify owning register files. + // RegisterRenamingInfo to quickly identify owning register files. // // This implementation does not allow overlapping register files. The only // register file that is allowed to overlap with other register files is // register file #0. If we exclude register #0, every register is "owned" by // at most one register file. - using RegisterMapping = std::pair; + using RegisterMapping = std::pair; // This map contains one entry for each register defined by the target. std::vector RegisterMappings; @@ -103,13 +114,12 @@ // Consumes physical registers in each register file specified by the // `IndexPlusCostPairTy`. This method is called from `addRegisterMapping()`. - void allocatePhysRegs(IndexPlusCostPairTy IPC, + void allocatePhysRegs(const RegisterRenamingInfo &Entry, llvm::MutableArrayRef UsedPhysRegs); - // Releases previously allocated physical registers from the register file(s) - // referenced by the IndexPlusCostPairTy object. This method is called from - // `invalidateRegisterMapping()`. - void freePhysRegs(IndexPlusCostPairTy IPC, + // Releases previously allocated physical registers from the register file(s). + // This method is called from `invalidateRegisterMapping()`. + void freePhysRegs(const RegisterRenamingInfo &Entry, llvm::MutableArrayRef FreedPhysRegs); // Create an instance of RegisterMappingTracker for every register file @@ -140,8 +150,11 @@ // Returns a "response mask" where each bit represents the response from a // different register file. A mask of all zeroes means that all register // files are available. Otherwise, the mask can be used to identify which - // register file was busy. This sematic allows us classify dispatch dispatch + // register file was busy. This sematic allows us to classify dispatch // stalls caused by the lack of register file resources. + // + // Current implementation can simulate up to 32 register files (including the + // special register file at index #0). unsigned isAvailable(llvm::ArrayRef Regs) const; void collectWrites(llvm::SmallVectorImpl &Writes, unsigned RegID) const; Index: llvm/trunk/tools/llvm-mca/RegisterFile.cpp =================================================================== --- llvm/trunk/tools/llvm-mca/RegisterFile.cpp +++ llvm/trunk/tools/llvm-mca/RegisterFile.cpp @@ -26,7 +26,8 @@ RegisterFile::RegisterFile(const llvm::MCSchedModel &SM, const llvm::MCRegisterInfo &mri, unsigned NumRegs) - : MRI(mri), RegisterMappings(mri.getNumRegs(), {WriteRef(), {0, 0}}) { + : MRI(mri), RegisterMappings(mri.getNumRegs(), + {WriteRef(), {IndexPlusCostPairTy(0, 1), 0}}) { initialize(SM, NumRegs); } @@ -71,34 +72,46 @@ // Special case where there is no register class identifier in the set. // An empty set of register classes means: this register file contains all // the physical registers specified by the target. - if (Entries.empty()) { - for (std::pair &Mapping : RegisterMappings) - Mapping.second = std::make_pair(RegisterFileIndex, 1U); + // We optimistically assume that a register can be renamed at the cost of a + // single physical register. The constructor of RegisterFile ensures that + // a RegisterMapping exists for each logical register defined by the Target. + if (Entries.empty()) return; - } // Now update the cost of individual registers. for (const MCRegisterCostEntry &RCE : Entries) { const MCRegisterClass &RC = MRI.getRegClass(RCE.RegisterClassID); for (const MCPhysReg Reg : RC) { - IndexPlusCostPairTy &Entry = RegisterMappings[Reg].second; - if (Entry.first) { + RegisterRenamingInfo &Entry = RegisterMappings[Reg].second; + IndexPlusCostPairTy &IPC = Entry.IndexPlusCost; + if (IPC.first && IPC.first != RegisterFileIndex) { // The only register file that is allowed to overlap is the default // register file at index #0. The analysis is inaccurate if register // files overlap. errs() << "warning: register " << MRI.getName(Reg) << " defined in multiple register files."; } - Entry.first = RegisterFileIndex; - Entry.second = RCE.Cost; + IPC = std::make_pair(RegisterFileIndex, RCE.Cost); + Entry.RenameAs = Reg; + + // Assume the same cost for each sub-register. + for (MCSubRegIterator I(Reg, &MRI); I.isValid(); ++I) { + RegisterRenamingInfo &OtherEntry = RegisterMappings[*I].second; + if (!OtherEntry.IndexPlusCost.first && + (!OtherEntry.RenameAs || + MRI.isSuperRegister(*I, OtherEntry.RenameAs))) { + OtherEntry.IndexPlusCost = IPC; + OtherEntry.RenameAs = Reg; + } + } } } } -void RegisterFile::allocatePhysRegs(IndexPlusCostPairTy Entry, +void RegisterFile::allocatePhysRegs(const RegisterRenamingInfo &Entry, MutableArrayRef UsedPhysRegs) { - unsigned RegisterFileIndex = Entry.first; - unsigned Cost = Entry.second; + unsigned RegisterFileIndex = Entry.IndexPlusCost.first; + unsigned Cost = Entry.IndexPlusCost.second; if (RegisterFileIndex) { RegisterMappingTracker &RMT = RegisterFiles[RegisterFileIndex]; RMT.NumUsedPhysRegs += Cost; @@ -110,10 +123,10 @@ UsedPhysRegs[0] += Cost; } -void RegisterFile::freePhysRegs(IndexPlusCostPairTy Entry, +void RegisterFile::freePhysRegs(const RegisterRenamingInfo &Entry, MutableArrayRef FreedPhysRegs) { - unsigned RegisterFileIndex = Entry.first; - unsigned Cost = Entry.second; + unsigned RegisterFileIndex = Entry.IndexPlusCost.first; + unsigned Cost = Entry.IndexPlusCost.second; if (RegisterFileIndex) { RegisterMappingTracker &RMT = RegisterFiles[RegisterFileIndex]; RMT.NumUsedPhysRegs -= Cost; @@ -128,12 +141,48 @@ void RegisterFile::addRegisterWrite(WriteRef Write, MutableArrayRef UsedPhysRegs, bool ShouldAllocatePhysRegs) { - const WriteState &WS = *Write.getWriteState(); + WriteState &WS = *Write.getWriteState(); unsigned RegID = WS.getRegisterID(); assert(RegID && "Adding an invalid register definition?"); - RegisterMapping &Mapping = RegisterMappings[RegID]; - Mapping.first = Write; + LLVM_DEBUG({ + dbgs() << "RegisterFile: addRegisterWrite [ " << Write.getSourceIndex() + << ", " << MRI.getName(RegID) << "]\n"; + }); + + // If RenameAs is equal to RegID, then RegID is subject to register renaming + // and false dependencies on RegID are all eliminated. + + // If RenameAs references the invalid register, then we optimistically assume + // that it can be renamed. In the absence of tablegen descriptors for register + // files, RenameAs is always set to the invalid register ID. In all other + // cases, RenameAs must be either equal to RegID, or it must reference a + // super-register of RegID. + + // If RenameAs is a super-register of RegID, then a write to RegID has always + // a false dependency on RenameAs. The only exception is for when the write + // implicitly clears the upper portion of the underlying register. + // If a write clears its super-registers, then it is renamed as `RenameAs`. + const RegisterRenamingInfo &RRI = RegisterMappings[RegID].second; + if (RRI.RenameAs && RRI.RenameAs != RegID) { + RegID = RRI.RenameAs; + const WriteRef &OtherWrite = RegisterMappings[RegID].first; + + if (!WS.clearsSuperRegisters()) { + // The processor keeps the definition of `RegID` together with register + // `RenameAs`. Since this partial write is not renamed, no physical + // register is allocated. + ShouldAllocatePhysRegs = false; + + if (OtherWrite.getSourceIndex() != Write.getSourceIndex()) { + // This partial write has a false dependency on RenameAs. + WS.setDependentWrite(OtherWrite.getWriteState()); + } + } + } + + // Update the mapping for register RegID including its sub-registers. + RegisterMappings[RegID].first = Write; for (MCSubRegIterator I(RegID, &MRI); I.isValid(); ++I) RegisterMappings[*I].first = Write; @@ -141,9 +190,8 @@ // hardware. For example, zero-latency data-dependency breaking instructions // don't consume physical registers. if (ShouldAllocatePhysRegs) - allocatePhysRegs(Mapping.second, UsedPhysRegs); + allocatePhysRegs(RegisterMappings[RegID].second, UsedPhysRegs); - // If this is a partial update, then we are done. if (!WS.clearsSuperRegisters()) return; @@ -155,42 +203,50 @@ MutableArrayRef FreedPhysRegs, bool ShouldFreePhysRegs) { unsigned RegID = WS.getRegisterID(); - bool ShouldInvalidateSuperRegs = WS.clearsSuperRegisters(); assert(RegID != 0 && "Invalidating an already invalid register?"); - assert(WS.getCyclesLeft() != -512 && + assert(WS.getCyclesLeft() != UNKNOWN_CYCLES && "Invalidating a write of unknown cycles!"); assert(WS.getCyclesLeft() <= 0 && "Invalid cycles left for this write!"); - RegisterMapping &Mapping = RegisterMappings[RegID]; - WriteRef &WR = Mapping.first; - if (!WR.isValid()) - return; + + unsigned RenameAs = RegisterMappings[RegID].second.RenameAs; + if (RenameAs && RenameAs != RegID) { + RegID = RenameAs; + + if (!WS.clearsSuperRegisters()) { + // Keep the definition of `RegID` together with register `RenameAs`. + ShouldFreePhysRegs = false; + } + } if (ShouldFreePhysRegs) - freePhysRegs(Mapping.second, FreedPhysRegs); + freePhysRegs(RegisterMappings[RegID].second, FreedPhysRegs); + WriteRef &WR = RegisterMappings[RegID].first; if (WR.getWriteState() == &WS) WR.invalidate(); for (MCSubRegIterator I(RegID, &MRI); I.isValid(); ++I) { - WR = RegisterMappings[*I].first; - if (WR.getWriteState() == &WS) - WR.invalidate(); + WriteRef &OtherWR = RegisterMappings[*I].first; + if (OtherWR.getWriteState() == &WS) + OtherWR.invalidate(); } - if (!ShouldInvalidateSuperRegs) + if (!WS.clearsSuperRegisters()) return; for (MCSuperRegIterator I(RegID, &MRI); I.isValid(); ++I) { - WR = RegisterMappings[*I].first; - if (WR.getWriteState() == &WS) - WR.invalidate(); + WriteRef &OtherWR = RegisterMappings[*I].first; + if (OtherWR.getWriteState() == &WS) + OtherWR.invalidate(); } } void RegisterFile::collectWrites(SmallVectorImpl &Writes, unsigned RegID) const { assert(RegID && RegID < RegisterMappings.size()); + LLVM_DEBUG(dbgs() << "RegisterFile: collecting writes for register " + << MRI.getName(RegID) << '\n'); const WriteRef &WR = RegisterMappings[RegID].first; if (WR.isValid()) Writes.push_back(WR); @@ -225,7 +281,8 @@ // Find how many new mappings must be created for each register file. for (const unsigned RegID : Regs) { - const IndexPlusCostPairTy &Entry = RegisterMappings[RegID].second; + const RegisterRenamingInfo &RRI = RegisterMappings[RegID].second; + const IndexPlusCostPairTy &Entry = RRI.IndexPlusCost; if (Entry.first) NumPhysRegs[Entry.first] += Entry.second; NumPhysRegs[0] += Entry.second; @@ -266,10 +323,10 @@ const RegisterMapping &RM = RegisterMappings[I]; if (!RM.first.getWriteState()) continue; - const std::pair &IndexPlusCost = RM.second; - dbgs() << MRI.getName(I) << ", " << I << ", PRF=" << IndexPlusCost.first - << ", Cost=" << IndexPlusCost.second - << ", "; + const RegisterRenamingInfo &RRI = RM.second; + dbgs() << MRI.getName(I) << ", " << I << ", PRF=" << RRI.IndexPlusCost.first + << ", Cost=" << RRI.IndexPlusCost.second + << ", RenameAs=" << RRI.RenameAs << ", "; RM.first.dump(); dbgs() << '\n'; }