Index: include/llvm/CodeGen/TargetSubtargetInfo.h =================================================================== --- include/llvm/CodeGen/TargetSubtargetInfo.h +++ include/llvm/CodeGen/TargetSubtargetInfo.h @@ -169,6 +169,21 @@ return isZeroIdiom(MI, Mask); } + /// Returns true if MI is a candidate for move elimination. + /// + /// A candidate for move elimination may be optimized out at register renaming + /// stage. Subtargets can specify the set of optimizable moves by + /// instantiating tablegen class `IsOptimizableMoveFunction` (see + /// llvm/Target/TargetInstrPredicate.td). + /// + /// SubtargetEmitter is responsible for processing all the + /// IsOptimizableMoveFunction definitions, and auto-generate an override for + /// this method in a target specific tablegen'd TargetSubtargetInfo derived + /// class. + virtual bool isOptimizableRegisterMove(const MachineInstr *MI) const { + return false; + } + /// True if the subtarget should run MachineScheduler after aggressive /// coalescing. /// Index: include/llvm/MC/MCInstrAnalysis.h =================================================================== --- include/llvm/MC/MCInstrAnalysis.h +++ include/llvm/MC/MCInstrAnalysis.h @@ -136,6 +136,17 @@ return isZeroIdiom(MI, Mask, CPUID); } + /// Returns true if MI is a candidate for move elimination. + /// + /// Different subtargets may apply different constraints to optimizable + /// register moves. For example, on most X86 subtargets, a candidate for move + /// elimination cannot specify the same register for both source and + /// destination. + virtual bool isOptimizableRegisterMove(const MCInst &MI, + unsigned CPUID) const { + return false; + } + /// Given a branch instruction try to get the address the branch /// targets. Return true on success, and the address in Target. 
virtual bool Index: include/llvm/MC/MCSchedule.h =================================================================== --- include/llvm/MC/MCSchedule.h +++ include/llvm/MC/MCSchedule.h @@ -142,6 +142,7 @@ struct MCRegisterCostEntry { unsigned RegisterClassID; unsigned Cost; + bool AllowMoveElimination; }; /// A register file descriptor. @@ -159,6 +160,13 @@ uint16_t NumRegisterCostEntries; // Index of the first cost entry in MCExtraProcessorInfo::RegisterCostTable. uint16_t RegisterCostEntryIdx; + // A value of zero means: there is no limit in the number of moves that can be + // eliminated every cycle. + uint16_t MaxMovesEliminatedPerCycle; + // On some processor register files, moves can only be eliminated if the + // source register operand is known to be zero. + // This flag is set if the PRF only knows how to optimize zero register moves. + bool AllowZeroMoveEliminationOnly; }; /// Provide extra details about the machine processor. Index: include/llvm/Target/TargetInstrPredicate.td =================================================================== --- include/llvm/Target/TargetInstrPredicate.td +++ include/llvm/Target/TargetInstrPredicate.td @@ -328,3 +328,8 @@ class IsDepBreakingFunction classes> : STIPredicate; + + +def IsOptimizableMoveDecl : STIPredicateDecl<"isOptimizableRegisterMove">; +class IsOptimizableMoveFunction classes> + : STIPredicate; Index: include/llvm/Target/TargetSchedule.td =================================================================== --- include/llvm/Target/TargetSchedule.td +++ include/llvm/Target/TargetSchedule.td @@ -460,6 +460,10 @@ // - The number of physical registers which can be used for register renaming // purpose. // - The cost of a register rename. +// - The set of registers that allow move elimination +// - The maximum number of moves that can be eliminated every cycle. +// - Whether move elimination is limited to register moves whose input +// is known to be zero. 
// // The cost of a rename is the number of physical registers allocated by the // register alias table to map the new definition. By default, register can be @@ -506,11 +510,35 @@ // partial write is combined with the previous super-register definition. We // should add support for these cases, and correctly model merge problems with // partial register accesses. +// +// Field MaxMovesEliminatedPerCycle specifies how many moves can be eliminated +// every cycle. A default value of zero for that field means: there is no limit +// to the number of moves that can be eliminated by this register file. +// +// An instruction MI is a candidate for move elimination if a call to +// method TargetSubtargetInfo::isOptimizableRegisterMove(MI) returns true (see +// llvm/CodeGen/TargetSubtargetInfo.h, and llvm/MC/MCInstrAnalysis.h). +// +// Subtargets can instantiate tablegen class IsOptimizableMoveFunction (see +// llvm/Target/TargetInstrPredicate.td) to customize the set of move elimination +// candidates. By default, no instruction is a valid move elimination candidate. +// +// A register move MI is eliminated only if: +// - MI is a move elimination candidate +// - The destination register is from a register class that allows move +// elimination (see field `AllowMoveElimination` below). +// - Constraints on the move kind, and the maximum number of moves that can be +// eliminated per cycle are all met. 
+ class RegisterFile Classes = [], - list Costs = []> { + list Costs = [], list AllowMoveElim = [], + int MaxMoveElimPerCy = 0, bit AllowZeroMoveElimOnly = 0> { list RegClasses = Classes; list RegCosts = Costs; + list AllowMoveElimination = AllowMoveElim; int NumPhysRegs = numPhysRegs; + int MaxMovesEliminatedPerCycle = MaxMoveElimPerCy; + bit AllowZeroMoveEliminationOnly = AllowZeroMoveElimOnly; SchedMachineModel SchedModel = ?; } Index: lib/Target/X86/X86ScheduleBtVer2.td =================================================================== --- lib/Target/X86/X86ScheduleBtVer2.td +++ lib/Target/X86/X86ScheduleBtVer2.td @@ -53,7 +53,17 @@ // The Jaguar FP Retire Queue renames SIMD and FP uOps onto a pool of 72 SSE // registers. Operations on 256-bit data types are cracked into two COPs. // Reference: www.realworldtech.com/jaguar/4/ -def JFpuPRF: RegisterFile<72, [VR64, VR128, VR256], [1, 1, 2]>; + +// The PRF in the floating point unit can eliminate a move from a MMX or SSE +// register that is known to be zero (i.e. it has been zeroed using a zero-idiom +// dependency breaking instruction, or via VZEROALL). +// Reference: Section 21.8 "AMD Bobcat and Jaguar pipeline: Dependency-breaking +// instructions" - Agner Fog's "microarchitecture.pdf" +def JFpuPRF: RegisterFile<72, [VR64, VR128, VR256], [1, 1, 2], + // AllowMoveElimination only for VR64 and VR128. + [1, 1, 0], + 0, // Unlimited number of moves eliminated per cycle. + 1>; // Only allow zero move elimination. // The retire control unit (RCU) can track up to 64 macro-ops in-flight. It can // retire up to two macro-ops per cycle. @@ -805,4 +815,21 @@ ], ZeroIdiomPredicate> ]>; +def : IsOptimizableMoveFunction<[ + InstructionEquivalenceClass<[ + // MMX variants. + MMX_MOVQ64rr, + + // SSE variants. + MOVAPSrr, MOVUPSrr, + MOVAPDrr, MOVUPDrr, + MOVDQArr, MOVDQUrr, + + // AVX variants. 
+ VMOVAPSrr, VMOVUPSrr, + VMOVAPDrr, VMOVUPDrr, + VMOVDQArr, VMOVDQUrr + ], CheckNot> > +]>; + } // SchedModel Index: test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-1.s =================================================================== --- test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-1.s +++ test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-1.s @@ -32,13 +32,13 @@ # CHECK-NEXT: 1 3 1.00 vaddps %xmm1, %xmm1, %xmm2 # CHECK: Register File statistics: -# CHECK-NEXT: Total number of mappings created: 6 -# CHECK-NEXT: Max number of mappings used: 5 +# CHECK-NEXT: Total number of mappings created: 3 +# CHECK-NEXT: Max number of mappings used: 3 # CHECK: * Register File #1 -- JFpuPRF: # CHECK-NEXT: Number of physical registers: 72 -# CHECK-NEXT: Total number of mappings created: 6 -# CHECK-NEXT: Max number of mappings used: 5 +# CHECK-NEXT: Total number of mappings created: 3 +# CHECK-NEXT: Max number of mappings used: 3 # CHECK: * Register File #2 -- JIntegerPRF: # CHECK-NEXT: Number of physical registers: 64 @@ -63,25 +63,25 @@ # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] -# CHECK-NEXT: - - - 1.00 1.00 1.00 1.00 - - - - - - - +# CHECK-NEXT: - - - 1.00 - 1.00 - - - - - - - - # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions: # CHECK-NEXT: - - - - - - - - - - - - - - vxorps %xmm0, %xmm0, %xmm0 -# CHECK-NEXT: - - - - 1.00 - 1.00 - - - - - - - vmovaps %xmm0, %xmm1 +# CHECK-NEXT: - - - - - - - - - - - - - - vmovaps %xmm0, %xmm1 # CHECK-NEXT: - - - 1.00 - 1.00 - - - - - - - - vaddps %xmm1, %xmm1, %xmm2 # CHECK: Timeline view: # CHECK-NEXT: Index 0123456789 # CHECK: [0,0] DR . . vxorps %xmm0, %xmm0, %xmm0 -# CHECK-NEXT: [0,1] DeER . . vmovaps %xmm0, %xmm1 +# CHECK-NEXT: [0,1] DR . . vmovaps %xmm0, %xmm1 # CHECK-NEXT: [0,2] .DeeeER . vaddps %xmm1, %xmm1, %xmm2 # CHECK-NEXT: [1,0] .D----R . 
vxorps %xmm0, %xmm0, %xmm0 -# CHECK-NEXT: [1,1] . DeE--R . vmovaps %xmm0, %xmm1 -# CHECK-NEXT: [1,2] . D=eeeER. vaddps %xmm1, %xmm1, %xmm2 +# CHECK-NEXT: [1,1] . D----R . vmovaps %xmm0, %xmm1 +# CHECK-NEXT: [1,2] . DeeeER . vaddps %xmm1, %xmm1, %xmm2 # CHECK-NEXT: [2,0] . D----R. vxorps %xmm0, %xmm0, %xmm0 -# CHECK-NEXT: [2,1] . DeE---R vmovaps %xmm0, %xmm1 +# CHECK-NEXT: [2,1] . D----R. vmovaps %xmm0, %xmm1 # CHECK-NEXT: [2,2] . DeeeER vaddps %xmm1, %xmm1, %xmm2 # CHECK: Average Wait times (based on the timeline view): @@ -92,5 +92,5 @@ # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 3 0.0 0.0 2.7 vxorps %xmm0, %xmm0, %xmm0 -# CHECK-NEXT: 1. 3 1.0 1.0 1.7 vmovaps %xmm0, %xmm1 -# CHECK-NEXT: 2. 3 1.3 0.0 0.0 vaddps %xmm1, %xmm1, %xmm2 +# CHECK-NEXT: 1. 3 0.0 0.0 2.7 vmovaps %xmm0, %xmm1 +# CHECK-NEXT: 2. 3 1.0 1.0 0.0 vaddps %xmm1, %xmm1, %xmm2 Index: test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-2.s =================================================================== --- test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-2.s +++ test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-2.s @@ -14,12 +14,12 @@ # CHECK: Iterations: 3 # CHECK-NEXT: Instructions: 27 -# CHECK-NEXT: Total Cycles: 19 +# CHECK-NEXT: Total Cycles: 15 # CHECK-NEXT: Total uOps: 27 # CHECK: Dispatch Width: 2 -# CHECK-NEXT: uOps Per Cycle: 1.42 -# CHECK-NEXT: IPC: 1.42 +# CHECK-NEXT: uOps Per Cycle: 1.80 +# CHECK-NEXT: IPC: 1.80 # CHECK-NEXT: Block RThroughput: 4.5 # CHECK: Instruction Info: @@ -42,13 +42,13 @@ # CHECK-NEXT: 1 1 0.50 movdqu %xmm5, %xmm0 # CHECK: Register File statistics: -# CHECK-NEXT: Total number of mappings created: 21 -# CHECK-NEXT: Max number of mappings used: 8 +# CHECK-NEXT: Total number of mappings created: 0 +# CHECK-NEXT: Max number of mappings used: 0 # CHECK: * Register File #1 -- JFpuPRF: # CHECK-NEXT: Number of physical registers: 72 -# CHECK-NEXT: Total number of mappings created: 21 -# CHECK-NEXT: Max number of mappings used: 8 +# CHECK-NEXT: Total number of mappings 
created: 0 +# CHECK-NEXT: Max number of mappings used: 0 # CHECK: * Register File #2 -- JIntegerPRF: # CHECK-NEXT: Number of physical registers: 64 @@ -73,51 +73,51 @@ # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] -# CHECK-NEXT: - - - 2.00 2.00 3.33 3.67 - - - - 1.33 1.67 - +# CHECK-NEXT: - - - - - - - - - - - - - - # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions: # CHECK-NEXT: - - - - - - - - - - - - - - pxor %mm0, %mm0 -# CHECK-NEXT: - - - - - - 1.00 - - - - - 1.00 - movq %mm0, %mm1 +# CHECK-NEXT: - - - - - - - - - - - - - - movq %mm0, %mm1 # CHECK-NEXT: - - - - - - - - - - - - - - xorps %xmm0, %xmm0 -# CHECK-NEXT: - - - - 1.00 0.33 0.67 - - - - - - - movaps %xmm0, %xmm1 -# CHECK-NEXT: - - - 1.00 - 0.33 0.67 - - - - - - - movups %xmm1, %xmm2 -# CHECK-NEXT: - - - - 1.00 0.67 0.33 - - - - - - - movapd %xmm2, %xmm3 -# CHECK-NEXT: - - - 1.00 - 0.33 0.67 - - - - - - - movupd %xmm3, %xmm4 -# CHECK-NEXT: - - - - - 1.00 - - - - - 1.00 - - movdqa %xmm4, %xmm5 -# CHECK-NEXT: - - - - - 0.67 0.33 - - - - 0.33 0.67 - movdqu %xmm5, %xmm0 +# CHECK-NEXT: - - - - - - - - - - - - - - movaps %xmm0, %xmm1 +# CHECK-NEXT: - - - - - - - - - - - - - - movups %xmm1, %xmm2 +# CHECK-NEXT: - - - - - - - - - - - - - - movapd %xmm2, %xmm3 +# CHECK-NEXT: - - - - - - - - - - - - - - movupd %xmm3, %xmm4 +# CHECK-NEXT: - - - - - - - - - - - - - - movdqa %xmm4, %xmm5 +# CHECK-NEXT: - - - - - - - - - - - - - - movdqu %xmm5, %xmm0 # CHECK: Timeline view: -# CHECK-NEXT: 012345678 +# CHECK-NEXT: 01234 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DR . . . . pxor %mm0, %mm0 -# CHECK-NEXT: [0,1] DeER . . . . movq %mm0, %mm1 -# CHECK-NEXT: [0,2] .D-R . . . . xorps %xmm0, %xmm0 -# CHECK-NEXT: [0,3] .DeER. . . . movaps %xmm0, %xmm1 -# CHECK-NEXT: [0,4] . DeER . . . movups %xmm1, %xmm2 -# CHECK-NEXT: [0,5] . D=eER . . . 
movapd %xmm2, %xmm3 -# CHECK-NEXT: [0,6] . D=eER . . . movupd %xmm3, %xmm4 -# CHECK-NEXT: [0,7] . D==eER . . . movdqa %xmm4, %xmm5 -# CHECK-NEXT: [0,8] . D==eER. . . movdqu %xmm5, %xmm0 -# CHECK-NEXT: [1,0] . D----R. . . pxor %mm0, %mm0 -# CHECK-NEXT: [1,1] . DeE--R . . movq %mm0, %mm1 -# CHECK-NEXT: [1,2] . D----R . . xorps %xmm0, %xmm0 -# CHECK-NEXT: [1,3] . .DeE--R . . movaps %xmm0, %xmm1 -# CHECK-NEXT: [1,4] . .D=eE-R . . movups %xmm1, %xmm2 -# CHECK-NEXT: [1,5] . . D=eE-R . . movapd %xmm2, %xmm3 -# CHECK-NEXT: [1,6] . . D==eER . . movupd %xmm3, %xmm4 -# CHECK-NEXT: [1,7] . . D==eER . . movdqa %xmm4, %xmm5 -# CHECK-NEXT: [1,8] . . D===eER. . movdqu %xmm5, %xmm0 -# CHECK-NEXT: [2,0] . . D----R. . pxor %mm0, %mm0 -# CHECK-NEXT: [2,1] . . DeE---R . movq %mm0, %mm1 -# CHECK-NEXT: [2,2] . . D----R . xorps %xmm0, %xmm0 -# CHECK-NEXT: [2,3] . . DeE---R . movaps %xmm0, %xmm1 -# CHECK-NEXT: [2,4] . . .DeE--R . movups %xmm1, %xmm2 -# CHECK-NEXT: [2,5] . . .D=eE--R. movapd %xmm2, %xmm3 -# CHECK-NEXT: [2,6] . . . D=eE-R. movupd %xmm3, %xmm4 -# CHECK-NEXT: [2,7] . . . D==eE-R movdqa %xmm4, %xmm5 -# CHECK-NEXT: [2,8] . . . D==eER movdqu %xmm5, %xmm0 +# CHECK: [0,0] DR . . . pxor %mm0, %mm0 +# CHECK-NEXT: [0,1] DR . . . movq %mm0, %mm1 +# CHECK-NEXT: [0,2] .DR . . . xorps %xmm0, %xmm0 +# CHECK-NEXT: [0,3] .DR . . . movaps %xmm0, %xmm1 +# CHECK-NEXT: [0,4] . DR . . . movups %xmm1, %xmm2 +# CHECK-NEXT: [0,5] . DR . . . movapd %xmm2, %xmm3 +# CHECK-NEXT: [0,6] . DR. . . movupd %xmm3, %xmm4 +# CHECK-NEXT: [0,7] . DR. . . movdqa %xmm4, %xmm5 +# CHECK-NEXT: [0,8] . DR . . movdqu %xmm5, %xmm0 +# CHECK-NEXT: [1,0] . DR . . pxor %mm0, %mm0 +# CHECK-NEXT: [1,1] . DR . . movq %mm0, %mm1 +# CHECK-NEXT: [1,2] . DR . . xorps %xmm0, %xmm0 +# CHECK-NEXT: [1,3] . .DR . . movaps %xmm0, %xmm1 +# CHECK-NEXT: [1,4] . .DR . . movups %xmm1, %xmm2 +# CHECK-NEXT: [1,5] . . DR . . movapd %xmm2, %xmm3 +# CHECK-NEXT: [1,6] . . DR . . movupd %xmm3, %xmm4 +# CHECK-NEXT: [1,7] . . DR. . 
movdqa %xmm4, %xmm5 +# CHECK-NEXT: [1,8] . . DR. . movdqu %xmm5, %xmm0 +# CHECK-NEXT: [2,0] . . DR . pxor %mm0, %mm0 +# CHECK-NEXT: [2,1] . . DR . movq %mm0, %mm1 +# CHECK-NEXT: [2,2] . . DR . xorps %xmm0, %xmm0 +# CHECK-NEXT: [2,3] . . DR . movaps %xmm0, %xmm1 +# CHECK-NEXT: [2,4] . . .DR . movups %xmm1, %xmm2 +# CHECK-NEXT: [2,5] . . .DR . movapd %xmm2, %xmm3 +# CHECK-NEXT: [2,6] . . . DR. movupd %xmm3, %xmm4 +# CHECK-NEXT: [2,7] . . . DR. movdqa %xmm4, %xmm5 +# CHECK-NEXT: [2,8] . . . DR movdqu %xmm5, %xmm0 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -126,12 +126,12 @@ # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage # CHECK: [0] [1] [2] [3] -# CHECK-NEXT: 0. 3 0.0 0.0 2.7 pxor %mm0, %mm0 -# CHECK-NEXT: 1. 3 1.0 1.0 1.7 movq %mm0, %mm1 -# CHECK-NEXT: 2. 3 0.0 0.0 3.0 xorps %xmm0, %xmm0 -# CHECK-NEXT: 3. 3 1.0 1.0 1.7 movaps %xmm0, %xmm1 -# CHECK-NEXT: 4. 3 1.3 0.0 1.0 movups %xmm1, %xmm2 -# CHECK-NEXT: 5. 3 2.0 0.0 1.0 movapd %xmm2, %xmm3 -# CHECK-NEXT: 6. 3 2.3 0.0 0.3 movupd %xmm3, %xmm4 -# CHECK-NEXT: 7. 3 3.0 0.0 0.3 movdqa %xmm4, %xmm5 -# CHECK-NEXT: 8. 3 3.3 0.0 0.0 movdqu %xmm5, %xmm0 +# CHECK-NEXT: 0. 3 0.0 0.0 0.0 pxor %mm0, %mm0 +# CHECK-NEXT: 1. 3 0.0 0.0 0.0 movq %mm0, %mm1 +# CHECK-NEXT: 2. 3 0.0 0.0 0.0 xorps %xmm0, %xmm0 +# CHECK-NEXT: 3. 3 0.0 0.0 0.0 movaps %xmm0, %xmm1 +# CHECK-NEXT: 4. 3 0.0 0.0 0.0 movups %xmm1, %xmm2 +# CHECK-NEXT: 5. 3 0.0 0.0 0.0 movapd %xmm2, %xmm3 +# CHECK-NEXT: 6. 3 0.0 0.0 0.0 movupd %xmm3, %xmm4 +# CHECK-NEXT: 7. 3 0.0 0.0 0.0 movdqa %xmm4, %xmm5 +# CHECK-NEXT: 8. 
3 0.0 0.0 0.0 movdqu %xmm5, %xmm0 Index: test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-3.s =================================================================== --- test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-3.s +++ test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-3.s @@ -11,12 +11,12 @@ # CHECK: Iterations: 3 # CHECK-NEXT: Instructions: 21 -# CHECK-NEXT: Total Cycles: 16 +# CHECK-NEXT: Total Cycles: 12 # CHECK-NEXT: Total uOps: 21 # CHECK: Dispatch Width: 2 -# CHECK-NEXT: uOps Per Cycle: 1.31 -# CHECK-NEXT: IPC: 1.31 +# CHECK-NEXT: uOps Per Cycle: 1.75 +# CHECK-NEXT: IPC: 1.75 # CHECK-NEXT: Block RThroughput: 3.5 # CHECK: Instruction Info: @@ -37,13 +37,13 @@ # CHECK-NEXT: 1 1 0.50 vmovdqu %xmm5, %xmm0 # CHECK: Register File statistics: -# CHECK-NEXT: Total number of mappings created: 18 -# CHECK-NEXT: Max number of mappings used: 9 +# CHECK-NEXT: Total number of mappings created: 0 +# CHECK-NEXT: Max number of mappings used: 0 # CHECK: * Register File #1 -- JFpuPRF: # CHECK-NEXT: Number of physical registers: 72 -# CHECK-NEXT: Total number of mappings created: 18 -# CHECK-NEXT: Max number of mappings used: 9 +# CHECK-NEXT: Total number of mappings created: 0 +# CHECK-NEXT: Max number of mappings used: 0 # CHECK: * Register File #2 -- JIntegerPRF: # CHECK-NEXT: Number of physical registers: 64 @@ -68,43 +68,43 @@ # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] -# CHECK-NEXT: - - - 2.00 2.00 3.00 3.00 - - - - 1.00 1.00 - +# CHECK-NEXT: - - - - - - - - - - - - - - # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions: # CHECK-NEXT: - - - - - - - - - - - - - - vxorps %xmm0, %xmm0, %xmm0 -# CHECK-NEXT: - - - - 1.00 0.33 0.67 - - - - - - - vmovaps %xmm0, %xmm1 -# CHECK-NEXT: - - - 1.00 - 0.67 0.33 - - - - - - - vmovups %xmm1, %xmm2 -# CHECK-NEXT: - - - - 1.00 - 1.00 - - - - - - - vmovapd %xmm2, %xmm3 -# 
CHECK-NEXT: - - - 1.00 - 1.00 - - - - - - - - vmovupd %xmm3, %xmm4 -# CHECK-NEXT: - - - - - 0.33 0.67 - - - - - 1.00 - vmovdqa %xmm4, %xmm5 -# CHECK-NEXT: - - - - - 0.67 0.33 - - - - 1.00 - - vmovdqu %xmm5, %xmm0 +# CHECK-NEXT: - - - - - - - - - - - - - - vmovaps %xmm0, %xmm1 +# CHECK-NEXT: - - - - - - - - - - - - - - vmovups %xmm1, %xmm2 +# CHECK-NEXT: - - - - - - - - - - - - - - vmovapd %xmm2, %xmm3 +# CHECK-NEXT: - - - - - - - - - - - - - - vmovupd %xmm3, %xmm4 +# CHECK-NEXT: - - - - - - - - - - - - - - vmovdqa %xmm4, %xmm5 +# CHECK-NEXT: - - - - - - - - - - - - - - vmovdqu %xmm5, %xmm0 # CHECK: Timeline view: -# CHECK-NEXT: 012345 +# CHECK-NEXT: 01 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DR . . . vxorps %xmm0, %xmm0, %xmm0 -# CHECK-NEXT: [0,1] DeER . . . vmovaps %xmm0, %xmm1 -# CHECK-NEXT: [0,2] .DeER. . . vmovups %xmm1, %xmm2 -# CHECK-NEXT: [0,3] .D=eER . . vmovapd %xmm2, %xmm3 -# CHECK-NEXT: [0,4] . D=eER . . vmovupd %xmm3, %xmm4 -# CHECK-NEXT: [0,5] . D==eER . . vmovdqa %xmm4, %xmm5 -# CHECK-NEXT: [0,6] . D==eER . . vmovdqu %xmm5, %xmm0 -# CHECK-NEXT: [1,0] . D----R . . vxorps %xmm0, %xmm0, %xmm0 -# CHECK-NEXT: [1,1] . DeE--R. . vmovaps %xmm0, %xmm1 -# CHECK-NEXT: [1,2] . D=eE-R. . vmovups %xmm1, %xmm2 -# CHECK-NEXT: [1,3] . D=eE-R . vmovapd %xmm2, %xmm3 -# CHECK-NEXT: [1,4] . D==eER . vmovupd %xmm3, %xmm4 -# CHECK-NEXT: [1,5] . .D==eER . vmovdqa %xmm4, %xmm5 -# CHECK-NEXT: [1,6] . .D===eER . vmovdqu %xmm5, %xmm0 -# CHECK-NEXT: [2,0] . . D----R . vxorps %xmm0, %xmm0, %xmm0 -# CHECK-NEXT: [2,1] . . DeE---R . vmovaps %xmm0, %xmm1 -# CHECK-NEXT: [2,2] . . DeE--R . vmovups %xmm1, %xmm2 -# CHECK-NEXT: [2,3] . . D=eE--R. vmovapd %xmm2, %xmm3 -# CHECK-NEXT: [2,4] . . D=eE-R. vmovupd %xmm3, %xmm4 -# CHECK-NEXT: [2,5] . . D==eE-R vmovdqa %xmm4, %xmm5 -# CHECK-NEXT: [2,6] . . D==eER vmovdqu %xmm5, %xmm0 +# CHECK: [0,0] DR . .. vxorps %xmm0, %xmm0, %xmm0 +# CHECK-NEXT: [0,1] DR . .. vmovaps %xmm0, %xmm1 +# CHECK-NEXT: [0,2] .DR . .. 
vmovups %xmm1, %xmm2 +# CHECK-NEXT: [0,3] .DR . .. vmovapd %xmm2, %xmm3 +# CHECK-NEXT: [0,4] . DR . .. vmovupd %xmm3, %xmm4 +# CHECK-NEXT: [0,5] . DR . .. vmovdqa %xmm4, %xmm5 +# CHECK-NEXT: [0,6] . DR. .. vmovdqu %xmm5, %xmm0 +# CHECK-NEXT: [1,0] . DR. .. vxorps %xmm0, %xmm0, %xmm0 +# CHECK-NEXT: [1,1] . DR .. vmovaps %xmm0, %xmm1 +# CHECK-NEXT: [1,2] . DR .. vmovups %xmm1, %xmm2 +# CHECK-NEXT: [1,3] . DR .. vmovapd %xmm2, %xmm3 +# CHECK-NEXT: [1,4] . DR .. vmovupd %xmm3, %xmm4 +# CHECK-NEXT: [1,5] . .DR .. vmovdqa %xmm4, %xmm5 +# CHECK-NEXT: [1,6] . .DR .. vmovdqu %xmm5, %xmm0 +# CHECK-NEXT: [2,0] . . DR .. vxorps %xmm0, %xmm0, %xmm0 +# CHECK-NEXT: [2,1] . . DR .. vmovaps %xmm0, %xmm1 +# CHECK-NEXT: [2,2] . . DR.. vmovups %xmm1, %xmm2 +# CHECK-NEXT: [2,3] . . DR.. vmovapd %xmm2, %xmm3 +# CHECK-NEXT: [2,4] . . DR. vmovupd %xmm3, %xmm4 +# CHECK-NEXT: [2,5] . . DR. vmovdqa %xmm4, %xmm5 +# CHECK-NEXT: [2,6] . . DR vmovdqu %xmm5, %xmm0 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -113,10 +113,10 @@ # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage # CHECK: [0] [1] [2] [3] -# CHECK-NEXT: 0. 3 0.0 0.0 2.7 vxorps %xmm0, %xmm0, %xmm0 -# CHECK-NEXT: 1. 3 1.0 1.0 1.7 vmovaps %xmm0, %xmm1 -# CHECK-NEXT: 2. 3 1.3 0.0 1.0 vmovups %xmm1, %xmm2 -# CHECK-NEXT: 3. 3 2.0 0.0 1.0 vmovapd %xmm2, %xmm3 -# CHECK-NEXT: 4. 3 2.3 0.0 0.3 vmovupd %xmm3, %xmm4 -# CHECK-NEXT: 5. 3 3.0 0.0 0.3 vmovdqa %xmm4, %xmm5 -# CHECK-NEXT: 6. 3 3.3 0.0 0.0 vmovdqu %xmm5, %xmm0 +# CHECK-NEXT: 0. 3 0.0 0.0 0.0 vxorps %xmm0, %xmm0, %xmm0 +# CHECK-NEXT: 1. 3 0.0 0.0 0.0 vmovaps %xmm0, %xmm1 +# CHECK-NEXT: 2. 3 0.0 0.0 0.0 vmovups %xmm1, %xmm2 +# CHECK-NEXT: 3. 3 0.0 0.0 0.0 vmovapd %xmm2, %xmm3 +# CHECK-NEXT: 4. 3 0.0 0.0 0.0 vmovupd %xmm3, %xmm4 +# CHECK-NEXT: 5. 3 0.0 0.0 0.0 vmovdqa %xmm4, %xmm5 +# CHECK-NEXT: 6. 
3 0.0 0.0 0.0 vmovdqu %xmm5, %xmm0 Index: tools/llvm-mca/lib/HardwareUnits/RegisterFile.cpp =================================================================== --- tools/llvm-mca/lib/HardwareUnits/RegisterFile.cpp +++ tools/llvm-mca/lib/HardwareUnits/RegisterFile.cpp @@ -73,7 +73,8 @@ // registers in register file #0 through the command line flag // `-register-file-size`. unsigned RegisterFileIndex = RegisterFiles.size(); - RegisterFiles.emplace_back(RF.NumPhysRegs); + RegisterFiles.emplace_back(RF.NumPhysRegs, RF.MaxMovesEliminatedPerCycle, + RF.AllowZeroMoveEliminationOnly); // Special case where there is no register class identifier in the set. // An empty set of register classes means: this register file contains all @@ -99,6 +100,7 @@ } IPC = std::make_pair(RegisterFileIndex, RCE.Cost); Entry.RenameAs = Reg; + Entry.AllowMoveElimination = RCE.AllowMoveElimination; // Assume the same cost for each sub-register. for (MCSubRegIterator I(Reg, &MRI); I.isValid(); ++I) { Index: tools/llvm-mca/lib/InstrBuilder.cpp =================================================================== --- tools/llvm-mca/lib/InstrBuilder.cpp +++ tools/llvm-mca/lib/InstrBuilder.cpp @@ -463,6 +463,8 @@ bool IsZeroIdiom = MCIA.isZeroIdiom(MCI, Mask, ProcID); bool IsDepBreaking = IsZeroIdiom || MCIA.isDependencyBreaking(MCI, Mask, ProcID); + if (MCIA.isOptimizableRegisterMove(MCI, ProcID)) + NewIS->setOptimizableMove(); // Initialize Reads first. 
for (const ReadDescriptor &RD : D.Reads) { Index: utils/TableGen/CodeGenSchedule.h =================================================================== --- utils/TableGen/CodeGenSchedule.h +++ utils/TableGen/CodeGenSchedule.h @@ -167,8 +167,9 @@ struct CodeGenRegisterCost { Record *RCDef; unsigned Cost; - CodeGenRegisterCost(Record *RC, unsigned RegisterCost) - : RCDef(RC), Cost(RegisterCost) {} + bool AllowMoveElimination; + CodeGenRegisterCost(Record *RC, unsigned RegisterCost, bool AllowMoveElim = false) + : RCDef(RC), Cost(RegisterCost), AllowMoveElimination(AllowMoveElim) {} CodeGenRegisterCost(const CodeGenRegisterCost &) = default; CodeGenRegisterCost &operator=(const CodeGenRegisterCost &) = delete; }; @@ -181,12 +182,18 @@ struct CodeGenRegisterFile { std::string Name; Record *RegisterFileDef; + unsigned MaxMovesEliminatedPerCycle; + bool AllowZeroMoveEliminationOnly; unsigned NumPhysRegs; std::vector Costs; - CodeGenRegisterFile(StringRef name, Record *def) - : Name(name), RegisterFileDef(def), NumPhysRegs(0) {} + CodeGenRegisterFile(StringRef name, Record *def, unsigned MaxMoveElimPerCy = 0, + bool AllowZeroMoveElimOnly = false) + : Name(name), RegisterFileDef(def), + MaxMovesEliminatedPerCycle(MaxMoveElimPerCy), + AllowZeroMoveEliminationOnly(AllowZeroMoveElimOnly), + NumPhysRegs(0) {} bool hasDefaultCosts() const { return Costs.empty(); } }; Index: utils/TableGen/CodeGenSchedule.cpp =================================================================== --- utils/TableGen/CodeGenSchedule.cpp +++ utils/TableGen/CodeGenSchedule.cpp @@ -1759,6 +1759,10 @@ CodeGenProcModel &PM = getProcModel(RF->getValueAsDef("SchedModel")); PM.RegisterFiles.emplace_back(CodeGenRegisterFile(RF->getName(),RF)); CodeGenRegisterFile &CGRF = PM.RegisterFiles.back(); + CGRF.MaxMovesEliminatedPerCycle = + RF->getValueAsInt("MaxMovesEliminatedPerCycle"); + CGRF.AllowZeroMoveEliminationOnly = + RF->getValueAsBit("AllowZeroMoveEliminationOnly"); // Now set the number of physical 
registers as well as the cost of registers // in each register class. @@ -1770,9 +1774,17 @@ RecVec RegisterClasses = RF->getValueAsListOfDefs("RegClasses"); std::vector RegisterCosts = RF->getValueAsListOfInts("RegCosts"); + ListInit *MoveElimInfo = RF->getValueAsListInit("AllowMoveElimination"); for (unsigned I = 0, E = RegisterClasses.size(); I < E; ++I) { int Cost = RegisterCosts.size() > I ? RegisterCosts[I] : 1; - CGRF.Costs.emplace_back(RegisterClasses[I], Cost); + + bool AllowMoveElim = false; + if (MoveElimInfo->size() > I) { + BitInit *Val = cast(MoveElimInfo->getElement(I)); + AllowMoveElim = Val->getValue(); + } + + CGRF.Costs.emplace_back(RegisterClasses[I], Cost, AllowMoveElim); } } } Index: utils/TableGen/SubtargetEmitter.cpp =================================================================== --- utils/TableGen/SubtargetEmitter.cpp +++ utils/TableGen/SubtargetEmitter.cpp @@ -653,7 +653,7 @@ return 0; // Print the RegisterCost table first. - OS << "\n// {RegisterClassID, Register Cost}\n"; + OS << "\n// {RegisterClassID, Register Cost, AllowMoveElimination }\n"; OS << "static const llvm::MCRegisterCostEntry " << ProcModel.ModelName << "RegisterCosts" << "[] = {\n"; @@ -668,24 +668,28 @@ Record *Rec = RC.RCDef; if (Rec->getValue("Namespace")) OS << Rec->getValueAsString("Namespace") << "::"; - OS << Rec->getName() << "RegClassID, " << RC.Cost << "},\n"; + OS << Rec->getName() << "RegClassID, " << RC.Cost << ", " + << RC.AllowMoveElimination << "},\n"; } } OS << "};\n"; // Now generate a table with register file info. 
- OS << "\n // {Name, #PhysRegs, #CostEntries, IndexToCostTbl}\n"; + OS << "\n // {Name, #PhysRegs, #CostEntries, IndexToCostTbl, " + << "MaxMovesEliminatedPerCycle, AllowZeroMoveEliminationOnly }\n"; OS << "static const llvm::MCRegisterFileDesc " << ProcModel.ModelName << "RegisterFiles" << "[] = {\n" - << " { \"InvalidRegisterFile\", 0, 0, 0 },\n"; + << " { \"InvalidRegisterFile\", 0, 0, 0, 0, 0 },\n"; unsigned CostTblIndex = 0; for (const CodeGenRegisterFile &RD : ProcModel.RegisterFiles) { OS << " { "; OS << '"' << RD.Name << '"' << ", " << RD.NumPhysRegs << ", "; unsigned NumCostEntries = RD.Costs.size(); - OS << NumCostEntries << ", " << CostTblIndex << "},\n"; + OS << NumCostEntries << ", " << CostTblIndex << ", " + << RD.MaxMovesEliminatedPerCycle << ", " + << RD.AllowZeroMoveEliminationOnly << "},\n"; CostTblIndex += NumCostEntries; } OS << "};\n";