Index: llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.h =================================================================== --- llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.h +++ llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.h @@ -367,6 +367,9 @@ insertOutlinedCall(Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It, MachineFunction &MF, unsigned CallClass) const override; + /// Returns true if the instruction has a shift left that can be executed + /// more efficiently. + bool isExynosShiftLeftFast(const MachineInstr &MI) const; /// Returns true if the instruction has a shift by immediate that can be /// executed in one cycle less. bool isFalkorShiftExtFast(const MachineInstr &MI) const; Index: llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.cpp =================================================================== --- llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.cpp +++ llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -673,8 +673,9 @@ bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const { if (!Subtarget.hasCustomCheapAsMoveHandling()) return MI.isAsCheapAsAMove(); - - unsigned Imm; + if (Subtarget.getProcFamily() == AArch64Subtarget::ExynosM1 && + isExynosShiftLeftFast(MI)) + return true; switch (MI.getOpcode()) { default: @@ -685,17 +686,7 @@ case AArch64::ADDXri: case AArch64::SUBWri: case AArch64::SUBXri: - return (Subtarget.getProcFamily() == AArch64Subtarget::ExynosM1 || - MI.getOperand(3).getImm() == 0); - - // add/sub on register with shift - case AArch64::ADDWrs: - case AArch64::ADDXrs: - case AArch64::SUBWrs: - case AArch64::SUBXrs: - Imm = MI.getOperand(3).getImm(); - return (Subtarget.getProcFamily() == AArch64Subtarget::ExynosM1 && - AArch64_AM::getArithShiftValue(Imm) < 4); + return (MI.getOperand(3).getImm() == 0); // logical ops on immediate case AArch64::ANDWri: @@ -721,24 +712,6 @@ case AArch64::ORRXrr: return true; - // logical ops on register with shift - case AArch64::ANDWrs: - case AArch64::ANDXrs: - case
AArch64::BICWrs: - case AArch64::BICXrs: - case AArch64::EONWrs: - case AArch64::EONXrs: - case AArch64::EORWrs: - case AArch64::EORXrs: - case AArch64::ORNWrs: - case AArch64::ORNXrs: - case AArch64::ORRWrs: - case AArch64::ORRXrs: - Imm = MI.getOperand(3).getImm(); - return (Subtarget.getProcFamily() == AArch64Subtarget::ExynosM1 && - AArch64_AM::getShiftValue(Imm) < 4 && - AArch64_AM::getShiftType(Imm) == AArch64_AM::LSL); - // If MOVi32imm or MOVi64imm can be expanded into ORRWri or // ORRXri, it is as cheap as MOV case AArch64::MOVi32imm: @@ -761,6 +734,80 @@ llvm_unreachable("Unknown opcode to check as cheap as a move!"); } +// The immediate add/sub forms always qualify. The shifted-register forms +// qualify for a zero shift amount or LSL by 1-3. The extended-register forms +// qualify for a zero shift amount or UXTX with a shift of 1-3. +bool AArch64InstrInfo::isExynosShiftLeftFast(const MachineInstr &MI) const { + unsigned Imm, Shift; + + switch (MI.getOpcode()) { + default: + return false; + + // WriteI + case AArch64::ADDSWri: + case AArch64::ADDSXri: + case AArch64::ADDWri: + case AArch64::ADDXri: + case AArch64::SUBSWri: + case AArch64::SUBSXri: + case AArch64::SUBWri: + case AArch64::SUBXri: + return true; + + // WriteISReg + case AArch64::ADDSWrs: + case AArch64::ADDSXrs: + case AArch64::ADDWrs: + case AArch64::ADDXrs: + case AArch64::ANDSWrs: + case AArch64::ANDSXrs: + case AArch64::ANDWrs: + case AArch64::ANDXrs: + case AArch64::BICSWrs: + case AArch64::BICSXrs: + case AArch64::BICWrs: + case AArch64::BICXrs: + case AArch64::EONWrs: + case AArch64::EONXrs: + case AArch64::EORWrs: + case AArch64::EORXrs: + case AArch64::ORNWrs: + case AArch64::ORNXrs: + case AArch64::ORRWrs: + case AArch64::ORRXrs: + case AArch64::SUBSWrs: + case AArch64::SUBSXrs: + case AArch64::SUBWrs: + case AArch64::SUBXrs: + Imm = MI.getOperand(3).getImm(); + Shift = AArch64_AM::getShiftValue(Imm); + return (Shift == 0 || + (Shift <= 3 && AArch64_AM::getShiftType(Imm) == AArch64_AM::LSL)); + + // WriteIEReg + case AArch64::ADDSWrx: + case AArch64::ADDSXrx: + case AArch64::ADDSXrx64: + case AArch64::ADDWrx: + case AArch64::ADDXrx: + case AArch64::ADDXrx64: + case AArch64::SUBSWrx: + case
AArch64::SUBSXrx: + case AArch64::SUBSXrx64: + case AArch64::SUBWrx: + case AArch64::SUBXrx: + case AArch64::SUBXrx64: + Imm = MI.getOperand(3).getImm(); + // Imm packs the extend kind above a 3-bit shift amount; getArithExtendType() + // unpacks it, whereas getExtendType() takes only the bare extend field. + Shift = AArch64_AM::getArithShiftValue(Imm); + return (Shift == 0 || + (Shift <= 3 && + AArch64_AM::getArithExtendType(Imm) == AArch64_AM::UXTX)); + } +} + bool AArch64InstrInfo::isFalkorShiftExtFast(const MachineInstr &MI) const { switch (MI.getOpcode()) { default: Index: llvm/trunk/lib/Target/AArch64/AArch64SchedM1.td =================================================================== --- llvm/trunk/lib/Target/AArch64/AArch64SchedM1.td +++ llvm/trunk/lib/Target/AArch64/AArch64SchedM1.td @@ -62,16 +62,25 @@ let SchedModel = ExynosM1Model in { //===----------------------------------------------------------------------===// -// Coarse scheduling model for the Exynos-M1. +// Predicates. + +def M1ShiftLeftFastPred : SchedPredicate<[{TII->isExynosShiftLeftFast(*MI)}]>; + +//===----------------------------------------------------------------------===// +// Coarse scheduling model. def M1WriteA1 : SchedWriteRes<[M1UnitALU]> { let Latency = 1; } def M1WriteA2 : SchedWriteRes<[M1UnitALU]> { let Latency = 2; } +def M1WriteAA : SchedWriteRes<[M1UnitALU]> { let Latency = 2; + let ResourceCycles = [2]; } +def M1WriteAX : SchedWriteVariant<[SchedVar, + SchedVar]>; def M1WriteC1 : SchedWriteRes<[M1UnitC]> { let Latency = 1; } def M1WriteC2 : SchedWriteRes<[M1UnitC]> { let Latency = 2; } -def M1WriteB1 : SchedWriteRes<[M1UnitB]> { let Latency = 1; } +def M1WriteB1 : SchedWriteRes<[M1UnitB]> { let Latency = 1; } -def M1WriteL5 : SchedWriteRes<[M1UnitL]> { let Latency = 5; } +def M1WriteL5 : SchedWriteRes<[M1UnitL]> { let Latency = 5; } def M1WriteLX : SchedWriteVariant<[SchedVar, SchedVar]>; @@ -85,7 +94,6 @@ def M1ReadAdrBase : SchedReadVariant<[SchedVar, SchedVar]>; -def : SchedAlias; // Branch instructions. // NOTE: Unconditional direct branches actually take neither cycles nor units.
@@ -94,7 +102,6 @@ // Arithmetic and logical integer instructions. def : WriteRes { let Latency = 1; } -// TODO: Shift over 3 and some extensions take 2 cycles. def : WriteRes { let Latency = 1; } def : WriteRes { let Latency = 1; } def : WriteRes { let Latency = 1; } @@ -110,7 +117,6 @@ M1UnitD]> { let Latency = 21; let ResourceCycles = [1, 21]; } // TODO: Long multiplication take 5 cycles and also the ALU. -// TODO: Multiplication with accumulation can be advanced. def : WriteRes { let Latency = 3; } // TODO: 64-bit multiplication has a throughput of 1/2. def : WriteRes { let Latency = 4; } @@ -119,8 +125,10 @@ def : WriteRes { let Latency = 2; } +// Addressing modes. // TODO: The latency for the post or pre register is 1 cycle. def : WriteRes { let Latency = 0; } +def : SchedAlias; // Load instructions. def : WriteRes { let Latency = 4; } @@ -164,12 +172,10 @@ // Generic fast forwarding. // TODO: Add FP register forwarding rules. - def : ReadAdvance; def : ReadAdvance; def : ReadAdvance; def : ReadAdvance; -// Integer multiply-accumulate. // TODO: The forwarding for WriteIM64 saves actually 3 cycles. def : ReadAdvance; def : ReadAdvance; @@ -178,7 +184,7 @@ def : ReadAdvance; //===----------------------------------------------------------------------===// -// Finer scheduling model for the Exynos-M1. +// Finer scheduling model. def M1WriteNEONA : SchedWriteRes<[M1UnitNALU, M1UnitNALU, @@ -287,7 +293,6 @@ M1UnitL, M1UnitL]> { let Latency = 14; let ResourceCycles = [7]; } - def M1WriteVSTA : WriteSequence<[WriteVST], 2>; def M1WriteVSTB : WriteSequence<[WriteVST], 3>; def M1WriteVSTC : WriteSequence<[WriteVST], 4>; @@ -340,7 +345,6 @@ // Branch instructions def : InstRW<[M1WriteB1], (instrs Bcc)>; -// NOTE: Conditional branch and link adds a B uop. def : InstRW<[M1WriteA1], (instrs BL)>; // NOTE: Indirect branch and link with LR adds an ALU uop. def : InstRW<[M1WriteA1, @@ -351,6 +355,7 @@ // Arithmetic and logical integer instructions.
def : InstRW<[M1WriteA1], (instrs COPY)>; +def : InstRW<[M1WriteAX], (instregex ".+r[sx](64)?$")>; // Divide and multiply instructions. @@ -413,10 +418,12 @@ def : InstRW<[M1WriteNMISC4], (instregex "^(S|U|SQD|SQRD)ML[AS][HL]v")>; def : InstRW<[M1WriteNMISC4], (instregex "^(S|U|SQD)MULLv")>; def : InstRW<[M1WriteNAL13], (instregex "^(S|SR|U|UR)SRAv")>; -def : InstRW<[M1WriteNALU1], (instregex "^[SU]?SH(L|LL|R)2?v")>; -def : InstRW<[M1WriteNALU1], (instregex "^S[LR]Iv")>; -def : InstRW<[M1WriteNAL13], (instregex "^[SU]?(Q|QR|R)?SHR(N|U|UN)?2?v")>; -def : InstRW<[M1WriteNAL13], (instregex "^[SU](Q|QR|R)SHLU?v")>; +def : InstRW<[M1WriteNALU1], (instregex "^SHL[dv]")>; +def : InstRW<[M1WriteNALU1], (instregex "^[SU]SH[LR][dv]")>; +def : InstRW<[M1WriteNALU1], (instregex "^S[LR]I[dv]")>; +def : InstRW<[M1WriteNAL13], (instregex "^(([SU]Q)?R)?SHRU?N[bhsv]")>; +def : InstRW<[M1WriteNAL13], (instregex "^[SU]RSH[LR][dv]")>; +def : InstRW<[M1WriteNAL13], (instregex "^[SU]QR?SHLU?[bdhsv]")>; // ASIMD FP instructions. def : InstRW<[M1WriteNALU1], (instregex "^F(ABS|NEG)v")>;