Index: include/llvm/CodeGen/MachineCombinerPattern.h
===================================================================
--- include/llvm/CodeGen/MachineCombinerPattern.h
+++ include/llvm/CodeGen/MachineCombinerPattern.h
@@ -38,7 +38,10 @@
   MULSUBX_OP1,
   MULSUBX_OP2,
   MULADDXI_OP1,
-  MULSUBXI_OP1
+  MULSUBXI_OP1,
+
+  KRYO_LSL_ADDWS,
+  KRYO_LSL_ADDXS,
 };
 
 } // end namespace llvm
Index: lib/Target/AArch64/AArch64InstrInfo.cpp
===================================================================
--- lib/Target/AArch64/AArch64InstrInfo.cpp
+++ lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -2656,6 +2656,92 @@
   }
 }
 
+static bool isCannonicalizeCandidate(MachineBasicBlock &MBB, MachineOperand &MO,
+                                     unsigned LSLOpc, unsigned &ShiftVal) {
+  MachineInstr *MI = nullptr;
+  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+
+  // We need a virtual register definition.
+  if (MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+    MI = MRI.getUniqueVRegDef(MO.getReg());
+  if (!MI || MI->getParent() != &MBB || (unsigned)MI->getOpcode() != LSLOpc)
+    return false;
+  // The LSL must only be used by the instruction we canonicalize with.
+  if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
+    return false;
+
+  const MachineOperand &Op2 = MI->getOperand(2);
+  const MachineOperand &Op3 = MI->getOperand(3);
+
+  if (!Op2.isImm() || !Op3.isImm())
+    return false;
+
+  int64_t immr = Op2.getImm();
+  int64_t imms = Op3.getImm();
+  if (LSLOpc == AArch64::UBFMWri && imms != 0x1F && (imms + 1) == immr)
+    ShiftVal = 31 - imms;
+  else if (LSLOpc == AArch64::UBFMXri && imms != 0x3F && (imms + 1) == immr)
+    ShiftVal = 63 - imms;
+  else
+    return false;
+
+  return true;
+}
+
+// On Kryo, if a commutative instruction has an LSL feeding both operands and
+// the LSL can be folded into the instruction's shifted-register operand (e.g.,
+// add x0, x1, x2, lsl #3), then we should canonicalize the operands so that
+// the operand with the smaller shift amount is the one that gets folded.
+//
+// For example, rather than
+//
+//   lsl x1, x1, #1
+//   add x0, x1, x2, lsl #4
+//
+// we should prefer
+//
+//   lsl x2, x2, #4
+//   add x0, x2, x1, lsl #1
+//
+// as this saves a cycle on the add instruction.
+static bool
+getCannonicalizePatterns(MachineInstr &Root,
+                         SmallVectorImpl<MachineCombinerPattern> &Patterns) {
+  unsigned Opc = Root.getOpcode();
+  MachineBasicBlock &MBB = *Root.getParent();
+  unsigned LSLOpc;
+  unsigned ShiftVal1, ShiftVal2;
+  MachineCombinerPattern Pattern;
+  switch (Opc) {
+  default:
+    return false;
+  case AArch64::ADDWrs:
+    LSLOpc = AArch64::UBFMWri;
+    Pattern = MachineCombinerPattern::KRYO_LSL_ADDWS;
+    break;
+  case AArch64::ADDXrs:
+    LSLOpc = AArch64::UBFMXri;
+    Pattern = MachineCombinerPattern::KRYO_LSL_ADDXS;
+    break;
+  }
+
+  if (!isCannonicalizeCandidate(MBB, Root.getOperand(1), LSLOpc, ShiftVal1))
+    return false;
+
+  if (!Root.getOperand(3).isImm())
+    return false;
+  unsigned val = Root.getOperand(3).getImm();
+  if (!val || AArch64_AM::getShiftType(val) != AArch64_AM::LSL)
+    return false;
+  ShiftVal2 = AArch64_AM::getShiftValue(val);
+
+  if (ShiftVal1 >= ShiftVal2)
+    return false;
+
+  Patterns.push_back(Pattern);
+  return true;
+}
+
 /// Find instructions that can be turned into madd.
 static bool getMaddPatterns(MachineInstr &Root,
                             SmallVectorImpl<MachineCombinerPattern> &Patterns) {
@@ -2771,12 +2857,94 @@
 bool AArch64InstrInfo::getMachineCombinerPatterns(
     MachineInstr &Root,
     SmallVectorImpl<MachineCombinerPattern> &Patterns) const {
+  if (Subtarget.isKryo() && getCannonicalizePatterns(Root, Patterns))
+    return true;
+
   if (getMaddPatterns(Root, Patterns))
     return true;
 
   return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns);
 }
 
+// Canonicalize from
+//   B = LSL A, #shift1
+//   D = ADD B, C, lsl #shift2
+//
+// to
+//   B = LSL C, #shift2
+//   D = ADD B, A, lsl #shift1
+static void CannonicalizeOperands(MachineFunction &MF, MachineRegisterInfo &MRI,
+                                  const TargetInstrInfo *TII,
+                                  MachineInstr &Root,
+                                  MachineCombinerPattern Pattern,
+                                  SmallVectorImpl<MachineInstr *> &InsInstrs,
+                                  SmallVectorImpl<MachineInstr *> &DelInstrs) {
+  MachineInstr *LSL = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
+  const TargetRegisterClass *RC;
+
+  unsigned ShiftVal1, ShiftVal2;
+  ShiftVal2 = AArch64_AM::getShiftValue(Root.getOperand(3).getImm());
+  unsigned ImmR2;
+  int64_t ImmS1 = LSL->getOperand(3).getImm();
+  switch (Pattern) {
+  case MachineCombinerPattern::KRYO_LSL_ADDWS:
+    ShiftVal1 = 31 - ImmS1;
+    ImmR2 = 32 - ShiftVal2;
+    RC = &AArch64::GPR32RegClass;
+    break;
+  case MachineCombinerPattern::KRYO_LSL_ADDXS:
+    ShiftVal1 = 63 - ImmS1;
+    ImmR2 = 64 - ShiftVal2;
+    RC = &AArch64::GPR64RegClass;
+    break;
+  default:
+    llvm_unreachable("unexpected MachineCombinerPattern");
+  }
+  unsigned ImmS2 = ImmR2 - 1;
+
+  MachineOperand &OpA = LSL->getOperand(1);
+  MachineOperand &OpB = LSL->getOperand(0);
+  MachineOperand &OpC = Root.getOperand(2);
+  MachineOperand &OpD = Root.getOperand(0);
+
+  unsigned RegA = OpA.getReg();
+  unsigned RegB = OpB.getReg();
+  unsigned RegC = OpC.getReg();
+  unsigned RegD = OpD.getReg();
+
+  if (TargetRegisterInfo::isVirtualRegister(RegA))
+    MRI.constrainRegClass(RegA, RC);
+  if (TargetRegisterInfo::isVirtualRegister(RegB))
+    MRI.constrainRegClass(RegB, RC);
+  if (TargetRegisterInfo::isVirtualRegister(RegC))
+    MRI.constrainRegClass(RegC, RC);
+  if (TargetRegisterInfo::isVirtualRegister(RegD))
+    MRI.constrainRegClass(RegD, RC);
+
+  unsigned AddOpc = Root.getOpcode();
+  unsigned LSLOpc = LSL->getOpcode();
+  bool KillA = OpA.isKill();
+  bool KillB = OpB.isKill();
+  bool KillC = OpC.isKill();
+
+  // Create new instructions for insertion.
+  MachineInstrBuilder MIB1 =
+      BuildMI(MF, LSL->getDebugLoc(), TII->get(LSLOpc), RegB)
+          .addReg(RegC, getKillRegState(KillC))
+          .addImm(ImmR2)
+          .addImm(ImmS2);
+  MachineInstrBuilder MIB2 =
+      BuildMI(MF, Root.getDebugLoc(), TII->get(AddOpc), RegD)
+          .addReg(RegB, getKillRegState(KillB))
+          .addReg(RegA, getKillRegState(KillA))
+          .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftVal1));
+
+  InsInstrs.push_back(MIB1);
+  InsInstrs.push_back(MIB2);
+  DelInstrs.push_back(LSL);
+  DelInstrs.push_back(&Root);
+}
+
 /// genMadd - Generate madd instruction and combine mul and add.
 /// Example:
 ///  MUL I=A,B,0
@@ -2894,6 +3062,10 @@
     TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs,
                                                 DelInstrs, InstrIdxForVirtReg);
     return;
+  case MachineCombinerPattern::KRYO_LSL_ADDWS:
+  case MachineCombinerPattern::KRYO_LSL_ADDXS:
+    CannonicalizeOperands(MF, MRI, TII, Root, Pattern, InsInstrs, DelInstrs);
+    return;
   case MachineCombinerPattern::MULADDW_OP1:
   case MachineCombinerPattern::MULADDX_OP1:
     // MUL I=A,B,0
Index: test/CodeGen/AArch64/kryo-lsl-addrs.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AArch64/kryo-lsl-addrs.ll
@@ -0,0 +1,55 @@
+; RUN: llc -mtriple=aarch64-gnu-linux -mcpu=kryo < %s | FileCheck %s
+
+; Verify that the shift amount in the add instruction is always the smaller
+; one.
+
+define i32 @lsl_add1(i32 %a, i32 %b) {
+; CHECK-LABEL: lsl_add1:
+; CHECK: lsl w8, w0, #3
+; CHECK-NEXT: add w0, w8, w1, lsl #2
+; CHECK-NEXT: ret
+entry:
+  %shl = shl i32 %a, 3
+  %shl1 = shl i32 %b, 2
+  %add = add i32 %shl1, %shl
+  ret i32 %add
+}
+
+define i32 @lsl_add2(i32 %a, i32 %b) {
+; CHECK-LABEL: lsl_add2:
+; CHECK: lsl w8, w1, #3
+; CHECK-NEXT: add w0, w8, w0, lsl #2
+; CHECK-NEXT: ret
+
+entry:
+  %shl = shl i32 %a, 2
+  %shl1 = shl i32 %b, 3
+  %add = add i32 %shl1, %shl
+  ret i32 %add
+}
+
+define i64 @lsl_add3(i64 %a, i64 %b) {
+; CHECK-LABEL: lsl_add3:
+; CHECK: lsl x8, x0, #3
+; CHECK-NEXT: add x0, x8, x1, lsl #2
+; CHECK-NEXT: ret
+
+entry:
+  %shl = shl i64 %a, 3
+  %shl1 = shl i64 %b, 2
+  %add = add i64 %shl1, %shl
+  ret i64 %add
+}
+
+define i64 @lsl_add4(i64 %a, i64 %b) {
+; CHECK-LABEL: lsl_add4:
+; CHECK: lsl x8, x1, #3
+; CHECK-NEXT: add x0, x8, x0, lsl #2
+; CHECK-NEXT: ret
+entry:
+  %shl = shl i64 %a, 2
+  %shl1 = shl i64 %b, 3
+  %add = add i64 %shl1, %shl
+  ret i64 %add
+}
+
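
Both the candidate check and the rewrite lean on the AArch64 encoding rule that an immediate LSL is an alias of UBFM with immr = (regsize - shift) % regsize and imms = regsize - 1 - shift, so for a non-zero shift immr == imms + 1 and the shift amount can be recovered as (regsize - 1) - imms. The following standalone sketch (illustrative only, not part of the patch) exercises that arithmetic for both register widths:

// Sketch: check the LSL <-> UBFM immediate relationship assumed by
// isCannonicalizeCandidate and CannonicalizeOperands. For a non-zero shift,
// "lsl #shift" encodes as UBFM with immr = regsize - shift and
// imms = regsize - 1 - shift.
#include <cassert>

int main() {
  const unsigned RegSizes[] = {32, 64};
  for (unsigned RegSize : RegSizes) {
    for (unsigned Shift = 1; Shift < RegSize; ++Shift) {
      unsigned ImmR = (RegSize - Shift) % RegSize; // UBFM rotate amount
      unsigned ImmS = RegSize - 1 - Shift;         // UBFM leftmost source bit
      assert(ImmR == ImmS + 1);            // the (imms + 1) == immr check
      assert(RegSize - 1 - ImmS == Shift); // ShiftVal = 31 - imms / 63 - imms
    }
  }
  return 0;
}

This also explains why imms == 0x1F (or 0x3F) is rejected in isCannonicalizeCandidate: a shift of zero encodes with immr = 0 rather than imms + 1, and folding a zero shift gains nothing.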