diff --git a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
--- a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
+++ b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
@@ -60,12 +60,13 @@
   MachineLoopInfo *MLI;
   MachineRegisterInfo *MRI;
 
+  using OpcodePair = std::pair<unsigned, unsigned>;
   template <typename T>
   using SplitAndOpcFunc =
-      std::function<Optional<unsigned>(T, unsigned, T &, T &)>;
+      std::function<Optional<OpcodePair>(T, unsigned, T &, T &)>;
   using BuildMIFunc =
-      std::function<void(MachineInstr &, unsigned, unsigned, unsigned,
-                         Register, Register, Register)>;
+      std::function<void(MachineInstr &, OpcodePair, unsigned, unsigned,
+                         Register, Register, Register)>;
 
   /// For instructions where an immediate operand could be split into two
   /// separate immediate instructions, use the splitTwoPartImm two handle the
@@ -93,6 +94,10 @@
   bool visitADDSUB(unsigned PosOpc, unsigned NegOpc, MachineInstr &MI,
                    SmallSetVector<MachineInstr *, 8> &ToBeRemoved);
   template <typename T>
+  bool visitADDSSUBS(OpcodePair PosOpcs, OpcodePair NegOpcs, MachineInstr &MI,
+                     SmallSetVector<MachineInstr *, 8> &ToBeRemoved);
+
+  template <typename T>
   bool visitAND(unsigned Opc, MachineInstr &MI,
                 SmallSetVector<MachineInstr *, 8> &ToBeRemoved);
   bool visitORR(MachineInstr &MI,
@@ -171,20 +176,20 @@
   return splitTwoPartImm<T>(
       MI, ToBeRemoved,
-      [Opc](T Imm, unsigned RegSize, T &Imm0, T &Imm1) -> Optional<unsigned> {
+      [Opc](T Imm, unsigned RegSize, T &Imm0, T &Imm1) -> Optional<OpcodePair> {
         if (splitBitmaskImm(Imm, RegSize, Imm0, Imm1))
-          return Opc;
+          return std::make_pair(Opc, Opc);
         return None;
       },
-      [&TII = TII](MachineInstr &MI, unsigned Opcode, unsigned Imm0,
+      [&TII = TII](MachineInstr &MI, OpcodePair Opcode, unsigned Imm0,
                    unsigned Imm1, Register SrcReg, Register NewTmpReg,
                    Register NewDstReg) {
         DebugLoc DL = MI.getDebugLoc();
         MachineBasicBlock *MBB = MI.getParent();
-        BuildMI(*MBB, MI, DL, TII->get(Opcode), NewTmpReg)
+        BuildMI(*MBB, MI, DL, TII->get(Opcode.first), NewTmpReg)
             .addReg(SrcReg)
             .addImm(Imm0);
-        BuildMI(*MBB, MI, DL, TII->get(Opcode), NewDstReg)
+        BuildMI(*MBB, MI, DL, TII->get(Opcode.second), NewDstReg)
             .addReg(NewTmpReg)
             .addImm(Imm1);
       });
@@ -273,23 +278,107 @@
   return splitTwoPartImm<T>(
       MI, ToBeRemoved,
       [PosOpc, NegOpc](T Imm, unsigned RegSize, T &Imm0,
-                       T &Imm1) -> Optional<unsigned> {
+                       T &Imm1) -> Optional<OpcodePair> {
         if (splitAddSubImm(Imm, RegSize, Imm0, Imm1))
-          return PosOpc;
+          return std::make_pair(PosOpc, PosOpc);
         if (splitAddSubImm(-Imm, RegSize, Imm0, Imm1))
-          return NegOpc;
+          return std::make_pair(NegOpc, NegOpc);
         return None;
       },
-      [&TII = TII](MachineInstr &MI, unsigned Opcode, unsigned Imm0,
+      [&TII = TII](MachineInstr &MI, OpcodePair Opcode, unsigned Imm0,
                    unsigned Imm1, Register SrcReg, Register NewTmpReg,
                    Register NewDstReg) {
         DebugLoc DL = MI.getDebugLoc();
         MachineBasicBlock *MBB = MI.getParent();
-        BuildMI(*MBB, MI, DL, TII->get(Opcode), NewTmpReg)
+        BuildMI(*MBB, MI, DL, TII->get(Opcode.first), NewTmpReg)
            .addReg(SrcReg)
            .addImm(Imm0)
            .addImm(12);
-        BuildMI(*MBB, MI, DL, TII->get(Opcode), NewDstReg)
+        BuildMI(*MBB, MI, DL, TII->get(Opcode.second), NewDstReg)
+            .addReg(NewTmpReg)
+            .addImm(Imm1)
+            .addImm(0);
+      });
+}
+
+static bool verifyAddsSubsCCUses(MachineInstr &MI,
+                                 const AArch64RegisterInfo *TRI) {
+  assert(MI.getNumImplicitOperands() == 1 &&
+         "Instruction must only implicitly define NZCV");
+  Register NZCV = MI.implicit_operands().begin()->getReg();
+  for (MachineBasicBlock::iterator Iter = std::next(MI.getIterator()),
+                                   E = MI.getParent()->end();
+       Iter != E; ++Iter) {
+    bool ReadsNZCV = Iter->readsRegister(NZCV, TRI);
+    bool WritesNZCV = Iter->definesRegister(NZCV, TRI);
+    if (!ReadsNZCV && !WritesNZCV)
+      continue;
+
+    // Instruction defines NZCV state without using NZCV so we can stop
+    // scanning
+    if (!ReadsNZCV)
+      break;
+
+    AArch64CC::CondCode CC;
+    switch (Iter->getOpcode()) {
+    default:
+      // Any other instruction that reads NZCV is not handled, so bail out
+      return false;
+    case AArch64::CSINCWr:
+    case AArch64::CSINCXr:
+    case AArch64::CSINVWr:
+    case AArch64::CSINVXr:
+    case AArch64::CSELWr:
+    case AArch64::CSELXr:
+      CC = static_cast<AArch64CC::CondCode>(Iter->getOperand(3).getImm());
+      break;
+    case AArch64::Bcc:
+      CC = static_cast<AArch64CC::CondCode>(Iter->getOperand(0).getImm());
+      break;
+    }
+    // Only valid if Condition Code is Equal or Not Equal
+    if (CC != AArch64CC::EQ && CC != AArch64CC::NE)
+      return false;
+
+    // Instruction defined NZCV so we can stop scanning
+    if (WritesNZCV)
+      break;
+  }
+  return true;
+}
+
+template <typename T>
+bool AArch64MIPeepholeOpt::visitADDSSUBS(
+    OpcodePair PosOpcs, OpcodePair NegOpcs, MachineInstr &MI,
+    SmallSetVector<MachineInstr *, 8> &ToBeRemoved) {
+  // Try the same transformation as ADDSUB but with the additional requirement
+  // that the condition code usages are only for Equal and Not Equal
+  return splitTwoPartImm<T>(
+      MI, ToBeRemoved,
+      [PosOpcs, NegOpcs, &MI, &TRI = TRI](T Imm, unsigned RegSize, T &Imm0,
+                                          T &Imm1) -> Optional<OpcodePair> {
+        OpcodePair OP;
+        if (splitAddSubImm(Imm, RegSize, Imm0, Imm1))
+          OP = PosOpcs;
+        else if (splitAddSubImm(-Imm, RegSize, Imm0, Imm1))
+          OP = NegOpcs;
+        else
+          return None;
+        // Check the condition code uses last, since scanning the succeeding
+        // instructions is expensive
+        if (verifyAddsSubsCCUses(MI, TRI))
+          return OP;
+        return None;
+      },
+      [&TII = TII](MachineInstr &MI, OpcodePair Opcode, unsigned Imm0,
+                   unsigned Imm1, Register SrcReg, Register NewTmpReg,
+                   Register NewDstReg) {
+        DebugLoc DL = MI.getDebugLoc();
+        MachineBasicBlock *MBB = MI.getParent();
+        BuildMI(*MBB, MI, DL, TII->get(Opcode.first), NewTmpReg)
+            .addReg(SrcReg)
+            .addImm(Imm0)
+            .addImm(12);
+        BuildMI(*MBB, MI, DL, TII->get(Opcode.second), NewDstReg)
             .addReg(NewTmpReg)
             .addImm(Imm1)
             .addImm(0);
       });
@@ -357,32 +446,49 @@
   // number since it was sign extended when we assign to the 64-bit Imm.
   if (SubregToRegMI)
     Imm &= 0xFFFFFFFF;
-  unsigned Opcode;
+  OpcodePair Opcode;
   if (auto R = SplitAndOpc(Imm, RegSize, Imm0, Imm1))
     Opcode = R.getValue();
   else
     return false;
 
-  // Create new ADD/SUB MIs.
+  // Create new MIs using the first and second opcodes. Opcodes might differ
+  // for flag-setting operations that should only set flags on the second
+  // instruction.
+  // NewTmpReg = Opcode.first SrcReg Imm0
+  // NewDstReg = Opcode.second NewTmpReg Imm1
+
+  // Determine register classes for destinations and register operands
   MachineFunction *MF = MI.getMF();
-  const TargetRegisterClass *RC =
-      TII->getRegClass(TII->get(Opcode), 0, TRI, *MF);
-  const TargetRegisterClass *ORC =
-      TII->getRegClass(TII->get(Opcode), 1, TRI, *MF);
+  const TargetRegisterClass *FirstInstrDstRC =
+      TII->getRegClass(TII->get(Opcode.first), 0, TRI, *MF);
+  const TargetRegisterClass *FirstInstrOperandRC =
+      TII->getRegClass(TII->get(Opcode.first), 1, TRI, *MF);
+  const TargetRegisterClass *SecondInstrDstRC =
+      (Opcode.first == Opcode.second)
+          ? FirstInstrDstRC
+          : TII->getRegClass(TII->get(Opcode.second), 0, TRI, *MF);
+  const TargetRegisterClass *SecondInstrOperandRC =
+      (Opcode.first == Opcode.second)
+          ? FirstInstrOperandRC
+          : TII->getRegClass(TII->get(Opcode.second), 1, TRI, *MF);
+
+  // Get the old source and destination registers and create the new
+  // destination registers
   Register DstReg = MI.getOperand(0).getReg();
   Register SrcReg = MI.getOperand(1).getReg();
-  Register NewTmpReg = MRI->createVirtualRegister(RC);
-  Register NewDstReg = MRI->createVirtualRegister(RC);
+  Register NewTmpReg = MRI->createVirtualRegister(FirstInstrDstRC);
+  Register NewDstReg = MRI->createVirtualRegister(SecondInstrDstRC);
 
-  MRI->constrainRegClass(SrcReg, RC);
-  MRI->constrainRegClass(NewTmpReg, ORC);
+  // Constrain registers based on their new uses
+  MRI->constrainRegClass(SrcReg, FirstInstrOperandRC);
+  MRI->constrainRegClass(NewTmpReg, SecondInstrOperandRC);
   MRI->constrainRegClass(NewDstReg, MRI->getRegClass(DstReg));
 
+  // Call the delegating operation to build the new instructions
   BuildInstr(MI, Opcode, Imm0, Imm1, SrcReg, NewTmpReg, NewDstReg);
 
-  MRI->replaceRegWith(DstReg, NewDstReg);
   // replaceRegWith changes MI's definition register. Keep it for SSA form until
   // deleting MI.
+  MRI->replaceRegWith(DstReg, NewDstReg);
   MI.getOperand(0).setReg(DstReg);
 
   // Record the MIs need to be removed.
@@ -439,6 +545,26 @@
       Changed = visitADDSUB<uint64_t>(AArch64::SUBXri, AArch64::ADDXri, MI,
                                       ToBeRemoved);
       break;
+    case AArch64::ADDSWrr:
+      Changed = visitADDSSUBS<uint32_t>({AArch64::ADDWri, AArch64::ADDSWri},
+                                        {AArch64::SUBWri, AArch64::SUBSWri},
+                                        MI, ToBeRemoved);
+      break;
+    case AArch64::SUBSWrr:
+      Changed = visitADDSSUBS<uint32_t>({AArch64::SUBWri, AArch64::SUBSWri},
+                                        {AArch64::ADDWri, AArch64::ADDSWri},
+                                        MI, ToBeRemoved);
+      break;
+    case AArch64::ADDSXrr:
+      Changed = visitADDSSUBS<uint64_t>({AArch64::ADDXri, AArch64::ADDSXri},
+                                        {AArch64::SUBXri, AArch64::SUBSXri},
+                                        MI, ToBeRemoved);
+      break;
+    case AArch64::SUBSXrr:
+      Changed = visitADDSSUBS<uint64_t>({AArch64::SUBXri, AArch64::SUBSXri},
+                                        {AArch64::ADDXri, AArch64::ADDSXri},
+                                        MI, ToBeRemoved);
+      break;
     }
   }
 }
diff --git a/llvm/test/CodeGen/AArch64/addsub.ll b/llvm/test/CodeGen/AArch64/addsub.ll
--- a/llvm/test/CodeGen/AArch64/addsub.ll
+++ b/llvm/test/CodeGen/AArch64/addsub.ll
@@ -406,4 +406,216 @@
   ret i64 %b
 }
 
-; TODO: adds/subs
+; ADDS and SUBS Optimizations
+; Checks with all types first, then checks that only EQ and NE optimize
+define i1 @eq_i(i32 %0) {
+; CHECK-LABEL: eq_i:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub w8, w0, #273, lsl #12 // =1118208
+; CHECK-NEXT:    cmp w8, #273
+; CHECK-NEXT:    cset w0, eq
+; CHECK-NEXT:    ret
+  %2 = icmp eq i32 %0, 1118481
+  ret i1 %2
+}
+
+define i1 @eq_l(i64 %0) {
+; CHECK-LABEL: eq_l:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub x8, x0, #273, lsl #12 // =1118208
+; CHECK-NEXT:    cmp x8, #273
+; CHECK-NEXT:    cset w0, eq
+; CHECK-NEXT:    ret
+  %2 = icmp eq i64 %0, 1118481
+  ret i1 %2
+}
+
+define i1 @ne_i(i32 %0) {
+; CHECK-LABEL: ne_i:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub w8, w0, #273, lsl #12 // =1118208
+; CHECK-NEXT:    cmp w8, #273
+; CHECK-NEXT:    cset w0, ne
+; CHECK-NEXT:    ret
+  %2 = icmp ne i32 %0, 1118481
+  ret i1 %2
+}
+
+define i1 @ne_l(i64 %0) {
+; CHECK-LABEL: ne_l:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub x8, x0, #273, lsl #12 // =1118208
+; CHECK-NEXT:    cmp x8, #273
+; CHECK-NEXT:    cset w0, ne
+; CHECK-NEXT:    ret
+  %2 = icmp ne i64 %0, 1118481
+  ret i1 %2
+}
+
+define i1 @eq_in(i32 %0) {
+; CHECK-LABEL: eq_in:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    add w8, w0, #273, lsl #12 // =1118208
+; CHECK-NEXT:    cmn w8, #273
+; CHECK-NEXT:    cset w0, eq
+; CHECK-NEXT:    ret
+  %2 = icmp eq i32 %0, -1118481
+  ret i1 %2
+}
+
+define i1 @eq_ln(i64 %0) {
+; CHECK-LABEL: eq_ln:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    add x8, x0, #273, lsl #12 // =1118208
+; CHECK-NEXT:    cmn x8, #273
+; CHECK-NEXT:    cset w0, eq
+; CHECK-NEXT:    ret
+  %2 = icmp eq i64 %0, -1118481
+  ret i1 %2
+}
+
+define i1 @ne_in(i32 %0) {
+; CHECK-LABEL: ne_in:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    add w8, w0, #273, lsl #12 // =1118208
+; CHECK-NEXT:    cmn w8, #273
+; CHECK-NEXT:    cset w0, ne
+; CHECK-NEXT:    ret
+  %2 = icmp ne i32 %0, -1118481
+  ret i1 %2
+}
+
+define i1 @ne_ln(i64 %0) {
+; CHECK-LABEL: ne_ln:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    add x8, x0, #273, lsl #12 // =1118208
+; CHECK-NEXT:    cmn x8, #273
+; CHECK-NEXT:    cset w0, ne
+; CHECK-NEXT:    ret
+  %2 = icmp ne i64 %0, -1118481
+  ret i1 %2
+}
+
+define i1 @reject_eq(i32 %0) {
+; CHECK-LABEL: reject_eq:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #51712
+; CHECK-NEXT:    movk w8, #15258, lsl #16
+; CHECK-NEXT:    cmp w0, w8
+; CHECK-NEXT:    cset w0, eq
+; CHECK-NEXT:    ret
+  %2 = icmp eq i32 %0, 1000000000
+  ret i1 %2
+}
+
+define i1 @reject_non_eqne_csinc(i32 %0) {
+; CHECK-LABEL: reject_non_eqne_csinc:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #4369
+; CHECK-NEXT:    movk w8, #17, lsl #16
+; CHECK-NEXT:    cmp w0, w8
+; CHECK-NEXT:    cset w0, lo
+; CHECK-NEXT:    ret
+  %2 = icmp ult i32 %0, 1118481
+  ret i1 %2
+}
+
+define i32 @accept_csel(i32 %0) {
+; CHECK-LABEL: accept_csel:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub w9, w0, #273, lsl #12 // =1118208
+; CHECK-NEXT:    mov w8, #17
+; CHECK-NEXT:    cmp w9, #273
+; CHECK-NEXT:    mov w9, #11
+; CHECK-NEXT:    csel w0, w9, w8, eq
+; CHECK-NEXT:    ret
+  %2 = icmp eq i32 %0, 1118481
+  %3 = select i1 %2, i32 11, i32 17
+  ret i32 %3
+}
+
+define i32 @reject_non_eqne_csel(i32 %0) {
+; CHECK-LABEL: reject_non_eqne_csel:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #4369
+; CHECK-NEXT:    mov w9, #11
+; CHECK-NEXT:    movk w8, #17, lsl #16
+; CHECK-NEXT:    cmp w0, w8
+; CHECK-NEXT:    mov w8, #17
+; CHECK-NEXT:    csel w0, w9, w8, lo
+; CHECK-NEXT:    ret
+  %2 = icmp ult i32 %0, 1118481
+  %3 = select i1 %2, i32 11, i32 17
+  ret i32 %3
+}
+
+declare void @fooy()
+
+define void @accept_branch(i32 %0) {
+; CHECK-LABEL: accept_branch:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub w8, w0, #291, lsl #12 // =1191936
+; CHECK-NEXT:    cmp w8, #1110
+; CHECK-NEXT:    b.eq .LBB32_2
+; CHECK-NEXT:  // %bb.1:
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB32_2:
+; CHECK-NEXT:    b fooy
+  %2 = icmp ne i32 %0, 1193046
+  br i1 %2, label %4, label %3
+3:                                                ; preds = %1
+  tail call void @fooy()
+  br label %4
+4:                                                ; preds = %3, %1
+  ret void
+}
+
+define void @reject_non_eqne_branch(i32 %0) {
+; CHECK-LABEL: reject_non_eqne_branch:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #13398
+; CHECK-NEXT:    movk w8, #18, lsl #16
+; CHECK-NEXT:    cmp w0, w8
+; CHECK-NEXT:    b.le .LBB33_2
+; CHECK-NEXT:  // %bb.1:
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB33_2:
+; CHECK-NEXT:    b fooy
+  %2 = icmp sgt i32 %0, 1193046
+  br i1 %2, label %4, label %3
+3:                                                ; preds = %1
+  tail call void @fooy()
+  br label %4
+4:                                                ; preds = %3, %1
+  ret void
+}
+
+define i32 @reject_multiple_usages(i32 %0) {
+; CHECK-LABEL: reject_multiple_usages:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #4369
+; CHECK-NEXT:    mov w9, #3
+; CHECK-NEXT:    movk w8, #17, lsl #16
+; CHECK-NEXT:    mov w10, #17
+; CHECK-NEXT:    cmp w0, w8
+; CHECK-NEXT:    mov w8, #9
+; CHECK-NEXT:    mov w11, #12
+; CHECK-NEXT:    csel w8, w8, w9, eq
+; CHECK-NEXT:    csel w9, w11, w10, hi
+; CHECK-NEXT:    add w8, w8, w9
+; CHECK-NEXT:    mov w9, #53312
+; CHECK-NEXT:    movk w9, #2, lsl #16
+; CHECK-NEXT:    cmp w0, w9
+; CHECK-NEXT:    mov w9, #26304
+; CHECK-NEXT:    movk w9, #1433, lsl #16
+; CHECK-NEXT:    csel w0, w8, w9, hi
+; CHECK-NEXT:    ret
+  %2 = icmp eq i32 %0, 1118481
+  %3 = icmp ugt i32 %0, 1118481
+  %4 = select i1 %2, i32 9, i32 3
+  %5 = select i1 %3, i32 12, i32 17
+  %6 = add i32 %4, %5
+  %7 = icmp ugt i32 %0, 184384
+  %8 = select i1 %7, i32 %6, i32 93939392
+  ret i32 %8
+}
diff --git a/llvm/test/CodeGen/AArch64/arm64-instruction-mix-remarks.ll b/llvm/test/CodeGen/AArch64/arm64-instruction-mix-remarks.ll
--- a/llvm/test/CodeGen/AArch64/arm64-instruction-mix-remarks.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-instruction-mix-remarks.ll
@@ -12,8 +12,8 @@
 ; YAML:     - INST_add:  '2'
 ; YAML:     - INST_b.:   '1'
 ; YAML:     - INST_ldr:  '1'
-; YAML:     - INST_movk: '1'
-; YAML:     - INST_movz: '1'
+; YAML:     - INST_orr:  '1'
+; YAML:     - INST_sub:  '1'
 ; YAML:     - INST_subs: '1'
 ; YAML:   Name:          InstructionMix
@@ -27,13 +27,12 @@
 define i32 @foo(i32* %ptr, i32 %x, i64 %y) !dbg !3 {
 ; CHECK-LABEL: foo:
 ; CHECK:       ; %bb.0: ; %entry
-; CHECK-NEXT:    ldr w10, [x0]
+; CHECK-NEXT:    ldr w9, [x0]
 ; CHECK-NEXT:    mov x8, x0
-; CHECK-NEXT:    mov w9, #16959
-; CHECK-NEXT:    movk w9, #15, lsl #16
-; CHECK-NEXT:    add w0, w10, w1
-; CHECK-NEXT:    add x10, x0, x2
-; CHECK-NEXT:    cmp x10, x9
+; CHECK-NEXT:    add w0, w9, w1
+; CHECK-NEXT:    add x9, x0, x2
+; CHECK-NEXT:    sub x9, x9, #244, lsl #12 ; =999424
+; CHECK-NEXT:    cmp x9, #575
 ; CHECK-NEXT:    b.eq LBB0_2
 ; CHECK-NEXT:  ; %bb.1: ; %else
 ; CHECK-NEXT:    mul w9, w0, w1
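
For reference, the tests above exercise splitting one compare immediate into two 12-bit pieces, one of them shifted left by 12. The sketch below is illustrative only: the helper name splitIntoTwoAddSubImms and the main() driver are assumptions made for this example (the pass itself uses splitAddSubImm), but the arithmetic shows why the CHECK lines expect "sub w8, w0, #273, lsl #12" followed by "cmp w8, #273" for the constant 1118481 (0x111111).

// Illustrative sketch (not part of the patch): split a value into a shifted
// and an unshifted 12-bit add/sub immediate.
#include <cassert>
#include <cstdint>

static bool splitIntoTwoAddSubImms(uint64_t Imm, uint64_t &HiPart,
                                   uint64_t &LoPart) {
  // Only values that fit in 24 bits and need both pieces qualify.
  if (Imm == 0 || (Imm & ~0xffffffULL))
    return false;
  HiPart = Imm & 0xfff000; // emitted as #(HiPart >> 12), lsl #12
  LoPart = Imm & 0xfff;    // emitted as #LoPart
  return HiPart != 0 && LoPart != 0;
}

int main() {
  uint64_t Hi = 0, Lo = 0;
  // 1118481 == 0x111111 -> 1118208 (0x111 << 12) plus 273 (0x111).
  assert(splitIntoTwoAddSubImms(1118481, Hi, Lo) && Hi == 1118208 && Lo == 273);
  return 0;
}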