diff --git a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
--- a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
+++ b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
@@ -11,10 +11,17 @@
 // 1. MOVi32imm + ANDWrr ==> ANDWri + ANDWri
 //    MOVi64imm + ANDXrr ==> ANDXri + ANDXri
 //
+// 2. MOVi32imm + ADDWrr ==> ADDWri + ADDWri
+//    MOVi64imm + ADDXrr ==> ADDXri + ADDXri
+//
+// 3. MOVi32imm + SUBWrr ==> SUBWri + SUBWri
+//    MOVi64imm + SUBXrr ==> SUBXri + SUBXri
+//
 // The mov pseudo instruction could be expanded to multiple mov instructions
 // later. In this case, we could try to split the constant operand of mov
-// instruction into two bitmask immediates. It makes two AND instructions
-// intead of multiple `mov` + `and` instructions.
+// instruction into two immediates which can be directly encoded into
+// *Wri/*Xri instructions. It makes two AND/ADD/SUB instructions instead of
+// multiple `mov` + `and/add/sub` instructions.
 //===----------------------------------------------------------------------===//
 
 #include "AArch64ExpandImm.h"
@@ -41,6 +48,13 @@
   MachineLoopInfo *MLI;
   MachineRegisterInfo *MRI;
 
+  bool checkMovImmInstr(MachineInstr &MI, MachineInstr *&MovMI,
+                        MachineInstr *&SubregToRegMI);
+
+  template <typename T>
+  bool visitADDSUB(MachineInstr &MI,
+                   SmallSetVector<MachineInstr *, 8> &ToBeRemoved, bool IsAdd);
+
   template <typename T>
   bool visitAND(MachineInstr &MI,
                 SmallSetVector<MachineInstr *, 8> &ToBeRemoved);
@@ -119,31 +133,8 @@
   assert((RegSize == 32 || RegSize == 64) &&
          "Invalid RegSize for AND bitmask peephole optimization");
 
-  // Check whether AND's MBB is in loop and the AND is loop invariant.
-  MachineBasicBlock *MBB = MI.getParent();
-  MachineLoop *L = MLI->getLoopFor(MBB);
-  if (L && !L->isLoopInvariant(MI))
-    return false;
-
-  // Check whether AND's operand is MOV with immediate.
-  MachineInstr *MovMI = MRI->getUniqueVRegDef(MI.getOperand(2).getReg());
-  MachineInstr *SubregToRegMI = nullptr;
-  // If it is SUBREG_TO_REG, check its operand.
-  if (MovMI->getOpcode() == TargetOpcode::SUBREG_TO_REG) {
-    SubregToRegMI = MovMI;
-    MovMI = MRI->getUniqueVRegDef(MovMI->getOperand(2).getReg());
-  }
-
-  if (MovMI->getOpcode() != AArch64::MOVi32imm &&
-      MovMI->getOpcode() != AArch64::MOVi64imm)
-    return false;
-
-  // If the MOV has multiple uses, do not split the immediate because it causes
-  // more instructions.
-  if (!MRI->hasOneUse(MovMI->getOperand(0).getReg()))
-    return false;
-
-  if (SubregToRegMI && !MRI->hasOneUse(SubregToRegMI->getOperand(0).getReg()))
+  MachineInstr *MovMI, *SubregToRegMI;
+  if (!checkMovImmInstr(MI, MovMI, SubregToRegMI))
     return false;
 
   // Split the bitmask immediate into two.
@@ -155,6 +146,7 @@
 
   // Create new AND MIs.
   DebugLoc DL = MI.getDebugLoc();
+  MachineBasicBlock *MBB = MI.getParent();
   const TargetRegisterClass *ANDImmRC =
       (RegSize == 32) ? &AArch64::GPR32spRegClass : &AArch64::GPR64spRegClass;
   Register DstReg = MI.getOperand(0).getReg();
@@ -180,6 +172,135 @@
   return true;
 }
 
+template <typename T>
+static bool splitAddSubImm(T Imm, unsigned RegSize, T &Imm0, T &Imm1) {
+  // The immediate must be in the form of ((imm0 << 12) + imm1), in which both
+  // imm0 and imm1 are non-zero 12-bit unsigned integers.
+  if ((Imm & 0xfff000) == 0 || (Imm & 0xfff) == 0 ||
+      (Imm & ~static_cast<T>(0xffffff)) != 0)
+    return false;
+
+  // The immediate cannot be composed via a single instruction.
+  SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
+  AArch64_IMM::expandMOVImm(Imm, RegSize, Insn);
+  if (Insn.size() == 1)
+    return false;
+
+  // Split Imm into (Imm0 << 12) + Imm1.
+  Imm0 = (Imm >> 12) & 0xfff;
+  Imm1 = Imm & 0xfff;
+  return true;
+}
+
+template <typename T>
+bool AArch64MIPeepholeOpt::visitADDSUB(
+    MachineInstr &MI, SmallSetVector<MachineInstr *, 8> &ToBeRemoved,
+    bool IsAdd) {
+  // Try the following transformation.
+  //
+  // MOVi32imm + ADDWrr ==> ADDWri + ADDWri
+  // MOVi64imm + ADDXrr ==> ADDXri + ADDXri
+  //
+  // MOVi32imm + SUBWrr ==> SUBWri + SUBWri
+  // MOVi64imm + SUBXrr ==> SUBXri + SUBXri
+  //
+  // The mov pseudo instruction could be expanded to multiple mov instructions
+  // later. Let's try to split the constant operand of the mov instruction into
+  // two legal add/sub immediates. It makes only two ADD/SUB instructions
+  // instead of multiple `mov` + `add/sub` instructions.
+
+  unsigned RegSize = sizeof(T) * 8;
+  assert((RegSize == 32 || RegSize == 64) &&
+         "Invalid RegSize for legal add/sub immediate peephole optimization");
+
+  // Check if the immediate satisfies several conditions.
+  MachineInstr *MovMI, *SubregToRegMI;
+  if (!checkMovImmInstr(MI, MovMI, SubregToRegMI))
+    return false;
+
+  // Split the immediate into Imm0 and Imm1, and determine the Opcode.
+  T Imm = static_cast<T>(MovMI->getOperand(1).getImm()), Imm0, Imm1;
+  unsigned Opcode;
+  if (splitAddSubImm(Imm, RegSize, Imm0, Imm1)) {
+    if (IsAdd)
+      Opcode = RegSize == 32 ? AArch64::ADDWri : AArch64::ADDXri;
+    else
+      Opcode = RegSize == 32 ? AArch64::SUBWri : AArch64::SUBXri;
+  } else if (splitAddSubImm(-Imm, RegSize, Imm0, Imm1)) {
+    if (IsAdd)
+      Opcode = RegSize == 32 ? AArch64::SUBWri : AArch64::SUBXri;
+    else
+      Opcode = RegSize == 32 ? AArch64::ADDWri : AArch64::ADDXri;
+  } else {
+    return false;
+  }
+
+  // Create new ADD/SUB MIs.
+  DebugLoc DL = MI.getDebugLoc();
+  MachineBasicBlock *MBB = MI.getParent();
+  const TargetRegisterClass *RC =
+      (RegSize == 32) ? &AArch64::GPR32spRegClass : &AArch64::GPR64spRegClass;
+  Register DstReg = MI.getOperand(0).getReg();
+  Register SrcReg = MI.getOperand(1).getReg();
+  Register TmpReg = MRI->createVirtualRegister(RC);
+
+  MRI->constrainRegClass(SrcReg, RC);
+  BuildMI(*MBB, MI, DL, TII->get(Opcode), TmpReg)
+      .addReg(SrcReg)
+      .addImm(Imm0)
+      .addImm(12);
+
+  MRI->constrainRegClass(DstReg, RC);
+  BuildMI(*MBB, MI, DL, TII->get(Opcode), DstReg)
+      .addReg(TmpReg)
+      .addImm(Imm1)
+      .addImm(0);
+
+  // Record the MIs that need to be removed.
+  ToBeRemoved.insert(&MI);
+  if (SubregToRegMI)
+    ToBeRemoved.insert(SubregToRegMI);
+  ToBeRemoved.insert(MovMI);
+
+  return true;
+}
+
+// Checks if the corresponding MOV immediate instruction is applicable for
+// this peephole optimization.
+bool AArch64MIPeepholeOpt::checkMovImmInstr(MachineInstr &MI,
+                                            MachineInstr *&MovMI,
+                                            MachineInstr *&SubregToRegMI) {
+  // Check whether the current MI is in a loop and is loop invariant.
+  MachineBasicBlock *MBB = MI.getParent();
+  MachineLoop *L = MLI->getLoopFor(MBB);
+  if (L && !L->isLoopInvariant(MI))
+    return false;
+
+  // Check whether the current MI's operand is a MOV with immediate.
+  MovMI = MRI->getUniqueVRegDef(MI.getOperand(2).getReg());
+  SubregToRegMI = nullptr;
+  // If it is SUBREG_TO_REG, check its operand.
+  if (MovMI->getOpcode() == TargetOpcode::SUBREG_TO_REG) {
+    SubregToRegMI = MovMI;
+    MovMI = MRI->getUniqueVRegDef(MovMI->getOperand(2).getReg());
+  }
+
+  if (MovMI->getOpcode() != AArch64::MOVi32imm &&
+      MovMI->getOpcode() != AArch64::MOVi64imm)
+    return false;
+
+  // If the MOV has multiple uses, do not split the immediate because it causes
+  // more instructions.
+  if (!MRI->hasOneUse(MovMI->getOperand(0).getReg()))
+    return false;
+
+  if (SubregToRegMI && !MRI->hasOneUse(SubregToRegMI->getOperand(0).getReg()))
+    return false;
+
+  // It is OK to perform this peephole optimization.
+  return true;
+}
+
 bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) {
   if (skipFunction(MF.getFunction()))
     return false;
@@ -205,6 +326,18 @@
       case AArch64::ANDXrr:
        Changed = visitAND<uint64_t>(MI, ToBeRemoved);
        break;
+      case AArch64::ADDWrr:
+        Changed = visitADDSUB<uint32_t>(MI, ToBeRemoved, true);
+        break;
+      case AArch64::SUBWrr:
+        Changed = visitADDSUB<uint32_t>(MI, ToBeRemoved, false);
+        break;
+      case AArch64::ADDXrr:
+        Changed = visitADDSUB<uint64_t>(MI, ToBeRemoved, true);
+        break;
+      case AArch64::SUBXrr:
+        Changed = visitADDSUB<uint64_t>(MI, ToBeRemoved, false);
+        break;
       }
     }
   }
diff --git a/llvm/test/CodeGen/AArch64/addsub.ll b/llvm/test/CodeGen/AArch64/addsub.ll
--- a/llvm/test/CodeGen/AArch64/addsub.ll
+++ b/llvm/test/CodeGen/AArch64/addsub.ll
@@ -152,9 +152,8 @@
 define i64 @add_two_parts_imm_i64(i64 %a) {
 ; CHECK-LABEL: add_two_parts_imm_i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #42325
-; CHECK-NEXT:    movk w8, #170, lsl #16
-; CHECK-NEXT:    add x0, x0, x8
+; CHECK-NEXT:    add x8, x0, #2730, lsl #12 // =11182080
+; CHECK-NEXT:    add x0, x8, #1365
 ; CHECK-NEXT:    ret
   %b = add i64 %a, 11183445
   ret i64 %b
@@ -163,9 +162,8 @@
 define i32 @add_two_parts_imm_i32(i32 %a) {
 ; CHECK-LABEL: add_two_parts_imm_i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #42325
-; CHECK-NEXT:    movk w8, #170, lsl #16
-; CHECK-NEXT:    add w0, w0, w8
+; CHECK-NEXT:    add w8, w0, #2730, lsl #12 // =11182080
+; CHECK-NEXT:    add w0, w8, #1365
 ; CHECK-NEXT:    ret
   %b = add i32 %a, 11183445
   ret i32 %b
@@ -174,9 +172,8 @@
 define i64 @add_two_parts_imm_i64_neg(i64 %a) {
 ; CHECK-LABEL: add_two_parts_imm_i64_neg:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, #-42325
-; CHECK-NEXT:    movk x8, #65365, lsl #16
-; CHECK-NEXT:    add x0, x0, x8
+; CHECK-NEXT:    sub x8, x0, #2730, lsl #12 // =11182080
+; CHECK-NEXT:    sub x0, x8, #1365
 ; CHECK-NEXT:    ret
   %b = add i64 %a, -11183445
   ret i64 %b
@@ -185,9 +182,8 @@
 define i32 @add_two_parts_imm_i32_neg(i32 %a) {
 ; CHECK-LABEL: add_two_parts_imm_i32_neg:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #23211
-; CHECK-NEXT:    movk w8, #65365, lsl #16
-; CHECK-NEXT:    add w0, w0, w8
+; CHECK-NEXT:    sub w8, w0, #2730, lsl #12 // =11182080
+; CHECK-NEXT:    sub w0, w8, #1365
 ; CHECK-NEXT:    ret
   %b = add i32 %a, -11183445
   ret i32 %b
@@ -196,9 +192,8 @@
 define i64 @sub_two_parts_imm_i64(i64 %a) {
 ; CHECK-LABEL: sub_two_parts_imm_i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, #-42325
-; CHECK-NEXT:    movk x8, #65365, lsl #16
-; CHECK-NEXT:    add x0, x0, x8
+; CHECK-NEXT:    sub x8, x0, #2730, lsl #12 // =11182080
+; CHECK-NEXT:    sub x0, x8, #1365
 ; CHECK-NEXT:    ret
   %b = sub i64 %a, 11183445
   ret i64 %b
@@ -207,9 +202,8 @@
 define i32 @sub_two_parts_imm_i32(i32 %a) {
 ; CHECK-LABEL: sub_two_parts_imm_i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #23211
-; CHECK-NEXT:    movk w8, #65365, lsl #16
-; CHECK-NEXT:    add w0, w0, w8
+; CHECK-NEXT:    sub w8, w0, #2730, lsl #12 // =11182080
+; CHECK-NEXT:    sub w0, w8, #1365
 ; CHECK-NEXT:    ret
   %b = sub i32 %a, 11183445
   ret i32 %b
@@ -218,9 +212,8 @@
 define i64 @sub_two_parts_imm_i64_neg(i64 %a) {
 ; CHECK-LABEL: sub_two_parts_imm_i64_neg:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #42325
-; CHECK-NEXT:    movk w8, #170, lsl #16
-; CHECK-NEXT:    add x0, x0, x8
+; CHECK-NEXT:    add x8, x0, #2730, lsl #12 // =11182080
+; CHECK-NEXT:    add x0, x8, #1365
 ; CHECK-NEXT:    ret
   %b = sub i64 %a, -11183445
   ret i64 %b
@@ -229,9 +222,8 @@
 define i32 @sub_two_parts_imm_i32_neg(i32 %a) {
 ; CHECK-LABEL: sub_two_parts_imm_i32_neg:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #42325
-; CHECK-NEXT:    movk w8, #170, lsl #16
-; CHECK-NEXT:    add w0, w0, w8
+; CHECK-NEXT:    add w8, w0, #2730, lsl #12 // =11182080
+; CHECK-NEXT:    add w0, w8, #1365
 ; CHECK-NEXT:    ret
   %b = sub i32 %a, -11183445
   ret i32 %b
diff --git a/llvm/test/Transforms/CodeGenPrepare/AArch64/large-offset-gep.ll b/llvm/test/Transforms/CodeGenPrepare/AArch64/large-offset-gep.ll
--- a/llvm/test/Transforms/CodeGenPrepare/AArch64/large-offset-gep.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/AArch64/large-offset-gep.ll
@@ -135,8 +135,12 @@
 
 while_cond:
   %phi = phi i32 [ 0, %entry ], [ %i, %while_body ]
-; CHECK:     mov w{{[0-9]+}}, #14464
+; CHECK-NOT: mov w{{[0-9]+}}, #14464
 ; CHECK-NOT: mov w{{[0-9]+}}, #14468
+; CHECK-NOT: movk w{{[0-9]+}}, #1, lsl #16
+; CHECK:     add x{{[0-9]+}}, x{{[0-9]+}}, #19, lsl #12
+; CHECK:     add x{{[0-9]+}}, x{{[0-9]+}}, #2176
+; CHECK-NOT: add x{{[0-9]+}}, x{{[0-9]+}}, #2180
   %gep0 = getelementptr [65536 x i32], [65536 x i32]* %struct, i64 0, i32 20000
   %gep1 = getelementptr [65536 x i32], [65536 x i32]* %struct, i64 0, i32 20001
   %cmp = icmp slt i32 %phi, %n
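
Note (reviewer sketch, not part of the patch): splitAddSubImm accepts exactly the immediates of the form (Imm0 << 12) + Imm1 where both Imm0 and Imm1 are non-zero 12-bit values. If either half were zero, a single shifted or unshifted ADD/SUB immediate would already suffice, and the expandMOVImm check likewise rejects constants a single MOV can materialize. The standalone C++ sketch below re-states that arithmetic outside LLVM (it omits the expandMOVImm check, and the main() harness is purely illustrative); it verifies the split against the constant 11183445 = 0xAAA555 used throughout the updated tests.

#include <cassert>
#include <cstdint>

// Simplified re-statement of the patch's splitAddSubImm. The in-tree helper
// additionally calls AArch64_IMM::expandMOVImm and bails out when the
// immediate is materializable with a single instruction.
template <typename T>
bool splitAddSubImm(T Imm, T &Imm0, T &Imm1) {
  // Reject immediates whose shifted or unshifted 12-bit half is zero, and
  // immediates wider than 24 bits.
  if ((Imm & 0xfff000) == 0 || (Imm & 0xfff) == 0 ||
      (Imm & ~static_cast<T>(0xffffff)) != 0)
    return false;
  Imm0 = (Imm >> 12) & 0xfff; // encoded as "#Imm0, lsl #12"
  Imm1 = Imm & 0xfff;         // encoded as "#Imm1" with no shift
  return true;
}

int main() {
  uint64_t Imm0, Imm1;
  // 11183445 = (2730 << 12) + 1365, matching the updated addsub.ll checks:
  //   add x8, x0, #2730, lsl #12 // =11182080
  //   add x0, x8, #1365
  assert(splitAddSubImm<uint64_t>(11183445, Imm0, Imm1));
  assert(Imm0 == 2730 && Imm1 == 1365);
  assert((Imm0 << 12) + Imm1 == 11183445);
  // 0x1000 has a zero low half, so it is rejected; it is a single ADD
  // immediate anyway.
  assert(!splitAddSubImm<uint64_t>(0x1000, Imm0, Imm1));
  return 0;
}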