diff --git a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp --- a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp +++ b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp @@ -11,12 +11,19 @@ // 1. MOVi32imm + ANDWrr ==> ANDWri + ANDWri // MOVi64imm + ANDXrr ==> ANDXri + ANDXri // +// 2. MOVi32imm + ADDWrr ==> ADDWri + ADDWri +// MOVi64imm + ADDXrr ==> ADDXri + ADDXri +// +// 3. MOVi32imm + SUBWrr ==> SUBWri + SUBWri +// MOVi64imm + SUBXrr ==> SUBXri + SUBXri +// // The mov pseudo instruction could be expanded to multiple mov instructions // later. In this case, we could try to split the constant operand of mov -// instruction into two bitmask immediates. It makes two AND instructions -// intead of multiple `mov` + `and` instructions. +// instruction into two immediates which can be directly encoded into +// *Wri/*Xri instructions. It makes two AND/ADD/SUB instructions instead of +// multiple `mov` + `and/add/sub` instructions. // -// 2. Remove redundant ORRWrs which is generated by zero-extend. +// 4. Remove redundant ORRWrs which is generated by zero-extend. // // %3:gpr32 = ORRWrs $wzr, %2, 0 // %4:gpr64 = SUBREG_TO_REG 0, %3, %subreg.sub_32 @@ -51,6 +58,12 @@ MachineLoopInfo *MLI; MachineRegisterInfo *MRI; + bool checkMovImmInstr(MachineInstr &MI, MachineInstr *&MovMI, + MachineInstr *&SubregToRegMI); + + template + bool visitADDSUB(MachineInstr &MI, + SmallSetVector &ToBeRemoved, bool IsAdd); template bool visitAND(MachineInstr &MI, SmallSetVector &ToBeRemoved); @@ -131,36 +144,9 @@ assert((RegSize == 32 || RegSize == 64) && "Invalid RegSize for AND bitmask peephole optimization"); - // Check whether AND's MBB is in loop and the AND is loop invariant. - MachineBasicBlock *MBB = MI.getParent(); - MachineLoop *L = MLI->getLoopFor(MBB); - if (L && !L->isLoopInvariant(MI)) - return false; - - // Check whether AND's operand is MOV with immediate. 
- MachineInstr *MovMI = MRI->getUniqueVRegDef(MI.getOperand(2).getReg()); - if (!MovMI) - return false; - - MachineInstr *SubregToRegMI = nullptr; - // If it is SUBREG_TO_REG, check its operand. - if (MovMI->getOpcode() == TargetOpcode::SUBREG_TO_REG) { - SubregToRegMI = MovMI; - MovMI = MRI->getUniqueVRegDef(MovMI->getOperand(2).getReg()); - if (!MovMI) - return false; - } - - if (MovMI->getOpcode() != AArch64::MOVi32imm && - MovMI->getOpcode() != AArch64::MOVi64imm) - return false; - - // If the MOV has multiple uses, do not split the immediate because it causes - // more instructions. - if (!MRI->hasOneUse(MovMI->getOperand(0).getReg())) - return false; - - if (SubregToRegMI && !MRI->hasOneUse(SubregToRegMI->getOperand(0).getReg())) + // Perform several essential checks against current MI. + MachineInstr *MovMI = nullptr, *SubregToRegMI = nullptr; + if (!checkMovImmInstr(MI, MovMI, SubregToRegMI)) return false; // Split the bitmask immediate into two. @@ -177,6 +163,7 @@ // Create new AND MIs. DebugLoc DL = MI.getDebugLoc(); + MachineBasicBlock *MBB = MI.getParent(); const TargetRegisterClass *ANDImmRC = (RegSize == 32) ? &AArch64::GPR32spRegClass : &AArch64::GPR64spRegClass; Register DstReg = MI.getOperand(0).getReg(); @@ -251,6 +238,145 @@ return true; } +template +static bool splitAddSubImm(T Imm, unsigned RegSize, T &Imm0, T &Imm1) { + // The immediate must be in the form of ((imm0 << 12) + imm1), in which both + // imm0 and imm1 are non-zero 12-bit unsigned int. + if ((Imm & 0xfff000) == 0 || (Imm & 0xfff) == 0 || + (Imm & ~static_cast(0xffffff)) != 0) + return false; + + // The immediate can not be composed via a single instruction. 
+ SmallVector Insn; + AArch64_IMM::expandMOVImm(Imm, RegSize, Insn); + if (Insn.size() == 1) + return false; + + // Split Imm into (Imm0 << 12) + Imm1; + Imm0 = (Imm >> 12) & 0xfff; + Imm1 = Imm & 0xfff; + return true; +} + +template +bool AArch64MIPeepholeOpt::visitADDSUB( + MachineInstr &MI, SmallSetVector &ToBeRemoved, + bool IsAdd) { + // Try below transformation. + // + // MOVi32imm + ADDWrr ==> ADDWri + ADDWri + // MOVi64imm + ADDXrr ==> ADDXri + ADDXri + // + // MOVi32imm + SUBWrr ==> SUBWri + SUBWri + // MOVi64imm + SUBXrr ==> SUBXri + SUBXri + // + // The mov pseudo instruction could be expanded to multiple mov instructions + // later. Let's try to split the constant operand of mov instruction into two + // legal add/sub immediates. It makes only two ADD/SUB instructions instead of + // multiple `mov` + `add/sub` instructions. + + unsigned RegSize = sizeof(T) * 8; + assert((RegSize == 32 || RegSize == 64) && + "Invalid RegSize for legal add/sub immediate peephole optimization"); + + // Perform several essential checks against current MI. + MachineInstr *MovMI, *SubregToRegMI; + if (!checkMovImmInstr(MI, MovMI, SubregToRegMI)) + return false; + + // Split the immediate to Imm0 and Imm1, and calculate the Opcode. + T Imm = static_cast(MovMI->getOperand(1).getImm()), Imm0, Imm1; + unsigned Opcode; + if (splitAddSubImm(Imm, RegSize, Imm0, Imm1)) { + if (IsAdd) + Opcode = RegSize == 32 ? AArch64::ADDWri : AArch64::ADDXri; + else + Opcode = RegSize == 32 ? AArch64::SUBWri : AArch64::SUBXri; + } else if (splitAddSubImm(-Imm, RegSize, Imm0, Imm1)) { + if (IsAdd) + Opcode = RegSize == 32 ? AArch64::SUBWri : AArch64::SUBXri; + else + Opcode = RegSize == 32 ? AArch64::ADDWri : AArch64::ADDXri; + } else { + return false; + } + + // Create new ADD/SUB MIs. + DebugLoc DL = MI.getDebugLoc(); + MachineBasicBlock *MBB = MI.getParent(); + const TargetRegisterClass *RC = (RegSize == 32) + ? 
+ &AArch64::GPR32commonRegClass + : &AArch64::GPR64commonRegClass; + Register DstReg = MI.getOperand(0).getReg(); + Register SrcReg = MI.getOperand(1).getReg(); + Register NewTmpReg = MRI->createVirtualRegister(RC); + Register NewDstReg = MRI->createVirtualRegister(RC); + + MRI->constrainRegClass(SrcReg, RC); + BuildMI(*MBB, MI, DL, TII->get(Opcode), NewTmpReg) + .addReg(SrcReg) + .addImm(Imm0) + .addImm(12); + + BuildMI(*MBB, MI, DL, TII->get(Opcode), NewDstReg) + .addReg(NewTmpReg) + .addImm(Imm1) + .addImm(0); + + MRI->replaceRegWith(DstReg, NewDstReg); + // replaceRegWith changes MI's definition register. Keep it for SSA form until + // deleting MI. + MI.getOperand(0).setReg(DstReg); + + // Record the MIs that need to be removed. + ToBeRemoved.insert(&MI); + if (SubregToRegMI) + ToBeRemoved.insert(SubregToRegMI); + ToBeRemoved.insert(MovMI); + + return true; +} + +// Checks if the corresponding MOV immediate instruction is applicable for +// this peephole optimization. +bool AArch64MIPeepholeOpt::checkMovImmInstr(MachineInstr &MI, + MachineInstr *&MovMI, + MachineInstr *&SubregToRegMI) { + // Check whether current MBB is in loop and the current MI is loop invariant. + MachineBasicBlock *MBB = MI.getParent(); + MachineLoop *L = MLI->getLoopFor(MBB); + if (L && !L->isLoopInvariant(MI)) + return false; + + // Check whether current MI's operand is MOV with immediate. + MovMI = MRI->getUniqueVRegDef(MI.getOperand(2).getReg()); + if (!MovMI) + return false; + + // If it is SUBREG_TO_REG, check its operand. + SubregToRegMI = nullptr; + if (MovMI->getOpcode() == TargetOpcode::SUBREG_TO_REG) { + SubregToRegMI = MovMI; + MovMI = MRI->getUniqueVRegDef(MovMI->getOperand(2).getReg()); + if (!MovMI) + return false; + } + + if (MovMI->getOpcode() != AArch64::MOVi32imm && + MovMI->getOpcode() != AArch64::MOVi64imm) + return false; + + // If the MOV has multiple uses, do not split the immediate because it causes + // more instructions. 
+ if (!MRI->hasOneUse(MovMI->getOperand(0).getReg())) + return false; + if (SubregToRegMI && !MRI->hasOneUse(SubregToRegMI->getOperand(0).getReg())) + return false; + + // It is OK to perform this peephole optimization. + return true; +} + bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) { if (skipFunction(MF.getFunction())) return false; @@ -278,6 +404,18 @@ case AArch64::ORRWrs: Changed = visitORR(MI, ToBeRemoved); break; + case AArch64::ADDWrr: + Changed = visitADDSUB(MI, ToBeRemoved, true); + break; + case AArch64::SUBWrr: + Changed = visitADDSUB(MI, ToBeRemoved, false); + break; + case AArch64::ADDXrr: + Changed = visitADDSUB(MI, ToBeRemoved, true); + break; + case AArch64::SUBXrr: + Changed = visitADDSUB(MI, ToBeRemoved, false); + break; } } } diff --git a/llvm/test/CodeGen/AArch64/addsub-24bit-imm.mir b/llvm/test/CodeGen/AArch64/addsub-24bit-imm.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/addsub-24bit-imm.mir @@ -0,0 +1,63 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -run-pass=aarch64-mi-peephole-opt -o - -mtriple=aarch64-unknown-linux -verify-machineinstrs %s | FileCheck %s + +# Main intention is to verify machine instructions have valid register classes. +# Use of UBFM[W|X]ri is used as an arbitrary instruction that requires GPR[32|64]RegClass. +# If the ADD/SUB optimization generates invalid register classes, this test will fail. 
+--- +name: addi +body: | + bb.0.entry: + liveins: $w0 + ; CHECK-LABEL: name: addi + ; CHECK: [[COPY:%[0-9]+]]:gpr32common = COPY $w0 + ; CHECK-NEXT: [[ADDWri:%[0-9]+]]:gpr32common = ADDWri [[COPY]], 273, 12 + ; CHECK-NEXT: [[ADDWri1:%[0-9]+]]:gpr32common = ADDWri [[ADDWri]], 3549, 0 + ; CHECK-NEXT: [[UBFMWri:%[0-9]+]]:gpr32 = UBFMWri [[ADDWri1]], 28, 31 + ; CHECK-NEXT: $w0 = COPY [[UBFMWri]] + ; CHECK-NEXT: RET_ReallyLR implicit $w0 + %0:gpr32 = COPY $w0 + %1:gpr32 = MOVi32imm 1121757 + %2:gpr32 = ADDWrr %0, %1 + %3:gpr32 = UBFMWri %2, 28, 31 + $w0 = COPY %3 + RET_ReallyLR implicit $w0 +... +--- +name: addl +body: | + bb.0.entry: + liveins: $x0 + ; CHECK-LABEL: name: addl + ; CHECK: [[COPY:%[0-9]+]]:gpr64common = COPY $x0 + ; CHECK-NEXT: [[ADDXri:%[0-9]+]]:gpr64common = ADDXri [[COPY]], 273, 12 + ; CHECK-NEXT: [[ADDXri1:%[0-9]+]]:gpr64common = ADDXri [[ADDXri]], 3549, 0 + ; CHECK-NEXT: [[UBFMXri:%[0-9]+]]:gpr64 = UBFMXri [[ADDXri1]], 28, 31 + ; CHECK-NEXT: $x0 = COPY [[UBFMXri]] + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %0:gpr64 = COPY $x0 + %1:gpr32 = MOVi32imm 1121757 + %2:gpr64 = SUBREG_TO_REG 0, %1, %subreg.sub_32 + %3:gpr64 = ADDXrr %0, killed %2 + %4:gpr64 = UBFMXri %3, 28, 31 + $x0 = COPY %4 + RET_ReallyLR implicit $x0 +... 
+--- +name: addl_negate +body: | + bb.0.entry: + liveins: $x0 + ; CHECK-LABEL: name: addl_negate + ; CHECK: [[COPY:%[0-9]+]]:gpr64common = COPY $x0 + ; CHECK-NEXT: [[SUBXri:%[0-9]+]]:gpr64common = SUBXri [[COPY]], 273, 12 + ; CHECK-NEXT: [[SUBXri1:%[0-9]+]]:gpr64common = SUBXri [[SUBXri]], 3549, 0 + ; CHECK-NEXT: [[UBFMXri:%[0-9]+]]:gpr64 = UBFMXri [[SUBXri1]], 28, 31 + ; CHECK-NEXT: $x0 = COPY [[UBFMXri]] + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %0:gpr64 = COPY $x0 + %1:gpr64 = MOVi64imm -1121757 + %2:gpr64 = ADDXrr %0, killed %1 + %3:gpr64 = UBFMXri %2, 28, 31 + $x0 = COPY %3 + RET_ReallyLR implicit $x0 diff --git a/llvm/test/CodeGen/AArch64/addsub.ll b/llvm/test/CodeGen/AArch64/addsub.ll --- a/llvm/test/CodeGen/AArch64/addsub.ll +++ b/llvm/test/CodeGen/AArch64/addsub.ll @@ -152,9 +152,8 @@ define i64 @add_two_parts_imm_i64(i64 %a) { ; CHECK-LABEL: add_two_parts_imm_i64: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #42325 -; CHECK-NEXT: movk w8, #170, lsl #16 -; CHECK-NEXT: add x0, x0, x8 +; CHECK-NEXT: add x8, x0, #2730, lsl #12 // =11182080 +; CHECK-NEXT: add x0, x8, #1365 ; CHECK-NEXT: ret %b = add i64 %a, 11183445 ret i64 %b @@ -163,9 +162,8 @@ define i32 @add_two_parts_imm_i32(i32 %a) { ; CHECK-LABEL: add_two_parts_imm_i32: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #42325 -; CHECK-NEXT: movk w8, #170, lsl #16 -; CHECK-NEXT: add w0, w0, w8 +; CHECK-NEXT: add w8, w0, #2730, lsl #12 // =11182080 +; CHECK-NEXT: add w0, w8, #1365 ; CHECK-NEXT: ret %b = add i32 %a, 11183445 ret i32 %b @@ -174,9 +172,8 @@ define i64 @add_two_parts_imm_i64_neg(i64 %a) { ; CHECK-LABEL: add_two_parts_imm_i64_neg: ; CHECK: // %bb.0: -; CHECK-NEXT: mov x8, #-42325 -; CHECK-NEXT: movk x8, #65365, lsl #16 -; CHECK-NEXT: add x0, x0, x8 +; CHECK-NEXT: sub x8, x0, #2730, lsl #12 // =11182080 +; CHECK-NEXT: sub x0, x8, #1365 ; CHECK-NEXT: ret %b = add i64 %a, -11183445 ret i64 %b @@ -185,9 +182,8 @@ define i32 @add_two_parts_imm_i32_neg(i32 %a) { ; CHECK-LABEL: add_two_parts_imm_i32_neg: ; 
CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #23211 -; CHECK-NEXT: movk w8, #65365, lsl #16 -; CHECK-NEXT: add w0, w0, w8 +; CHECK-NEXT: sub w8, w0, #2730, lsl #12 // =11182080 +; CHECK-NEXT: sub w0, w8, #1365 ; CHECK-NEXT: ret %b = add i32 %a, -11183445 ret i32 %b @@ -196,9 +192,8 @@ define i64 @sub_two_parts_imm_i64(i64 %a) { ; CHECK-LABEL: sub_two_parts_imm_i64: ; CHECK: // %bb.0: -; CHECK-NEXT: mov x8, #-42325 -; CHECK-NEXT: movk x8, #65365, lsl #16 -; CHECK-NEXT: add x0, x0, x8 +; CHECK-NEXT: sub x8, x0, #2730, lsl #12 // =11182080 +; CHECK-NEXT: sub x0, x8, #1365 ; CHECK-NEXT: ret %b = sub i64 %a, 11183445 ret i64 %b @@ -207,9 +202,8 @@ define i32 @sub_two_parts_imm_i32(i32 %a) { ; CHECK-LABEL: sub_two_parts_imm_i32: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #23211 -; CHECK-NEXT: movk w8, #65365, lsl #16 -; CHECK-NEXT: add w0, w0, w8 +; CHECK-NEXT: sub w8, w0, #2730, lsl #12 // =11182080 +; CHECK-NEXT: sub w0, w8, #1365 ; CHECK-NEXT: ret %b = sub i32 %a, 11183445 ret i32 %b @@ -218,9 +212,8 @@ define i64 @sub_two_parts_imm_i64_neg(i64 %a) { ; CHECK-LABEL: sub_two_parts_imm_i64_neg: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #42325 -; CHECK-NEXT: movk w8, #170, lsl #16 -; CHECK-NEXT: add x0, x0, x8 +; CHECK-NEXT: add x8, x0, #2730, lsl #12 // =11182080 +; CHECK-NEXT: add x0, x8, #1365 ; CHECK-NEXT: ret %b = sub i64 %a, -11183445 ret i64 %b @@ -229,14 +222,57 @@ define i32 @sub_two_parts_imm_i32_neg(i32 %a) { ; CHECK-LABEL: sub_two_parts_imm_i32_neg: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #42325 -; CHECK-NEXT: movk w8, #170, lsl #16 -; CHECK-NEXT: add w0, w0, w8 +; CHECK-NEXT: add w8, w0, #2730, lsl #12 // =11182080 +; CHECK-NEXT: add w0, w8, #1365 ; CHECK-NEXT: ret %b = sub i32 %a, -11183445 ret i32 %b } +define i32 @add_27962026(i32 %a) { +; CHECK-LABEL: add_27962026: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #43690 +; CHECK-NEXT: movk w8, #426, lsl #16 +; CHECK-NEXT: add w0, w0, w8 +; CHECK-NEXT: ret + %b = add i32 %a, 27962026 + ret i32 %b +} + +define i32 
@add_65534(i32 %a) { +; CHECK-LABEL: add_65534: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #65534 +; CHECK-NEXT: add w0, w0, w8 +; CHECK-NEXT: ret + %b = add i32 %a, 65534 + ret i32 %b +} + +declare i32 @foox(i32) + +define void @add_in_loop(i32 %0) { +; CHECK-LABEL: add_in_loop: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x30, x19, [sp, #-16]! // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w19, -8 +; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK-NEXT: mov w19, #43690 +; CHECK-NEXT: movk w19, #170, lsl #16 +; CHECK-NEXT: .LBB15_1: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: add w0, w0, w19 +; CHECK-NEXT: bl foox +; CHECK-NEXT: b .LBB15_1 + br label %2 +2: + %3 = phi i32 [ %0, %1 ], [ %5, %2 ] + %4 = add nsw i32 %3, 11184810 + %5 = tail call i32 @foox(i32 %4) #2 + br label %2 +} + define void @testing() { ; CHECK-LABEL: testing: ; CHECK: // %bb.0: @@ -244,7 +280,7 @@ ; CHECK-NEXT: ldr x8, [x8, :got_lo12:var_i32] ; CHECK-NEXT: ldr w9, [x8] ; CHECK-NEXT: cmp w9, #4095 -; CHECK-NEXT: b.ne .LBB13_6 +; CHECK-NEXT: b.ne .LBB16_6 ; CHECK-NEXT: // %bb.1: // %test2 ; CHECK-NEXT: adrp x10, :got:var2_i32 ; CHECK-NEXT: add w11, w9, #1 @@ -252,26 +288,26 @@ ; CHECK-NEXT: str w11, [x8] ; CHECK-NEXT: ldr w10, [x10] ; CHECK-NEXT: cmp w10, #3567, lsl #12 // =14610432 -; CHECK-NEXT: b.lo .LBB13_6 +; CHECK-NEXT: b.lo .LBB16_6 ; CHECK-NEXT: // %bb.2: // %test3 ; CHECK-NEXT: add w11, w9, #2 ; CHECK-NEXT: cmp w9, #123 ; CHECK-NEXT: str w11, [x8] -; CHECK-NEXT: b.lt .LBB13_6 +; CHECK-NEXT: b.lt .LBB16_6 ; CHECK-NEXT: // %bb.3: // %test4 ; CHECK-NEXT: add w11, w9, #3 ; CHECK-NEXT: cmp w10, #321 ; CHECK-NEXT: str w11, [x8] -; CHECK-NEXT: b.gt .LBB13_6 +; CHECK-NEXT: b.gt .LBB16_6 ; CHECK-NEXT: // %bb.4: // %test5 ; CHECK-NEXT: add w11, w9, #4 ; CHECK-NEXT: cmn w10, #443 ; CHECK-NEXT: str w11, [x8] -; CHECK-NEXT: b.ge .LBB13_6 +; CHECK-NEXT: b.ge .LBB16_6 ; CHECK-NEXT: // %bb.5: // %test6 ; CHECK-NEXT: add w9, w9, #5 ; CHECK-NEXT: str w9, [x8] -; 
CHECK-NEXT: .LBB13_6: // %common.ret +; CHECK-NEXT: .LBB16_6: // %common.ret ; CHECK-NEXT: ret %val = load i32, i32* @var_i32 %val2 = load i32, i32* @var2_i32 diff --git a/llvm/test/Transforms/CodeGenPrepare/AArch64/large-offset-gep.ll b/llvm/test/Transforms/CodeGenPrepare/AArch64/large-offset-gep.ll --- a/llvm/test/Transforms/CodeGenPrepare/AArch64/large-offset-gep.ll +++ b/llvm/test/Transforms/CodeGenPrepare/AArch64/large-offset-gep.ll @@ -214,10 +214,9 @@ ; CHECK-LABEL: test5: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ldr x9, [x0] -; CHECK-NEXT: mov w10, #14464 -; CHECK-NEXT: movk w10, #1, lsl #16 ; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: add x9, x9, x10 +; CHECK-NEXT: add x9, x9, #19, lsl #12 // =77824 +; CHECK-NEXT: add x9, x9, #2176 ; CHECK-NEXT: cmp w8, w1 ; CHECK-NEXT: b.ge .LBB4_2 ; CHECK-NEXT: .LBB4_1: // %while_body