Index: llvm/lib/Target/AArch64/AArch64.h
===================================================================
--- llvm/lib/Target/AArch64/AArch64.h
+++ llvm/lib/Target/AArch64/AArch64.h
@@ -51,6 +51,7 @@
 FunctionPass *createFalkorHWPFFixPass();
 FunctionPass *createFalkorMarkStridedAccessesPass();
 FunctionPass *createAArch64BranchTargetsPass();
+FunctionPass *createAArch64MIPeepholeOptPass();
 FunctionPass *createAArch64CleanupLocalDynamicTLSPass();
@@ -82,6 +83,7 @@
 void initializeAArch64SpeculationHardeningPass(PassRegistry&);
 void initializeAArch64LoadStoreOptPass(PassRegistry&);
 void initializeAArch64LowerHomogeneousPrologEpilogPass(PassRegistry &);
+void initializeAArch64MIPeepholeOptPass(PassRegistry &);
 void initializeAArch64SIMDInstrOptPass(PassRegistry&);
 void initializeAArch64O0PreLegalizerCombinerPass(PassRegistry &);
 void initializeAArch64PreLegalizerCombinerPass(PassRegistry&);
Index: llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
===================================================================
--- /dev/null
+++ llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
@@ -0,0 +1,177 @@
+//===- AArch64MIPeepholeOpt.cpp - AArch64 MI peephole optimization pass ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass performs the following peephole optimizations at the MIR level.
+//
+// 1. MOVi32imm + ANDWrr ==> ANDWri + ANDWri
+//    MOVi64imm + ANDXrr ==> ANDXri + ANDXri
+//
+// The mov pseudo instruction may be expanded into multiple mov instructions
+// later. In that case, we can try to split the constant operand of the mov
+// into two bitmask immediates, which produces two AND instructions instead of
+// multiple `mov` + `and` instructions.
+//===----------------------------------------------------------------------===//
+
+#include "AArch64ExpandImm.h"
+#include "AArch64InstrInfo.h"
+#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-mi-peephole-opt"
+
+namespace {
+
+struct AArch64MIPeepholeOpt : public MachineFunctionPass {
+  static char ID;
+
+  AArch64MIPeepholeOpt() : MachineFunctionPass(ID) {
+    initializeAArch64MIPeepholeOptPass(*PassRegistry::getPassRegistry());
+  }
+
+  const AArch64InstrInfo *TII;
+  MachineLoopInfo *MLI;
+  MachineRegisterInfo *MRI;
+
+  template <typename T>
+  bool visitAND(MachineInstr &MI,
+                SmallVectorImpl<MachineInstr *> &ToBeRemoved);
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  StringRef getPassName() const override {
+    return "AArch64 MI Peephole Optimization pass";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    AU.addRequired<MachineLoopInfo>();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+
+char AArch64MIPeepholeOpt::ID = 0;
+
+} // end anonymous namespace
+
+INITIALIZE_PASS(AArch64MIPeepholeOpt, "aarch64-mi-peephole-opt",
+                "AArch64 MI Peephole Optimization", false, false)
+
+template <typename T>
+bool AArch64MIPeepholeOpt::visitAND(
+    MachineInstr &MI, SmallVectorImpl<MachineInstr *> &ToBeRemoved) {
+  // Try the below transformation.
+  //
+  // MOVi32imm + ANDWrr ==> ANDWri + ANDWri
+  // MOVi64imm + ANDXrr ==> ANDXri + ANDXri
+  //
+  // The mov pseudo instruction may be expanded into multiple mov instructions
+  // later. Let's try to split the constant operand of the mov into two bitmask
+  // immediates, so that only two AND instructions are emitted instead of
+  // multiple mov + and instructions.
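+  //
+  // As a concrete illustration (the values are taken from the tests added in
+  // this patch), the constant 2098176 (0x00200400) is not a valid logical
+  // immediate, so without this transformation it is materialized with a
+  // mov/movk pair feeding a register-register AND:
+  //
+  //   mov  w8, #1024
+  //   movk w8, #32, lsl #16
+  //   and  w8, w0, w8
+  //
+  // Splitting it into the bitmask immediates 0x003ffc00 and 0xffe007ff needs
+  // only two instructions:
+  //
+  //   and  w8, w0, #0x3ffc00
+  //   and  w8, w8, #0xffe007ff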
+
+  // Check whether the AND's basic block is inside a loop and, if so, whether
+  // the AND is loop invariant.
+  MachineBasicBlock *MBB = MI.getParent();
+  MachineLoop *L = MLI->getLoopFor(MBB);
+  if (L && !L->isLoopInvariant(MI))
+    return false;
+
+  // Check whether the AND's operand is a MOV with an immediate.
+  MachineInstr *DefMI = MRI->getUniqueVRegDef(MI.getOperand(2).getReg());
+  MachineInstr *SubregToRegMI = nullptr;
+  // If it is a SUBREG_TO_REG, look through it to its source operand.
+  if (DefMI->getOpcode() == TargetOpcode::SUBREG_TO_REG) {
+    SubregToRegMI = DefMI;
+    DefMI = MRI->getUniqueVRegDef(DefMI->getOperand(2).getReg());
+  }
+
+  if (DefMI->getOpcode() != AArch64::MOVi32imm &&
+      DefMI->getOpcode() != AArch64::MOVi64imm)
+    return false;
+
+  // If the current immediate is already a valid bitmask immediate, there is
+  // no need to split it.
+  unsigned RegSize = sizeof(T) * 8;
+  uint64_t Imm = DefMI->getOperand(1).getImm();
+  if (AArch64_AM::isLogicalImmediate(Imm, RegSize))
+    return false;
+
+  // Check whether the bitmask immediate can be split.
+  T UImm = static_cast<T>(Imm);
+  if (!AArch64_AM::isValidAndSplitBitmaskImm(UImm, RegSize))
+    return false;
+
+  // Split the bitmask immediate into two.
+  T Imm1Enc = AArch64_AM::splitAndBitmaskImm(UImm, RegSize, true);
+  T Imm2Enc = AArch64_AM::splitAndBitmaskImm(UImm, RegSize, false);
+
+  // Create the new AND MIs.
+  DebugLoc DL = MI.getDebugLoc();
+  Register DstReg = MI.getOperand(0).getReg();
+  Register SrcReg = MI.getOperand(1).getReg();
+  Register NewTmpReg = MRI->createVirtualRegister(MRI->getRegClass(DstReg));
+  unsigned Opcode = (RegSize == 32) ? AArch64::ANDWri : AArch64::ANDXri;
+
+  BuildMI(*MBB, MI, DL, TII->get(Opcode), NewTmpReg)
+      .addReg(SrcReg)
+      .addImm(Imm1Enc);
+
+  BuildMI(*MBB, MI, DL, TII->get(Opcode), DstReg)
+      .addReg(NewTmpReg)
+      .addImm(Imm2Enc);
+
+  ToBeRemoved.push_back(&MI);
+
+  // If the mov instruction has only one use, delete it.
+  if (MRI->hasOneUse(DefMI->getOperand(0).getReg()))
+    ToBeRemoved.push_back(DefMI);
+
+  // If the SUBREG_TO_REG instruction has only one use, delete it as well.
+  if (SubregToRegMI && MRI->hasOneUse(SubregToRegMI->getOperand(0).getReg()))
+    ToBeRemoved.push_back(SubregToRegMI);
+
+  return true;
+}
+
+bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) {
+  if (skipFunction(MF.getFunction()))
+    return false;
+
+  TII = static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
+  MLI = &getAnalysis<MachineLoopInfo>();
+  MRI = &MF.getRegInfo();
+
+  if (!MRI->isSSA())
+    return false;
+
+  bool Changed = false;
+  SmallVector<MachineInstr *, 8> ToBeRemoved;
+
+  for (MachineBasicBlock &MBB : MF) {
+    for (MachineInstr &MI : MBB) {
+      switch (MI.getOpcode()) {
+      default:
+        break;
+      case AArch64::ANDWrr:
+        Changed |= visitAND<uint32_t>(MI, ToBeRemoved);
+        break;
+      case AArch64::ANDXrr:
+        Changed |= visitAND<uint64_t>(MI, ToBeRemoved);
+        break;
+      }
+    }
+  }
+
+  for (MachineInstr *MI : ToBeRemoved)
+    MI->eraseFromParent();
+
+  return Changed;
+}
+
+FunctionPass *llvm::createAArch64MIPeepholeOptPass() {
+  return new AArch64MIPeepholeOpt();
+}
Index: llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -195,6 +195,7 @@
   initializeAArch64DeadRegisterDefinitionsPass(*PR);
   initializeAArch64ExpandPseudoPass(*PR);
   initializeAArch64LoadStoreOptPass(*PR);
+  initializeAArch64MIPeepholeOptPass(*PR);
   initializeAArch64SIMDInstrOptPass(*PR);
   initializeAArch64O0PreLegalizerCombinerPass(*PR);
   initializeAArch64PreLegalizerCombinerPass(*PR);
@@ -479,6 +480,7 @@
   bool addRegBankSelect() override;
   void addPreGlobalInstructionSelect() override;
   bool addGlobalInstructionSelect() override;
+  void addMachineSSAOptimization() override;
   bool addILPOpts() override;
   void addPreRegAlloc() override;
   void addPostRegAlloc() override;
@@ -649,6 +651,14 @@
   return false;
 }
+
+void AArch64PassConfig::addMachineSSAOptimization() {
+  // Run the default MachineSSAOptimization pipeline first.
+  TargetPassConfig::addMachineSSAOptimization();
+
+  if (TM->getOptLevel() != CodeGenOpt::None)
+    addPass(createAArch64MIPeepholeOptPass());
+}
+
 bool AArch64PassConfig::addILPOpts() {
   if (EnableCondOpt)
     addPass(createAArch64ConditionOptimizerPass());
Index: llvm/lib/Target/AArch64/CMakeLists.txt
===================================================================
--- llvm/lib/Target/AArch64/CMakeLists.txt
+++ llvm/lib/Target/AArch64/CMakeLists.txt
@@ -66,6 +66,7 @@
   AArch64LowerHomogeneousPrologEpilog.cpp
   AArch64MachineFunctionInfo.cpp
   AArch64MacroFusion.cpp
+  AArch64MIPeepholeOpt.cpp
   AArch64MCInstLower.cpp
   AArch64PromoteConstant.cpp
   AArch64PBQPRegAlloc.cpp
Index: llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
===================================================================
--- llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
+++ llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
@@ -13,6 +13,7 @@
 #ifndef LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64ADDRESSINGMODES_H
 #define LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64ADDRESSINGMODES_H
+#include "AArch64ExpandImm.h"
 #include "llvm/ADT/APFloat.h"
 #include "llvm/ADT/APInt.h"
 #include "llvm/ADT/bit.h"
@@ -337,6 +338,66 @@
   return true;
 }
+
+template <typename T>
+static inline bool isValidAndSplitBitmaskImm(T Imm, unsigned regSize) {
+  assert((regSize == 32 || regSize == 64) &&
+         "Invalid regSize for AndsplitBitmaskImm");
+  T UImm = static_cast<T>(Imm);
+  if (isLogicalImmediate(UImm, regSize))
+    return false;
+
+  // If this immediate can be handled by one instruction, do not split it.
+  SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
+  AArch64_IMM::expandMOVImm(UImm, regSize, Insn);
+  if (Insn.size() == 1)
+    return false;
+
+  // A bitmask immediate consists of consecutive ones. Take, for example, the
+  // constant 0b00000000001000000000010000000000, which does not consist of
+  // consecutive ones. It can be split into the two bitmask immediates
+  // 0b00000000001111111111110000000000 and 0b11111111111000000000011111111111,
+  // and ANDing with both of them reproduces the original constant.
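+  //
+  // Worked through in hex, the example constant above is 0x00200400, so
+  // LowestBitSet = 10 and HighestBitSet = 21, which gives
+  //   NewImm1 = (2 << 21) - (1 << 10)    = 0x003ffc00
+  //   NewImm2 = 0x00200400 | ~0x003ffc00 = 0xffe007ff
+  // and indeed (0x003ffc00 & 0xffe007ff) == 0x00200400.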
+  unsigned LowestBitSet = countTrailingZeros(UImm);
+  unsigned HighestBitSet = Log2_64(UImm);
+
+  // Create a mask which is filled with ones from the position of the lowest
+  // set bit to the position of the highest set bit.
+  T NewImm1 = (static_cast<T>(2) << HighestBitSet) -
+              (static_cast<T>(1) << LowestBitSet);
+  // Create a mask which is filled with ones outside the range between the
+  // lowest and highest set bits.
+  T NewImm2 = UImm | ~NewImm1;
+
+  // If the split value is not a valid bitmask immediate, do not split this
+  // constant.
+  if (!isLogicalImmediate(NewImm2, regSize))
+    return false;
+  return true;
+}
+
+template <typename T>
+static inline T splitAndBitmaskImm(T Imm, unsigned regSize, bool FirstImm) {
+  assert((regSize == 32 || regSize == 64) &&
+         "Invalid regSize for AndsplitBitmaskImm");
+
+  unsigned LowestBitSet = countTrailingZeros(Imm);
+  unsigned HighestBitSet = Log2_64(Imm);
+
+  // Create a mask which is filled with ones from the position of the lowest
+  // set bit to the position of the highest set bit.
+  T Imm1 = (static_cast<T>(2) << HighestBitSet) -
+           (static_cast<T>(1) << LowestBitSet);
+  T Imm1Enc = encodeLogicalImmediate(Imm1, regSize);
+  if (FirstImm)
+    return Imm1Enc;
+
+  // Create a mask which is filled with ones outside the range between the
+  // lowest and highest set bits.
+  T Imm2 = Imm | ~Imm1;
+  T Imm2Enc = AArch64_AM::encodeLogicalImmediate(Imm2, regSize);
+  return Imm2Enc;
+}
+
 //===----------------------------------------------------------------------===//
 // Floating-point Immediates
 //
Index: llvm/test/CodeGen/AArch64/O3-pipeline.ll
===================================================================
--- llvm/test/CodeGen/AArch64/O3-pipeline.ll
+++ llvm/test/CodeGen/AArch64/O3-pipeline.ll
@@ -40,7 +40,7 @@
 ; CHECK-NEXT:       Induction Variable Users
 ; CHECK-NEXT:       Loop Strength Reduction
 ; CHECK-NEXT:       Basic Alias Analysis (stateless AA impl)
-; CHECK-NEXT:       Function Alias Analysis Results 
+; CHECK-NEXT:       Function Alias Analysis Results
 ; CHECK-NEXT:       Merge contiguous icmps into a memcmp
 ; CHECK-NEXT:       Natural Loop Information
 ; CHECK-NEXT:       Lazy Branch Probability Analysis
@@ -131,6 +131,7 @@
 ; CHECK-NEXT:       Machine code sinking
 ; CHECK-NEXT:       Peephole Optimizations
 ; CHECK-NEXT:       Remove dead machine instructions
+; CHECK-NEXT:       AArch64 MI Peephole Optimization pass
 ; CHECK-NEXT:       AArch64 Dead register definitions
 ; CHECK-NEXT:       Detect Dead Lanes
 ; CHECK-NEXT:       Process Implicit Definitions
Index: llvm/test/CodeGen/AArch64/aarch64-split-and-bitmask-immediate.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/aarch64-split-and-bitmask-immediate.ll
@@ -0,0 +1,212 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s
+
+define i8 @test1(i32 %a) {
+; CHECK-LABEL: test1:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    and w8, w0, #0x3ffc00
+; CHECK-NEXT:    and w8, w8, #0xffe007ff
+; CHECK-NEXT:    cmp w8, #1024
+; CHECK-NEXT:    cset w0, eq
+; CHECK-NEXT:    ret
+entry:
+  %and = and i32 %a, 2098176
+  %cmp = icmp eq i32 %and, 1024
+  %conv = zext i1 %cmp to i8
+  ret i8 %conv
+}
+
+; This constant should not be split because it can be handled by one mov.
+define i8 @test2(i32 %a) {
+; CHECK-LABEL: test2:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w8, #135
+; CHECK-NEXT:    and w8, w0, w8
+; CHECK-NEXT:    cmp w8, #1024
+; CHECK-NEXT:    cset w0, eq
+; CHECK-NEXT:    ret
+entry:
+  %and = and i32 %a, 135
+  %cmp = icmp eq i32 %and, 1024
+  %conv = zext i1 %cmp to i8
+  ret i8 %conv
+}
+
+; This constant should not be split because the split immediate is not a valid
+; bitmask immediate.
+define i8 @test3(i32 %a) {
+; CHECK-LABEL: test3:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w8, #1024
+; CHECK-NEXT:    movk w8, #33, lsl #16
+; CHECK-NEXT:    and w8, w0, w8
+; CHECK-NEXT:    cmp w8, #1024
+; CHECK-NEXT:    cset w0, eq
+; CHECK-NEXT:    ret
+entry:
+  %and = and i32 %a, 2163712
+  %cmp = icmp eq i32 %and, 1024
+  %conv = zext i1 %cmp to i8
+  ret i8 %conv
+}
+
+define i8 @test4(i64 %a) {
+; CHECK-LABEL: test4:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    and x8, x0, #0x3ffc00
+; CHECK-NEXT:    and x8, x8, #0xffffffffffe007ff
+; CHECK-NEXT:    cmp x8, #1024
+; CHECK-NEXT:    cset w0, eq
+; CHECK-NEXT:    ret
+entry:
+  %and = and i64 %a, 2098176
+  %cmp = icmp eq i64 %and, 1024
+  %conv = zext i1 %cmp to i8
+  ret i8 %conv
+}
+
+define i8 @test5(i64 %a) {
+; CHECK-LABEL: test5:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    and x8, x0, #0x3ffffc000
+; CHECK-NEXT:    and x8, x8, #0xfffffffe00007fff
+; CHECK-NEXT:    cmp x8, #1024
+; CHECK-NEXT:    cset w0, eq
+; CHECK-NEXT:    ret
+entry:
+  %and = and i64 %a, 8589950976
+  %cmp = icmp eq i64 %and, 1024
+  %conv = zext i1 %cmp to i8
+  ret i8 %conv
+}
+
+; This constant should not be split because it can be handled by one mov.
+define i8 @test6(i64 %a) {
+; CHECK-LABEL: test6:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w8, #135
+; CHECK-NEXT:    and x8, x0, x8
+; CHECK-NEXT:    cmp x8, #1024
+; CHECK-NEXT:    cset w0, eq
+; CHECK-NEXT:    ret
+entry:
+  %and = and i64 %a, 135
+  %cmp = icmp eq i64 %and, 1024
+  %conv = zext i1 %cmp to i8
+  ret i8 %conv
+}
+
+; This constant should not be split because the split immediate is not a valid
+; bitmask immediate.
+define i8 @test7(i64 %a) {
+; CHECK-LABEL: test7:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w8, #1024
+; CHECK-NEXT:    movk w8, #33, lsl #16
+; CHECK-NEXT:    and x8, x0, x8
+; CHECK-NEXT:    cmp x8, #1024
+; CHECK-NEXT:    cset w0, eq
+; CHECK-NEXT:    ret
+entry:
+  %and = and i64 %a, 2163712
+  %cmp = icmp eq i64 %and, 1024
+  %conv = zext i1 %cmp to i8
+  ret i8 %conv
+}
+
+; The split bitmask immediates should be hoisted outside the loop because they
+; are loop invariant.
+define void @test8(i64 %a, i64* noalias %src, i64* noalias %dst, i64 %n) {
+; CHECK-LABEL: test8:
+; CHECK:       // %bb.0: // %loop.ph
+; CHECK-NEXT:    and x9, x0, #0x3ffc00
+; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    and x9, x9, #0xffffffffffe007ff
+; CHECK-NEXT:    b .LBB7_2
+; CHECK-NEXT:  .LBB7_1: // %for.inc
+; CHECK-NEXT:    // in Loop: Header=BB7_2 Depth=1
+; CHECK-NEXT:    add x8, x8, #1
+; CHECK-NEXT:    cmp x8, x3
+; CHECK-NEXT:    b.gt .LBB7_4
+; CHECK-NEXT:  .LBB7_2: // %loop
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    cmp x8, x9
+; CHECK-NEXT:    b.hs .LBB7_1
+; CHECK-NEXT:  // %bb.3: // %if.then
+; CHECK-NEXT:    // in Loop: Header=BB7_2 Depth=1
+; CHECK-NEXT:    lsl x10, x8, #3
+; CHECK-NEXT:    ldr x11, [x1, x10]
+; CHECK-NEXT:    str x11, [x2, x10]
+; CHECK-NEXT:    b .LBB7_1
+; CHECK-NEXT:  .LBB7_4: // %exit
+; CHECK-NEXT:    ret
+loop.ph:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ %inc, %for.inc ], [ 0, %loop.ph ]
+  %and = and i64 %a, 2098176
+  %cmp = icmp ult i64 %iv, %and
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:
+  %src.arrayidx = getelementptr inbounds i64, i64* %src, i64 %iv
+  %val = load i64, i64* %src.arrayidx
+  %dst.arrayidx = getelementptr inbounds i64, i64* %dst, i64 %iv
+  store i64 %val, i64* %dst.arrayidx
+  br label %for.inc
+
+if.else:
+  br label %for.inc
+
+for.inc:
+  %inc = add nuw nsw i64 %iv, 1
+  %cond = icmp sgt i64 %inc, %n
+  br i1 %cond, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+; This constant should not be split because the `and` is not loop invariant.
+define i32 @test9(i32* nocapture %x, i32* nocapture readonly %y, i32 %n) {
+; CHECK-LABEL: test9:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    cmp w2, #1
+; CHECK-NEXT:    b.lt .LBB8_3
+; CHECK-NEXT:  // %bb.1: // %for.body.preheader
+; CHECK-NEXT:    mov w9, #1024
+; CHECK-NEXT:    mov w8, w2
+; CHECK-NEXT:    movk w9, #32, lsl #16
+; CHECK-NEXT:  .LBB8_2: // %for.body
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ldr w10, [x1], #4
+; CHECK-NEXT:    subs x8, x8, #1
+; CHECK-NEXT:    and w10, w10, w9
+; CHECK-NEXT:    str w10, [x0], #4
+; CHECK-NEXT:    b.ne .LBB8_2
+; CHECK-NEXT:  .LBB8_3: // %for.cond.cleanup
+; CHECK-NEXT:    mov w0, wzr
+; CHECK-NEXT:    ret
+entry:
+  %cmp8 = icmp sgt i32 %n, 0
+  br i1 %cmp8, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %n to i64
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret i32 0
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %y, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %and = and i32 %0, 2098176
+  %arrayidx2 = getelementptr inbounds i32, i32* %x, i64 %indvars.iv
+  store i32 %and, i32* %arrayidx2, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
Index: llvm/test/CodeGen/AArch64/unfold-masked-merge-scalar-constmask-innerouter.ll
===================================================================
--- llvm/test/CodeGen/AArch64/unfold-masked-merge-scalar-constmask-innerouter.ll
+++ llvm/test/CodeGen/AArch64/unfold-masked-merge-scalar-constmask-innerouter.ll
@@ -245,10 +245,9 @@
 define i32 @n0_badconstmask(i32 %x, i32 %y) {
 ; CHECK-LABEL: n0_badconstmask:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w9, #256
-; CHECK-NEXT:    movk w9, #65280, lsl #16
+; CHECK-NEXT:    and w9, w1, #0xffffff00
 ; CHECK-NEXT:    and w8, w0, #0xffff00
-; CHECK-NEXT:    and w9, w1, w9
+; CHECK-NEXT:    and w9, w9, #0xff0001ff
 ; CHECK-NEXT:    orr w0, w8, w9
 ; CHECK-NEXT:    ret
   %mx = and i32 %x, 16776960