Index: llvm/lib/Target/AArch64/AArch64ISelLowering.h
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -558,6 +558,9 @@
   MachineBasicBlock *EmitLoweredCatchRet(MachineInstr &MI,
                                          MachineBasicBlock *BB) const;
 
+  MachineBasicBlock *EmitExpandedAND(MachineInstr &MI, MachineBasicBlock *BB,
+                                     unsigned RegSize) const;
+
   MachineBasicBlock *
   EmitInstrWithCustomInserter(MachineInstr &MI,
                               MachineBasicBlock *MBB) const override;
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -2256,6 +2256,95 @@
   return BB;
 }
 
+MachineBasicBlock *
+AArch64TargetLowering::EmitExpandedAND(MachineInstr &MI, MachineBasicBlock *BB,
+                                       unsigned RegSize) const {
+  // Try the following transformation.
+  //
+  // MOVi32imm + ANDWrr ==> ANDWri + ANDWri
+  // MOVi64imm + ANDXrr ==> ANDXri + ANDXri
+  //
+  // The mov pseudo instruction may be expanded into multiple mov instructions
+  // later. Instead, split the constant operand of the mov into two bitmask
+  // immediates, so that only two AND instructions are emitted instead of
+  // multiple mov + and instructions.
+  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+  MachineInstr *DefMI = MRI.getUniqueVRegDef(MI.getOperand(2).getReg());
+  unsigned MovOpc = (RegSize == 32) ? AArch64::MOVi32imm : AArch64::MOVi64imm;
+  if (!DefMI || DefMI->getOpcode() != MovOpc)
+    return BB;
+
+  // If the constant is already a valid bitmask immediate, instruction
+  // selection uses a single ANDri and there is nothing to split.
+  uint64_t RegMask = (RegSize == 64) ? ~0ULL : 0xFFFFFFFFULL;
+  uint64_t OrgImm = DefMI->getOperand(1).getImm() & RegMask;
+  if (AArch64_AM::isLogicalImmediate(OrgImm, RegSize))
+    return BB;
+
+  uint64_t Mask = 0xFFFFULL;
+  for (unsigned i = 0; i < 4; i++, Mask <<= 16) {
+    // The immediate fits in one 16-bit chunk, so a single MOVZ materializes
+    // it and splitting is not profitable.
+    if ((OrgImm & Mask) == OrgImm)
+      return BB;
+  }
+
+  uint64_t OrgNImm = ~OrgImm & RegMask;
+  Mask = 0xFFFFULL;
+  for (unsigned i = 0; i < 4; i++, Mask <<= 16) {
+    // Likewise, the complement fits in one 16-bit chunk, so a single MOVN
+    // materializes the constant.
+    if ((OrgNImm & Mask) == OrgNImm)
+      return BB;
+  }
+
+  // A bitmask immediate consists of consecutive ones. Say there is a constant
+  // 0b00000000001000000000010000000000 which does not consist of consecutive
+  // ones. We can split it into two bitmask immediates like
+  // 0b00000000001111111111110000000000 and 0b11111111111000000000011111111111.
+  // ANDing with both of these immediates reproduces the original constant.
+  unsigned LowestBitSet = countTrailingZeros(OrgImm);
+  unsigned HighestBitSet = Log2_64(OrgImm);
+
+  // Create a mask that is filled with ones from the position of the lowest
+  // set bit to the position of the highest set bit. Unsigned wrap-around
+  // keeps this correct even when HighestBitSet is the top bit.
+  uint64_t NewImm1 =
+      ((2ULL << HighestBitSet) - (1ULL << LowestBitSet)) & RegMask;
+  // Create a mask that is filled with ones outside the positions of the
+  // lowest and the highest set bits.
+  uint64_t NewImm2 = (OrgImm | ~NewImm1) & RegMask;
+
+  // NewImm1 is a single run of ones, so it is always a valid bitmask
+  // immediate unless it covers the whole register, in which case NewImm2
+  // equals OrgImm and the check below fails. If NewImm2 is not a valid
+  // bitmask immediate, do not split this node.
+  if (!AArch64_AM::isLogicalImmediate(NewImm2, RegSize))
+    return BB;
+
+  // Create the two AND instructions with the split bitmask immediates.
+  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+  DebugLoc DL = MI.getDebugLoc();
+  Register DstReg = MI.getOperand(0).getReg();
+  Register SrcReg = MI.getOperand(1).getReg();
+  Register NewTmpReg = MRI.createVirtualRegister(MRI.getRegClass(DstReg));
+  uint64_t NewImm1Enc = AArch64_AM::encodeLogicalImmediate(NewImm1, RegSize);
+  uint64_t NewImm2Enc = AArch64_AM::encodeLogicalImmediate(NewImm2, RegSize);
+  unsigned Opcode = (RegSize == 32) ? AArch64::ANDWri : AArch64::ANDXri;
+
+  BuildMI(*BB, MI, DL, TII->get(Opcode), NewTmpReg)
+      .addReg(SrcReg)
+      .addImm(NewImm1Enc);
+
+  BuildMI(*BB, MI, DL, TII->get(Opcode), DstReg)
+      .addReg(NewTmpReg)
+      .addImm(NewImm2Enc);
+
+  MI.eraseFromParent();
+
+  return BB;
+}
+
 MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
     MachineInstr &MI, MachineBasicBlock *BB) const {
   switch (MI.getOpcode()) {
@@ -2275,6 +2364,10 @@
 
   case AArch64::CATCHRET:
     return EmitLoweredCatchRet(MI, BB);
+  case AArch64::ANDWrr:
+    return EmitExpandedAND(MI, BB, 32);
+  case AArch64::ANDXrr:
+    return EmitExpandedAND(MI, BB, 64);
   }
 }
 
Index: llvm/lib/Target/AArch64/AArch64InstrFormats.td
===================================================================
--- llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -2916,7 +2916,9 @@
 // Split from LogicalImm as not all instructions have both.
 multiclass LogicalReg<bits<2> opc, bit N, string mnemonic,
                       SDPatternOperator OpNode> {
-  let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
+  let isReMaterializable = 1, isAsCheapAsAMove = 1,
+      // Try to expand AND after instruction selection.
+      usesCustomInserter = !eq(mnemonic, "and") in {
   def Wrr : BaseLogicalRegPseudo<GPR32, OpNode>;
   def Xrr : BaseLogicalRegPseudo<GPR64, OpNode>;
   }
Index: llvm/test/CodeGen/AArch64/aarch64-split-and-bitmask-immediate.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/aarch64-split-and-bitmask-immediate.ll
@@ -0,0 +1,17 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s
+
+define i8 @test(i32 %a) {
+; CHECK-LABEL: test:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    and w8, w0, #0x3ffc00
+; CHECK-NEXT:    and w8, w8, #0xffe007ff
+; CHECK-NEXT:    cmp w8, #1024
+; CHECK-NEXT:    cset w0, eq
+; CHECK-NEXT:    ret
+entry:
+  %and = and i32 %a, 2098176
+  %cmp = icmp eq i32 %and, 1024
+  %conv = zext i1 %cmp to i8
+  ret i8 %conv
+}
Index: llvm/test/CodeGen/AArch64/unfold-masked-merge-scalar-constmask-innerouter.ll
===================================================================
--- llvm/test/CodeGen/AArch64/unfold-masked-merge-scalar-constmask-innerouter.ll
+++ llvm/test/CodeGen/AArch64/unfold-masked-merge-scalar-constmask-innerouter.ll
@@ -245,10 +245,9 @@
 define i32 @n0_badconstmask(i32 %x, i32 %y) {
 ; CHECK-LABEL: n0_badconstmask:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w9, #256
-; CHECK-NEXT:    movk w9, #65280, lsl #16
+; CHECK-NEXT:    and w9, w1, #0xffffff00
 ; CHECK-NEXT:    and w8, w0, #0xffff00
-; CHECK-NEXT:    and w9, w1, w9
+; CHECK-NEXT:    and w9, w9, #0xff0001ff
 ; CHECK-NEXT:    orr w0, w8, w9
 ; CHECK-NEXT:    ret
   %mx = and i32 %x, 16776960
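
For reference, the splitting arithmetic can be checked in isolation. Below is a minimal standalone C++ sketch, not part of the patch: the helper name splitAndImm32 is made up, and the GCC/Clang builtins stand in for LLVM's countTrailingZeros/Log2_64. It reproduces the NewImm1/NewImm2 computation for the nonzero 32-bit constant 0x200400 (2098176) exercised by the new test:

#include <cassert>
#include <cstdint>
#include <cstdio>

// Illustrative reimplementation of the split performed by EmitExpandedAND
// for a nonzero 32-bit immediate. Imm1 is a run of ones covering the bit
// range [LowestBitSet, HighestBitSet]; Imm2 has ones everywhere outside that
// range plus the original bits inside it.
static void splitAndImm32(uint32_t Imm, uint32_t &Imm1, uint32_t &Imm2) {
  unsigned LowestBitSet = __builtin_ctz(Imm);       // lowest set bit
  unsigned HighestBitSet = 31 - __builtin_clz(Imm); // highest set bit
  Imm1 = (uint32_t)((2ULL << HighestBitSet) - (1ULL << LowestBitSet));
  Imm2 = Imm | ~Imm1;
}

int main() {
  uint32_t Imm1, Imm2;
  splitAndImm32(0x200400, Imm1, Imm2);
  // ANDing with both masks reproduces the original immediate.
  assert((Imm1 & Imm2) == 0x200400);
  std::printf("0x%x 0x%x\n", Imm1, Imm2); // prints 0x3ffc00 0xffe007ff
  return 0;
}

These are exactly the #0x3ffc00 and #0xffe007ff immediates in the CHECK lines above. Whether the split actually fires additionally depends on the isLogicalImmediate check on NewImm2, which this sketch omits.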