Index: llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp +++ llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp @@ -991,21 +991,61 @@ } /// Return a register which can be used as a bit to test in a TB(N)Z. -static Register getTestBitReg(Register Reg, MachineRegisterInfo &MRI) { +static Register getTestBitReg(Register Reg, uint64_t Bit, + MachineRegisterInfo &MRI) { assert(Reg.isValid() && "Expected valid register!"); while (MachineInstr *MI = getDefIgnoringCopies(Reg, MRI)) { unsigned Opc = MI->getOpcode(); Register NextReg; // (tbz (any_ext x), b) -> (tbz x, b) if we don't use the extended bits. - if (Opc == TargetOpcode::G_ANYEXT || Opc == TargetOpcode::G_ZEXT) + if (Opc == TargetOpcode::G_ANYEXT || Opc == TargetOpcode::G_ZEXT) { NextReg = MI->getOperand(1).getReg(); + // Did we find something worth folding? + if (!NextReg.isValid() || !MRI.hasOneUse(NextReg)) + break; - // Did we find something worth folding? - if (!NextReg.isValid() || !MRI.hasOneUse(NextReg)) + // NextReg is worth folding. Keep looking. + Reg = NextReg; + continue; + } + + // Look through binary operations that we might want to try and fold in. + if (MI->getNumOperands() != 3) break; + MachineOperand &LHSOp = MI->getOperand(1); + if (!LHSOp.isReg()) + break; + MachineOperand &RHSOp = MI->getOperand(2); + if (!RHSOp.isReg()) + break; + + // Look for a constant on either side of the operation. + Register TestReg = LHSOp.getReg(); + Register ConstantReg = RHSOp.getReg(); + auto VRegAndVal = getConstantVRegValWithLookThrough(ConstantReg, MRI); + if (!VRegAndVal) { + std::swap(ConstantReg, TestReg); + VRegAndVal = getConstantVRegValWithLookThrough(ConstantReg, MRI); + if (!VRegAndVal) + break; + } + + // We found a constant. Check if it is compatible with the bit. + uint64_t C = static_cast(VRegAndVal->Value); + switch(Opc) { + default: + break; + case TargetOpcode::G_AND: + // (tbz (and x, m), b) -> (tbz x, b) when the b-th bit of m is set. + if ((C >> Bit) & 1) + NextReg = TestReg; + break; + } - // NextReg is worth folding. Keep looking. + // Check if we found anything worth folding. + if (!NextReg.isValid() || !MRI.hasOneUse(NextReg)) + break; Reg = NextReg; } return Reg; @@ -1058,7 +1098,7 @@ // Try to optimize the TB(N)Z. uint64_t Bit = Log2_64(static_cast(MaybeBit->Value)); Register TestReg = AndInst->getOperand(1).getReg(); - TestReg = getTestBitReg(TestReg, MRI); + TestReg = getTestBitReg(TestReg, Bit, MRI); // Choose the correct TB(N)Z opcode to use. unsigned Opc = 0; Index: llvm/test/CodeGen/AArch64/GlobalISel/opt-fold-and-tbz-tbnz.mir =================================================================== --- /dev/null +++ llvm/test/CodeGen/AArch64/GlobalISel/opt-fold-and-tbz-tbnz.mir @@ -0,0 +1,113 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple aarch64-unknown-unknown -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s +# +# Check folding an AND into a G_BRCOND which has been matched as a TB(N)Z. +... +--- +name: fold_and_rhs +alignment: 4 +legalized: true +regBankSelected: true +body: | + ; CHECK-LABEL: name: fold_and_rhs + ; CHECK: bb.0: + ; CHECK: successors: %bb.0(0x40000000), %bb.1(0x40000000) + ; CHECK: %copy:gpr64all = COPY $x0 + ; CHECK: [[COPY:%[0-9]+]]:gpr32all = COPY %copy.sub_32 + ; CHECK: [[COPY1:%[0-9]+]]:gpr32 = COPY [[COPY]] + ; CHECK: TBNZW [[COPY1]], 3, %bb.1 + ; CHECK: B %bb.0 + ; CHECK: bb.1: + ; CHECK: RET_ReallyLR + bb.0: + successors: %bb.0, %bb.1 + liveins: $x0 + %copy:gpr(s64) = COPY $x0 + %bit:gpr(s64) = G_CONSTANT i64 8 + %zero:gpr(s64) = G_CONSTANT i64 0 + %fold_cst:gpr(s64) = G_CONSTANT i64 8 + + ; tbnz (and x, 8), 3 == tbnz x, 3 because the third bit of x & 8 is 1 when + ; the third bit of x is 1. + %fold_me:gpr(s64) = G_AND %copy, %fold_cst + + %and:gpr(s64) = G_AND %fold_me, %bit + %cmp:gpr(s32) = G_ICMP intpred(ne), %and(s64), %zero + %cmp_trunc:gpr(s1) = G_TRUNC %cmp(s32) + G_BRCOND %cmp_trunc(s1), %bb.1 + G_BR %bb.0 + bb.1: + RET_ReallyLR +... +--- +name: fold_and_lhs +alignment: 4 +legalized: true +regBankSelected: true +body: | + ; CHECK-LABEL: name: fold_and_lhs + ; CHECK: bb.0: + ; CHECK: successors: %bb.0(0x40000000), %bb.1(0x40000000) + ; CHECK: %copy:gpr64all = COPY $x0 + ; CHECK: [[COPY:%[0-9]+]]:gpr32all = COPY %copy.sub_32 + ; CHECK: [[COPY1:%[0-9]+]]:gpr32 = COPY [[COPY]] + ; CHECK: TBNZW [[COPY1]], 3, %bb.1 + ; CHECK: B %bb.0 + ; CHECK: bb.1: + ; CHECK: RET_ReallyLR + bb.0: + successors: %bb.0, %bb.1 + liveins: $x0 + %copy:gpr(s64) = COPY $x0 + %bit:gpr(s64) = G_CONSTANT i64 8 + %zero:gpr(s64) = G_CONSTANT i64 0 + %fold_cst:gpr(s64) = G_CONSTANT i64 8 + + ; Same as above, but with the constant on the other side. + %fold_me:gpr(s64) = G_AND %fold_cst, %copy + + %and:gpr(s64) = G_AND %fold_me, %bit + %cmp:gpr(s32) = G_ICMP intpred(ne), %and(s64), %zero + %cmp_trunc:gpr(s1) = G_TRUNC %cmp(s32) + G_BRCOND %cmp_trunc(s1), %bb.1 + G_BR %bb.0 + bb.1: + RET_ReallyLR +... +--- +name: dont_fold_and +alignment: 4 +legalized: true +regBankSelected: true +body: | + ; CHECK-LABEL: name: dont_fold_and + ; CHECK: bb.0: + ; CHECK: successors: %bb.0(0x40000000), %bb.1(0x40000000) + ; CHECK: %copy:gpr64 = COPY $x0 + ; CHECK: %fold_me:gpr64sp = ANDXri %copy, 4098 + ; CHECK: [[COPY:%[0-9]+]]:gpr64all = COPY %fold_me + ; CHECK: [[COPY1:%[0-9]+]]:gpr32all = COPY [[COPY]].sub_32 + ; CHECK: [[COPY2:%[0-9]+]]:gpr32 = COPY [[COPY1]] + ; CHECK: TBNZW [[COPY2]], 3, %bb.1 + ; CHECK: B %bb.0 + ; CHECK: bb.1: + ; CHECK: RET_ReallyLR + bb.0: + successors: %bb.0, %bb.1 + liveins: $x0 + %copy:gpr(s64) = COPY $x0 + %bit:gpr(s64) = G_CONSTANT i64 8 + %zero:gpr(s64) = G_CONSTANT i64 0 + + ; tbnz (and x, 7), 3 != tbnz x, 3, because the third bit of x & 7 is always + ; zero. + %fold_cst:gpr(s64) = G_CONSTANT i64 7 + + %fold_me:gpr(s64) = G_AND %copy, %fold_cst + %and:gpr(s64) = G_AND %fold_me, %bit + %cmp:gpr(s32) = G_ICMP intpred(ne), %and(s64), %zero + %cmp_trunc:gpr(s1) = G_TRUNC %cmp(s32) + G_BRCOND %cmp_trunc(s1), %bb.1 + G_BR %bb.0 + bb.1: + RET_ReallyLR