Index: llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp
+++ llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp
@@ -273,6 +273,8 @@
   /// new copy.
   Register narrowExtendRegIfNeeded(Register ExtReg,
                                    MachineIRBuilder &MIB) const;
+  Register widenIfNeeded(Register Reg, unsigned Size,
+                         MachineIRBuilder &MIB) const;
   ComplexRendererFns
   selectArithExtendedRegister(MachineOperand &Root) const;
 
   void renderTruncImm(MachineInstrBuilder &MIB, const MachineInstr &MI,
@@ -1124,26 +1126,25 @@
 MachineInstr *AArch64InstructionSelector::emitTestBit(
     Register TestReg, uint64_t Bit, bool IsNegative, MachineBasicBlock *DstMBB,
     MachineIRBuilder &MIB) const {
-  MachineRegisterInfo &MRI = *MIB.getMRI();
-#ifndef NDEBUG
+  assert(TestReg.isValid());
   assert(ProduceNonFlagSettingCondBr &&
          "Cannot emit TB(N)Z with speculation tracking!");
-  assert(TestReg.isValid());
-  LLT Ty = MRI.getType(TestReg);
-  unsigned Size = Ty.getSizeInBits();
-  assert(Bit < Size &&
-         "Bit to test must be smaler than the size of a test register!");
-  assert(Ty.isScalar() && "Expected a scalar!");
-  assert(Size >= 32 && "Expected at least a 32-bit register!");
-#endif
+  MachineRegisterInfo &MRI = *MIB.getMRI();
 
   // Attempt to optimize the test bit by walking over instructions.
   TestReg = getTestBitReg(TestReg, Bit, IsNegative, MRI);
-  bool UseWReg = Bit < 32;
+  LLT Ty = MRI.getType(TestReg);
+  unsigned Size = Ty.getSizeInBits();
+  assert(!Ty.isVector() && "Expected a scalar!");
+  assert(Bit < 64 && "Bit is too large!");
 
-  // When the test register is a 64-bit register, we have to narrow to make
-  // TBNZW work.
-  if (UseWReg)
+  // TB(N)ZW only operates on 32-bit registers, and TB(N)ZX only operates on
+  // 64-bit registers. Widen or narrow the test register as needed to match
+  // the instruction we will select.
+  bool UseWReg = Bit < 32;
+  unsigned NecessarySize = UseWReg ? 32 : 64;
+  if (Size < NecessarySize)
+    TestReg = widenIfNeeded(TestReg, NecessarySize, MIB);
+  else if (Size > NecessarySize)
     TestReg = narrowExtendRegIfNeeded(TestReg, MIB);
 
   static const unsigned OpcTable[2][2] = {{AArch64::TBZX, AArch64::TBNZX},
@@ -5154,6 +5155,53 @@
   return Copy.getReg(0);
 }
 
+Register
+AArch64InstructionSelector::widenIfNeeded(Register Reg, unsigned WideSize,
+                                          MachineIRBuilder &MIB) const {
+  assert(WideSize >= 8 && "WideSize is smaller than all possible registers?");
+  MachineRegisterInfo &MRI = *MIB.getMRI();
+  unsigned NarrowSize = MRI.getType(Reg).getSizeInBits();
+  assert(WideSize >= NarrowSize &&
+         "WideSize cannot be smaller than NarrowSize!");
+
+  // If the sizes match, just return the register.
+  //
+  // If NarrowSize is an s1, then we can select it to any size, so we'll treat
+  // it as a don't care.
+  if (NarrowSize == WideSize || NarrowSize == 1)
+    return Reg;
+
+  // Now check the register classes.
+  const RegisterBank *RB = RBI.getRegBank(Reg, MRI, TRI);
+  const TargetRegisterClass *OrigRC = getMinClassForRegBank(*RB, NarrowSize);
+  const TargetRegisterClass *WideRC = getMinClassForRegBank(*RB, WideSize);
+  assert(OrigRC && "Could not determine narrow RC?");
+  assert(WideRC && "Could not determine wide RC?");
+
+  // If the sizes differ, but the register classes are the same, there is no
+  // need to insert a SUBREG_TO_REG.
+  //
+  // For example, an s8 that's supposed to be a GPR will be selected to either
+  // a GPR32 or a GPR64 register. Note that this assumes that the s8 will
+  // always end up on a GPR32.
+  if (OrigRC == WideRC)
+    return Reg;
+
+  // We have two different register classes. Insert a SUBREG_TO_REG.
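+  //
+  // SUBREG_TO_REG takes an immediate, the narrow source register, and the
+  // index of the subregister that the source occupies in the wide register
+  // class. The immediate (0 here) asserts that the bits not covered by the
+  // subregister are zero.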
+  unsigned SubReg = 0;
+  getSubRegForClass(OrigRC, TRI, SubReg);
+  assert(SubReg && "Couldn't determine subregister?");
+
+  // Build the SUBREG_TO_REG and return the new, widened register.
+  auto SubRegToReg =
+      MIB.buildInstr(AArch64::SUBREG_TO_REG, {WideRC}, {})
+          .addImm(0)
+          .addUse(Reg)
+          .addImm(SubReg);
+  constrainSelectedInstRegOperands(*SubRegToReg, TII, TRI, RBI);
+  return SubRegToReg->getOperand(0).getReg();
+}
+
 /// Select an "extended register" operand. This operand folds in an extend
 /// followed by an optional left shift.
 InstructionSelector::ComplexRendererFns
Index: llvm/test/CodeGen/AArch64/GlobalISel/opt-fold-ext-tbz-tbnz.mir
===================================================================
--- llvm/test/CodeGen/AArch64/GlobalISel/opt-fold-ext-tbz-tbnz.mir
+++ llvm/test/CodeGen/AArch64/GlobalISel/opt-fold-ext-tbz-tbnz.mir
@@ -78,9 +78,8 @@
     ; CHECK:   successors: %bb.0(0x40000000), %bb.1(0x40000000)
     ; CHECK:   liveins: $h0
    ; CHECK:   [[SUBREG_TO_REG:%[0-9]+]]:fpr32 = SUBREG_TO_REG 0, $h0, %subreg.hsub
-    ; CHECK:   %copy:gpr32all = COPY [[SUBREG_TO_REG]]
-    ; CHECK:   [[COPY:%[0-9]+]]:gpr32 = COPY %copy
-    ; CHECK:   TBNZW [[COPY]], 3, %bb.1
+    ; CHECK:   %copy:gpr32 = COPY [[SUBREG_TO_REG]]
+    ; CHECK:   TBNZW %copy, 3, %bb.1
     ; CHECK:   B %bb.0
     ; CHECK: bb.1:
     ; CHECK:   RET_ReallyLR
Index: llvm/test/CodeGen/AArch64/GlobalISel/widen-narrow-tbz-tbnz.mir
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/GlobalISel/widen-narrow-tbz-tbnz.mir
@@ -0,0 +1,193 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple aarch64-unknown-unknown -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s
+#
+# Test widening and narrowing on test bit operations using subregister copies
+# or SUBREG_TO_REG.
+--- |
+  @glob = external unnamed_addr global i1, align 4
+  define void @s1_no_copy() { ret void }
+  define void @s16_no_copy() { ret void }
+  define void @p0_no_copy() { ret void }
+  define void @widen_s32_to_s64() { ret void }
+  define void @widen_s16_to_s64() { ret void }
+  define void @narrow_s64_to_s32() { ret void }
+
+...
+---
+name:            s1_no_copy
+alignment:       4
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+body:             |
+  ; CHECK-LABEL: name: s1_no_copy
+  ; CHECK: bb.0:
+  ; CHECK:   successors: %bb.0(0x40000000), %bb.1(0x40000000)
+  ; CHECK:   %narrow:gpr32 = IMPLICIT_DEF
+  ; CHECK:   TBNZW %narrow, 0, %bb.1
+  ; CHECK:   B %bb.0
+  ; CHECK: bb.1:
+  ; CHECK:   RET_ReallyLR
+  bb.0:
+    successors: %bb.0, %bb.1
+    %narrow:gpr(s1) = G_IMPLICIT_DEF
+
+    ; There should be no copy here, because the s1 can be selected to a GPR32.
+    G_BRCOND %narrow(s1), %bb.1
+    G_BR %bb.0
+  bb.1:
+    RET_ReallyLR
+...
+---
+name:            s16_no_copy
+alignment:       4
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+body:             |
+  ; CHECK-LABEL: name: s16_no_copy
+  ; CHECK: bb.0:
+  ; CHECK:   successors: %bb.0(0x40000000), %bb.1(0x40000000)
+  ; CHECK:   %narrow:gpr32 = IMPLICIT_DEF
+  ; CHECK:   TBNZW %narrow, 0, %bb.1
+  ; CHECK:   B %bb.0
+  ; CHECK: bb.1:
+  ; CHECK:   RET_ReallyLR
+  bb.0:
+    successors: %bb.0, %bb.1
+    %narrow:gpr(s16) = G_IMPLICIT_DEF
+    %trunc:gpr(s1) = G_TRUNC %narrow(s16)
+
+    ; Look through the G_TRUNC to get the G_IMPLICIT_DEF. We don't need a
+    ; SUBREG_TO_REG here, because the s16 will end up on a 32-bit register.
+    G_BRCOND %trunc(s1), %bb.1
+    G_BR %bb.0
+  bb.1:
+    RET_ReallyLR
+...
+---
+name:            p0_no_copy
+alignment:       4
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+body:             |
+  ; CHECK-LABEL: name: p0_no_copy
+  ; CHECK: bb.0:
+  ; CHECK:   successors: %bb.0(0x40000000), %bb.1(0x40000000)
+  ; CHECK:   %glob:gpr64common = MOVaddr target-flags(aarch64-page) @glob, target-flags(aarch64-pageoff, aarch64-nc) @glob
+  ; CHECK:   %load:gpr32 = LDRBBui %glob, 0 :: (dereferenceable load 1 from @glob, align 4)
+  ; CHECK:   TBNZW %load, 0, %bb.1
+  ; CHECK:   B %bb.0
+  ; CHECK: bb.1:
+  ; CHECK:   RET_ReallyLR
+  bb.0:
+    successors: %bb.0, %bb.1
+    %glob:gpr(p0) = G_GLOBAL_VALUE @glob
+    %load:gpr(s8) = G_LOAD %glob(p0) :: (dereferenceable load 1 from @glob, align 4)
+    %trunc:gpr(s1) = G_TRUNC %load(s8)
+
+    ; Look through G_TRUNC to get the load. The load is into an s8, which will
+    ; be selected to a GPR32, so we don't need a copy.
+    G_BRCOND %trunc(s1), %bb.1
+    G_BR %bb.0
+  bb.1:
+    RET_ReallyLR
+...
+---
+name:            widen_s32_to_s64
+alignment:       4
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+body:             |
+  ; CHECK-LABEL: name: widen_s32_to_s64
+  ; CHECK: bb.0:
+  ; CHECK:   successors: %bb.0(0x40000000), %bb.1(0x40000000)
+  ; CHECK:   liveins: $w0
+  ; CHECK:   %reg:gpr32all = COPY $w0
+  ; CHECK:   [[SUBREG_TO_REG:%[0-9]+]]:gpr64 = SUBREG_TO_REG 0, %reg, %subreg.sub_32
+  ; CHECK:   TBZX [[SUBREG_TO_REG]], 33, %bb.1
+  ; CHECK:   B %bb.0
+  ; CHECK: bb.1:
+  ; CHECK:   RET_ReallyLR
+  bb.0:
+    successors: %bb.0, %bb.1
+    liveins: $w0
+    %reg:gpr(s32) = COPY $w0
+    %zext:gpr(s64) = G_ZEXT %reg(s32)
+    %bit:gpr(s64) = G_CONSTANT i64 8589934592
+    %zero:gpr(s64) = G_CONSTANT i64 0
+    %and:gpr(s64) = G_AND %zext, %bit
+    %cmp:gpr(s32) = G_ICMP intpred(eq), %and(s64), %zero
+
+    ; We should widen using a SUBREG_TO_REG here, because we need a TBZX to
+    ; get bit 33. The subregister should be sub_32.
+    %trunc:gpr(s1) = G_TRUNC %cmp(s32)
+    G_BRCOND %trunc(s1), %bb.1
+    G_BR %bb.0
+  bb.1:
+    RET_ReallyLR
+...
+---
+name:            widen_s16_to_s64
+alignment:       4
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+body:             |
+  ; CHECK-LABEL: name: widen_s16_to_s64
+  ; CHECK: bb.0:
+  ; CHECK:   successors: %bb.0(0x40000000), %bb.1(0x40000000)
+  ; CHECK:   %reg:gpr32 = IMPLICIT_DEF
+  ; CHECK:   [[SUBREG_TO_REG:%[0-9]+]]:gpr64 = SUBREG_TO_REG 0, %reg, %subreg.sub_32
+  ; CHECK:   TBZX [[SUBREG_TO_REG]], 33, %bb.1
+  ; CHECK:   B %bb.0
+  ; CHECK: bb.1:
+  ; CHECK:   RET_ReallyLR
+  bb.0:
+    successors: %bb.0, %bb.1
+    %reg:gpr(s16) = G_IMPLICIT_DEF
+    %zext:gpr(s64) = G_ZEXT %reg(s16)
+    %bit:gpr(s64) = G_CONSTANT i64 8589934592
+    %zero:gpr(s64) = G_CONSTANT i64 0
+    %and:gpr(s64) = G_AND %zext, %bit
+    %cmp:gpr(s32) = G_ICMP intpred(eq), %and(s64), %zero
+
+    ; We should widen using a SUBREG_TO_REG here, because we need a TBZX to
+    ; get bit 33. The subregister should be sub_32, because s16 will end up on
+    ; a GPR32.
+    %trunc:gpr(s1) = G_TRUNC %cmp(s32)
+    G_BRCOND %trunc(s1), %bb.1
+    G_BR %bb.0
+  bb.1:
+    RET_ReallyLR
+...
+---
+name:            narrow_s64_to_s32
+alignment:       4
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+body:             |
+  ; CHECK-LABEL: name: narrow_s64_to_s32
+  ; CHECK: bb.0:
+  ; CHECK:   successors: %bb.0(0x40000000), %bb.1(0x40000000)
+  ; CHECK:   liveins: $x0
+  ; CHECK:   %wide:gpr64 = COPY $x0
+  ; CHECK:   %trunc:gpr32 = COPY %wide.sub_32
+  ; CHECK:   TBNZW %trunc, 0, %bb.1
+  ; CHECK:   B %bb.0
+  ; CHECK: bb.1:
+  ; CHECK:   RET_ReallyLR
+  bb.0:
+    successors: %bb.0, %bb.1
+    liveins: $x0
+    %wide:gpr(s64) = COPY $x0
+
+    ; We should narrow using a subregister copy here.
+    %trunc:gpr(s1) = G_TRUNC %wide(s64)
+    G_BRCOND %trunc(s1), %bb.1
+    G_BR %bb.0
+  bb.1:
+    RET_ReallyLR
+...