Index: llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp
+++ llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp
@@ -273,6 +273,8 @@
   /// new copy.
   Register narrowExtendRegIfNeeded(Register ExtReg,
                                    MachineIRBuilder &MIB) const;
+  Register widenIfNeeded(Register Reg, unsigned Size,
+                         MachineIRBuilder &MIB) const;
   ComplexRendererFns
   selectArithExtendedRegister(MachineOperand &Root) const;
 
   void renderTruncImm(MachineInstrBuilder &MIB, const MachineInstr &MI,
@@ -1124,26 +1126,25 @@
 MachineInstr *AArch64InstructionSelector::emitTestBit(
     Register TestReg, uint64_t Bit, bool IsNegative, MachineBasicBlock *DstMBB,
     MachineIRBuilder &MIB) const {
-  MachineRegisterInfo &MRI = *MIB.getMRI();
-#ifndef NDEBUG
+  assert(TestReg.isValid());
   assert(ProduceNonFlagSettingCondBr &&
          "Cannot emit TB(N)Z with speculation tracking!");
-  assert(TestReg.isValid());
-  LLT Ty = MRI.getType(TestReg);
-  unsigned Size = Ty.getSizeInBits();
-  assert(Bit < Size &&
-         "Bit to test must be smaler than the size of a test register!");
-  assert(Ty.isScalar() && "Expected a scalar!");
-  assert(Size >= 32 && "Expected at least a 32-bit register!");
-#endif
+  MachineRegisterInfo &MRI = *MIB.getMRI();
 
   // Attempt to optimize the test bit by walking over instructions.
   TestReg = getTestBitReg(TestReg, Bit, IsNegative, MRI);
-  bool UseWReg = Bit < 32;
+  LLT Ty = MRI.getType(TestReg);
+  unsigned Size = Ty.getSizeInBits();
+  assert(!Ty.isVector() && "Expected a scalar!");
+  assert(Bit < 64 && "Bit is too large!");
 
-  // When the test register is a 64-bit register, we have to narrow to make
-  // TBNZW work.
-  if (UseWReg)
+  // TB(N)ZW only operates on 32-bit registers, and TB(N)ZX only operates on
+  // 64-bit registers. Widen or narrow the test register as needed to match
+  // the instruction we will select.
+  bool UseWReg = Bit < 32;
+  unsigned NecessarySize = UseWReg ? 32 : 64;
+  if (Size < NecessarySize)
+    TestReg = widenIfNeeded(TestReg, NecessarySize, MIB);
+  else if (Size > NecessarySize)
     TestReg = narrowExtendRegIfNeeded(TestReg, MIB);
 
   static const unsigned OpcTable[2][2] = {{AArch64::TBZX, AArch64::TBNZX},
@@ -5154,6 +5155,53 @@
   return Copy.getReg(0);
 }
 
+Register
+AArch64InstructionSelector::widenIfNeeded(Register Reg, unsigned WideSize,
+                                          MachineIRBuilder &MIB) const {
+  assert(WideSize >= 8 && "WideSize is smaller than all possible registers?");
+  MachineRegisterInfo &MRI = *MIB.getMRI();
+  unsigned NarrowSize = MRI.getType(Reg).getSizeInBits();
+  assert(WideSize >= NarrowSize &&
+         "WideSize cannot be smaller than NarrowSize!");
+
+  // If the sizes match, just return the register.
+  //
+  // If NarrowSize is an s1, then we can select it to any size, so we'll treat
+  // it as a don't care.
+  if (NarrowSize == WideSize || NarrowSize == 1)
+    return Reg;
+
+  // Now check the register classes.
+  const RegisterBank *RB = RBI.getRegBank(Reg, MRI, TRI);
+  const TargetRegisterClass *OrigRC = getMinClassForRegBank(*RB, NarrowSize);
+  const TargetRegisterClass *WideRC = getMinClassForRegBank(*RB, WideSize);
+  assert(OrigRC && "Could not determine narrow RC?");
+  assert(WideRC && "Could not determine wide RC?");
+
+  // If the sizes differ, but the register classes are the same, there is no
+  // need to insert a SUBREG_TO_REG.
+  //
+  // For example, an s8 that's supposed to be a GPR will be selected to either
+  // a GPR32 or a GPR64 register. Note that this assumes that the s8 will
+  // always end up on a GPR32.
+  if (OrigRC == WideRC)
+    return Reg;
+
+  // We have two different register classes. Insert a SUBREG_TO_REG.
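+  //
+  // SUBREG_TO_REG takes an immediate, the narrow source register, and the
+  // index of the subregister that the source occupies in the wide register
+  // class. The immediate (0 here) asserts that the bits not covered by the
+  // subregister are zero.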
+  unsigned SubReg = 0;
+  getSubRegForClass(OrigRC, TRI, SubReg);
+  assert(SubReg && "Couldn't determine subregister?");
+
+  // Build the SUBREG_TO_REG and return the new, widened register.
+  auto SubRegToReg =
+      MIB.buildInstr(AArch64::SUBREG_TO_REG, {WideRC}, {})
+          .addImm(0)
+          .addUse(Reg)
+          .addImm(SubReg);
+  constrainSelectedInstRegOperands(*SubRegToReg, TII, TRI, RBI);
+  return SubRegToReg->getOperand(0).getReg();
+}
+
 /// Select an "extended register" operand. This operand folds in an extend
 /// followed by an optional left shift.
 InstructionSelector::ComplexRendererFns
Index: llvm/test/CodeGen/AArch64/GlobalISel/opt-fold-ext-tbz-tbnz.mir
===================================================================
--- llvm/test/CodeGen/AArch64/GlobalISel/opt-fold-ext-tbz-tbnz.mir
+++ llvm/test/CodeGen/AArch64/GlobalISel/opt-fold-ext-tbz-tbnz.mir
@@ -78,9 +78,8 @@
     ; CHECK:   successors: %bb.0(0x40000000), %bb.1(0x40000000)
     ; CHECK:   liveins: $h0
    ; CHECK:   [[SUBREG_TO_REG:%[0-9]+]]:fpr32 = SUBREG_TO_REG 0, $h0, %subreg.hsub
-    ; CHECK:   %copy:gpr32all = COPY [[SUBREG_TO_REG]]
-    ; CHECK:   [[COPY:%[0-9]+]]:gpr32 = COPY %copy
-    ; CHECK:   TBNZW [[COPY]], 3, %bb.1
+    ; CHECK:   %copy:gpr32 = COPY [[SUBREG_TO_REG]]
+    ; CHECK:   TBNZW %copy, 3, %bb.1
     ; CHECK:   B %bb.0
     ; CHECK: bb.1:
     ; CHECK:   RET_ReallyLR
Index: llvm/test/CodeGen/AArch64/GlobalISel/widen-narrow-tbz-tbnz.mir
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/GlobalISel/widen-narrow-tbz-tbnz.mir
@@ -0,0 +1,193 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple aarch64-unknown-unknown -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s
+#
+# Test widening and narrowing on test bit operations using subregister copies
+# or SUBREG_TO_REG.
+--- |
+  @glob = external unnamed_addr global i1, align 4
+  define void @s1_no_copy() { ret void }
+  define void @s16_no_copy() { ret void }
+  define void @p0_no_copy() { ret void }
+  define void @widen_s32_to_s64() { ret void }
+  define void @widen_s16_to_s64() { ret void }
+  define void @narrow_s64_to_s32() { ret void }
+
+...
+---
+name:            s1_no_copy
+alignment:       4
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+body:             |
+  ; CHECK-LABEL: name: s1_no_copy
+  ; CHECK: bb.0:
+  ; CHECK:   successors: %bb.0(0x40000000), %bb.1(0x40000000)
+  ; CHECK:   %narrow:gpr32 = IMPLICIT_DEF
+  ; CHECK:   TBNZW %narrow, 0, %bb.1
+  ; CHECK:   B %bb.0
+  ; CHECK: bb.1:
+  ; CHECK:   RET_ReallyLR
+  bb.0:
+    successors: %bb.0, %bb.1
+    %narrow:gpr(s1) = G_IMPLICIT_DEF
+
+    ; There should be no copy here, because the s1 can be selected to a GPR32.
+    G_BRCOND %narrow(s1), %bb.1
+    G_BR %bb.0
+  bb.1:
+    RET_ReallyLR
+...
+---
+name:            s16_no_copy
+alignment:       4
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+body:             |
+  ; CHECK-LABEL: name: s16_no_copy
+  ; CHECK: bb.0:
+  ; CHECK:   successors: %bb.0(0x40000000), %bb.1(0x40000000)
+  ; CHECK:   %narrow:gpr32 = IMPLICIT_DEF
+  ; CHECK:   TBNZW %narrow, 0, %bb.1
+  ; CHECK:   B %bb.0
+  ; CHECK: bb.1:
+  ; CHECK:   RET_ReallyLR
+  bb.0:
+    successors: %bb.0, %bb.1
+    %narrow:gpr(s16) = G_IMPLICIT_DEF
+    %trunc:gpr(s1) = G_TRUNC %narrow(s16)
+
+    ; Look through the G_TRUNC to get the G_IMPLICIT_DEF. We don't need a
+    ; SUBREG_TO_REG here, because the s16 will end up on a 32-bit register.
+    G_BRCOND %trunc(s1), %bb.1
+    G_BR %bb.0
+  bb.1:
+    RET_ReallyLR
+...
+---
+name:            p0_no_copy
+alignment:       4
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+body:             |
+  ; CHECK-LABEL: name: p0_no_copy
+  ; CHECK: bb.0:
+  ; CHECK:   successors: %bb.0(0x40000000), %bb.1(0x40000000)
+  ; CHECK:   %glob:gpr64common = MOVaddr target-flags(aarch64-page) @glob, target-flags(aarch64-pageoff, aarch64-nc) @glob
+  ; CHECK:   %load:gpr32 = LDRBBui %glob, 0 :: (dereferenceable load 1 from @glob, align 4)
+  ; CHECK:   TBNZW %load, 0, %bb.1
+  ; CHECK:   B %bb.0
+  ; CHECK: bb.1:
+  ; CHECK:   RET_ReallyLR
+  bb.0:
+    successors: %bb.0, %bb.1
+    %glob:gpr(p0) = G_GLOBAL_VALUE @glob
+    %load:gpr(s8) = G_LOAD %glob(p0) :: (dereferenceable load 1 from @glob, align 4)
+    %trunc:gpr(s1) = G_TRUNC %load(s8)
+
+    ; Look through G_TRUNC to get the load. The load is into an s8, which will
+    ; be selected to a GPR32, so we don't need a copy.
+    G_BRCOND %trunc(s1), %bb.1
+    G_BR %bb.0
+  bb.1:
+    RET_ReallyLR
+...
+---
+name:            widen_s32_to_s64
+alignment:       4
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+body:             |
+  ; CHECK-LABEL: name: widen_s32_to_s64
+  ; CHECK: bb.0:
+  ; CHECK:   successors: %bb.0(0x40000000), %bb.1(0x40000000)
+  ; CHECK:   liveins: $w0
+  ; CHECK:   %reg:gpr32all = COPY $w0
+  ; CHECK:   [[SUBREG_TO_REG:%[0-9]+]]:gpr64 = SUBREG_TO_REG 0, %reg, %subreg.sub_32
+  ; CHECK:   TBZX [[SUBREG_TO_REG]], 33, %bb.1
+  ; CHECK:   B %bb.0
+  ; CHECK: bb.1:
+  ; CHECK:   RET_ReallyLR
+  bb.0:
+    successors: %bb.0, %bb.1
+    liveins: $w0
+    %reg:gpr(s32) = COPY $w0
+    %zext:gpr(s64) = G_ZEXT %reg(s32)
+    %bit:gpr(s64) = G_CONSTANT i64 8589934592
+    %zero:gpr(s64) = G_CONSTANT i64 0
+    %and:gpr(s64) = G_AND %zext, %bit
+    %cmp:gpr(s32) = G_ICMP intpred(eq), %and(s64), %zero
+
+    ; We should widen using a SUBREG_TO_REG here, because we need a TBZX to
+    ; get bit 33. The subregister should be sub_32.
+    %trunc:gpr(s1) = G_TRUNC %cmp(s32)
+    G_BRCOND %trunc(s1), %bb.1
+    G_BR %bb.0
+  bb.1:
+    RET_ReallyLR
+...
+---
+name:            widen_s16_to_s64
+alignment:       4
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+body:             |
+  ; CHECK-LABEL: name: widen_s16_to_s64
+  ; CHECK: bb.0:
+  ; CHECK:   successors: %bb.0(0x40000000), %bb.1(0x40000000)
+  ; CHECK:   %reg:gpr32 = IMPLICIT_DEF
+  ; CHECK:   [[SUBREG_TO_REG:%[0-9]+]]:gpr64 = SUBREG_TO_REG 0, %reg, %subreg.sub_32
+  ; CHECK:   TBZX [[SUBREG_TO_REG]], 33, %bb.1
+  ; CHECK:   B %bb.0
+  ; CHECK: bb.1:
+  ; CHECK:   RET_ReallyLR
+  bb.0:
+    successors: %bb.0, %bb.1
+    %reg:gpr(s16) = G_IMPLICIT_DEF
+    %zext:gpr(s64) = G_ZEXT %reg(s16)
+    %bit:gpr(s64) = G_CONSTANT i64 8589934592
+    %zero:gpr(s64) = G_CONSTANT i64 0
+    %and:gpr(s64) = G_AND %zext, %bit
+    %cmp:gpr(s32) = G_ICMP intpred(eq), %and(s64), %zero
+
+    ; We should widen using a SUBREG_TO_REG here, because we need a TBZX to
+    ; get bit 33. The subregister should be sub_32, because s16 will end up on
+    ; a GPR32.
+    %trunc:gpr(s1) = G_TRUNC %cmp(s32)
+    G_BRCOND %trunc(s1), %bb.1
+    G_BR %bb.0
+  bb.1:
+    RET_ReallyLR
+...
+---
+name:            narrow_s64_to_s32
+alignment:       4
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+body:             |
+  ; CHECK-LABEL: name: narrow_s64_to_s32
+  ; CHECK: bb.0:
+  ; CHECK:   successors: %bb.0(0x40000000), %bb.1(0x40000000)
+  ; CHECK:   liveins: $x0
+  ; CHECK:   %wide:gpr64 = COPY $x0
+  ; CHECK:   %trunc:gpr32 = COPY %wide.sub_32
+  ; CHECK:   TBNZW %trunc, 0, %bb.1
+  ; CHECK:   B %bb.0
+  ; CHECK: bb.1:
+  ; CHECK:   RET_ReallyLR
+  bb.0:
+    successors: %bb.0, %bb.1
+    liveins: $x0
+    %wide:gpr(s64) = COPY $x0
+
+    ; We should narrow using a subregister copy here.
+    %trunc:gpr(s1) = G_TRUNC %wide(s64)
+    G_BRCOND %trunc(s1), %bb.1
+    G_BR %bb.0
+  bb.1:
+    RET_ReallyLR
+...