diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -825,9 +825,17 @@
     Reg = N.getOperand(0);
 
-    // Don't match if free 32-bit -> 64-bit zext can be used instead.
-    if (Ext == AArch64_AM::UXTW &&
-        Reg->getValueType(0).getSizeInBits() == 32 && isDef32(*Reg.getNode()))
+    // Don't match if free 32-bit -> 64-bit zext can be used instead. Use
+    // isDef32 as a heuristic for when the operand is likely a 32-bit def.
+    auto isDef32 = [](SDValue N) {
+      unsigned Opc = N.getOpcode();
+      return Opc != ISD::TRUNCATE && Opc != TargetOpcode::EXTRACT_SUBREG &&
+             Opc != ISD::CopyFromReg && Opc != ISD::AssertSext &&
+             Opc != ISD::AssertZext && Opc != ISD::AssertAlign &&
+             Opc != ISD::FREEZE;
+    };
+    if (Ext == AArch64_AM::UXTW && Reg->getValueType(0).getSizeInBits() == 32 &&
+        isDef32(Reg))
       return false;
   }
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -457,23 +457,6 @@
 } // end namespace AArch64ISD
 
-namespace {
-
-// Any instruction that defines a 32-bit result zeros out the high half of the
-// register. Truncate can be lowered to EXTRACT_SUBREG. CopyFromReg may
-// be copying from a truncate. But any other 32-bit operation will zero-extend
-// up to 64 bits. AssertSext/AssertZext aren't saying anything about the upper
-// 32 bits, they're probably just qualifying a CopyFromReg.
-static inline bool isDef32(const SDNode &N) {
-  unsigned Opc = N.getOpcode();
-  return Opc != ISD::TRUNCATE && Opc != TargetOpcode::EXTRACT_SUBREG &&
-         Opc != ISD::CopyFromReg && Opc != ISD::AssertSext &&
-         Opc != ISD::AssertZext && Opc != ISD::AssertAlign &&
-         Opc != ISD::FREEZE;
-}
-
-} // end anonymous namespace
-
 namespace AArch64 {
 /// Possible values of current rounding mode, which is specified in bits
 /// 23:22 of FPCR.
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -7257,14 +7257,6 @@
 //----------------------------------------------------------------------------
 // FIXME: Like for X86, these should go in their own separate .td file.
 
-def def32 : PatLeaf<(i32 GPR32:$src), [{
-  return isDef32(*N);
-}]>;
-
-// In the case of a 32-bit def that is known to implicitly zero-extend,
-// we can use a SUBREG_TO_REG.
-def : Pat<(i64 (zext def32:$src)), (SUBREG_TO_REG (i64 0), GPR32:$src, sub_32)>;
-
 // For an anyext, we don't care what the high bits are, so we can perform an
 // INSERT_SUBREF into an IMPLICIT_DEF.
 def : Pat<(i64 (anyext GPR32:$src)),
diff --git a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
--- a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
+++ b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
@@ -221,7 +221,29 @@
   // zero-extend, we do not need the zero-extend. Let's check the MI's opcode is
   // real AArch64 instruction and if it is not, do not process the opcode
   // conservatively.
-  if (SrcMI->getOpcode() <= TargetOpcode::GENERIC_OP_END)
+  if (SrcMI->getOpcode() == TargetOpcode::COPY &&
+      SrcMI->getOperand(1).getReg().isVirtual()) {
+    const TargetRegisterClass *RC =
+        MRI->getRegClass(SrcMI->getOperand(1).getReg());
+
+    // A COPY from an FPR will become an FMOVSWr, so convert it now so that we
+    // know the upper bits are zero.
+    if (RC != &AArch64::FPR32RegClass &&
+        ((RC != &AArch64::FPR64RegClass && RC != &AArch64::FPR128RegClass) ||
+         SrcMI->getOperand(1).getSubReg() != AArch64::ssub))
+      return false;
+    Register CpySrc = SrcMI->getOperand(1).getReg();
+    if (SrcMI->getOperand(1).getSubReg() == AArch64::ssub) {
+      CpySrc = MRI->createVirtualRegister(&AArch64::FPR32RegClass);
+      BuildMI(*SrcMI->getParent(), SrcMI, SrcMI->getDebugLoc(),
+              TII->get(TargetOpcode::COPY), CpySrc)
+          .add(SrcMI->getOperand(1));
+    }
+    BuildMI(*SrcMI->getParent(), SrcMI, SrcMI->getDebugLoc(),
+            TII->get(AArch64::FMOVSWr), SrcMI->getOperand(0).getReg())
+        .addReg(CpySrc);
+    ToBeRemoved.insert(SrcMI);
+  } else if (SrcMI->getOpcode() <= TargetOpcode::GENERIC_OP_END)
     return false;
 
   Register DefReg = MI.getOperand(0).getReg();
diff --git a/llvm/test/CodeGen/AArch64/arm64-popcnt.ll b/llvm/test/CodeGen/AArch64/arm64-popcnt.ll
--- a/llvm/test/CodeGen/AArch64/arm64-popcnt.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-popcnt.ll
@@ -35,8 +35,8 @@
 ; CHECK-LABEL: cnt32_advsimd_2:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    fmov d0, x0
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    fmov d0, x8
 ; CHECK-NEXT:    cnt.8b v0, v0
 ; CHECK-NEXT:    uaddlv.8b h0, v0
 ; CHECK-NEXT:    fmov w0, s0
diff --git a/llvm/test/CodeGen/AArch64/arm64-shifted-sext.ll b/llvm/test/CodeGen/AArch64/arm64-shifted-sext.ll
--- a/llvm/test/CodeGen/AArch64/arm64-shifted-sext.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-shifted-sext.ll
@@ -328,7 +328,8 @@
 define i64 @sign_extend_inreg_isdef32(i64) {
 ; CHECK-LABEL: sign_extend_inreg_isdef32:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    sbfx x0, x0, #32, #16
+; CHECK-NEXT:    sbfx x8, x0, #32, #16
+; CHECK-NEXT:    mov w0, w8
 ; CHECK-NEXT:    ret
   %2 = lshr i64 %0, 32
   %3 = shl i64 %2, 16
diff --git a/llvm/test/CodeGen/AArch64/dp1.ll b/llvm/test/CodeGen/AArch64/dp1.ll
--- a/llvm/test/CodeGen/AArch64/dp1.ll
+++ b/llvm/test/CodeGen/AArch64/dp1.ll
@@ -246,7 +246,6 @@
 ; CHECK-GISEL-NEXT:    cnt v0.8b, v0.8b
 ; CHECK-GISEL-NEXT:    uaddlv h0, v0.8b
 ; CHECK-GISEL-NEXT:    fmov w9, s0
-; CHECK-GISEL-NEXT:    mov w9, w9
 ; CHECK-GISEL-NEXT:    str x9, [x8]
 ; CHECK-GISEL-NEXT:    ret
   %val0_tmp = load i64, i64* @var64
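
For illustration only (not part of the patch): a minimal IR sketch, written in the style of the tests above, of the pattern the new COPY handling in AArch64MIPeepholeOpt targets. The function name and CHECK lines below are hypothetical. Assuming the peephole fires, the bitcast's COPY from s0 is rewritten to an fmov (FMOVSWr), whose 32-bit result is known to zero bits 63:32, so the zext should need no separate zero-extending mov:

define i64 @zext_bitcast(float %f) {
; CHECK-LABEL: zext_bitcast:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %b = bitcast float %f to i32
  %z = zext i32 %b to i64
  ret i64 %z
}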