Index: llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
+++ llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
@@ -46,6 +46,21 @@
 //    source, we use the INSvi[X]lane to replace the COPY & INSvi[X]gpr
 //    instructions.
 //
+// 7. If an instruction implicitly zeroes the high 64 bits of its result,
+//    remove the redundant `mov 0` of the high 64 bits. For example,
+//
+//   %1:fpr64 = nofpexcept FCVTNv4i16 %0:fpr128, implicit $fpcr
+//   %2:fpr64 = MOVID 0
+//   %4:fpr128 = IMPLICIT_DEF
+//   %3:fpr128 = INSERT_SUBREG %4:fpr128(tied-def 0), killed %2:fpr64, %subreg.dsub
+//   %6:fpr128 = IMPLICIT_DEF
+//   %5:fpr128 = INSERT_SUBREG %6:fpr128(tied-def 0), killed %1:fpr64, %subreg.dsub
+//   %7:fpr128 = INSvi64lane %5:fpr128(tied-def 0), 1, killed %3:fpr128, 0
+//   ==>
+//   %1:fpr64 = nofpexcept FCVTNv4i16 %0:fpr128, implicit $fpcr
+//   %6:fpr128 = IMPLICIT_DEF
+//   %7:fpr128 = INSERT_SUBREG %6:fpr128(tied-def 0), killed %1:fpr64, %subreg.dsub
+//
 //===----------------------------------------------------------------------===//
 
 #include "AArch64ExpandImm.h"
@@ -111,6 +126,7 @@
   bool visitORR(MachineInstr &MI);
   bool visitINSERT(MachineInstr &MI);
   bool visitINSviGPR(MachineInstr &MI, unsigned Opc);
+  bool visitINSvi64lane(MachineInstr &MI);
   bool runOnMachineFunction(MachineFunction &MF) override;
 
   StringRef getPassName() const override {
@@ -592,6 +608,73 @@
   return true;
 }
 
+static bool is64bitDefwithZeroHigh64bit(MachineInstr *MI) {
+  // TODO: Check for and add more opcodes that implicitly zero the high
+  // 64 bits of their 128-bit destination register.
+  switch (MI->getOpcode()) {
+  default:
+    return false;
+  case AArch64::FCVTNv4i16:
+  case AArch64::SHRNv8i8_shift:
+    return true;
+  }
+}
+
+bool AArch64MIPeepholeOpt::visitINSvi64lane(MachineInstr &MI) {
+  // Check that the instruction defining the low 64 bits implicitly zeroes
+  // the high 64 bits. We are expecting the case below.
+  //
+  //  %1:fpr64 = nofpexcept FCVTNv4i16 %0:fpr128, implicit $fpcr
+  //  %6:fpr128 = IMPLICIT_DEF
+  //  %5:fpr128 = INSERT_SUBREG %6:fpr128(tied-def 0), killed %1:fpr64, %subreg.dsub
+  //  %7:fpr128 = INSvi64lane %5:fpr128(tied-def 0), 1, killed %3:fpr128, 0
+  MachineInstr *Low64MI = MRI->getUniqueVRegDef(MI.getOperand(1).getReg());
+  if (!Low64MI || Low64MI->getOpcode() != AArch64::INSERT_SUBREG)
+    return false;
+  Low64MI = MRI->getUniqueVRegDef(Low64MI->getOperand(2).getReg());
+  if (!Low64MI || !is64bitDefwithZeroHigh64bit(Low64MI))
+    return false;
+
+  // The fold is only valid when the INSvi64lane writes the high 64 bits
+  // (lane 1); inserting into lane 0 would clobber the low half.
+  if (MI.getOperand(2).getImm() != 1)
+    return false;
+
+  // Check that there is a `mov 0` for the high 64 bits.
+  // We are expecting the cases below.
+  //
+  //  %2:fpr64 = MOVID 0
+  //  %4:fpr128 = IMPLICIT_DEF
+  //  %3:fpr128 = INSERT_SUBREG %4:fpr128(tied-def 0), killed %2:fpr64, %subreg.dsub
+  //  %7:fpr128 = INSvi64lane %5:fpr128(tied-def 0), 1, killed %3:fpr128, 0
+  // or
+  //  %5:fpr128 = MOVIv2d_ns 0
+  //  %6:fpr64 = COPY %5.dsub:fpr128
+  //  %8:fpr128 = IMPLICIT_DEF
+  //  %7:fpr128 = INSERT_SUBREG %8:fpr128(tied-def 0), killed %6:fpr64, %subreg.dsub
+  //  %11:fpr128 = INSvi64lane %9:fpr128(tied-def 0), 1, killed %7:fpr128, 0
+  MachineInstr *High64MI = MRI->getUniqueVRegDef(MI.getOperand(3).getReg());
+  if (!High64MI || High64MI->getOpcode() != AArch64::INSERT_SUBREG)
+    return false;
+  High64MI = MRI->getUniqueVRegDef(High64MI->getOperand(2).getReg());
+  if (High64MI && High64MI->getOpcode() == TargetOpcode::COPY)
+    High64MI = MRI->getUniqueVRegDef(High64MI->getOperand(1).getReg());
+  if (!High64MI || (High64MI->getOpcode() != AArch64::MOVID &&
+                    High64MI->getOpcode() != AArch64::MOVIv2d_ns))
+    return false;
+  if (High64MI->getOperand(1).getImm() != 0)
+    return false;
+
+  // The high 64 bits are already zero, so the insert is redundant; forward
+  // the low-half value and let the `mov 0` chain become dead.
+  Register OldDef = MI.getOperand(0).getReg();
+  Register NewDef = MI.getOperand(1).getReg();
+  MRI->replaceRegWith(OldDef, NewDef);
+  MI.eraseFromParent();
+
+  return true;
+}
+
 bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) {
   if (skipFunction(MF.getFunction()))
     return false;
@@ -667,6 +750,9 @@
       case AArch64::INSvi8gpr:
        Changed = visitINSviGPR(MI, AArch64::INSvi8lane);
        break;
+      case AArch64::INSvi64lane:
+        Changed = visitINSvi64lane(MI);
+        break;
       }
     }
   }
Index: llvm/test/CodeGen/AArch64/implicitly-set-zero-high-64-bits.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/implicitly-set-zero-high-64-bits.ll
@@ -0,0 +1,34 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-linux-gnu | FileCheck %s
+
+declare <4 x i16> @llvm.aarch64.neon.vcvtfp2hf(<4 x float>)
+declare <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8>, <8 x i8>)
+
+define nofpclass(nan inf) <8 x half> @test1(<4 x float> noundef nofpclass(nan inf) %a) {
+; CHECK-LABEL: test1:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcvtn v0.4h, v0.4s
+; CHECK-NEXT:    ret
+entry:
+  %vcvt_f16_f321.i = tail call <4 x i16> @llvm.aarch64.neon.vcvtfp2hf(<4 x float> %a)
+  %0 = bitcast <4 x i16> %vcvt_f16_f321.i to <4 x half>
+  %shuffle.i = shufflevector <4 x half> %0, <4 x half> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x half> %shuffle.i
+}
+
+define <8 x i8> @test2(ptr nocapture noundef readonly %in, ptr nocapture noundef readnone %dst, <8 x i8> noundef %idx) {
+; CHECK-LABEL: test2:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ldr q1, [x0]
+; CHECK-NEXT:    shrn v1.8b, v1.8h, #4
+; CHECK-NEXT:    tbl v0.8b, { v1.16b }, v0.8b
+; CHECK-NEXT:    ret
+entry:
+  %0 = load <8 x i16>, ptr %in, align 2
+  %1 = lshr <8 x i16> %0, <i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4>
+  %vshrn_n = trunc <8 x i16> %1 to <8 x i8>
+  %vtbl1.i = shufflevector <8 x i8> %vshrn_n, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vtbl11.i = tail call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> %vtbl1.i, <8 x i8> %idx)
+  ret <8 x i8> %vtbl11.i
+}
+
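A note on the expected codegen (illustrative only, not part of the patch; register
numbers depend on allocation): FCVTN and SHRN writing a D register zero the upper
64 bits of the corresponding Q register at the ISA level, which is what makes the
explicit `mov 0` of the high half redundant. For test1 the fold changes the
emitted assembly roughly as follows:

  // before
  fcvtn v0.4h, v0.4s
  movi  d1, #0000000000000000
  mov   v0.d[1], v1.d[0]
  ret
  // after
  fcvtn v0.4h, v0.4s
  ret

The peephole can also be exercised in isolation on MIR input with
`llc -mtriple=aarch64-linux-gnu -run-pass=aarch64-mi-peephole-opt`.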