diff --git a/llvm/lib/Target/BPF/BPF.h b/llvm/lib/Target/BPF/BPF.h
--- a/llvm/lib/Target/BPF/BPF.h
+++ b/llvm/lib/Target/BPF/BPF.h
@@ -23,14 +23,12 @@
 FunctionPass *createBPFISelDag(BPFTargetMachine &TM);
 FunctionPass *createBPFMISimplifyPatchablePass();
 FunctionPass *createBPFMIPeepholePass();
-FunctionPass *createBPFMIPeepholeTruncElimPass();
 FunctionPass *createBPFMIPreEmitPeepholePass();
 FunctionPass *createBPFMIPreEmitCheckingPass();
 
 void initializeBPFCheckAndAdjustIRPass(PassRegistry&);
 void initializeBPFDAGToDAGISelPass(PassRegistry &);
-void initializeBPFMIPeepholePass(PassRegistry&);
-void initializeBPFMIPeepholeTruncElimPass(PassRegistry &);
+void initializeBPFMIPeepholePass(PassRegistry &);
 void initializeBPFMIPreEmitCheckingPass(PassRegistry&);
 void initializeBPFMIPreEmitPeepholePass(PassRegistry &);
 void initializeBPFMISimplifyPatchablePass(PassRegistry &);
diff --git a/llvm/lib/Target/BPF/BPFISelLowering.h b/llvm/lib/Target/BPF/BPFISelLowering.h
--- a/llvm/lib/Target/BPF/BPFISelLowering.h
+++ b/llvm/lib/Target/BPF/BPFISelLowering.h
@@ -144,6 +144,7 @@
   // For 32bit ALU result zext to 64bit is free.
   bool isZExtFree(Type *Ty1, Type *Ty2) const override;
   bool isZExtFree(EVT VT1, EVT VT2) const override;
+  bool isZExtFree(SDValue Val, EVT VT2) const override;
 
   unsigned EmitSubregExt(MachineInstr &MI, MachineBasicBlock *BB, unsigned Reg,
                          bool isSigned) const;
diff --git a/llvm/lib/Target/BPF/BPFISelLowering.cpp b/llvm/lib/Target/BPF/BPFISelLowering.cpp
--- a/llvm/lib/Target/BPF/BPFISelLowering.cpp
+++ b/llvm/lib/Target/BPF/BPFISelLowering.cpp
@@ -224,6 +224,18 @@
   return NumBits1 == 32 && NumBits2 == 64;
 }
 
+// BPF loads zero-extend, so widening a loaded i8/i16/i32 to i32/i64 is free.
+bool BPFTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
+  const EVT SrcVT = Val.getValueType();
+  if (Val.getOpcode() != ISD::LOAD || !SrcVT.isSimple() || !VT2.isSimple())
+    return TargetLoweringBase::isZExtFree(Val, VT2);
+  const MVT Src = SrcVT.getSimpleVT();
+  const MVT Dst = VT2.getSimpleVT();
+  const bool FromLoadTy = Src == MVT::i8 || Src == MVT::i16 || Src == MVT::i32;
+  return (FromLoadTy && (Dst == MVT::i32 || Dst == MVT::i64)) ||
+         TargetLoweringBase::isZExtFree(Val, VT2);
+}
+
 BPFTargetLowering::ConstraintType
 BPFTargetLowering::getConstraintType(StringRef Constraint) const {
   if (Constraint.size() == 1) {
diff --git a/llvm/lib/Target/BPF/BPFMIPeephole.cpp b/llvm/lib/Target/BPF/BPFMIPeephole.cpp
--- a/llvm/lib/Target/BPF/BPFMIPeephole.cpp
+++ b/llvm/lib/Target/BPF/BPFMIPeephole.cpp
@@ -606,180 +606,3 @@
 {
   return new BPFMIPreEmitPeephole();
 }
-
-STATISTIC(TruncElemNum, "Number of truncation eliminated");
-
-namespace {
-
-struct BPFMIPeepholeTruncElim : public MachineFunctionPass {
-
-  static char ID;
-  const BPFInstrInfo *TII;
-  MachineFunction *MF;
-  MachineRegisterInfo *MRI;
-
-  BPFMIPeepholeTruncElim() : MachineFunctionPass(ID) {
-    initializeBPFMIPeepholeTruncElimPass(*PassRegistry::getPassRegistry());
-  }
-
-private:
-  // Initialize class variables.
-  void initialize(MachineFunction &MFParm);
-
-  bool eliminateTruncSeq();
-
-public:
-
-  // Main entry point for this pass.
-  bool runOnMachineFunction(MachineFunction &MF) override {
-    if (skipFunction(MF.getFunction()))
-      return false;
-
-    initialize(MF);
-
-    return eliminateTruncSeq();
-  }
-};
-
-static bool TruncSizeCompatible(int TruncSize, unsigned opcode)
-{
-  if (TruncSize == 1)
-    return opcode == BPF::LDB || opcode == BPF::LDB32;
-
-  if (TruncSize == 2)
-    return opcode == BPF::LDH || opcode == BPF::LDH32;
-
-  if (TruncSize == 4)
-    return opcode == BPF::LDW || opcode == BPF::LDW32;
-
-  return false;
-}
-
-// Initialize class variables.
-void BPFMIPeepholeTruncElim::initialize(MachineFunction &MFParm) {
-  MF = &MFParm;
-  MRI = &MF->getRegInfo();
-  TII = MF->getSubtarget<BPFSubtarget>().getInstrInfo();
-  LLVM_DEBUG(dbgs() << "*** BPF MachineSSA TRUNC Elim peephole pass ***\n\n");
-}
-
-// Reg truncating is often the result of 8/16/32bit->64bit or
-// 8/16bit->32bit conversion. If the reg value is loaded with
-// masked byte width, the AND operation can be removed since
-// BPF LOAD already has zero extension.
-//
-// This also solved a correctness issue.
-// In BPF socket-related program, e.g., __sk_buff->{data, data_end}
-// are 32-bit registers, but later on, kernel verifier will rewrite
-// it with 64-bit value. Therefore, truncating the value after the
-// load will result in incorrect code.
-bool BPFMIPeepholeTruncElim::eliminateTruncSeq() {
-  MachineInstr* ToErase = nullptr;
-  bool Eliminated = false;
-
-  for (MachineBasicBlock &MBB : *MF) {
-    for (MachineInstr &MI : MBB) {
-      // The second insn to remove if the eliminate candidate is a pair.
-      MachineInstr *MI2 = nullptr;
-      Register DstReg, SrcReg;
-      MachineInstr *DefMI;
-      int TruncSize = -1;
-
-      // If the previous instruction was marked for elimination, remove it now.
-      if (ToErase) {
-        ToErase->eraseFromParent();
-        ToErase = nullptr;
-      }
-
-      // AND A, 0xFFFFFFFF will be turned into SLL/SRL pair due to immediate
-      // for BPF ANDI is i32, and this case only happens on ALU64.
-      if (MI.getOpcode() == BPF::SRL_ri &&
-          MI.getOperand(2).getImm() == 32) {
-        SrcReg = MI.getOperand(1).getReg();
-        if (!MRI->hasOneNonDBGUse(SrcReg))
-          continue;
-
-        MI2 = MRI->getVRegDef(SrcReg);
-        DstReg = MI.getOperand(0).getReg();
-
-        if (!MI2 ||
-            MI2->getOpcode() != BPF::SLL_ri ||
-            MI2->getOperand(2).getImm() != 32)
-          continue;
-
-        // Update SrcReg.
-        SrcReg = MI2->getOperand(1).getReg();
-        DefMI = MRI->getVRegDef(SrcReg);
-        if (DefMI)
-          TruncSize = 4;
-      } else if (MI.getOpcode() == BPF::AND_ri ||
-                 MI.getOpcode() == BPF::AND_ri_32) {
-        SrcReg = MI.getOperand(1).getReg();
-        DstReg = MI.getOperand(0).getReg();
-        DefMI = MRI->getVRegDef(SrcReg);
-
-        if (!DefMI)
-          continue;
-
-        int64_t imm = MI.getOperand(2).getImm();
-        if (imm == 0xff)
-          TruncSize = 1;
-        else if (imm == 0xffff)
-          TruncSize = 2;
-      }
-
-      if (TruncSize == -1)
-        continue;
-
-      // The definition is PHI node, check all inputs.
-      if (DefMI->isPHI()) {
-        bool CheckFail = false;
-
-        for (unsigned i = 1, e = DefMI->getNumOperands(); i < e; i += 2) {
-          MachineOperand &opnd = DefMI->getOperand(i);
-          if (!opnd.isReg()) {
-            CheckFail = true;
-            break;
-          }
-
-          MachineInstr *PhiDef = MRI->getVRegDef(opnd.getReg());
-          if (!PhiDef || PhiDef->isPHI() ||
-              !TruncSizeCompatible(TruncSize, PhiDef->getOpcode())) {
-            CheckFail = true;
-            break;
-          }
-        }
-
-        if (CheckFail)
-          continue;
-      } else if (!TruncSizeCompatible(TruncSize, DefMI->getOpcode())) {
-        continue;
-      }
-
-      BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(BPF::MOV_rr), DstReg)
-              .addReg(SrcReg);
-
-      if (MI2)
-        MI2->eraseFromParent();
-
-      // Mark it to ToErase, and erase in the next iteration.
-      ToErase = &MI;
-      TruncElemNum++;
-      Eliminated = true;
-    }
-  }
-
-  return Eliminated;
-}
-
-} // end default namespace
-
-INITIALIZE_PASS(BPFMIPeepholeTruncElim, "bpf-mi-trunc-elim",
-                "BPF MachineSSA Peephole Optimization For TRUNC Eliminate",
-                false, false)
-
-char BPFMIPeepholeTruncElim::ID = 0;
-FunctionPass* llvm::createBPFMIPeepholeTruncElimPass()
-{
-  return new BPFMIPeepholeTruncElim();
-}
diff --git a/llvm/lib/Target/BPF/BPFTargetMachine.cpp b/llvm/lib/Target/BPF/BPFTargetMachine.cpp
--- a/llvm/lib/Target/BPF/BPFTargetMachine.cpp
+++ b/llvm/lib/Target/BPF/BPFTargetMachine.cpp
@@ -42,7 +42,6 @@
   PassRegistry &PR = *PassRegistry::getPassRegistry();
   initializeBPFCheckAndAdjustIRPass(PR);
   initializeBPFMIPeepholePass(PR);
-  initializeBPFMIPeepholeTruncElimPass(PR);
   initializeBPFDAGToDAGISelPass(PR);
 }
 
@@ -155,7 +154,6 @@
   if (!DisableMIPeephole) {
     if (Subtarget->getHasAlu32())
       addPass(createBPFMIPeepholePass());
-    addPass(createBPFMIPeepholeTruncElimPass());
   }
 }
 
diff --git a/llvm/test/CodeGen/BPF/remove_truncate_9.ll b/llvm/test/CodeGen/BPF/remove_truncate_9.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/BPF/remove_truncate_9.ll
@@ -0,0 +1,81 @@
+; RUN: llc -mcpu=v2 -march=bpf < %s | FileCheck %s
+; RUN: llc -mcpu=v4 -march=bpf < %s | FileCheck %s
+
+; Zero extension instructions should be eliminated at instruction
+; selection phase for all test cases below.
+
+; In BPF zero extension is implemented as &= or a pair of <<=/>>=
+; instructions, hence simply check that &= and >>= do not exist in
+; generated code (<<= remains because %c is used by both call and
+; lshr in a few test cases).
+
+; CHECK-NOT: &=
+; CHECK-NOT: >>=
+
+define void @shl_lshr_same_bb(ptr %p) {
+entry:
+  %a = load i8, ptr %p, align 1
+  %b = zext i8 %a to i64
+  %c = shl i64 %b, 56
+  %d = lshr i64 %c, 56
+  %e = icmp eq i64 %d, 0
+  ; hasOneUse() is a common requirement for many DAGCombiner
+  ; transformations, make sure that it does not matter in this case.
+  call void @sink1(i8 %a, i64 %b, i64 %c, i64 %d, i1 %e)
+  ret void
+}
+
+define void @shl_lshr_diff_bb(ptr %p) {
+entry:
+  %a = load i16, ptr %p, align 2
+  %b = zext i16 %a to i64
+  %c = shl i64 %b, 48
+  %d = lshr i64 %c, 48
+  br label %next
+
+; The jump to a new basic block creates a COPY instruction for %d, which
+; might be materialized as a noop or as AND_ri (zero extension) at the
+; start of the basic block. The decision depends on TLI.isZExtFree()
+; results, see RegsForValue::getCopyToRegs(). The CHECK-NOT lines at the
+; top of the file verify that the COPY is materialized as a noop.
+next:
+  %e = icmp eq i64 %d, 0
+  call void @sink2(i16 %a, i64 %b, i64 %c, i64 %d, i1 %e)
+  ret void
+}
+
+define void @load_zext_same_bb(ptr %p) {
+entry:
+  %a = load i8, ptr %p, align 1
+  ; zext is implicit in this context
+  %b = icmp eq i8 %a, 0
+  call void @sink3(i8 %a, i1 %b)
+  ret void
+}
+
+define void @load_zext_diff_bb(ptr %p) {
+entry:
+  %a = load i8, ptr %p, align 1
+  br label %next
+
+next:
+  %b = icmp eq i8 %a, 0
+  call void @sink3(i8 %a, i1 %b)
+  ret void
+}
+
+define void @load_zext_diff_bb_2(ptr %p) {
+entry:
+  %a = load i32, ptr %p, align 4
+  br label %next
+
+next:
+  %b = icmp eq i32 %a, 0
+  call void @sink4(i32 %a, i1 %b)
+  ret void
+}
+
+declare void @sink1(i8, i64, i64, i64, i1);
+declare void @sink2(i16, i64, i64, i64, i1);
+declare void @sink3(i8, i1);
+declare void @sink4(i32, i1);