diff --git a/llvm/lib/Target/BPF/BPFMIPeephole.cpp b/llvm/lib/Target/BPF/BPFMIPeephole.cpp
--- a/llvm/lib/Target/BPF/BPFMIPeephole.cpp
+++ b/llvm/lib/Target/BPF/BPFMIPeephole.cpp
@@ -51,6 +51,9 @@
   // Initialize class variables.
   void initialize(MachineFunction &MFParm);
 
+  bool isCopyFrom32Def(MachineInstr *CopyMI);
+  bool isInsnFrom32Def(MachineInstr *DefInsn);
+  bool isPhiFrom32Def(MachineInstr *PhiMI);
   bool isMovFrom32Def(MachineInstr *MovMI);
   bool eliminateZExtSeq(void);
 
@@ -75,42 +78,77 @@
   LLVM_DEBUG(dbgs() << "*** BPF MachineSSA ZEXT Elim peephole pass ***\n\n");
 }
 
-bool BPFMIPeephole::isMovFrom32Def(MachineInstr *MovMI)
+bool BPFMIPeephole::isCopyFrom32Def(MachineInstr *CopyMI)
 {
-  MachineInstr *DefInsn = MRI->getVRegDef(MovMI->getOperand(1).getReg());
+  MachineOperand &opnd = CopyMI->getOperand(1);
 
-  LLVM_DEBUG(dbgs() << "  Def of Mov Src:");
-  LLVM_DEBUG(DefInsn->dump());
+  if (!opnd.isReg())
+    return false;
 
-  if (!DefInsn)
+  // Return false if getting value from a 32bit physical register.
+  // Most likely, this physical register is aliased to a function call
+  // return value or the current function's parameters.
+  Register Reg = opnd.getReg();
+  if (!Register::isVirtualRegister(Reg))
     return false;
 
-  if (DefInsn->isPHI()) {
-    for (unsigned i = 1, e = DefInsn->getNumOperands(); i < e; i += 2) {
-      MachineOperand &opnd = DefInsn->getOperand(i);
+  if (MRI->getRegClass(Reg) == &BPF::GPRRegClass)
+    return false;
 
-      if (!opnd.isReg())
-        return false;
+  MachineInstr *DefInsn = MRI->getVRegDef(Reg);
+  if (!isInsnFrom32Def(DefInsn))
+    return false;
 
-      MachineInstr *PhiDef = MRI->getVRegDef(opnd.getReg());
-      // quick check on PHI incoming definitions.
-      if (!PhiDef || PhiDef->isPHI() || PhiDef->getOpcode() == BPF::COPY)
-        return false;
-    }
-  }
+  return true;
+}
 
-  if (DefInsn->getOpcode() == BPF::COPY) {
-    MachineOperand &opnd = DefInsn->getOperand(1);
+bool BPFMIPeephole::isPhiFrom32Def(MachineInstr *PhiMI)
+{
+  for (unsigned i = 1, e = PhiMI->getNumOperands(); i < e; i += 2) {
+    MachineOperand &opnd = PhiMI->getOperand(i);
 
     if (!opnd.isReg())
       return false;
 
-    Register Reg = opnd.getReg();
-    if ((Register::isVirtualRegister(Reg) &&
-         MRI->getRegClass(Reg) == &BPF::GPRRegClass))
+    MachineInstr *PhiDef = MRI->getVRegDef(opnd.getReg());
+    if (!PhiDef)
+      return false;
+    if (PhiDef->isPHI() && !isPhiFrom32Def(PhiDef))
+      return false;
+    if (PhiDef->getOpcode() == BPF::COPY && !isCopyFrom32Def(PhiDef))
+      return false;
+  }
+
+  return true;
+}
+
+// \p DefInsn defines a virtual register. Any non-PHI, non-COPY def passes.
+bool BPFMIPeephole::isInsnFrom32Def(MachineInstr *DefInsn)
+{
+  if (!DefInsn)
+    return false;
+
+  if (DefInsn->isPHI()) {
+    if (!isPhiFrom32Def(DefInsn))
+      return false;
+  } else if (DefInsn->getOpcode() == BPF::COPY) {
+    if (!isCopyFrom32Def(DefInsn))
       return false;
   }
 
+  return true;
+}
+
+bool BPFMIPeephole::isMovFrom32Def(MachineInstr *MovMI)
+{
+  MachineInstr *DefInsn = MRI->getVRegDef(MovMI->getOperand(1).getReg());
+
+  LLVM_DEBUG(dbgs() << "  Def of Mov Src:");
+  LLVM_DEBUG(if (DefInsn) DefInsn->dump(); else dbgs() << " <null>\n");
+
+  if (!isInsnFrom32Def(DefInsn))
+    return false;
+
   LLVM_DEBUG(dbgs() << "  One ZExt elim sequence identified.\n");
 
   return true;
diff --git a/llvm/test/CodeGen/BPF/32-bit-subreg-cond-select.ll b/llvm/test/CodeGen/BPF/32-bit-subreg-cond-select.ll
--- a/llvm/test/CodeGen/BPF/32-bit-subreg-cond-select.ll
+++ b/llvm/test/CodeGen/BPF/32-bit-subreg-cond-select.ll
@@ -55,6 +55,9 @@
   %c.d = select i1 %cmp, i32 %c, i32 %d
   ret i32 %c.d
 }
+; CHECK-LABEL: select_cc_32
+; CHECK: r{{[0-9]+}} <<= 32
+; CHECK-NEXT: r{{[0-9]+}} >>= 32
 
 ; Function Attrs: norecurse nounwind readnone
 define dso_local i64 @select_cc_32_64(i32 %a, i32 %b, i64 %c, i64 %d) local_unnamed_addr #0 {
@@ -63,6 +66,9 @@
   %c.d = select i1 %cmp, i64 %c, i64 %d
   ret i64 %c.d
 }
+; CHECK-LABEL: select_cc_32_64
+; CHECK: r{{[0-9]+}} <<= 32
+; CHECK-NEXT: r{{[0-9]+}} >>= 32
 
 ; Function Attrs: norecurse nounwind readnone
 define dso_local i32 @select_cc_64_32(i64 %a, i64 %b, i32 %c, i32 %d) local_unnamed_addr #0 {
@@ -71,6 +77,8 @@
   %c.d = select i1 %cmp, i32 %c, i32 %d
   ret i32 %c.d
 }
+; CHECK-LABEL: select_cc_64_32
+; CHECK-NOT: r{{[0-9]+}} <<= 32
 
 ; Function Attrs: norecurse nounwind readnone
 define dso_local i32 @selecti_cc_32(i32 %a, i32 %c, i32 %d) local_unnamed_addr #0 {
@@ -79,6 +87,9 @@
   %c.d = select i1 %cmp, i32 %c, i32 %d
   ret i32 %c.d
 }
+; CHECK-LABEL: selecti_cc_32
+; CHECK: r{{[0-9]+}} <<= 32
+; CHECK-NEXT: r{{[0-9]+}} >>= 32
 
 ; Function Attrs: norecurse nounwind readnone
 define dso_local i64 @selecti_cc_32_64(i32 %a, i64 %c, i64 %d) local_unnamed_addr #0 {
@@ -87,6 +98,9 @@
   %c.d = select i1 %cmp, i64 %c, i64 %d
   ret i64 %c.d
 }
+; CHECK-LABEL: selecti_cc_32_64
+; CHECK: r{{[0-9]+}} <<= 32
+; CHECK-NEXT: r{{[0-9]+}} >>= 32
 
 ; Function Attrs: norecurse nounwind readnone
 define dso_local i32 @selecti_cc_64_32(i64 %a, i32 %c, i32 %d) local_unnamed_addr #0 {
@@ -95,6 +109,5 @@
   %c.d = select i1 %cmp, i32 %c, i32 %d
   ret i32 %c.d
 }
-; There shouldn't be any type promotion, all of them are expected to be
-; eliminated by peephole optimization.
+; CHECK-LABEL: selecti_cc_64_32
 ; CHECK-NOT: r{{[0-9]+}} <<= 32
diff --git a/llvm/test/CodeGen/BPF/32-bit-subreg-peephole-phi-1.ll b/llvm/test/CodeGen/BPF/32-bit-subreg-peephole-phi-1.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/BPF/32-bit-subreg-peephole-phi-1.ll
@@ -0,0 +1,34 @@
+; RUN: llc -O2 -march=bpfel -mcpu=v2 -mattr=+alu32 < %s | FileCheck %s
+;
+; For the below test case, 'b' in 'ret == b' needs SLL/SRL.
+; 'ret' in 'ret == b' does not need SLL/SRL as all 'ret' values
+; are assigned through 'w<reg> = <value>' alu32 operations.
+;
+; extern int helper(int);
+; int test(int a, int b, int c, int d) {
+;   int ret;
+;   if (a < b)
+;     ret = (c < d) ? -1 : 0;
+;   else
+;     ret = (c < a) ? 1 : 2;
+;   return helper(ret == b);
+; }
+
+define dso_local i32 @test(i32 %a, i32 %b, i32 %c, i32 %d) local_unnamed_addr {
+entry:
+  %cmp = icmp slt i32 %a, %b
+  %cmp1 = icmp slt i32 %c, %d
+  %cond = sext i1 %cmp1 to i32
+  %cmp2 = icmp slt i32 %c, %a
+  %cond3 = select i1 %cmp2, i32 1, i32 2
+  %ret.0 = select i1 %cmp, i32 %cond, i32 %cond3
+  %cmp4 = icmp eq i32 %ret.0, %b
+  %conv = zext i1 %cmp4 to i32
+  %call = tail call i32 @helper(i32 %conv)
+  ret i32 %call
+}
+; CHECK: r{{[0-9]+}} >>= 32
+; CHECK-NOT: r{{[0-9]+}} >>= 32
+; CHECK: if r{{[0-9]+}} == r{{[0-9]+}} goto
+
+declare dso_local i32 @helper(i32) local_unnamed_addr
diff --git a/llvm/test/CodeGen/BPF/32-bit-subreg-peephole-phi-2.ll b/llvm/test/CodeGen/BPF/32-bit-subreg-peephole-phi-2.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/BPF/32-bit-subreg-peephole-phi-2.ll
@@ -0,0 +1,34 @@
+; RUN: llc -O2 -march=bpfel -mcpu=v2 -mattr=+alu32 < %s | FileCheck %s
+;
+; For the below test case, both 'ret' and 'b' at 'ret == b'
+; need SLL/SRL. For 'ret', 'ret = a' may receive a value
+; from an argument whose high 32 bits may hold invalid data.
+;
+; extern int helper(int);
+; int test(int a, int b, int c, int d) {
+;   int ret;
+;   if (a < b)
+;     ret = (c < d) ? a : 0;
+;   else
+;     ret = (c < a) ? 1 : 2;
+;   return helper(ret == b);
+; }
+
+define dso_local i32 @test(i32 %a, i32 %b, i32 %c, i32 %d) local_unnamed_addr {
+entry:
+  %cmp = icmp slt i32 %a, %b
+  %cmp1 = icmp slt i32 %c, %d
+  %cond = select i1 %cmp1, i32 %a, i32 0
+  %cmp2 = icmp slt i32 %c, %a
+  %cond3 = select i1 %cmp2, i32 1, i32 2
+  %ret.0 = select i1 %cmp, i32 %cond, i32 %cond3
+  %cmp4 = icmp eq i32 %ret.0, %b
+  %conv = zext i1 %cmp4 to i32
+  %call = tail call i32 @helper(i32 %conv)
+  ret i32 %call
+}
+; CHECK: r{{[0-9]+}} >>= 32
+; CHECK: r{{[0-9]+}} >>= 32
+; CHECK: if r{{[0-9]+}} == r{{[0-9]+}} goto
+
+declare dso_local i32 @helper(i32) local_unnamed_addr
diff --git a/llvm/test/CodeGen/BPF/32-bit-subreg-peephole.ll b/llvm/test/CodeGen/BPF/32-bit-subreg-peephole.ll
--- a/llvm/test/CodeGen/BPF/32-bit-subreg-peephole.ll
+++ b/llvm/test/CodeGen/BPF/32-bit-subreg-peephole.ll
@@ -47,8 +47,8 @@
 entry:
   %cmp = icmp ugt i32 %a, %b
   %c.d = select i1 %cmp, i64 %c, i64 %d
-; CHECK-NOT: r{{[0-9]+}} <<= 32
-; CHECK-NOT: r{{[0-9]+}} >>= 32
+; CHECK: r{{[0-9]+}} <<= 32
+; CHECK-NEXT: r{{[0-9]+}} >>= 32
 ; CHECK: if r{{[0-9]+}} {{<|>}} r{{[0-9]+}} goto
   ret i64 %c.d
 }
@@ -58,8 +58,8 @@
 ; CHECK-LABEL: select_u_2:
 entry:
   %conv = zext i32 %a to i64
-; CHECK-NOT: r{{[0-9]+}} <<= 32
-; CHECK-NOT: r{{[0-9]+}} >>= 32
+; CHECK: r{{[0-9]+}} <<= 32
+; CHECK-NEXT: r{{[0-9]+}} >>= 32
   %cmp = icmp ugt i64 %conv, %b
   %c.d = select i1 %cmp, i64 %c, i64 %d
   ret i64 %c.d
@@ -100,8 +100,23 @@
 ; CHECK-LABEL: inc_p:
 entry:
   %idx.ext = zext i32 %a to i64
-; CHECK-NOT: r{{[0-9]+}} <<= 32
-; CHECK-NOT: r{{[0-9]+}} >>= 32
+; CHECK: r{{[0-9]+}} <<= 32
+; CHECK-NEXT: r{{[0-9]+}} >>= 32
   %add.ptr = getelementptr inbounds i32, i32* %p, i64 %idx.ext
   ret i32* %add.ptr
 }
+
+define dso_local i32 @test() local_unnamed_addr {
+; CHECK-LABEL: test:
+entry:
+  %call = tail call i32 bitcast (i32 (...)* @helper to i32 ()*)()
+  %cmp = icmp sgt i32 %call, 6
+; The shifts can't be optimized out because %call comes from a function call
+; returning i32, so the high bits might be invalid.
+; CHECK: r{{[0-9]+}} <<= 32
+; CHECK-NEXT: r{{[0-9]+}} s>>= 32
+  %cond = zext i1 %cmp to i32
+; CHECK: if r{{[0-9]+}} s{{<|>}} {{[0-9]+}} goto
+  ret i32 %cond
+}
+declare dso_local i32 @helper(...) local_unnamed_addr