diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -312,6 +312,19 @@
   return false;
 }
 
+static void appendFoldCandidate(SmallVectorImpl<FoldCandidate> &FoldList,
+                                MachineInstr *MI, unsigned OpNo,
+                                MachineOperand *FoldOp, bool Commuted = false,
+                                int ShrinkOp = -1) {
+  // Skip additional folding on the same operand.
+  for (FoldCandidate &Fold : FoldList)
+    if (Fold.UseMI == MI && Fold.UseOpNo == OpNo)
+      return;
+  LLVM_DEBUG(dbgs() << "Append " << (Commuted ? "commuted" : "normal")
+                    << " operand " << OpNo << "\n " << *MI << '\n');
+  FoldList.push_back(FoldCandidate(MI, OpNo, FoldOp, Commuted, ShrinkOp));
+}
+
 static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
                              MachineInstr *MI, unsigned OpNo,
                              MachineOperand *OpToFold,
@@ -344,7 +357,7 @@
     // Special case for s_setreg_b32
     if (Opc == AMDGPU::S_SETREG_B32 && OpToFold->isImm()) {
       MI->setDesc(TII->get(AMDGPU::S_SETREG_IMM32_B32));
-      FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold));
+      appendFoldCandidate(FoldList, MI, OpNo, OpToFold);
       return true;
     }
 
@@ -403,8 +416,7 @@
         unsigned MaybeCommutedOpc = MI->getOpcode();
         int Op32 = AMDGPU::getVOPe32(MaybeCommutedOpc);
 
-        FoldList.push_back(FoldCandidate(MI, CommuteOpNo, OpToFold, true,
-                                         Op32));
+        appendFoldCandidate(FoldList, MI, CommuteOpNo, OpToFold, true, Op32);
         return true;
       }
 
@@ -412,11 +424,11 @@
       return false;
     }
 
-    FoldList.push_back(FoldCandidate(MI, CommuteOpNo, OpToFold, true));
+    appendFoldCandidate(FoldList, MI, CommuteOpNo, OpToFold, true);
     return true;
   }
 
-  FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold));
+  appendFoldCandidate(FoldList, MI, OpNo, OpToFold);
   return true;
 }
 
@@ -494,7 +506,7 @@
   if (!TII->isOperandLegal(*UseMI, UseOpIdx, Op))
     return false;
 
-  FoldList.push_back(FoldCandidate(UseMI, UseOpIdx, Op));
+  appendFoldCandidate(FoldList, UseMI, UseOpIdx, Op);
   return true;
 }
 
@@ -1398,5 +1410,5 @@
       foldInstOperand(MI, OpToFold);
     }
   }
-  return false;
+  return true;
 }
diff --git a/llvm/test/CodeGen/AMDGPU/fold-imm-copy.mir b/llvm/test/CodeGen/AMDGPU/fold-imm-copy.mir
--- a/llvm/test/CodeGen/AMDGPU/fold-imm-copy.mir
+++ b/llvm/test/CodeGen/AMDGPU/fold-imm-copy.mir
@@ -22,3 +22,21 @@
     %9:vgpr_32 = COPY %8
     %10:vgpr_32 = V_AND_B32_e32 %7, %9, implicit $exec
 ...
+
+---
+# GCN-LABEL: name: no_extra_fold_on_same_opnd
+# The first XOR needs commuting to fold that immediate operand.
+# GCN: V_XOR_B32_e32 {{.*}} 0, %1
+# GCN: V_XOR_B32_e32 %2, %4.sub0
+name: no_extra_fold_on_same_opnd
+tracksRegLiveness: true
+body: |
+  bb.0:
+    %0:vgpr_32 = IMPLICIT_DEF
+    %1:vgpr_32 = IMPLICIT_DEF
+    %2:vgpr_32 = IMPLICIT_DEF
+    %3:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %4:vreg_64 = REG_SEQUENCE killed %0, %subreg.sub0, killed %3, %subreg.sub1
+    %5:vgpr_32 = V_XOR_B32_e32 %1, %4.sub1, implicit $exec
+    %6:vgpr_32 = V_XOR_B32_e32 %2, %4.sub0, implicit $exec
+...
diff --git a/llvm/test/CodeGen/AMDGPU/operand-folding.ll b/llvm/test/CodeGen/AMDGPU/operand-folding.ll
--- a/llvm/test/CodeGen/AMDGPU/operand-folding.ll
+++ b/llvm/test/CodeGen/AMDGPU/operand-folding.ll
@@ -124,6 +124,30 @@
   ret void
 }
 
+; There should be exactly one folding on the same operand.
+; CHECK-LABEL: {{^}}no_extra_fold_on_same_opnd
+; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
+; CHECK: v_xor_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @no_extra_fold_on_same_opnd() {
+entry:
+  %s0 = load i32, i32 addrspace(5)* undef, align 4
+  %s0.i64 = zext i32 %s0 to i64
+  br label %for.body.i.i
+
+for.body.i.i:
+  %s1 = load i32, i32 addrspace(1)* undef, align 8
+  %s1.i64 = sext i32 %s1 to i64
+  %xor = xor i64 %s1.i64, %s0.i64
+  %flag = icmp ult i64 %xor, 8
+  br i1 %flag, label %if.then, label %if.else
+
+if.then:
+  unreachable
+
+if.else:
+  unreachable
+}
+
 declare i32 @llvm.amdgcn.workitem.id.x() #0
 
 attributes #0 = { nounwind readnone }