diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -312,6 +312,19 @@
   return false;
 }
 
+static void appendFoldCandidate(SmallVectorImpl<FoldCandidate> &FoldList,
+                                MachineInstr *MI, unsigned OpNo,
+                                MachineOperand *FoldOp, bool Commuted = false,
+                                int ShrinkOp = -1) {
+  // Skip additional folding on the same operand.
+  for (FoldCandidate &Fold : FoldList)
+    if (Fold.UseMI == MI && Fold.UseOpNo == OpNo)
+      return;
+  LLVM_DEBUG(dbgs() << "Append " << (Commuted ? "commuted" : "normal")
+                    << " operand " << OpNo << "\n " << *MI << '\n');
+  FoldList.push_back(FoldCandidate(MI, OpNo, FoldOp, Commuted, ShrinkOp));
+}
+
 static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
                              MachineInstr *MI, unsigned OpNo,
                              MachineOperand *OpToFold,
@@ -344,7 +357,7 @@
     // Special case for s_setreg_b32
     if (Opc == AMDGPU::S_SETREG_B32 && OpToFold->isImm()) {
       MI->setDesc(TII->get(AMDGPU::S_SETREG_IMM32_B32));
-      FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold));
+      appendFoldCandidate(FoldList, MI, OpNo, OpToFold);
       return true;
     }
 
@@ -403,8 +416,7 @@
         unsigned MaybeCommutedOpc = MI->getOpcode();
         int Op32 = AMDGPU::getVOPe32(MaybeCommutedOpc);
 
-        FoldList.push_back(FoldCandidate(MI, CommuteOpNo, OpToFold, true,
-                                         Op32));
+        appendFoldCandidate(FoldList, MI, CommuteOpNo, OpToFold, true, Op32);
         return true;
       }
 
@@ -412,11 +424,11 @@
       return false;
     }
 
-    FoldList.push_back(FoldCandidate(MI, CommuteOpNo, OpToFold, true));
+    appendFoldCandidate(FoldList, MI, CommuteOpNo, OpToFold, true);
     return true;
   }
 
-  FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold));
+  appendFoldCandidate(FoldList, MI, OpNo, OpToFold);
   return true;
 }
 
@@ -494,7 +506,7 @@
   if (!TII->isOperandLegal(*UseMI, UseOpIdx, Op))
     return false;
 
-  FoldList.push_back(FoldCandidate(UseMI, UseOpIdx, Op));
+  appendFoldCandidate(FoldList, UseMI, UseOpIdx, Op);
   return true;
 }
 
@@ -1398,5 +1410,5 @@
       foldInstOperand(MI, OpToFold);
     }
   }
-  return false;
+  return true;
 }
diff --git a/llvm/test/CodeGen/AMDGPU/fold-imm-copy.mir b/llvm/test/CodeGen/AMDGPU/fold-imm-copy.mir
--- a/llvm/test/CodeGen/AMDGPU/fold-imm-copy.mir
+++ b/llvm/test/CodeGen/AMDGPU/fold-imm-copy.mir
@@ -22,3 +22,21 @@
     %9:vgpr_32 = COPY %8
     %10:vgpr_32 = V_AND_B32_e32 %7, %9, implicit $exec
 ...
+
+---
+# GCN-LABEL: name: no_extra_fold_on_same_opnd
+# The first XOR needs commuting to fold that immediate operand.
+# GCN: V_XOR_B32_e32 {{.*}} 0, %1
+# GCN: V_XOR_B32_e32 %2, %4.sub0
+name: no_extra_fold_on_same_opnd
+tracksRegLiveness: true
+body: |
+  bb.0:
+    %0:vgpr_32 = IMPLICIT_DEF
+    %1:vgpr_32 = IMPLICIT_DEF
+    %2:vgpr_32 = IMPLICIT_DEF
+    %3:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %4:vreg_64 = REG_SEQUENCE killed %0, %subreg.sub0, killed %3, %subreg.sub1
+    %5:vgpr_32 = V_XOR_B32_e32 %1, %4.sub1, implicit $exec
+    %6:vgpr_32 = V_XOR_B32_e32 %2, %4.sub0, implicit $exec
+...
diff --git a/llvm/test/CodeGen/AMDGPU/operand-folding.ll b/llvm/test/CodeGen/AMDGPU/operand-folding.ll
--- a/llvm/test/CodeGen/AMDGPU/operand-folding.ll
+++ b/llvm/test/CodeGen/AMDGPU/operand-folding.ll
@@ -124,6 +124,30 @@
   ret void
 }
 
+; There should be exactly one folding on the same operand.
+; CHECK-LABEL: {{^}}no_extra_fold_on_same_opnd
+; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
+; CHECK: v_xor_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @no_extra_fold_on_same_opnd() {
+entry:
+  %s0 = load i32, i32 addrspace(5)* undef, align 4
+  %s0.i64 = zext i32 %s0 to i64
+  br label %for.body.i.i
+
+for.body.i.i:
+  %s1 = load i32, i32 addrspace(1)* undef, align 8
+  %s1.i64 = sext i32 %s1 to i64
+  %xor = xor i64 %s1.i64, %s0.i64
+  %flag = icmp ult i64 %xor, 8
+  br i1 %flag, label %if.then, label %if.else
+
+if.then:
+  unreachable
+
+if.else:
+  unreachable
+}
+
 declare i32 @llvm.amdgcn.workitem.id.x() #0
 
 attributes #0 = { nounwind readnone }