Index: lib/Target/AMDGPU/SIFoldOperands.cpp
===================================================================
--- lib/Target/AMDGPU/SIFoldOperands.cpp
+++ lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -73,6 +73,7 @@
                    MachineInstr *UseMI,
                    unsigned UseOpIdx,
                    SmallVectorImpl<FoldCandidate> &FoldList,
+                   SmallSet<MachineInstr *, 4> &CommutedList,
                    SmallVectorImpl<MachineInstr *> &CopiesToReplace) const;
 
   void foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const;
@@ -177,6 +178,7 @@
 }
 
 static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
+                             SmallSet<MachineInstr *, 4> &CommutedList,
                              MachineInstr *MI, unsigned OpNo,
                              MachineOperand *OpToFold,
                              const SIInstrInfo *TII) {
@@ -191,7 +193,8 @@
       // Check if changing this to a v_mad_{f16, f32} instruction will allow us
       // to fold the operand.
       MI->setDesc(TII->get(IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16));
-      bool FoldAsMAD = tryAddToFoldList(FoldList, MI, OpNo, OpToFold, TII);
+      bool FoldAsMAD = tryAddToFoldList(FoldList, CommutedList, MI, OpNo,
+                                        OpToFold, TII);
       if (FoldAsMAD) {
         MI->untieRegOperand(OpNo);
         return true;
       }
@@ -237,8 +240,12 @@
         !TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1))
       return false;
 
-    if (!TII->isOperandLegal(*MI, OpNo, OpToFold))
+    if (!TII->isOperandLegal(*MI, OpNo, OpToFold)) {
+      TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1);
       return false;
+    }
+
+    CommutedList.insert(MI);
   }
 
   FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold));
@@ -259,6 +266,7 @@
   MachineInstr *UseMI,
   unsigned UseOpIdx,
   SmallVectorImpl<FoldCandidate> &FoldList,
+  SmallSet<MachineInstr *, 4> &CommutedList,
   SmallVectorImpl<MachineInstr *> &CopiesToReplace) const {
 
   const MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);
@@ -299,7 +307,7 @@
         continue;
 
       foldOperand(OpToFold, RSUseMI, RSUse.getOperandNo(), FoldList,
-                  CopiesToReplace);
+                  CommutedList, CopiesToReplace);
     }
 
     return;
@@ -334,7 +342,7 @@
   }
 
   if (!FoldingImm) {
-    tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII);
+    tryAddToFoldList(FoldList, CommutedList, UseMI, UseOpIdx, &OpToFold, TII);
 
     // FIXME: We could try to change the instruction from 64-bit to 32-bit
    // to enable more folding opportunites. The shrink operands pass
@@ -368,13 +376,13 @@
     }
 
     MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue());
-    tryAddToFoldList(FoldList, UseMI, UseOpIdx, &ImmOp, TII);
+    tryAddToFoldList(FoldList, CommutedList, UseMI, UseOpIdx, &ImmOp, TII);
     return;
   }
 
-  tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII);
+  tryAddToFoldList(FoldList, CommutedList, UseMI, UseOpIdx, &OpToFold, TII);
 }
 
 static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result,
@@ -516,21 +524,27 @@
   if (!MI->isCommutable())
     return false;
 
-  if (Src0->isImm() && !Src1->isImm()) {
-    std::swap(Src0, Src1);
-    std::swap(Src0Idx, Src1Idx);
-  }
+  auto trySwapOperands = [&Src0, &Src1, &Src0Idx, &Src1Idx] {
+    if (Src0->isImm() && !Src1->isImm()) {
+      std::swap(Src0, Src1);
+      std::swap(Src0Idx, Src1Idx);
+    }
+  };
+
+  auto SrcImm = (Src0->isImm() && !Src1->isImm()) ? Src0 : Src1;
+  int32_t Src1Val = static_cast<int32_t>(SrcImm->getImm());
 
-  int32_t Src1Val = static_cast<int32_t>(Src1->getImm());
   if (Opc == AMDGPU::V_OR_B32_e64 ||
       Opc == AMDGPU::V_OR_B32_e32 ||
       Opc == AMDGPU::S_OR_B32) {
     if (Src1Val == 0) {
       // y = or x, 0 => y = copy x
+      trySwapOperands();
       MI->RemoveOperand(Src1Idx);
       mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
     } else if (Src1Val == -1) {
       // y = or x, -1 => y = v_mov_b32 -1
+      trySwapOperands();
       MI->RemoveOperand(Src1Idx);
       mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_OR_B32)));
     } else
@@ -544,10 +558,12 @@
       MI->getOpcode() == AMDGPU::S_AND_B32) {
     if (Src1Val == 0) {
       // y = and x, 0 => y = v_mov_b32 0
+      trySwapOperands();
       MI->RemoveOperand(Src0Idx);
       mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_AND_B32)));
     } else if (Src1Val == -1) {
       // y = and x, -1 => y = copy x
+      trySwapOperands();
       MI->RemoveOperand(Src1Idx);
       mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
       stripExtraCopyOperands(*MI);
     } else
@@ -562,6 +578,7 @@
       MI->getOpcode() == AMDGPU::S_XOR_B32) {
     if (Src1Val == 0) {
       // y = xor x, 0 => y = copy x
+      trySwapOperands();
       MI->RemoveOperand(Src1Idx);
       mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
       return true;
     }
@@ -604,6 +621,7 @@
   // this.
   SmallVector<MachineInstr *, 4> CopiesToReplace;
   SmallVector<FoldCandidate, 4> FoldList;
+  SmallSet<MachineInstr *, 4> CommutedList;
   MachineOperand &Dst = MI.getOperand(0);
 
   bool FoldingImm = OpToFold.isImm() || OpToFold.isFI();
@@ -656,7 +674,8 @@
       // FIXME: This will also reduce register usage, which may be better
       // in some cases. A better heuristic is needed.
       if (isInlineConstantIfFolded(TII, *UseMI, OpNo, OpToFold)) {
-        foldOperand(OpToFold, UseMI, OpNo, FoldList, CopiesToReplace);
+        foldOperand(OpToFold, UseMI, OpNo, FoldList, CommutedList,
+                    CopiesToReplace);
       } else {
         if (++NumLiteralUses == 1) {
           NonInlineUse = &*Use;
@@ -667,7 +686,8 @@
 
     if (NumLiteralUses == 1) {
       MachineInstr *UseMI = NonInlineUse->getParent();
-      foldOperand(OpToFold, UseMI, NonInlineUseOpNo, FoldList, CopiesToReplace);
+      foldOperand(OpToFold, UseMI, NonInlineUseOpNo, FoldList, CommutedList,
+                  CopiesToReplace);
     }
   } else {
     // Folding register.
@@ -677,7 +697,7 @@
       MachineInstr *UseMI = Use->getParent();
 
       foldOperand(OpToFold, UseMI, Use.getOperandNo(),
-                  FoldList, CopiesToReplace);
+                  FoldList, CommutedList, CopiesToReplace);
     }
   }
 
@@ -699,6 +719,11 @@
       DEBUG(dbgs() << "Folded source from " << MI << " into OpNo " <<
             static_cast<int>(Fold.UseOpNo) << " of " << *Fold.UseMI << '\n');
       tryFoldInst(TII, Fold.UseMI);
+    } else if (CommutedList.count(Fold.UseMI)) {
+      unsigned CommuteIdx0 = TargetInstrInfo::CommuteAnyOperandIndex;
+      unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
+      TII->findCommutedOpIndices(*Fold.UseMI, CommuteIdx0, CommuteIdx1);
+      TII->commuteInstruction(*Fold.UseMI, false, CommuteIdx0, CommuteIdx1);
     }
   }
 }
Index: test/CodeGen/AMDGPU/commute-compares.ll
===================================================================
--- test/CodeGen/AMDGPU/commute-compares.ll
+++ test/CodeGen/AMDGPU/commute-compares.ll
@@ -35,7 +35,7 @@
 ; FIXME: Why isn't this being folded as a constant?
 ; GCN-LABEL: {{^}}commute_ne_litk_i32:
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x3039
-; GCN: v_cmp_ne_u32_e32 vcc, [[K]], v{{[0-9]+}}
+; GCN: v_cmp_ne_u32_e32 vcc, v{{[0-9]+}}, [[K]]
 define amdgpu_kernel void @commute_ne_litk_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
@@ -99,11 +99,9 @@
   ret void
 }
 
-; FIXME: Undo canonicalization to gt (x + 1) since it doesn't use the inline imm
-
 ; GCN-LABEL: {{^}}commute_ule_64_i32:
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x41{{$}}
-; GCN: v_cmp_gt_u32_e32 vcc, [[K]], v{{[0-9]+}}
+; GCN: v_cmp_lt_u32_e32 vcc, v{{[0-9]+}}, [[K]]
 define amdgpu_kernel void @commute_ule_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
@@ -702,7 +700,7 @@
 ; XGCN: v_cmp_eq_u32_e32 vcc, 0, v{{[0-9]+}}
 
 ; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4{{$}}
-; GCN: v_cmp_eq_u32_e32 vcc, [[FI]], v{{[0-9]+}}
+; GCN: v_cmp_eq_u32_e32 vcc, v{{[0-9]+}}, [[FI]]
 define amdgpu_kernel void @commute_frameindex(i32 addrspace(1)* nocapture %out) #0 {
 entry:
   %stack0 = alloca i32
Index: test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
===================================================================
--- test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
+++ test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
@@ -421,11 +421,10 @@
 }
 
 ; GCN-LABEL: {{^}}v_insertelement_v2i16_dynamic_vgpr:
-; GFX89: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}}
-; CI: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7
 ; GCN: flat_load_dword [[IDX:v[0-9]+]]
 ; GCN: flat_load_dword [[VEC:v[0-9]+]]
-; GFX89-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7
+; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}}
+; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7
 
 ; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 16, [[IDX]]
 ; GFX89-DAG: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]]
@@ -449,11 +448,10 @@
 }
 
 ; GCN-LABEL: {{^}}v_insertelement_v2f16_dynamic_vgpr:
-; GFX89: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}}
-; CI: v_mov_b32_e32 [[K:v[0-9]+]], 0x1234
 ; GCN: flat_load_dword [[IDX:v[0-9]+]]
 ; GCN: flat_load_dword [[VEC:v[0-9]+]]
-; GFX89-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x1234
+; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}}
+; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x1234
 
 ; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 16, [[IDX]]
 ; GFX89-DAG: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]]
Index: test/CodeGen/AMDGPU/sub.i16.ll
===================================================================
--- test/CodeGen/AMDGPU/sub.i16.ll
+++ test/CodeGen/AMDGPU/sub.i16.ll
@@ -85,9 +85,9 @@
 
 ; FIXME: Need to handle non-uniform case for function below (load without gep).
 ; GCN-LABEL: {{^}}v_test_sub_i16_zext_to_i64:
+; VI: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0
 ; VI: flat_load_ushort [[A:v[0-9]+]]
 ; VI: flat_load_ushort [[B:v[0-9]+]]
-; VI-DAG: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0
 ; VI-DAG: v_subrev_u16_e32 v[[ADD:[0-9]+]], [[B]], [[A]]
 ; VI: buffer_store_dwordx2 v{{\[}}[[ADD]]:[[VZERO]]{{\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0{{$}}
 define amdgpu_kernel void @v_test_sub_i16_zext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {