Index: llvm/lib/Target/AMDGPU/SIInstrInfo.h =================================================================== --- llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -331,7 +331,7 @@ areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, const MachineInstr &MIb) const override; - bool isFoldableCopy(const MachineInstr &MI) const; + static bool isFoldableCopy(const MachineInstr &MI); bool FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Register Reg, MachineRegisterInfo *MRI) const final; Index: llvm/lib/Target/AMDGPU/SIInstrInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -2701,7 +2701,7 @@ } } -bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) const { +bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) { switch (MI.getOpcode()) { case AMDGPU::V_MOV_B32_e32: case AMDGPU::V_MOV_B32_e64: @@ -3079,16 +3079,24 @@ return false; } -static int64_t getFoldableImm(const MachineOperand* MO) { +static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI, + int64_t &Imm) { + if (Reg.isPhysical()) + return false; + auto *Def = MRI.getUniqueVRegDef(Reg); + if (Def && SIInstrInfo::isFoldableCopy(*Def) && Def->getOperand(1).isImm()) { + Imm = Def->getOperand(1).getImm(); + return true; + } + return false; +} + +static bool getFoldableImm(const MachineOperand *MO, int64_t &Imm) { if (!MO->isReg()) return false; const MachineFunction *MF = MO->getParent()->getParent()->getParent(); const MachineRegisterInfo &MRI = MF->getRegInfo(); - auto Def = MRI.getUniqueVRegDef(MO->getReg()); - if (Def && Def->getOpcode() == AMDGPU::V_MOV_B32_e32 && - Def->getOperand(1).isImm()) - return Def->getOperand(1).getImm(); - return AMDGPU::NoRegister; + return getFoldableImm(MO->getReg(), MRI, Imm); } static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI, @@ -3160,7 +3168,8 @@ // If we have an SGPR input, we will violate the constant bus restriction. (ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() || !RI.isSGPRReg(MBB->getParent()->getRegInfo(), Src0->getReg()))) { - if (auto Imm = getFoldableImm(Src2)) { + int64_t Imm; + if (getFoldableImm(Src2, Imm)) { unsigned NewOpc = IsFMA ? (IsF16 ? AMDGPU::V_FMAAK_F16 : AMDGPU::V_FMAAK_F32) : (IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32); @@ -3177,7 +3186,7 @@ unsigned NewOpc = IsFMA ? (IsF16 ? AMDGPU::V_FMAMK_F16 : AMDGPU::V_FMAMK_F32) : (IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32); - if (auto Imm = getFoldableImm(Src1)) { + if (getFoldableImm(Src1, Imm)) { if (pseudoToMCOpcode(NewOpc) != -1) { MIB = BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc)) .add(*Dst) @@ -3188,7 +3197,7 @@ return MIB; } } - if (auto Imm = getFoldableImm(Src0)) { + if (getFoldableImm(Src0, Imm)) { if (pseudoToMCOpcode(NewOpc) != -1 && isOperandLegal( MI, AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::src0), @@ -8004,7 +8013,10 @@ Register SrcReg2, int64_t CmpMask, int64_t CmpValue, const MachineRegisterInfo *MRI) const { - if (SrcReg2 || SrcReg.isPhysical()) + if (!SrcReg || SrcReg.isPhysical()) + return false; + + if (SrcReg2 && !getFoldableImm(SrcReg2, *MRI, CmpValue)) return false; const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI, @@ -8049,10 +8061,21 @@ Def->getOpcode() != AMDGPU::S_AND_B64) return false; + const auto isMask = [](const MachineOperand *MO) -> bool { + int64_t Mask; + if (MO->isImm()) + Mask = MO->getImm(); + else if (!getFoldableImm(MO, Mask)) + return false; + return Mask == 1; + }; + MachineOperand *SrcOp = &Def->getOperand(1); - if (SrcOp->isImm() && SrcOp->getImm() == 1) + if (isMask(SrcOp)) SrcOp = &Def->getOperand(2); - else if (!Def->getOperand(2).isImm() || Def->getOperand(2).getImm() != 1) + else if (isMask(&Def->getOperand(2))) + SrcOp = &Def->getOperand(1); + else return false; Register DefReg = Def->getOperand(0).getReg(); Index: llvm/test/CodeGen/AMDGPU/basic-branch.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/basic-branch.ll +++ llvm/test/CodeGen/AMDGPU/basic-branch.ll @@ -32,8 +32,8 @@ ; GCN: s_load_dword [[VAL:s[0-9]+]] ; GCNNOOPT: s_mov_b32 [[ONE:s[0-9]+]], 1{{$}} ; GCNNOOPT: s_and_b32 s{{[0-9]+}}, [[VAL]], [[ONE]] -; GCNOPT: s_and_b32 s{{[0-9]+}}, [[VAL]], 1 -; GCN: s_cmp_eq_u32 +; GCNOPT: s_bitcmp0_b32 [[VAL]], 0 +; GCNNOOPT: s_cmp_eq_u32 ; GCN: s_cbranch_scc1 [[END:BB[0-9]+_[0-9]+]] ; GCN: buffer_store_dword Index: llvm/test/CodeGen/AMDGPU/optimize-compare.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/optimize-compare.mir +++ llvm/test/CodeGen/AMDGPU/optimize-compare.mir @@ -1340,3 +1340,105 @@ S_ENDPGM 0 ... + +--- +name: and_1_folded_src0_cmp_eq_u32_1_folded_src2 +body: | + ; GCN-LABEL: name: and_1_folded_src0_cmp_eq_u32_1_folded_src2 + ; GCN: bb.0: + ; GCN: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; GCN: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1 + ; GCN: S_BITCMP1_B32 killed [[COPY]], 0, implicit-def $scc + ; GCN: S_CBRANCH_SCC0 %bb.2, implicit $scc + ; GCN: S_BRANCH %bb.1 + ; GCN: bb.1: + ; GCN: successors: %bb.2(0x80000000) + ; GCN: bb.2: + ; GCN: S_ENDPGM 0 + bb.0: + successors: %bb.1(0x40000000), %bb.2(0x40000000) + liveins: $sgpr0, $vgpr0_vgpr1 + + %0:sreg_32 = COPY $sgpr0 + %1:sreg_32 = S_MOV_B32 1 + %2:sreg_32 = S_AND_B32 %1, killed %0, implicit-def dead $scc + S_CMP_EQ_U32 killed %2:sreg_32, %1, implicit-def $scc + S_CBRANCH_SCC0 %bb.2, implicit $scc + S_BRANCH %bb.1 + + bb.1: + successors: %bb.2(0x80000000) + + bb.2: + S_ENDPGM 0 + +... + +--- +name: and_1_folded_src1_cmp_eq_u32_1_folded_src2 +body: | + ; GCN-LABEL: name: and_1_folded_src1_cmp_eq_u32_1_folded_src2 + ; GCN: bb.0: + ; GCN: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; GCN: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1 + ; GCN: S_BITCMP1_B32 killed [[COPY]], 0, implicit-def $scc + ; GCN: S_CBRANCH_SCC0 %bb.2, implicit $scc + ; GCN: S_BRANCH %bb.1 + ; GCN: bb.1: + ; GCN: successors: %bb.2(0x80000000) + ; GCN: bb.2: + ; GCN: S_ENDPGM 0 + bb.0: + successors: %bb.1(0x40000000), %bb.2(0x40000000) + liveins: $sgpr0, $vgpr0_vgpr1 + + %0:sreg_32 = COPY $sgpr0 + %1:sreg_32 = S_MOV_B32 1 + %2:sreg_32 = S_AND_B32 killed %0, %1, implicit-def dead $scc + S_CMP_EQ_U32 killed %2:sreg_32, %1, implicit-def $scc + S_CBRANCH_SCC0 %bb.2, implicit $scc + S_BRANCH %bb.1 + + bb.1: + successors: %bb.2(0x80000000) + + bb.2: + S_ENDPGM 0 + +... + +--- +name: and_1_folded_src1_cmp_eq_u64_1_folded_src2 +body: | + ; GCN-LABEL: name: and_1_folded_src1_cmp_eq_u64_1_folded_src2 + ; GCN: bb.0: + ; GCN: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; GCN: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; GCN: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 1 + ; GCN: S_BITCMP1_B64 killed [[COPY]], 0, implicit-def $scc + ; GCN: S_CBRANCH_SCC0 %bb.2, implicit $scc + ; GCN: S_BRANCH %bb.1 + ; GCN: bb.1: + ; GCN: successors: %bb.2(0x80000000) + ; GCN: bb.2: + ; GCN: S_ENDPGM 0 + bb.0: + successors: %bb.1(0x40000000), %bb.2(0x40000000) + liveins: $sgpr0_sgpr1, $vgpr0_vgpr1 + + %0:sreg_64 = COPY $sgpr0_sgpr1 + %1:sreg_64 = S_MOV_B64 1 + %2:sreg_64 = S_AND_B64 killed %0, %1, implicit-def dead $scc + S_CMP_EQ_U64 killed %2:sreg_64, %1, implicit-def $scc + S_CBRANCH_SCC0 %bb.2, implicit $scc + S_BRANCH %bb.1 + + bb.1: + successors: %bb.2(0x80000000) + + bb.2: + S_ENDPGM 0 + +... Index: llvm/test/CodeGen/AMDGPU/setcc.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/setcc.ll +++ llvm/test/CodeGen/AMDGPU/setcc.ll @@ -380,8 +380,7 @@ ; Make sure we don't try to emit i1 setcc ops ; FUNC-LABEL: setcc-i1 -; GCN: s_and_b32 [[AND:s[0-9]+]], s{{[0-9]+}}, 1 -; GCN: s_cmp_eq_u32 [[AND]], 0 +; GCN: s_bitcmp0_b32 s{{[0-9]+}}, 0 define amdgpu_kernel void @setcc-i1(i32 %in) #0 { %and = and i32 %in, 1 %cmp = icmp eq i32 %and, 0 Index: llvm/test/CodeGen/AMDGPU/wave32.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/wave32.ll +++ llvm/test/CodeGen/AMDGPU/wave32.ll @@ -521,7 +521,7 @@ } ; GCN-LABEL: {{^}}test_brcc_i1: -; GCN: s_cmp_eq_u32 s{{[0-9]+}}, 0 +; GCN: s_bitcmp0_b32 s{{[0-9]+}}, 0 ; GCN-NEXT: s_cbranch_scc1 define amdgpu_kernel void @test_brcc_i1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i1 %val) #0 { %cmp0 = icmp ne i1 %val, 0