diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
--- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -97,6 +97,7 @@
   bool runOnMachineFunction(MachineFunction &MF) override;
 
   MachineBasicBlock *processPHINode(MachineInstr &MI);
+  MachineBasicBlock::iterator processVNOT(MachineInstr &MI);
 
   StringRef getPassName() const override { return "SI Fix SGPR copies"; }
 
@@ -579,6 +580,9 @@
       switch (MI.getOpcode()) {
       default:
        continue;
+      case AMDGPU::V_NOT_B32_e32: {
+        I = processVNOT(MI);
+      } break;
       case AMDGPU::COPY:
       case AMDGPU::WQM:
       case AMDGPU::STRICT_WQM:
@@ -916,3 +920,187 @@
   }
   return CreatedBB;
 }
+
+// Rewrite a V_NOT_B32 of a divergent XOR result as an XOR with a negated
+// SGPR operand (not(xor(s, v)) == xor(not(s), v)), moving the NOT to the
+// SALU. The 64-bit case recognizes the split REG_SEQUENCE form and uses a
+// single S_NOT_B64 for both halves.
+MachineBasicBlock::iterator SIFixSGPRCopies::processVNOT(MachineInstr &MI) {
+  MachineBasicBlock::iterator I(MI);
+  Register SrcReg = MI.getOperand(1).getReg();
+  MachineInstr *DefMI = MRI->getVRegDef(SrcReg);
+  // We rely on the VOP2Instructions.td patterns that always select the
+  // _e64 form.
+  if ((DefMI->getOpcode() == AMDGPU::V_XOR_B32_e64) &&
+      (MRI->hasOneUse(DefMI->getOperand(0).getReg()))) {
+    MachineOperand Src1 = DefMI->getOperand(1);
+    MachineOperand Src2 = DefMI->getOperand(2);
+    if (Src1.isReg() && TRI->isSGPRReg(*MRI, Src1.getReg())) {
+      Register R1 = Src1.getReg();
+      Register NotR1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+      BuildMI(*DefMI->getParent(), DefMI, DefMI->getDebugLoc(),
+              TII->get(AMDGPU::S_NOT_B32), NotR1)
+          .addReg(R1);
+      DefMI->getOperand(1).setReg(NotR1);
+      MRI->replaceRegWith(MI.getOperand(0).getReg(),
+                          DefMI->getOperand(0).getReg());
+      I++;
+      MI.eraseFromParent();
+    } else if (Src2.isReg() && TRI->isSGPRReg(*MRI, Src2.getReg())) {
+      Register R2 = Src2.getReg();
+      Register NotR2 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+      BuildMI(*DefMI->getParent(), DefMI, DefMI->getDebugLoc(),
+              TII->get(AMDGPU::S_NOT_B32), NotR2)
+          .addReg(R2);
+      DefMI->getOperand(2).setReg(NotR2);
+      MRI->replaceRegWith(MI.getOperand(0).getReg(),
+                          DefMI->getOperand(0).getReg());
+      I++;
+      MI.eraseFromParent();
+    }
+  } else if (DefMI->getOpcode() == AMDGPU::COPY) {
+    Register CopySource = DefMI->getOperand(1).getReg();
+    if (DefMI->getOperand(1).getSubReg() != AMDGPU::NoSubRegister) {
+      MachineInstr *SuperRegDef = MRI->getVRegDef(CopySource);
+      if ((SuperRegDef->getOpcode() == AMDGPU::REG_SEQUENCE) &&
+          SuperRegDef->getNumOperands() == 5) {
+        MachineInstr *LODef =
+            MRI->getVRegDef(SuperRegDef->getOperand(1).getReg());
+        MachineInstr *HIDef =
+            MRI->getVRegDef(SuperRegDef->getOperand(3).getReg());
+        // We rely on the divergent_i64_BinOp definition in
+        // VOP2Instructions.td that always selects the _e64 form.
+        if (LODef->getOpcode() == AMDGPU::V_XOR_B32_e64 &&
+            HIDef->getOpcode() == AMDGPU::V_XOR_B32_e64) {
+          MachineOperand LOSrc1 = LODef->getOperand(1);
+          MachineOperand HISrc1 = HIDef->getOperand(1);
+          MachineOperand LOSrc2 = LODef->getOperand(2);
+          MachineOperand HISrc2 = HIDef->getOperand(2);
+
+          Register LOReg = AMDGPU::NoRegister, HIReg = AMDGPU::NoRegister;
+          unsigned LODefIdx, HIDefIdx;
+          if (LOSrc1.isReg() && TRI->isSGPRReg(*MRI, LOSrc1.getReg())) {
+            LOReg = LOSrc1.getReg();
+            LODefIdx = 1;
+            if (HISrc1.isReg() && TRI->isSGPRReg(*MRI, HISrc1.getReg())) {
+              HIReg = HISrc1.getReg();
+              HIDefIdx = 1;
+            } else if (HISrc2.isReg() &&
+                       TRI->isSGPRReg(*MRI, HISrc2.getReg())) {
+              HIReg = HISrc2.getReg();
+              HIDefIdx = 2;
+            }
+          } else if (LOSrc2.isReg() && TRI->isSGPRReg(*MRI, LOSrc2.getReg())) {
+            LOReg = LOSrc2.getReg();
+            LODefIdx = 2;
+            if (HISrc1.isReg() && TRI->isSGPRReg(*MRI, HISrc1.getReg())) {
+              HIReg = HISrc1.getReg();
+              HIDefIdx = 1;
+            } else if (HISrc2.isReg() &&
+                       TRI->isSGPRReg(*MRI, HISrc2.getReg())) {
+              HIReg = HISrc2.getReg();
+              HIDefIdx = 2;
+            }
+          }
+          if (LOReg != AMDGPU::NoRegister && HIReg != AMDGPU::NoRegister) {
+            MachineInstr *LORegDef = MRI->getVRegDef(LOReg);
+            MachineInstr *HIRegDef = MRI->getVRegDef(HIReg);
+            if (LORegDef->isCopy() && HIRegDef->isCopy() &&
+                (LORegDef->getOperand(1).getSubReg() !=
+                 AMDGPU::NoSubRegister) &&
+                (HIRegDef->getOperand(1).getSubReg() !=
+                 AMDGPU::NoSubRegister) &&
+                LORegDef->getOperand(1).getReg() ==
+                    HIRegDef->getOperand(1).getReg()) {
+              Register S64Source = LORegDef->getOperand(1).getReg();
+              MachineInstr *S64DefMI = MRI->getVRegDef(S64Source);
+              assert(TRI->getRegClassForReg(*MRI, S64Source) ==
+                     &AMDGPU::SReg_64RegClass);
+              SmallVector<MachineInstr *> Uses;
+              for (auto &U : MRI->use_instructions(CopySource))
+                Uses.push_back(&U);
+              if (Uses.size() == 2) {
+                MachineInstr *AnotherCopy = nullptr;
+                while (!Uses.empty()) {
+                  MachineInstr *U = Uses.pop_back_val();
+                  if (U == DefMI)
+                    continue;
+                  if (U->isCopy() && MRI->hasOneUse(U->getOperand(0).getReg()))
+                    AnotherCopy = U;
+                }
+                if (AnotherCopy &&
+                    MRI->use_instructions(AnotherCopy->getOperand(0).getReg())
+                            .begin()
+                            ->getOpcode() == AMDGPU::V_NOT_B32_e32) {
+                  MachineInstr *AnotherNot =
+                      &*MRI->use_instructions(
+                              AnotherCopy->getOperand(0).getReg())
+                            .begin();
+                  MachineInstr *LONot = nullptr, *HINot = nullptr;
+                  if (DefMI->getOperand(1).getSubReg() == AMDGPU::sub0 &&
+                      AnotherCopy->getOperand(1).getSubReg() == AMDGPU::sub1) {
+                    LONot = &MI;
+                    HINot = AnotherNot;
+                  } else if (DefMI->getOperand(1).getSubReg() == AMDGPU::sub1 &&
+                             AnotherCopy->getOperand(1).getSubReg() ==
+                                 AMDGPU::sub0) {
+                    HINot = &MI;
+                    LONot = AnotherNot;
+                  }
+                  if (LONot && HINot) {
+                    Register S64NotReg =
+                        MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
+                    MachineBasicBlock::iterator InsPt =
+                        std::next(MachineBasicBlock::iterator(S64DefMI));
+                    MachineInstr *S64Not =
+                        BuildMI(*InsPt->getParent(), InsPt,
+                                InsPt->getDebugLoc(),
+                                TII->get(AMDGPU::S_NOT_B64), S64NotReg)
+                            .addReg(S64Source);
+                    InsPt = std::next(MachineBasicBlock::iterator(S64Not));
+                    Register SrcLO = TII->buildExtractSubReg(
+                        InsPt, *MRI, S64Not->getOperand(0),
+                        &AMDGPU::SReg_64RegClass, AMDGPU::sub0,
+                        &AMDGPU::SReg_32RegClass);
+                    Register SrcHI = TII->buildExtractSubReg(
+                        InsPt, *MRI, S64Not->getOperand(0),
+                        &AMDGPU::SReg_64RegClass, AMDGPU::sub1,
+                        &AMDGPU::SReg_32RegClass);
+                    LODef->getOperand(LODefIdx).setReg(SrcLO);
+                    HIDef->getOperand(HIDefIdx).setReg(SrcHI);
+                    MRI->replaceRegWith(LONot->getOperand(0).getReg(),
+                                        LODef->getOperand(0).getReg());
+                    MRI->replaceRegWith(HINot->getOperand(0).getReg(),
+                                        HIDef->getOperand(0).getReg());
+
+                    I++;
+                    MI.eraseFromParent();
+                    if (I == AnotherNot)
+                      I++;
+                    AnotherNot->eraseFromParent();
+                    if (MRI->use_instructions(DefMI->getOperand(0).getReg())
+                            .empty()) {
+                      if (I == DefMI)
+                        I++;
+                      DefMI->eraseFromParent();
+                    }
+                    if (MRI->use_instructions(
+                                AnotherCopy->getOperand(0).getReg())
+                            .empty()) {
+                      if (I == AnotherCopy)
+                        I++;
+                      AnotherCopy->eraseFromParent();
+                    }
+                    if (MRI->use_instructions(
+                                SuperRegDef->getOperand(0).getReg())
+                            .empty()) {
+                      if (I == SuperRegDef)
+                        I++;
+                      SuperRegDef->eraseFromParent();
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+  return I;
+}
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -551,11 +551,11 @@
 >;
 
 def S_XNOR_B32 : SOP2_32 <"s_xnor_b32",
-  [(set i32:$sdst, (not (xor_oneuse i32:$src0, i32:$src1)))]
+  [(set i32:$sdst, (UniformUnaryFrag<not> (xor_oneuse i32:$src0, i32:$src1)))]
 >;
 
 def S_XNOR_B64 : SOP2_64 <"s_xnor_b64",
-  [(set i64:$sdst, (not (xor_oneuse i64:$src0, i64:$src1)))]
+  [(set i64:$sdst, (UniformUnaryFrag<not> (xor_oneuse i64:$src0, i64:$src1)))]
 >;
 
 def S_NAND_B32 : SOP2_32 <"s_nand_b32",
diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -637,9 +637,9 @@
       )
   >;
 
-def : divergent_i64_BinOp <and, V_AND_B32_e32>;
-def : divergent_i64_BinOp <or,  V_OR_B32_e32>;
-def : divergent_i64_BinOp <xor, V_XOR_B32_e32>;
+def : divergent_i64_BinOp <and, V_AND_B32_e64>;
+def : divergent_i64_BinOp <or,  V_OR_B32_e64>;
+def : divergent_i64_BinOp <xor, V_XOR_B32_e64>;
 
 let SubtargetPredicate = Has16BitInsts in {
 
@@ -688,6 +688,21 @@
 let isReMaterializable = 1 in
 defm V_XNOR_B32 : VOP2Inst <"v_xnor_b32", VOP_I32_I32_I32, xnor>;
 
+def : GCNPat<
+  (i32 (DivergentUnaryFrag<not> (xor_oneuse i32:$src0, i32:$src1))),
+  (i32 (V_XNOR_B32_e64 $src0, $src1))
+>;
+
+def : GCNPat<
+  (i64 (DivergentUnaryFrag<not> (xor_oneuse i64:$src0, i64:$src1))),
+  (REG_SEQUENCE VReg_64, (i32 (V_XNOR_B32_e64
+                           (i32 (EXTRACT_SUBREG $src0, sub0)),
+                           (i32 (EXTRACT_SUBREG $src1, sub0)))), sub0,
+                         (i32 (V_XNOR_B32_e64
+                           (i32 (EXTRACT_SUBREG $src0, sub1)),
+                           (i32 (EXTRACT_SUBREG $src1, sub1)))), sub1)
+>;
+
 let Constraints = "$vdst = $src2", DisableEncoding = "$src2",
     isConvertibleToThreeAddress = 1,
diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-xnor.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-xnor.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-xnor.ll
@@ -0,0 +1,44 @@
+; RUN: llc -march=amdgcn -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=gfx906 -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GCN_DL %s
+
+; GCN-LABEL: name: uniform_xnor_i64
+; GCN: S_XNOR_B64
+define amdgpu_kernel void @uniform_xnor_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+  %xor = xor i64 %a, %b
+  %res = xor i64 %xor, -1
+  store i64 %res, i64 addrspace(1)* %out
+  ret void
+}
+; GCN-LABEL: name: divergent_xnor_i64
+; GCN: V_XOR_B32_e64
+; GCN: V_XOR_B32_e64
+; GCN: V_NOT_B32_e32
+; GCN: V_NOT_B32_e32
+; GCN_DL: V_XNOR_B32_e64
+; GCN_DL: V_XNOR_B32_e64
+define i64 @divergent_xnor_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+  %xor = xor i64 %a, %b
+  %res = xor i64 %xor, -1
+  ret i64 %res
+}
+
+; GCN-LABEL: name: uniform_xnor_i32
+; GCN: S_XNOR_B32
+define amdgpu_kernel void @uniform_xnor_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+  %xor = xor i32 %a, %b
+  %res = xor i32 %xor, -1
+  store i32 %res, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: name: divergent_xnor_i32
+; GCN: V_XOR_B32_e64
+; GCN: V_NOT_B32_e32
+; GCN_DL: V_XNOR_B32_e64
+define i32 @divergent_xnor_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+  %xor = xor i32 %a, %b
+  %res = xor i32 %xor, -1
+  ret i32 %res
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
diff --git a/llvm/test/CodeGen/AMDGPU/fused-bitlogic.ll b/llvm/test/CodeGen/AMDGPU/fused-bitlogic.ll
--- a/llvm/test/CodeGen/AMDGPU/fused-bitlogic.ll
+++ b/llvm/test/CodeGen/AMDGPU/fused-bitlogic.ll
@@ -163,8 +163,8 @@
 ; GCN-NEXT:    v_xor_b32_e32 v1, v3, v1
 ; GCN-NEXT:    v_xor_b32_e32 v0, v2, v0
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_xnor_b32_e32 v0, v0, v4
 ; GCN-NEXT:    v_xnor_b32_e32 v1, v1, v5
+; GCN-NEXT:    v_xnor_b32_e32 v0, v0, v4
 ; GCN-NEXT:    global_store_dwordx2 v6, v[0:1], s[0:1]
 ; GCN-NEXT:    s_endpgm
 bb:
diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
--- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
@@ -472,10 +472,10 @@
 ; GFX9-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v0
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v7
-; GFX9-O0-NEXT:    v_or_b32_e32 v0, v0, v3
+; GFX9-O0-NEXT:    v_or_b32_e64 v0, v0, v3
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v6
-; GFX9-O0-NEXT:    v_or_b32_e32 v6, v1, v2
+; GFX9-O0-NEXT:    v_or_b32_e64 v6, v1, v2
 ; GFX9-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v0
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v6
diff --git a/llvm/test/CodeGen/AMDGPU/xnor.ll b/llvm/test/CodeGen/AMDGPU/xnor.ll
--- a/llvm/test/CodeGen/AMDGPU/xnor.ll
+++ b/llvm/test/CodeGen/AMDGPU/xnor.ll
@@ -61,8 +61,8 @@
 
 ; GCN-LABEL: {{^}}vector_xnor_i32_one_use
 ; GCN-NOT: s_xnor_b32
-; GCN: v_not_b32
 ; GCN: v_xor_b32
+; GCN: v_not_b32
 ; GCN-DL: v_xnor_b32
 define i32 @vector_xnor_i32_one_use(i32 %a, i32 %b) {
 entry:
@@ -73,10 +73,10 @@
 
 ; GCN-LABEL: {{^}}vector_xnor_i64_one_use
 ; GCN-NOT: s_xnor_b64
-; GCN: v_not_b32
-; GCN: v_not_b32
 ; GCN: v_xor_b32
 ; GCN: v_xor_b32
+; GCN: v_not_b32
+; GCN: v_not_b32
 ; GCN-DL: v_xnor_b32
 ; GCN-DL: v_xnor_b32
 define i64 @vector_xnor_i64_one_use(i64 %a, i64 %b) {
@@ -150,8 +150,8 @@
 
 ; GCN-LABEL: {{^}}vector_xor_na_b_i32_one_use
 ; GCN-NOT: s_xnor_b32
-; GCN: v_not_b32
 ; GCN: v_xor_b32
+; GCN: v_not_b32
 ; GCN-DL: v_xnor_b32
 define i32 @vector_xor_na_b_i32_one_use(i32 %a, i32 %b) {
 entry:
@@ -162,8 +162,8 @@
 
 ; GCN-LABEL: {{^}}vector_xor_a_nb_i32_one_use
 ; GCN-NOT: s_xnor_b32
-; GCN: v_not_b32
 ; GCN: v_xor_b32
+; GCN: v_not_b32
 ; GCN-DL: v_xnor_b32
 define i32 @vector_xor_a_nb_i32_one_use(i32 %a, i32 %b) {
 entry: