Index: lib/Target/AMDGPU/SIInsertSkips.cpp
===================================================================
--- lib/Target/AMDGPU/SIInsertSkips.cpp
+++ lib/Target/AMDGPU/SIInsertSkips.cpp
@@ -132,6 +132,64 @@
           I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ)
         return true;
 
+      // V_READFIRSTLANE/V_READLANE destination register may be used as operand
+      // by some SALU instruction. If exec mask is zero, the vector instruction
+      // defining the register that is used by the scalar one is not executed
+      // and the scalar instruction will operate on undefined data. For
+      // V_READFIRSTLANE/V_READLANE we should iterate over its users and avoid
+      // predicated execution if one of the users is scalar.
+      if ((I->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) ||
+          (I->getOpcode() == AMDGPU::V_READLANE_B32)) {
+        const MachineOperand *Def = I->defs().begin();
+        unsigned Reg = Def->getReg();
+        unsigned VReg = I->getOperand(1).getReg();
+
+        // We only should care about undefined data
+        // if V_READFIRSTLANE/V_READLANE source operand is re-defined
+        // in target block where exec mask is potentially zero.
+        // Otherwise it is defined in one of dominating blocks
+        // and V_READFIRSTLANE/V_READLANE will read correct value
+        // even if exec mask is zero.
+        //
+        // Scan every instruction that precedes I in this block, starting
+        // with the very first one (MBB.begin()) and stopping before I itself.
+        bool FoundDef = false;
+        MachineBasicBlock::const_iterator II(I);
+        for (MachineBasicBlock::const_iterator Seek = MBB.begin(); Seek != II;
+             ++Seek) {
+          // TODO: What if we have superreg of VReg defined in masked block?
+          if (Seek->definesRegister(VReg)) {
+            FoundDef = true;
+            break;
+          }
+        }
+
+        if (FoundDef) {
+          // Iterate over all the instructions in the function looking for
+          // the V_READFIRSTLANE/V_READLANE destination register users.
+          // For V_READLANE any user counts; otherwise only users for which
+          // TII->isScalarUnit() holds.
+          for (const MachineBasicBlock &BB : *MF) {
+            for (auto &BI : BB.instrs()) {
+              if (TII->isScalarUnit(BI) ||
+                  (I->getOpcode() == AMDGPU::V_READLANE_B32)) {
+                for (const MachineOperand &U : BI.uses()) {
+                  if (U.isReg() && U.readsReg()) {
+                    unsigned Use = U.getReg();
+                    if ((Use == Reg) || TRI->isSubRegister(Use, Reg))
+                      return true;
+                  }
+                }
+              }
+            }
+          }
+        }
+        // NOTE(review): this leaves the enclosing function as soon as a
+        // V_READFIRSTLANE/V_READLANE is seen, so the instructions after it
+        // are never examined -- confirm that this is intended.
+        return false;
+      }
+
       if (I->isInlineAsm()) {
         const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
         const char *AsmStr = I->getOperand(0).getSymbolName();
Index: test/CodeGen/AMDGPU/readlane_exec0.mir
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/readlane_exec0.mir
@@ -0,0 +1,36 @@
+# RUN: llc -o - %s -march=amdgcn -mcpu=fiji -run-pass=si-insert-skips -verify-machineinstrs | FileCheck -check-prefix=GCN %s
+
+# GCN: bb.0
+# GCN: S_CBRANCH_EXECZ %bb.2
+
+---
+name: readlane_exec0
+
+body: |
+  bb.0:
+    successors: %bb.1, %bb.2
+    liveins: %vgpr1_vgpr2:0x00000001, %vgpr2_vgpr3:0x00000003
+
+    %vgpr4 = V_AND_B32_e32 1, %vgpr1, implicit %exec
+    V_CMP_EQ_U32_e32 1, killed %vgpr4, implicit-def %vcc, implicit %exec
+    %sgpr0_sgpr1 = S_AND_SAVEEXEC_B64 killed %vcc, implicit-def %exec, implicit-def %scc, implicit %exec
+    SI_MASK_BRANCH %bb.2, implicit %exec
+    S_BRANCH %bb.1
+
+  bb.1:
+
+    %vgpr12 = V_MOV_B32_e32 0, implicit %exec
+    %vgpr12_vgpr13 = V_LSHLREV_B64 2, %vgpr11_vgpr12, implicit %exec
+    %vgpr1 = V_ADD_I32_e32 %sgpr12, killed %vgpr12, implicit-def %vcc, implicit %exec
+    %vgpr5 = V_MOV_B32_e32 %sgpr13, implicit %exec, implicit %exec
+    %vgpr5 = V_ADDC_U32_e32 killed %vgpr5, killed %vgpr13, implicit-def dead %vcc, implicit killed %vcc, implicit %exec
+    %sgpr16 = V_READFIRSTLANE_B32 killed %vgpr1, implicit %exec
+    %sgpr17 = V_READFIRSTLANE_B32 killed %vgpr5, implicit %exec
+    %sgpr16 = S_LOAD_DWORD_IMM killed %sgpr16_sgpr17, 0, 0
+    S_WAITCNT 127
+    %vgpr23 = V_XOR_B32_e32 killed %sgpr16, killed %vgpr23, implicit %exec
+
+  bb.2:
+
+    %exec = S_OR_B64 %exec, killed %sgpr0_sgpr1, implicit-def %scc
+...