Index: lib/Target/AMDGPU/SIInsertSkips.cpp
===================================================================
--- lib/Target/AMDGPU/SIInsertSkips.cpp
+++ lib/Target/AMDGPU/SIInsertSkips.cpp
@@ -132,6 +132,34 @@
           I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ)
         return true;
 
+      // The destination register of V_READFIRSTLANE/V_READLANE may be used as
+      // an operand by a SALU instruction. When the exec mask is zero the
+      // vector instruction defining the register is not executed, so the
+      // scalar instruction would operate on undefined data. Scan up to the
+      // block's first terminator and avoid predicated execution if a scalar
+      // instruction reads the destination register or a register containing it.
+      if (I->getOpcode() == AMDGPU::V_READFIRSTLANE_B32 ||
+          I->getOpcode() == AMDGPU::V_READLANE_B32) {
+        const MachineOperand *Def = I->defs().begin();
+        unsigned Reg = Def->getReg();
+        MachineBasicBlock::const_iterator E = MBB.getFirstTerminator();
+        MachineBasicBlock::const_iterator II(I);
+        while (++II != E) {
+          if (!TII->isScalarUnit(*II))
+            continue;
+          for (const MachineOperand &U : II->uses()) {
+            if (!U.isReg() || !U.readsReg())
+              continue;
+            unsigned Use = U.getReg();
+            if (Use == Reg || TRI->isSubRegister(Use, Reg))
+              return true;
+          }
+        }
+        // No scalar user in this block. Fall through to the checks below
+        // rather than returning: a 'return false' here would leave the
+        // function at the first such instruction and bypass them.
+      }
+
       if (I->isInlineAsm()) {
         const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
         const char *AsmStr = I->getOperand(0).getSymbolName();
Index: test/CodeGen/AMDGPU/readlane_exec0.mir
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/readlane_exec0.mir
@@ -0,0 +1,33 @@
+# RUN: llc -o - %s -march=amdgcn -mcpu=fiji -run-pass=si-insert-skips | FileCheck %s
+
+# The V_READFIRSTLANE_B32 results in bb.1 feed a scalar load, so bb.0 must
+# get an S_CBRANCH_EXECZ that jumps over bb.1 to bb.2.
+# CHECK-LABEL: bb.0
+# CHECK: S_CBRANCH_EXECZ %bb.2
+
+---
+name: readlane_exec0
+
+body: |
+  bb.0:
+    successors: %bb.1, %bb.2
+    liveins: %vgpr1_vgpr2:0x00000001, %vgpr2_vgpr3:0x00000003
+
+    %vgpr4 = V_AND_B32_e32 1, %vgpr1, implicit %exec
+    V_CMP_EQ_U32_e32 1, killed %vgpr4, implicit-def %vcc, implicit %exec
+    %sgpr0_sgpr1 = S_AND_SAVEEXEC_B64 killed %vcc, implicit-def %exec, implicit-def %scc, implicit %exec
+    SI_MASK_BRANCH %bb.2, implicit %exec
+    S_BRANCH %bb.1
+
+  bb.1:
+
+    %sgpr10 = V_READFIRSTLANE_B32 %vgpr2, implicit %exec
+    %sgpr11 = V_READFIRSTLANE_B32 %vgpr3, implicit %exec
+    %sgpr10 = S_LOAD_DWORD_IMM killed %sgpr10_sgpr11, 0, 0
+    S_WAITCNT 127
+    %vgpr0 = V_XOR_B32_e32 killed %sgpr10, killed %vgpr0, implicit %exec
+
+  bb.2:
+
+    %exec = S_OR_B64 %exec, killed %sgpr0_sgpr1, implicit-def %scc
+...