Index: lib/Target/AMDGPU/SIInsertSkips.cpp
===================================================================
--- lib/Target/AMDGPU/SIInsertSkips.cpp
+++ lib/Target/AMDGPU/SIInsertSkips.cpp
@@ -133,28 +133,10 @@
           I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ)
         return true;
 
-      // V_READFIRSTLANE/V_READLANE destination register may be used as operand
-      // by some SALU instruction. If exec mask is zero vector instruction
-      // defining the register that is used by the scalar one is not executed
-      // and scalar instruction will operate on undefined data. For
-      // V_READFIRSTLANE/V_READLANE we should avoid predicated execution.
-      if ((I->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) ||
-          (I->getOpcode() == AMDGPU::V_READLANE_B32)) {
+      if (TII->hasUnwantedEffectsWhenEXECEmpty(*I))
         return true;
-      }
-
-      if (I->isInlineAsm()) {
-        const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
-        const char *AsmStr = I->getOperand(0).getSymbolName();
-
-        // inlineasm length estimate is number of bytes assuming the longest
-        // instruction.
-        uint64_t MaxAsmSize = TII->getInlineAsmLength(AsmStr, *MAI);
-        NumInstr += MaxAsmSize / MAI->getMaxInstLength();
-      } else {
-        ++NumInstr;
-      }
+      ++NumInstr;
 
       if (NumInstr >= SkipThreshold)
         return true;
     }
Index: lib/Target/AMDGPU/SIInstrInfo.h
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.h
+++ lib/Target/AMDGPU/SIInstrInfo.h
@@ -591,6 +591,9 @@
     return !RI.isSGPRReg(MRI, Dest);
   }
 
+  /// Whether we must prevent this instruction from executing with EXEC = 0.
+  bool hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const;
+
   bool isInlineConstant(const APInt &Imm) const;
 
   bool isInlineConstant(const MachineOperand &MO, uint8_t OperandType) const;
Index: lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.cpp
+++ lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2295,6 +2295,36 @@
          changesVGPRIndexingMode(MI);
 }
 
+bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const {
+  unsigned Opcode = MI.getOpcode();
+
+  if (MI.mayStore() && isSMRD(MI))
+    return true; // scalar store or atomic
+
+  // These instructions cause shader I/O that may cause hardware lockups
+  // when executed with an empty EXEC mask.
+  //
+  // Note: exp with VM = DONE = 0 is automatically skipped by hardware when
+  //       EXEC = 0, but checking for that case here seems not worth it
+  //       given the typical code patterns.
+  if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT ||
+      Opcode == AMDGPU::EXP || Opcode == AMDGPU::EXP_DONE)
+    return true;
+
+  if (MI.isInlineAsm())
+    return true; // conservative assumption
+
+  // These are like SALU instructions in terms of effects, so it's questionable
+  // whether we should return true for those.
+  //
+  // However, executing them with EXEC = 0 causes them to operate on undefined
+  // data, which we avoid by returning true here.
+  if (Opcode == AMDGPU::V_READFIRSTLANE_B32 || Opcode == AMDGPU::V_READLANE_B32)
+    return true;
+
+  return false;
+}
+
 bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
   switch (Imm.getBitWidth()) {
   case 32:
Index: test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll
@@ -480,5 +480,65 @@
   ret void
 }
 
+; GCN-LABEL: {{^}}test_if_export_f32:
+; GCN: s_cbranch_execz
+; GCN: exp
+define amdgpu_ps void @test_if_export_f32(i32 %flag, float %x, float %y, float %z, float %w) #0 {
+  %cc = icmp eq i32 %flag, 0
+  br i1 %cc, label %end, label %exp
+
+exp:
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float %z, float %w, i1 false, i1 false)
+  br label %end
+
+end:
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_if_export_vm_f32:
+; GCN: s_cbranch_execz
+; GCN: exp
+define amdgpu_ps void @test_if_export_vm_f32(i32 %flag, float %x, float %y, float %z, float %w) #0 {
+  %cc = icmp eq i32 %flag, 0
+  br i1 %cc, label %end, label %exp
+
+exp:
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float %z, float %w, i1 false, i1 true)
+  br label %end
+
+end:
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_if_export_done_f32:
+; GCN: s_cbranch_execz
+; GCN: exp
+define amdgpu_ps void @test_if_export_done_f32(i32 %flag, float %x, float %y, float %z, float %w) #0 {
+  %cc = icmp eq i32 %flag, 0
+  br i1 %cc, label %end, label %exp
+
+exp:
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float %z, float %w, i1 true, i1 false)
+  br label %end
+
+end:
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_if_export_vm_done_f32:
+; GCN: s_cbranch_execz
+; GCN: exp
+define amdgpu_ps void @test_if_export_vm_done_f32(i32 %flag, float %x, float %y, float %z, float %w) #0 {
+  %cc = icmp eq i32 %flag, 0
+  br i1 %cc, label %end, label %exp
+
+exp:
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float %z, float %w, i1 true, i1 true)
+  br label %end
+
+end:
+  ret void
+}
+
 attributes #0 = { nounwind }
 attributes #1 = { nounwind inaccessiblememonly }
Index: test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.ll
@@ -136,6 +136,21 @@
   ret void
 }
 
+; GCN-LABEL: {{^}}if_sendmsg:
+; GCN: s_cbranch_execz
+; GCN: s_sendmsg sendmsg(MSG_GS_DONE, GS_OP_NOP)
+define amdgpu_gs void @if_sendmsg(i32 %flag) #0 {
+  %cc = icmp eq i32 %flag, 0
+  br i1 %cc, label %sendmsg, label %end
+
+sendmsg:
+  call void @llvm.amdgcn.s.sendmsg(i32 3, i32 0)
+  br label %end
+
+end:
+  ret void
+}
+
 declare void @llvm.amdgcn.s.sendmsg(i32, i32) #0
 declare void @llvm.amdgcn.s.sendmsghalt(i32, i32) #0
Index: test/CodeGen/AMDGPU/skip-if-dead.ll
===================================================================
--- test/CodeGen/AMDGPU/skip-if-dead.ll
+++ test/CodeGen/AMDGPU/skip-if-dead.ll
@@ -72,10 +72,18 @@
 ; CHECK-LABEL: {{^}}test_kill_depth_var_x2_instructions:
 ; CHECK-NEXT: ; %bb.0:
 ; CHECK-NEXT: v_cmpx_le_f32_e32 vcc, 0, v0
+; CHECK-NEXT: s_cbranch_execnz BB6_2
 ; CHECK-NEXT: ; %bb.1:
+; CHECK-NEXT: exp
+; CHECK-NEXT: s_endpgm
+; CHECK-NEXT: BB6_2:
 ; CHECK: v_mov_b32_e64 v7, -1
 ; CHECK: v_cmpx_le_f32_e32 vcc, 0, v7
-; CHECK-NEXT: ; %bb.2:
+; CHECK-NEXT: s_cbranch_execnz BB6_4
+; CHECK-NEXT: ; %bb.3:
+; CHECK-NEXT: exp
+; CHECK-NEXT: s_endpgm
+; CHECK-NEXT: BB6_4:
 ; CHECK-NEXT: s_endpgm
 define amdgpu_ps void @test_kill_depth_var_x2_instructions(float %x) #0 {
   call void @llvm.AMDGPU.kill(float %x)