Index: llvm/trunk/lib/Target/AMDGPU/SIWholeQuadMode.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ llvm/trunk/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -154,6 +154,7 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF, std::vector &Worklist) { char GlobalFlags = 0; + bool WQMOutputs = MF.getFunction()->hasFnAttribute("amdgpu-ps-wqm-outputs"); for (auto BI = MF.begin(), BE = MF.end(); BI != BE; ++BI) { MachineBasicBlock &MBB = *BI; @@ -161,7 +162,7 @@ for (auto II = MBB.begin(), IE = MBB.end(); II != IE; ++II) { MachineInstr &MI = *II; unsigned Opcode = MI.getOpcode(); - char Flags; + char Flags = 0; if (TII->isWQM(Opcode) || TII->isDS(Opcode)) { Flags = StateWQM; @@ -175,15 +176,39 @@ ExecExports.push_back(&MI); } else if (Opcode == AMDGPU::SI_PS_LIVE) { LiveMaskQueries.push_back(&MI); + } else if (WQMOutputs) { + // The function is in machine SSA form, which means that physical + // VGPRs correspond to shader inputs and outputs. Inputs are + // only used, outputs are only defined. + for (const MachineOperand &MO : MI.defs()) { + if (!MO.isReg()) + continue; + + unsigned Reg = MO.getReg(); + + if (!TRI->isVirtualRegister(Reg) && + TRI->hasVGPRs(TRI->getPhysRegClass(Reg))) { + Flags = StateWQM; + break; + } + } } - continue; + if (!Flags) + continue; } Instructions[&MI].Needs = Flags; Worklist.push_back(&MI); GlobalFlags |= Flags; } + + if (WQMOutputs && MBB.succ_empty()) { + // This is a prolog shader. Make sure we go back to exact mode at the end. + Blocks[&MBB].OutNeeds = StateExact; + Worklist.push_back(&MBB); + GlobalFlags |= StateExact; + } } return GlobalFlags; Index: llvm/trunk/test/CodeGen/AMDGPU/wqm.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/wqm.ll +++ llvm/trunk/test/CodeGen/AMDGPU/wqm.ll @@ -332,6 +332,19 @@ ret <4 x float> %tex } +; Check prolog shaders. +; +; CHECK-LABEL: {{^}}test_prolog_1: +; CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec +; CHECK: s_wqm_b64 exec, exec +; CHECK: v_add_f32_e32 v0, +; CHECK: s_and_b64 exec, exec, [[ORIG]] +define amdgpu_ps float @test_prolog_1(float %a, float %b) #4 { +main_body: + %s = fadd float %a, %b + ret float %s +} + declare void @llvm.amdgcn.image.store.v4i32(<4 x float>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1 declare <4 x float> @llvm.amdgcn.image.load.v4i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #2 @@ -345,3 +358,4 @@ attributes #1 = { nounwind } attributes #2 = { nounwind readonly } attributes #3 = { nounwind readnone } +attributes #4 = { "amdgpu-ps-wqm-outputs" }