Index: lib/Target/AMDGPU/SIWholeQuadMode.cpp =================================================================== --- lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -122,6 +122,7 @@ DenseMap Instructions; DenseMap Blocks; std::map, LaneBitmask> WQMValues; + SmallVector PreferExact; SmallVector LiveMaskQueries; // Tracking of feasible instruction insertion points. @@ -148,6 +149,7 @@ std::vector &Worklist); void propagateBlock(const MachineBasicBlock &MBB, std::vector &Worklist); + void propagate(std::vector &Worklist); char analyzeFunction(MachineFunction &MF); void resetInsertCandidates(bool PreferLate); @@ -325,6 +327,7 @@ // in a quad, they just require all inputs of a quad to have been // computed for derivatives. markUsesWQM(MI, Worklist); + PreferExact.push_back(&MI); GlobalFlags |= StateWQM; } else if (TII->get(Opcode).mayStore() && (MI.getDesc().TSFlags & SIInstrFlags::VM_CNT)) { @@ -350,6 +353,11 @@ } } } + + // Vector memory instructions prefer to be run in exact mode even when + // they're loads, to save a bit of memory bandwidth where possible. 
+ if (MI.getDesc().TSFlags & SIInstrFlags::VM_CNT) + PreferExact.push_back(&MI); } } @@ -480,10 +488,7 @@ Blocks[&MBB].Propagated = BI.Needs; } -char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) { - std::vector Worklist; - char GlobalFlags = scanInstructions(MF, Worklist); - +void SIWholeQuadMode::propagate(std::vector &Worklist) { while (!Worklist.empty()) { WorkItem WI = Worklist.back(); Worklist.pop_back(); @@ -495,6 +500,22 @@ else propagateValue(WI.V, Worklist); } +} + +char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) { + std::vector Worklist; + char GlobalFlags = scanInstructions(MF, Worklist); + + propagate(Worklist); + + for (const MachineInstr *MI : PreferExact) { + if (!Instructions[MI].Needs && !(Instructions[MI].OutNeeds & StateWQM)) { + markInstruction(*MI, StateExact, Worklist); + GlobalFlags |= StateExact; + } + } + + propagate(Worklist); return GlobalFlags; } @@ -771,6 +792,7 @@ WQMValues.clear(); Blocks.clear(); LiveMaskQueries.clear(); + PreferExact.clear(); const SISubtarget &ST = MF.getSubtarget(); Index: test/CodeGen/AMDGPU/wqm.ll =================================================================== --- test/CodeGen/AMDGPU/wqm.ll +++ test/CodeGen/AMDGPU/wqm.ll @@ -12,11 +12,14 @@ ret <4 x float> %tex } -; Check that WQM is triggered by image samples and left untouched for loads... +; Check that WQM is triggered by image samples and then disabled again if +; the rest of the shader doesn't care. ; ;CHECK-LABEL: {{^}}test2: ;CHECK-NEXT: ; %main_body +;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec ;CHECK-NEXT: s_wqm_b64 exec, exec +;CHECK: s_and_b64 exec, exec, [[ORIG]] ;CHECK: image_sample ;CHECK-NOT: exec ;CHECK: _load_dword v0, @@ -30,7 +33,7 @@ ret float %data } -; ... but disabled for stores (and, in this simple case, not re-enabled). +; Check that WQM is disabled for stores (and, in this simple case, not re-enabled). 
; ;CHECK-LABEL: {{^}}test3: ;CHECK-NEXT: ; %main_body @@ -366,6 +369,8 @@ ; CHECK: s_and_b64 exec, exec, [[ORIG]] ; CHECK: _store ; CHECK: s_wqm_b64 exec, exec +; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 1 +; CHECK: s_and_b64 exec, exec, [[ORIG]] ; CHECK: image_sample ; ; Early coalescing merges %c into a 64 bit VGPR pair, so correctness requires