Index: lib/Target/AMDGPU/SIWholeQuadMode.cpp =================================================================== --- lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -118,6 +118,7 @@ DenseMap<MachineInstr *, InstrInfo> Instructions; DenseMap<MachineBasicBlock *, BlockInfo> Blocks; std::map, LaneBitmask> WQMValues; + SmallVector<MachineInstr *, 2> PreferExact; SmallVector<MachineInstr *, 1> LiveMaskQueries; void printInfo(MachineFunction &MF) const; @@ -134,6 +135,7 @@ std::vector<WorkItem> &Worklist); void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist); void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist); + void propagate(std::vector<WorkItem> &Worklist); char analyzeFunction(MachineFunction &MF); bool requiresCorrectState(const MachineInstr &MI) const; @@ -322,6 +324,7 @@ // in a quad, they just require all inputs of a quad to have been // computed for derivatives. markUsesWQM(MI, Worklist); + PreferExact.push_back(&MI); GlobalFlags |= StateWQM; } else if (MI.mayStore() && TII->usesVM_CNT(MI)) { markInstruction(MI, StateExact, Worklist); @@ -346,6 +349,11 @@ } } } + + // Vector memory instructions prefer to be run in exact mode even when + // they're loads, to save a bit of memory bandwidth where possible. 
+ if (MI.getDesc().TSFlags & SIInstrFlags::VM_CNT) + PreferExact.push_back(&MI); } } @@ -476,10 +484,7 @@ Blocks[&MBB].Propagated = BI.Needs; } -char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) { - std::vector<WorkItem> Worklist; - char GlobalFlags = scanInstructions(MF, Worklist); - +void SIWholeQuadMode::propagate(std::vector<WorkItem> &Worklist) { while (!Worklist.empty()) { WorkItem WI = Worklist.back(); Worklist.pop_back(); @@ -491,6 +496,22 @@ else propagateValue(WI.V, Worklist); } +} + +char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) { + std::vector<WorkItem> Worklist; + char GlobalFlags = scanInstructions(MF, Worklist); + + propagate(Worklist); + + for (MachineInstr *MI : PreferExact) { + if (!Instructions[MI].Needs && !(Instructions[MI].OutNeeds & StateWQM)) { + markInstruction(*MI, StateExact, Worklist); + GlobalFlags |= StateExact; + } + } + + propagate(Worklist); return GlobalFlags; } @@ -740,6 +761,7 @@ WQMValues.clear(); Blocks.clear(); LiveMaskQueries.clear(); + PreferExact.clear(); const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); Index: test/CodeGen/AMDGPU/llvm.SI.image.sample.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.SI.image.sample.ll +++ test/CodeGen/AMDGPU/llvm.SI.image.sample.ll @@ -6,7 +6,7 @@ ;CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf define amdgpu_ps void @sample() { main_body: - %r = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> , <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 %r1 = extractelement <4 x float> %r, i32 1 %r2 = extractelement <4 x float> %r, i32 2 @@ -20,7 +20,7 @@ ;CHECK: image_sample_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, 
{{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf define amdgpu_ps void @sample_cl() { main_body: - %r = call <4 x float> @llvm.SI.image.sample.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r = call <4 x float> @llvm.SI.image.sample.cl.v4i32(<4 x i32> , <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 %r1 = extractelement <4 x float> %r, i32 1 %r2 = extractelement <4 x float> %r, i32 2 @@ -76,7 +76,7 @@ ;CHECK: image_sample_b {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf define amdgpu_ps void @sample_b() { main_body: - %r = call <4 x float> @llvm.SI.image.sample.b.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r = call <4 x float> @llvm.SI.image.sample.b.v4i32(<4 x i32> , <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 %r1 = extractelement <4 x float> %r, i32 1 %r2 = extractelement <4 x float> %r, i32 2 @@ -90,7 +90,7 @@ ;CHECK: image_sample_b_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf define amdgpu_ps void @sample_b_cl() { main_body: - %r = call <4 x float> @llvm.SI.image.sample.b.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r = call <4 x float> @llvm.SI.image.sample.b.cl.v4i32(<4 x i32> , <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 %r1 = extractelement <4 x float> %r, i32 1 %r2 = extractelement <4 x float> %r, i32 2 @@ -146,7 +146,7 @@ ;CHECK: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf 
define amdgpu_ps void @sample_c() { main_body: - %r = call <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r = call <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32> , <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 %r1 = extractelement <4 x float> %r, i32 1 %r2 = extractelement <4 x float> %r, i32 2 @@ -160,7 +160,7 @@ ;CHECK: image_sample_c_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf define amdgpu_ps void @sample_c_cl() { main_body: - %r = call <4 x float> @llvm.SI.image.sample.c.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r = call <4 x float> @llvm.SI.image.sample.c.cl.v4i32(<4 x i32> , <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 %r1 = extractelement <4 x float> %r, i32 1 %r2 = extractelement <4 x float> %r, i32 2 @@ -216,7 +216,7 @@ ;CHECK: image_sample_c_b {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf define amdgpu_ps void @sample_c_b() { main_body: - %r = call <4 x float> @llvm.SI.image.sample.c.b.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r = call <4 x float> @llvm.SI.image.sample.c.b.v4i32(<4 x i32> , <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 %r1 = extractelement <4 x float> %r, i32 1 %r2 = extractelement <4 x float> %r, i32 2 @@ -230,7 +230,7 @@ ;CHECK: image_sample_c_b_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf define amdgpu_ps void @sample_c_b_cl() { 
main_body: - %r = call <4 x float> @llvm.SI.image.sample.c.b.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r = call <4 x float> @llvm.SI.image.sample.c.b.cl.v4i32(<4 x i32> , <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 %r1 = extractelement <4 x float> %r, i32 1 %r2 = extractelement <4 x float> %r, i32 2 Index: test/CodeGen/AMDGPU/llvm.SI.image.sample.o.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.SI.image.sample.o.ll +++ test/CodeGen/AMDGPU/llvm.SI.image.sample.o.ll @@ -6,7 +6,7 @@ ;CHECK: image_sample_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf define amdgpu_ps void @sample() { main_body: - %r = call <4 x float> @llvm.SI.image.sample.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r = call <4 x float> @llvm.SI.image.sample.o.v4i32(<4 x i32> , <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 %r1 = extractelement <4 x float> %r, i32 1 %r2 = extractelement <4 x float> %r, i32 2 @@ -20,7 +20,7 @@ ;CHECK: image_sample_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf define amdgpu_ps void @sample_cl() { main_body: - %r = call <4 x float> @llvm.SI.image.sample.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r = call <4 x float> @llvm.SI.image.sample.cl.o.v4i32(<4 x i32> , <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 %r1 = extractelement <4 x float> %r, i32 1 %r2 = extractelement <4 x float> %r, i32 2 @@ -76,7 +76,7 @@ 
;CHECK: image_sample_b_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf define amdgpu_ps void @sample_b() { main_body: - %r = call <4 x float> @llvm.SI.image.sample.b.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r = call <4 x float> @llvm.SI.image.sample.b.o.v4i32(<4 x i32> , <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 %r1 = extractelement <4 x float> %r, i32 1 %r2 = extractelement <4 x float> %r, i32 2 @@ -90,7 +90,7 @@ ;CHECK: image_sample_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf define amdgpu_ps void @sample_b_cl() { main_body: - %r = call <4 x float> @llvm.SI.image.sample.b.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r = call <4 x float> @llvm.SI.image.sample.b.cl.o.v4i32(<4 x i32> , <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 %r1 = extractelement <4 x float> %r, i32 1 %r2 = extractelement <4 x float> %r, i32 2 @@ -146,7 +146,7 @@ ;CHECK: image_sample_c_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf define amdgpu_ps void @sample_c() { main_body: - %r = call <4 x float> @llvm.SI.image.sample.c.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r = call <4 x float> @llvm.SI.image.sample.c.o.v4i32(<4 x i32> , <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 %r1 = extractelement <4 x float> %r, i32 1 %r2 = extractelement <4 x float> %r, i32 2 @@ -160,7 +160,7 @@ ;CHECK: image_sample_c_cl_o 
{{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf define amdgpu_ps void @sample_c_cl() { main_body: - %r = call <4 x float> @llvm.SI.image.sample.c.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r = call <4 x float> @llvm.SI.image.sample.c.cl.o.v4i32(<4 x i32> , <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 %r1 = extractelement <4 x float> %r, i32 1 %r2 = extractelement <4 x float> %r, i32 2 @@ -216,7 +216,7 @@ ;CHECK: image_sample_c_b_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf define amdgpu_ps void @sample_c_b() { main_body: - %r = call <4 x float> @llvm.SI.image.sample.c.b.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r = call <4 x float> @llvm.SI.image.sample.c.b.o.v4i32(<4 x i32> , <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 %r1 = extractelement <4 x float> %r, i32 1 %r2 = extractelement <4 x float> %r, i32 2 @@ -230,7 +230,7 @@ ;CHECK: image_sample_c_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf define amdgpu_ps void @sample_c_b_cl() { main_body: - %r = call <4 x float> @llvm.SI.image.sample.c.b.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r = call <4 x float> @llvm.SI.image.sample.c.b.cl.o.v4i32(<4 x i32> , <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 %r1 = extractelement <4 x float> %r, i32 1 %r2 = extractelement <4 x float> %r, i32 2 Index: test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll 
=================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll @@ -17,8 +17,10 @@ ; CHECK-LABEL: {{^}}test2: ; CHECK: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec +; CHECK-DAG: s_mov_b64 [[COPY:s\[[0-9]+:[0-9]+\]]], [[LIVE]] +; TODO - figure out why MachineCopyPropagation doesn't eliminate the above ; CHECK-DAG: s_wqm_b64 exec, exec -; CHECK-DAG: v_cndmask_b32_e64 [[VAR:v[0-9]+]], 0, 1, [[LIVE]] +; CHECK-DAG: v_cndmask_b32_e64 [[VAR:v[0-9]+]], 0, 1, [[COPY]] ; CHECK: image_sample v0, [[VAR]], define amdgpu_ps float @test2() { %live = call i1 @llvm.amdgcn.ps.live() @@ -32,8 +34,9 @@ ; CHECK-LABEL: {{^}}test3: ; CHECK: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec +; CHECK-DAG: s_mov_b64 [[COPY:s\[[0-9]+:[0-9]+\]]], [[LIVE]] ; CHECK-DAG: s_wqm_b64 exec, exec -; CHECK-DAG: s_xor_b64 [[HELPER:s\[[0-9]+:[0-9]+\]]], [[LIVE]], -1 +; CHECK-DAG: s_xor_b64 [[HELPER:s\[[0-9]+:[0-9]+\]]], [[COPY]], -1 ; CHECK_DAG: s_and_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]], [[HELPER]] ; CHECK: ; %dead define amdgpu_ps float @test3(i32 %in) { Index: test/CodeGen/AMDGPU/si-scheduler.ll =================================================================== --- test/CodeGen/AMDGPU/si-scheduler.ll +++ test/CodeGen/AMDGPU/si-scheduler.ll @@ -8,9 +8,9 @@ ; The test checks the "si" machine scheduler pass works correctly. ; CHECK-LABEL: {{^}}main: -; CHECK: s_wqm ; CHECK: s_load_dwordx4 ; CHECK: s_load_dwordx8 +; CHECK: s_wqm ; CHECK: s_waitcnt lgkmcnt(0) ; CHECK: image_sample ; CHECK: s_waitcnt vmcnt(0) Index: test/CodeGen/AMDGPU/wqm.ll =================================================================== --- test/CodeGen/AMDGPU/wqm.ll +++ test/CodeGen/AMDGPU/wqm.ll @@ -12,11 +12,14 @@ ret <4 x float> %tex } -; Check that WQM is triggered by image samples and left untouched for loads... +; Check that WQM is triggered by image samples and then disabled again if the +; the rest of the shader doesn't care. 
; ;CHECK-LABEL: {{^}}test2: ;CHECK-NEXT: ; %main_body +;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec ;CHECK-NEXT: s_wqm_b64 exec, exec +;CHECK: s_and_b64 exec, exec, [[ORIG]] ;CHECK: image_sample ;CHECK-NOT: exec ;CHECK: _load_dword v0, @@ -30,7 +33,7 @@ ret float %data } -; ... but disabled for stores (and, in this simple case, not re-enabled). +; Check that WQM is disabled for stores (and, in this simple case, not re-enabled). ; ;CHECK-LABEL: {{^}}test3: ;CHECK-NEXT: ; %main_body @@ -366,6 +369,8 @@ ; CHECK: s_and_b64 exec, exec, [[ORIG]] ; CHECK: _store ; CHECK: s_wqm_b64 exec, exec +; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 1 +; CHECK: s_and_b64 exec, exec, [[ORIG]] ; CHECK: image_sample ; ; Early coalescing merges %c into a 64 bit VGPR pair, so correctness requires