Index: lib/Target/AMDGPU/SIWholeQuadMode.cpp
===================================================================
--- lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -100,9 +100,11 @@
   DenseMap<const MachineInstr *, InstrInfo> Instructions;
   DenseMap<MachineBasicBlock *, BlockInfo> Blocks;
-  SmallVector<MachineInstr *, 1> ExecExports;
   SmallVector<MachineInstr *, 1> LiveMaskQueries;
 
+  void printInfo();
+
+  void markUsesWQM(const MachineInstr &MI, std::vector<WorkItem> &Worklist);
   char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
   void propagateInstruction(const MachineInstr &MI, std::vector<WorkItem> &Worklist);
   void propagateBlock(const MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);
@@ -149,6 +151,76 @@
   return new SIWholeQuadMode;
 }
 
+static std::string stateString(int state) {
+  std::string Str;
+
+  if (state & StateWQM)
+    Str = "WQM";
+  if (state & StateExact) {
+    if (!Str.empty())
+      Str += "|";
+    Str += "Exact";
+  }
+
+  return Str;
+}
+
+void SIWholeQuadMode::printInfo() {
+  for (const auto &BII : Blocks) {
+    dbgs() << "\nBB#" << BII.first->getNumber() << ":\n";
+    dbgs() << "  InNeeds = " << stateString(BII.second.InNeeds)
+           << ", Needs = " << stateString(BII.second.Needs)
+           << ", OutNeeds = " << stateString(BII.second.OutNeeds) << "\n\n";
+
+    for (const MachineInstr &MI : *BII.first) {
+      auto III = Instructions.find(&MI);
+      if (III == Instructions.end())
+        continue;
+
+      dbgs() << "  " << MI;
+      dbgs() << "    Needs = " << stateString(III->second.Needs)
+             << ", OutNeeds = " << stateString(III->second.OutNeeds) << "\n";
+    }
+  }
+}
+
+/// Mark all instructions defining the uses in \p MI as WQM.
+void SIWholeQuadMode::markUsesWQM(const MachineInstr &MI,
+                                  std::vector<WorkItem> &Worklist) {
+  for (const MachineOperand &Use : MI.uses()) {
+    if (!Use.isReg() || !Use.isUse())
+      continue;
+
+    // At this point, physical registers appear as (shader) inputs or
+    // non-monolithic shader outputs. Following those makes no sense (and
+    // would in fact be incorrect when the same VGPR is used as both an
+    // output and an input that leads to a NeedsWQM instruction).
+    //
+    // Note: VCC appears e.g. in 64-bit addition with carry - theoretically we
+    // have to trace this, in practice it happens for 64-bit computations like
+    // pointers where both dwords are followed already anyway.
+    if (!TargetRegisterInfo::isVirtualRegister(Use.getReg()))
+      continue;
+
+    for (const MachineOperand &Def : MRI->def_operands(Use.getReg())) {
+      const MachineInstr *DefMI = Def.getParent();
+      InstrInfo &DefII = Instructions[DefMI];
+
+      // Obviously skip if DefMI is already flagged as NeedWQM.
+      //
+      // The instruction might also be flagged as NeedExact. This happens when
+      // the result of an atomic is used in a WQM computation. In this case,
+      // the atomic must not run for helper pixels and the WQM result is
+      // undefined.
+      if (DefII.Needs != 0)
+        continue;
+
+      DefII.Needs = StateWQM;
+      Worklist.push_back(DefMI);
+    }
+  }
+}
+
 // Scan instructions to determine which ones require an Exact execmask and
 // which ones seed WQM requirements.
 char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
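As background for the hunks below: a standalone C++ sketch of the worklist discipline that markUsesWQM relies on. The Inst struct, the Defs edges, and the instruction names are invented for the example; only the marking rule (flag each unflagged def of an input, queue it, repeat to a fixed point) mirrors the pass.

    #include <cstdio>
    #include <vector>

    enum { StateWQM = 1, StateExact = 2 };

    struct Inst {
      const char *Name;
      std::vector<Inst *> Defs; // instructions defining this one's inputs
      int Needs;
    };

    // Mirror of markUsesWQM: flag every unflagged def of MI's inputs as WQM
    // and queue it so its own inputs get visited in turn.
    static void markUsesWQM(Inst &MI, std::vector<Inst *> &Worklist) {
      for (Inst *Def : MI.Defs) {
        if (Def->Needs != 0) // already WQM, or Exact (e.g. an atomic): skip
          continue;
        Def->Needs = StateWQM;
        Worklist.push_back(Def);
      }
    }

    int main() {
      Inst Coord{"v_interp", {}, 0};
      Inst Mul{"v_mul", {&Coord}, 0};
      Inst Sample{"image_sample", {&Mul}, 0};

      std::vector<Inst *> Worklist;
      markUsesWQM(Sample, Worklist); // the sample seeds only its inputs
      while (!Worklist.empty()) {
        Inst *MI = Worklist.back();
        Worklist.pop_back();
        markUsesWQM(*MI, Worklist); // propagate transitively through defs
      }
      // Both inputs end up WQM; the sample itself stays unmarked, matching
      // the new scanInstructions behavior.
      std::printf("%s Needs=%d, %s Needs=%d, %s Needs=%d\n", Sample.Name,
                  Sample.Needs, Mul.Name, Mul.Needs, Coord.Name, Coord.Needs);
      return 0;
    }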
@@ -164,17 +236,19 @@
       unsigned Opcode = MI.getOpcode();
       char Flags = 0;
 
-      if (TII->isWQM(Opcode) || TII->isDS(Opcode)) {
+      if (TII->isDS(Opcode)) {
         Flags = StateWQM;
+      } else if (TII->isWQM(Opcode)) {
+        // Sampling instructions don't need to produce results for all pixels
+        // in a quad, they just require all inputs of a quad to have been
+        // computed for derivatives.
+        markUsesWQM(MI, Worklist);
+        GlobalFlags |= StateWQM;
       } else if (TII->get(Opcode).mayStore() &&
                  (MI.getDesc().TSFlags & SIInstrFlags::VM_CNT)) {
         Flags = StateExact;
       } else {
-        // Handle export instructions with the exec mask valid flag set
-        if (Opcode == AMDGPU::EXP) {
-          if (MI.getOperand(4).getImm() != 0)
-            ExecExports.push_back(&MI);
-        } else if (Opcode == AMDGPU::SI_PS_LIVE) {
+        if (Opcode == AMDGPU::SI_PS_LIVE) {
           LiveMaskQueries.push_back(&MI);
         } else if (WQMOutputs) {
           // The function is in machine SSA form, which means that physical
@@ -249,41 +323,8 @@
   // Propagate WQM flag to instruction inputs
   assert(II.Needs != (StateWQM | StateExact));
 
-  if (II.Needs != StateWQM)
-    return;
-
-  for (const MachineOperand &Use : MI.uses()) {
-    if (!Use.isReg() || !Use.isUse())
-      continue;
-
-    // At this point, physical registers appear as inputs or outputs
-    // and following them makes no sense (and would in fact be incorrect
-    // when the same VGPR is used as both an output and an input that leads
-    // to a NeedsWQM instruction).
-    //
-    // Note: VCC appears e.g. in 64-bit addition with carry - theoretically we
-    // have to trace this, in practice it happens for 64-bit computations like
-    // pointers where both dwords are followed already anyway.
-    if (!TargetRegisterInfo::isVirtualRegister(Use.getReg()))
-      continue;
-
-    for (const MachineOperand &Def : MRI->def_operands(Use.getReg())) {
-      const MachineInstr *DefMI = Def.getParent();
-      InstrInfo &DefII = Instructions[DefMI];
-
-      // Obviously skip if DefMI is already flagged as NeedWQM.
-      //
-      // The instruction might also be flagged as NeedExact. This happens when
-      // the result of an atomic is used in a WQM computation. In this case,
-      // the atomic must not run for helper pixels and the WQM result is
-      // undefined.
-      if (DefII.Needs != 0)
-        continue;
-
-      DefII.Needs = StateWQM;
-      Worklist.push_back(DefMI);
-    }
-  }
+  if (II.Needs == StateWQM)
+    markUsesWQM(MI, Worklist);
 }
 
 void SIWholeQuadMode::propagateBlock(const MachineBasicBlock &MBB,
@@ -384,9 +425,12 @@
   if (!isEntry && !(BI.Needs & StateExact) && BI.OutNeeds != StateExact)
     return;
 
+  DEBUG(dbgs() << "\nProcessing block BB#" << MBB.getNumber() << ":\n");
+
   unsigned SavedWQMReg = 0;
   bool WQMFromExec = isEntry;
   char State = isEntry ? StateExact : StateWQM;
+  MachineInstr *FirstNonWQM = nullptr;
 
   auto II = MBB.getFirstNonPHI(), IE = MBB.end();
   while (II != IE) {
@@ -412,21 +456,24 @@
       }
     }
 
+    DEBUG(dbgs() << "  " << MI);
+
     char Needs = 0;
     char OutNeeds = 0;
     auto InstrInfoIt = Instructions.find(&MI);
     if (InstrInfoIt != Instructions.end()) {
       Needs = InstrInfoIt->second.Needs;
       OutNeeds = InstrInfoIt->second.OutNeeds;
-
-      // Make sure to switch to Exact mode before the end of the block when
-      // Exact and only Exact is needed further downstream.
-      if (OutNeeds == StateExact && (MI.isBranch() || MI.isTerminator())) {
-        assert(Needs == 0);
-        Needs = StateExact;
-      }
     }
 
+    // Keep track of the first consecutive non-WQM instruction, so that we
+    // switch away from WQM as soon as possible, potentially saving a small
+    // bit of bandwidth on loads.
+    if (Needs == StateWQM)
+      FirstNonWQM = nullptr;
+    else if (!FirstNonWQM)
+      FirstNonWQM = &MI;
+
     // State switching
     if (Needs && State != Needs) {
       if (Needs == StateExact) {
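The effect of the FirstNonWQM bookkeeping can be seen in miniature: with an invented instruction sequence, the switch to Exact is hoisted to the first of a run of non-WQM instructions rather than placed immediately before the instruction that demands Exact. A minimal sketch, not the pass's real data structures:

    #include <cstdio>

    enum { StateWQM = 1, StateExact = 2 };

    struct MI { const char *Name; int Needs; };

    int main() {
      // A WQM sample, two instructions with no requirement, an Exact store.
      MI Block[] = {{"image_sample", StateWQM},
                    {"v_add", 0},
                    {"v_mul", 0},
                    {"buffer_store", StateExact}};

      const MI *FirstNonWQM = nullptr;
      for (const MI &I : Block) {
        if (I.Needs == StateWQM)
          FirstNonWQM = nullptr; // still inside a run of WQM instructions
        else if (!FirstNonWQM)
          FirstNonWQM = &I;      // first instruction after the WQM run

        if (I.Needs == StateExact) // hoist the switch to the run's start
          std::printf("switch to Exact before %s\n", FirstNonWQM->Name);
      }
      return 0; // prints: switch to Exact before v_add
    }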
@@ -435,7 +482,7 @@
           if (!WQMFromExec && (OutNeeds & StateWQM))
             SavedWQMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
 
-          toExact(MBB, &MI, SavedWQMReg, LiveMaskReg);
+          toExact(MBB, FirstNonWQM, SavedWQMReg, LiveMaskReg);
         } else {
           assert(WQMFromExec == (SavedWQMReg == 0));
           toWQM(MBB, &MI, SavedWQMReg);
@@ -453,7 +500,9 @@
     assert(WQMFromExec == (SavedWQMReg == 0));
     toWQM(MBB, MBB.end(), SavedWQMReg);
   } else if (BI.OutNeeds == StateExact && State != StateExact) {
-    toExact(MBB, MBB.end(), 0, LiveMaskReg);
+    toExact(MBB, FirstNonWQM ? MachineBasicBlock::iterator(FirstNonWQM)
+                             : MBB.getFirstTerminator(),
+            0, LiveMaskReg);
   }
 }
 
@@ -473,7 +522,6 @@
 
   Instructions.clear();
   Blocks.clear();
-  ExecExports.clear();
   LiveMaskQueries.clear();
 
   const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
@@ -509,6 +557,8 @@
     return true;
   }
 
+  DEBUG(printInfo());
+
   lowerLiveMaskQueries(LiveMaskReg);
 
   EntryMI = nullptr;
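For reference when reading the reordered CHECK lines in the tests below: s_wqm_b64 derives whole-quad mode from the exec mask by enabling every lane of any quad that has at least one live lane, so helper lanes run and derivatives have valid inputs. A scalar model of that computation, written from a straightforward reading of the ISA semantics rather than taken from this patch:

    #include <cstdint>
    #include <cstdio>

    // For each quad of four consecutive lanes: any live lane turns the
    // whole quad live.
    static uint64_t wqm64(uint64_t exec) {
      uint64_t result = 0;
      for (int quad = 0; quad < 16; ++quad) {
        uint64_t mask = 0xFull << (4 * quad);
        if (exec & mask)
          result |= mask;
      }
      return result;
    }

    int main() {
      uint64_t exec = 0x21; // lanes 0 and 5 live
      std::printf("wqm = 0x%016llx\n", (unsigned long long)wqm64(exec));
      // prints wqm = 0x00000000000000ff: quads 0 and 1 fully enabled
      return 0;
    }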
Index: test/CodeGen/AMDGPU/wqm.ll
===================================================================
--- test/CodeGen/AMDGPU/wqm.ll
+++ test/CodeGen/AMDGPU/wqm.ll
@@ -36,8 +36,8 @@
 ;CHECK-NEXT: ; %main_body
 ;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
 ;CHECK-NEXT: s_wqm_b64 exec, exec
-;CHECK: image_sample
 ;CHECK: s_and_b64 exec, exec, [[ORIG]]
+;CHECK: image_sample
 ;CHECK: store
 ;CHECK-NOT: exec
 ;CHECK: .size test3
@@ -62,14 +62,17 @@
 ;CHECK: s_and_b64 exec, exec, [[ORIG]]
 ;CHECK: store
 ;CHECK: s_wqm_b64 exec, exec
-;CHECK: image_sample v[0:3], [[MUL]], s[0:7], s[8:11] dmask:0xf
+;CHECK: image_sample
+;CHECK: image_sample
 define amdgpu_ps <4 x float> @test4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %d, float %data) {
 main_body:
   %c.1 = mul i32 %c, %d
   %gep = getelementptr float, float addrspace(1)* %ptr, i32 %c.1
   store float %data, float addrspace(1)* %gep
   %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %c.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-  ret <4 x float> %tex
+  %tex.1 = bitcast <4 x float> %tex to <4 x i32>
+  %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  ret <4 x float> %dtex
 }
 
 ; Check a case of one branch of an if-else requiring WQM, the other requiring
@@ -89,6 +92,7 @@
 ;CHECK: s_mov_b64 exec, [[SAVED]]
 ;CHECK: %IF
 ;CHECK: image_sample
+;CHECK: image_sample
 define amdgpu_ps float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %z, float %data) {
 main_body:
   %cmp = icmp eq i32 %z, 0
@@ -96,7 +100,9 @@
 
 IF:
   %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-  %data.if = extractelement <4 x float> %tex, i32 0
+  %tex.1 = bitcast <4 x float> %tex to <4 x i32>
+  %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %data.if = extractelement <4 x float> %dtex, i32 0
   br label %END
 
 ELSE:
@@ -117,6 +123,7 @@
 ;CHECK-NEXT: s_wqm_b64 exec, exec
 ;CHECK: %IF
 ;CHECK: image_sample
+;CHECK: image_sample
 ;CHECK: %Flow
 ;CHECK-NEXT: s_or_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]],
 ;CHECK-NEXT: s_and_b64 exec, exec, [[ORIG]]
@@ -136,7 +143,9 @@
 IF:
   %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-  %data.if = extractelement <4 x float> %tex, i32 0
+  %tex.1 = bitcast <4 x float> %tex to <4 x i32>
+  %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %data.if = extractelement <4 x float> %dtex, i32 0
   br label %END
 
 ELSE:
@@ -206,38 +215,29 @@
 ;CHECK-NEXT: s_wqm_b64 exec, exec
 ;CHECK: image_sample
 ;CHECK: s_and_b64 exec, exec, [[ORIG]]
-;CHECK: store
-;CHECK: load
+;CHECK: image_sample
 ;CHECK: store
 ;CHECK: v_cmp
-define amdgpu_ps float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <3 x i32> %idx, <2 x float> %data, i32 %coord) {
+define amdgpu_ps float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <3 x i32> %idx, i32 %coord) {
 main_body:
   %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-  %tex.1 = extractelement <4 x float> %tex, i32 0
+  %tex.1 = bitcast <4 x float> %tex to <4 x i32>
+  %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %dtex.1 = extractelement <4 x float> %dtex, i32 0
 
   %idx.1 = extractelement <3 x i32> %idx, i32 0
   %gep.1 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.1
-  %data.1 = extractelement <2 x float> %data, i32 0
-  store float %data.1, float addrspace(1)* %gep.1
-
-  %idx.2 = extractelement <3 x i32> %idx, i32 1
-  %gep.2 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.2
-  %z = load float, float addrspace(1)* %gep.2
+  store float %dtex.1, float addrspace(1)* %gep.1
 
-  %idx.3 = extractelement <3 x i32> %idx, i32 2
-  %gep.3 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.3
-  %data.3 = extractelement <2 x float> %data, i32 1
-  store float %data.3, float addrspace(1)* %gep.3
-
-  %cc = fcmp ogt float %z, 0.0
+  %cc = fcmp ogt float %dtex.1, 0.0
   br i1 %cc, label %IF, label %ELSE
 
 IF:
-  %tex.IF = fmul float %tex.1, 3.0
+  %tex.IF = fmul float %dtex.1, 3.0
   br label %END
 
 ELSE:
-  %tex.ELSE = fmul float %tex.1, 4.0
+  %tex.ELSE = fmul float %dtex.1, 4.0
   br label %END
 
 END:
@@ -252,12 +252,13 @@
 ;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
 ;CHECK-NEXT: s_wqm_b64 exec, exec
 ;CHECK: %IF
-;CHECK: load
 ;CHECK: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[ORIG]]
+;CHECK: load
 ;CHECK: store
 ;CHECK: s_mov_b64 exec, [[SAVE]]
 ;CHECK: %END
 ;CHECK: image_sample
+;CHECK: image_sample
 define amdgpu_ps <4 x float> @test_control_flow_4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %coord, i32 %y, float %z) {
 main_body:
   %cond = icmp eq i32 %y, 0
@@ -271,7 +272,9 @@
 
 END:
   %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-  ret <4 x float> %tex
+  %tex.1 = bitcast <4 x float> %tex to <4 x i32>
+  %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  ret <4 x float> %dtex
 }
 
 ; Kill is performed in WQM mode so that uniform kill behaves correctly ...
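The second image_sample added throughout these tests consumes the first sample's result as its coordinate, so that result must exist for helper lanes too: the hardware approximates derivatives from neighboring lanes of a quad. A toy single-quad model of that dependency, with all values invented:

    #include <cstdio>

    int main() {
      // Texture coordinate of each lane in one quad:
      //   lane0 lane1
      //   lane2 lane3
      float u[4] = {0.10f, 0.35f, 0.12f, 0.37f};
      bool lane1Live = false; // lane 1 only exists to help its neighbors

      // Derivatives come from lane differences, so the helper lane's
      // coordinate must have been computed in WQM even though its own
      // result is discarded:
      float ddx = u[1] - u[0];
      float ddy = u[2] - u[0];
      std::printf("ddx=%.2f ddy=%.2f (lane1 live=%d)\n", ddx, ddy, lane1Live);
      return 0;
    }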
@@ -280,8 +283,8 @@
 ;CHECK-NEXT: ; %main_body
 ;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
 ;CHECK-NEXT: s_wqm_b64 exec, exec
-;CHECK: image_sample
 ;CHECK: s_and_b64 exec, exec, [[ORIG]]
+;CHECK: image_sample
 ;SI: buffer_store_dword
 ;VI: flat_store_dword
 ;CHECK: s_wqm_b64 exec, exec
@@ -308,7 +311,9 @@
   store float %data.1, float addrspace(1)* %gep.1
 
   %tex2 = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord2, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-  %out = fadd <4 x float> %tex, %tex2
+  %tex2.1 = bitcast <4 x float> %tex2 to <4 x i32>
+  %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex2.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %out = fadd <4 x float> %tex, %dtex
   ret <4 x float> %out
 }
 
@@ -321,6 +326,7 @@
 ; CHECK: s_wqm_b64 exec, exec
 ; CHECK: image_sample
 ; CHECK: s_and_b64 exec, exec, [[ORIG]]
+; CHECK: image_sample
 ; SI: buffer_store_dword
 ; VI: flat_store_dword
 ; CHECK-NOT: wqm
@@ -328,13 +334,15 @@
 define amdgpu_ps <4 x float> @test_kill_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %idx, float %data, i32 %coord, i32 %coord2, float %z) {
 main_body:
   %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %tex.1 = bitcast <4 x float> %tex to <4 x i32>
+  %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
   %gep = getelementptr float, float addrspace(1)* %ptr, i32 %idx
   store float %data, float addrspace(1)* %gep
 
   call void @llvm.AMDGPU.kill(float %z)
 
-  ret <4 x float> %tex
+  ret <4 x float> %dtex
 }
 
 ; Check prolog shaders.