Index: lib/Target/AMDGPU/SIWholeQuadMode.cpp
===================================================================
--- lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -69,6 +69,25 @@
   StateExact = 0x2,
 };
 
+struct PrintState {
+public:
+  explicit PrintState(int State) : State(State) {}
+
+  int State;
+};
+
+static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) {
+  if (PS.State & StateWQM)
+    OS << "WQM";
+  if (PS.State & StateExact) {
+    if (PS.State & StateWQM)
+      OS << '|';
+    OS << "Exact";
+  }
+
+  return OS;
+}
+
 struct InstrInfo {
   char Needs = 0;
   char OutNeeds = 0;
@@ -98,11 +117,13 @@
   DenseMap<const MachineInstr *, InstrInfo> Instructions;
   DenseMap<MachineBasicBlock *, BlockInfo> Blocks;
-  SmallVector<MachineInstr *, 1> ExecExports;
   SmallVector<MachineInstr *, 1> LiveMaskQueries;
 
+  void printInfo();
+
   void markInstruction(MachineInstr &MI, char Flag,
                        std::vector<WorkItem> &Worklist);
+  void markUsesWQM(const MachineInstr &MI, std::vector<WorkItem> &Worklist);
   char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
   void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist);
   void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);
@@ -151,6 +172,24 @@
   return new SIWholeQuadMode;
 }
 
+void SIWholeQuadMode::printInfo() {
+  for (const auto &BII : Blocks) {
+    dbgs() << "\nBB#" << BII.first->getNumber() << ":\n"
+           << "  InNeeds = " << PrintState(BII.second.InNeeds)
+           << ", Needs = " << PrintState(BII.second.Needs)
+           << ", OutNeeds = " << PrintState(BII.second.OutNeeds) << "\n\n";
+
+    for (const MachineInstr &MI : *BII.first) {
+      auto III = Instructions.find(&MI);
+      if (III == Instructions.end())
+        continue;
+
+      dbgs() << "  " << MI << "    Needs = " << PrintState(III->second.Needs)
+             << ", OutNeeds = " << PrintState(III->second.OutNeeds) << '\n';
+    }
+  }
+}
+
 void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
                                       std::vector<WorkItem> &Worklist) {
   InstrInfo &II = Instructions[&MI];
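
As a quick illustration of the debug helper added above (not part of the patch itself): PrintState renders the state bitmask onto any raw_ostream, printing nothing for a state of 0, so it composes with dbgs() like any other stream operand.

    // Hedged sketch: how PrintState is meant to be used. Assumes the
    // StateWQM/StateExact enumerators and the operator<< from the hunk above.
    dbgs() << PrintState(StateWQM) << '\n';              // prints "WQM"
    dbgs() << PrintState(StateExact) << '\n';            // prints "Exact"
    dbgs() << PrintState(StateWQM | StateExact) << '\n'; // prints "WQM|Exact"
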
@@ -168,6 +207,45 @@
   Worklist.push_back(&MI);
 }
 
+/// Mark all instructions defining the uses in \p MI as WQM.
+void SIWholeQuadMode::markUsesWQM(const MachineInstr &MI,
+                                  std::vector<WorkItem> &Worklist) {
+  for (const MachineOperand &Use : MI.uses()) {
+    if (!Use.isReg() || !Use.isUse())
+      continue;
+
+    unsigned Reg = Use.getReg();
+
+    // Handle physical registers that we need to track; this is mostly relevant
+    // for VCC, which can appear as the (implicit) input of a uniform branch,
+    // e.g. when a loop counter is stored in a VGPR.
+    if (!TargetRegisterInfo::isVirtualRegister(Reg)) {
+      if (Reg == AMDGPU::EXEC)
+        continue;
+
+      for (MCRegUnitIterator RegUnit(Reg, TRI); RegUnit.isValid(); ++RegUnit) {
+        LiveRange &LR = LIS->getRegUnit(*RegUnit);
+        const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
+        if (!Value)
+          continue;
+
+        // Since we're in machine SSA, we do not need to track physical
+        // registers across basic blocks.
+        if (Value->isPHIDef())
+          continue;
+
+        markInstruction(*LIS->getInstructionFromIndex(Value->def), StateWQM,
+                        Worklist);
+      }
+
+      continue;
+    }
+
+    for (MachineInstr &DefMI : MRI->def_instructions(Use.getReg()))
+      markInstruction(DefMI, StateWQM, Worklist);
+  }
+}
+
 // Scan instructions to determine which ones require an Exact execmask and
 // which ones seed WQM requirements.
 char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
@@ -183,16 +261,19 @@
       unsigned Opcode = MI.getOpcode();
       char Flags = 0;
 
-      if (TII->isWQM(Opcode) || TII->isDS(Opcode)) {
+      if (TII->isDS(Opcode)) {
         Flags = StateWQM;
+      } else if (TII->isWQM(Opcode)) {
+        // Sampling instructions don't need to produce results for all pixels
+        // in a quad, they just require all inputs of a quad to have been
+        // computed for derivatives.
+        markUsesWQM(MI, Worklist);
+        GlobalFlags |= StateWQM;
+        continue;
       } else if (TII->isDisableWQM(MI)) {
         Flags = StateExact;
       } else {
-        // Handle export instructions with the exec mask valid flag set
-        if (Opcode == AMDGPU::EXP) {
-          if (MI.getOperand(4).getImm() != 0)
-            ExecExports.push_back(&MI);
-        } else if (Opcode == AMDGPU::SI_PS_LIVE) {
+        if (Opcode == AMDGPU::SI_PS_LIVE) {
           LiveMaskQueries.push_back(&MI);
         } else if (WQMOutputs) {
           // The function is in machine SSA form, which means that physical
@@ -259,43 +340,9 @@
 
   // Propagate WQM flag to instruction inputs
   assert(II.Needs != (StateWQM | StateExact));
-  if (II.Needs != StateWQM)
-    return;
-
-  for (const MachineOperand &Use : MI.uses()) {
-    if (!Use.isReg() || !Use.isUse())
-      continue;
-
-    unsigned Reg = Use.getReg();
-
-    // Handle physical registers that we need to track; this is mostly relevant
-    // for VCC, which can appear as the (implicit) input of a uniform branch,
-    // e.g. when a loop counter is stored in a VGPR.
-    if (!TargetRegisterInfo::isVirtualRegister(Reg)) {
-      if (Reg == AMDGPU::EXEC)
-        continue;
-
-      for (MCRegUnitIterator RegUnit(Reg, TRI); RegUnit.isValid(); ++RegUnit) {
-        LiveRange &LR = LIS->getRegUnit(*RegUnit);
-        const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
-        if (!Value)
-          continue;
-
-        // Since we're in machine SSA, we do not need to track physical
-        // registers across basic blocks.
-        if (Value->isPHIDef())
-          continue;
-
-        markInstruction(*LIS->getInstructionFromIndex(Value->def), StateWQM,
-                        Worklist);
-      }
-
-      continue;
-    }
-
-    for (MachineInstr &DefMI : MRI->def_instructions(Use.getReg()))
-      markInstruction(DefMI, StateWQM, Worklist);
-  }
+  if (II.Needs == StateWQM)
+    markUsesWQM(MI, Worklist);
 }
 
 void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB,
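
The scanInstructions change above encodes the core observation of this patch: a sampling instruction can itself run with an exact exec mask, as long as every lane of each 2x2 quad has computed the instruction's inputs, because the hardware evaluates derivatives as finite differences across the quad. A standalone sketch of that arithmetic (illustration only, not LLVM code; the lane layout 0 1 / 2 3 is an assumption made for the example):

    // Coarse derivatives over a 2x2 quad: every lane's coordinate must be
    // present, even for helper lanes whose own sample result is discarded.
    float ddxCoarse(const float coord[4]) { return coord[1] - coord[0]; }
    float ddyCoarse(const float coord[4]) { return coord[2] - coord[0]; }

This is why markUsesWQM marks the instructions defining the sample's operands as WQM, while the sample itself no longer forces WQM execution.
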
@@ -395,9 +442,12 @@
   if (!isEntry && !(BI.Needs & StateExact) && BI.OutNeeds != StateExact)
     return;
 
+  DEBUG(dbgs() << "\nProcessing block BB#" << MBB.getNumber() << ":\n");
+
   unsigned SavedWQMReg = 0;
   bool WQMFromExec = isEntry;
   char State = isEntry ? StateExact : StateWQM;
+  MachineInstr *FirstNonWQM = nullptr;
 
   auto II = MBB.getFirstNonPHI(), IE = MBB.end();
   while (II != IE) {
@@ -422,21 +472,24 @@
       }
     }
 
+    DEBUG(dbgs() << "  " << MI);
+
     char Needs = 0;
     char OutNeeds = 0;
     auto InstrInfoIt = Instructions.find(&MI);
     if (InstrInfoIt != Instructions.end()) {
       Needs = InstrInfoIt->second.Needs;
       OutNeeds = InstrInfoIt->second.OutNeeds;
-
-      // Make sure to switch to Exact mode before the end of the block when
-      // Exact and only Exact is needed further downstream.
-      if (OutNeeds == StateExact && MI.isTerminator()) {
-        assert(Needs == 0);
-        Needs = StateExact;
-      }
     }
 
+    // Keep track of the first consecutive non-WQM instruction, so that we
+    // switch away from WQM as soon as possible, potentially saving a small
+    // bit of bandwidth on loads.
+    if (Needs == StateWQM)
+      FirstNonWQM = nullptr;
+    else if (!FirstNonWQM)
+      FirstNonWQM = &MI;
+
     // State switching
     if (Needs && State != Needs) {
       if (Needs == StateExact) {
@@ -445,7 +498,7 @@
         if (!WQMFromExec && (OutNeeds & StateWQM))
           SavedWQMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
 
-        toExact(MBB, &MI, SavedWQMReg, LiveMaskReg);
+        toExact(MBB, FirstNonWQM, SavedWQMReg, LiveMaskReg);
       } else {
         assert(WQMFromExec == (SavedWQMReg == 0));
         toWQM(MBB, &MI, SavedWQMReg);
@@ -455,7 +508,7 @@
       State = Needs;
     }
 
-    if (MI.getOpcode() == AMDGPU::SI_ELSE && State == StateExact)
+    if (MI.getOpcode() == AMDGPU::SI_ELSE && BI.OutNeeds == StateExact)
       MI.getOperand(3).setImm(1);
   }
 
@@ -463,7 +516,9 @@
     assert(WQMFromExec == (SavedWQMReg == 0));
     toWQM(MBB, MBB.end(), SavedWQMReg);
   } else if (BI.OutNeeds == StateExact && State != StateExact) {
-    toExact(MBB, MBB.end(), 0, LiveMaskReg);
+    toExact(MBB, FirstNonWQM ? MachineBasicBlock::iterator(FirstNonWQM)
+                             : MBB.getFirstTerminator(),
+            0, LiveMaskReg);
   }
 }
 
@@ -483,7 +538,6 @@
 
   Instructions.clear();
   Blocks.clear();
-  ExecExports.clear();
   LiveMaskQueries.clear();
 
   const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
@@ -523,6 +577,8 @@
     }
   }
 
+  DEBUG(printInfo());
+
   lowerLiveMaskQueries(LiveMaskReg);
 
   // Handle the general case
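
The processBlock changes track the first instruction of the current run that does not itself require WQM, so the switch to Exact can be inserted at the start of that run rather than immediately before the instruction that finally demands Exact. Reduced to its essentials (a sketch, with a hypothetical getNeeds() standing in for the lookup into the Instructions map):

    MachineInstr *FirstNonWQM = nullptr;
    for (MachineInstr &MI : MBB) {
      if (getNeeds(MI) == StateWQM)
        FirstNonWQM = nullptr;  // a WQM instruction breaks the run
      else if (!FirstNonWQM)
        FirstNonWQM = &MI;      // remember where the non-WQM run began
      // ...when a switch to Exact is required, it is emitted before
      // FirstNonWQM rather than before the current instruction.
    }

Hoisting the s_and_b64 this way disables the helper lanes earlier, which is what the reordered CHECK lines in the tests below verify: s_and_b64 exec now precedes the image_sample that used to precede it.
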
Index: test/CodeGen/AMDGPU/skip-if-dead.ll
===================================================================
--- test/CodeGen/AMDGPU/skip-if-dead.ll
+++ test/CodeGen/AMDGPU/skip-if-dead.ll
@@ -345,6 +345,7 @@
 ; CHECK: v_cmpx_le_f32_e32 vcc, 0,
 ; CHECK: [[BB4]]:
 ; CHECK: s_or_b64 exec, exec
+; CHECK: s_and_b64 exec, exec,
 ; CHECK: image_sample_c
 ; CHECK: v_cmp_neq_f32_e32 vcc, 0,
Index: test/CodeGen/AMDGPU/wqm.ll
===================================================================
--- test/CodeGen/AMDGPU/wqm.ll
+++ test/CodeGen/AMDGPU/wqm.ll
@@ -37,8 +37,8 @@
 ;CHECK-NEXT: ; %main_body
 ;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
 ;CHECK-NEXT: s_wqm_b64 exec, exec
-;CHECK: image_sample
 ;CHECK: s_and_b64 exec, exec, [[ORIG]]
+;CHECK: image_sample
 ;CHECK: store
 ;CHECK-NOT: exec
 ;CHECK: .size test3
@@ -63,7 +63,8 @@
 ;CHECK: s_and_b64 exec, exec, [[ORIG]]
 ;CHECK: store
 ;CHECK: s_wqm_b64 exec, exec
-;CHECK: image_sample v[0:3], [[MUL]], s[0:7], s[8:11] dmask:0xf
+;CHECK: image_sample
+;CHECK: image_sample
 define amdgpu_ps <4 x float> @test4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %d, float %data) {
 main_body:
   %c.1 = mul i32 %c, %d
@@ -71,7 +72,9 @@
   call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> undef, <4 x i32> undef, i32 %c.1, i32 0, i1 0, i1 0)
   %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %c.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-  ret <4 x float> %tex
+  %tex.1 = bitcast <4 x float> %tex to <4 x i32>
+  %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  ret <4 x float> %dtex
 }
 
 ; Check a case of one branch of an if-else requiring WQM, the other requiring
@@ -91,6 +94,7 @@
 ;CHECK: s_mov_b64 exec, [[SAVED]]
 ;CHECK: %IF
 ;CHECK: image_sample
+;CHECK: image_sample
 define amdgpu_ps float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {
 main_body:
   %cmp = icmp eq i32 %z, 0
@@ -98,7 +102,9 @@
 
 IF:
   %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-  %data.if = extractelement <4 x float> %tex, i32 0
+  %tex.1 = bitcast <4 x float> %tex to <4 x i32>
+  %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %data.if = extractelement <4 x float> %dtex, i32 0
   br label %END
 
 ELSE:
@@ -118,6 +124,7 @@
 ;CHECK-NEXT: s_wqm_b64 exec, exec
 ;CHECK: %IF
 ;CHECK: image_sample
+;CHECK: image_sample
 ;CHECK: %Flow
 ;CHECK-NEXT: s_or_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]],
 ;CHECK-NEXT: s_and_b64 exec, exec, [[ORIG]]
@@ -137,7 +144,9 @@
 
 IF:
   %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-  %data.if = extractelement <4 x float> %tex, i32 0
+  %tex.1 = bitcast <4 x float> %tex to <4 x i32>
+  %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %data.if = extractelement <4 x float> %dtex, i32 0
   br label %END
 
 ELSE:
@@ -203,35 +212,27 @@
 ;CHECK-NEXT: s_wqm_b64 exec, exec
 ;CHECK: image_sample
 ;CHECK: s_and_b64 exec, exec, [[ORIG]]
-;CHECK: store
-;CHECK: load
+;CHECK: image_sample
 ;CHECK: store
 ;CHECK: v_cmp
-define amdgpu_ps float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <3 x i32> %idx, <2 x float> %data, i32 %coord) {
+define amdgpu_ps float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, i32 %coord) {
 main_body:
   %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-  %tex.1 = extractelement <4 x float> %tex, i32 0
-
-  %idx.1 = extractelement <3 x i32> %idx, i32 0
-  %data.1 = extractelement <2 x float> %data, i32 0
-  call void @llvm.amdgcn.buffer.store.f32(float %data.1, <4 x i32> undef, i32 %idx.1, i32 0, i1 0, i1 0)
-
-  %idx.2 = extractelement <3 x i32> %idx, i32 1
-  %z = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx.2, i32 0, i1 0, i1 0)
+  %tex.1 = bitcast <4 x float> %tex to <4 x i32>
+  %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %dtex.1 = extractelement <4 x float> %dtex, i32 0
 
-  %idx.3 = extractelement <3 x i32> %idx, i32 2
-  %data.3 = extractelement <2 x float> %data, i32 1
-  call void @llvm.amdgcn.buffer.store.f32(float %data.3, <4 x i32> undef, i32 %idx.3, i32 0, i1 0, i1 0)
+  call void @llvm.amdgcn.buffer.store.f32(float %dtex.1, <4 x i32> undef, i32 %idx, i32 0, i1 0, i1 0)
 
-  %cc = fcmp ogt float %z, 0.0
+  %cc = fcmp ogt float %dtex.1, 0.0
   br i1 %cc, label %IF, label %ELSE
 
 IF:
-  %tex.IF = fmul float %tex.1, 3.0
+  %tex.IF = fmul float %dtex.1, 3.0
   br label %END
 
 ELSE:
-  %tex.ELSE = fmul float %tex.1, 4.0
+  %tex.ELSE = fmul float %dtex.1, 4.0
   br label %END
 
 END:
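
A pattern recurs throughout these test updates: the first sample result is fed back, via a bitcast, into a second, dependent sample,

    %tex.1 = bitcast <4 x float> %tex to <4 x i32>
    %dtex  = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.1, ...)

so that the first sample's result is consumed by an instruction whose inputs must be computed in whole quad mode. Otherwise, under the new analysis, the first sample itself would no longer need to execute in WQM, and the mode transitions these tests were written to check would disappear.
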
@@ -246,12 +247,13 @@
 ;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
 ;CHECK-NEXT: s_wqm_b64 exec, exec
 ;CHECK: %IF
-;CHECK: load
 ;CHECK: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[ORIG]]
+;CHECK: load
 ;CHECK: store
 ;CHECK: s_mov_b64 exec, [[SAVE]]
 ;CHECK: %END
 ;CHECK: image_sample
+;CHECK: image_sample
 define amdgpu_ps <4 x float> @test_control_flow_4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %coord, i32 %y, float %z) {
 main_body:
   %cond = icmp eq i32 %y, 0
@@ -264,7 +266,9 @@
 
 END:
   %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-  ret <4 x float> %tex
+  %tex.1 = bitcast <4 x float> %tex to <4 x i32>
+  %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  ret <4 x float> %dtex
 }
 
 ; Kill is performed in WQM mode so that uniform kill behaves correctly ...
@@ -273,8 +277,8 @@
 ;CHECK-NEXT: ; %main_body
 ;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
 ;CHECK-NEXT: s_wqm_b64 exec, exec
-;CHECK: image_sample
 ;CHECK: s_and_b64 exec, exec, [[ORIG]]
+;CHECK: image_sample
 ;CHECK: buffer_store_dword
 ;CHECK: s_wqm_b64 exec, exec
 ;CHECK: v_cmpx_
@@ -297,7 +301,9 @@
   call void @llvm.amdgcn.buffer.store.f32(float %data.1, <4 x i32> undef, i32 %idx.1, i32 0, i1 0, i1 0)
 
   %tex2 = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord2, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-  %out = fadd <4 x float> %tex, %tex2
+  %tex2.1 = bitcast <4 x float> %tex2 to <4 x i32>
+  %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex2.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %out = fadd <4 x float> %tex, %dtex
   ret <4 x float> %out
 }
 
@@ -310,18 +316,21 @@
 ; CHECK: s_wqm_b64 exec, exec
 ; CHECK: image_sample
 ; CHECK: s_and_b64 exec, exec, [[ORIG]]
+; CHECK: image_sample
 ; CHECK: buffer_store_dword
 ; CHECK-NOT: wqm
 ; CHECK: v_cmpx_
 define amdgpu_ps <4 x float> @test_kill_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, i32 %coord, i32 %coord2, float %z) {
 main_body:
   %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %tex.1 = bitcast <4 x float> %tex to <4 x i32>
+  %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
 
   call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 0, i32 0, i1 0, i1 0)
 
   call void @llvm.AMDGPU.kill(float %z)
 
-  ret <4 x float> %tex
+  ret <4 x float> %dtex
 }
 
 ; Check prolog shaders.
@@ -391,8 +400,8 @@
 ; CHECK: s_wqm_b64 exec, exec
 ; CHECK: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen
 
-; CHECK: image_sample
 ; CHECK: s_and_b64 exec, exec, [[LIVE]]
+; CHECK: image_sample
 ; CHECK: buffer_store_dwordx4
 define amdgpu_ps void @test_alloca(float %data, i32 %a, i32 %idx) nounwind {
 entry:
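
For reference, the FileCheck directives in these tests are driven by each file's existing RUN header, which this patch leaves untouched; it has roughly the following shape (the exact -mcpu values and check prefixes are whatever the files already declare):

    ;RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s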