Index: llvm/trunk/lib/Target/AMDGPU/SIWholeQuadMode.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ llvm/trunk/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -94,12 +94,15 @@
   const SIInstrInfo *TII;
   const SIRegisterInfo *TRI;
   MachineRegisterInfo *MRI;
+  LiveIntervals *LIS;
 
   DenseMap<const MachineInstr *, InstrInfo> Instructions;
   DenseMap<MachineBasicBlock *, BlockInfo> Blocks;
   SmallVector<MachineInstr *, 1> ExecExports;
   SmallVector<MachineInstr *, 1> LiveMaskQueries;
 
+  void markInstruction(MachineInstr &MI, char Flag,
+                       std::vector<WorkItem> &Worklist);
   char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
   void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist);
   void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);
@@ -126,6 +129,7 @@
   }
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<LiveIntervals>();
     AU.setPreservesCFG();
     MachineFunctionPass::getAnalysisUsage(AU);
   }
@@ -135,8 +139,11 @@
 
 char SIWholeQuadMode::ID = 0;
 
-INITIALIZE_PASS(SIWholeQuadMode, DEBUG_TYPE,
-                "SI Whole Quad Mode", false, false)
+INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
+                      false)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_END(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
+                    false)
 
 char &llvm::SIWholeQuadModeID = SIWholeQuadMode::ID;
 
@@ -144,6 +151,23 @@
   return new SIWholeQuadMode;
 }
 
+void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
+                                      std::vector<WorkItem> &Worklist) {
+  InstrInfo &II = Instructions[&MI];
+
+  assert(Flag == StateWQM || Flag == StateExact);
+
+  // Ignore if the instruction is already marked. The typical case is that we
+  // mark an instruction WQM multiple times, but for atomics it can happen that
+  // Flag is StateWQM, but Needs is already set to StateExact. In this case,
+  // letting the atomic run in StateExact is correct as per the relevant specs.
+  if (II.Needs)
+    return;
+
+  II.Needs = Flag;
+  Worklist.push_back(&MI);
+}
+
 // Scan instructions to determine which ones require an Exact execmask and
 // which ones seed WQM requirements.
 char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
@@ -192,8 +216,7 @@
         continue;
       }
 
-      Instructions[&MI].Needs = Flags;
-      Worklist.push_back(&MI);
+      markInstruction(MI, Flags, Worklist);
       GlobalFlags |= Flags;
     }
 
@@ -249,32 +272,35 @@
     if (!Use.isReg() || !Use.isUse())
      continue;
 
-    // At this point, physical registers appear as inputs or outputs
-    // and following them makes no sense (and would in fact be incorrect
-    // when the same VGPR is used as both an output and an input that leads
-    // to a NeedsWQM instruction).
-    //
-    // Note: VCC appears e.g. in 64-bit addition with carry - theoretically we
-    // have to trace this, in practice it happens for 64-bit computations like
-    // pointers where both dwords are followed already anyway.
-    if (!TargetRegisterInfo::isVirtualRegister(Use.getReg()))
-      continue;
-
-    for (MachineInstr &DefMI : MRI->def_instructions(Use.getReg())) {
-      InstrInfo &DefII = Instructions[&DefMI];
+    unsigned Reg = Use.getReg();
 
-      // Obviously skip if DefMI is already flagged as NeedWQM.
-      //
-      // The instruction might also be flagged as NeedExact. This happens when
-      // the result of an atomic is used in a WQM computation. In this case,
-      // the atomic must not run for helper pixels and the WQM result is
-      // undefined.
-      if (DefII.Needs != 0)
+    // Handle physical registers that we need to track; this is mostly relevant
+    // for VCC, which can appear as the (implicit) input of a uniform branch,
+    // e.g. when a loop counter is stored in a VGPR.
+    if (!TargetRegisterInfo::isVirtualRegister(Reg)) {
+      if (Reg == AMDGPU::EXEC)
         continue;
 
-      DefII.Needs = StateWQM;
-      Worklist.push_back(&DefMI);
+      for (MCRegUnitIterator RegUnit(Reg, TRI); RegUnit.isValid(); ++RegUnit) {
+        LiveRange &LR = LIS->getRegUnit(*RegUnit);
+        const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
+        if (!Value)
+          continue;
+
+        // Since we're in machine SSA, we do not need to track physical
+        // registers across basic blocks.
+        if (Value->isPHIDef())
+          continue;
+
+        markInstruction(*LIS->getInstructionFromIndex(Value->def), StateWQM,
+                        Worklist);
+      }
+
+      continue;
     }
+
+    for (MachineInstr &DefMI : MRI->def_instructions(Use.getReg()))
+      markInstruction(DefMI, StateWQM, Worklist);
   }
 }
 
@@ -471,6 +497,7 @@
   TII = ST.getInstrInfo();
   TRI = &TII->getRegisterInfo();
   MRI = &MF.getRegInfo();
+  LIS = &getAnalysis<LiveIntervals>();
 
   char GlobalFlags = analyzeFunction(MF);
   if (!(GlobalFlags & StateWQM)) {
Index: llvm/trunk/test/CodeGen/AMDGPU/basic-branch.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/basic-branch.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/basic-branch.ll
@@ -6,7 +6,6 @@
 ; GCN-LABEL: {{^}}test_branch:
 ; GCNNOOPT: v_writelane_b32
 ; GCNNOOPT: v_writelane_b32
-; GCNNOOPT: v_writelane_b32
 ; GCN: s_cbranch_scc1 [[END:BB[0-9]+_[0-9]+]]
 
 ; GCN: ; BB#1
Index: llvm/trunk/test/CodeGen/AMDGPU/wqm.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/wqm.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/wqm.ll
@@ -350,6 +350,44 @@
   ret float %s
 }
 
+; CHECK-LABEL: {{^}}test_loop_vcc:
+; CHECK-NEXT: ; %entry
+; CHECK-NEXT: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec
+; CHECK: s_wqm_b64 exec, exec
+; CHECK: s_and_b64 exec, exec, [[LIVE]]
+; CHECK: image_store
+; CHECK: s_wqm_b64 exec, exec
+; CHECK: v_mov_b32_e32 [[CTR:v[0-9]+]], -2
+; CHECK: s_branch [[LOOPHDR:BB[0-9]+_[0-9]+]]
+
+; CHECK: [[LOOPHDR]]: ; %loop
+; CHECK: v_add_i32_e32 [[CTR]], vcc, 2, [[CTR]]
+; CHECK: v_cmp_lt_i32_e32 vcc, 7, [[CTR]]
+; CHECK: s_cbranch_vccz
+; CHECK: ; %break
+
+; CHECK: ; return
+define amdgpu_ps <4 x float> @test_loop_vcc(<4 x float> %in) nounwind {
+entry:
+  call void @llvm.amdgcn.image.store.v4i32(<4 x float> %in, <4 x i32> undef, <8 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0)
+  br label %loop
+
+loop:
+  %ctr.iv = phi i32 [ 0, %entry ], [ %ctr.next, %body ]
+  %c.iv = phi <4 x float> [ %in, %entry ], [ %c.next, %body ]
+  %cc = icmp sgt i32 %ctr.iv, 7
+  br i1 %cc, label %break, label %body
+
+body:
+  %c.i = bitcast <4 x float> %c.iv to <4 x i32>
+  %c.next = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %c.i, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %ctr.next = add i32 %ctr.iv, 2
+  br label %loop
+
+break:
+  ret <4 x float> %c.iv
+}
+
 declare void @llvm.amdgcn.image.store.v4i32(<4 x float>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1
 declare <4 x float> @llvm.amdgcn.image.load.v4i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #2
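
Background on the reg-unit walk added to propagateInstruction above: a physical register such as VCC has no single SSA definition to follow the way a virtual register does, so the patch instead asks LiveIntervals which value of each of the register's units reaches the instruction, and marks that value's defining instruction as needing WQM. The standalone sketch below illustrates the same pattern; it is not part of the patch, the helper name findPhysRegDefiningInstrs is hypothetical, and the includes assume the header layout of this LLVM revision (LiveIntervalAnalysis.h, Target/TargetRegisterInfo.h).

#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/Target/TargetRegisterInfo.h"

using namespace llvm;

// Hypothetical helper (not from the patch): collect the instructions whose
// definitions of the physical register Reg reach MI, by querying the live
// range of each of Reg's register units.
static void findPhysRegDefiningInstrs(unsigned Reg, const MachineInstr &MI,
                                      LiveIntervals &LIS,
                                      const TargetRegisterInfo &TRI,
                                      SmallVectorImpl<MachineInstr *> &Defs) {
  SlotIndex Idx = LIS.getInstructionIndex(MI);

  // A physical register can span several register units (e.g. VCC covers
  // VCC_LO and VCC_HI); each unit carries its own live range.
  for (MCRegUnitIterator RegUnit(Reg, &TRI); RegUnit.isValid(); ++RegUnit) {
    LiveRange &LR = LIS.getRegUnit(*RegUnit);

    // Ask which value of this unit, if any, is live on entry to MI.
    const VNInfo *Value = LR.Query(Idx).valueIn();
    if (!Value)
      continue;

    // A PHI-def value merges definitions from several predecessors and has
    // no single defining instruction; the pass skips this case because, in
    // machine SSA, physical registers need not be tracked across blocks.
    if (Value->isPHIDef())
      continue;

    Defs.push_back(LIS.getInstructionFromIndex(Value->def));
  }
}

The new wqm.ll test test_loop_vcc exercises exactly this situation: the loop counter lives in a VGPR, v_cmp writes VCC, and the uniform s_cbranch_vccz reads VCC implicitly, so the compare (and, transitively, the counter arithmetic) must be marked WQM.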