diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -1598,8 +1598,13 @@ // Copies the active channels of the source value to the destination value, // with the guarantee that the source value is computed as if the entire // program were executed in Whole Wavefront Mode, i.e. with all channels -// enabled, with a few exceptions: - Phi nodes with require WWM return an +// enabled, with a few exceptions: - Phi nodes which require WWM return an // undefined value. +def int_amdgcn_strict_wwm : Intrinsic<[llvm_any_ty], + [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, + IntrConvergent, IntrWillReturn] +>; +// Deprecated. Use int_amdgcn_strict_wwm instead. def int_amdgcn_wwm : Intrinsic<[llvm_any_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, IntrConvergent, IntrWillReturn] diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp @@ -517,7 +517,7 @@ } // Finally mark the readlanes in the WWM section. - NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, NewV); + NewV = B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, NewV); } else { switch (Op) { default: @@ -621,7 +621,8 @@ // from the first lane, to get our lane's index into the atomic result. Value *LaneOffset = nullptr; if (ValDivergent) { - LaneOffset = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, ExclScan); + LaneOffset = + B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, ExclScan); } else { switch (Op) { default: diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -2623,7 +2623,8 @@ Opcode = AMDGPU::SOFT_WQM; break; case Intrinsic::amdgcn_wwm: - Opcode = AMDGPU::WWM; + case Intrinsic::amdgcn_strict_wwm: + Opcode = AMDGPU::STRICT_WWM; break; case Intrinsic::amdgcn_interp_p1_f16: SelectInterpP1F16(N); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -925,8 +925,9 @@ return constrainCopyLikeIntrin(I, AMDGPU::WQM); case Intrinsic::amdgcn_softwqm: return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM); + case Intrinsic::amdgcn_strict_wwm: case Intrinsic::amdgcn_wwm: - return constrainCopyLikeIntrin(I, AMDGPU::WWM); + return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WWM); case Intrinsic::amdgcn_writelane: return selectWritelane(I); case Intrinsic::amdgcn_div_scale: diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -3938,6 +3938,7 @@ case Intrinsic::amdgcn_update_dpp: case Intrinsic::amdgcn_mov_dpp8: case Intrinsic::amdgcn_mov_dpp: + case Intrinsic::amdgcn_strict_wwm: case Intrinsic::amdgcn_wwm: case Intrinsic::amdgcn_wqm: case Intrinsic::amdgcn_softwqm: diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp --- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -582,7 +582,7 @@ case AMDGPU::COPY: case AMDGPU::WQM: 
case AMDGPU::SOFT_WQM: - case AMDGPU::WWM: { + case AMDGPU::STRICT_WWM: { Register DstReg = MI.getOperand(0).getReg(); const TargetRegisterClass *SrcRC, *DstRC; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -1861,16 +1861,16 @@ MI.eraseFromParent(); break; } - case AMDGPU::ENTER_WWM: { + case AMDGPU::ENTER_STRICT_WWM: { // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when - // WWM is entered. + // Whole Wave Mode is entered. MI.setDesc(get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64)); break; } - case AMDGPU::EXIT_WWM: { + case AMDGPU::EXIT_STRICT_WWM: { // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when - // WWM is exited. + // Whole Wave Mode is exited. MI.setDesc(get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64)); break; } @@ -4220,7 +4220,7 @@ case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG; case AMDGPU::WQM: return AMDGPU::WQM; case AMDGPU::SOFT_WQM: return AMDGPU::SOFT_WQM; - case AMDGPU::WWM: return AMDGPU::WWM; + case AMDGPU::STRICT_WWM: return AMDGPU::STRICT_WWM; case AMDGPU::S_MOV_B32: { const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); return MI.getOperand(1).isReg() || @@ -6356,7 +6356,7 @@ case AMDGPU::COPY: case AMDGPU::WQM: case AMDGPU::SOFT_WQM: - case AMDGPU::WWM: + case AMDGPU::STRICT_WWM: case AMDGPU::REG_SEQUENCE: case AMDGPU::PHI: case AMDGPU::INSERT_SUBREG: @@ -6514,7 +6514,7 @@ case AMDGPU::INSERT_SUBREG: case AMDGPU::WQM: case AMDGPU::SOFT_WQM: - case AMDGPU::WWM: { + case AMDGPU::STRICT_WWM: { const TargetRegisterClass *SrcRC = getOpRegClass(Inst, 1); if (RI.hasAGPRs(SrcRC)) { if (RI.hasAGPRs(NewDstRC)) diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -119,17 +119,17 @@ // turned into a copy by WQM pass, but does not seed WQM requirements. def SOFT_WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>; -// Pseudoinstruction for @llvm.amdgcn.wwm. It is turned into a copy post-RA, so +// Pseudoinstruction for @llvm.amdgcn.strict.wwm. It is turned into a copy post-RA, so // that the @earlyclobber is respected. The @earlyclobber is to make sure that -// the instruction that defines $src0 (which is run in WWM) doesn't +// the instruction that defines $src0 (which is run in Whole Wave Mode) doesn't // accidentally clobber inactive channels of $vdst. 
let Constraints = "@earlyclobber $vdst" in { -def WWM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>; +def STRICT_WWM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>; } } // End let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] -def ENTER_WWM : SPseudoInstSI <(outs SReg_1:$sdst), (ins i64imm:$src0)> { +def ENTER_STRICT_WWM : SPseudoInstSI <(outs SReg_1:$sdst), (ins i64imm:$src0)> { let Uses = [EXEC]; let Defs = [EXEC, SCC]; let hasSideEffects = 0; @@ -137,7 +137,7 @@ let mayStore = 0; } -def EXIT_WWM : SPseudoInstSI <(outs SReg_1:$sdst), (ins SReg_1:$src0)> { +def EXIT_STRICT_WWM : SPseudoInstSI <(outs SReg_1:$sdst), (ins SReg_1:$src0)> { let hasSideEffects = 0; let mayLoad = 0; let mayStore = 0; diff --git a/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp b/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp --- a/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp +++ b/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp @@ -185,13 +185,13 @@ MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B64) RegsAssigned |= processDef(MI.getOperand(0)); - if (MI.getOpcode() == AMDGPU::ENTER_WWM) { + if (MI.getOpcode() == AMDGPU::ENTER_STRICT_WWM) { LLVM_DEBUG(dbgs() << "entering WWM region: " << MI << "\n"); InWWM = true; continue; } - if (MI.getOpcode() == AMDGPU::EXIT_WWM) { + if (MI.getOpcode() == AMDGPU::EXIT_STRICT_WWM) { LLVM_DEBUG(dbgs() << "exiting WWM region: " << MI << "\n"); InWWM = false; } diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp --- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -27,7 +27,7 @@ /// S_MOV_B64 EXEC, Tmp /// /// We also compute when a sequence of instructions requires Whole Wavefront -/// Mode (WWM) and insert instructions to save and restore it: +/// Mode (StrictWWM) and insert instructions to save and restore it: /// /// S_OR_SAVEEXEC_B64 Tmp, -1 /// ... 
@@ -76,7 +76,7 @@ enum { StateWQM = 0x1, - StateWWM = 0x2, + StateStrictWWM = 0x2, StateExact = 0x4, }; @@ -91,13 +91,13 @@ static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) { if (PS.State & StateWQM) OS << "WQM"; - if (PS.State & StateWWM) { + if (PS.State & StateStrictWWM) { if (PS.State & StateWQM) OS << '|'; - OS << "WWM"; + OS << "StrictWWM"; } if (PS.State & StateExact) { - if (PS.State & (StateWQM | StateWWM)) + if (PS.State & (StateWQM | StateStrictWWM)) OS << '|'; OS << "Exact"; } @@ -151,7 +151,7 @@ DenseMap Instructions; MapVector Blocks; - // Tracks state (WQM/WWM/Exact) after a given instruction + // Tracks state (WQM/StrictWWM/Exact) after a given instruction DenseMap StateTransition; SmallVector LiveMaskQueries; @@ -182,10 +182,10 @@ Register SaveWQM); void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, Register SavedWQM); - void toWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, - Register SaveOrig); - void fromWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, - Register SavedOrig, char NonWWMState); + void toStrictWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, + Register SaveOrig); + void fromStrictWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, + Register SavedOrig, char NonStrictWWMState); MachineBasicBlock *splitBlock(MachineBasicBlock *BB, MachineInstr *TermMI); @@ -422,17 +422,17 @@ LowerToCopyInstrs.push_back(&MI); SoftWQMInstrs.push_back(&MI); continue; - } else if (Opcode == AMDGPU::WWM) { - // The WWM intrinsic doesn't make the same guarantee, and plus it needs - // to be executed in WQM or Exact so that its copy doesn't clobber - // inactive lanes. - markInstructionUses(MI, StateWWM, Worklist); - GlobalFlags |= StateWWM; + } else if (Opcode == AMDGPU::STRICT_WWM) { + // The STRICT_WWM intrinsic doesn't make the same guarantee, and plus + // it needs to be executed in WQM or Exact so that its copy doesn't + // clobber inactive lanes. + markInstructionUses(MI, StateStrictWWM, Worklist); + GlobalFlags |= StateStrictWWM; LowerToMovInstrs.push_back(&MI); continue; } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 || Opcode == AMDGPU::V_SET_INACTIVE_B64) { - III.Disabled = StateWWM; + III.Disabled = StateStrictWWM; MachineOperand &Inactive = MI.getOperand(2); if (Inactive.isReg()) { if (Inactive.isUndef()) { @@ -441,7 +441,7 @@ Register Reg = Inactive.getReg(); if (Reg.isVirtual()) { for (MachineInstr &DefMI : MRI->def_instructions(Reg)) - markInstruction(DefMI, StateWWM, Worklist); + markInstruction(DefMI, StateStrictWWM, Worklist); } } } @@ -454,7 +454,7 @@ Worklist.push_back(&MBB); } GlobalFlags |= StateExact; - III.Disabled = StateWQM | StateWWM; + III.Disabled = StateWQM | StateStrictWWM; continue; } else { if (Opcode == AMDGPU::SI_PS_LIVE || Opcode == AMDGPU::SI_LIVE_MASK) { @@ -531,7 +531,7 @@ // Propagate backwards within block if (MachineInstr *PrevMI = MI.getPrevNode()) { - char InNeeds = (II.Needs & ~StateWWM) | II.OutNeeds; + char InNeeds = (II.Needs & ~StateStrictWWM) | II.OutNeeds; if (!PrevMI->isPHI()) { InstrInfo &PrevII = Instructions[PrevMI]; if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) { @@ -547,10 +547,10 @@ if (II.Needs != 0) markInstructionUses(MI, II.Needs, Worklist); - // Ensure we process a block containing WWM, even if it does not require any - // WQM transitions. - if (II.Needs & StateWWM) - BI.Needs |= StateWWM; + // Ensure we process a block containing StrictWWM, even if it does not require + // any WQM transitions. 
+ if (II.Needs & StateStrictWWM) + BI.Needs |= StateStrictWWM; } void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB, @@ -908,7 +908,7 @@ // Replace (or supplement) instructions accessing live mask. // This can only happen once all the live mask registers have been created -// and the execute state (WQM/WWM/Exact) of instructions is known. +// and the execute state (WQM/StrictWWM/Exact) of instructions is known. void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) { auto BII = Blocks.find(&MBB); if (BII == Blocks.end()) @@ -1066,28 +1066,30 @@ StateTransition[MI] = StateWQM; } -void SIWholeQuadMode::toWWM(MachineBasicBlock &MBB, - MachineBasicBlock::iterator Before, - Register SaveOrig) { +void SIWholeQuadMode::toStrictWWM(MachineBasicBlock &MBB, + MachineBasicBlock::iterator Before, + Register SaveOrig) { MachineInstr *MI; assert(SaveOrig); - MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_WWM), SaveOrig) + MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_STRICT_WWM), + SaveOrig) .addImm(-1); LIS->InsertMachineInstrInMaps(*MI); - StateTransition[MI] = StateWWM; + StateTransition[MI] = StateStrictWWM; } -void SIWholeQuadMode::fromWWM(MachineBasicBlock &MBB, - MachineBasicBlock::iterator Before, - Register SavedOrig, char NonWWMState) { +void SIWholeQuadMode::fromStrictWWM(MachineBasicBlock &MBB, + MachineBasicBlock::iterator Before, + Register SavedOrig, + char NonStrictWWMState) { MachineInstr *MI; assert(SavedOrig); - MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_WWM), Exec) + MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WWM), Exec) .addReg(SavedOrig); LIS->InsertMachineInstrInMaps(*MI); - StateTransition[MI] = NonWWMState; + StateTransition[MI] = NonStrictWWMState; } void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool IsEntry) { @@ -1108,10 +1110,10 @@ << ":\n"); Register SavedWQMReg; - Register SavedNonWWMReg; + Register SavedNonStrictWWMReg; bool WQMFromExec = IsEntry; char State = (IsEntry || !(BI.InNeeds & StateWQM)) ? StateExact : StateWQM; - char NonWWMState = 0; + char NonStrictWWMState = 0; const TargetRegisterClass *BoolRC = TRI->getBoolRC(); auto II = MBB.getFirstNonPHI(), IE = MBB.end(); @@ -1125,25 +1127,25 @@ // Exact or vice versa. MachineBasicBlock::iterator FirstWQM = IE; - // This stores the first instruction where it's safe to switch from WWM to - // Exact/WQM or to switch to WWM. It must always be the same as, or after, - // FirstWQM since if it's safe to switch to/from WWM, it must be safe to - // switch to/from WQM as well. - MachineBasicBlock::iterator FirstWWM = IE; + // This stores the first instruction where it's safe to switch from StrictWWM + // to Exact/WQM or to switch to StrictWWM. It must always be the same as, or + // after, FirstWQM since if it's safe to switch to/from StrictWWM, it must be + // safe to switch to/from WQM as well. + MachineBasicBlock::iterator FirstStrictWWM = IE; // Record initial state is block information. BI.InitialState = State; for (;;) { MachineBasicBlock::iterator Next = II; - char Needs = StateExact | StateWQM; // WWM is disabled by default + char Needs = StateExact | StateWQM; // StrictWWM is disabled by default char OutNeeds = 0; if (FirstWQM == IE) FirstWQM = II; - if (FirstWWM == IE) - FirstWWM = II; + if (FirstStrictWWM == IE) + FirstStrictWWM = II; // First, figure out the allowed states (Needs) based on the propagated // flags. 
@@ -1153,8 +1155,8 @@ if (MI.isTerminator() || TII->mayReadEXEC(*MRI, MI)) { auto III = Instructions.find(&MI); if (III != Instructions.end()) { - if (III->second.Needs & StateWWM) - Needs = StateWWM; + if (III->second.Needs & StateStrictWWM) + Needs = StateStrictWWM; else if (III->second.Needs & StateWQM) Needs = StateWQM; else @@ -1163,8 +1165,8 @@ } } else { // If the instruction doesn't actually need a correct EXEC, then we can - // safely leave WWM enabled. - Needs = StateExact | StateWQM | StateWWM; + // safely leave StrictWWM enabled. + Needs = StateExact | StateWQM | StateStrictWWM; } if (MI.isTerminator() && OutNeeds == StateExact) @@ -1184,9 +1186,9 @@ // Now, transition if necessary. if (!(Needs & State)) { MachineBasicBlock::iterator First; - if (State == StateWWM || Needs == StateWWM) { - // We must switch to or from WWM - First = FirstWWM; + if (State == StateStrictWWM || Needs == StateStrictWWM) { + // We must switch to or from StrictWWM + First = FirstStrictWWM; } else { // We only need to switch to/from WQM, so we can use FirstWQM First = FirstWQM; @@ -1196,11 +1198,12 @@ bool SaveSCC = false; switch (State) { case StateExact: - case StateWWM: + case StateStrictWWM: // Exact/WWM -> WWM: save SCC // Exact/WWM -> WQM: save SCC if WQM mask is generated from exec // Exact/WWM -> Exact: no save - SaveSCC = (Needs & StateWWM) || ((Needs & StateWQM) && WQMFromExec); + SaveSCC = + (Needs & StateStrictWWM) || ((Needs & StateWQM) && WQMFromExec); break; case StateWQM: // WQM -> Exact/WMM: save SCC @@ -1213,20 +1216,20 @@ MachineBasicBlock::iterator Before = prepareInsertion(MBB, First, II, Needs == StateWQM, SaveSCC); - if (State == StateWWM) { - assert(SavedNonWWMReg); - fromWWM(MBB, Before, SavedNonWWMReg, NonWWMState); - LIS->createAndComputeVirtRegInterval(SavedNonWWMReg); - SavedNonWWMReg = 0; - State = NonWWMState; + if (State == StateStrictWWM) { + assert(SavedNonStrictWWMReg); + fromStrictWWM(MBB, Before, SavedNonStrictWWMReg, NonStrictWWMState); + LIS->createAndComputeVirtRegInterval(SavedNonStrictWWMReg); + SavedNonStrictWWMReg = 0; + State = NonStrictWWMState; } - if (Needs == StateWWM) { - NonWWMState = State; - assert(!SavedNonWWMReg); - SavedNonWWMReg = MRI->createVirtualRegister(BoolRC); - toWWM(MBB, Before, SavedNonWWMReg); - State = StateWWM; + if (Needs == StateStrictWWM) { + NonStrictWWMState = State; + assert(!SavedNonStrictWWMReg); + SavedNonStrictWWMReg = MRI->createVirtualRegister(BoolRC); + toStrictWWM(MBB, Before, SavedNonStrictWWMReg); + State = StateStrictWWM; } else { if (State == StateWQM && (Needs & StateExact) && !(Needs & StateWQM)) { if (!WQMFromExec && (OutNeeds & StateWQM)) { @@ -1248,17 +1251,18 @@ } State = StateWQM; } else { - // We can get here if we transitioned from WWM to a non-WWM state that - // already matches our needs, but we shouldn't need to do anything. + // We can get here if we transitioned from StrictWWM to a + // non-StrictWWM state that already matches our needs, but we + // shouldn't need to do anything. 
assert(Needs & State); } } } - if (Needs != (StateExact | StateWQM | StateWWM)) { + if (Needs != (StateExact | StateWQM | StateStrictWWM)) { if (Needs != (StateExact | StateWQM)) FirstWQM = IE; - FirstWWM = IE; + FirstStrictWWM = IE; } if (II == IE) @@ -1267,7 +1271,7 @@ II = Next; } assert(!SavedWQMReg); - assert(!SavedNonWWMReg); + assert(!SavedNonStrictWWMReg); } void SIWholeQuadMode::lowerLiveMaskQueries() { @@ -1399,9 +1403,10 @@ LiveMaskReg = Exec; - // Shader is simple does not need WQM/WWM or any complex lowering - if (!(GlobalFlags & (StateWQM | StateWWM)) && LowerToCopyInstrs.empty() && - LowerToMovInstrs.empty() && KillInstrs.empty()) { + // Shader is simple does not need WQM/StrictWWM or any complex lowering + if (!(GlobalFlags & (StateWQM | StateStrictWWM)) && + LowerToCopyInstrs.empty() && LowerToMovInstrs.empty() && + KillInstrs.empty()) { lowerLiveMaskQueries(); return !LiveMaskQueries.empty(); } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wwm.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wwm.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wwm.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wwm.ll @@ -1,13 +1,94 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py ; RUN: llc -global-isel -march=amdgcn -mcpu=hawaii -stop-after=instruction-select -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +define amdgpu_ps float @strict_wwm_f32(float %val) { + ; GCN-LABEL: name: strict_wwm_f32 + ; GCN: bb.1 (%ir-block.0): + ; GCN: liveins: $vgpr0 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN: [[STRICT_WWM:%[0-9]+]]:vgpr_32 = STRICT_WWM [[COPY]], implicit $exec + ; GCN: $vgpr0 = COPY [[STRICT_WWM]] + ; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0 + %ret = call float @llvm.amdgcn.strict.wwm.f32(float %val) + ret float %ret +} + +define amdgpu_ps float @strict_wwm_v2f16(float %arg) { + ; GCN-LABEL: name: strict_wwm_v2f16 + ; GCN: bb.1 (%ir-block.0): + ; GCN: liveins: $vgpr0 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN: [[STRICT_WWM:%[0-9]+]]:vgpr_32 = STRICT_WWM [[COPY]], implicit $exec + ; GCN: $vgpr0 = COPY [[STRICT_WWM]] + ; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0 + %val = bitcast float %arg to <2 x half> + %ret = call <2 x half> @llvm.amdgcn.strict.wwm.v2f16(<2 x half> %val) + %bc = bitcast <2 x half> %ret to float + ret float %bc +} + +define amdgpu_ps <2 x float> @strict_wwm_f64(double %val) { + ; GCN-LABEL: name: strict_wwm_f64 + ; GCN: bb.1 (%ir-block.0): + ; GCN: liveins: $vgpr0, $vgpr1 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GCN: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GCN: [[STRICT_WWM:%[0-9]+]]:vreg_64 = STRICT_WWM [[REG_SEQUENCE]], implicit $exec + ; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[STRICT_WWM]].sub0 + ; GCN: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[STRICT_WWM]].sub1 + ; GCN: $vgpr0 = COPY [[COPY2]] + ; GCN: $vgpr1 = COPY [[COPY3]] + ; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 + %ret = call double @llvm.amdgcn.strict.wwm.f64(double %val) + %bitcast = bitcast double %ret to <2 x float> + ret <2 x float> %bitcast +} + +; TODO +; define amdgpu_ps float @strict_wwm_i1_vcc(float %val) { +; %vcc = fcmp oeq float %val, 0.0 +; %ret = call i1 @llvm.amdgcn.strict.wwm.i1(i1 %vcc) +; %select = select i1 %ret, float 1.0, float 0.0 +; ret float %select +; } + +define amdgpu_ps <3 x float> @strict_wwm_v3f32(<3 x float> %val) { + ; GCN-LABEL: name: 
strict_wwm_v3f32 + ; GCN: bb.1 (%ir-block.0): + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GCN: [[REG_SEQUENCE:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2 + ; GCN: [[STRICT_WWM:%[0-9]+]]:vreg_96 = STRICT_WWM [[REG_SEQUENCE]], implicit $exec + ; GCN: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[STRICT_WWM]].sub0 + ; GCN: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[STRICT_WWM]].sub1 + ; GCN: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[STRICT_WWM]].sub2 + ; GCN: $vgpr0 = COPY [[COPY3]] + ; GCN: $vgpr1 = COPY [[COPY4]] + ; GCN: $vgpr2 = COPY [[COPY5]] + ; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2 + %ret = call <3 x float> @llvm.amdgcn.strict.wwm.v3f32(<3 x float> %val) + ret <3 x float> %ret +} + +declare i1 @llvm.amdgcn.strict.wwm.i1(i1) #0 +declare float @llvm.amdgcn.strict.wwm.f32(float) #0 +declare <2 x half> @llvm.amdgcn.strict.wwm.v2f16(<2 x half>) #0 +declare <3 x float> @llvm.amdgcn.strict.wwm.v3f32(<3 x float>) #0 +declare double @llvm.amdgcn.strict.wwm.f64(double) #0 + + +; NOTE: llvm.amdgcn.wwm is deprecated, use llvm.amdgcn.strict.wwm instead. + define amdgpu_ps float @wwm_f32(float %val) { ; GCN-LABEL: name: wwm_f32 ; GCN: bb.1 (%ir-block.0): ; GCN: liveins: $vgpr0 ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GCN: [[WWM:%[0-9]+]]:vgpr_32 = WWM [[COPY]], implicit $exec - ; GCN: $vgpr0 = COPY [[WWM]] + ; GCN: [[STRICT_WWM:%[0-9]+]]:vgpr_32 = STRICT_WWM [[COPY]], implicit $exec + ; GCN: $vgpr0 = COPY [[STRICT_WWM]] ; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0 %ret = call float @llvm.amdgcn.wwm.f32(float %val) ret float %ret @@ -18,8 +99,8 @@ ; GCN: bb.1 (%ir-block.0): ; GCN: liveins: $vgpr0 ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GCN: [[WWM:%[0-9]+]]:vgpr_32 = WWM [[COPY]], implicit $exec - ; GCN: $vgpr0 = COPY [[WWM]] + ; GCN: [[STRICT_WWM:%[0-9]+]]:vgpr_32 = STRICT_WWM [[COPY]], implicit $exec + ; GCN: $vgpr0 = COPY [[STRICT_WWM]] ; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = bitcast float %arg to <2 x half> %ret = call <2 x half> @llvm.amdgcn.wwm.v2f16(<2 x half> %val) @@ -34,9 +115,9 @@ ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GCN: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; GCN: [[WWM:%[0-9]+]]:vreg_64 = WWM [[REG_SEQUENCE]], implicit $exec - ; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[WWM]].sub0 - ; GCN: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[WWM]].sub1 + ; GCN: [[STRICT_WWM:%[0-9]+]]:vreg_64 = STRICT_WWM [[REG_SEQUENCE]], implicit $exec + ; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[STRICT_WWM]].sub0 + ; GCN: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[STRICT_WWM]].sub1 ; GCN: $vgpr0 = COPY [[COPY2]] ; GCN: $vgpr1 = COPY [[COPY3]] ; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 @@ -61,10 +142,10 @@ ; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GCN: [[REG_SEQUENCE:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2 - ; GCN: [[WWM:%[0-9]+]]:vreg_96 = WWM [[REG_SEQUENCE]], implicit $exec - ; GCN: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[WWM]].sub0 - ; GCN: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[WWM]].sub1 - ; GCN: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[WWM]].sub2 + ; GCN: [[STRICT_WWM:%[0-9]+]]:vreg_96 = STRICT_WWM [[REG_SEQUENCE]], implicit $exec + ; GCN: 
[[COPY3:%[0-9]+]]:vgpr_32 = COPY [[STRICT_WWM]].sub0 + ; GCN: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[STRICT_WWM]].sub1 + ; GCN: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[STRICT_WWM]].sub2 ; GCN: $vgpr0 = COPY [[COPY3]] ; GCN: $vgpr1 = COPY [[COPY4]] ; GCN: $vgpr2 = COPY [[COPY5]] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.wwm.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.wwm.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.wwm.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.wwm.mir @@ -3,30 +3,30 @@ # RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect -regbankselect-greedy -verify-machineinstrs -o - %s | FileCheck %s --- -name: wwm_s +name: strict_wwm_s legalized: true body: | bb.0: liveins: $sgpr0 - ; CHECK-LABEL: name: wwm_s + ; CHECK-LABEL: name: strict_wwm_s ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) - ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.wwm), [[COPY1]](s32) + ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.strict.wwm), [[COPY1]](s32) %0:_(s32) = COPY $sgpr0 - %1:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.wwm), %0 + %1:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.strict.wwm), %0 ... --- -name: wwm_v +name: strict_wwm_v legalized: true body: | bb.0: liveins: $vgpr0 - ; CHECK-LABEL: name: wwm_v + ; CHECK-LABEL: name: strict_wwm_v ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.wwm), [[COPY]](s32) + ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.strict.wwm), [[COPY]](s32) %0:_(s32) = COPY $vgpr0 - %1:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.wwm), %0 + %1:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.strict.wwm), %0 ... diff --git a/llvm/test/CodeGen/AMDGPU/fix-wwm-vgpr-copy.ll b/llvm/test/CodeGen/AMDGPU/fix-wwm-vgpr-copy.ll --- a/llvm/test/CodeGen/AMDGPU/fix-wwm-vgpr-copy.ll +++ b/llvm/test/CodeGen/AMDGPU/fix-wwm-vgpr-copy.ll @@ -1,6 +1,46 @@ ; RUN: llc -mtriple=amdgcn--amdpal -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s -define amdgpu_hs void @foo(i32 inreg %arg, <4 x i32> inreg %buffer) { +; GCN-LABEL: strict_wwm: +define amdgpu_hs void @strict_wwm(i32 inreg %arg, <4 x i32> inreg %buffer) { +entry: + br label %work + +bb42: + br label %bb602 + +bb602: + %tmp603 = phi i32 [ 0, %bb42 ], [ 1, %work ] + %tmp607 = icmp eq i32 %tmp603, %tmp1196 + br i1 %tmp607, label %bb49, label %bb54 + +bb49: + call void @llvm.amdgcn.raw.tbuffer.store.f32(float 1.0, <4 x i32> %buffer, i32 4, i32 1, i32 116, i32 1) + ret void + +bb54: + ret void + +work: +; GCN: s_not_b64 exec, exec +; GCN: v_mov_b32_e32 v[[tmp1189:[0-9]+]], 1 +; GCN: s_not_b64 exec, exec + %tmp1189 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 4, i32 1) + +; GCN: s_or_saveexec_b64 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, -1 +; GCN: v_lshlrev_b32_e32 v[[tmp1191:[0-9]+]], 2, v[[tmp1189]] + %tmp1191 = mul i32 %tmp1189, 4 + +; GCN: s_mov_b64 exec, s{{\[}}[[LO]]:[[HI]]{{\]}} + %tmp1196 = tail call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp1191) + + %tmp34 = icmp eq i32 %arg, 0 + br i1 %tmp34, label %bb602, label %bb42 +} + +; NOTE: llvm.amdgcn.wwm is deprecated, use llvm.amdgcn.strict.wwm instead. 
+ +; GCN-LABEL: wwm: +define amdgpu_hs void @wwm(i32 inreg %arg, <4 x i32> inreg %buffer) { entry: br label %work @@ -38,6 +78,7 @@ declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32) #0 declare i32 @llvm.amdgcn.wwm.i32(i32) #1 +declare i32 @llvm.amdgcn.strict.wwm.i32(i32) #1 declare void @llvm.amdgcn.raw.tbuffer.store.f32(float, <4 x i32>, i32, i32, i32 immarg, i32 immarg) #2 attributes #0 = { convergent nounwind readnone willreturn } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll @@ -77,6 +77,7 @@ ret float %out.0 } +; NOTE: llvm.amdgcn.wwm is deprecated, use llvm.amdgcn.strict.wwm instead. ; Make sure the transition from Exact to WWM then softwqm does not trigger WQM. ; ;CHECK-LABEL: {{^}}test_wwm1: @@ -183,6 +184,7 @@ declare float @llvm.amdgcn.wqm.f32(float) #3 declare float @llvm.amdgcn.softwqm.f32(float) #3 declare i32 @llvm.amdgcn.softwqm.i32(i32) #3 +declare float @llvm.amdgcn.strict.wwm.f32(float) #3 declare float @llvm.amdgcn.wwm.f32(float) #3 attributes #1 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll --- a/llvm/test/CodeGen/AMDGPU/wave32.ll +++ b/llvm/test/CodeGen/AMDGPU/wave32.ll @@ -701,6 +701,50 @@ ret <4 x float> %c.iv } +; GCN-LABEL: {{^}}test_strict_wwm1: +; GFX1032: s_or_saveexec_b32 [[SAVE:s[0-9]+]], -1 +; GFX1032: s_mov_b32 exec_lo, [[SAVE]] +; GFX1064: s_or_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], -1 +; GFX1064: s_mov_b64 exec, [[SAVE]] +define amdgpu_ps float @test_strict_wwm1(i32 inreg %idx0, i32 inreg %idx1, float %src0, float %src1) { +main_body: + %out = fadd float %src0, %src1 + %out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out) + ret float %out.0 +} + +; GCN-LABEL: {{^}}test_strict_wwm2: +; GFX1032: v_cmp_gt_u32_e32 vcc_lo, 32, v{{[0-9]+}} +; GFX1032: s_and_saveexec_b32 [[SAVE1:s[0-9]+]], vcc_lo +; GFX1032: s_or_saveexec_b32 [[SAVE2:s[0-9]+]], -1 +; GFX1032: s_mov_b32 exec_lo, [[SAVE2]] +; GFX1032: s_or_b32 exec_lo, exec_lo, [[SAVE1]] +; GFX1064: v_cmp_gt_u32_e32 vcc, 32, v{{[0-9]+}} +; GFX1064: s_and_saveexec_b64 [[SAVE1:s\[[0-9:]+\]]], vcc{{$}} +; GFX1064: s_or_saveexec_b64 [[SAVE2:s\[[0-9:]+\]]], -1 +; GFX1064: s_mov_b64 exec, [[SAVE2]] +; GFX1064: s_or_b64 exec, exec, [[SAVE1]] +define amdgpu_ps float @test_strict_wwm2(i32 inreg %idx) { +main_body: + ; use mbcnt to make sure the branch is divergent + %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) + %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo) + %cc = icmp uge i32 %hi, 32 + br i1 %cc, label %endif, label %if + +if: + %src = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0) + %out = fadd float %src, %src + %out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out) + %out.1 = fadd float %src, %out.0 + br label %endif + +endif: + %out.2 = phi float [ %out.1, %if ], [ 0.0, %main_body ] + ret float %out.2 +} + +; NOTE: llvm.amdgcn.wwm is deprecated, use llvm.amdgcn.strict.wwm instead. 
; GCN-LABEL: {{^}}test_wwm1: ; GFX1032: s_or_saveexec_b32 [[SAVE:s[0-9]+]], -1 ; GFX1032: s_mov_b32 exec_lo, [[SAVE]] @@ -1123,6 +1167,7 @@ declare i64 @llvm.amdgcn.set.inactive.i64(i64, i64) declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) +declare float @llvm.amdgcn.strict.wwm.f32(float) declare float @llvm.amdgcn.wwm.f32(float) declare i32 @llvm.amdgcn.wqm.i32(i32) declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll --- a/llvm/test/CodeGen/AMDGPU/wqm.ll +++ b/llvm/test/CodeGen/AMDGPU/wqm.ll @@ -146,37 +146,37 @@ ret float %out.2 } -; Check that WWM is triggered by the wwm intrinsic. +; Check that WWM is triggered by the strict_wwm intrinsic. ; -;CHECK-LABEL: {{^}}test_wwm1: +;CHECK-LABEL: {{^}}test_strict_wwm1: ;CHECK: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1 ;CHECK: buffer_load_dword ;CHECK: buffer_load_dword ;CHECK: v_add_f32_e32 -define amdgpu_ps float @test_wwm1(i32 inreg %idx0, i32 inreg %idx1) { +define amdgpu_ps float @test_strict_wwm1(i32 inreg %idx0, i32 inreg %idx1) { main_body: %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0) %out = fadd float %src0, %src1 - %out.0 = call float @llvm.amdgcn.wwm.f32(float %out) + %out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out) ret float %out.0 } ; Same as above, but with an integer type. ; -;CHECK-LABEL: {{^}}test_wwm2: +;CHECK-LABEL: {{^}}test_strict_wwm2: ;CHECK: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1 ;CHECK: buffer_load_dword ;CHECK: buffer_load_dword ;CHECK: v_add_{{[iu]}}32_e32 -define amdgpu_ps float @test_wwm2(i32 inreg %idx0, i32 inreg %idx1) { +define amdgpu_ps float @test_strict_wwm2(i32 inreg %idx0, i32 inreg %idx1) { main_body: %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0) %src0.0 = bitcast float %src0 to i32 %src1.0 = bitcast float %src1 to i32 %out = add i32 %src0.0, %src1.0 - %out.0 = call i32 @llvm.amdgcn.wwm.i32(i32 %out) + %out.0 = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %out) %out.1 = bitcast i32 %out.0 to float ret float %out.1 } @@ -185,13 +185,13 @@ ; since that will lead clobbering things that aren't supposed to be clobbered ; in cases like this. 
; -;CHECK-LABEL: {{^}}test_wwm3: +;CHECK-LABEL: {{^}}test_strict_wwm3: ;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1 ;CHECK: buffer_load_dword ;CHECK: v_add_f32_e32 ;CHECK: s_mov_b64 exec, [[ORIG]] ;CHECK: v_add_f32_e32 -define amdgpu_ps float @test_wwm3(i32 inreg %idx) { +define amdgpu_ps float @test_strict_wwm3(i32 inreg %idx) { main_body: ; use mbcnt to make sure the branch is divergent %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) @@ -202,7 +202,7 @@ if: %src = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0) %out = fadd float %src, %src - %out.0 = call float @llvm.amdgcn.wwm.f32(float %out) + %out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out) %out.1 = fadd float %src, %out.0 br label %endif @@ -214,13 +214,13 @@ ; Check that WWM writes aren't coalesced with non-WWM writes, since the WWM ; write could clobber disabled channels in the non-WWM one. ; -;CHECK-LABEL: {{^}}test_wwm4: +;CHECK-LABEL: {{^}}test_strict_wwm4: ;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1 ;CHECK: buffer_load_dword ;CHECK: v_add_f32_e32 ;CHECK: s_mov_b64 exec, [[ORIG]] ;CHECK-NEXT: v_mov_b32_e32 -define amdgpu_ps float @test_wwm4(i32 inreg %idx) { +define amdgpu_ps float @test_strict_wwm4(i32 inreg %idx) { main_body: ; use mbcnt to make sure the branch is divergent %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) @@ -231,7 +231,7 @@ if: %src = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0) %out = fadd float %src, %src - %out.0 = call float @llvm.amdgcn.wwm.f32(float %out) + %out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out) br label %endif endif: @@ -241,7 +241,7 @@ ; Make sure the transition from Exact to WWM then WQM works properly. ; -;CHECK-LABEL: {{^}}test_wwm5: +;CHECK-LABEL: {{^}}test_strict_wwm5: ;CHECK: buffer_load_dword ;CHECK: buffer_store_dword ;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1 @@ -249,13 +249,13 @@ ;CHECK: v_add_f32_e32 ;CHECK: s_mov_b64 exec, [[ORIG]] ;CHECK: s_wqm_b64 exec, exec -define amdgpu_ps float @test_wwm5(i32 inreg %idx0, i32 inreg %idx1) { +define amdgpu_ps float @test_strict_wwm5(i32 inreg %idx0, i32 inreg %idx1) { main_body: %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) call void @llvm.amdgcn.struct.buffer.store.f32(float %src0, <4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0) %temp = fadd float %src1, %src1 - %temp.0 = call float @llvm.amdgcn.wwm.f32(float %temp) + %temp.0 = call float @llvm.amdgcn.strict.wwm.f32(float %temp) %out = fadd float %temp.0, %temp.0 %out.0 = call float @llvm.amdgcn.wqm.f32(float %out) ret float %out.0 @@ -264,7 +264,7 @@ ; Check that WWM is turned on correctly across basic block boundaries. 
; if..then..endif version ; -;CHECK-LABEL: {{^}}test_wwm6_then: +;CHECK-LABEL: {{^}}test_strict_wwm6_then: ;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1 ;SI-CHECK: buffer_load_dword ;VI-CHECK: flat_load_dword @@ -275,7 +275,7 @@ ;VI-CHECK: flat_load_dword ;CHECK: v_add_f32_e32 ;CHECK: s_mov_b64 exec, [[ORIG2]] -define amdgpu_ps float @test_wwm6_then() { +define amdgpu_ps float @test_strict_wwm6_then() { main_body: %src0 = load volatile float, float addrspace(1)* undef ; use mbcnt to make sure the branch is divergent @@ -287,7 +287,7 @@ if: %src1 = load volatile float, float addrspace(1)* undef %out = fadd float %src0, %src1 - %out.0 = call float @llvm.amdgcn.wwm.f32(float %out) + %out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out) br label %endif endif: @@ -298,7 +298,7 @@ ; Check that WWM is turned on correctly across basic block boundaries. ; loop version ; -;CHECK-LABEL: {{^}}test_wwm6_loop: +;CHECK-LABEL: {{^}}test_strict_wwm6_loop: ;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1 ;SI-CHECK: buffer_load_dword ;VI-CHECK: flat_load_dword @@ -308,7 +308,7 @@ ;SI-CHECK: buffer_load_dword ;VI-CHECK: flat_load_dword ;CHECK: s_mov_b64 exec, [[ORIG2]] -define amdgpu_ps float @test_wwm6_loop() { +define amdgpu_ps float @test_strict_wwm6_loop() { main_body: %src0 = load volatile float, float addrspace(1)* undef ; use mbcnt to make sure the branch is divergent @@ -320,7 +320,7 @@ %counter = phi i32 [ %lo, %main_body ], [ %counter.1, %loop ] %src1 = load volatile float, float addrspace(1)* undef %out = fadd float %src0, %src1 - %out.0 = call float @llvm.amdgcn.wwm.f32(float %out) + %out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out) %counter.1 = sub i32 %counter, 1 %cc = icmp ne i32 %counter.1, 0 br i1 %cc, label %loop, label %endloop @@ -344,7 +344,7 @@ %src.0 = bitcast float %src to i32 %src.1 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %src.0, i32 0) %out = add i32 %src.1, %src.1 - %out.0 = call i32 @llvm.amdgcn.wwm.i32(i32 %out) + %out.0 = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %out) %out.1 = bitcast i32 %out.0 to float call void @llvm.amdgcn.struct.buffer.store.f32(float %out.1, <4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0) ret void @@ -791,6 +791,246 @@ ret <4 x float> %r } +; Check a case of a block being entirely WQM except for a bit of WWM. +; There was a bug where it forgot to enter and leave WWM. 
+; +;CHECK-LABEL: {{^}}test_strict_wwm_within_wqm: +;CHECK: %IF +;CHECK: s_or_saveexec_b64 {{.*}}, -1 +;CHECK: ds_swizzle +; +define amdgpu_ps float @test_strict_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) { +main_body: + %c.bc = bitcast i32 %c to float + %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 + %tex0 = extractelement <4 x float> %tex, i32 0 + %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 + %cmp = icmp eq i32 %z, 0 + br i1 %cmp, label %IF, label %ENDIF + +IF: + %dataf = extractelement <4 x float> %dtex, i32 0 + %data1 = fptosi float %dataf to i32 + %data2 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %data1, i32 0) + %data3 = call i32 @llvm.amdgcn.ds.swizzle(i32 %data2, i32 2079) + %data4 = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %data3) + %data4f = sitofp i32 %data4 to float + br label %ENDIF + +ENDIF: + %r = phi float [ %data4f, %IF ], [ 0.0, %main_body ] + ret float %r +} + +; NOTE: llvm.amdgcn.wwm is deprecated, use llvm.amdgcn.strict.wwm instead. + +; Check that WWM is triggered by the wwm intrinsic. +; +;CHECK-LABEL: {{^}}test_wwm1: +;CHECK: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1 +;CHECK: buffer_load_dword +;CHECK: buffer_load_dword +;CHECK: v_add_f32_e32 +define amdgpu_ps float @test_wwm1(i32 inreg %idx0, i32 inreg %idx1) { +main_body: + %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) + %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0) + %out = fadd float %src0, %src1 + %out.0 = call float @llvm.amdgcn.wwm.f32(float %out) + ret float %out.0 +} + +; Same as above, but with an integer type. +; +;CHECK-LABEL: {{^}}test_wwm2: +;CHECK: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1 +;CHECK: buffer_load_dword +;CHECK: buffer_load_dword +;CHECK: v_add_{{[iu]}}32_e32 +define amdgpu_ps float @test_wwm2(i32 inreg %idx0, i32 inreg %idx1) { +main_body: + %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) + %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0) + %src0.0 = bitcast float %src0 to i32 + %src1.0 = bitcast float %src1 to i32 + %out = add i32 %src0.0, %src1.0 + %out.0 = call i32 @llvm.amdgcn.wwm.i32(i32 %out) + %out.1 = bitcast i32 %out.0 to float + ret float %out.1 +} + +; Check that we don't leave WWM on for computations that don't require WWM, +; since that will lead clobbering things that aren't supposed to be clobbered +; in cases like this. 
+; +;CHECK-LABEL: {{^}}test_wwm3: +;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1 +;CHECK: buffer_load_dword +;CHECK: v_add_f32_e32 +;CHECK: s_mov_b64 exec, [[ORIG]] +;CHECK: v_add_f32_e32 +define amdgpu_ps float @test_wwm3(i32 inreg %idx) { +main_body: + ; use mbcnt to make sure the branch is divergent + %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) + %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo) + %cc = icmp uge i32 %hi, 32 + br i1 %cc, label %endif, label %if + +if: + %src = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0) + %out = fadd float %src, %src + %out.0 = call float @llvm.amdgcn.wwm.f32(float %out) + %out.1 = fadd float %src, %out.0 + br label %endif + +endif: + %out.2 = phi float [ %out.1, %if ], [ 0.0, %main_body ] + ret float %out.2 +} + + +; Check that WWM writes aren't coalesced with non-WWM writes, since the WWM +; write could clobber disabled channels in the non-WWM one. +; +;CHECK-LABEL: {{^}}test_wwm4: +;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1 +;CHECK: buffer_load_dword +;CHECK: v_add_f32_e32 +;CHECK: s_mov_b64 exec, [[ORIG]] +;CHECK-NEXT: v_mov_b32_e32 +define amdgpu_ps float @test_wwm4(i32 inreg %idx) { +main_body: + ; use mbcnt to make sure the branch is divergent + %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) + %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo) + %cc = icmp uge i32 %hi, 32 + br i1 %cc, label %endif, label %if + +if: + %src = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0) + %out = fadd float %src, %src + %out.0 = call float @llvm.amdgcn.wwm.f32(float %out) + br label %endif + +endif: + %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ] + ret float %out.1 +} + +; Make sure the transition from Exact to WWM then WQM works properly. +; +;CHECK-LABEL: {{^}}test_wwm5: +;CHECK: buffer_load_dword +;CHECK: buffer_store_dword +;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1 +;CHECK: buffer_load_dword +;CHECK: v_add_f32_e32 +;CHECK: s_mov_b64 exec, [[ORIG]] +;CHECK: s_wqm_b64 exec, exec +define amdgpu_ps float @test_wwm5(i32 inreg %idx0, i32 inreg %idx1) { +main_body: + %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.f32(float %src0, <4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) + %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0) + %temp = fadd float %src1, %src1 + %temp.0 = call float @llvm.amdgcn.wwm.f32(float %temp) + %out = fadd float %temp.0, %temp.0 + %out.0 = call float @llvm.amdgcn.wqm.f32(float %out) + ret float %out.0 +} + +; Check that WWM is turned on correctly across basic block boundaries. 
+; if..then..endif version +; +;CHECK-LABEL: {{^}}test_wwm6_then: +;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1 +;SI-CHECK: buffer_load_dword +;VI-CHECK: flat_load_dword +;CHECK: s_mov_b64 exec, [[ORIG]] +;CHECK: %if +;CHECK: s_or_saveexec_b64 [[ORIG2:s\[[0-9]+:[0-9]+\]]], -1 +;SI-CHECK: buffer_load_dword +;VI-CHECK: flat_load_dword +;CHECK: v_add_f32_e32 +;CHECK: s_mov_b64 exec, [[ORIG2]] +define amdgpu_ps float @test_wwm6_then() { +main_body: + %src0 = load volatile float, float addrspace(1)* undef + ; use mbcnt to make sure the branch is divergent + %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) + %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo) + %cc = icmp uge i32 %hi, 32 + br i1 %cc, label %endif, label %if + +if: + %src1 = load volatile float, float addrspace(1)* undef + %out = fadd float %src0, %src1 + %out.0 = call float @llvm.amdgcn.wwm.f32(float %out) + br label %endif + +endif: + %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ] + ret float %out.1 +} + + +; Check that WWM is turned on correctly across basic block boundaries. +; loop version +; +;CHECK-LABEL: {{^}}test_wwm6_loop: +;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1 +;SI-CHECK: buffer_load_dword +;VI-CHECK: flat_load_dword +;CHECK: s_mov_b64 exec, [[ORIG]] +;CHECK: %loop +;CHECK: s_or_saveexec_b64 [[ORIG2:s\[[0-9]+:[0-9]+\]]], -1 +;SI-CHECK: buffer_load_dword +;VI-CHECK: flat_load_dword +;CHECK: s_mov_b64 exec, [[ORIG2]] +define amdgpu_ps float @test_wwm6_loop() { +main_body: + %src0 = load volatile float, float addrspace(1)* undef + ; use mbcnt to make sure the branch is divergent + %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) + %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo) + br label %loop + +loop: + %counter = phi i32 [ %lo, %main_body ], [ %counter.1, %loop ] + %src1 = load volatile float, float addrspace(1)* undef + %out = fadd float %src0, %src1 + %out.0 = call float @llvm.amdgcn.wwm.f32(float %out) + %counter.1 = sub i32 %counter, 1 + %cc = icmp ne i32 %counter.1, 0 + br i1 %cc, label %loop, label %endloop + +endloop: + ret float %out.0 +} + +; Check that @llvm.amdgcn.set.inactive disables WWM. +; +;CHECK-LABEL: {{^}}test_wwm_set_inactive1: +;CHECK: buffer_load_dword +;CHECK: s_not_b64 exec, exec +;CHECK: v_mov_b32_e32 +;CHECK: s_not_b64 exec, exec +;CHECK: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1 +;CHECK: v_add_{{[iu]}}32_e32 +define amdgpu_ps void @test_wwm_set_inactive1(i32 inreg %idx) { +main_body: + %src = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0) + %src.0 = bitcast float %src to i32 + %src.1 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %src.0, i32 0) + %out = add i32 %src.1, %src.1 + %out.0 = call i32 @llvm.amdgcn.wwm.i32(i32 %out) + %out.1 = bitcast i32 %out.0 to float + call void @llvm.amdgcn.struct.buffer.store.f32(float %out.1, <4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0) + ret void +} + + ; Check a case of a block being entirely WQM except for a bit of WWM. ; There was a bug where it forgot to enter and leave WWM. 
; @@ -838,6 +1078,8 @@ declare void @llvm.amdgcn.kill(i1) #1 declare float @llvm.amdgcn.wqm.f32(float) #3 declare i32 @llvm.amdgcn.wqm.i32(i32) #3 +declare float @llvm.amdgcn.strict.wwm.f32(float) #3 +declare i32 @llvm.amdgcn.strict.wwm.i32(i32) #3 declare float @llvm.amdgcn.wwm.f32(float) #3 declare i32 @llvm.amdgcn.wwm.i32(i32) #3 declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32) #4 diff --git a/llvm/test/CodeGen/AMDGPU/wqm.mir b/llvm/test/CodeGen/AMDGPU/wqm.mir --- a/llvm/test/CodeGen/AMDGPU/wqm.mir +++ b/llvm/test/CodeGen/AMDGPU/wqm.mir @@ -3,10 +3,10 @@ --- # Check for awareness that s_or_saveexec_b64 clobbers SCC # -#CHECK: ENTER_WWM +#CHECK: ENTER_STRICT_WWM #CHECK: S_CMP_LT_I32 #CHECK: S_CSELECT_B32 -name: test_wwm_scc +name: test_strict_wwm_scc alignment: 1 exposesReturnsTwice: false legalized: false @@ -44,7 +44,7 @@ %12 = V_ADD_CO_U32_e32 %3, %3, implicit-def $vcc, implicit $exec %5 = S_CSELECT_B32 %2, %1, implicit $scc %11 = V_ADD_CO_U32_e32 %5, %12, implicit-def $vcc, implicit $exec - $vgpr0 = WWM %11, implicit $exec + $vgpr0 = STRICT_WWM %11, implicit $exec SI_RETURN_TO_EPILOG $vgpr0 ... @@ -56,10 +56,10 @@ #CHECK: %bb.1 #CHECK: S_CMP_LT_I32 #CHECK: COPY $scc -#CHECK: ENTER_WWM +#CHECK: ENTER_STRICT_WWM #CHECK: $scc = COPY #CHECK: S_CSELECT_B32 -name: test_wwm_scc2 +name: test_strict_wwm_scc2 tracksRegLiveness: true body: | bb.0: @@ -77,7 +77,7 @@ %12:vgpr_32 = V_ADD_CO_U32_e32 %3:vgpr_32, %3:vgpr_32, implicit-def $vcc, implicit $exec %5:sgpr_32 = S_CSELECT_B32 %2:sgpr_32, %1:sgpr_32, implicit $scc %11:vgpr_32 = V_ADD_CO_U32_e32 %5:sgpr_32, %12:vgpr_32, implicit-def $vcc, implicit $exec - $vgpr0 = WWM %11:vgpr_32, implicit $exec + $vgpr0 = STRICT_WWM %11:vgpr_32, implicit $exec $vgpr1 = COPY %10:vgpr_32 SI_RETURN_TO_EPILOG $vgpr0, $vgpr1 @@ -136,19 +136,19 @@ %10:vgpr_32 = V_SET_INACTIVE_B32 %11, undef %12:sreg_32, implicit $exec, implicit-def $scc %14:vgpr_32 = COPY %7 %13:vgpr_32 = V_MOV_B32_dpp %14, killed %10, 323, 12, 15, 0, implicit $exec - early-clobber %15:vgpr_32 = WWM killed %13, implicit $exec + early-clobber %15:vgpr_32 = STRICT_WWM killed %13, implicit $exec BUFFER_STORE_DWORD_OFFSET_exact killed %15, %6, %7, 4, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... 
--- -# Ensure that wwm is not put around an EXEC copy +# Ensure that strict_wwm is not put around an EXEC copy #CHECK-LABEL: name: copy_exec #CHECK: %7:sreg_64 = COPY $exec -#CHECK-NEXT: %14:sreg_64 = ENTER_WWM -1, implicit-def $exec, implicit-def $scc, implicit $exec +#CHECK-NEXT: %14:sreg_64 = ENTER_STRICT_WWM -1, implicit-def $exec, implicit-def $scc, implicit $exec #CHECK-NEXT: %8:vgpr_32 = V_MOV_B32_e32 0, implicit $exec -#CHECK-NEXT: $exec = EXIT_WWM %14 +#CHECK-NEXT: $exec = EXIT_STRICT_WWM %14 #CHECK-NEXT: %9:vgpr_32 = V_MBCNT_LO_U32_B32_e64 %7.sub0, 0, implicit $exec name: copy_exec tracksRegLiveness: true @@ -169,7 +169,7 @@ %10:vgpr_32 = V_MBCNT_LO_U32_B32_e64 %8.sub0:sreg_64, 0, implicit $exec %11:vgpr_32 = V_MOV_B32_dpp %9:vgpr_32, %10:vgpr_32, 312, 15, 15, 0, implicit $exec %12:sreg_32 = V_READLANE_B32 %11:vgpr_32, 63 - early-clobber %13:sreg_32 = WWM %9:vgpr_32, implicit $exec + early-clobber %13:sreg_32 = STRICT_WWM %9:vgpr_32, implicit $exec %14:vgpr_32 = COPY %13 BUFFER_STORE_DWORD_OFFSET_exact killed %14, %4, %5, 4, 0, 0, 0, 0, 0, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll @@ -17,14 +17,14 @@ ; GFX9-DAG: v_mov_b32_e32 v[[FIRST:[0-9]+]], v[[FIRST_ADD]] %tmp120 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp105, i32 323, i32 12, i32 15, i1 false) %tmp121 = add i32 %tmp105, %tmp120 - %tmp122 = tail call i32 @llvm.amdgcn.wwm.i32(i32 %tmp121) + %tmp122 = tail call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp121) ; GFX9-DAG: v_mov_b32_dpp v[[SECOND_MOV:[0-9]+]], v{{[0-9]+}} row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9-DAG: v_add_u32_e32 v[[SECOND_ADD:[0-9]+]], v{{[0-9]+}}, v[[SECOND_MOV]] ; GFX9-DAG: v_mov_b32_e32 v[[SECOND:[0-9]+]], v[[SECOND_ADD]] %tmp135 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp107, i32 323, i32 12, i32 15, i1 false) %tmp136 = add i32 %tmp107, %tmp135 - %tmp137 = tail call i32 @llvm.amdgcn.wwm.i32(i32 %tmp136) + %tmp137 = tail call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp136) ; GFX9-O3: v_cmp_eq_u32_e32 vcc, v[[FIRST]], v[[SECOND]] ; GFX9-O0: v_cmp_eq_u32_e64 s{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v[[FIRST]], v[[SECOND]] @@ -51,7 +51,7 @@ ; GFX9-O0: buffer_store_dword v[[FIRST]], off, s{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, 0 offset:[[FIRST_IMM_OFFSET:[0-9]+]] %tmp120 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp105, i32 323, i32 12, i32 15, i1 false) %tmp121 = add i32 %tmp105, %tmp120 - %tmp122 = tail call i32 @llvm.amdgcn.wwm.i32(i32 %tmp121) + %tmp122 = tail call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp121) %cond = icmp eq i32 %arg, 0 br i1 %cond, label %if, label %merge @@ -65,7 +65,7 @@ ; GFX9-O0: buffer_store_dword v[[SECOND]], off, s{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, 0 offset:[[SECOND_IMM_OFFSET:[0-9]+]] %tmp135 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp107, i32 323, i32 12, i32 15, i1 false) %tmp136 = add i32 %tmp107, %tmp135 - %tmp137 = tail call i32 @llvm.amdgcn.wwm.i32(i32 %tmp136) + %tmp137 = tail call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp136) br label %merge merge: @@ -114,7 +114,7 @@ ; GFX9: v_mov_b32_e32 v1, v0 ; GFX9: v_add_u32_e32 v1, v1, v2 %tmp136 = add i32 %tmp134, %tmp107 - %tmp137 = tail call i32 @llvm.amdgcn.wwm.i32(i32 %tmp136) + %tmp137 = tail call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp136) ; GFX9: buffer_store_dword v0 call void @llvm.amdgcn.raw.buffer.store.i32(i32 %tmp137, <4 x i32> %tmp14, i32 4, i32 0, i32 0) 
ret void @@ -151,7 +151,7 @@ ; GFX9: s_swappc_b64 %tmp134 = call i64 @called_i64(i64 %tmp107) %tmp136 = add i64 %tmp134, %tmp107 - %tmp137 = tail call i64 @llvm.amdgcn.wwm.i64(i64 %tmp136) + %tmp137 = tail call i64 @llvm.amdgcn.strict.wwm.i64(i64 %tmp136) %tmp138 = bitcast i64 %tmp137 to <2 x i32> ; GFX9: buffer_store_dwordx2 call void @llvm.amdgcn.raw.buffer.store.v2i32(<2 x i32> %tmp138, <4 x i32> %tmp14, i32 4, i32 0, i32 0) @@ -165,6 +165,194 @@ %tmp18 = tail call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> %desc, i32 %tmp17, i32 0) %.i0.upto1.bc = bitcast <4 x i32> %tmp18 to <2 x i64> %tmp19 = or i32 %tmp17, 16 +; GFX9: buffer_load_dwordx2 + %tmp20 = tail call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %desc, i32 %tmp19, i32 0) + %.i0.upto1.extract = extractelement <2 x i64> %.i0.upto1.bc, i32 0 + %tmp22 = tail call i64 @llvm.amdgcn.set.inactive.i64(i64 %.i0.upto1.extract, i64 9223372036854775807) + %tmp97 = tail call i64 @llvm.amdgcn.strict.wwm.i64(i64 %tmp22) + %.i1.upto1.extract = extractelement <2 x i64> %.i0.upto1.bc, i32 1 + %tmp99 = tail call i64 @llvm.amdgcn.set.inactive.i64(i64 %.i1.upto1.extract, i64 9223372036854775807) + %tmp174 = tail call i64 @llvm.amdgcn.strict.wwm.i64(i64 %tmp99) + %.i25 = bitcast <2 x i32> %tmp20 to i64 + %tmp176 = tail call i64 @llvm.amdgcn.set.inactive.i64(i64 %.i25, i64 9223372036854775807) + %tmp251 = tail call i64 @llvm.amdgcn.strict.wwm.i64(i64 %tmp176) + %.cast = bitcast i64 %tmp97 to <2 x float> + %.cast6 = bitcast i64 %tmp174 to <2 x float> + %.cast7 = bitcast i64 %tmp251 to <2 x float> + %tmp254 = shufflevector <2 x float> %.cast, <2 x float> %.cast6, <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; GFX9: buffer_store_dwordx4 + tail call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %tmp254, <4 x i32> %desc, i32 %tmp17, i32 0, i32 0) + ; GFX9: buffer_store_dwordx2 + tail call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %.cast7, <4 x i32> %desc, i32 %tmp19, i32 0, i32 0) + ret void +} + +; NOTE: llvm.amdgcn.wwm is deprecated, use llvm.amdgcn.strict.wwm instead. 
+ +; GFX9-LABEL: {{^}}wwm_no_cfg: +define amdgpu_cs void @wwm_no_cfg(<4 x i32> inreg %tmp14) { + %tmp100 = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %tmp14, i32 0, i32 0, i32 0) + %tmp101 = bitcast <2 x float> %tmp100 to <2 x i32> + %tmp102 = extractelement <2 x i32> %tmp101, i32 0 + %tmp103 = extractelement <2 x i32> %tmp101, i32 1 + %tmp105 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %tmp102, i32 0) + %tmp107 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %tmp103, i32 0) + +; GFX9: s_or_saveexec_b64 s{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, -1 + +; GFX9-DAG: v_mov_b32_dpp v[[FIRST_MOV:[0-9]+]], v{{[0-9]+}} row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DAG: v_add_u32_e32 v[[FIRST_ADD:[0-9]+]], v{{[0-9]+}}, v[[FIRST_MOV]] +; GFX9-DAG: v_mov_b32_e32 v[[FIRST:[0-9]+]], v[[FIRST_ADD]] + %tmp120 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp105, i32 323, i32 12, i32 15, i1 false) + %tmp121 = add i32 %tmp105, %tmp120 + %tmp122 = tail call i32 @llvm.amdgcn.wwm.i32(i32 %tmp121) + +; GFX9-DAG: v_mov_b32_dpp v[[SECOND_MOV:[0-9]+]], v{{[0-9]+}} row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DAG: v_add_u32_e32 v[[SECOND_ADD:[0-9]+]], v{{[0-9]+}}, v[[SECOND_MOV]] +; GFX9-DAG: v_mov_b32_e32 v[[SECOND:[0-9]+]], v[[SECOND_ADD]] + %tmp135 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp107, i32 323, i32 12, i32 15, i1 false) + %tmp136 = add i32 %tmp107, %tmp135 + %tmp137 = tail call i32 @llvm.amdgcn.wwm.i32(i32 %tmp136) + +; GFX9-O3: v_cmp_eq_u32_e32 vcc, v[[FIRST]], v[[SECOND]] +; GFX9-O0: v_cmp_eq_u32_e64 s{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v[[FIRST]], v[[SECOND]] + %tmp138 = icmp eq i32 %tmp122, %tmp137 + %tmp139 = sext i1 %tmp138 to i32 + %tmp140 = shl nsw i32 %tmp139, 1 + %tmp141 = and i32 %tmp140, 2 + %tmp145 = bitcast i32 %tmp141 to float + call void @llvm.amdgcn.raw.buffer.store.f32(float %tmp145, <4 x i32> %tmp14, i32 4, i32 0, i32 0) + ret void +} + +; GFX9-LABEL: {{^}}wwm_cfg: +define amdgpu_cs void @wwm_cfg(<4 x i32> inreg %tmp14, i32 %arg) { +entry: + %tmp100 = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %tmp14, i32 0, i32 0, i32 0) + %tmp101 = bitcast <2 x float> %tmp100 to <2 x i32> + %tmp102 = extractelement <2 x i32> %tmp101, i32 0 + %tmp105 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %tmp102, i32 0) + +; GFX9: v_mov_b32_dpp v[[FIRST_MOV:[0-9]+]], v{{[0-9]+}} row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9: v_add_u32_e32 v[[FIRST_ADD:[0-9]+]], v{{[0-9]+}}, v[[FIRST_MOV]] +; GFX9: v_mov_b32_e32 v[[FIRST:[0-9]+]], v[[FIRST_ADD]] +; GFX9-O0: buffer_store_dword v[[FIRST]], off, s{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, 0 offset:[[FIRST_IMM_OFFSET:[0-9]+]] + %tmp120 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp105, i32 323, i32 12, i32 15, i1 false) + %tmp121 = add i32 %tmp105, %tmp120 + %tmp122 = tail call i32 @llvm.amdgcn.wwm.i32(i32 %tmp121) + + %cond = icmp eq i32 %arg, 0 + br i1 %cond, label %if, label %merge +if: + %tmp103 = extractelement <2 x i32> %tmp101, i32 1 + %tmp107 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %tmp103, i32 0) + +; GFX9: v_mov_b32_dpp v[[SECOND_MOV:[0-9]+]], v{{[0-9]+}} row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9: v_add_u32_e32 v[[SECOND_ADD:[0-9]+]], v{{[0-9]+}}, v[[SECOND_MOV]] +; GFX9: v_mov_b32_e32 v[[SECOND:[0-9]+]], v[[SECOND_ADD]] +; GFX9-O0: buffer_store_dword v[[SECOND]], off, s{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, 0 offset:[[SECOND_IMM_OFFSET:[0-9]+]] + %tmp135 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp107, i32 323, i32 12, i32 15, i1 false) + %tmp136 
= add i32 %tmp107, %tmp135 + %tmp137 = tail call i32 @llvm.amdgcn.wwm.i32(i32 %tmp136) + br label %merge + +merge: + %merge_value = phi i32 [ 0, %entry ], [%tmp137, %if ] +; GFX9-O3: v_cmp_eq_u32_e32 vcc, v[[FIRST]], v[[SECOND]] +; GFX9-O0: buffer_load_dword v[[FIRST:[0-9]+]], off, s{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, 0 offset:[[FIRST_IMM_OFFSET]] +; GFX9-O0: buffer_load_dword v[[SECOND:[0-9]+]], off, s{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, 0 offset:[[SECOND_IMM_OFFSET]] +; GFX9-O0: v_cmp_eq_u32_e64 s{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v[[FIRST]], v[[SECOND]] + %tmp138 = icmp eq i32 %tmp122, %merge_value + %tmp139 = sext i1 %tmp138 to i32 + %tmp140 = shl nsw i32 %tmp139, 1 + %tmp141 = and i32 %tmp140, 2 + %tmp145 = bitcast i32 %tmp141 to float + call void @llvm.amdgcn.raw.buffer.store.f32(float %tmp145, <4 x i32> %tmp14, i32 4, i32 0, i32 0) + ret void +} + +; GFX9-LABEL: {{^}}wwm_called: +define hidden i32 @wwm_called(i32 %a) noinline { +; GFX9: v_add_u32_e32 v1, v0, v0 + %add = add i32 %a, %a +; GFX9: v_mul_lo_u32 v0, v1, v0 + %mul = mul i32 %add, %a +; GFX9: v_sub_u32_e32 v0, v0, v1 + %sub = sub i32 %mul, %add + ret i32 %sub +} + +; GFX9-LABEL: {{^}}wwm_call: +define amdgpu_kernel void @wwm_call(<4 x i32> inreg %tmp14, i32 inreg %arg) { +; GFX9-DAG: s_load_dword [[ARG:s[0-9]+]] +; GFX9-O0-DAG: s_mov_b32 s0, 0{{$}} +; GFX9-O0-DAG: v_mov_b32_e32 v0, [[ARG]] +; GFX9-O0-DAG: v_mov_b32_e32 v2, v0 + +; GFX9-O3: v_mov_b32_e32 v2, [[ARG]] + +; GFX9-NEXT: s_not_b64 exec, exec +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_not_b64 exec, exec + %tmp107 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %arg, i32 0) +; GFX9: v_mov_b32_e32 v0, v2 +; GFX9: s_swappc_b64 + %tmp134 = call i32 @wwm_called(i32 %tmp107) +; GFX9: v_mov_b32_e32 v1, v0 +; GFX9: v_add_u32_e32 v1, v1, v2 + %tmp136 = add i32 %tmp134, %tmp107 + %tmp137 = tail call i32 @llvm.amdgcn.wwm.i32(i32 %tmp136) +; GFX9: buffer_store_dword v0 + call void @llvm.amdgcn.raw.buffer.store.i32(i32 %tmp137, <4 x i32> %tmp14, i32 4, i32 0, i32 0) + ret void +} + +; GFX9-LABEL: {{^}}wwm_called_i64: +define i64 @wwm_called_i64(i64 %a) noinline { + %add = add i64 %a, %a + %mul = mul i64 %add, %a + %sub = sub i64 %mul, %add + ret i64 %sub +} + +; GFX9-LABEL: {{^}}wwm_call_i64: +define amdgpu_kernel void @wwm_call_i64(<4 x i32> inreg %tmp14, i64 inreg %arg) { +; GFX9: s_load_dwordx2 s{{\[}}[[ARG_LO:[0-9]+]]:[[ARG_HI:[0-9]+]]{{\]}} + +; GFX9-O0: s_mov_b64 s{{\[}}[[ZERO_LO:[0-9]+]]:[[ZERO_HI:[0-9]+]]{{\]}}, 0{{$}} +; GFX9-O0: v_mov_b32_e32 v0, s[[ARG_LO]] +; GFX9-O0: v_mov_b32_e32 v1, s[[ARG_HI]] +; GFX9-O0-DAG: v_mov_b32_e32 v10, v1 +; GFX9-O0-DAG: v_mov_b32_e32 v9, v0 + +; GFX9-O3-DAG: v_mov_b32_e32 v7, s[[ARG_HI]] +; GFX9-O3-DAG: v_mov_b32_e32 v6, s[[ARG_LO]] + +; GFX9: s_not_b64 exec, exec +; GFX9-O0-NEXT: v_mov_b32_e32 v9, s[[ZERO_LO]] +; GFX9-O0-NEXT: v_mov_b32_e32 v10, s[[ZERO_HI]] +; GFX9-O3-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-O3-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-NEXT: s_not_b64 exec, exec + %tmp107 = tail call i64 @llvm.amdgcn.set.inactive.i64(i64 %arg, i64 0) +; GFX9: s_swappc_b64 + %tmp134 = call i64 @wwm_called_i64(i64 %tmp107) + %tmp136 = add i64 %tmp134, %tmp107 + %tmp137 = tail call i64 @llvm.amdgcn.wwm.i64(i64 %tmp136) + %tmp138 = bitcast i64 %tmp137 to <2 x i32> +; GFX9: buffer_store_dwordx2 + call void @llvm.amdgcn.raw.buffer.store.v2i32(<2 x i32> %tmp138, <4 x i32> %tmp14, i32 4, i32 0, i32 0) + ret void +} + +; GFX9-LABEL: {{^}}wwm_amdgpu_cs_main: +define amdgpu_cs void @wwm_amdgpu_cs_main(<4 x i32> 
inreg %desc, i32 %index) { + %tmp17 = shl i32 %index, 5 +; GFX9: buffer_load_dwordx4 + %tmp18 = tail call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> %desc, i32 %tmp17, i32 0) + %.i0.upto1.bc = bitcast <4 x i32> %tmp18 to <2 x i64> + %tmp19 = or i32 %tmp17, 16 ; GFX9: buffer_load_dwordx2 %tmp20 = tail call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %desc, i32 %tmp19, i32 0) %.i0.upto1.extract = extractelement <2 x i64> %.i0.upto1.bc, i32 0 @@ -187,6 +375,8 @@ ret void } +declare i32 @llvm.amdgcn.strict.wwm.i32(i32) +declare i64 @llvm.amdgcn.strict.wwm.i64(i64) declare i32 @llvm.amdgcn.wwm.i32(i32) declare i64 @llvm.amdgcn.wwm.i64(i64) declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32)
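
For IR producers, migrating off the deprecated intrinsic is a one-for-one rename, as the updated tests above show. The sketch below is illustrative only and not part of the patch; the function name is made up, and only the intrinsic name/signature is taken from this change.

; Minimal migration sketch (hypothetical function, not part of this patch):
; a call to the deprecated @llvm.amdgcn.wwm.* is replaced one-for-one by
; @llvm.amdgcn.strict.wwm.* with the same operand and result types.
define amdgpu_ps float @strict_wwm_migration_example(float %src0, float %src1) {
main_body:
  %sum = fadd float %src0, %src1
  ; previously: %out = call float @llvm.amdgcn.wwm.f32(float %sum)
  %out = call float @llvm.amdgcn.strict.wwm.f32(float %sum)
  ret float %out
}

declare float @llvm.amdgcn.strict.wwm.f32(float)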