Index: llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -869,7 +869,9 @@ return true; } if (!SrcReg.isVirtual() || TRI->isAGPR(*MRI, SrcReg)) { - TII->moveToVALU(MI, MDT); + SIInstrWorklist worklist; + worklist.insert(&MI); + TII->moveToVALU(worklist, MDT); return true; } @@ -991,6 +993,10 @@ LoweringWorklist.push_back(C.second.ID); } + // Store all the V2S copy instructions that need to be moved to VALU + // in the Copies worklist. + SIInstrWorklist Copies; + while (!LoweringWorklist.empty()) { unsigned CurID = LoweringWorklist.pop_back_val(); auto CurInfoIt = V2SCopies.find(CurID); @@ -1013,10 +1019,13 @@ LLVM_DEBUG(dbgs() << "V2S copy " << *C.Copy << " is being turned to VALU\n"); V2SCopies.erase(C.ID); - TII->moveToVALU(*C.Copy, MDT); + Copies.insert(C.Copy); } } + TII->moveToVALU(Copies, MDT); + Copies.clear(); + // Now do actual lowering for (auto C : V2SCopies) { MachineInstr *MI = C.second.Copy; Index: llvm/lib/Target/AMDGPU/SIInstrInfo.h =================================================================== --- llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -41,6 +41,35 @@ static const MachineMemOperand::Flags MONoClobber = MachineMemOperand::MOTargetFlag1; +// Utility to store machine instructions worklist in order +// as defined by comparator Cmp. 
+struct SIInstrWorklist { + SIInstrWorklist() : InstrSet() {} + + void insert(MachineInstr *MI) { InstrSet.insert(MI); } + + MachineInstr *top() const { + auto iter = InstrSet.begin(); + return *iter; + } + + void erase_top() { + auto iter = InstrSet.begin(); + InstrSet.erase(iter); + } + + bool empty() const { return InstrSet.empty(); } + + void clear() { InstrSet.clear(); } + +private: + struct Cmp { + Cmp() {} + bool operator()(const MachineInstr *InstA, const MachineInstr *InstB) const; + }; + std::set InstrSet; +}; + class SIInstrInfo final : public AMDGPUGenInstrInfo { private: const SIRegisterInfo RI; @@ -81,57 +110,50 @@ void swapOperands(MachineInstr &Inst) const; std::pair - moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst, + moveScalarAddSub(SIInstrWorklist &Worklist, MachineInstr &Inst, MachineDominatorTree *MDT = nullptr) const; - void lowerSelect(SetVectorType &Worklist, MachineInstr &Inst, + void lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst, MachineDominatorTree *MDT = nullptr) const; - void lowerScalarAbs(SetVectorType &Worklist, - MachineInstr &Inst) const; + void lowerScalarAbs(SIInstrWorklist &Worklist, MachineInstr &Inst) const; - void lowerScalarXnor(SetVectorType &Worklist, - MachineInstr &Inst) const; + void lowerScalarXnor(SIInstrWorklist &Worklist, MachineInstr &Inst) const; - void splitScalarNotBinop(SetVectorType &Worklist, - MachineInstr &Inst, + void splitScalarNotBinop(SIInstrWorklist &Worklist, MachineInstr &Inst, unsigned Opcode) const; - void splitScalarBinOpN2(SetVectorType &Worklist, - MachineInstr &Inst, + void splitScalarBinOpN2(SIInstrWorklist &Worklist, MachineInstr &Inst, unsigned Opcode) const; - void splitScalar64BitUnaryOp(SetVectorType &Worklist, - MachineInstr &Inst, unsigned Opcode, - bool Swap = false) const; + void splitScalar64BitUnaryOp(SIInstrWorklist &Worklist, MachineInstr &Inst, + unsigned Opcode, bool Swap = false) const; - void splitScalar64BitAddSub(SetVectorType &Worklist, 
MachineInstr &Inst, + void splitScalar64BitAddSub(SIInstrWorklist &Worklist, MachineInstr &Inst, MachineDominatorTree *MDT = nullptr) const; - void splitScalar64BitBinaryOp(SetVectorType &Worklist, MachineInstr &Inst, + void splitScalar64BitBinaryOp(SIInstrWorklist &Worklist, MachineInstr &Inst, unsigned Opcode, MachineDominatorTree *MDT = nullptr) const; - void splitScalar64BitXnor(SetVectorType &Worklist, MachineInstr &Inst, - MachineDominatorTree *MDT = nullptr) const; + void splitScalar64BitXnor(SIInstrWorklist &Worklist, MachineInstr &Inst, + MachineDominatorTree *MDT = nullptr) const; - void splitScalar64BitBCNT(SetVectorType &Worklist, + void splitScalar64BitBCNT(SIInstrWorklist &Worklist, MachineInstr &Inst) const; - void splitScalar64BitBFE(SetVectorType &Worklist, - MachineInstr &Inst) const; - void movePackToVALU(SetVectorType &Worklist, - MachineRegisterInfo &MRI, + void splitScalar64BitBFE(SIInstrWorklist &Worklist, MachineInstr &Inst) const; + void movePackToVALU(SIInstrWorklist &Worklist, MachineRegisterInfo &MRI, MachineInstr &Inst) const; void addUsersToMoveToVALUWorklist(Register Reg, MachineRegisterInfo &MRI, - SetVectorType &Worklist) const; + SIInstrWorklist &Worklist) const; void addSCCDefUsersToVALUWorklist(MachineOperand &Op, MachineInstr &SCCDefInst, - SetVectorType &Worklist, + SIInstrWorklist &Worklist, Register NewCond = Register()) const; void addSCCDefsToVALUWorklist(MachineInstr *SCCUseInst, - SetVectorType &Worklist) const; + SIInstrWorklist &Worklist) const; const TargetRegisterClass * getDestEquivalentVGPRClass(const MachineInstr &Inst) const; @@ -1008,11 +1030,11 @@ /// was moved to VGPR. \returns true if succeeded. bool moveFlatAddrToVGPR(MachineInstr &Inst) const; - /// Replace this instruction's opcode with the equivalent VALU - /// opcode. This function will also move the users of \p MI to the - /// VALU if necessary. If present, \p MDT is updated. 
- MachineBasicBlock *moveToVALU(MachineInstr &MI, - MachineDominatorTree *MDT = nullptr) const; + /// Replace each instruction's opcode with the equivalent VALU + /// opcode. This function will also move the users of the MachineInstrs + /// in the \p Worklist to the VALU if necessary. If present, \p MDT is + /// updated. + void moveToVALU(SIInstrWorklist &Worklist, MachineDominatorTree *MDT) const; void insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override; Index: llvm/lib/Target/AMDGPU/SIInstrInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -17,6 +17,7 @@ #include "GCNHazardRecognizer.h" #include "GCNSubtarget.h" #include "SIMachineFunctionInfo.h" +#include "llvm/ADT/PostOrderIterator.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/LiveVariables.h" @@ -61,6 +62,46 @@ cl::init(true), cl::ReallyHidden); +static std::vector +GetRPOList(const MachineFunction *MF) { + if (MF->empty()) + return {}; + ReversePostOrderTraversal RPOT(&*MF->begin()); + std::vector RPOList; + append_range(RPOList, RPOT); + return RPOList; +} + +// Compare machine instructions based on their order +// in the machine function. Returns true if first instruction +// occurs before second instruction in the machine function. 
+bool SIInstrWorklist::Cmp::operator()(const MachineInstr *InstA, + const MachineInstr *InstB) const { + if (!InstA || !InstB) + return false; + if (InstA == InstB) + return false; + const MachineBasicBlock *BA = InstA->getParent(); + const MachineBasicBlock *BB = InstB->getParent(); + if (!BA || !BB) + return false; + if (BA != BB) { + const MachineFunction *MF = BA->getParent(); + auto RPOList = GetRPOList(MF); + auto itA = std::find(RPOList.begin(), RPOList.end(), BA); + auto itB = std::find(RPOList.begin(), RPOList.end(), BB); + return std::distance(RPOList.begin(), itA) < + std::distance(RPOList.begin(), itB); + } + auto getDist = [](const MachineInstr *MI) { + MachineBasicBlock::const_iterator I = MI->getParent()->begin(); + MachineBasicBlock::const_iterator E = MI->getIterator(); + unsigned D = std::distance(I, E); + return D; + }; + return getDist(InstA) < getDist(InstB); +} + SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST) : AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN), RI(ST), ST(ST) { @@ -6155,21 +6196,18 @@ return CreatedBB; } -MachineBasicBlock *SIInstrInfo::moveToVALU(MachineInstr &TopInst, - MachineDominatorTree *MDT) const { - SetVectorType Worklist; - Worklist.insert(&TopInst); - MachineBasicBlock *CreatedBB = nullptr; - MachineBasicBlock *CreatedBBTmp = nullptr; +void SIInstrInfo::moveToVALU(SIInstrWorklist &Worklist, + MachineDominatorTree *MDT) const { while (!Worklist.empty()) { - MachineInstr &Inst = *Worklist.pop_back_val(); + MachineInstr &Inst = *Worklist.top(); + Worklist.erase_top(); MachineBasicBlock *MBB = Inst.getParent(); + if (!MBB) + continue; MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); - unsigned Opcode = Inst.getOpcode(); unsigned NewOpcode = getVALUOp(Inst); - // Handle some special cases switch (Opcode) { default: @@ -6183,9 +6221,8 @@ case AMDGPU::S_SUB_I32: { // FIXME: The u32 versions currently selected use the carry. 
bool Changed; + MachineBasicBlock *CreatedBBTmp = nullptr; std::tie(Changed, CreatedBBTmp) = moveScalarAddSub(Worklist, Inst, MDT); - if (CreatedBBTmp && TopInst.getParent() == CreatedBBTmp) - CreatedBB = CreatedBBTmp; if (Changed) continue; @@ -6363,7 +6400,7 @@ Register CarryInReg = Inst.getOperand(4).getReg(); if (!MRI.constrainRegClass(CarryInReg, CarryRC)) { Register NewCarryReg = MRI.createVirtualRegister(CarryRC); - BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(AMDGPU::COPY), NewCarryReg) + BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::COPY), NewCarryReg) .addReg(CarryInReg); } @@ -6378,9 +6415,7 @@ .add(Inst.getOperand(3)) .addReg(CarryInReg) .addImm(0); - CreatedBBTmp = legalizeOperands(*CarryOp); - if (CreatedBBTmp && TopInst.getParent() == CreatedBBTmp) - CreatedBB = CreatedBBTmp; + legalizeOperands(*CarryOp); MRI.replaceRegWith(Inst.getOperand(0).getReg(), DestReg); addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist); Inst.eraseFromParent(); @@ -6406,10 +6441,7 @@ .add(Src1) .addImm(0); // clamp bit - CreatedBBTmp = legalizeOperands(*NewInstr, MDT); - if (CreatedBBTmp && TopInst.getParent() == CreatedBBTmp) - CreatedBB = CreatedBBTmp; - + legalizeOperands(*NewInstr, MDT); MRI.replaceRegWith(Dest0.getReg(), DestReg); addUsersToMoveToVALUWorklist(NewInstr->getOperand(0).getReg(), MRI, Worklist); @@ -6454,17 +6486,16 @@ if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) { // We cannot move this instruction to the VALU, so we should try to // legalize its operands instead. - CreatedBBTmp = legalizeOperands(Inst, MDT); - if (CreatedBBTmp && TopInst.getParent() == CreatedBBTmp) - CreatedBB = CreatedBBTmp; + legalizeOperands(Inst, MDT); continue; } - // Handle converting generic instructions like COPY-to-SGPR into // COPY-to-VGPR. 
if (NewOpcode == Opcode) { Register DstReg = Inst.getOperand(0).getReg(); const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst); + if (!NewDstRC) + continue; if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual() && NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) { @@ -6487,7 +6518,6 @@ Inst.setDesc(get(AMDGPU::IMPLICIT_DEF)); continue; } - Register NewDstReg = MRI.createVirtualRegister(NewDstRC); MRI.replaceRegWith(DstReg, NewDstReg); legalizeOperands(Inst, MDT); @@ -6513,23 +6543,18 @@ addSCCDefsToVALUWorklist(NewInstr, Worklist); } } - Inst.eraseFromParent(); - Register NewDstReg; if (NewInstr->getOperand(0).isReg() && NewInstr->getOperand(0).isDef()) { Register DstReg = NewInstr->getOperand(0).getReg(); assert(DstReg.isVirtual()); - // Update the destination register class. const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(*NewInstr); assert(NewDstRC); - NewDstReg = MRI.createVirtualRegister(NewDstRC); MRI.replaceRegWith(DstReg, NewDstReg); } - if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) { // We are converting these to a BFE, so we need to add the missing // operands for the size and offset. @@ -6541,7 +6566,6 @@ // extra 0 operand. NewInstr.addImm(0); } - if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) { const MachineOperand &OffsetWidthOp = NewInstr->getOperand(2); // If we need to move this to VGPRs, we need to unpack the second operand @@ -6549,30 +6573,24 @@ assert(OffsetWidthOp.isImm() && "Scalar BFE is only implemented for constant width and offset"); uint32_t Imm = OffsetWidthOp.getImm(); - uint32_t Offset = Imm & 0x3f; // Extract bits [5:0]. uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16]. 
NewInstr->removeOperand(2); NewInstr.addImm(Offset); NewInstr.addImm(BitWidth); } - fixImplicitOperands(*NewInstr); - // Legalize the operands - CreatedBBTmp = legalizeOperands(*NewInstr, MDT); - if (CreatedBBTmp && TopInst.getParent() == CreatedBBTmp) - CreatedBB = CreatedBBTmp; - + legalizeOperands(*NewInstr, MDT); if (NewDstReg) addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist); } - return CreatedBB; + return; } // Add/sub require special handling to deal with carry outs. std::pair -SIInstrInfo::moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst, +SIInstrInfo::moveScalarAddSub(SIInstrWorklist &Worklist, MachineInstr &Inst, MachineDominatorTree *MDT) const { if (ST.hasAddNoCarry()) { // Assume there is no user of scc since we don't select this in that case. @@ -6607,7 +6625,7 @@ return std::pair(false, nullptr); } -void SIInstrInfo::lowerSelect(SetVectorType &Worklist, MachineInstr &Inst, +void SIInstrInfo::lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst, MachineDominatorTree *MDT) const { MachineBasicBlock &MBB = *Inst.getParent(); @@ -6683,7 +6701,7 @@ addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); } -void SIInstrInfo::lowerScalarAbs(SetVectorType &Worklist, +void SIInstrInfo::lowerScalarAbs(SIInstrWorklist &Worklist, MachineInstr &Inst) const { MachineBasicBlock &MBB = *Inst.getParent(); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); @@ -6710,7 +6728,7 @@ addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); } -void SIInstrInfo::lowerScalarXnor(SetVectorType &Worklist, +void SIInstrInfo::lowerScalarXnor(SIInstrWorklist &Worklist, MachineInstr &Inst) const { MachineBasicBlock &MBB = *Inst.getParent(); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); @@ -6775,7 +6793,7 @@ } } -void SIInstrInfo::splitScalarNotBinop(SetVectorType &Worklist, +void SIInstrInfo::splitScalarNotBinop(SIInstrWorklist &Worklist, MachineInstr &Inst, unsigned Opcode) const { MachineBasicBlock &MBB = *Inst.getParent(); @@ -6804,7 
+6822,7 @@ addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist); } -void SIInstrInfo::splitScalarBinOpN2(SetVectorType& Worklist, +void SIInstrInfo::splitScalarBinOpN2(SIInstrWorklist &Worklist, MachineInstr &Inst, unsigned Opcode) const { MachineBasicBlock &MBB = *Inst.getParent(); @@ -6833,9 +6851,9 @@ addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist); } -void SIInstrInfo::splitScalar64BitUnaryOp( - SetVectorType &Worklist, MachineInstr &Inst, - unsigned Opcode, bool Swap) const { +void SIInstrInfo::splitScalar64BitUnaryOp(SIInstrWorklist &Worklist, + MachineInstr &Inst, unsigned Opcode, + bool Swap) const { MachineBasicBlock &MBB = *Inst.getParent(); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); @@ -6892,7 +6910,7 @@ addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist); } -void SIInstrInfo::splitScalar64BitAddSub(SetVectorType &Worklist, +void SIInstrInfo::splitScalar64BitAddSub(SIInstrWorklist &Worklist, MachineInstr &Inst, MachineDominatorTree *MDT) const { bool IsAdd = (Inst.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO); @@ -6966,7 +6984,7 @@ addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist); } -void SIInstrInfo::splitScalar64BitBinaryOp(SetVectorType &Worklist, +void SIInstrInfo::splitScalar64BitBinaryOp(SIInstrWorklist &Worklist, MachineInstr &Inst, unsigned Opcode, MachineDominatorTree *MDT) const { MachineBasicBlock &MBB = *Inst.getParent(); @@ -7033,7 +7051,7 @@ addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist); } -void SIInstrInfo::splitScalar64BitXnor(SetVectorType &Worklist, +void SIInstrInfo::splitScalar64BitXnor(SIInstrWorklist &Worklist, MachineInstr &Inst, MachineDominatorTree *MDT) const { MachineBasicBlock &MBB = *Inst.getParent(); @@ -7075,8 +7093,8 @@ Worklist.insert(&Xor); } -void SIInstrInfo::splitScalar64BitBCNT( - SetVectorType &Worklist, MachineInstr &Inst) const { +void SIInstrInfo::splitScalar64BitBCNT(SIInstrWorklist &Worklist, + MachineInstr &Inst) const { MachineBasicBlock &MBB = *Inst.getParent(); 
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); @@ -7113,7 +7131,7 @@ addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); } -void SIInstrInfo::splitScalar64BitBFE(SetVectorType &Worklist, +void SIInstrInfo::splitScalar64BitBFE(SIInstrWorklist &Worklist, MachineInstr &Inst) const { MachineBasicBlock &MBB = *Inst.getParent(); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); @@ -7175,9 +7193,8 @@ } void SIInstrInfo::addUsersToMoveToVALUWorklist( - Register DstReg, - MachineRegisterInfo &MRI, - SetVectorType &Worklist) const { + Register DstReg, MachineRegisterInfo &MRI, + SIInstrWorklist &Worklist) const { for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg), E = MRI.use_end(); I != E;) { MachineInstr &UseMI = *I->getParent(); @@ -7211,7 +7228,7 @@ } } -void SIInstrInfo::movePackToVALU(SetVectorType &Worklist, +void SIInstrInfo::movePackToVALU(SIInstrWorklist &Worklist, MachineRegisterInfo &MRI, MachineInstr &Inst) const { Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); @@ -7286,7 +7303,7 @@ void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op, MachineInstr &SCCDefInst, - SetVectorType &Worklist, + SIInstrWorklist &Worklist, Register NewCond) const { // Ensure that def inst defines SCC, which is still live. @@ -7329,7 +7346,7 @@ // sure that the instruction that defines SCC is added to the moveToVALU // worklist. void SIInstrInfo::addSCCDefsToVALUWorklist(MachineInstr *SCCUseInst, - SetVectorType &Worklist) const { + SIInstrWorklist &Worklist) const { // Look for a preceding instruction that either defines VCC or SCC. If VCC // then there is nothing to do because the defining instruction has been // converted to a VALU already. 
If SCC then that instruction needs to be Index: llvm/test/CodeGen/AMDGPU/add3.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/add3.ll +++ llvm/test/CodeGen/AMDGPU/add3.ll @@ -222,7 +222,7 @@ ; VI-NEXT: v_add_f32_e64 v1, s3, 2.0 ; VI-NEXT: v_mov_b32_e32 v2, 0x40400000 ; VI-NEXT: v_add_f32_e32 v2, s4, v2 -; VI-NEXT: v_add_u32_e32 v0, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-NEXT: ; return to shader part epilog ; Index: llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll +++ llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --force-update +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: opt -S -mtriple=amdgcn-- -mcpu=tahiti -amdgpu-codegenprepare -amdgpu-bypass-slow-div=0 %s | FileCheck %s ; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GFX6 %s ; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GFX9 %s @@ -150,7 +150,7 @@ ; GFX6-NEXT: v_mul_lo_u32 v1, s4, v0 ; GFX6-NEXT: s_mov_b32 s4, s0 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_mul_hi_u32 v0, s2, v0 ; GFX6-NEXT: v_readfirstlane_b32 s0, v0 ; GFX6-NEXT: s_mul_i32 s0, s0, s3 @@ -261,7 +261,7 @@ ; GFX6-NEXT: s_mov_b32 s4, s0 ; GFX6-NEXT: s_xor_b32 s0, s9, s8 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_mul_hi_u32 v0, s2, v0 ; GFX6-NEXT: v_readfirstlane_b32 s1, v0 ; GFX6-NEXT: s_mul_i32 s1, s1, s3 @@ -1226,7 +1226,7 @@ ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 
1, v0 ; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[2:3] -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 ; GFX6-NEXT: v_mul_hi_u32 v1, s9, v1 ; GFX6-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v4 ; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 @@ -1266,7 +1266,7 @@ ; GFX6-NEXT: v_add_i32_e32 v6, vcc, 1, v3 ; GFX6-NEXT: v_mul_hi_u32 v7, v5, v7 ; GFX6-NEXT: v_cndmask_b32_e64 v2, v3, v6, s[6:7] -; GFX6-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; GFX6-NEXT: v_mul_hi_u32 v5, s11, v5 ; GFX6-NEXT: v_readfirstlane_b32 s0, v5 ; GFX6-NEXT: s_mul_i32 s0, s0, s15 @@ -1538,7 +1538,7 @@ ; GFX6-NEXT: v_mul_lo_u32 v0, s2, v1 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_hi_u32 v0, v1, v0 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 @@ -1556,7 +1556,7 @@ ; GFX6-NEXT: v_mul_lo_u32 v0, s2, v1 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_hi_u32 v0, v1, v0 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 @@ -1871,7 +1871,7 @@ ; GFX6-NEXT: s_xor_b32 s5, s5, s4 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: s_xor_b32 s8, s4, s2 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0 ; GFX6-NEXT: v_readfirstlane_b32 s2, v0 ; GFX6-NEXT: s_mul_i32 s2, s2, s3 @@ -1901,7 +1901,7 @@ ; GFX6-NEXT: s_xor_b32 s7, s7, s6 ; GFX6-NEXT: v_mul_hi_u32 v3, v2, v3 ; GFX6-NEXT: s_xor_b32 s9, s6, s4 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GFX6-NEXT: v_mul_hi_u32 v2, s7, v2 ; GFX6-NEXT: v_readfirstlane_b32 s4, v2 ; GFX6-NEXT: s_mul_i32 s4, s4, s5 @@ 
-1963,7 +1963,7 @@ ; GFX6-NEXT: s_xor_b32 s1, s1, s0 ; GFX6-NEXT: v_mul_hi_u32 v2, v3, v2 ; GFX6-NEXT: s_xor_b32 s0, s0, s10 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_mul_hi_u32 v3, s1, v2 ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s12, v4 ; GFX6-NEXT: v_readfirstlane_b32 s2, v3 @@ -2318,7 +2318,7 @@ ; GFX6-NEXT: s_add_i32 s5, s5, s8 ; GFX6-NEXT: s_xor_b32 s5, s5, s8 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0 ; GFX6-NEXT: v_readfirstlane_b32 s2, v0 ; GFX6-NEXT: s_mul_i32 s2, s2, s4 @@ -2344,7 +2344,7 @@ ; GFX6-NEXT: s_xor_b32 s5, s5, s4 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: s_sub_i32 s6, s2, s8 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0 ; GFX6-NEXT: v_readfirstlane_b32 s2, v0 ; GFX6-NEXT: s_mul_i32 s2, s2, s3 @@ -5112,7 +5112,7 @@ ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, |v2| ; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v3 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GFX6-NEXT: v_and_b32_e32 v3, 0x7fff, v4 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 15, v3 @@ -5754,7 +5754,7 @@ ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 ; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[2:3] -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 ; GFX6-NEXT: v_mul_hi_u32 v1, s5, v1 ; GFX6-NEXT: v_readfirstlane_b32 s0, v1 ; GFX6-NEXT: s_mul_i32 s0, s0, s6 @@ -6090,7 +6090,7 @@ ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mul_hi_u32 v0, v1, v0 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0 ; GFX6-NEXT: v_readfirstlane_b32 s7, v0 ; 
GFX6-NEXT: s_mul_i32 s7, s7, s6 @@ -6178,7 +6178,7 @@ ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 31, v0 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 20, v0 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; @@ -6269,7 +6269,7 @@ ; GFX6-NEXT: v_mul_lo_u32 v1, s4, v0 ; GFX6-NEXT: s_mov_b32 s4, s0 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_mul_hi_u32 v0, s2, v0 ; GFX6-NEXT: v_readfirstlane_b32 s0, v0 ; GFX6-NEXT: s_mul_i32 s0, s0, s3 @@ -6416,7 +6416,7 @@ ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 31, v0 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 11, v0 ; GFX6-NEXT: s_mov_b32 s5, s1 -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v0 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v0, v1 ; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm @@ -6553,7 +6553,7 @@ ; GFX6-NEXT: s_xor_b32 s4, s4, s6 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: s_xor_b32 s6, s6, s3 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX6-NEXT: v_readfirstlane_b32 s3, v0 ; GFX6-NEXT: s_mul_i32 s3, s3, s2 @@ -6586,7 +6586,7 @@ ; GFX6-NEXT: s_xor_b32 s5, s5, s8 ; GFX6-NEXT: v_mul_hi_u32 v1, v2, v3 ; GFX6-NEXT: s_xor_b32 s4, s8, s4 -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; GFX6-NEXT: v_mul_hi_u32 v1, s5, v1 ; GFX6-NEXT: v_readfirstlane_b32 s6, v1 ; GFX6-NEXT: s_mul_i32 s6, s6, s7 @@ -6797,7 +6797,7 @@ ; GFX6-NEXT: v_mul_lo_u32 v1, s3, v0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0 ; GFX6-NEXT: v_readfirstlane_b32 s7, v0 ; GFX6-NEXT: s_mul_i32 s7, s7, s4 @@ -7043,7 
+7043,7 @@ ; GFX6-NEXT: s_xor_b32 s5, s2, s8 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0 ; GFX6-NEXT: v_readfirstlane_b32 s7, v0 ; GFX6-NEXT: s_mul_i32 s7, s7, s4 @@ -7159,7 +7159,7 @@ ; GFX6-NEXT: v_mul_lo_u32 v4, v1, s5 ; GFX6-NEXT: v_mul_lo_u32 v5, v0, s5 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_mul_lo_u32 v3, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v4, v0, v5 @@ -7183,9 +7183,9 @@ ; GFX6-NEXT: v_mul_lo_u32 v4, v1, s5 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s4, s0 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_mul_lo_u32 v3, v0, s5 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_mul_lo_u32 v4, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v5, v0, v3 ; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 @@ -7609,7 +7609,7 @@ ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc ; GFX6-NEXT: v_add_i32_e32 v6, vcc, 2, v0 ; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc -; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; GFX6-NEXT: v_mov_b32_e32 v5, s3 ; GFX6-NEXT: v_sub_i32_e32 v8, vcc, s2, v8 ; GFX6-NEXT: v_subb_u32_e32 v4, vcc, v5, v4, vcc @@ -7845,7 +7845,7 @@ ; GFX6-NEXT: v_mul_lo_u32 v4, v1, s3 ; GFX6-NEXT: v_mul_lo_u32 v5, v0, s3 ; GFX6-NEXT: s_mov_b32 s12, 0x9761f7c9 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_mul_lo_u32 v3, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v4, v0, v5 @@ -7870,9 +7870,9 @@ ; GFX6-NEXT: v_mul_lo_u32 v4, v1, s3 ; GFX6-NEXT: s_mov_b32 s11, 0xf000 ; GFX6-NEXT: s_mov_b32 s10, -1 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GFX6-NEXT: 
v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_mul_lo_u32 v3, v0, s3 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_mul_lo_u32 v4, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v5, v0, v3 ; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 @@ -8294,8 +8294,8 @@ ; GFX6-NEXT: v_mul_lo_u32 v4, v0, s8 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_mov_b32 s1, s5 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v0 ; GFX6-NEXT: v_mul_hi_u32 v3, v0, v4 ; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 @@ -8316,7 +8316,7 @@ ; GFX6-NEXT: v_mul_hi_u32 v3, v0, s8 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_mul_lo_u32 v3, v0, s8 -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v0 ; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v7, v0, v3 ; GFX6-NEXT: v_mul_hi_u32 v8, v0, v2 @@ -8606,8 +8606,8 @@ ; GFX6-NEXT: v_mul_lo_u32 v5, s5, v0 ; GFX6-NEXT: v_mul_lo_u32 v4, s4, v0 ; GFX6-NEXT: s_addc_u32 s3, s3, s12 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v5, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; GFX6-NEXT: v_mul_hi_u32 v3, v0, v4 ; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 @@ -8629,7 +8629,7 @@ ; GFX6-NEXT: v_mul_hi_u32 v3, s4, v0 ; GFX6-NEXT: v_mul_lo_u32 v4, s5, v0 ; GFX6-NEXT: s_mov_b32 s5, s1 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_mul_lo_u32 v3, s4, v0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2 @@ -8962,9 +8962,9 @@ ; GFX6-NEXT: s_add_u32 s0, s0, s8 ; GFX6-NEXT: s_addc_u32 s1, s1, 0 ; GFX6-NEXT: s_ashr_i64 s[0:1], s[0:1], 12 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GFX6-NEXT: 
v_mul_lo_u32 v3, v0, s6 -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v0 ; GFX6-NEXT: v_mul_lo_u32 v4, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v5, v0, v3 ; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 @@ -8990,7 +8990,7 @@ ; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[8:9] ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_mul_lo_u32 v3, v0, s6 -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v0 ; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v7, v0, v3 ; GFX6-NEXT: v_mul_hi_u32 v8, v0, v2 @@ -9250,8 +9250,8 @@ ; GFX6-NEXT: v_mul_hi_u32 v3, s10, v0 ; GFX6-NEXT: v_mul_lo_u32 v5, s11, v0 ; GFX6-NEXT: v_mul_lo_u32 v4, s10, v0 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v5, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; GFX6-NEXT: v_mul_hi_u32 v3, v0, v4 ; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v7, v0, v2 @@ -9274,7 +9274,7 @@ ; GFX6-NEXT: s_mov_b32 s11, 0xf000 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_mul_lo_u32 v3, s10, v0 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v7, v0, v3 ; GFX6-NEXT: v_mul_hi_u32 v8, v0, v2 @@ -9310,9 +9310,9 @@ ; GFX6-NEXT: v_mul_hi_u32 v3, s12, v0 ; GFX6-NEXT: v_mul_lo_u32 v4, s13, v0 ; GFX6-NEXT: v_mov_b32_e32 v5, s13 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_mul_lo_u32 v3, s12, v0 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s5, v2 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s4, v3 ; GFX6-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc @@ -9364,9 +9364,9 @@ ; GFX6-NEXT: s_subb_u32 s13, 0, s3 ; GFX6-NEXT: v_mul_lo_u32 v6, s13, v2 ; GFX6-NEXT: v_xor_b32_e32 v0, s0, v0 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 
+; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; GFX6-NEXT: v_mul_lo_u32 v5, s12, v2 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; GFX6-NEXT: v_mul_lo_u32 v6, v2, v4 ; GFX6-NEXT: v_mul_hi_u32 v7, v2, v5 ; GFX6-NEXT: v_mul_hi_u32 v8, v2, v4 @@ -9389,7 +9389,7 @@ ; GFX6-NEXT: v_mul_lo_u32 v6, s13, v2 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; GFX6-NEXT: v_mul_lo_u32 v5, s12, v2 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; GFX6-NEXT: v_mul_lo_u32 v8, v2, v4 ; GFX6-NEXT: v_mul_hi_u32 v9, v2, v5 ; GFX6-NEXT: v_mul_hi_u32 v10, v2, v4 @@ -9431,9 +9431,9 @@ ; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0 ; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc ; GFX6-NEXT: v_mul_lo_u32 v6, s3, v2 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; GFX6-NEXT: v_mul_lo_u32 v5, s2, v2 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s7, v4 ; GFX6-NEXT: v_mov_b32_e32 v7, s3 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s6, v5 @@ -9796,7 +9796,7 @@ ; GFX6-NEXT: v_mul_lo_u32 v2, v1, s8 ; GFX6-NEXT: v_mul_hi_u32 v3, v0, s8 ; GFX6-NEXT: v_mul_lo_u32 v4, v0, s8 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v0 ; GFX6-NEXT: v_mul_hi_u32 v3, v0, v4 ; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2 @@ -9816,9 +9816,9 @@ ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc ; GFX6-NEXT: v_mul_lo_u32 v2, v1, s8 ; GFX6-NEXT: v_mul_hi_u32 v3, v0, s8 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_mul_lo_u32 v3, v0, s8 -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v0 ; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v7, v0, v3 ; GFX6-NEXT: v_mul_hi_u32 v8, v0, v2 @@ -10110,8 +10110,8 @@ ; GFX6-NEXT: v_mul_lo_u32 v5, s5, v0 ; GFX6-NEXT: 
v_mul_lo_u32 v4, s4, v0 ; GFX6-NEXT: s_addc_u32 s3, s3, s10 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v5, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; GFX6-NEXT: v_mul_hi_u32 v3, v0, v4 ; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 @@ -10135,7 +10135,7 @@ ; GFX6-NEXT: s_mov_b32 s5, s1 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_mul_lo_u32 v3, s4, v0 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v7, v0, v3 ; GFX6-NEXT: v_mul_hi_u32 v8, v0, v2 @@ -10171,8 +10171,8 @@ ; GFX6-NEXT: v_mul_hi_u32 v2, s8, v0 ; GFX6-NEXT: v_mul_lo_u32 v3, s9, v0 ; GFX6-NEXT: v_mul_lo_u32 v0, s8, v0 -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s13, v1 ; GFX6-NEXT: v_mov_b32_e32 v3, s9 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s12, v0 @@ -10484,8 +10484,8 @@ ; GFX6-NEXT: v_mul_lo_u32 v5, s3, v0 ; GFX6-NEXT: v_mul_lo_u32 v4, s2, v0 ; GFX6-NEXT: s_mov_b32 s10, -1 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v5, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; GFX6-NEXT: v_mul_hi_u32 v3, v0, v4 ; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v7, v0, v2 @@ -10507,7 +10507,7 @@ ; GFX6-NEXT: v_mul_lo_u32 v4, s3, v0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_mul_lo_u32 v3, s2, v0 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v7, v0, v3 ; GFX6-NEXT: v_mul_hi_u32 v8, v0, v2 @@ -10542,8 +10542,8 @@ ; GFX6-NEXT: v_mul_hi_u32 v2, s16, v0 ; GFX6-NEXT: v_mul_lo_u32 v3, s17, v0 ; GFX6-NEXT: v_mul_lo_u32 v0, 
s16, v0 -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s5, v1 ; GFX6-NEXT: v_mov_b32_e32 v3, s17 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 @@ -10594,9 +10594,9 @@ ; GFX6-NEXT: s_subb_u32 s1, 0, s5 ; GFX6-NEXT: v_mul_lo_u32 v6, s1, v2 ; GFX6-NEXT: s_ashr_i32 s14, s7, 31 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GFX6-NEXT: v_mul_lo_u32 v5, s0, v2 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v6, v3 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; GFX6-NEXT: v_mul_lo_u32 v6, v2, v3 ; GFX6-NEXT: v_mul_hi_u32 v7, v2, v5 ; GFX6-NEXT: v_mul_hi_u32 v8, v2, v3 @@ -10621,7 +10621,7 @@ ; GFX6-NEXT: v_xor_b32_e32 v1, s12, v1 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; GFX6-NEXT: v_mul_lo_u32 v5, s0, v2 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; GFX6-NEXT: v_mul_lo_u32 v8, v2, v4 ; GFX6-NEXT: v_mul_hi_u32 v9, v2, v5 ; GFX6-NEXT: v_mul_hi_u32 v10, v2, v4 @@ -10662,8 +10662,8 @@ ; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s12, v0 ; GFX6-NEXT: v_mul_lo_u32 v2, s4, v2 ; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s7, v3 ; GFX6-NEXT: v_mov_b32_e32 v5, s5 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s6, v2 Index: llvm/test/CodeGen/AMDGPU/carryout-selection.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/carryout-selection.ll +++ llvm/test/CodeGen/AMDGPU/carryout-selection.ll @@ -1800,8 +1800,8 @@ ; CISI-NEXT: v_mul_hi_u32 v3, s0, v0 ; CISI-NEXT: v_mul_lo_u32 v5, s1, v0 ; CISI-NEXT: v_mul_lo_u32 v4, s0, v0 -; CISI-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; CISI-NEXT: v_add_i32_e32 v2, vcc, v5, 
v2 +; CISI-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; CISI-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; CISI-NEXT: v_mul_hi_u32 v3, v0, v4 ; CISI-NEXT: v_mul_lo_u32 v5, v0, v2 ; CISI-NEXT: v_mul_hi_u32 v7, v0, v2 @@ -1821,7 +1821,7 @@ ; CISI-NEXT: v_mul_lo_u32 v2, s0, v1 ; CISI-NEXT: v_mul_hi_u32 v3, s0, v0 ; CISI-NEXT: v_mul_lo_u32 v4, s1, v0 -; CISI-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CISI-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CISI-NEXT: v_mul_lo_u32 v3, s0, v0 ; CISI-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; CISI-NEXT: v_mul_lo_u32 v6, v0, v2 @@ -1949,8 +1949,8 @@ ; VI-NEXT: v_mul_lo_u32 v2, s8, v4 ; VI-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v5, 0 ; VI-NEXT: v_mul_lo_u32 v3, s9, v5 -; VI-NEXT: v_add_u32_e32 v1, vcc, v2, v1 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, v1, v3 ; VI-NEXT: v_mul_hi_u32 v6, v5, v0 ; VI-NEXT: v_mad_u64_u32 v[1:2], s[0:1], v5, v3, 0 ; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v1 @@ -1970,7 +1970,7 @@ ; VI-NEXT: v_mul_hi_u32 v8, v6, v0 ; VI-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v7, v0, 0 ; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v4 -; VI-NEXT: v_add_u32_e32 v1, vcc, v5, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v5 ; VI-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v6, v1, 0 ; VI-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v7, v1, 0 ; VI-NEXT: v_add_u32_e32 v4, vcc, v8, v4 Index: llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll +++ llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll @@ -1,5 +1,6 @@ -; RUN: llc -O0 -mtriple=amdgcn--amdhsa -march=amdgcn -amdgpu-spill-sgpr-to-vgpr=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=VMEM -check-prefix=GCN %s -; RUN: llc -O0 -mtriple=amdgcn--amdhsa -march=amdgcn -amdgpu-spill-sgpr-to-vgpr=1 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=VGPR -check-prefix=GCN %s +; NOTE: Assertions have 
been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -O0 -mtriple=amdgcn--amdhsa -march=amdgcn --amdhsa-code-object-version=2 -amdgpu-spill-sgpr-to-vgpr=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=VMEM %s +; RUN: llc -O0 -mtriple=amdgcn--amdhsa -march=amdgcn --amdhsa-code-object-version=2 -amdgpu-spill-sgpr-to-vgpr=1 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=VGPR %s ; Verify registers used for tracking exec mask changes when all ; registers are spilled at the end of the block. The SGPR spill @@ -8,62 +9,267 @@ ; FIXME: This checks with SGPR to VGPR spilling disabled, but this may ; not work correctly in cases where no workitems take a branch. - -; GCN-LABEL: {{^}}divergent_if_endif: -; VGPR: workitem_private_segment_byte_size = 12{{$}} - - -; GCN: {{^}}; %bb.0: -; GCN: s_mov_b32 m0, -1 -; GCN: ds_read_b32 [[LOAD0:v[0-9]+]] - -; Spill load -; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], 0 offset:[[LOAD0_OFFSET:[0-9]+]] ; 4-byte Folded Spill -; GCN: v_cmp_eq_u32_e64 [[CMP0:s\[[0-9]+:[0-9]\]]], v0, s{{[0-9]+}} - -; Spill saved exec -; GCN: s_mov_b64 s[[[SAVEEXEC_LO:[0-9]+]]:[[SAVEEXEC_HI:[0-9]+]]], exec -; VGPR: v_writelane_b32 [[SPILL_VGPR:v[0-9]+]], s[[SAVEEXEC_LO]], [[SAVEEXEC_LO_LANE:[0-9]+]] -; VGPR: v_writelane_b32 [[SPILL_VGPR]], s[[SAVEEXEC_HI]], [[SAVEEXEC_HI_LANE:[0-9]+]] - -; VMEM: v_writelane_b32 v[[V_SAVEEXEC:[0-9]+]], s[[SAVEEXEC_LO]], 0 -; VMEM: v_writelane_b32 v[[V_SAVEEXEC]], s[[SAVEEXEC_HI]], 1 -; VMEM: buffer_store_dword v[[V_SAVEEXEC]], off, s[0:3], 0 offset:[[V_EXEC_SPILL_OFFSET:[0-9]+]] ; 4-byte Folded Spill - -; GCN: s_and_b64 s[[[ANDEXEC_LO:[0-9]+]]:[[ANDEXEC_HI:[0-9]+]]], s[[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]], [[CMP0]] -; GCN: s_mov_b64 exec, s[[[ANDEXEC_LO]]:[[ANDEXEC_HI]]] - -; GCN: s_cbranch_execz [[ENDIF:.LBB[0-9]+_[0-9]+]] - -; GCN: ; %bb.{{[0-9]+}}: ; %if -; GCN: buffer_load_dword [[RELOAD_LOAD0:v[0-9]+]], off, s[0:3], 0 offset:[[LOAD0_OFFSET]] ; 4-byte Folded Reload 
-; GCN: s_mov_b32 m0, -1 -; GCN: ds_read_b32 [[LOAD1:v[0-9]+]] -; GCN: s_waitcnt vmcnt(0) lgkmcnt(0) - - -; Spill val register -; GCN: v_add_i32_e32 [[VAL:v[0-9]+]], vcc, [[LOAD1]], [[RELOAD_LOAD0]] -; GCN: buffer_store_dword [[VAL]], off, s[0:3], 0 offset:[[VAL_OFFSET:[0-9]+]] ; 4-byte Folded Spill - -; VMEM: [[ENDIF]]: - -; Restore val -; GCN: buffer_load_dword [[RELOAD_VAL:v[0-9]+]], off, s[0:3], 0 offset:[[VAL_OFFSET]] ; 4-byte Folded Reload - -; Reload and restore exec mask -; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_LO_LANE]] -; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_HI_LANE]] - -; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC:[0-9]+]], off, s[0:3], 0 offset:[[V_EXEC_SPILL_OFFSET]] ; 4-byte Folded Reload -; VMEM: s_waitcnt vmcnt(0) -; VMEM: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[V_RELOAD_SAVEEXEC]], 0 -; VMEM: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[V_RELOAD_SAVEEXEC]], 1 - -; GCN: s_or_b64 exec, exec, s[[[S_RELOAD_SAVEEXEC_LO]]:[[S_RELOAD_SAVEEXEC_HI]]] - -; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RELOAD_VAL]] define amdgpu_kernel void @divergent_if_endif(ptr addrspace(1) %out) #0 { +; VMEM-LABEL: divergent_if_endif: +; VMEM: .amd_kernel_code_t +; VMEM-NEXT: amd_code_version_major = 1 +; VMEM-NEXT: amd_code_version_minor = 2 +; VMEM-NEXT: amd_machine_kind = 1 +; VMEM-NEXT: amd_machine_version_major = 7 +; VMEM-NEXT: amd_machine_version_minor = 0 +; VMEM-NEXT: amd_machine_version_stepping = 0 +; VMEM-NEXT: kernel_code_entry_byte_offset = 256 +; VMEM-NEXT: kernel_code_prefetch_byte_size = 0 +; VMEM-NEXT: granulated_workitem_vgpr_count = 1 +; VMEM-NEXT: granulated_wavefront_sgpr_count = 1 +; VMEM-NEXT: priority = 0 +; VMEM-NEXT: float_mode = 240 +; VMEM-NEXT: priv = 0 +; VMEM-NEXT: enable_dx10_clamp = 1 +; VMEM-NEXT: debug_mode = 0 +; VMEM-NEXT: enable_ieee_mode = 1 +; VMEM-NEXT: enable_wgp_mode = 0 +; VMEM-NEXT: enable_mem_ordered = 0 +; 
VMEM-NEXT: enable_fwd_progress = 0 +; VMEM-NEXT: enable_sgpr_private_segment_wave_byte_offset = 1 +; VMEM-NEXT: user_sgpr_count = 6 +; VMEM-NEXT: enable_trap_handler = 0 +; VMEM-NEXT: enable_sgpr_workgroup_id_x = 1 +; VMEM-NEXT: enable_sgpr_workgroup_id_y = 0 +; VMEM-NEXT: enable_sgpr_workgroup_id_z = 0 +; VMEM-NEXT: enable_sgpr_workgroup_info = 0 +; VMEM-NEXT: enable_vgpr_workitem_id = 0 +; VMEM-NEXT: enable_exception_msb = 0 +; VMEM-NEXT: granulated_lds_size = 0 +; VMEM-NEXT: enable_exception = 0 +; VMEM-NEXT: enable_sgpr_private_segment_buffer = 1 +; VMEM-NEXT: enable_sgpr_dispatch_ptr = 0 +; VMEM-NEXT: enable_sgpr_queue_ptr = 0 +; VMEM-NEXT: enable_sgpr_kernarg_segment_ptr = 1 +; VMEM-NEXT: enable_sgpr_dispatch_id = 0 +; VMEM-NEXT: enable_sgpr_flat_scratch_init = 0 +; VMEM-NEXT: enable_sgpr_private_segment_size = 0 +; VMEM-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; VMEM-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; VMEM-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; VMEM-NEXT: enable_wavefront_size32 = 0 +; VMEM-NEXT: enable_ordered_append_gds = 0 +; VMEM-NEXT: private_element_size = 1 +; VMEM-NEXT: is_ptr64 = 1 +; VMEM-NEXT: is_dynamic_callstack = 0 +; VMEM-NEXT: is_debug_enabled = 0 +; VMEM-NEXT: is_xnack_enabled = 0 +; VMEM-NEXT: workitem_private_segment_byte_size = 28 +; VMEM-NEXT: workgroup_group_segment_byte_size = 0 +; VMEM-NEXT: gds_segment_byte_size = 0 +; VMEM-NEXT: kernarg_segment_byte_size = 8 +; VMEM-NEXT: workgroup_fbarrier_count = 0 +; VMEM-NEXT: wavefront_sgpr_count = 14 +; VMEM-NEXT: workitem_vgpr_count = 5 +; VMEM-NEXT: reserved_vgpr_first = 0 +; VMEM-NEXT: reserved_vgpr_count = 0 +; VMEM-NEXT: reserved_sgpr_first = 0 +; VMEM-NEXT: reserved_sgpr_count = 0 +; VMEM-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; VMEM-NEXT: debug_private_segment_buffer_sgpr = 0 +; VMEM-NEXT: kernarg_segment_alignment = 4 +; VMEM-NEXT: group_segment_alignment = 4 +; VMEM-NEXT: private_segment_alignment = 4 +; VMEM-NEXT: wavefront_size = 6 +; 
VMEM-NEXT: call_convention = -1 +; VMEM-NEXT: runtime_loader_kernel_symbol = 0 +; VMEM-NEXT: .end_amd_kernel_code_t +; VMEM-NEXT: ; %bb.0: ; %entry +; VMEM-NEXT: s_add_u32 s0, s0, s7 +; VMEM-NEXT: s_addc_u32 s1, s1, 0 +; VMEM-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; VMEM-NEXT: s_mov_b64 s[10:11], exec +; VMEM-NEXT: s_mov_b64 exec, 3 +; VMEM-NEXT: buffer_store_dword v2, off, s[0:3], 0 +; VMEM-NEXT: s_waitcnt lgkmcnt(0) +; VMEM-NEXT: v_writelane_b32 v2, s4, 0 +; VMEM-NEXT: v_writelane_b32 v2, s5, 1 +; VMEM-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:12 ; 4-byte Folded Spill +; VMEM-NEXT: buffer_load_dword v2, off, s[0:3], 0 +; VMEM-NEXT: s_waitcnt vmcnt(0) +; VMEM-NEXT: s_mov_b64 exec, s[10:11] +; VMEM-NEXT: s_mov_b32 m0, -1 +; VMEM-NEXT: ; implicit-def: $sgpr4 +; VMEM-NEXT: v_mov_b32_e32 v1, s4 +; VMEM-NEXT: ds_read_b32 v1, v1 +; VMEM-NEXT: s_waitcnt lgkmcnt(0) +; VMEM-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:24 ; 4-byte Folded Spill +; VMEM-NEXT: ; implicit-def: $sgpr4 +; VMEM-NEXT: s_mov_b32 s4, 0 +; VMEM-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s4 +; VMEM-NEXT: v_mov_b32_e32 v0, s4 +; VMEM-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:20 ; 4-byte Folded Spill +; VMEM-NEXT: s_mov_b64 s[4:5], exec +; VMEM-NEXT: s_mov_b64 s[8:9], exec +; VMEM-NEXT: s_mov_b64 exec, 3 +; VMEM-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VMEM-NEXT: v_writelane_b32 v0, s4, 0 +; VMEM-NEXT: v_writelane_b32 v0, s5, 1 +; VMEM-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Spill +; VMEM-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; VMEM-NEXT: s_waitcnt vmcnt(0) +; VMEM-NEXT: s_mov_b64 exec, s[8:9] +; VMEM-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] +; VMEM-NEXT: s_mov_b64 exec, s[4:5] +; VMEM-NEXT: s_cbranch_execz .LBB0_2 +; VMEM-NEXT: ; %bb.1: ; %if +; VMEM-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:24 ; 4-byte Folded Reload +; VMEM-NEXT: s_mov_b32 m0, -1 +; VMEM-NEXT: ; implicit-def: $sgpr4 +; VMEM-NEXT: v_mov_b32_e32 v1, s4 +; VMEM-NEXT: 
ds_read_b32 v1, v1 +; VMEM-NEXT: ; implicit-def: $sgpr4 +; VMEM-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VMEM-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; VMEM-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:20 ; 4-byte Folded Spill +; VMEM-NEXT: .LBB0_2: ; %endif +; VMEM-NEXT: buffer_load_dword v2, off, s[0:3], 0 offset:20 ; 4-byte Folded Reload +; VMEM-NEXT: s_mov_b64 s[8:9], exec +; VMEM-NEXT: s_mov_b64 exec, 3 +; VMEM-NEXT: buffer_store_dword v4, off, s[0:3], 0 +; VMEM-NEXT: buffer_load_dword v4, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload +; VMEM-NEXT: s_waitcnt vmcnt(0) +; VMEM-NEXT: v_readlane_b32 s6, v4, 0 +; VMEM-NEXT: v_readlane_b32 s7, v4, 1 +; VMEM-NEXT: buffer_load_dword v4, off, s[0:3], 0 +; VMEM-NEXT: s_waitcnt vmcnt(0) +; VMEM-NEXT: s_mov_b64 exec, s[8:9] +; VMEM-NEXT: s_or_b64 exec, exec, s[6:7] +; VMEM-NEXT: s_mov_b64 s[6:7], exec +; VMEM-NEXT: s_mov_b64 exec, 3 +; VMEM-NEXT: buffer_store_dword v3, off, s[0:3], 0 +; VMEM-NEXT: buffer_load_dword v3, off, s[0:3], 0 offset:12 ; 4-byte Folded Reload +; VMEM-NEXT: s_waitcnt vmcnt(0) +; VMEM-NEXT: v_readlane_b32 s4, v3, 0 +; VMEM-NEXT: v_readlane_b32 s5, v3, 1 +; VMEM-NEXT: buffer_load_dword v3, off, s[0:3], 0 +; VMEM-NEXT: s_waitcnt vmcnt(0) +; VMEM-NEXT: s_mov_b64 exec, s[6:7] +; VMEM-NEXT: v_mov_b32_e32 v0, s4 +; VMEM-NEXT: v_mov_b32_e32 v1, s5 +; VMEM-NEXT: flat_store_dword v[0:1], v2 +; VMEM-NEXT: s_endpgm +; +; VGPR-LABEL: divergent_if_endif: +; VGPR: .amd_kernel_code_t +; VGPR-NEXT: amd_code_version_major = 1 +; VGPR-NEXT: amd_code_version_minor = 2 +; VGPR-NEXT: amd_machine_kind = 1 +; VGPR-NEXT: amd_machine_version_major = 7 +; VGPR-NEXT: amd_machine_version_minor = 0 +; VGPR-NEXT: amd_machine_version_stepping = 0 +; VGPR-NEXT: kernel_code_entry_byte_offset = 256 +; VGPR-NEXT: kernel_code_prefetch_byte_size = 0 +; VGPR-NEXT: granulated_workitem_vgpr_count = 0 +; VGPR-NEXT: granulated_wavefront_sgpr_count = 1 +; VGPR-NEXT: priority = 0 +; VGPR-NEXT: float_mode = 240 +; VGPR-NEXT: priv = 0 +; 
VGPR-NEXT: enable_dx10_clamp = 1 +; VGPR-NEXT: debug_mode = 0 +; VGPR-NEXT: enable_ieee_mode = 1 +; VGPR-NEXT: enable_wgp_mode = 0 +; VGPR-NEXT: enable_mem_ordered = 0 +; VGPR-NEXT: enable_fwd_progress = 0 +; VGPR-NEXT: enable_sgpr_private_segment_wave_byte_offset = 1 +; VGPR-NEXT: user_sgpr_count = 6 +; VGPR-NEXT: enable_trap_handler = 0 +; VGPR-NEXT: enable_sgpr_workgroup_id_x = 1 +; VGPR-NEXT: enable_sgpr_workgroup_id_y = 0 +; VGPR-NEXT: enable_sgpr_workgroup_id_z = 0 +; VGPR-NEXT: enable_sgpr_workgroup_info = 0 +; VGPR-NEXT: enable_vgpr_workitem_id = 0 +; VGPR-NEXT: enable_exception_msb = 0 +; VGPR-NEXT: granulated_lds_size = 0 +; VGPR-NEXT: enable_exception = 0 +; VGPR-NEXT: enable_sgpr_private_segment_buffer = 1 +; VGPR-NEXT: enable_sgpr_dispatch_ptr = 0 +; VGPR-NEXT: enable_sgpr_queue_ptr = 0 +; VGPR-NEXT: enable_sgpr_kernarg_segment_ptr = 1 +; VGPR-NEXT: enable_sgpr_dispatch_id = 0 +; VGPR-NEXT: enable_sgpr_flat_scratch_init = 0 +; VGPR-NEXT: enable_sgpr_private_segment_size = 0 +; VGPR-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; VGPR-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; VGPR-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; VGPR-NEXT: enable_wavefront_size32 = 0 +; VGPR-NEXT: enable_ordered_append_gds = 0 +; VGPR-NEXT: private_element_size = 1 +; VGPR-NEXT: is_ptr64 = 1 +; VGPR-NEXT: is_dynamic_callstack = 0 +; VGPR-NEXT: is_debug_enabled = 0 +; VGPR-NEXT: is_xnack_enabled = 0 +; VGPR-NEXT: workitem_private_segment_byte_size = 12 +; VGPR-NEXT: workgroup_group_segment_byte_size = 0 +; VGPR-NEXT: gds_segment_byte_size = 0 +; VGPR-NEXT: kernarg_segment_byte_size = 8 +; VGPR-NEXT: workgroup_fbarrier_count = 0 +; VGPR-NEXT: wavefront_sgpr_count = 10 +; VGPR-NEXT: workitem_vgpr_count = 4 +; VGPR-NEXT: reserved_vgpr_first = 0 +; VGPR-NEXT: reserved_vgpr_count = 0 +; VGPR-NEXT: reserved_sgpr_first = 0 +; VGPR-NEXT: reserved_sgpr_count = 0 +; VGPR-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; VGPR-NEXT: debug_private_segment_buffer_sgpr = 0 
+; VGPR-NEXT: kernarg_segment_alignment = 4 +; VGPR-NEXT: group_segment_alignment = 4 +; VGPR-NEXT: private_segment_alignment = 4 +; VGPR-NEXT: wavefront_size = 6 +; VGPR-NEXT: call_convention = -1 +; VGPR-NEXT: runtime_loader_kernel_symbol = 0 +; VGPR-NEXT: .end_amd_kernel_code_t +; VGPR-NEXT: ; %bb.0: ; %entry +; VGPR-NEXT: s_add_u32 s0, s0, s7 +; VGPR-NEXT: s_addc_u32 s1, s1, 0 +; VGPR-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; VGPR-NEXT: s_waitcnt lgkmcnt(0) +; VGPR-NEXT: v_writelane_b32 v1, s4, 0 +; VGPR-NEXT: v_writelane_b32 v1, s5, 1 +; VGPR-NEXT: s_mov_b32 m0, -1 +; VGPR-NEXT: ; implicit-def: $sgpr4 +; VGPR-NEXT: v_mov_b32_e32 v2, s4 +; VGPR-NEXT: ds_read_b32 v2, v2 +; VGPR-NEXT: s_waitcnt lgkmcnt(0) +; VGPR-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:8 ; 4-byte Folded Spill +; VGPR-NEXT: ; implicit-def: $sgpr4 +; VGPR-NEXT: s_mov_b32 s4, 0 +; VGPR-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s4 +; VGPR-NEXT: v_mov_b32_e32 v0, s4 +; VGPR-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Spill +; VGPR-NEXT: s_mov_b64 s[4:5], exec +; VGPR-NEXT: v_writelane_b32 v1, s4, 2 +; VGPR-NEXT: v_writelane_b32 v1, s5, 3 +; VGPR-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] +; VGPR-NEXT: s_mov_b64 exec, s[4:5] +; VGPR-NEXT: s_cbranch_execz .LBB0_2 +; VGPR-NEXT: ; %bb.1: ; %if +; VGPR-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 ; 4-byte Folded Reload +; VGPR-NEXT: s_mov_b32 m0, -1 +; VGPR-NEXT: ; implicit-def: $sgpr4 +; VGPR-NEXT: v_mov_b32_e32 v2, s4 +; VGPR-NEXT: ds_read_b32 v2, v2 +; VGPR-NEXT: ; implicit-def: $sgpr4 +; VGPR-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VGPR-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; VGPR-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Spill +; VGPR-NEXT: .LBB0_2: ; %endif +; VGPR-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload +; VGPR-NEXT: v_readlane_b32 s6, v1, 2 +; VGPR-NEXT: v_readlane_b32 s7, v1, 3 +; VGPR-NEXT: s_or_b64 exec, exec, s[6:7] +; VGPR-NEXT: v_readlane_b32 s4, v1, 
0 +; VGPR-NEXT: v_readlane_b32 s5, v1, 1 +; VGPR-NEXT: v_mov_b32_e32 v2, s4 +; VGPR-NEXT: v_mov_b32_e32 v3, s5 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: flat_store_dword v[2:3], v0 +; VGPR-NEXT: s_endpgm entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %load0 = load volatile i32, ptr addrspace(3) undef @@ -81,59 +287,317 @@ ret void } -; GCN-LABEL: {{^}}divergent_loop: -; VGPR: workitem_private_segment_byte_size = 16{{$}} - -; GCN: {{^}}; %bb.0: -; GCN-DAG: s_mov_b32 m0, -1 -; GCN-DAG: v_mov_b32_e32 [[PTR0:v[0-9]+]], 0{{$}} -; GCN: ds_read_b32 [[LOAD0:v[0-9]+]], [[PTR0]] -; GCN: v_cmp_eq_u32_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], v0, s{{[0-9]+}} - -; Spill load -; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], 0 offset:[[LOAD0_OFFSET:[0-9]+]] ; 4-byte Folded Spill -; GCN: s_mov_b64 s[[[SAVEEXEC_LO:[0-9]+]]:[[SAVEEXEC_HI:[0-9]+]]], exec - -; Spill saved exec -; VGPR: v_writelane_b32 [[SPILL_VGPR:v[0-9]+]], s[[SAVEEXEC_LO]], [[SAVEEXEC_LO_LANE:[0-9]+]] -; VGPR: v_writelane_b32 [[SPILL_VGPR]], s[[SAVEEXEC_HI]], [[SAVEEXEC_HI_LANE:[0-9]+]] - -; VMEM: v_writelane_b32 v[[V_SAVEEXEC:[0-9]+]], s[[SAVEEXEC_LO]], 0 -; VMEM: v_writelane_b32 v[[V_SAVEEXEC]], s[[SAVEEXEC_HI]], 1 -; VMEM: buffer_store_dword v[[V_SAVEEXEC]], off, s[0:3], 0 offset:[[V_EXEC_SPILL_OFFSET:[0-9]+]] ; 4-byte Folded Spill - - -; GCN: s_and_b64 s[[[ANDEXEC_LO:[0-9]+]]:[[ANDEXEC_HI:[0-9]+]]], s[[[SAVEEXEC_LO:[0-9]+]]:[[SAVEEXEC_HI:[0-9]+]]], [[CMP0]] -; GCN: s_mov_b64 exec, s[[[ANDEXEC_LO]]:[[ANDEXEC_HI]]] -; GCN-NEXT: s_cbranch_execz [[END:.LBB[0-9]+_[0-9]+]] - - -; GCN: [[LOOP:.LBB[0-9]+_[0-9]+]]: -; GCN: buffer_load_dword v[[VAL_LOOP_RELOAD:[0-9]+]], off, s[0:3], 0 offset:[[LOAD0_OFFSET]] ; 4-byte Folded Reload -; GCN: v_subrev_i32_e32 [[VAL_LOOP:v[0-9]+]], vcc, v{{[0-9]+}}, v[[VAL_LOOP_RELOAD]] -; GCN: s_cmp_lg_u32 -; VMEM: buffer_store_dword -; VMEM: buffer_store_dword -; VMEM: buffer_store_dword -; GCN: buffer_store_dword [[VAL_LOOP]], off, s[0:3], 0 offset:{{[0-9]+}} ; 4-byte Folded Spill -; 
GCN-NEXT: s_cbranch_scc1 [[LOOP]] - -; GCN: buffer_store_dword [[VAL_LOOP]], off, s[0:3], 0 offset:[[VAL_SUB_OFFSET:[0-9]+]] ; 4-byte Folded Spill - -; GCN: [[END]]: -; GCN: buffer_load_dword v[[VAL_END:[0-9]+]], off, s[0:3], 0 offset:[[VAL_SUB_OFFSET]] ; 4-byte Folded Reload -; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_LO_LANE]] -; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_HI_LANE]] - -; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC:[0-9]+]], off, s[0:3], 0 offset:[[V_EXEC_SPILL_OFFSET]] ; 4-byte Folded Reload -; VMEM: s_waitcnt vmcnt(0) -; VMEM: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[V_RELOAD_SAVEEXEC]], 0 -; VMEM: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[V_RELOAD_SAVEEXEC]], 1 - -; GCN: s_or_b64 exec, exec, s[[[S_RELOAD_SAVEEXEC_LO]]:[[S_RELOAD_SAVEEXEC_HI]]] - -; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[VAL_END]] define amdgpu_kernel void @divergent_loop(ptr addrspace(1) %out) #0 { +; VMEM-LABEL: divergent_loop: +; VMEM: .amd_kernel_code_t +; VMEM-NEXT: amd_code_version_major = 1 +; VMEM-NEXT: amd_code_version_minor = 2 +; VMEM-NEXT: amd_machine_kind = 1 +; VMEM-NEXT: amd_machine_version_major = 7 +; VMEM-NEXT: amd_machine_version_minor = 0 +; VMEM-NEXT: amd_machine_version_stepping = 0 +; VMEM-NEXT: kernel_code_entry_byte_offset = 256 +; VMEM-NEXT: kernel_code_prefetch_byte_size = 0 +; VMEM-NEXT: granulated_workitem_vgpr_count = 1 +; VMEM-NEXT: granulated_wavefront_sgpr_count = 1 +; VMEM-NEXT: priority = 0 +; VMEM-NEXT: float_mode = 240 +; VMEM-NEXT: priv = 0 +; VMEM-NEXT: enable_dx10_clamp = 1 +; VMEM-NEXT: debug_mode = 0 +; VMEM-NEXT: enable_ieee_mode = 1 +; VMEM-NEXT: enable_wgp_mode = 0 +; VMEM-NEXT: enable_mem_ordered = 0 +; VMEM-NEXT: enable_fwd_progress = 0 +; VMEM-NEXT: enable_sgpr_private_segment_wave_byte_offset = 1 +; VMEM-NEXT: user_sgpr_count = 6 +; VMEM-NEXT: enable_trap_handler = 0 +; VMEM-NEXT: enable_sgpr_workgroup_id_x = 1 +; 
VMEM-NEXT: enable_sgpr_workgroup_id_y = 0 +; VMEM-NEXT: enable_sgpr_workgroup_id_z = 0 +; VMEM-NEXT: enable_sgpr_workgroup_info = 0 +; VMEM-NEXT: enable_vgpr_workitem_id = 0 +; VMEM-NEXT: enable_exception_msb = 0 +; VMEM-NEXT: granulated_lds_size = 0 +; VMEM-NEXT: enable_exception = 0 +; VMEM-NEXT: enable_sgpr_private_segment_buffer = 1 +; VMEM-NEXT: enable_sgpr_dispatch_ptr = 0 +; VMEM-NEXT: enable_sgpr_queue_ptr = 0 +; VMEM-NEXT: enable_sgpr_kernarg_segment_ptr = 1 +; VMEM-NEXT: enable_sgpr_dispatch_id = 0 +; VMEM-NEXT: enable_sgpr_flat_scratch_init = 0 +; VMEM-NEXT: enable_sgpr_private_segment_size = 0 +; VMEM-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; VMEM-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; VMEM-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; VMEM-NEXT: enable_wavefront_size32 = 0 +; VMEM-NEXT: enable_ordered_append_gds = 0 +; VMEM-NEXT: private_element_size = 1 +; VMEM-NEXT: is_ptr64 = 1 +; VMEM-NEXT: is_dynamic_callstack = 0 +; VMEM-NEXT: is_debug_enabled = 0 +; VMEM-NEXT: is_xnack_enabled = 0 +; VMEM-NEXT: workitem_private_segment_byte_size = 36 +; VMEM-NEXT: workgroup_group_segment_byte_size = 0 +; VMEM-NEXT: gds_segment_byte_size = 0 +; VMEM-NEXT: kernarg_segment_byte_size = 8 +; VMEM-NEXT: workgroup_fbarrier_count = 0 +; VMEM-NEXT: wavefront_sgpr_count = 16 +; VMEM-NEXT: workitem_vgpr_count = 5 +; VMEM-NEXT: reserved_vgpr_first = 0 +; VMEM-NEXT: reserved_vgpr_count = 0 +; VMEM-NEXT: reserved_sgpr_first = 0 +; VMEM-NEXT: reserved_sgpr_count = 0 +; VMEM-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; VMEM-NEXT: debug_private_segment_buffer_sgpr = 0 +; VMEM-NEXT: kernarg_segment_alignment = 4 +; VMEM-NEXT: group_segment_alignment = 4 +; VMEM-NEXT: private_segment_alignment = 4 +; VMEM-NEXT: wavefront_size = 6 +; VMEM-NEXT: call_convention = -1 +; VMEM-NEXT: runtime_loader_kernel_symbol = 0 +; VMEM-NEXT: .end_amd_kernel_code_t +; VMEM-NEXT: ; %bb.0: ; %entry +; VMEM-NEXT: s_add_u32 s0, s0, s7 +; VMEM-NEXT: s_addc_u32 s1, s1, 0 +; 
VMEM-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; VMEM-NEXT: s_mov_b64 s[12:13], exec +; VMEM-NEXT: s_mov_b64 exec, 3 +; VMEM-NEXT: buffer_store_dword v3, off, s[0:3], 0 +; VMEM-NEXT: s_waitcnt lgkmcnt(0) +; VMEM-NEXT: v_writelane_b32 v3, s4, 0 +; VMEM-NEXT: v_writelane_b32 v3, s5, 1 +; VMEM-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:16 ; 4-byte Folded Spill +; VMEM-NEXT: buffer_load_dword v3, off, s[0:3], 0 +; VMEM-NEXT: s_waitcnt vmcnt(0) +; VMEM-NEXT: s_mov_b64 exec, s[12:13] +; VMEM-NEXT: v_mov_b32_e32 v1, 0 +; VMEM-NEXT: s_mov_b32 m0, -1 +; VMEM-NEXT: ds_read_b32 v1, v1 +; VMEM-NEXT: ; implicit-def: $sgpr4 +; VMEM-NEXT: s_mov_b32 s4, 0 +; VMEM-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s4 +; VMEM-NEXT: v_mov_b32_e32 v0, s4 +; VMEM-NEXT: s_mov_b64 s[10:11], exec +; VMEM-NEXT: s_mov_b64 exec, 1 +; VMEM-NEXT: buffer_store_dword v2, off, s[0:3], 0 +; VMEM-NEXT: v_writelane_b32 v2, s4, 0 +; VMEM-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:12 ; 4-byte Folded Spill +; VMEM-NEXT: buffer_load_dword v2, off, s[0:3], 0 +; VMEM-NEXT: s_waitcnt vmcnt(0) +; VMEM-NEXT: s_mov_b64 exec, s[10:11] +; VMEM-NEXT: s_waitcnt lgkmcnt(0) +; VMEM-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:28 ; 4-byte Folded Spill +; VMEM-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:24 ; 4-byte Folded Spill +; VMEM-NEXT: s_mov_b64 s[4:5], exec +; VMEM-NEXT: s_mov_b64 s[8:9], exec +; VMEM-NEXT: s_mov_b64 exec, 3 +; VMEM-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VMEM-NEXT: v_writelane_b32 v0, s4, 0 +; VMEM-NEXT: v_writelane_b32 v0, s5, 1 +; VMEM-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Spill +; VMEM-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; VMEM-NEXT: s_waitcnt vmcnt(0) +; VMEM-NEXT: s_mov_b64 exec, s[8:9] +; VMEM-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] +; VMEM-NEXT: s_mov_b64 exec, s[4:5] +; VMEM-NEXT: s_cbranch_execz .LBB1_3 +; VMEM-NEXT: .LBB1_1: ; %loop +; VMEM-NEXT: ; =>This Inner Loop Header: Depth=1 +; VMEM-NEXT: buffer_load_dword v0, off, 
s[0:3], 0 offset:28 ; 4-byte Folded Reload +; VMEM-NEXT: s_mov_b64 s[8:9], exec +; VMEM-NEXT: s_mov_b64 exec, 1 +; VMEM-NEXT: buffer_store_dword v3, off, s[0:3], 0 +; VMEM-NEXT: buffer_load_dword v3, off, s[0:3], 0 offset:12 ; 4-byte Folded Reload +; VMEM-NEXT: s_waitcnt vmcnt(0) +; VMEM-NEXT: v_readlane_b32 s5, v3, 0 +; VMEM-NEXT: buffer_load_dword v3, off, s[0:3], 0 +; VMEM-NEXT: s_waitcnt vmcnt(0) +; VMEM-NEXT: s_mov_b64 exec, s[8:9] +; VMEM-NEXT: s_mov_b32 m0, -1 +; VMEM-NEXT: ; implicit-def: $sgpr4 +; VMEM-NEXT: v_mov_b32_e32 v1, s4 +; VMEM-NEXT: ds_read_b32 v1, v1 +; VMEM-NEXT: s_mov_b32 s4, 1 +; VMEM-NEXT: s_add_i32 s4, s5, s4 +; VMEM-NEXT: ; implicit-def: $sgpr6 +; VMEM-NEXT: s_waitcnt lgkmcnt(0) +; VMEM-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 +; VMEM-NEXT: s_mov_b32 s6, 0x100 +; VMEM-NEXT: s_cmp_lg_u32 s5, s6 +; VMEM-NEXT: v_mov_b32_e32 v1, v0 +; VMEM-NEXT: s_mov_b64 s[6:7], exec +; VMEM-NEXT: s_mov_b64 exec, 1 +; VMEM-NEXT: buffer_store_dword v2, off, s[0:3], 0 +; VMEM-NEXT: v_writelane_b32 v2, s4, 0 +; VMEM-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:12 ; 4-byte Folded Spill +; VMEM-NEXT: buffer_load_dword v2, off, s[0:3], 0 +; VMEM-NEXT: s_waitcnt vmcnt(0) +; VMEM-NEXT: s_mov_b64 exec, s[6:7] +; VMEM-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:28 ; 4-byte Folded Spill +; VMEM-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:32 ; 4-byte Folded Spill +; VMEM-NEXT: s_cbranch_scc1 .LBB1_1 +; VMEM-NEXT: ; %bb.2: ; %Flow +; VMEM-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:32 ; 4-byte Folded Reload +; VMEM-NEXT: s_waitcnt vmcnt(0) +; VMEM-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:24 ; 4-byte Folded Spill +; VMEM-NEXT: .LBB1_3: ; %end +; VMEM-NEXT: buffer_load_dword v2, off, s[0:3], 0 offset:24 ; 4-byte Folded Reload +; VMEM-NEXT: s_mov_b64 s[8:9], exec +; VMEM-NEXT: s_mov_b64 exec, 3 +; VMEM-NEXT: buffer_store_dword v4, off, s[0:3], 0 +; VMEM-NEXT: buffer_load_dword v4, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload +; VMEM-NEXT: 
s_waitcnt vmcnt(0) +; VMEM-NEXT: v_readlane_b32 s6, v4, 0 +; VMEM-NEXT: v_readlane_b32 s7, v4, 1 +; VMEM-NEXT: buffer_load_dword v4, off, s[0:3], 0 +; VMEM-NEXT: s_waitcnt vmcnt(0) +; VMEM-NEXT: s_mov_b64 exec, s[8:9] +; VMEM-NEXT: s_or_b64 exec, exec, s[6:7] +; VMEM-NEXT: s_mov_b64 s[6:7], exec +; VMEM-NEXT: s_mov_b64 exec, 3 +; VMEM-NEXT: buffer_store_dword v3, off, s[0:3], 0 +; VMEM-NEXT: buffer_load_dword v3, off, s[0:3], 0 offset:16 ; 4-byte Folded Reload +; VMEM-NEXT: s_waitcnt vmcnt(0) +; VMEM-NEXT: v_readlane_b32 s4, v3, 0 +; VMEM-NEXT: v_readlane_b32 s5, v3, 1 +; VMEM-NEXT: buffer_load_dword v3, off, s[0:3], 0 +; VMEM-NEXT: s_waitcnt vmcnt(0) +; VMEM-NEXT: s_mov_b64 exec, s[6:7] +; VMEM-NEXT: v_mov_b32_e32 v0, s4 +; VMEM-NEXT: v_mov_b32_e32 v1, s5 +; VMEM-NEXT: flat_store_dword v[0:1], v2 +; VMEM-NEXT: s_endpgm +; +; VGPR-LABEL: divergent_loop: +; VGPR: .amd_kernel_code_t +; VGPR-NEXT: amd_code_version_major = 1 +; VGPR-NEXT: amd_code_version_minor = 2 +; VGPR-NEXT: amd_machine_kind = 1 +; VGPR-NEXT: amd_machine_version_major = 7 +; VGPR-NEXT: amd_machine_version_minor = 0 +; VGPR-NEXT: amd_machine_version_stepping = 0 +; VGPR-NEXT: kernel_code_entry_byte_offset = 256 +; VGPR-NEXT: kernel_code_prefetch_byte_size = 0 +; VGPR-NEXT: granulated_workitem_vgpr_count = 0 +; VGPR-NEXT: granulated_wavefront_sgpr_count = 1 +; VGPR-NEXT: priority = 0 +; VGPR-NEXT: float_mode = 240 +; VGPR-NEXT: priv = 0 +; VGPR-NEXT: enable_dx10_clamp = 1 +; VGPR-NEXT: debug_mode = 0 +; VGPR-NEXT: enable_ieee_mode = 1 +; VGPR-NEXT: enable_wgp_mode = 0 +; VGPR-NEXT: enable_mem_ordered = 0 +; VGPR-NEXT: enable_fwd_progress = 0 +; VGPR-NEXT: enable_sgpr_private_segment_wave_byte_offset = 1 +; VGPR-NEXT: user_sgpr_count = 6 +; VGPR-NEXT: enable_trap_handler = 0 +; VGPR-NEXT: enable_sgpr_workgroup_id_x = 1 +; VGPR-NEXT: enable_sgpr_workgroup_id_y = 0 +; VGPR-NEXT: enable_sgpr_workgroup_id_z = 0 +; VGPR-NEXT: enable_sgpr_workgroup_info = 0 +; VGPR-NEXT: enable_vgpr_workitem_id = 0 +; 
VGPR-NEXT: enable_exception_msb = 0 +; VGPR-NEXT: granulated_lds_size = 0 +; VGPR-NEXT: enable_exception = 0 +; VGPR-NEXT: enable_sgpr_private_segment_buffer = 1 +; VGPR-NEXT: enable_sgpr_dispatch_ptr = 0 +; VGPR-NEXT: enable_sgpr_queue_ptr = 0 +; VGPR-NEXT: enable_sgpr_kernarg_segment_ptr = 1 +; VGPR-NEXT: enable_sgpr_dispatch_id = 0 +; VGPR-NEXT: enable_sgpr_flat_scratch_init = 0 +; VGPR-NEXT: enable_sgpr_private_segment_size = 0 +; VGPR-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; VGPR-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; VGPR-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; VGPR-NEXT: enable_wavefront_size32 = 0 +; VGPR-NEXT: enable_ordered_append_gds = 0 +; VGPR-NEXT: private_element_size = 1 +; VGPR-NEXT: is_ptr64 = 1 +; VGPR-NEXT: is_dynamic_callstack = 0 +; VGPR-NEXT: is_debug_enabled = 0 +; VGPR-NEXT: is_xnack_enabled = 0 +; VGPR-NEXT: workitem_private_segment_byte_size = 16 +; VGPR-NEXT: workgroup_group_segment_byte_size = 0 +; VGPR-NEXT: gds_segment_byte_size = 0 +; VGPR-NEXT: kernarg_segment_byte_size = 8 +; VGPR-NEXT: workgroup_fbarrier_count = 0 +; VGPR-NEXT: wavefront_sgpr_count = 10 +; VGPR-NEXT: workitem_vgpr_count = 4 +; VGPR-NEXT: reserved_vgpr_first = 0 +; VGPR-NEXT: reserved_vgpr_count = 0 +; VGPR-NEXT: reserved_sgpr_first = 0 +; VGPR-NEXT: reserved_sgpr_count = 0 +; VGPR-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; VGPR-NEXT: debug_private_segment_buffer_sgpr = 0 +; VGPR-NEXT: kernarg_segment_alignment = 4 +; VGPR-NEXT: group_segment_alignment = 4 +; VGPR-NEXT: private_segment_alignment = 4 +; VGPR-NEXT: wavefront_size = 6 +; VGPR-NEXT: call_convention = -1 +; VGPR-NEXT: runtime_loader_kernel_symbol = 0 +; VGPR-NEXT: .end_amd_kernel_code_t +; VGPR-NEXT: ; %bb.0: ; %entry +; VGPR-NEXT: s_add_u32 s0, s0, s7 +; VGPR-NEXT: s_addc_u32 s1, s1, 0 +; VGPR-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; VGPR-NEXT: s_waitcnt lgkmcnt(0) +; VGPR-NEXT: v_writelane_b32 v1, s4, 0 +; VGPR-NEXT: v_writelane_b32 v1, s5, 1 +; VGPR-NEXT: 
v_mov_b32_e32 v2, 0 +; VGPR-NEXT: s_mov_b32 m0, -1 +; VGPR-NEXT: ds_read_b32 v2, v2 +; VGPR-NEXT: ; implicit-def: $sgpr4 +; VGPR-NEXT: s_mov_b32 s4, 0 +; VGPR-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s4 +; VGPR-NEXT: v_mov_b32_e32 v0, s4 +; VGPR-NEXT: v_writelane_b32 v1, s4, 2 +; VGPR-NEXT: s_waitcnt lgkmcnt(0) +; VGPR-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:8 ; 4-byte Folded Spill +; VGPR-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Spill +; VGPR-NEXT: s_mov_b64 s[4:5], exec +; VGPR-NEXT: v_writelane_b32 v1, s4, 3 +; VGPR-NEXT: v_writelane_b32 v1, s5, 4 +; VGPR-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] +; VGPR-NEXT: s_mov_b64 exec, s[4:5] +; VGPR-NEXT: s_cbranch_execz .LBB1_3 +; VGPR-NEXT: .LBB1_1: ; %loop +; VGPR-NEXT: ; =>This Inner Loop Header: Depth=1 +; VGPR-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 ; 4-byte Folded Reload +; VGPR-NEXT: v_readlane_b32 s5, v1, 2 +; VGPR-NEXT: s_mov_b32 m0, -1 +; VGPR-NEXT: ; implicit-def: $sgpr4 +; VGPR-NEXT: v_mov_b32_e32 v2, s4 +; VGPR-NEXT: ds_read_b32 v2, v2 +; VGPR-NEXT: s_mov_b32 s4, 1 +; VGPR-NEXT: s_add_i32 s4, s5, s4 +; VGPR-NEXT: ; implicit-def: $sgpr6 +; VGPR-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VGPR-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 +; VGPR-NEXT: s_mov_b32 s6, 0x100 +; VGPR-NEXT: s_cmp_lg_u32 s5, s6 +; VGPR-NEXT: v_mov_b32_e32 v2, v0 +; VGPR-NEXT: v_writelane_b32 v1, s4, 2 +; VGPR-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:8 ; 4-byte Folded Spill +; VGPR-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:12 ; 4-byte Folded Spill +; VGPR-NEXT: s_cbranch_scc1 .LBB1_1 +; VGPR-NEXT: ; %bb.2: ; %Flow +; VGPR-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:12 ; 4-byte Folded Reload +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Spill +; VGPR-NEXT: .LBB1_3: ; %end +; VGPR-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload +; VGPR-NEXT: v_readlane_b32 s6, v1, 3 +; VGPR-NEXT: v_readlane_b32 
s7, v1, 4 +; VGPR-NEXT: s_or_b64 exec, exec, s[6:7] +; VGPR-NEXT: v_readlane_b32 s4, v1, 0 +; VGPR-NEXT: v_readlane_b32 s5, v1, 1 +; VGPR-NEXT: v_mov_b32_e32 v2, s4 +; VGPR-NEXT: v_mov_b32_e32 v3, s5 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: flat_store_dword v[2:3], v0 +; VGPR-NEXT: s_endpgm entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %load0 = load volatile i32, ptr addrspace(3) null @@ -155,95 +619,333 @@ ret void } -; GCN-LABEL: {{^}}divergent_if_else_endif: -; GCN: {{^}}; %bb.0: - -; GCN-DAG: s_mov_b32 m0, -1 -; GCN-DAG: v_mov_b32_e32 [[PTR0:v[0-9]+]], 0{{$}} -; GCN: ds_read_b32 [[LOAD0:v[0-9]+]], [[PTR0]] - -; Spill load -; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], 0 offset:[[LOAD0_OFFSET:[0-9]+]] ; 4-byte Folded Spill - -; GCN: s_mov_b32 [[ZERO:s[0-9]+]], 0 -; GCN: v_cmp_ne_u32_e64 [[CMP0:s\[[0-9]+:[0-9]\]]], v0, [[ZERO]] - -; GCN: s_mov_b64 s[[[SAVEEXEC_LO:[0-9]+]]:[[SAVEEXEC_HI:[0-9]+]]], exec -; GCN: s_and_b64 s[[[ANDEXEC_LO:[0-9]+]]:[[ANDEXEC_HI:[0-9]+]]], s[[[SAVEEXEC_LO:[0-9]+]]:[[SAVEEXEC_HI:[0-9]+]]], [[CMP0]] -; GCN: s_xor_b64 s[[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]], s[[[ANDEXEC_LO]]:[[ANDEXEC_HI]]], s[[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]] - -; Spill saved exec -; VGPR: v_writelane_b32 [[SPILL_VGPR:v[0-9]+]], s[[SAVEEXEC_LO]], [[SAVEEXEC_LO_LANE:[0-9]+]] -; VGPR: v_writelane_b32 [[SPILL_VGPR]], s[[SAVEEXEC_HI]], [[SAVEEXEC_HI_LANE:[0-9]+]] - -; VMEM: v_writelane_b32 v[[V_SAVEEXEC:[0-9]+]], s[[SAVEEXEC_LO]], 0 -; VMEM: v_writelane_b32 v[[V_SAVEEXEC]], s[[SAVEEXEC_HI]], 1 -; VMEM: buffer_store_dword v[[V_SAVEEXEC]], off, s[0:3], 0 offset:[[SAVEEXEC_OFFSET:[0-9]+]] ; 4-byte Folded Spill - -; GCN: s_mov_b64 exec, [[CMP0]] - -; FIXME: It makes no sense to put this skip here -; GCN: s_cbranch_execz [[FLOW:.LBB[0-9]+_[0-9]+]] -; GCN-NEXT: s_branch [[ELSE:.LBB[0-9]+_[0-9]+]] - -; GCN: [[FLOW]]: ; %Flow -; GCN: buffer_load_dword [[FLOW_VAL:v[0-9]+]], off, s[0:3], 0 offset:[[FLOW_VAL_OFFSET:[0-9]+]] ; 4-byte Folded Reload -; VGPR: v_readlane_b32 
s[[FLOW_S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_LO_LANE]] -; VGPR: v_readlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_HI_LANE]] - -; VMEM: buffer_load_dword v[[FLOW_V_RELOAD_SAVEEXEC:[0-9]+]], off, s[0:3], 0 offset:[[SAVEEXEC_OFFSET]] -; VMEM: s_waitcnt vmcnt(0) -; VMEM: v_readlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[FLOW_V_RELOAD_SAVEEXEC]], 0 -; VMEM: v_readlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[FLOW_V_RELOAD_SAVEEXEC]], 1 - -; GCN: s_or_saveexec_b64 s[[[FLOW_S_RELOAD_SAVEEXEC_LO_SAVEEXEC:[0-9]+]]:[[FLOW_S_RELOAD_SAVEEXEC_HI_SAVEEXEC:[0-9]+]]], s[[[FLOW_S_RELOAD_SAVEEXEC_LO]]:[[FLOW_S_RELOAD_SAVEEXEC_HI]]] - -; Regular spill value restored after exec modification -; Followed by spill -; GCN: buffer_store_dword [[FLOW_VAL]], off, s[0:3], 0 offset:[[RESULT_OFFSET:[0-9]+]] ; 4-byte Folded Spill - -; GCN: s_and_b64 s[[[FLOW_AND_EXEC_LO:[0-9]+]]:[[FLOW_AND_EXEC_HI:[0-9]+]]], exec, s[[[FLOW_S_RELOAD_SAVEEXEC_LO_SAVEEXEC]]:[[FLOW_S_RELOAD_SAVEEXEC_HI_SAVEEXEC]]] - -; Spill saved exec -; VGPR: v_writelane_b32 [[SPILL_VGPR]], s[[FLOW_AND_EXEC_LO]], [[FLOW_SAVEEXEC_LO_LANE:[0-9]+]] -; VGPR: v_writelane_b32 [[SPILL_VGPR]], s[[FLOW_AND_EXEC_HI]], [[FLOW_SAVEEXEC_HI_LANE:[0-9]+]] - -; VMEM: v_writelane_b32 v[[FLOW_V_SAVEEXEC:[0-9]+]], s[[FLOW_AND_EXEC_LO]], 0 -; VMEM: v_writelane_b32 v[[FLOW_V_SAVEEXEC]], s[[FLOW_AND_EXEC_HI]], 1 -; VMEM: buffer_store_dword v[[FLOW_V_SAVEEXEC]], off, s[0:3], 0 offset:[[FLOW_SAVEEXEC_OFFSET:[0-9]+]] ; 4-byte Folded Spill - -; GCN: s_xor_b64 exec, exec, s[[[FLOW_AND_EXEC_LO]]:[[FLOW_AND_EXEC_HI]]] -; GCN-NEXT: s_cbranch_execz [[ENDIF:.LBB[0-9]+_[0-9]+]] - - -; GCN: ; %bb.{{[0-9]+}}: ; %if -; GCN: buffer_load_dword v[[LOAD0_RELOAD:[0-9]+]], off, s[0:3], 0 offset:[[LOAD0_OFFSET]] ; 4-byte Folded Reload -; GCN: ds_read_b32 -; GCN: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, v{{[0-9]+}}, v[[LOAD0_RELOAD]] -; GCN: buffer_store_dword [[ADD]], off, s[0:3], 0 offset:[[RESULT_OFFSET]] ; 
4-byte Folded Spill -; GCN-NEXT: s_branch [[ENDIF:.LBB[0-9]+_[0-9]+]] - -; GCN: [[ELSE]]: ; %else -; GCN: buffer_load_dword v[[LOAD0_RELOAD:[0-9]+]], off, s[0:3], 0 offset:[[LOAD0_OFFSET]] ; 4-byte Folded Reload -; GCN: v_subrev_i32_e32 [[SUB:v[0-9]+]], vcc, v{{[0-9]+}}, v[[LOAD0_RELOAD]] -; GCN: buffer_store_dword [[ADD]], off, s[0:3], 0 offset:[[FLOW_RESULT_OFFSET:[0-9]+]] ; 4-byte Folded Spill -; GCN-NEXT: s_branch [[FLOW]] - -; GCN: [[ENDIF]]: -; GCN: buffer_load_dword v[[RESULT:[0-9]+]], off, s[0:3], 0 offset:[[RESULT_OFFSET]] ; 4-byte Folded Reload -; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[FLOW_SAVEEXEC_LO_LANE]] -; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[FLOW_SAVEEXEC_HI_LANE]] - - -; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC:[0-9]+]], off, s[0:3], 0 offset:[[FLOW_SAVEEXEC_OFFSET]] ; 4-byte Folded Reload -; VMEM: s_waitcnt vmcnt(0) -; VMEM: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[V_RELOAD_SAVEEXEC]], 0 -; VMEM: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[V_RELOAD_SAVEEXEC]], 1 - -; GCN: s_or_b64 exec, exec, s[[[S_RELOAD_SAVEEXEC_LO]]:[[S_RELOAD_SAVEEXEC_HI]]] - -; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[RESULT]] define amdgpu_kernel void @divergent_if_else_endif(ptr addrspace(1) %out) #0 { +; VMEM-LABEL: divergent_if_else_endif: +; VMEM: .amd_kernel_code_t +; VMEM-NEXT: amd_code_version_major = 1 +; VMEM-NEXT: amd_code_version_minor = 2 +; VMEM-NEXT: amd_machine_kind = 1 +; VMEM-NEXT: amd_machine_version_major = 7 +; VMEM-NEXT: amd_machine_version_minor = 0 +; VMEM-NEXT: amd_machine_version_stepping = 0 +; VMEM-NEXT: kernel_code_entry_byte_offset = 256 +; VMEM-NEXT: kernel_code_prefetch_byte_size = 0 +; VMEM-NEXT: granulated_workitem_vgpr_count = 1 +; VMEM-NEXT: granulated_wavefront_sgpr_count = 1 +; VMEM-NEXT: priority = 0 +; VMEM-NEXT: float_mode = 240 +; VMEM-NEXT: priv = 0 +; VMEM-NEXT: enable_dx10_clamp = 1 +; VMEM-NEXT: debug_mode = 0 +; VMEM-NEXT: 
enable_ieee_mode = 1 +; VMEM-NEXT: enable_wgp_mode = 0 +; VMEM-NEXT: enable_mem_ordered = 0 +; VMEM-NEXT: enable_fwd_progress = 0 +; VMEM-NEXT: enable_sgpr_private_segment_wave_byte_offset = 1 +; VMEM-NEXT: user_sgpr_count = 6 +; VMEM-NEXT: enable_trap_handler = 0 +; VMEM-NEXT: enable_sgpr_workgroup_id_x = 1 +; VMEM-NEXT: enable_sgpr_workgroup_id_y = 0 +; VMEM-NEXT: enable_sgpr_workgroup_id_z = 0 +; VMEM-NEXT: enable_sgpr_workgroup_info = 0 +; VMEM-NEXT: enable_vgpr_workitem_id = 0 +; VMEM-NEXT: enable_exception_msb = 0 +; VMEM-NEXT: granulated_lds_size = 0 +; VMEM-NEXT: enable_exception = 0 +; VMEM-NEXT: enable_sgpr_private_segment_buffer = 1 +; VMEM-NEXT: enable_sgpr_dispatch_ptr = 0 +; VMEM-NEXT: enable_sgpr_queue_ptr = 0 +; VMEM-NEXT: enable_sgpr_kernarg_segment_ptr = 1 +; VMEM-NEXT: enable_sgpr_dispatch_id = 0 +; VMEM-NEXT: enable_sgpr_flat_scratch_init = 0 +; VMEM-NEXT: enable_sgpr_private_segment_size = 0 +; VMEM-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; VMEM-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; VMEM-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; VMEM-NEXT: enable_wavefront_size32 = 0 +; VMEM-NEXT: enable_ordered_append_gds = 0 +; VMEM-NEXT: private_element_size = 1 +; VMEM-NEXT: is_ptr64 = 1 +; VMEM-NEXT: is_dynamic_callstack = 0 +; VMEM-NEXT: is_debug_enabled = 0 +; VMEM-NEXT: is_xnack_enabled = 0 +; VMEM-NEXT: workitem_private_segment_byte_size = 40 +; VMEM-NEXT: workgroup_group_segment_byte_size = 0 +; VMEM-NEXT: gds_segment_byte_size = 0 +; VMEM-NEXT: kernarg_segment_byte_size = 8 +; VMEM-NEXT: workgroup_fbarrier_count = 0 +; VMEM-NEXT: wavefront_sgpr_count = 14 +; VMEM-NEXT: workitem_vgpr_count = 5 +; VMEM-NEXT: reserved_vgpr_first = 0 +; VMEM-NEXT: reserved_vgpr_count = 0 +; VMEM-NEXT: reserved_sgpr_first = 0 +; VMEM-NEXT: reserved_sgpr_count = 0 +; VMEM-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; VMEM-NEXT: debug_private_segment_buffer_sgpr = 0 +; VMEM-NEXT: kernarg_segment_alignment = 4 +; VMEM-NEXT: 
group_segment_alignment = 4 +; VMEM-NEXT: private_segment_alignment = 4 +; VMEM-NEXT: wavefront_size = 6 +; VMEM-NEXT: call_convention = -1 +; VMEM-NEXT: runtime_loader_kernel_symbol = 0 +; VMEM-NEXT: .end_amd_kernel_code_t +; VMEM-NEXT: ; %bb.0: ; %entry +; VMEM-NEXT: s_add_u32 s0, s0, s7 +; VMEM-NEXT: s_addc_u32 s1, s1, 0 +; VMEM-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; VMEM-NEXT: s_mov_b64 s[10:11], exec +; VMEM-NEXT: s_mov_b64 exec, 3 +; VMEM-NEXT: buffer_store_dword v2, off, s[0:3], 0 +; VMEM-NEXT: s_waitcnt lgkmcnt(0) +; VMEM-NEXT: v_writelane_b32 v2, s4, 0 +; VMEM-NEXT: v_writelane_b32 v2, s5, 1 +; VMEM-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:12 ; 4-byte Folded Spill +; VMEM-NEXT: buffer_load_dword v2, off, s[0:3], 0 +; VMEM-NEXT: s_waitcnt vmcnt(0) +; VMEM-NEXT: s_mov_b64 exec, s[10:11] +; VMEM-NEXT: v_mov_b32_e32 v1, 0 +; VMEM-NEXT: s_mov_b32 m0, -1 +; VMEM-NEXT: ds_read_b32 v1, v1 +; VMEM-NEXT: s_waitcnt lgkmcnt(0) +; VMEM-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:32 ; 4-byte Folded Spill +; VMEM-NEXT: ; implicit-def: $sgpr4 +; VMEM-NEXT: s_mov_b32 s4, 0 +; VMEM-NEXT: v_cmp_ne_u32_e64 s[4:5], v0, s4 +; VMEM-NEXT: ; implicit-def: $sgpr6 +; VMEM-NEXT: v_mov_b32_e32 v0, s6 +; VMEM-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:28 ; 4-byte Folded Spill +; VMEM-NEXT: s_mov_b64 s[6:7], exec +; VMEM-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] +; VMEM-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7] +; VMEM-NEXT: s_mov_b64 s[8:9], exec +; VMEM-NEXT: s_mov_b64 exec, 3 +; VMEM-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VMEM-NEXT: v_writelane_b32 v0, s6, 0 +; VMEM-NEXT: v_writelane_b32 v0, s7, 1 +; VMEM-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Spill +; VMEM-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; VMEM-NEXT: s_waitcnt vmcnt(0) +; VMEM-NEXT: s_mov_b64 exec, s[8:9] +; VMEM-NEXT: s_mov_b64 exec, s[4:5] +; VMEM-NEXT: s_cbranch_execz .LBB2_1 +; VMEM-NEXT: s_branch .LBB2_3 +; VMEM-NEXT: .LBB2_1: ; %Flow +; VMEM-NEXT: 
buffer_load_dword v0, off, s[0:3], 0 offset:28 ; 4-byte Folded Reload +; VMEM-NEXT: s_mov_b64 s[8:9], exec +; VMEM-NEXT: s_mov_b64 exec, 3 +; VMEM-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; VMEM-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload +; VMEM-NEXT: s_waitcnt vmcnt(0) +; VMEM-NEXT: v_readlane_b32 s4, v1, 0 +; VMEM-NEXT: v_readlane_b32 s5, v1, 1 +; VMEM-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; VMEM-NEXT: s_waitcnt vmcnt(0) +; VMEM-NEXT: s_mov_b64 exec, s[8:9] +; VMEM-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] +; VMEM-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:36 ; 4-byte Folded Spill +; VMEM-NEXT: s_and_b64 s[4:5], exec, s[4:5] +; VMEM-NEXT: s_mov_b64 s[6:7], exec +; VMEM-NEXT: s_mov_b64 exec, 3 +; VMEM-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VMEM-NEXT: v_writelane_b32 v0, s4, 0 +; VMEM-NEXT: v_writelane_b32 v0, s5, 1 +; VMEM-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:20 ; 4-byte Folded Spill +; VMEM-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; VMEM-NEXT: s_waitcnt vmcnt(0) +; VMEM-NEXT: s_mov_b64 exec, s[6:7] +; VMEM-NEXT: s_xor_b64 exec, exec, s[4:5] +; VMEM-NEXT: s_cbranch_execz .LBB2_4 +; VMEM-NEXT: ; %bb.2: ; %if +; VMEM-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:32 ; 4-byte Folded Reload +; VMEM-NEXT: s_mov_b32 m0, -1 +; VMEM-NEXT: ; implicit-def: $sgpr4 +; VMEM-NEXT: v_mov_b32_e32 v1, s4 +; VMEM-NEXT: ds_read_b32 v1, v1 +; VMEM-NEXT: ; implicit-def: $sgpr4 +; VMEM-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VMEM-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; VMEM-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:36 ; 4-byte Folded Spill +; VMEM-NEXT: s_branch .LBB2_4 +; VMEM-NEXT: .LBB2_3: ; %else +; VMEM-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:32 ; 4-byte Folded Reload +; VMEM-NEXT: s_mov_b32 m0, -1 +; VMEM-NEXT: ; implicit-def: $sgpr4 +; VMEM-NEXT: v_mov_b32_e32 v1, s4 +; VMEM-NEXT: ds_read_b32 v1, v1 +; VMEM-NEXT: ; implicit-def: $sgpr4 +; VMEM-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; 
VMEM-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 +; VMEM-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:28 ; 4-byte Folded Spill +; VMEM-NEXT: s_branch .LBB2_1 +; VMEM-NEXT: .LBB2_4: ; %endif +; VMEM-NEXT: buffer_load_dword v2, off, s[0:3], 0 offset:36 ; 4-byte Folded Reload +; VMEM-NEXT: s_mov_b64 s[8:9], exec +; VMEM-NEXT: s_mov_b64 exec, 3 +; VMEM-NEXT: buffer_store_dword v4, off, s[0:3], 0 +; VMEM-NEXT: buffer_load_dword v4, off, s[0:3], 0 offset:20 ; 4-byte Folded Reload +; VMEM-NEXT: s_waitcnt vmcnt(0) +; VMEM-NEXT: v_readlane_b32 s6, v4, 0 +; VMEM-NEXT: v_readlane_b32 s7, v4, 1 +; VMEM-NEXT: buffer_load_dword v4, off, s[0:3], 0 +; VMEM-NEXT: s_waitcnt vmcnt(0) +; VMEM-NEXT: s_mov_b64 exec, s[8:9] +; VMEM-NEXT: s_or_b64 exec, exec, s[6:7] +; VMEM-NEXT: s_mov_b64 s[6:7], exec +; VMEM-NEXT: s_mov_b64 exec, 3 +; VMEM-NEXT: buffer_store_dword v3, off, s[0:3], 0 +; VMEM-NEXT: buffer_load_dword v3, off, s[0:3], 0 offset:12 ; 4-byte Folded Reload +; VMEM-NEXT: s_waitcnt vmcnt(0) +; VMEM-NEXT: v_readlane_b32 s4, v3, 0 +; VMEM-NEXT: v_readlane_b32 s5, v3, 1 +; VMEM-NEXT: buffer_load_dword v3, off, s[0:3], 0 +; VMEM-NEXT: s_waitcnt vmcnt(0) +; VMEM-NEXT: s_mov_b64 exec, s[6:7] +; VMEM-NEXT: v_mov_b32_e32 v0, s4 +; VMEM-NEXT: v_mov_b32_e32 v1, s5 +; VMEM-NEXT: flat_store_dword v[0:1], v2 +; VMEM-NEXT: s_endpgm +; +; VGPR-LABEL: divergent_if_else_endif: +; VGPR: .amd_kernel_code_t +; VGPR-NEXT: amd_code_version_major = 1 +; VGPR-NEXT: amd_code_version_minor = 2 +; VGPR-NEXT: amd_machine_kind = 1 +; VGPR-NEXT: amd_machine_version_major = 7 +; VGPR-NEXT: amd_machine_version_minor = 0 +; VGPR-NEXT: amd_machine_version_stepping = 0 +; VGPR-NEXT: kernel_code_entry_byte_offset = 256 +; VGPR-NEXT: kernel_code_prefetch_byte_size = 0 +; VGPR-NEXT: granulated_workitem_vgpr_count = 0 +; VGPR-NEXT: granulated_wavefront_sgpr_count = 1 +; VGPR-NEXT: priority = 0 +; VGPR-NEXT: float_mode = 240 +; VGPR-NEXT: priv = 0 +; VGPR-NEXT: enable_dx10_clamp = 1 +; VGPR-NEXT: debug_mode = 0 +; 
VGPR-NEXT: enable_ieee_mode = 1 +; VGPR-NEXT: enable_wgp_mode = 0 +; VGPR-NEXT: enable_mem_ordered = 0 +; VGPR-NEXT: enable_fwd_progress = 0 +; VGPR-NEXT: enable_sgpr_private_segment_wave_byte_offset = 1 +; VGPR-NEXT: user_sgpr_count = 6 +; VGPR-NEXT: enable_trap_handler = 0 +; VGPR-NEXT: enable_sgpr_workgroup_id_x = 1 +; VGPR-NEXT: enable_sgpr_workgroup_id_y = 0 +; VGPR-NEXT: enable_sgpr_workgroup_id_z = 0 +; VGPR-NEXT: enable_sgpr_workgroup_info = 0 +; VGPR-NEXT: enable_vgpr_workitem_id = 0 +; VGPR-NEXT: enable_exception_msb = 0 +; VGPR-NEXT: granulated_lds_size = 0 +; VGPR-NEXT: enable_exception = 0 +; VGPR-NEXT: enable_sgpr_private_segment_buffer = 1 +; VGPR-NEXT: enable_sgpr_dispatch_ptr = 0 +; VGPR-NEXT: enable_sgpr_queue_ptr = 0 +; VGPR-NEXT: enable_sgpr_kernarg_segment_ptr = 1 +; VGPR-NEXT: enable_sgpr_dispatch_id = 0 +; VGPR-NEXT: enable_sgpr_flat_scratch_init = 0 +; VGPR-NEXT: enable_sgpr_private_segment_size = 0 +; VGPR-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; VGPR-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; VGPR-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; VGPR-NEXT: enable_wavefront_size32 = 0 +; VGPR-NEXT: enable_ordered_append_gds = 0 +; VGPR-NEXT: private_element_size = 1 +; VGPR-NEXT: is_ptr64 = 1 +; VGPR-NEXT: is_dynamic_callstack = 0 +; VGPR-NEXT: is_debug_enabled = 0 +; VGPR-NEXT: is_xnack_enabled = 0 +; VGPR-NEXT: workitem_private_segment_byte_size = 16 +; VGPR-NEXT: workgroup_group_segment_byte_size = 0 +; VGPR-NEXT: gds_segment_byte_size = 0 +; VGPR-NEXT: kernarg_segment_byte_size = 8 +; VGPR-NEXT: workgroup_fbarrier_count = 0 +; VGPR-NEXT: wavefront_sgpr_count = 10 +; VGPR-NEXT: workitem_vgpr_count = 4 +; VGPR-NEXT: reserved_vgpr_first = 0 +; VGPR-NEXT: reserved_vgpr_count = 0 +; VGPR-NEXT: reserved_sgpr_first = 0 +; VGPR-NEXT: reserved_sgpr_count = 0 +; VGPR-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; VGPR-NEXT: debug_private_segment_buffer_sgpr = 0 +; VGPR-NEXT: kernarg_segment_alignment = 4 +; VGPR-NEXT: 
group_segment_alignment = 4 +; VGPR-NEXT: private_segment_alignment = 4 +; VGPR-NEXT: wavefront_size = 6 +; VGPR-NEXT: call_convention = -1 +; VGPR-NEXT: runtime_loader_kernel_symbol = 0 +; VGPR-NEXT: .end_amd_kernel_code_t +; VGPR-NEXT: ; %bb.0: ; %entry +; VGPR-NEXT: s_add_u32 s0, s0, s7 +; VGPR-NEXT: s_addc_u32 s1, s1, 0 +; VGPR-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; VGPR-NEXT: s_waitcnt lgkmcnt(0) +; VGPR-NEXT: v_writelane_b32 v1, s4, 0 +; VGPR-NEXT: v_writelane_b32 v1, s5, 1 +; VGPR-NEXT: v_mov_b32_e32 v2, 0 +; VGPR-NEXT: s_mov_b32 m0, -1 +; VGPR-NEXT: ds_read_b32 v2, v2 +; VGPR-NEXT: s_waitcnt lgkmcnt(0) +; VGPR-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:8 ; 4-byte Folded Spill +; VGPR-NEXT: ; implicit-def: $sgpr4 +; VGPR-NEXT: s_mov_b32 s4, 0 +; VGPR-NEXT: v_cmp_ne_u32_e64 s[4:5], v0, s4 +; VGPR-NEXT: ; implicit-def: $sgpr6 +; VGPR-NEXT: v_mov_b32_e32 v0, s6 +; VGPR-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Spill +; VGPR-NEXT: s_mov_b64 s[6:7], exec +; VGPR-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] +; VGPR-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7] +; VGPR-NEXT: v_writelane_b32 v1, s6, 2 +; VGPR-NEXT: v_writelane_b32 v1, s7, 3 +; VGPR-NEXT: s_mov_b64 exec, s[4:5] +; VGPR-NEXT: s_cbranch_execz .LBB2_1 +; VGPR-NEXT: s_branch .LBB2_3 +; VGPR-NEXT: .LBB2_1: ; %Flow +; VGPR-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload +; VGPR-NEXT: v_readlane_b32 s4, v1, 2 +; VGPR-NEXT: v_readlane_b32 s5, v1, 3 +; VGPR-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:12 ; 4-byte Folded Spill +; VGPR-NEXT: s_and_b64 s[4:5], exec, s[4:5] +; VGPR-NEXT: v_writelane_b32 v1, s4, 4 +; VGPR-NEXT: v_writelane_b32 v1, s5, 5 +; VGPR-NEXT: s_xor_b64 exec, exec, s[4:5] +; VGPR-NEXT: s_cbranch_execz .LBB2_4 +; VGPR-NEXT: ; %bb.2: ; %if +; VGPR-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 ; 4-byte Folded Reload +; VGPR-NEXT: s_mov_b32 m0, -1 +; 
VGPR-NEXT: ; implicit-def: $sgpr4 +; VGPR-NEXT: v_mov_b32_e32 v2, s4 +; VGPR-NEXT: ds_read_b32 v2, v2 +; VGPR-NEXT: ; implicit-def: $sgpr4 +; VGPR-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VGPR-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; VGPR-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:12 ; 4-byte Folded Spill +; VGPR-NEXT: s_branch .LBB2_4 +; VGPR-NEXT: .LBB2_3: ; %else +; VGPR-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 ; 4-byte Folded Reload +; VGPR-NEXT: s_mov_b32 m0, -1 +; VGPR-NEXT: ; implicit-def: $sgpr4 +; VGPR-NEXT: v_mov_b32_e32 v2, s4 +; VGPR-NEXT: ds_read_b32 v2, v2 +; VGPR-NEXT: ; implicit-def: $sgpr4 +; VGPR-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VGPR-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 +; VGPR-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Spill +; VGPR-NEXT: s_branch .LBB2_1 +; VGPR-NEXT: .LBB2_4: ; %endif +; VGPR-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:12 ; 4-byte Folded Reload +; VGPR-NEXT: v_readlane_b32 s6, v1, 4 +; VGPR-NEXT: v_readlane_b32 s7, v1, 5 +; VGPR-NEXT: s_or_b64 exec, exec, s[6:7] +; VGPR-NEXT: v_readlane_b32 s4, v1, 0 +; VGPR-NEXT: v_readlane_b32 s5, v1, 1 +; VGPR-NEXT: v_mov_b32_e32 v2, s4 +; VGPR-NEXT: v_mov_b32_e32 v3, s5 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: flat_store_dword v[2:3], v0 +; VGPR-NEXT: s_endpgm entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %load0 = load volatile i32, ptr addrspace(3) null Index: llvm/test/CodeGen/AMDGPU/ds_read2.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/ds_read2.ll +++ llvm/test/CodeGen/AMDGPU/ds_read2.ll @@ -942,7 +942,7 @@ ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm ; Index: llvm/test/CodeGen/AMDGPU/greedy-reverse-local-assignment.ll 
=================================================================== --- llvm/test/CodeGen/AMDGPU/greedy-reverse-local-assignment.ll +++ llvm/test/CodeGen/AMDGPU/greedy-reverse-local-assignment.ll @@ -30,12 +30,12 @@ ; REVERSEXNACK-LABEL: shuffle_v4f16_234u: ; REVERSEXNACK: ; %bb.0: ; REVERSEXNACK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; REVERSEXNACK-NEXT: v_mov_b32_e32 v6, v1 -; REVERSEXNACK-NEXT: v_mov_b32_e32 v5, v0 -; REVERSEXNACK-NEXT: v_mov_b32_e32 v4, v3 -; REVERSEXNACK-NEXT: v_mov_b32_e32 v3, v2 -; REVERSEXNACK-NEXT: global_load_dword v0, v[5:6], off offset:4 -; REVERSEXNACK-NEXT: global_load_dwordx2 v[1:2], v[3:4], off +; REVERSEXNACK-NEXT: v_mov_b32_e32 v6, v3 +; REVERSEXNACK-NEXT: v_mov_b32_e32 v4, v1 +; REVERSEXNACK-NEXT: v_mov_b32_e32 v3, v0 +; REVERSEXNACK-NEXT: v_mov_b32_e32 v5, v2 +; REVERSEXNACK-NEXT: global_load_dword v0, v[3:4], off offset:4 +; REVERSEXNACK-NEXT: global_load_dwordx2 v[1:2], v[5:6], off ; REVERSEXNACK-NEXT: s_waitcnt vmcnt(0) ; REVERSEXNACK-NEXT: s_setpc_b64 s[30:31] ; Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sbfe.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sbfe.ll +++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sbfe.ll @@ -1,9 +1,35 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN1 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN2s %s ; GCN-LABEL: {{^}}bfe_i32_arg_arg_arg: ; GCN: v_bfe_i32 define amdgpu_kernel void 
@bfe_i32_arg_arg_arg(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #0 { +; GCN1-LABEL: bfe_i32_arg_arg_arg: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_mov_b32 s4, s0 +; GCN1-NEXT: s_mov_b32 s5, s1 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_bfe_i32 v0, v0, s3, s3 +; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN1-NEXT: s_endpgm +; +; GCN2s-LABEL: bfe_i32_arg_arg_arg: +; GCN2s: ; %bb.0: +; GCN2s-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2s-NEXT: s_mov_b32 s7, 0xf000 +; GCN2s-NEXT: s_mov_b32 s6, -1 +; GCN2s-NEXT: s_waitcnt lgkmcnt(0) +; GCN2s-NEXT: v_mov_b32_e32 v0, s2 +; GCN2s-NEXT: s_mov_b32 s4, s0 +; GCN2s-NEXT: s_mov_b32 s5, s1 +; GCN2s-NEXT: v_bfe_i32 v0, v0, s3, s3 +; GCN2s-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN2s-NEXT: s_endpgm %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 %src0, i32 %src1, i32 %src1) store i32 %bfe_i32, ptr addrspace(1) %out, align 4 ret void @@ -12,6 +38,33 @@ ; GCN-LABEL: {{^}}bfe_i32_arg_arg_imm: ; GCN: v_bfe_i32 define amdgpu_kernel void @bfe_i32_arg_arg_imm(ptr addrspace(1) %out, i32 %src0, i32 %src1) #0 { +; GCN1-LABEL: bfe_i32_arg_arg_imm: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: v_mov_b32_e32 v0, 0x7b +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_mov_b32 s4, s0 +; GCN1-NEXT: s_mov_b32 s5, s1 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: v_bfe_i32 v0, s2, v1, v0 +; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN1-NEXT: s_endpgm +; +; GCN2s-LABEL: bfe_i32_arg_arg_imm: +; GCN2s: ; %bb.0: +; GCN2s-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2s-NEXT: v_mov_b32_e32 v1, 0x7b +; GCN2s-NEXT: s_mov_b32 s7, 0xf000 +; GCN2s-NEXT: s_mov_b32 s6, -1 +; GCN2s-NEXT: s_waitcnt lgkmcnt(0) +; GCN2s-NEXT: v_mov_b32_e32 v0, s3 +; GCN2s-NEXT: s_mov_b32 
s4, s0 +; GCN2s-NEXT: s_mov_b32 s5, s1 +; GCN2s-NEXT: v_bfe_i32 v0, s2, v0, v1 +; GCN2s-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN2s-NEXT: s_endpgm %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 %src0, i32 %src1, i32 123) store i32 %bfe_i32, ptr addrspace(1) %out, align 4 ret void @@ -20,6 +73,33 @@ ; GCN-LABEL: {{^}}bfe_i32_arg_imm_arg: ; GCN: v_bfe_i32 define amdgpu_kernel void @bfe_i32_arg_imm_arg(ptr addrspace(1) %out, i32 %src0, i32 %src2) #0 { +; GCN1-LABEL: bfe_i32_arg_imm_arg: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: v_mov_b32_e32 v0, 0x7b +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_mov_b32 s4, s0 +; GCN1-NEXT: s_mov_b32 s5, s1 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: v_bfe_i32 v0, s2, v0, v1 +; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN1-NEXT: s_endpgm +; +; GCN2s-LABEL: bfe_i32_arg_imm_arg: +; GCN2s: ; %bb.0: +; GCN2s-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2s-NEXT: v_mov_b32_e32 v0, 0x7b +; GCN2s-NEXT: s_mov_b32 s7, 0xf000 +; GCN2s-NEXT: s_mov_b32 s6, -1 +; GCN2s-NEXT: s_waitcnt lgkmcnt(0) +; GCN2s-NEXT: v_mov_b32_e32 v1, s3 +; GCN2s-NEXT: s_mov_b32 s4, s0 +; GCN2s-NEXT: s_mov_b32 s5, s1 +; GCN2s-NEXT: v_bfe_i32 v0, s2, v0, v1 +; GCN2s-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN2s-NEXT: s_endpgm %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 %src0, i32 123, i32 %src2) store i32 %bfe_i32, ptr addrspace(1) %out, align 4 ret void @@ -28,6 +108,35 @@ ; GCN-LABEL: {{^}}bfe_i32_imm_arg_arg: ; GCN: v_bfe_i32 define amdgpu_kernel void @bfe_i32_imm_arg_arg(ptr addrspace(1) %out, i32 %src1, i32 %src2) #0 { +; GCN1-LABEL: bfe_i32_imm_arg_arg: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_movk_i32 s8, 0x7b +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_mov_b32 s4, s0 +; GCN1-NEXT: s_mov_b32 s5, s1 +; 
GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: v_bfe_i32 v0, s8, v0, v1 +; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN1-NEXT: s_endpgm +; +; GCN2s-LABEL: bfe_i32_imm_arg_arg: +; GCN2s: ; %bb.0: +; GCN2s-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2s-NEXT: s_movk_i32 s8, 0x7b +; GCN2s-NEXT: s_mov_b32 s7, 0xf000 +; GCN2s-NEXT: s_mov_b32 s6, -1 +; GCN2s-NEXT: s_waitcnt lgkmcnt(0) +; GCN2s-NEXT: v_mov_b32_e32 v0, s2 +; GCN2s-NEXT: v_mov_b32_e32 v1, s3 +; GCN2s-NEXT: s_mov_b32 s4, s0 +; GCN2s-NEXT: s_mov_b32 s5, s1 +; GCN2s-NEXT: v_bfe_i32 v0, s8, v0, v1 +; GCN2s-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN2s-NEXT: s_endpgm %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 123, i32 %src1, i32 %src2) store i32 %bfe_i32, ptr addrspace(1) %out, align 4 ret void @@ -36,6 +145,41 @@ ; GCN-LABEL: {{^}}v_bfe_print_arg: ; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 2, 8 define amdgpu_kernel void @v_bfe_print_arg(ptr addrspace(1) %out, ptr addrspace(1) %src0) #0 { +; GCN1-LABEL: v_bfe_print_arg: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_mov_b32 s10, s6 +; GCN1-NEXT: s_mov_b32 s11, s7 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_mov_b32 s8, s2 +; GCN1-NEXT: s_mov_b32 s9, s3 +; GCN1-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GCN1-NEXT: s_mov_b32 s4, s0 +; GCN1-NEXT: s_mov_b32 s5, s1 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_bfe_i32 v0, v0, 2, 8 +; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN1-NEXT: s_endpgm +; +; GCN2s-LABEL: v_bfe_print_arg: +; GCN2s: ; %bb.0: +; GCN2s-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2s-NEXT: s_mov_b32 s7, 0xf000 +; GCN2s-NEXT: s_mov_b32 s6, -1 +; GCN2s-NEXT: s_mov_b32 s10, s6 +; GCN2s-NEXT: s_mov_b32 s11, s7 +; GCN2s-NEXT: s_waitcnt lgkmcnt(0) +; GCN2s-NEXT: s_mov_b32 s8, s2 +; GCN2s-NEXT: s_mov_b32 s9, s3 +; GCN2s-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; 
GCN2s-NEXT: s_mov_b32 s4, s0 +; GCN2s-NEXT: s_mov_b32 s5, s1 +; GCN2s-NEXT: s_waitcnt vmcnt(0) +; GCN2s-NEXT: v_bfe_i32 v0, v0, 2, 8 +; GCN2s-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN2s-NEXT: s_endpgm %load = load i32, ptr addrspace(1) %src0, align 4 %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 %load, i32 2, i32 8) store i32 %bfe_i32, ptr addrspace(1) %out, align 4 @@ -46,6 +190,25 @@ ; GCN-NOT: {{[^@]}}bfe ; GCN: s_endpgm define amdgpu_kernel void @bfe_i32_arg_0_width_reg_offset(ptr addrspace(1) %out, i32 %src0, i32 %src1) #0 { +; GCN1-LABEL: bfe_i32_arg_0_width_reg_offset: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN1-NEXT: s_mov_b32 s3, 0xf000 +; GCN1-NEXT: s_mov_b32 s2, -1 +; GCN1-NEXT: v_mov_b32_e32 v0, 0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN1-NEXT: s_endpgm +; +; GCN2s-LABEL: bfe_i32_arg_0_width_reg_offset: +; GCN2s: ; %bb.0: +; GCN2s-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN2s-NEXT: s_mov_b32 s3, 0xf000 +; GCN2s-NEXT: s_mov_b32 s2, -1 +; GCN2s-NEXT: v_mov_b32_e32 v0, 0 +; GCN2s-NEXT: s_waitcnt lgkmcnt(0) +; GCN2s-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN2s-NEXT: s_endpgm %bfe_u32 = call i32 @llvm.amdgcn.sbfe.i32(i32 %src0, i32 %src1, i32 0) store i32 %bfe_u32, ptr addrspace(1) %out, align 4 ret void @@ -55,6 +218,25 @@ ; GCN-NOT: {{[^@]}}bfe ; GCN: s_endpgm define amdgpu_kernel void @bfe_i32_arg_0_width_imm_offset(ptr addrspace(1) %out, i32 %src0, i32 %src1) #0 { +; GCN1-LABEL: bfe_i32_arg_0_width_imm_offset: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN1-NEXT: s_mov_b32 s3, 0xf000 +; GCN1-NEXT: s_mov_b32 s2, -1 +; GCN1-NEXT: v_mov_b32_e32 v0, 0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN1-NEXT: s_endpgm +; +; GCN2s-LABEL: bfe_i32_arg_0_width_imm_offset: +; GCN2s: ; %bb.0: +; GCN2s-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN2s-NEXT: s_mov_b32 s3, 0xf000 +; GCN2s-NEXT: 
s_mov_b32 s2, -1 +; GCN2s-NEXT: v_mov_b32_e32 v0, 0 +; GCN2s-NEXT: s_waitcnt lgkmcnt(0) +; GCN2s-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN2s-NEXT: s_endpgm %bfe_u32 = call i32 @llvm.amdgcn.sbfe.i32(i32 %src0, i32 8, i32 0) store i32 %bfe_u32, ptr addrspace(1) %out, align 4 ret void @@ -65,6 +247,43 @@ ; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}} ; GCN: s_endpgm define amdgpu_kernel void @bfe_i32_test_6(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; GCN1-LABEL: bfe_i32_test_6: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_mov_b32 s10, s6 +; GCN1-NEXT: s_mov_b32 s11, s7 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_mov_b32 s8, s2 +; GCN1-NEXT: s_mov_b32 s9, s3 +; GCN1-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GCN1-NEXT: s_mov_b32 s4, s0 +; GCN1-NEXT: s_mov_b32 s5, s1 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_lshlrev_b32_e32 v0, 31, v0 +; GCN1-NEXT: v_ashrrev_i32_e32 v0, 1, v0 +; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN1-NEXT: s_endpgm +; +; GCN2s-LABEL: bfe_i32_test_6: +; GCN2s: ; %bb.0: +; GCN2s-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2s-NEXT: s_mov_b32 s7, 0xf000 +; GCN2s-NEXT: s_mov_b32 s6, -1 +; GCN2s-NEXT: s_mov_b32 s10, s6 +; GCN2s-NEXT: s_mov_b32 s11, s7 +; GCN2s-NEXT: s_waitcnt lgkmcnt(0) +; GCN2s-NEXT: s_mov_b32 s8, s2 +; GCN2s-NEXT: s_mov_b32 s9, s3 +; GCN2s-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GCN2s-NEXT: s_mov_b32 s4, s0 +; GCN2s-NEXT: s_mov_b32 s5, s1 +; GCN2s-NEXT: s_waitcnt vmcnt(0) +; GCN2s-NEXT: v_lshlrev_b32_e32 v0, 31, v0 +; GCN2s-NEXT: v_ashrrev_i32_e32 v0, 1, v0 +; GCN2s-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN2s-NEXT: s_endpgm %x = load i32, ptr addrspace(1) %in, align 4 %shl = shl i32 %x, 31 %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %shl, i32 1, i32 31) @@ -79,6 +298,25 @@ ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm define amdgpu_kernel void 
@bfe_i32_test_7(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; GCN1-LABEL: bfe_i32_test_7: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_mov_b32 s3, 0xf000 +; GCN1-NEXT: s_mov_b32 s2, -1 +; GCN1-NEXT: v_mov_b32_e32 v0, 0 +; GCN1-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN1-NEXT: s_endpgm +; +; GCN2s-LABEL: bfe_i32_test_7: +; GCN2s: ; %bb.0: +; GCN2s-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2s-NEXT: s_waitcnt lgkmcnt(0) +; GCN2s-NEXT: s_mov_b32 s3, 0xf000 +; GCN2s-NEXT: s_mov_b32 s2, -1 +; GCN2s-NEXT: v_mov_b32_e32 v0, 0 +; GCN2s-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN2s-NEXT: s_endpgm %x = load i32, ptr addrspace(1) %in, align 4 %shl = shl i32 %x, 31 %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %shl, i32 0, i32 31) @@ -91,6 +329,41 @@ ; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 1 ; GCN: s_endpgm define amdgpu_kernel void @bfe_i32_test_8(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; GCN1-LABEL: bfe_i32_test_8: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_mov_b32 s10, s6 +; GCN1-NEXT: s_mov_b32 s11, s7 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_mov_b32 s8, s2 +; GCN1-NEXT: s_mov_b32 s9, s3 +; GCN1-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GCN1-NEXT: s_mov_b32 s4, s0 +; GCN1-NEXT: s_mov_b32 s5, s1 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_bfe_i32 v0, v0, 0, 1 +; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN1-NEXT: s_endpgm +; +; GCN2s-LABEL: bfe_i32_test_8: +; GCN2s: ; %bb.0: +; GCN2s-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2s-NEXT: s_mov_b32 s7, 0xf000 +; GCN2s-NEXT: s_mov_b32 s6, -1 +; GCN2s-NEXT: s_mov_b32 s10, s6 +; GCN2s-NEXT: s_mov_b32 s11, s7 +; GCN2s-NEXT: s_waitcnt lgkmcnt(0) +; GCN2s-NEXT: s_mov_b32 s8, s2 +; GCN2s-NEXT: s_mov_b32 s9, s3 +; GCN2s-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GCN2s-NEXT: 
s_mov_b32 s4, s0 +; GCN2s-NEXT: s_mov_b32 s5, s1 +; GCN2s-NEXT: s_waitcnt vmcnt(0) +; GCN2s-NEXT: v_bfe_i32 v0, v0, 0, 1 +; GCN2s-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN2s-NEXT: s_endpgm %x = load i32, ptr addrspace(1) %in, align 4 %shl = shl i32 %x, 31 %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %shl, i32 31, i32 1) @@ -104,6 +377,41 @@ ; GCN-NOT: {{[^@]}}bfe ; GCN: s_endpgm define amdgpu_kernel void @bfe_i32_test_9(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; GCN1-LABEL: bfe_i32_test_9: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_mov_b32 s10, s6 +; GCN1-NEXT: s_mov_b32 s11, s7 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_mov_b32 s8, s2 +; GCN1-NEXT: s_mov_b32 s9, s3 +; GCN1-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GCN1-NEXT: s_mov_b32 s4, s0 +; GCN1-NEXT: s_mov_b32 s5, s1 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_ashrrev_i32_e32 v0, 31, v0 +; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN1-NEXT: s_endpgm +; +; GCN2s-LABEL: bfe_i32_test_9: +; GCN2s: ; %bb.0: +; GCN2s-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2s-NEXT: s_mov_b32 s7, 0xf000 +; GCN2s-NEXT: s_mov_b32 s6, -1 +; GCN2s-NEXT: s_mov_b32 s10, s6 +; GCN2s-NEXT: s_mov_b32 s11, s7 +; GCN2s-NEXT: s_waitcnt lgkmcnt(0) +; GCN2s-NEXT: s_mov_b32 s8, s2 +; GCN2s-NEXT: s_mov_b32 s9, s3 +; GCN2s-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GCN2s-NEXT: s_mov_b32 s4, s0 +; GCN2s-NEXT: s_mov_b32 s5, s1 +; GCN2s-NEXT: s_waitcnt vmcnt(0) +; GCN2s-NEXT: v_ashrrev_i32_e32 v0, 31, v0 +; GCN2s-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN2s-NEXT: s_endpgm %x = load i32, ptr addrspace(1) %in, align 4 %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %x, i32 31, i32 1) store i32 %bfe, ptr addrspace(1) %out, align 4 @@ -116,6 +424,41 @@ ; GCN-NOT: {{[^@]}}bfe ; GCN: s_endpgm define amdgpu_kernel void @bfe_i32_test_10(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; 
GCN1-LABEL: bfe_i32_test_10: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_mov_b32 s10, s6 +; GCN1-NEXT: s_mov_b32 s11, s7 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_mov_b32 s8, s2 +; GCN1-NEXT: s_mov_b32 s9, s3 +; GCN1-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GCN1-NEXT: s_mov_b32 s4, s0 +; GCN1-NEXT: s_mov_b32 s5, s1 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_ashrrev_i32_e32 v0, 1, v0 +; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN1-NEXT: s_endpgm +; +; GCN2s-LABEL: bfe_i32_test_10: +; GCN2s: ; %bb.0: +; GCN2s-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2s-NEXT: s_mov_b32 s7, 0xf000 +; GCN2s-NEXT: s_mov_b32 s6, -1 +; GCN2s-NEXT: s_mov_b32 s10, s6 +; GCN2s-NEXT: s_mov_b32 s11, s7 +; GCN2s-NEXT: s_waitcnt lgkmcnt(0) +; GCN2s-NEXT: s_mov_b32 s8, s2 +; GCN2s-NEXT: s_mov_b32 s9, s3 +; GCN2s-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GCN2s-NEXT: s_mov_b32 s4, s0 +; GCN2s-NEXT: s_mov_b32 s5, s1 +; GCN2s-NEXT: s_waitcnt vmcnt(0) +; GCN2s-NEXT: v_ashrrev_i32_e32 v0, 1, v0 +; GCN2s-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN2s-NEXT: s_endpgm %x = load i32, ptr addrspace(1) %in, align 4 %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %x, i32 1, i32 31) store i32 %bfe, ptr addrspace(1) %out, align 4 @@ -128,6 +471,41 @@ ; GCN-NOT: {{[^@]}}bfe ; GCN: s_endpgm define amdgpu_kernel void @bfe_i32_test_11(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; GCN1-LABEL: bfe_i32_test_11: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_mov_b32 s10, s6 +; GCN1-NEXT: s_mov_b32 s11, s7 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_mov_b32 s8, s2 +; GCN1-NEXT: s_mov_b32 s9, s3 +; GCN1-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GCN1-NEXT: s_mov_b32 s4, s0 +; GCN1-NEXT: s_mov_b32 s5, s1 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: 
v_ashrrev_i32_e32 v0, 8, v0 +; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN1-NEXT: s_endpgm +; +; GCN2s-LABEL: bfe_i32_test_11: +; GCN2s: ; %bb.0: +; GCN2s-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2s-NEXT: s_mov_b32 s7, 0xf000 +; GCN2s-NEXT: s_mov_b32 s6, -1 +; GCN2s-NEXT: s_mov_b32 s10, s6 +; GCN2s-NEXT: s_mov_b32 s11, s7 +; GCN2s-NEXT: s_waitcnt lgkmcnt(0) +; GCN2s-NEXT: s_mov_b32 s8, s2 +; GCN2s-NEXT: s_mov_b32 s9, s3 +; GCN2s-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GCN2s-NEXT: s_mov_b32 s4, s0 +; GCN2s-NEXT: s_mov_b32 s5, s1 +; GCN2s-NEXT: s_waitcnt vmcnt(0) +; GCN2s-NEXT: v_ashrrev_i32_e32 v0, 8, v0 +; GCN2s-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN2s-NEXT: s_endpgm %x = load i32, ptr addrspace(1) %in, align 4 %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %x, i32 8, i32 24) store i32 %bfe, ptr addrspace(1) %out, align 4 @@ -140,6 +518,41 @@ ; GCN-NOT: {{[^@]}}bfe ; GCN: s_endpgm define amdgpu_kernel void @bfe_i32_test_12(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; GCN1-LABEL: bfe_i32_test_12: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_mov_b32 s10, s6 +; GCN1-NEXT: s_mov_b32 s11, s7 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_mov_b32 s8, s2 +; GCN1-NEXT: s_mov_b32 s9, s3 +; GCN1-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GCN1-NEXT: s_mov_b32 s4, s0 +; GCN1-NEXT: s_mov_b32 s5, s1 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_ashrrev_i32_e32 v0, 24, v0 +; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN1-NEXT: s_endpgm +; +; GCN2s-LABEL: bfe_i32_test_12: +; GCN2s: ; %bb.0: +; GCN2s-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2s-NEXT: s_mov_b32 s7, 0xf000 +; GCN2s-NEXT: s_mov_b32 s6, -1 +; GCN2s-NEXT: s_mov_b32 s10, s6 +; GCN2s-NEXT: s_mov_b32 s11, s7 +; GCN2s-NEXT: s_waitcnt lgkmcnt(0) +; GCN2s-NEXT: s_mov_b32 s8, s2 +; GCN2s-NEXT: s_mov_b32 s9, s3 +; GCN2s-NEXT: buffer_load_dword v0, off, 
s[8:11], 0 +; GCN2s-NEXT: s_mov_b32 s4, s0 +; GCN2s-NEXT: s_mov_b32 s5, s1 +; GCN2s-NEXT: s_waitcnt vmcnt(0) +; GCN2s-NEXT: v_ashrrev_i32_e32 v0, 24, v0 +; GCN2s-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN2s-NEXT: s_endpgm %x = load i32, ptr addrspace(1) %in, align 4 %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %x, i32 24, i32 8) store i32 %bfe, ptr addrspace(1) %out, align 4 @@ -151,6 +564,41 @@ ; GCN-NOT: {{[^@]}}bfe ; GCN: s_endpgm define amdgpu_kernel void @bfe_i32_test_13(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; GCN1-LABEL: bfe_i32_test_13: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_mov_b32 s10, s6 +; GCN1-NEXT: s_mov_b32 s11, s7 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_mov_b32 s8, s2 +; GCN1-NEXT: s_mov_b32 s9, s3 +; GCN1-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GCN1-NEXT: s_mov_b32 s4, s0 +; GCN1-NEXT: s_mov_b32 s5, s1 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_ashrrev_i32_e32 v0, 31, v0 +; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN1-NEXT: s_endpgm +; +; GCN2s-LABEL: bfe_i32_test_13: +; GCN2s: ; %bb.0: +; GCN2s-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2s-NEXT: s_mov_b32 s7, 0xf000 +; GCN2s-NEXT: s_mov_b32 s6, -1 +; GCN2s-NEXT: s_mov_b32 s10, s6 +; GCN2s-NEXT: s_mov_b32 s11, s7 +; GCN2s-NEXT: s_waitcnt lgkmcnt(0) +; GCN2s-NEXT: s_mov_b32 s8, s2 +; GCN2s-NEXT: s_mov_b32 s9, s3 +; GCN2s-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GCN2s-NEXT: s_mov_b32 s4, s0 +; GCN2s-NEXT: s_mov_b32 s5, s1 +; GCN2s-NEXT: s_waitcnt vmcnt(0) +; GCN2s-NEXT: v_ashrrev_i32_e32 v0, 31, v0 +; GCN2s-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN2s-NEXT: s_endpgm %x = load i32, ptr addrspace(1) %in, align 4 %shl = ashr i32 %x, 31 %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %shl, i32 31, i32 1) @@ -162,6 +610,25 @@ ; GCN-NOT: {{[^@]}}bfe ; GCN: s_endpgm define amdgpu_kernel void @bfe_i32_test_14(ptr addrspace(1) %out, ptr 
addrspace(1) %in) #0 { +; GCN1-LABEL: bfe_i32_test_14: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_mov_b32 s3, 0xf000 +; GCN1-NEXT: s_mov_b32 s2, -1 +; GCN1-NEXT: v_mov_b32_e32 v0, 0 +; GCN1-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN1-NEXT: s_endpgm +; +; GCN2s-LABEL: bfe_i32_test_14: +; GCN2s: ; %bb.0: +; GCN2s-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2s-NEXT: s_waitcnt lgkmcnt(0) +; GCN2s-NEXT: s_mov_b32 s3, 0xf000 +; GCN2s-NEXT: s_mov_b32 s2, -1 +; GCN2s-NEXT: v_mov_b32_e32 v0, 0 +; GCN2s-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN2s-NEXT: s_endpgm %x = load i32, ptr addrspace(1) %in, align 4 %shl = lshr i32 %x, 31 %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %shl, i32 31, i32 1) @@ -174,6 +641,25 @@ ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm define amdgpu_kernel void @bfe_i32_constant_fold_test_0(ptr addrspace(1) %out) #0 { +; GCN1-LABEL: bfe_i32_constant_fold_test_0: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN1-NEXT: s_mov_b32 s3, 0xf000 +; GCN1-NEXT: s_mov_b32 s2, -1 +; GCN1-NEXT: v_mov_b32_e32 v0, 0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN1-NEXT: s_endpgm +; +; GCN2s-LABEL: bfe_i32_constant_fold_test_0: +; GCN2s: ; %bb.0: +; GCN2s-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN2s-NEXT: s_mov_b32 s3, 0xf000 +; GCN2s-NEXT: s_mov_b32 s2, -1 +; GCN2s-NEXT: v_mov_b32_e32 v0, 0 +; GCN2s-NEXT: s_waitcnt lgkmcnt(0) +; GCN2s-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN2s-NEXT: s_endpgm %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 0, i32 0, i32 0) store i32 %bfe_i32, ptr addrspace(1) %out, align 4 ret void @@ -185,6 +671,25 @@ ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm define amdgpu_kernel void @bfe_i32_constant_fold_test_1(ptr addrspace(1) %out) #0 { +; GCN1-LABEL: bfe_i32_constant_fold_test_1: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; 
GCN1-NEXT: s_mov_b32 s3, 0xf000 +; GCN1-NEXT: s_mov_b32 s2, -1 +; GCN1-NEXT: v_mov_b32_e32 v0, 0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN1-NEXT: s_endpgm +; +; GCN2s-LABEL: bfe_i32_constant_fold_test_1: +; GCN2s: ; %bb.0: +; GCN2s-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN2s-NEXT: s_mov_b32 s3, 0xf000 +; GCN2s-NEXT: s_mov_b32 s2, -1 +; GCN2s-NEXT: v_mov_b32_e32 v0, 0 +; GCN2s-NEXT: s_waitcnt lgkmcnt(0) +; GCN2s-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN2s-NEXT: s_endpgm %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 12334, i32 0, i32 0) store i32 %bfe_i32, ptr addrspace(1) %out, align 4 ret void @@ -196,6 +701,25 @@ ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm define amdgpu_kernel void @bfe_i32_constant_fold_test_2(ptr addrspace(1) %out) #0 { +; GCN1-LABEL: bfe_i32_constant_fold_test_2: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN1-NEXT: s_mov_b32 s3, 0xf000 +; GCN1-NEXT: s_mov_b32 s2, -1 +; GCN1-NEXT: v_mov_b32_e32 v0, 0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN1-NEXT: s_endpgm +; +; GCN2s-LABEL: bfe_i32_constant_fold_test_2: +; GCN2s: ; %bb.0: +; GCN2s-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN2s-NEXT: s_mov_b32 s3, 0xf000 +; GCN2s-NEXT: s_mov_b32 s2, -1 +; GCN2s-NEXT: v_mov_b32_e32 v0, 0 +; GCN2s-NEXT: s_waitcnt lgkmcnt(0) +; GCN2s-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN2s-NEXT: s_endpgm %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 0, i32 0, i32 1) store i32 %bfe_i32, ptr addrspace(1) %out, align 4 ret void @@ -207,6 +731,25 @@ ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm define amdgpu_kernel void @bfe_i32_constant_fold_test_3(ptr addrspace(1) %out) #0 { +; GCN1-LABEL: bfe_i32_constant_fold_test_3: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN1-NEXT: s_mov_b32 s3, 0xf000 +; GCN1-NEXT: s_mov_b32 s2, -1 +; GCN1-NEXT: v_mov_b32_e32 v0, -1 +; GCN1-NEXT: s_waitcnt 
lgkmcnt(0) +; GCN1-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN1-NEXT: s_endpgm +; +; GCN2s-LABEL: bfe_i32_constant_fold_test_3: +; GCN2s: ; %bb.0: +; GCN2s-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN2s-NEXT: s_mov_b32 s3, 0xf000 +; GCN2s-NEXT: s_mov_b32 s2, -1 +; GCN2s-NEXT: v_mov_b32_e32 v0, -1 +; GCN2s-NEXT: s_waitcnt lgkmcnt(0) +; GCN2s-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN2s-NEXT: s_endpgm %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 1, i32 0, i32 1) store i32 %bfe_i32, ptr addrspace(1) %out, align 4 ret void @@ -218,6 +761,25 @@ ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm define amdgpu_kernel void @bfe_i32_constant_fold_test_4(ptr addrspace(1) %out) #0 { +; GCN1-LABEL: bfe_i32_constant_fold_test_4: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN1-NEXT: s_mov_b32 s3, 0xf000 +; GCN1-NEXT: s_mov_b32 s2, -1 +; GCN1-NEXT: v_mov_b32_e32 v0, -1 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN1-NEXT: s_endpgm +; +; GCN2s-LABEL: bfe_i32_constant_fold_test_4: +; GCN2s: ; %bb.0: +; GCN2s-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN2s-NEXT: s_mov_b32 s3, 0xf000 +; GCN2s-NEXT: s_mov_b32 s2, -1 +; GCN2s-NEXT: v_mov_b32_e32 v0, -1 +; GCN2s-NEXT: s_waitcnt lgkmcnt(0) +; GCN2s-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN2s-NEXT: s_endpgm %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 4294967295, i32 0, i32 1) store i32 %bfe_i32, ptr addrspace(1) %out, align 4 ret void @@ -229,6 +791,25 @@ ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm define amdgpu_kernel void @bfe_i32_constant_fold_test_5(ptr addrspace(1) %out) #0 { +; GCN1-LABEL: bfe_i32_constant_fold_test_5: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN1-NEXT: s_mov_b32 s3, 0xf000 +; GCN1-NEXT: s_mov_b32 s2, -1 +; GCN1-NEXT: v_mov_b32_e32 v0, -1 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN1-NEXT: s_endpgm +; +; GCN2s-LABEL: 
bfe_i32_constant_fold_test_5: +; GCN2s: ; %bb.0: +; GCN2s-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN2s-NEXT: s_mov_b32 s3, 0xf000 +; GCN2s-NEXT: s_mov_b32 s2, -1 +; GCN2s-NEXT: v_mov_b32_e32 v0, -1 +; GCN2s-NEXT: s_waitcnt lgkmcnt(0) +; GCN2s-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN2s-NEXT: s_endpgm %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 128, i32 7, i32 1) store i32 %bfe_i32, ptr addrspace(1) %out, align 4 ret void @@ -240,6 +821,25 @@ ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm define amdgpu_kernel void @bfe_i32_constant_fold_test_6(ptr addrspace(1) %out) #0 { +; GCN1-LABEL: bfe_i32_constant_fold_test_6: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN1-NEXT: s_mov_b32 s3, 0xf000 +; GCN1-NEXT: s_mov_b32 s2, -1 +; GCN1-NEXT: v_mov_b32_e32 v0, 0xffffff80 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN1-NEXT: s_endpgm +; +; GCN2s-LABEL: bfe_i32_constant_fold_test_6: +; GCN2s: ; %bb.0: +; GCN2s-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN2s-NEXT: s_mov_b32 s3, 0xf000 +; GCN2s-NEXT: s_mov_b32 s2, -1 +; GCN2s-NEXT: v_mov_b32_e32 v0, 0xffffff80 +; GCN2s-NEXT: s_waitcnt lgkmcnt(0) +; GCN2s-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN2s-NEXT: s_endpgm %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 128, i32 0, i32 8) store i32 %bfe_i32, ptr addrspace(1) %out, align 4 ret void @@ -251,6 +851,25 @@ ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm define amdgpu_kernel void @bfe_i32_constant_fold_test_7(ptr addrspace(1) %out) #0 { +; GCN1-LABEL: bfe_i32_constant_fold_test_7: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN1-NEXT: s_mov_b32 s3, 0xf000 +; GCN1-NEXT: s_mov_b32 s2, -1 +; GCN1-NEXT: v_mov_b32_e32 v0, 0x7f +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN1-NEXT: s_endpgm +; +; GCN2s-LABEL: bfe_i32_constant_fold_test_7: +; GCN2s: ; %bb.0: +; GCN2s-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 
+; GCN2s-NEXT: s_mov_b32 s3, 0xf000 +; GCN2s-NEXT: s_mov_b32 s2, -1 +; GCN2s-NEXT: v_mov_b32_e32 v0, 0x7f +; GCN2s-NEXT: s_waitcnt lgkmcnt(0) +; GCN2s-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN2s-NEXT: s_endpgm %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 127, i32 0, i32 8) store i32 %bfe_i32, ptr addrspace(1) %out, align 4 ret void @@ -262,6 +881,25 @@ ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm define amdgpu_kernel void @bfe_i32_constant_fold_test_8(ptr addrspace(1) %out) #0 { +; GCN1-LABEL: bfe_i32_constant_fold_test_8: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN1-NEXT: s_mov_b32 s3, 0xf000 +; GCN1-NEXT: s_mov_b32 s2, -1 +; GCN1-NEXT: v_mov_b32_e32 v0, 1 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN1-NEXT: s_endpgm +; +; GCN2s-LABEL: bfe_i32_constant_fold_test_8: +; GCN2s: ; %bb.0: +; GCN2s-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN2s-NEXT: s_mov_b32 s3, 0xf000 +; GCN2s-NEXT: s_mov_b32 s2, -1 +; GCN2s-NEXT: v_mov_b32_e32 v0, 1 +; GCN2s-NEXT: s_waitcnt lgkmcnt(0) +; GCN2s-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN2s-NEXT: s_endpgm %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 127, i32 6, i32 8) store i32 %bfe_i32, ptr addrspace(1) %out, align 4 ret void @@ -273,6 +911,25 @@ ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm define amdgpu_kernel void @bfe_i32_constant_fold_test_9(ptr addrspace(1) %out) #0 { +; GCN1-LABEL: bfe_i32_constant_fold_test_9: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN1-NEXT: s_mov_b32 s3, 0xf000 +; GCN1-NEXT: s_mov_b32 s2, -1 +; GCN1-NEXT: v_mov_b32_e32 v0, 1 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN1-NEXT: s_endpgm +; +; GCN2s-LABEL: bfe_i32_constant_fold_test_9: +; GCN2s: ; %bb.0: +; GCN2s-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN2s-NEXT: s_mov_b32 s3, 0xf000 +; GCN2s-NEXT: s_mov_b32 s2, -1 +; GCN2s-NEXT: v_mov_b32_e32 v0, 1 +; GCN2s-NEXT: 
s_waitcnt lgkmcnt(0) +; GCN2s-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN2s-NEXT: s_endpgm %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 65536, i32 16, i32 8) store i32 %bfe_i32, ptr addrspace(1) %out, align 4 ret void @@ -284,6 +941,25 @@ ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm define amdgpu_kernel void @bfe_i32_constant_fold_test_10(ptr addrspace(1) %out) #0 { +; GCN1-LABEL: bfe_i32_constant_fold_test_10: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN1-NEXT: s_mov_b32 s3, 0xf000 +; GCN1-NEXT: s_mov_b32 s2, -1 +; GCN1-NEXT: v_mov_b32_e32 v0, 0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN1-NEXT: s_endpgm +; +; GCN2s-LABEL: bfe_i32_constant_fold_test_10: +; GCN2s: ; %bb.0: +; GCN2s-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN2s-NEXT: s_mov_b32 s3, 0xf000 +; GCN2s-NEXT: s_mov_b32 s2, -1 +; GCN2s-NEXT: v_mov_b32_e32 v0, 0 +; GCN2s-NEXT: s_waitcnt lgkmcnt(0) +; GCN2s-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN2s-NEXT: s_endpgm %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 65535, i32 16, i32 16) store i32 %bfe_i32, ptr addrspace(1) %out, align 4 ret void @@ -295,6 +971,25 @@ ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm define amdgpu_kernel void @bfe_i32_constant_fold_test_11(ptr addrspace(1) %out) #0 { +; GCN1-LABEL: bfe_i32_constant_fold_test_11: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN1-NEXT: s_mov_b32 s3, 0xf000 +; GCN1-NEXT: s_mov_b32 s2, -1 +; GCN1-NEXT: v_mov_b32_e32 v0, -6 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN1-NEXT: s_endpgm +; +; GCN2s-LABEL: bfe_i32_constant_fold_test_11: +; GCN2s: ; %bb.0: +; GCN2s-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN2s-NEXT: s_mov_b32 s3, 0xf000 +; GCN2s-NEXT: s_mov_b32 s2, -1 +; GCN2s-NEXT: v_mov_b32_e32 v0, -6 +; GCN2s-NEXT: s_waitcnt lgkmcnt(0) +; GCN2s-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN2s-NEXT: s_endpgm %bfe_i32 
= call i32 @llvm.amdgcn.sbfe.i32(i32 160, i32 4, i32 4) store i32 %bfe_i32, ptr addrspace(1) %out, align 4 ret void @@ -306,6 +1001,25 @@ ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm define amdgpu_kernel void @bfe_i32_constant_fold_test_12(ptr addrspace(1) %out) #0 { +; GCN1-LABEL: bfe_i32_constant_fold_test_12: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN1-NEXT: s_mov_b32 s3, 0xf000 +; GCN1-NEXT: s_mov_b32 s2, -1 +; GCN1-NEXT: v_mov_b32_e32 v0, 0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN1-NEXT: s_endpgm +; +; GCN2s-LABEL: bfe_i32_constant_fold_test_12: +; GCN2s: ; %bb.0: +; GCN2s-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN2s-NEXT: s_mov_b32 s3, 0xf000 +; GCN2s-NEXT: s_mov_b32 s2, -1 +; GCN2s-NEXT: v_mov_b32_e32 v0, 0 +; GCN2s-NEXT: s_waitcnt lgkmcnt(0) +; GCN2s-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN2s-NEXT: s_endpgm %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 160, i32 31, i32 1) store i32 %bfe_i32, ptr addrspace(1) %out, align 4 ret void @@ -317,6 +1031,25 @@ ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm define amdgpu_kernel void @bfe_i32_constant_fold_test_13(ptr addrspace(1) %out) #0 { +; GCN1-LABEL: bfe_i32_constant_fold_test_13: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN1-NEXT: s_mov_b32 s3, 0xf000 +; GCN1-NEXT: s_mov_b32 s2, -1 +; GCN1-NEXT: v_mov_b32_e32 v0, 1 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN1-NEXT: s_endpgm +; +; GCN2s-LABEL: bfe_i32_constant_fold_test_13: +; GCN2s: ; %bb.0: +; GCN2s-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN2s-NEXT: s_mov_b32 s3, 0xf000 +; GCN2s-NEXT: s_mov_b32 s2, -1 +; GCN2s-NEXT: v_mov_b32_e32 v0, 1 +; GCN2s-NEXT: s_waitcnt lgkmcnt(0) +; GCN2s-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN2s-NEXT: s_endpgm %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 131070, i32 16, i32 16) store i32 %bfe_i32, ptr addrspace(1) %out, align 4 
ret void @@ -328,6 +1061,25 @@ ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm define amdgpu_kernel void @bfe_i32_constant_fold_test_14(ptr addrspace(1) %out) #0 { +; GCN1-LABEL: bfe_i32_constant_fold_test_14: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN1-NEXT: s_mov_b32 s3, 0xf000 +; GCN1-NEXT: s_mov_b32 s2, -1 +; GCN1-NEXT: v_mov_b32_e32 v0, 40 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN1-NEXT: s_endpgm +; +; GCN2s-LABEL: bfe_i32_constant_fold_test_14: +; GCN2s: ; %bb.0: +; GCN2s-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN2s-NEXT: s_mov_b32 s3, 0xf000 +; GCN2s-NEXT: s_mov_b32 s2, -1 +; GCN2s-NEXT: v_mov_b32_e32 v0, 40 +; GCN2s-NEXT: s_waitcnt lgkmcnt(0) +; GCN2s-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN2s-NEXT: s_endpgm %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 160, i32 2, i32 30) store i32 %bfe_i32, ptr addrspace(1) %out, align 4 ret void @@ -339,6 +1091,25 @@ ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm define amdgpu_kernel void @bfe_i32_constant_fold_test_15(ptr addrspace(1) %out) #0 { +; GCN1-LABEL: bfe_i32_constant_fold_test_15: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN1-NEXT: s_mov_b32 s3, 0xf000 +; GCN1-NEXT: s_mov_b32 s2, -1 +; GCN1-NEXT: v_mov_b32_e32 v0, 10 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN1-NEXT: s_endpgm +; +; GCN2s-LABEL: bfe_i32_constant_fold_test_15: +; GCN2s: ; %bb.0: +; GCN2s-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN2s-NEXT: s_mov_b32 s3, 0xf000 +; GCN2s-NEXT: s_mov_b32 s2, -1 +; GCN2s-NEXT: v_mov_b32_e32 v0, 10 +; GCN2s-NEXT: s_waitcnt lgkmcnt(0) +; GCN2s-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN2s-NEXT: s_endpgm %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 160, i32 4, i32 28) store i32 %bfe_i32, ptr addrspace(1) %out, align 4 ret void @@ -350,6 +1121,25 @@ ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm define amdgpu_kernel 
void @bfe_i32_constant_fold_test_16(ptr addrspace(1) %out) #0 { +; GCN1-LABEL: bfe_i32_constant_fold_test_16: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN1-NEXT: s_mov_b32 s3, 0xf000 +; GCN1-NEXT: s_mov_b32 s2, -1 +; GCN1-NEXT: v_mov_b32_e32 v0, -1 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN1-NEXT: s_endpgm +; +; GCN2s-LABEL: bfe_i32_constant_fold_test_16: +; GCN2s: ; %bb.0: +; GCN2s-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN2s-NEXT: s_mov_b32 s3, 0xf000 +; GCN2s-NEXT: s_mov_b32 s2, -1 +; GCN2s-NEXT: v_mov_b32_e32 v0, -1 +; GCN2s-NEXT: s_waitcnt lgkmcnt(0) +; GCN2s-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN2s-NEXT: s_endpgm %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 4294967295, i32 1, i32 7) store i32 %bfe_i32, ptr addrspace(1) %out, align 4 ret void @@ -361,6 +1151,25 @@ ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm define amdgpu_kernel void @bfe_i32_constant_fold_test_17(ptr addrspace(1) %out) #0 { +; GCN1-LABEL: bfe_i32_constant_fold_test_17: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN1-NEXT: s_mov_b32 s3, 0xf000 +; GCN1-NEXT: s_mov_b32 s2, -1 +; GCN1-NEXT: v_mov_b32_e32 v0, 0x7f +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN1-NEXT: s_endpgm +; +; GCN2s-LABEL: bfe_i32_constant_fold_test_17: +; GCN2s: ; %bb.0: +; GCN2s-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN2s-NEXT: s_mov_b32 s3, 0xf000 +; GCN2s-NEXT: s_mov_b32 s2, -1 +; GCN2s-NEXT: v_mov_b32_e32 v0, 0x7f +; GCN2s-NEXT: s_waitcnt lgkmcnt(0) +; GCN2s-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN2s-NEXT: s_endpgm %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 255, i32 1, i32 31) store i32 %bfe_i32, ptr addrspace(1) %out, align 4 ret void @@ -372,6 +1181,25 @@ ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm define amdgpu_kernel void @bfe_i32_constant_fold_test_18(ptr addrspace(1) %out) #0 { +; GCN1-LABEL: 
bfe_i32_constant_fold_test_18: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN1-NEXT: s_mov_b32 s3, 0xf000 +; GCN1-NEXT: s_mov_b32 s2, -1 +; GCN1-NEXT: v_mov_b32_e32 v0, 0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN1-NEXT: s_endpgm +; +; GCN2s-LABEL: bfe_i32_constant_fold_test_18: +; GCN2s: ; %bb.0: +; GCN2s-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN2s-NEXT: s_mov_b32 s3, 0xf000 +; GCN2s-NEXT: s_mov_b32 s2, -1 +; GCN2s-NEXT: v_mov_b32_e32 v0, 0 +; GCN2s-NEXT: s_waitcnt lgkmcnt(0) +; GCN2s-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN2s-NEXT: s_endpgm %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 255, i32 31, i32 1) store i32 %bfe_i32, ptr addrspace(1) %out, align 4 ret void @@ -384,6 +1212,41 @@ ; GCN: v_bfe_i32 [[BFE:v[0-9]+]], [[LOAD]], 0, 24 ; GCN: buffer_store_dword [[BFE]], define amdgpu_kernel void @bfe_sext_in_reg_i24(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; GCN1-LABEL: bfe_sext_in_reg_i24: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_mov_b32 s10, s6 +; GCN1-NEXT: s_mov_b32 s11, s7 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_mov_b32 s8, s2 +; GCN1-NEXT: s_mov_b32 s9, s3 +; GCN1-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GCN1-NEXT: s_mov_b32 s4, s0 +; GCN1-NEXT: s_mov_b32 s5, s1 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_bfe_i32 v0, v0, 0, 24 +; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN1-NEXT: s_endpgm +; +; GCN2s-LABEL: bfe_sext_in_reg_i24: +; GCN2s: ; %bb.0: +; GCN2s-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2s-NEXT: s_mov_b32 s7, 0xf000 +; GCN2s-NEXT: s_mov_b32 s6, -1 +; GCN2s-NEXT: s_mov_b32 s10, s6 +; GCN2s-NEXT: s_mov_b32 s11, s7 +; GCN2s-NEXT: s_waitcnt lgkmcnt(0) +; GCN2s-NEXT: s_mov_b32 s8, s2 +; GCN2s-NEXT: s_mov_b32 s9, s3 +; GCN2s-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GCN2s-NEXT: s_mov_b32 s4, s0 +; 
GCN2s-NEXT: s_mov_b32 s5, s1 +; GCN2s-NEXT: s_waitcnt vmcnt(0) +; GCN2s-NEXT: v_bfe_i32 v0, v0, 0, 24 +; GCN2s-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN2s-NEXT: s_endpgm %x = load i32, ptr addrspace(1) %in, align 4 %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %x, i32 0, i32 24) %shl = shl i32 %bfe, 8 @@ -400,6 +1263,47 @@ ; GCN: v_ashrrev_i32_e32 [[TMP2:v[0-9]+]], 1, [[TMP1]] ; GCN: buffer_store_dword [[TMP2]] define amdgpu_kernel void @simplify_demanded_bfe_sdiv(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; GCN1-LABEL: simplify_demanded_bfe_sdiv: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_mov_b32 s10, s6 +; GCN1-NEXT: s_mov_b32 s11, s7 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_mov_b32 s8, s2 +; GCN1-NEXT: s_mov_b32 s9, s3 +; GCN1-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GCN1-NEXT: s_mov_b32 s4, s0 +; GCN1-NEXT: s_mov_b32 s5, s1 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_bfe_i32 v0, v0, 1, 16 +; GCN1-NEXT: v_lshrrev_b32_e32 v1, 31, v0 +; GCN1-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GCN1-NEXT: v_ashrrev_i32_e32 v0, 1, v0 +; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN1-NEXT: s_endpgm +; +; GCN2s-LABEL: simplify_demanded_bfe_sdiv: +; GCN2s: ; %bb.0: +; GCN2s-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2s-NEXT: s_mov_b32 s7, 0xf000 +; GCN2s-NEXT: s_mov_b32 s6, -1 +; GCN2s-NEXT: s_mov_b32 s10, s6 +; GCN2s-NEXT: s_mov_b32 s11, s7 +; GCN2s-NEXT: s_waitcnt lgkmcnt(0) +; GCN2s-NEXT: s_mov_b32 s8, s2 +; GCN2s-NEXT: s_mov_b32 s9, s3 +; GCN2s-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GCN2s-NEXT: s_mov_b32 s4, s0 +; GCN2s-NEXT: s_mov_b32 s5, s1 +; GCN2s-NEXT: s_waitcnt vmcnt(0) +; GCN2s-NEXT: v_bfe_i32 v0, v0, 1, 16 +; GCN2s-NEXT: v_lshrrev_b32_e32 v1, 31, v0 +; GCN2s-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GCN2s-NEXT: v_ashrrev_i32_e32 v0, 1, v0 +; GCN2s-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN2s-NEXT: s_endpgm 
%src = load i32, ptr addrspace(1) %in, align 4 %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %src, i32 1, i32 16) %div = sdiv i32 %bfe, 2 @@ -411,6 +1315,25 @@ ; GCN-NOT: {{[^@]}}bfe ; GCN: s_endpgm define amdgpu_kernel void @bfe_0_width(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #0 { +; GCN1-LABEL: bfe_0_width: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_mov_b32 s3, 0xf000 +; GCN1-NEXT: s_mov_b32 s2, -1 +; GCN1-NEXT: v_mov_b32_e32 v0, 0 +; GCN1-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN1-NEXT: s_endpgm +; +; GCN2s-LABEL: bfe_0_width: +; GCN2s: ; %bb.0: +; GCN2s-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2s-NEXT: s_waitcnt lgkmcnt(0) +; GCN2s-NEXT: s_mov_b32 s3, 0xf000 +; GCN2s-NEXT: s_mov_b32 s2, -1 +; GCN2s-NEXT: v_mov_b32_e32 v0, 0 +; GCN2s-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN2s-NEXT: s_endpgm %load = load i32, ptr addrspace(1) %ptr, align 4 %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %load, i32 8, i32 0) store i32 %bfe, ptr addrspace(1) %out, align 4 @@ -422,6 +1345,41 @@ ; GCN-NOT: {{[^@]}}bfe ; GCN: s_endpgm define amdgpu_kernel void @bfe_8_bfe_8(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #0 { +; GCN1-LABEL: bfe_8_bfe_8: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_mov_b32 s10, s6 +; GCN1-NEXT: s_mov_b32 s11, s7 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_mov_b32 s8, s2 +; GCN1-NEXT: s_mov_b32 s9, s3 +; GCN1-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GCN1-NEXT: s_mov_b32 s4, s0 +; GCN1-NEXT: s_mov_b32 s5, s1 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_bfe_i32 v0, v0, 0, 8 +; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN1-NEXT: s_endpgm +; +; GCN2s-LABEL: bfe_8_bfe_8: +; GCN2s: ; %bb.0: +; GCN2s-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2s-NEXT: s_mov_b32 s7, 0xf000 +; GCN2s-NEXT: s_mov_b32 s6, -1 +; GCN2s-NEXT: s_mov_b32 s10, 
s6 +; GCN2s-NEXT: s_mov_b32 s11, s7 +; GCN2s-NEXT: s_waitcnt lgkmcnt(0) +; GCN2s-NEXT: s_mov_b32 s8, s2 +; GCN2s-NEXT: s_mov_b32 s9, s3 +; GCN2s-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GCN2s-NEXT: s_mov_b32 s4, s0 +; GCN2s-NEXT: s_mov_b32 s5, s1 +; GCN2s-NEXT: s_waitcnt vmcnt(0) +; GCN2s-NEXT: v_bfe_i32 v0, v0, 0, 8 +; GCN2s-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN2s-NEXT: s_endpgm %load = load i32, ptr addrspace(1) %ptr, align 4 %bfe0 = call i32 @llvm.amdgcn.sbfe.i32(i32 %load, i32 0, i32 8) %bfe1 = call i32 @llvm.amdgcn.sbfe.i32(i32 %bfe0, i32 0, i32 8) @@ -433,6 +1391,41 @@ ; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8 ; GCN: s_endpgm define amdgpu_kernel void @bfe_8_bfe_16(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #0 { +; GCN1-LABEL: bfe_8_bfe_16: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_mov_b32 s10, s6 +; GCN1-NEXT: s_mov_b32 s11, s7 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_mov_b32 s8, s2 +; GCN1-NEXT: s_mov_b32 s9, s3 +; GCN1-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GCN1-NEXT: s_mov_b32 s4, s0 +; GCN1-NEXT: s_mov_b32 s5, s1 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_bfe_i32 v0, v0, 0, 8 +; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN1-NEXT: s_endpgm +; +; GCN2s-LABEL: bfe_8_bfe_16: +; GCN2s: ; %bb.0: +; GCN2s-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2s-NEXT: s_mov_b32 s7, 0xf000 +; GCN2s-NEXT: s_mov_b32 s6, -1 +; GCN2s-NEXT: s_mov_b32 s10, s6 +; GCN2s-NEXT: s_mov_b32 s11, s7 +; GCN2s-NEXT: s_waitcnt lgkmcnt(0) +; GCN2s-NEXT: s_mov_b32 s8, s2 +; GCN2s-NEXT: s_mov_b32 s9, s3 +; GCN2s-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GCN2s-NEXT: s_mov_b32 s4, s0 +; GCN2s-NEXT: s_mov_b32 s5, s1 +; GCN2s-NEXT: s_waitcnt vmcnt(0) +; GCN2s-NEXT: v_bfe_i32 v0, v0, 0, 8 +; GCN2s-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN2s-NEXT: s_endpgm %load = load i32, ptr addrspace(1) %ptr, align 4 %bfe0 = 
call i32 @llvm.amdgcn.sbfe.i32(i32 %load, i32 0, i32 8) %bfe1 = call i32 @llvm.amdgcn.sbfe.i32(i32 %bfe0, i32 0, i32 16) @@ -446,6 +1439,41 @@ ; GCN-NOT: {{[^@]}}bfe ; GCN: s_endpgm define amdgpu_kernel void @bfe_16_bfe_8(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #0 { +; GCN1-LABEL: bfe_16_bfe_8: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_mov_b32 s10, s6 +; GCN1-NEXT: s_mov_b32 s11, s7 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_mov_b32 s8, s2 +; GCN1-NEXT: s_mov_b32 s9, s3 +; GCN1-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GCN1-NEXT: s_mov_b32 s4, s0 +; GCN1-NEXT: s_mov_b32 s5, s1 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_bfe_i32 v0, v0, 0, 8 +; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN1-NEXT: s_endpgm +; +; GCN2s-LABEL: bfe_16_bfe_8: +; GCN2s: ; %bb.0: +; GCN2s-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2s-NEXT: s_mov_b32 s7, 0xf000 +; GCN2s-NEXT: s_mov_b32 s6, -1 +; GCN2s-NEXT: s_mov_b32 s10, s6 +; GCN2s-NEXT: s_mov_b32 s11, s7 +; GCN2s-NEXT: s_waitcnt lgkmcnt(0) +; GCN2s-NEXT: s_mov_b32 s8, s2 +; GCN2s-NEXT: s_mov_b32 s9, s3 +; GCN2s-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GCN2s-NEXT: s_mov_b32 s4, s0 +; GCN2s-NEXT: s_mov_b32 s5, s1 +; GCN2s-NEXT: s_waitcnt vmcnt(0) +; GCN2s-NEXT: v_bfe_i32 v0, v0, 0, 8 +; GCN2s-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN2s-NEXT: s_endpgm %load = load i32, ptr addrspace(1) %ptr, align 4 %bfe0 = call i32 @llvm.amdgcn.sbfe.i32(i32 %load, i32 0, i32 16) %bfe1 = call i32 @llvm.amdgcn.sbfe.i32(i32 %bfe0, i32 0, i32 8) @@ -459,6 +1487,33 @@ ; GCN-NOT: {{[^@]}}bfe ; GCN: s_endpgm define amdgpu_kernel void @sext_in_reg_i8_to_i32_bfe(ptr addrspace(1) %out, i32 %a, i32 %b) #0 { +; GCN1-LABEL: sext_in_reg_i8_to_i32_bfe: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_add_i32 
s2, s2, s3 +; GCN1-NEXT: s_sext_i32_i8 s2, s2 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_mov_b32 s4, s0 +; GCN1-NEXT: s_mov_b32 s5, s1 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN1-NEXT: s_endpgm +; +; GCN2s-LABEL: sext_in_reg_i8_to_i32_bfe: +; GCN2s: ; %bb.0: +; GCN2s-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2s-NEXT: s_mov_b32 s7, 0xf000 +; GCN2s-NEXT: s_mov_b32 s6, -1 +; GCN2s-NEXT: s_waitcnt lgkmcnt(0) +; GCN2s-NEXT: s_mov_b32 s4, s0 +; GCN2s-NEXT: s_add_i32 s0, s2, s3 +; GCN2s-NEXT: s_sext_i32_i8 s0, s0 +; GCN2s-NEXT: s_mov_b32 s5, s1 +; GCN2s-NEXT: v_mov_b32_e32 v0, s0 +; GCN2s-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN2s-NEXT: s_endpgm %c = add i32 %a, %b ; add to prevent folding into extload %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %c, i32 0, i32 8) %shl = shl i32 %bfe, 24 @@ -469,6 +1524,25 @@ ; GCN-LABEL: {{^}}sext_in_reg_i8_to_i32_bfe_wrong: define amdgpu_kernel void @sext_in_reg_i8_to_i32_bfe_wrong(ptr addrspace(1) %out, i32 %a, i32 %b) #0 { +; GCN1-LABEL: sext_in_reg_i8_to_i32_bfe_wrong: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN1-NEXT: s_mov_b32 s3, 0xf000 +; GCN1-NEXT: s_mov_b32 s2, -1 +; GCN1-NEXT: v_mov_b32_e32 v0, 0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN1-NEXT: s_endpgm +; +; GCN2s-LABEL: sext_in_reg_i8_to_i32_bfe_wrong: +; GCN2s: ; %bb.0: +; GCN2s-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN2s-NEXT: s_mov_b32 s3, 0xf000 +; GCN2s-NEXT: s_mov_b32 s2, -1 +; GCN2s-NEXT: v_mov_b32_e32 v0, 0 +; GCN2s-NEXT: s_waitcnt lgkmcnt(0) +; GCN2s-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN2s-NEXT: s_endpgm %c = add i32 %a, %b ; add to prevent folding into extload %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %c, i32 8, i32 0) %shl = shl i32 %bfe, 24 @@ -482,6 +1556,39 @@ ; GCN-NOT: {{[^@]}}bfe ; GCN: s_endpgm define amdgpu_kernel void @sextload_i8_to_i32_bfe(ptr addrspace(1) %out, ptr addrspace(1) %ptr) 
#0 { +; GCN1-LABEL: sextload_i8_to_i32_bfe: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_mov_b32 s10, s6 +; GCN1-NEXT: s_mov_b32 s11, s7 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_mov_b32 s8, s2 +; GCN1-NEXT: s_mov_b32 s9, s3 +; GCN1-NEXT: buffer_load_sbyte v0, off, s[8:11], 0 +; GCN1-NEXT: s_mov_b32 s4, s0 +; GCN1-NEXT: s_mov_b32 s5, s1 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN1-NEXT: s_endpgm +; +; GCN2s-LABEL: sextload_i8_to_i32_bfe: +; GCN2s: ; %bb.0: +; GCN2s-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2s-NEXT: s_mov_b32 s7, 0xf000 +; GCN2s-NEXT: s_mov_b32 s6, -1 +; GCN2s-NEXT: s_mov_b32 s10, s6 +; GCN2s-NEXT: s_mov_b32 s11, s7 +; GCN2s-NEXT: s_waitcnt lgkmcnt(0) +; GCN2s-NEXT: s_mov_b32 s8, s2 +; GCN2s-NEXT: s_mov_b32 s9, s3 +; GCN2s-NEXT: buffer_load_sbyte v0, off, s[8:11], 0 +; GCN2s-NEXT: s_mov_b32 s4, s0 +; GCN2s-NEXT: s_mov_b32 s5, s1 +; GCN2s-NEXT: s_waitcnt vmcnt(0) +; GCN2s-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN2s-NEXT: s_endpgm %load = load i8, ptr addrspace(1) %ptr, align 1 %sext = sext i8 %load to i32 %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %sext, i32 0, i32 8) @@ -496,6 +1603,25 @@ ; GCN-NOT: {{[^@]}}bfe ; GCN: s_endpgm define amdgpu_kernel void @sextload_i8_to_i32_bfe_0(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #0 { +; GCN1-LABEL: sextload_i8_to_i32_bfe_0: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_mov_b32 s3, 0xf000 +; GCN1-NEXT: s_mov_b32 s2, -1 +; GCN1-NEXT: v_mov_b32_e32 v0, 0 +; GCN1-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN1-NEXT: s_endpgm +; +; GCN2s-LABEL: sextload_i8_to_i32_bfe_0: +; GCN2s: ; %bb.0: +; GCN2s-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2s-NEXT: s_waitcnt lgkmcnt(0) +; GCN2s-NEXT: s_mov_b32 s3, 0xf000 +; GCN2s-NEXT: s_mov_b32 s2, -1 +; GCN2s-NEXT: 
v_mov_b32_e32 v0, 0 +; GCN2s-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN2s-NEXT: s_endpgm %load = load i8, ptr addrspace(1) %ptr, align 1 %sext = sext i8 %load to i32 %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %sext, i32 8, i32 0) @@ -511,6 +1637,41 @@ ; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 1 ; GCN: s_endpgm define amdgpu_kernel void @sext_in_reg_i1_bfe_offset_0(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; GCN1-LABEL: sext_in_reg_i1_bfe_offset_0: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_mov_b32 s10, s6 +; GCN1-NEXT: s_mov_b32 s11, s7 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_mov_b32 s8, s2 +; GCN1-NEXT: s_mov_b32 s9, s3 +; GCN1-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GCN1-NEXT: s_mov_b32 s4, s0 +; GCN1-NEXT: s_mov_b32 s5, s1 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_bfe_i32 v0, v0, 0, 1 +; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN1-NEXT: s_endpgm +; +; GCN2s-LABEL: sext_in_reg_i1_bfe_offset_0: +; GCN2s: ; %bb.0: +; GCN2s-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2s-NEXT: s_mov_b32 s7, 0xf000 +; GCN2s-NEXT: s_mov_b32 s6, -1 +; GCN2s-NEXT: s_mov_b32 s10, s6 +; GCN2s-NEXT: s_mov_b32 s11, s7 +; GCN2s-NEXT: s_waitcnt lgkmcnt(0) +; GCN2s-NEXT: s_mov_b32 s8, s2 +; GCN2s-NEXT: s_mov_b32 s9, s3 +; GCN2s-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GCN2s-NEXT: s_mov_b32 s4, s0 +; GCN2s-NEXT: s_mov_b32 s5, s1 +; GCN2s-NEXT: s_waitcnt vmcnt(0) +; GCN2s-NEXT: v_bfe_i32 v0, v0, 0, 1 +; GCN2s-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN2s-NEXT: s_endpgm %x = load i32, ptr addrspace(1) %in, align 4 %shl = shl i32 %x, 31 %shr = ashr i32 %shl, 31 @@ -526,6 +1687,41 @@ ; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 1, 1 ; GCN: s_endpgm define amdgpu_kernel void @sext_in_reg_i1_bfe_offset_1(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; GCN1-LABEL: sext_in_reg_i1_bfe_offset_1: +; GCN1: ; %bb.0: +; 
GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_mov_b32 s10, s6 +; GCN1-NEXT: s_mov_b32 s11, s7 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_mov_b32 s8, s2 +; GCN1-NEXT: s_mov_b32 s9, s3 +; GCN1-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GCN1-NEXT: s_mov_b32 s4, s0 +; GCN1-NEXT: s_mov_b32 s5, s1 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_bfe_i32 v0, v0, 1, 1 +; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN1-NEXT: s_endpgm +; +; GCN2s-LABEL: sext_in_reg_i1_bfe_offset_1: +; GCN2s: ; %bb.0: +; GCN2s-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2s-NEXT: s_mov_b32 s7, 0xf000 +; GCN2s-NEXT: s_mov_b32 s6, -1 +; GCN2s-NEXT: s_mov_b32 s10, s6 +; GCN2s-NEXT: s_mov_b32 s11, s7 +; GCN2s-NEXT: s_waitcnt lgkmcnt(0) +; GCN2s-NEXT: s_mov_b32 s8, s2 +; GCN2s-NEXT: s_mov_b32 s9, s3 +; GCN2s-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GCN2s-NEXT: s_mov_b32 s4, s0 +; GCN2s-NEXT: s_mov_b32 s5, s1 +; GCN2s-NEXT: s_waitcnt vmcnt(0) +; GCN2s-NEXT: v_bfe_i32 v0, v0, 1, 1 +; GCN2s-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN2s-NEXT: s_endpgm %x = load i32, ptr addrspace(1) %in, align 4 %shl = shl i32 %x, 30 %shr = ashr i32 %shl, 30 @@ -542,6 +1738,43 @@ ; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 1, 2 ; GCN: s_endpgm define amdgpu_kernel void @sext_in_reg_i2_bfe_offset_1(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; GCN1-LABEL: sext_in_reg_i2_bfe_offset_1: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_mov_b32 s10, s6 +; GCN1-NEXT: s_mov_b32 s11, s7 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_mov_b32 s8, s2 +; GCN1-NEXT: s_mov_b32 s9, s3 +; GCN1-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GCN1-NEXT: s_mov_b32 s4, s0 +; GCN1-NEXT: s_mov_b32 s5, s1 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_bfe_i32 v0, v0, 0, 2 +; GCN1-NEXT: v_bfe_i32 v0, v0, 1, 2 +; 
GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN1-NEXT: s_endpgm +; +; GCN2s-LABEL: sext_in_reg_i2_bfe_offset_1: +; GCN2s: ; %bb.0: +; GCN2s-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2s-NEXT: s_mov_b32 s7, 0xf000 +; GCN2s-NEXT: s_mov_b32 s6, -1 +; GCN2s-NEXT: s_mov_b32 s10, s6 +; GCN2s-NEXT: s_mov_b32 s11, s7 +; GCN2s-NEXT: s_waitcnt lgkmcnt(0) +; GCN2s-NEXT: s_mov_b32 s8, s2 +; GCN2s-NEXT: s_mov_b32 s9, s3 +; GCN2s-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GCN2s-NEXT: s_mov_b32 s4, s0 +; GCN2s-NEXT: s_mov_b32 s5, s1 +; GCN2s-NEXT: s_waitcnt vmcnt(0) +; GCN2s-NEXT: v_bfe_i32 v0, v0, 0, 2 +; GCN2s-NEXT: v_bfe_i32 v0, v0, 1, 2 +; GCN2s-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN2s-NEXT: s_endpgm %x = load i32, ptr addrspace(1) %in, align 4 %shl = shl i32 %x, 30 %shr = ashr i32 %shl, 30 Index: llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll +++ llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll @@ -154,9 +154,6 @@ ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec -; W64-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 killed $exec -; W64-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $exec ; W64-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec ; W64-O0-NEXT: v_mov_b32_e32 v1, v7 ; W64-O0-NEXT: v_mov_b32_e32 v2, v6 @@ -500,9 +497,6 @@ ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $exec -; W64-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $exec -; W64-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $exec ; W64-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15_vgpr16_vgpr17 killed $exec ; W64-O0-NEXT: v_mov_b32_e32 v15, v5 ; 
W64-O0-NEXT: s_waitcnt vmcnt(3) @@ -518,9 +512,6 @@ ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec -; W64-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 killed $exec -; W64-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 killed $exec ; W64-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec ; W64-O0-NEXT: v_mov_b32_e32 v3, v8 ; W64-O0-NEXT: v_mov_b32_e32 v4, v7 @@ -532,7 +523,6 @@ ; W64-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 killed $exec ; W64-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; W64-O0-NEXT: v_mov_b32_e32 v2, v12 ; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill @@ -540,7 +530,6 @@ ; W64-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 killed $exec ; W64-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; W64-O0-NEXT: v_mov_b32_e32 v1, v10 ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill @@ -1007,9 +996,6 @@ ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec -; W64-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 killed $exec -; W64-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 killed $exec ; W64-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec ; W64-O0-NEXT: v_mov_b32_e32 v1, v11 ; W64-O0-NEXT: v_mov_b32_e32 v2, v10 @@ -1018,9 +1004,6 @@ ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; 
W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 killed $exec -; W64-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 killed $exec -; W64-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $exec ; W64-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14_vgpr15_vgpr16 killed $exec ; W64-O0-NEXT: v_mov_b32_e32 v14, v7 ; W64-O0-NEXT: v_mov_b32_e32 v15, v6 @@ -1032,7 +1015,6 @@ ; W64-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 killed $exec ; W64-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec ; W64-O0-NEXT: v_mov_b32_e32 v5, v12 ; W64-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill Index: llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir +++ llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir @@ -40,10 +40,7 @@ ; W64-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF ; W64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF ; W64-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; W64-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec - ; W64-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec - ; W64-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec - ; W64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, [[COPY3]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 + ; W64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 ; W64-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; W64-NEXT: {{ $}} ; W64-NEXT: .1: @@ -87,10 +84,7 @@ ; W32-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF 
; W32-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF ; W32-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; W32-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec - ; W32-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec - ; W32-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec - ; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, [[COPY3]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 + ; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 ; W32-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; W32-NEXT: {{ $}} ; W32-NEXT: .1: @@ -160,10 +154,7 @@ ; W64-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF ; W64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF ; W64-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; W64-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec - ; W64-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec - ; W64-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec - ; W64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, killed [[COPY6]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 + ; W64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 ; W64-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; W64-NEXT: {{ $}} ; W64-NEXT: .1: @@ -207,10 +198,7 @@ ; W32-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF ; W32-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF ; W32-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; W32-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec - ; W32-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec - ; W32-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY 
[[COPY2]], implicit $exec - ; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, killed [[COPY6]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 + ; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 ; W32-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; W32-NEXT: {{ $}} ; W32-NEXT: .1: @@ -280,10 +268,7 @@ ; W64-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF ; W64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF ; W64-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; W64-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec - ; W64-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec - ; W64-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec - ; W64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, [[COPY2]], %subreg.sub3 + ; W64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 ; W64-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; W64-NEXT: {{ $}} ; W64-NEXT: .1: @@ -327,10 +312,7 @@ ; W32-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF ; W32-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF ; W32-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; W32-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec - ; W32-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec - ; W32-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec - ; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, [[COPY2]], %subreg.sub3 + ; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, 
[[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 ; W32-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; W32-NEXT: {{ $}} ; W32-NEXT: .1: @@ -399,18 +381,15 @@ ; ADDR64-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF ; ADDR64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF ; ADDR64-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; ADDR64-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec - ; ADDR64-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec - ; ADDR64-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec - ; ADDR64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, [[COPY3]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 - ; ADDR64-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 + ; ADDR64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 + ; ADDR64-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; ADDR64-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; ADDR64-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 0 ; ADDR64-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 61440 ; ADDR64-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[S_MOV_B32_]], %subreg.sub2, [[S_MOV_B32_1]], %subreg.sub3 - ; ADDR64-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY9]].sub0, [[COPY1]].sub0, 0, implicit $exec - ; ADDR64-NEXT: %17:vgpr_32, dead %20:sreg_64_xexec = V_ADDC_U32_e64 [[COPY9]].sub1, [[COPY1]].sub1, killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; ADDR64-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %17, %subreg.sub1 + ; ADDR64-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = 
V_ADD_CO_U32_e64 [[COPY6]].sub0, [[COPY1]].sub0, 0, implicit $exec + ; ADDR64-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY6]].sub1, [[COPY1]].sub1, killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; ADDR64-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; ADDR64-NEXT: [[BUFFER_LOAD_FORMAT_X_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_ADDR64 [[REG_SEQUENCE2]], killed [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec ; ADDR64-NEXT: $sgpr30_sgpr31 = COPY [[COPY]] ; ADDR64-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_ADDR64_]] @@ -428,18 +407,15 @@ ; W32-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF ; W32-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF ; W32-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; W32-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec - ; W32-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec - ; W32-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec - ; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, [[COPY3]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 - ; W32-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 + ; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 + ; W32-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; W32-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; W32-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 0 ; W32-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 822173696 ; W32-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[S_MOV_B32_]], %subreg.sub2, [[S_MOV_B32_1]], %subreg.sub3 - ; W32-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, 
[[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY9]].sub0, [[COPY1]].sub0, 0, implicit $exec - ; W32-NEXT: %17:vgpr_32, dead %20:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY9]].sub1, [[COPY1]].sub1, killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; W32-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %17, %subreg.sub1 + ; W32-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY6]].sub0, [[COPY1]].sub0, 0, implicit $exec + ; W32-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY6]].sub1, [[COPY1]].sub1, killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; W32-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; W32-NEXT: [[BUFFER_LOAD_FORMAT_X_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_ADDR64 [[REG_SEQUENCE2]], killed [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec ; W32-NEXT: $sgpr30_sgpr31 = COPY [[COPY]] ; W32-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_ADDR64_]] @@ -484,16 +460,13 @@ ; ADDR64-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF ; ADDR64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF ; ADDR64-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; ADDR64-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec - ; ADDR64-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec - ; ADDR64-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec - ; ADDR64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, [[COPY3]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 - ; ADDR64-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 + ; ADDR64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], 
%subreg.sub3 + ; ADDR64-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; ADDR64-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; ADDR64-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 0 ; ADDR64-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 61440 ; ADDR64-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[S_MOV_B32_]], %subreg.sub2, [[S_MOV_B32_1]], %subreg.sub3 - ; ADDR64-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]].sub0, %subreg.sub0, [[COPY9]].sub1, %subreg.sub1 + ; ADDR64-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]].sub0, %subreg.sub0, [[COPY6]].sub1, %subreg.sub1 ; ADDR64-NEXT: [[BUFFER_LOAD_FORMAT_X_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_ADDR64 [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec ; ADDR64-NEXT: $sgpr30_sgpr31 = COPY [[COPY]] ; ADDR64-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_ADDR64_]] @@ -512,10 +485,7 @@ ; W64-NO-ADDR64-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF ; W64-NO-ADDR64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF ; W64-NO-ADDR64-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; W64-NO-ADDR64-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec - ; W64-NO-ADDR64-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec - ; W64-NO-ADDR64-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec - ; W64-NO-ADDR64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, [[COPY3]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 + ; W64-NO-ADDR64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 ; W64-NO-ADDR64-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; W64-NO-ADDR64-NEXT: {{ $}} ; W64-NO-ADDR64-NEXT: .1: @@ -559,10 +529,7 @@ ; W32-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF ; W32-NEXT: 
[[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF ; W32-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; W32-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec - ; W32-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec - ; W32-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec - ; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, [[COPY3]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 + ; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 ; W32-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; W32-NEXT: {{ $}} ; W32-NEXT: .1: Index: llvm/test/CodeGen/AMDGPU/opt-sgpr-to-vgpr-copy.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/opt-sgpr-to-vgpr-copy.mir +++ llvm/test/CodeGen/AMDGPU/opt-sgpr-to-vgpr-copy.mir @@ -97,8 +97,7 @@ ; GCN-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[S_LOAD_DWORDX2_IMM1]].sub0, [[REG_SEQUENCE]].sub0, implicit-def $vcc, implicit $exec ; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORDX2_IMM1]].sub1 ; GCN-NEXT: [[V_ADDC_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADDC_U32_e32 0, [[COPY3]], implicit-def $vcc, implicit $vcc, implicit $exec - ; GCN-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY killed [[V_ADD_CO_U32_e32_]], implicit $exec - ; GCN-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[COPY4]], %subreg.sub0, killed [[V_ADDC_U32_e32_]], %subreg.sub1 + ; GCN-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[V_ADD_CO_U32_e32_]], %subreg.sub0, killed [[V_ADDC_U32_e32_]], %subreg.sub1 ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 0 ; GCN-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 1048576 ; GCN-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed 
[[S_MOV_B32_]], %subreg.sub1 @@ -374,8 +373,7 @@ ; GCN-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[S_LOAD_DWORDX2_IMM1]].sub0, [[REG_SEQUENCE]].sub0, implicit-def $vcc, implicit $exec ; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORDX2_IMM1]].sub1 ; GCN-NEXT: [[V_ADDC_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADDC_U32_e32 0, [[COPY3]], implicit-def $vcc, implicit $vcc, implicit $exec - ; GCN-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY killed [[V_ADD_CO_U32_e32_]], implicit $exec - ; GCN-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[COPY4]], %subreg.sub0, killed [[V_ADDC_U32_e32_]], %subreg.sub1 + ; GCN-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[V_ADD_CO_U32_e32_]], %subreg.sub0, killed [[V_ADDC_U32_e32_]], %subreg.sub1 ; GCN-NEXT: [[V_CMP_LT_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_U32_e64 killed [[REG_SEQUENCE1]].sub0, 12, implicit $exec ; GCN-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_LT_U32_e64_]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GCN-NEXT: S_BRANCH %bb.1 Index: llvm/test/CodeGen/AMDGPU/sdiv.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/sdiv.ll +++ llvm/test/CodeGen/AMDGPU/sdiv.ll @@ -29,13 +29,13 @@ ; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_ashrrev_i32_e32 v2, 31, v1 -; GCN-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; GCN-NEXT: v_xor_b32_e32 v1, v1, v2 ; GCN-NEXT: v_cvt_f32_u32_e32 v3, v1 ; GCN-NEXT: v_sub_i32_e32 v4, vcc, 0, v1 ; GCN-NEXT: v_ashrrev_i32_e32 v5, 31, v0 ; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; GCN-NEXT: v_add_i32_e32 v0, vcc, v5, v0 +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v5 ; GCN-NEXT: v_xor_b32_e32 v0, v0, v5 ; GCN-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3 @@ -47,7 +47,7 @@ ; GCN-NEXT: v_mul_lo_u32 v4, v3, v1 ; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v3 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, 
v4 -; GCN-NEXT: v_subrev_i32_e32 v4, vcc, v1, v0 +; GCN-NEXT: v_sub_i32_e32 v4, vcc, v0, v1 ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 ; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc @@ -55,7 +55,7 @@ ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 ; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc ; GCN-NEXT: v_xor_b32_e32 v0, v0, v2 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v2, v0 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: s_endpgm ; @@ -74,13 +74,13 @@ ; TONGA-NEXT: s_mov_b32 s5, s1 ; TONGA-NEXT: s_waitcnt vmcnt(0) ; TONGA-NEXT: v_ashrrev_i32_e32 v2, 31, v1 -; TONGA-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; TONGA-NEXT: v_add_u32_e32 v1, vcc, v1, v2 ; TONGA-NEXT: v_xor_b32_e32 v1, v1, v2 ; TONGA-NEXT: v_cvt_f32_u32_e32 v3, v1 ; TONGA-NEXT: v_sub_u32_e32 v4, vcc, 0, v1 ; TONGA-NEXT: v_ashrrev_i32_e32 v5, 31, v0 ; TONGA-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; TONGA-NEXT: v_add_u32_e32 v0, vcc, v5, v0 +; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v5 ; TONGA-NEXT: v_xor_b32_e32 v0, v0, v5 ; TONGA-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; TONGA-NEXT: v_cvt_u32_f32_e32 v3, v3 @@ -92,7 +92,7 @@ ; TONGA-NEXT: v_mul_lo_u32 v4, v3, v1 ; TONGA-NEXT: v_add_u32_e32 v5, vcc, 1, v3 ; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v4 -; TONGA-NEXT: v_subrev_u32_e32 v4, vcc, v1, v0 +; TONGA-NEXT: v_sub_u32_e32 v4, vcc, v0, v1 ; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 ; TONGA-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc ; TONGA-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc @@ -100,7 +100,7 @@ ; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 ; TONGA-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc ; TONGA-NEXT: v_xor_b32_e32 v0, v0, v2 -; TONGA-NEXT: v_subrev_u32_e32 v0, vcc, v2, v0 +; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v2 ; TONGA-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; TONGA-NEXT: s_endpgm ; @@ -218,7 +218,7 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 30, v1 -; 
GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GCN-NEXT: v_ashrrev_i32_e32 v0, 2, v0 ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: s_endpgm @@ -239,7 +239,7 @@ ; TONGA-NEXT: s_waitcnt vmcnt(0) ; TONGA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; TONGA-NEXT: v_lshrrev_b32_e32 v1, 30, v1 -; TONGA-NEXT: v_add_u32_e32 v0, vcc, v1, v0 +; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; TONGA-NEXT: v_ashrrev_i32_e32 v0, 2, v0 ; TONGA-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; TONGA-NEXT: s_endpgm @@ -315,7 +315,7 @@ ; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 31, v0 ; GCN-NEXT: v_ashrrev_i32_e32 v0, 11, v0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: s_endpgm ; @@ -338,7 +338,7 @@ ; TONGA-NEXT: v_add_u32_e32 v0, vcc, v1, v0 ; TONGA-NEXT: v_lshrrev_b32_e32 v1, 31, v0 ; TONGA-NEXT: v_ashrrev_i32_e32 v0, 11, v0 -; TONGA-NEXT: v_add_u32_e32 v0, vcc, v1, v0 +; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; TONGA-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; TONGA-NEXT: s_endpgm ; @@ -410,8 +410,8 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_ashrrev_i32_e32 v5, 31, v2 ; GCN-NEXT: v_ashrrev_i32_e32 v7, 31, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v7, v3 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v7 ; GCN-NEXT: v_ashrrev_i32_e32 v4, 31, v0 ; GCN-NEXT: v_ashrrev_i32_e32 v6, 31, v1 ; GCN-NEXT: v_xor_b32_e32 v2, v2, v5 @@ -428,16 +428,16 @@ ; GCN-NEXT: v_mul_f32_e32 v7, 0x4f7ffffe, v7 ; GCN-NEXT: v_cvt_u32_f32_e32 v5, v5 ; GCN-NEXT: v_cvt_u32_f32_e32 v7, v7 -; GCN-NEXT: v_add_i32_e32 v0, vcc, v4, v0 +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v4 ; GCN-NEXT: v_mul_lo_u32 v10, v10, v5 ; GCN-NEXT: v_mul_lo_u32 v11, v11, v7 -; GCN-NEXT: v_add_i32_e32 v1, vcc, v6, v1 +; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v6 ; GCN-NEXT: v_xor_b32_e32 
v0, v0, v4 ; GCN-NEXT: v_mul_hi_u32 v4, v5, v10 ; GCN-NEXT: v_xor_b32_e32 v1, v1, v6 ; GCN-NEXT: v_mul_hi_u32 v6, v7, v11 ; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v7 +; GCN-NEXT: v_add_i32_e32 v5, vcc, v7, v6 ; GCN-NEXT: v_mul_hi_u32 v4, v0, v4 ; GCN-NEXT: v_mul_hi_u32 v5, v1, v5 ; GCN-NEXT: v_mul_lo_u32 v6, v4, v2 @@ -448,9 +448,9 @@ ; GCN-NEXT: v_add_i32_e32 v11, vcc, 1, v5 ; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v2 ; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v1, v3 -; GCN-NEXT: v_subrev_i32_e32 v6, vcc, v2, v0 +; GCN-NEXT: v_sub_i32_e32 v6, vcc, v0, v2 ; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[0:1] -; GCN-NEXT: v_subrev_i32_e32 v7, vcc, v3, v1 +; GCN-NEXT: v_sub_i32_e32 v7, vcc, v1, v3 ; GCN-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[2:3] ; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[0:1] ; GCN-NEXT: v_add_i32_e32 v6, vcc, 1, v4 @@ -462,8 +462,8 @@ ; GCN-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc ; GCN-NEXT: v_xor_b32_e32 v0, v0, v8 ; GCN-NEXT: v_xor_b32_e32 v1, v1, v9 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v8, v0 -; GCN-NEXT: v_subrev_i32_e32 v1, vcc, v9, v1 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v8 +; GCN-NEXT: v_sub_i32_e32 v1, vcc, v1, v9 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm ; @@ -483,8 +483,8 @@ ; TONGA-NEXT: s_waitcnt vmcnt(0) ; TONGA-NEXT: v_ashrrev_i32_e32 v5, 31, v2 ; TONGA-NEXT: v_ashrrev_i32_e32 v7, 31, v3 -; TONGA-NEXT: v_add_u32_e32 v2, vcc, v5, v2 -; TONGA-NEXT: v_add_u32_e32 v3, vcc, v7, v3 +; TONGA-NEXT: v_add_u32_e32 v2, vcc, v2, v5 +; TONGA-NEXT: v_add_u32_e32 v3, vcc, v3, v7 ; TONGA-NEXT: v_ashrrev_i32_e32 v4, 31, v0 ; TONGA-NEXT: v_ashrrev_i32_e32 v6, 31, v1 ; TONGA-NEXT: v_xor_b32_e32 v2, v2, v5 @@ -501,16 +501,16 @@ ; TONGA-NEXT: v_mul_f32_e32 v7, 0x4f7ffffe, v7 ; TONGA-NEXT: v_cvt_u32_f32_e32 v5, v5 ; TONGA-NEXT: v_cvt_u32_f32_e32 v7, v7 -; TONGA-NEXT: v_add_u32_e32 v0, vcc, v4, v0 +; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v4 ; TONGA-NEXT: v_mul_lo_u32 v10, v10, v5 ; 
TONGA-NEXT: v_mul_lo_u32 v11, v11, v7 -; TONGA-NEXT: v_add_u32_e32 v1, vcc, v6, v1 +; TONGA-NEXT: v_add_u32_e32 v1, vcc, v1, v6 ; TONGA-NEXT: v_xor_b32_e32 v0, v0, v4 ; TONGA-NEXT: v_mul_hi_u32 v4, v5, v10 ; TONGA-NEXT: v_xor_b32_e32 v1, v1, v6 ; TONGA-NEXT: v_mul_hi_u32 v6, v7, v11 ; TONGA-NEXT: v_add_u32_e32 v4, vcc, v5, v4 -; TONGA-NEXT: v_add_u32_e32 v5, vcc, v6, v7 +; TONGA-NEXT: v_add_u32_e32 v5, vcc, v7, v6 ; TONGA-NEXT: v_mul_hi_u32 v4, v0, v4 ; TONGA-NEXT: v_mul_hi_u32 v5, v1, v5 ; TONGA-NEXT: v_mul_lo_u32 v6, v4, v2 @@ -521,9 +521,9 @@ ; TONGA-NEXT: v_add_u32_e32 v11, vcc, 1, v5 ; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v2 ; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v1, v3 -; TONGA-NEXT: v_subrev_u32_e32 v6, vcc, v2, v0 +; TONGA-NEXT: v_sub_u32_e32 v6, vcc, v0, v2 ; TONGA-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[0:1] -; TONGA-NEXT: v_subrev_u32_e32 v7, vcc, v3, v1 +; TONGA-NEXT: v_sub_u32_e32 v7, vcc, v1, v3 ; TONGA-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[2:3] ; TONGA-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[0:1] ; TONGA-NEXT: v_add_u32_e32 v6, vcc, 1, v4 @@ -535,8 +535,8 @@ ; TONGA-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc ; TONGA-NEXT: v_xor_b32_e32 v0, v0, v8 ; TONGA-NEXT: v_xor_b32_e32 v1, v1, v9 -; TONGA-NEXT: v_subrev_u32_e32 v0, vcc, v8, v0 -; TONGA-NEXT: v_subrev_u32_e32 v1, vcc, v9, v1 +; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v8 +; TONGA-NEXT: v_sub_u32_e32 v1, vcc, v1, v9 ; TONGA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; TONGA-NEXT: s_endpgm ; @@ -713,8 +713,8 @@ ; GCN-NEXT: v_ashrrev_i32_e32 v3, 31, v1 ; GCN-NEXT: v_lshrrev_b32_e32 v2, 30, v2 ; GCN-NEXT: v_lshrrev_b32_e32 v3, 30, v3 -; GCN-NEXT: v_add_i32_e32 v0, vcc, v2, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 ; GCN-NEXT: v_ashrrev_i32_e32 v0, 2, v0 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 2, v1 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 @@ -738,8 +738,8 @@ ; TONGA-NEXT: v_ashrrev_i32_e32 v3, 31, v1 
; TONGA-NEXT: v_lshrrev_b32_e32 v2, 30, v2 ; TONGA-NEXT: v_lshrrev_b32_e32 v3, 30, v3 -; TONGA-NEXT: v_add_u32_e32 v0, vcc, v2, v0 -; TONGA-NEXT: v_add_u32_e32 v1, vcc, v3, v1 +; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; TONGA-NEXT: v_add_u32_e32 v1, vcc, v1, v3 ; TONGA-NEXT: v_ashrrev_i32_e32 v0, 2, v0 ; TONGA-NEXT: v_ashrrev_i32_e32 v1, 2, v1 ; TONGA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 @@ -824,22 +824,22 @@ ; GCN-NEXT: v_ashrrev_i32_e32 v9, 31, v4 ; GCN-NEXT: v_ashrrev_i32_e32 v11, 31, v5 ; GCN-NEXT: v_ashrrev_i32_e32 v10, 31, v1 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v9, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v11, v5 +; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v11 ; GCN-NEXT: v_ashrrev_i32_e32 v13, 31, v6 -; GCN-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, v10, v1 +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v8 +; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v10 ; GCN-NEXT: v_xor_b32_e32 v4, v4, v9 ; GCN-NEXT: v_xor_b32_e32 v5, v5, v11 ; GCN-NEXT: v_ashrrev_i32_e32 v12, 31, v2 ; GCN-NEXT: v_xor_b32_e32 v15, v8, v9 ; GCN-NEXT: v_xor_b32_e32 v16, v10, v11 -; GCN-NEXT: v_add_i32_e32 v6, vcc, v13, v6 +; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v13 ; GCN-NEXT: v_xor_b32_e32 v0, v0, v8 ; GCN-NEXT: v_xor_b32_e32 v1, v1, v10 ; GCN-NEXT: v_cvt_f32_u32_e32 v8, v4 ; GCN-NEXT: v_cvt_f32_u32_e32 v10, v5 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v12, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v12 ; GCN-NEXT: v_xor_b32_e32 v6, v6, v13 ; GCN-NEXT: v_xor_b32_e32 v17, v12, v13 ; GCN-NEXT: v_xor_b32_e32 v2, v2, v12 @@ -862,12 +862,12 @@ ; GCN-NEXT: v_mul_hi_u32 v9, v8, v9 ; GCN-NEXT: v_mul_hi_u32 v11, v10, v11 ; GCN-NEXT: v_ashrrev_i32_e32 v14, 31, v7 -; GCN-NEXT: v_add_i32_e32 v7, vcc, v14, v7 +; GCN-NEXT: v_add_i32_e32 v7, vcc, v7, v14 ; GCN-NEXT: v_mul_hi_u32 v13, v12, v13 ; GCN-NEXT: v_xor_b32_e32 v7, v7, v14 ; GCN-NEXT: v_cvt_f32_u32_e32 v18, v7 ; GCN-NEXT: v_add_i32_e32 v8, vcc, v8, v9 -; GCN-NEXT: v_add_i32_e32 v9, 
vcc, v11, v10 +; GCN-NEXT: v_add_i32_e32 v9, vcc, v10, v11 ; GCN-NEXT: v_mul_hi_u32 v8, v0, v8 ; GCN-NEXT: v_mul_hi_u32 v9, v1, v9 ; GCN-NEXT: v_add_i32_e32 v10, vcc, v12, v13 @@ -885,13 +885,13 @@ ; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v4 ; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v1, v5 ; GCN-NEXT: v_sub_i32_e32 v2, vcc, v2, v21 -; GCN-NEXT: v_subrev_i32_e32 v11, vcc, v4, v0 +; GCN-NEXT: v_sub_i32_e32 v11, vcc, v0, v4 ; GCN-NEXT: v_cndmask_b32_e64 v8, v8, v12, s[0:1] -; GCN-NEXT: v_subrev_i32_e32 v12, vcc, v5, v1 +; GCN-NEXT: v_sub_i32_e32 v12, vcc, v1, v5 ; GCN-NEXT: v_cndmask_b32_e64 v9, v9, v20, s[2:3] ; GCN-NEXT: v_sub_i32_e32 v19, vcc, 0, v7 ; GCN-NEXT: v_add_i32_e32 v22, vcc, 1, v10 -; GCN-NEXT: v_subrev_i32_e32 v13, vcc, v6, v2 +; GCN-NEXT: v_sub_i32_e32 v13, vcc, v2, v6 ; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v11, s[0:1] ; GCN-NEXT: v_add_i32_e32 v11, vcc, 1, v8 ; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v12, s[2:3] @@ -903,27 +903,27 @@ ; GCN-NEXT: v_ashrrev_i32_e32 v8, 31, v3 ; GCN-NEXT: v_mul_hi_u32 v4, v18, v4 ; GCN-NEXT: v_cndmask_b32_e32 v1, v9, v12, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, v8, v3 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v8 ; GCN-NEXT: v_xor_b32_e32 v3, v3, v8 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v18 +; GCN-NEXT: v_add_i32_e32 v4, vcc, v18, v4 ; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6 ; GCN-NEXT: v_mul_hi_u32 v4, v3, v4 ; GCN-NEXT: v_cndmask_b32_e64 v10, v10, v22, s[4:5] ; GCN-NEXT: v_xor_b32_e32 v0, v0, v15 ; GCN-NEXT: v_xor_b32_e32 v1, v1, v16 ; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v13, s[4:5] -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v15, v0 -; GCN-NEXT: v_subrev_i32_e32 v1, vcc, v16, v1 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v15 +; GCN-NEXT: v_sub_i32_e32 v1, vcc, v1, v16 ; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v10 ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6 ; GCN-NEXT: v_cndmask_b32_e32 v2, v10, v5, vcc ; GCN-NEXT: v_mul_lo_u32 v5, v4, v7 ; GCN-NEXT: v_xor_b32_e32 v2, v2, v17 -; GCN-NEXT: v_subrev_i32_e32 v2, vcc, v17, v2 +; GCN-NEXT: 
v_sub_i32_e32 v2, vcc, v2, v17 ; GCN-NEXT: v_sub_i32_e32 v3, vcc, v3, v5 ; GCN-NEXT: v_xor_b32_e32 v6, v8, v14 ; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v4 -; GCN-NEXT: v_subrev_i32_e32 v8, vcc, v7, v3 +; GCN-NEXT: v_sub_i32_e32 v8, vcc, v3, v7 ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v3, v7 ; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc ; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc @@ -931,7 +931,7 @@ ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v3, v7 ; GCN-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; GCN-NEXT: v_xor_b32_e32 v3, v3, v6 -; GCN-NEXT: v_subrev_i32_e32 v3, vcc, v6, v3 +; GCN-NEXT: v_sub_i32_e32 v3, vcc, v3, v6 ; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; GCN-NEXT: s_endpgm ; @@ -955,22 +955,22 @@ ; TONGA-NEXT: v_ashrrev_i32_e32 v9, 31, v4 ; TONGA-NEXT: v_ashrrev_i32_e32 v11, 31, v5 ; TONGA-NEXT: v_ashrrev_i32_e32 v10, 31, v1 -; TONGA-NEXT: v_add_u32_e32 v4, vcc, v9, v4 -; TONGA-NEXT: v_add_u32_e32 v5, vcc, v11, v5 +; TONGA-NEXT: v_add_u32_e32 v4, vcc, v4, v9 +; TONGA-NEXT: v_add_u32_e32 v5, vcc, v5, v11 ; TONGA-NEXT: v_ashrrev_i32_e32 v13, 31, v6 -; TONGA-NEXT: v_add_u32_e32 v0, vcc, v8, v0 -; TONGA-NEXT: v_add_u32_e32 v1, vcc, v10, v1 +; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v8 +; TONGA-NEXT: v_add_u32_e32 v1, vcc, v1, v10 ; TONGA-NEXT: v_xor_b32_e32 v4, v4, v9 ; TONGA-NEXT: v_xor_b32_e32 v5, v5, v11 ; TONGA-NEXT: v_ashrrev_i32_e32 v12, 31, v2 ; TONGA-NEXT: v_xor_b32_e32 v15, v8, v9 ; TONGA-NEXT: v_xor_b32_e32 v16, v10, v11 -; TONGA-NEXT: v_add_u32_e32 v6, vcc, v13, v6 +; TONGA-NEXT: v_add_u32_e32 v6, vcc, v6, v13 ; TONGA-NEXT: v_xor_b32_e32 v0, v0, v8 ; TONGA-NEXT: v_xor_b32_e32 v1, v1, v10 ; TONGA-NEXT: v_cvt_f32_u32_e32 v8, v4 ; TONGA-NEXT: v_cvt_f32_u32_e32 v10, v5 -; TONGA-NEXT: v_add_u32_e32 v2, vcc, v12, v2 +; TONGA-NEXT: v_add_u32_e32 v2, vcc, v2, v12 ; TONGA-NEXT: v_xor_b32_e32 v6, v6, v13 ; TONGA-NEXT: v_xor_b32_e32 v17, v12, v13 ; TONGA-NEXT: v_xor_b32_e32 v2, v2, v12 @@ -993,12 +993,12 @@ ; TONGA-NEXT: v_mul_hi_u32 v9, v8, v9 ; TONGA-NEXT: 
v_mul_hi_u32 v11, v10, v11 ; TONGA-NEXT: v_ashrrev_i32_e32 v14, 31, v7 -; TONGA-NEXT: v_add_u32_e32 v7, vcc, v14, v7 +; TONGA-NEXT: v_add_u32_e32 v7, vcc, v7, v14 ; TONGA-NEXT: v_mul_hi_u32 v13, v12, v13 ; TONGA-NEXT: v_xor_b32_e32 v7, v7, v14 ; TONGA-NEXT: v_cvt_f32_u32_e32 v18, v7 ; TONGA-NEXT: v_add_u32_e32 v8, vcc, v8, v9 -; TONGA-NEXT: v_add_u32_e32 v9, vcc, v11, v10 +; TONGA-NEXT: v_add_u32_e32 v9, vcc, v10, v11 ; TONGA-NEXT: v_mul_hi_u32 v8, v0, v8 ; TONGA-NEXT: v_mul_hi_u32 v9, v1, v9 ; TONGA-NEXT: v_add_u32_e32 v10, vcc, v12, v13 @@ -1016,13 +1016,13 @@ ; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v4 ; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v1, v5 ; TONGA-NEXT: v_sub_u32_e32 v2, vcc, v2, v21 -; TONGA-NEXT: v_subrev_u32_e32 v11, vcc, v4, v0 +; TONGA-NEXT: v_sub_u32_e32 v11, vcc, v0, v4 ; TONGA-NEXT: v_cndmask_b32_e64 v8, v8, v12, s[0:1] -; TONGA-NEXT: v_subrev_u32_e32 v12, vcc, v5, v1 +; TONGA-NEXT: v_sub_u32_e32 v12, vcc, v1, v5 ; TONGA-NEXT: v_cndmask_b32_e64 v9, v9, v20, s[2:3] ; TONGA-NEXT: v_sub_u32_e32 v19, vcc, 0, v7 ; TONGA-NEXT: v_add_u32_e32 v22, vcc, 1, v10 -; TONGA-NEXT: v_subrev_u32_e32 v13, vcc, v6, v2 +; TONGA-NEXT: v_sub_u32_e32 v13, vcc, v2, v6 ; TONGA-NEXT: v_cndmask_b32_e64 v0, v0, v11, s[0:1] ; TONGA-NEXT: v_add_u32_e32 v11, vcc, 1, v8 ; TONGA-NEXT: v_cndmask_b32_e64 v1, v1, v12, s[2:3] @@ -1034,27 +1034,27 @@ ; TONGA-NEXT: v_ashrrev_i32_e32 v8, 31, v3 ; TONGA-NEXT: v_mul_hi_u32 v4, v18, v4 ; TONGA-NEXT: v_cndmask_b32_e32 v1, v9, v12, vcc -; TONGA-NEXT: v_add_u32_e32 v3, vcc, v8, v3 +; TONGA-NEXT: v_add_u32_e32 v3, vcc, v3, v8 ; TONGA-NEXT: v_xor_b32_e32 v3, v3, v8 -; TONGA-NEXT: v_add_u32_e32 v4, vcc, v4, v18 +; TONGA-NEXT: v_add_u32_e32 v4, vcc, v18, v4 ; TONGA-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6 ; TONGA-NEXT: v_mul_hi_u32 v4, v3, v4 ; TONGA-NEXT: v_cndmask_b32_e64 v10, v10, v22, s[4:5] ; TONGA-NEXT: v_xor_b32_e32 v0, v0, v15 ; TONGA-NEXT: v_xor_b32_e32 v1, v1, v16 ; TONGA-NEXT: v_cndmask_b32_e64 v2, v2, v13, s[4:5] -; TONGA-NEXT: 
v_subrev_u32_e32 v0, vcc, v15, v0 -; TONGA-NEXT: v_subrev_u32_e32 v1, vcc, v16, v1 +; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v15 +; TONGA-NEXT: v_sub_u32_e32 v1, vcc, v1, v16 ; TONGA-NEXT: v_add_u32_e32 v5, vcc, 1, v10 ; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6 ; TONGA-NEXT: v_cndmask_b32_e32 v2, v10, v5, vcc ; TONGA-NEXT: v_mul_lo_u32 v5, v4, v7 ; TONGA-NEXT: v_xor_b32_e32 v2, v2, v17 -; TONGA-NEXT: v_subrev_u32_e32 v2, vcc, v17, v2 +; TONGA-NEXT: v_sub_u32_e32 v2, vcc, v2, v17 ; TONGA-NEXT: v_sub_u32_e32 v3, vcc, v3, v5 ; TONGA-NEXT: v_xor_b32_e32 v6, v8, v14 ; TONGA-NEXT: v_add_u32_e32 v5, vcc, 1, v4 -; TONGA-NEXT: v_subrev_u32_e32 v8, vcc, v7, v3 +; TONGA-NEXT: v_sub_u32_e32 v8, vcc, v3, v7 ; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v3, v7 ; TONGA-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc ; TONGA-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc @@ -1062,7 +1062,7 @@ ; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v3, v7 ; TONGA-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; TONGA-NEXT: v_xor_b32_e32 v3, v3, v6 -; TONGA-NEXT: v_subrev_u32_e32 v3, vcc, v6, v3 +; TONGA-NEXT: v_sub_u32_e32 v3, vcc, v3, v6 ; TONGA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; TONGA-NEXT: s_endpgm ; @@ -1360,10 +1360,10 @@ ; GCN-NEXT: v_lshrrev_b32_e32 v5, 30, v5 ; GCN-NEXT: v_lshrrev_b32_e32 v6, 30, v6 ; GCN-NEXT: v_lshrrev_b32_e32 v7, 30, v7 -; GCN-NEXT: v_add_i32_e32 v0, vcc, v4, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, v5, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v7, v3 +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v5 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v7 ; GCN-NEXT: v_ashrrev_i32_e32 v0, 2, v0 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 2, v1 ; GCN-NEXT: v_ashrrev_i32_e32 v2, 2, v2 @@ -1393,10 +1393,10 @@ ; TONGA-NEXT: v_lshrrev_b32_e32 v5, 30, v5 ; TONGA-NEXT: v_lshrrev_b32_e32 v6, 30, v6 ; TONGA-NEXT: v_lshrrev_b32_e32 v7, 30, v7 -; TONGA-NEXT: v_add_u32_e32 v0, vcc, v4, v0 -; TONGA-NEXT: 
v_add_u32_e32 v1, vcc, v5, v1 -; TONGA-NEXT: v_add_u32_e32 v2, vcc, v6, v2 -; TONGA-NEXT: v_add_u32_e32 v3, vcc, v7, v3 +; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v4 +; TONGA-NEXT: v_add_u32_e32 v1, vcc, v1, v5 +; TONGA-NEXT: v_add_u32_e32 v2, vcc, v2, v6 +; TONGA-NEXT: v_add_u32_e32 v3, vcc, v3, v7 ; TONGA-NEXT: v_ashrrev_i32_e32 v0, 2, v0 ; TONGA-NEXT: v_ashrrev_i32_e32 v1, 2, v1 ; TONGA-NEXT: v_ashrrev_i32_e32 v2, 2, v2 @@ -1510,7 +1510,7 @@ ; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2| ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 8 ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: s_endpgm @@ -1543,7 +1543,7 @@ ; TONGA-NEXT: v_cvt_i32_f32_e32 v1, v1 ; TONGA-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2| ; TONGA-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; TONGA-NEXT: v_add_u32_e32 v0, vcc, v1, v0 ; TONGA-NEXT: v_bfe_i32 v0, v0, 0, 8 ; TONGA-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; TONGA-NEXT: s_endpgm @@ -1665,7 +1665,7 @@ ; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2| ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 23 ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: s_endpgm @@ -1708,7 +1708,7 @@ ; TONGA-NEXT: v_cvt_i32_f32_e32 v1, v1 ; TONGA-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2| ; TONGA-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; TONGA-NEXT: v_add_u32_e32 v0, vcc, v1, v0 ; TONGA-NEXT: v_bfe_i32 v0, v0, 0, 23 ; TONGA-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; TONGA-NEXT: s_endpgm @@ -1852,7 +1852,7 @@ ; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v1| ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-NEXT: v_add_i32_e32 
v0, vcc, v0, v2 +; GCN-NEXT: v_add_i32_e32 v0, vcc, v2, v0 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: s_endpgm @@ -1893,7 +1893,7 @@ ; TONGA-NEXT: v_cvt_i32_f32_e32 v2, v2 ; TONGA-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v1| ; TONGA-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; TONGA-NEXT: v_add_u32_e32 v0, vcc, v2, v0 ; TONGA-NEXT: v_bfe_i32 v0, v0, 0, 24 ; TONGA-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; TONGA-NEXT: s_endpgm @@ -2030,7 +2030,7 @@ ; GCN-NEXT: v_mul_lo_u32 v1, v3, v2 ; GCN-NEXT: v_add_i32_e32 v4, vcc, 1, v3 ; GCN-NEXT: v_sub_i32_e32 v1, vcc, v5, v1 -; GCN-NEXT: v_subrev_i32_e32 v5, vcc, v2, v1 +; GCN-NEXT: v_sub_i32_e32 v5, vcc, v1, v2 ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 ; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc @@ -2038,7 +2038,7 @@ ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 ; GCN-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GCN-NEXT: v_xor_b32_e32 v1, v1, v0 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v0, v1 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, v1, v0 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 25 ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: s_endpgm @@ -2078,7 +2078,7 @@ ; TONGA-NEXT: v_mul_lo_u32 v1, v3, v2 ; TONGA-NEXT: v_add_u32_e32 v4, vcc, 1, v3 ; TONGA-NEXT: v_sub_u32_e32 v1, vcc, v5, v1 -; TONGA-NEXT: v_subrev_u32_e32 v5, vcc, v2, v1 +; TONGA-NEXT: v_sub_u32_e32 v5, vcc, v1, v2 ; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 ; TONGA-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; TONGA-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc @@ -2086,7 +2086,7 @@ ; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 ; TONGA-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; TONGA-NEXT: v_xor_b32_e32 v1, v1, v0 -; TONGA-NEXT: v_subrev_u32_e32 v0, vcc, v0, v1 +; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v1, v0 ; TONGA-NEXT: v_bfe_i32 v0, v0, 0, 25 ; TONGA-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; TONGA-NEXT: s_endpgm @@ -2254,7 +2254,7 @@ ; GCN-NEXT: 
v_ashrrev_i32_e32 v3, 12, v3 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v4 ; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v5 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v6, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v7 ; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GCN-NEXT: s_endpgm @@ -2286,7 +2286,7 @@ ; TONGA-NEXT: v_ashrrev_i32_e32 v3, 12, v3 ; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v4 ; TONGA-NEXT: v_add_u32_e32 v1, vcc, v1, v5 -; TONGA-NEXT: v_add_u32_e32 v2, vcc, v6, v2 +; TONGA-NEXT: v_add_u32_e32 v2, vcc, v2, v6 ; TONGA-NEXT: v_add_u32_e32 v3, vcc, v3, v7 ; TONGA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; TONGA-NEXT: s_endpgm Index: llvm/test/CodeGen/AMDGPU/sdiv64.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/sdiv64.ll +++ llvm/test/CodeGen/AMDGPU/sdiv64.ll @@ -36,8 +36,8 @@ ; GCN-NEXT: v_mul_lo_u32 v5, s5, v0 ; GCN-NEXT: v_mul_lo_u32 v4, s4, v0 ; GCN-NEXT: s_addc_u32 s3, s3, s12 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; GCN-NEXT: v_mul_hi_u32 v3, v0, v4 ; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 ; GCN-NEXT: v_mul_hi_u32 v7, v0, v2 @@ -59,7 +59,7 @@ ; GCN-NEXT: v_mul_lo_u32 v4, s5, v0 ; GCN-NEXT: s_xor_b64 s[2:3], s[2:3], s[12:13] ; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GCN-NEXT: v_mul_lo_u32 v3, s4, v0 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GCN-NEXT: v_mul_lo_u32 v6, v0, v2 @@ -1105,8 +1105,8 @@ ; GCN-NEXT: v_mul_hi_u32 v3, s4, v0 ; GCN-NEXT: v_mul_lo_u32 v5, s5, v0 ; GCN-NEXT: v_mul_lo_u32 v4, s4, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; GCN-NEXT: v_mul_hi_u32 v3, v0, v4 ; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 ; GCN-NEXT: 
v_mul_hi_u32 v7, v0, v2 @@ -1129,7 +1129,7 @@ ; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GCN-NEXT: v_mul_lo_u32 v3, s4, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GCN-NEXT: v_mul_lo_u32 v6, v0, v2 ; GCN-NEXT: v_mul_hi_u32 v7, v0, v3 ; GCN-NEXT: v_mul_hi_u32 v8, v0, v2 Index: llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll +++ llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll @@ -1,18 +1,60 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=NOSDWA,GCN %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI,GFX89,SDWA,GCN %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -amdgpu-sdwa-peephole -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9_10,SDWA,GCN %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1010 -amdgpu-sdwa-peephole -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX9_10,SDWA,GCN %s - -; GCN-LABEL: {{^}}add_shr_i32: -; NOSDWA: v_lshrrev_b32_e32 v[[DST:[0-9]+]], 16, v{{[0-9]+}} -; NOSDWA: v_add_u32_e32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v[[DST]] -; NOSDWA-NOT: v_add_{{(_co)?}}_u32_sdwa - -; VI: v_add_u32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9: v_add_u32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10: v_add_nc_u32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; NOTE: Assertions have been autogenerated by 
utils/update_llc_test_checks.py +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=NOSDWA %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -amdgpu-sdwa-peephole -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1010 -amdgpu-sdwa-peephole -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10 %s define amdgpu_kernel void @add_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; NOSDWA-LABEL: add_shr_i32: +; NOSDWA: ; %bb.0: +; NOSDWA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) +; NOSDWA-NEXT: v_mov_b32_e32 v0, s2 +; NOSDWA-NEXT: v_mov_b32_e32 v1, s3 +; NOSDWA-NEXT: flat_load_dword v2, v[0:1] +; NOSDWA-NEXT: v_mov_b32_e32 v0, s0 +; NOSDWA-NEXT: v_mov_b32_e32 v1, s1 +; NOSDWA-NEXT: s_waitcnt vmcnt(0) +; NOSDWA-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; NOSDWA-NEXT: v_add_u32_e32 v2, vcc, v2, v3 +; NOSDWA-NEXT: flat_store_dword v[0:1], v2 +; NOSDWA-NEXT: s_endpgm +; +; VI-LABEL: add_shr_i32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: flat_load_dword v2, v[0:1] +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_sdwa v2, vcc, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: add_shr_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; 
GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: add_shr_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_add_nc_u32_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_endpgm %a = load i32, ptr addrspace(1) %in, align 4 %shr = lshr i32 %a, 16 %add = add i32 %a, %shr @@ -20,15 +62,57 @@ ret void } -; GCN-LABEL: {{^}}sub_shr_i32: -; NOSDWA: v_lshrrev_b32_e32 v[[DST:[0-9]+]], 16, v{{[0-9]+}} -; NOSDWA: v_subrev_u32_e32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v[[DST]] -; NOSDWA-NOT: v_subrev_{{(_co)?}}_u32_sdwa - -; VI: v_subrev_u32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9: v_sub_u32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10: v_sub_nc_u32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD define amdgpu_kernel void @sub_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; NOSDWA-LABEL: sub_shr_i32: +; NOSDWA: ; %bb.0: +; NOSDWA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) +; NOSDWA-NEXT: v_mov_b32_e32 v0, s2 +; NOSDWA-NEXT: v_mov_b32_e32 v1, s3 +; NOSDWA-NEXT: flat_load_dword v2, v[0:1] +; NOSDWA-NEXT: v_mov_b32_e32 v0, s0 +; NOSDWA-NEXT: v_mov_b32_e32 v1, s1 +; NOSDWA-NEXT: s_waitcnt vmcnt(0) +; NOSDWA-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; NOSDWA-NEXT: v_sub_u32_e32 v2, vcc, v3, 
v2 +; NOSDWA-NEXT: flat_store_dword v[0:1], v2 +; NOSDWA-NEXT: s_endpgm +; +; VI-LABEL: sub_shr_i32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: flat_load_dword v2, v[0:1] +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_sub_u32_sdwa v2, vcc, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: sub_shr_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_sub_u32_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: sub_shr_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_sub_nc_u32_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_endpgm %a = load i32, ptr addrspace(1) %in, align 4 %shr = lshr i32 %a, 16 %sub = sub i32 %shr, %a @@ -36,15 +120,81 @@ ret void } -; GCN-LABEL: {{^}}mul_shr_i32: -; NOSDWA: v_lshrrev_b32_e32 v[[DST0:[0-9]+]], 16, v{{[0-9]+}} -; NOSDWA: v_lshrrev_b32_e32 v[[DST1:[0-9]+]], 16, v{{[0-9]+}} -; NOSDWA: v_mul_u32_u24_e32 v{{[0-9]+}}, v[[DST0]], v[[DST1]] -; NOSDWA-NOT: v_mul_u32_u24_sdwa - -; SDWA: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 - define amdgpu_kernel void @mul_shr_i32(ptr 
addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 { +; NOSDWA-LABEL: mul_shr_i32: +; NOSDWA: ; %bb.0: +; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) +; NOSDWA-NEXT: v_mov_b32_e32 v1, s7 +; NOSDWA-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; NOSDWA-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; NOSDWA-NEXT: v_mov_b32_e32 v3, s1 +; NOSDWA-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; NOSDWA-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; NOSDWA-NEXT: flat_load_dword v4, v[0:1] +; NOSDWA-NEXT: flat_load_dword v2, v[2:3] +; NOSDWA-NEXT: v_mov_b32_e32 v0, s4 +; NOSDWA-NEXT: v_mov_b32_e32 v1, s5 +; NOSDWA-NEXT: s_waitcnt vmcnt(1) +; NOSDWA-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; NOSDWA-NEXT: s_waitcnt vmcnt(0) +; NOSDWA-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; NOSDWA-NEXT: v_mul_u32_u24_e32 v2, v3, v2 +; NOSDWA-NEXT: flat_store_dword v[0:1], v2 +; NOSDWA-NEXT: s_endpgm +; +; VI-LABEL: mul_shr_i32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_load_dword v4, v[0:1] +; VI-NEXT: flat_load_dword v2, v[2:3] +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mul_u32_u24_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: mul_shr_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, 
v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: mul_shr_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dword v1, v0, s[6:7] +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: s_endpgm %idx = call i32 @llvm.amdgcn.workitem.id.x() %gep1 = getelementptr i32, ptr addrspace(1) %in1, i32 %idx %gep2 = getelementptr i32, ptr addrspace(1) %in2, i32 %idx @@ -57,14 +207,78 @@ ret void } -; GCN-LABEL: {{^}}mul_i16: -; NOSDWA: v_mul_lo_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; NOSDWA-NOT: v_mul_u32_u24_sdwa -; GFX89: v_mul_lo_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; GFX10: v_mul_lo_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; SDWA-NOT: v_mul_u32_u24_sdwa - define amdgpu_kernel void @mul_i16(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 { +; NOSDWA-LABEL: mul_i16: +; NOSDWA: ; %bb.0: ; %entry +; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 1, v0 +; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) +; NOSDWA-NEXT: v_mov_b32_e32 v1, s7 +; NOSDWA-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; NOSDWA-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; 
NOSDWA-NEXT: v_mov_b32_e32 v3, s1 +; NOSDWA-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; NOSDWA-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; NOSDWA-NEXT: flat_load_ushort v4, v[0:1] +; NOSDWA-NEXT: flat_load_ushort v2, v[2:3] +; NOSDWA-NEXT: v_mov_b32_e32 v0, s4 +; NOSDWA-NEXT: v_mov_b32_e32 v1, s5 +; NOSDWA-NEXT: s_waitcnt vmcnt(0) +; NOSDWA-NEXT: v_mul_lo_u16_e32 v2, v4, v2 +; NOSDWA-NEXT: flat_store_short v[0:1], v2 +; NOSDWA-NEXT: s_endpgm +; +; VI-LABEL: mul_i16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: v_lshlrev_b32_e32 v2, 1, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_load_ushort v4, v[0:1] +; VI-NEXT: flat_load_ushort v2, v[2:3] +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mul_lo_u16_e32 v2, v4, v2 +; VI-NEXT: flat_store_short v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: mul_i16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] +; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mul_lo_u16_e32 v1, v1, v2 +; GFX9-NEXT: global_store_short v0, v1, s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: mul_i16: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: 
global_load_ushort v1, v0, s[6:7] +; GFX10-NEXT: global_load_ushort v2, v0, s[2:3] +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mul_lo_u16 v1, v1, v2 +; GFX10-NEXT: global_store_short v0, v1, s[4:5] +; GFX10-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() %gepa = getelementptr i16, ptr addrspace(1) %ina, i32 %idx @@ -76,21 +290,85 @@ ret void } -; GCN-LABEL: {{^}}mul_v2i16: -; NOSDWA: v_lshrrev_b32_e32 v[[DST0:[0-9]+]], 16, v{{[0-9]+}} -; NOSDWA: v_lshrrev_b32_e32 v[[DST1:[0-9]+]], 16, v{{[0-9]+}} -; NOSDWA: v_mul_lo_u16_e32 v[[DST_MUL:[0-9]+]], v[[DST1]], v[[DST0]] -; NOSDWA: v_lshlrev_b32_e32 v[[DST_SHL:[0-9]+]], 16, v[[DST_MUL]] -; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[DST_SHL]] -; NOSDWA-NOT: v_mul_u32_u24_sdwa - -; VI-DAG: v_mul_lo_u16_e32 v[[DST_MUL_LO:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} -; VI-DAG: v_mul_lo_u16_sdwa v[[DST_MUL_HI:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL_LO]], v[[DST_MUL_HI]] - -; GFX9_10: v_pk_mul_lo_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} - define amdgpu_kernel void @mul_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 { +; NOSDWA-LABEL: mul_v2i16: +; NOSDWA: ; %bb.0: ; %entry +; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) +; NOSDWA-NEXT: v_mov_b32_e32 v1, s7 +; NOSDWA-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; NOSDWA-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; NOSDWA-NEXT: v_mov_b32_e32 v3, s1 +; NOSDWA-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; NOSDWA-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; NOSDWA-NEXT: flat_load_dword v4, v[0:1] +; NOSDWA-NEXT: flat_load_dword v2, v[2:3] +; NOSDWA-NEXT: v_mov_b32_e32 v0, s4 +; NOSDWA-NEXT: v_mov_b32_e32 v1, s5 +; NOSDWA-NEXT: s_waitcnt vmcnt(0) +; NOSDWA-NEXT: 
v_mul_lo_u16_e32 v3, v4, v2 +; NOSDWA-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; NOSDWA-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; NOSDWA-NEXT: v_mul_lo_u16_e32 v2, v4, v2 +; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; NOSDWA-NEXT: v_or_b32_e32 v2, v3, v2 +; NOSDWA-NEXT: flat_store_dword v[0:1], v2 +; NOSDWA-NEXT: s_endpgm +; +; VI-LABEL: mul_v2i16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_load_dword v4, v[0:1] +; VI-NEXT: flat_load_dword v2, v[2:3] +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mul_lo_u16_e32 v3, v4, v2 +; VI-NEXT: v_mul_lo_u16_sdwa v2, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: mul_v2i16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v2 +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: mul_v2i16: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 
0x1 +; GFX10-NEXT: global_load_dword v1, v0, s[6:7] +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_pk_mul_lo_u16 v1, v1, v2 +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() %gepa = getelementptr <2 x i16>, ptr addrspace(1) %ina, i32 %idx @@ -102,25 +380,96 @@ ret void } -; GCN-LABEL: {{^}}mul_v4i16: -; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} -; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} -; NOSDWA: v_mul_lo_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; NOSDWA: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} -; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; NOSDWA-NOT: v_mul_u32_u24_sdwa - -; VI-DAG: v_mul_lo_u16_e32 v[[DST_MUL0:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} -; VI-DAG: v_mul_lo_u16_sdwa v[[DST_MUL1:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-DAG: v_mul_lo_u16_e32 v[[DST_MUL2:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} -; VI-DAG: v_mul_lo_u16_sdwa v[[DST_MUL3:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL2]], v[[DST_MUL3]] -; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL0]], v[[DST_MUL1]] - -; GFX9_10-DAG: v_pk_mul_lo_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; GFX9_10-DAG: v_pk_mul_lo_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} - define amdgpu_kernel void @mul_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 { +; NOSDWA-LABEL: mul_v4i16: +; NOSDWA: ; %bb.0: ; %entry +; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) +; NOSDWA-NEXT: v_mov_b32_e32 v1, s7 +; NOSDWA-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; NOSDWA-NEXT: v_addc_u32_e32 v1, vcc, 0, 
v1, vcc +; NOSDWA-NEXT: v_mov_b32_e32 v3, s1 +; NOSDWA-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; NOSDWA-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; NOSDWA-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; NOSDWA-NEXT: flat_load_dwordx2 v[2:3], v[2:3] +; NOSDWA-NEXT: v_mov_b32_e32 v4, s4 +; NOSDWA-NEXT: v_mov_b32_e32 v5, s5 +; NOSDWA-NEXT: s_waitcnt vmcnt(0) +; NOSDWA-NEXT: v_mul_lo_u16_e32 v6, v1, v3 +; NOSDWA-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; NOSDWA-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; NOSDWA-NEXT: v_mul_lo_u16_e32 v7, v0, v2 +; NOSDWA-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; NOSDWA-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; NOSDWA-NEXT: v_mul_lo_u16_e32 v1, v1, v3 +; NOSDWA-NEXT: v_mul_lo_u16_e32 v0, v0, v2 +; NOSDWA-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; NOSDWA-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; NOSDWA-NEXT: v_or_b32_e32 v1, v6, v1 +; NOSDWA-NEXT: v_or_b32_e32 v0, v7, v0 +; NOSDWA-NEXT: flat_store_dwordx2 v[4:5], v[0:1] +; NOSDWA-NEXT: s_endpgm +; +; VI-LABEL: mul_v4i16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mul_lo_u16_e32 v6, v1, v3 +; VI-NEXT: v_mul_lo_u16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_mul_lo_u16_e32 v3, v0, v2 +; VI-NEXT: v_mul_lo_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v1, v6, v1 +; VI-NEXT: v_or_b32_e32 v0, v3, v0 +; VI-NEXT: 
flat_store_dwordx2 v[4:5], v[0:1] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: mul_v4i16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v3 +; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, v2 +; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: mul_v4i16: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] +; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_pk_mul_lo_u16 v1, v1, v3 +; GFX10-NEXT: v_pk_mul_lo_u16 v0, v0, v2 +; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] +; GFX10-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() %gepa = getelementptr <4 x i16>, ptr addrspace(1) %ina, i32 %idx @@ -132,33 +481,118 @@ ret void } -; GCN-LABEL: {{^}}mul_v8i16: -; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} -; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} -; NOSDWA: v_mul_lo_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; NOSDWA: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} -; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; NOSDWA-NOT: v_mul_u32_u24_sdwa - -; VI-DAG: v_mul_lo_u16_e32 v[[DST_MUL0:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} -; VI-DAG: v_mul_lo_u16_sdwa v[[DST_MUL1:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; 
VI-DAG: v_mul_lo_u16_e32 v[[DST_MUL2:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} -; VI-DAG: v_mul_lo_u16_sdwa v[[DST_MUL3:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-DAG: v_mul_lo_u16_e32 v[[DST_MUL4:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} -; VI-DAG: v_mul_lo_u16_sdwa v[[DST_MUL5:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-DAG: v_mul_lo_u16_e32 v[[DST_MUL6:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} -; VI-DAG: v_mul_lo_u16_sdwa v[[DST_MUL7:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL6]], v[[DST_MUL7]] -; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL4]], v[[DST_MUL5]] -; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL2]], v[[DST_MUL3]] -; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL0]], v[[DST_MUL1]] - -; GFX9_10-DAG: v_pk_mul_lo_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; GFX9_10-DAG: v_pk_mul_lo_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; GFX9_10-DAG: v_pk_mul_lo_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; GFX9_10-DAG: v_pk_mul_lo_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} - define amdgpu_kernel void @mul_v8i16(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 { +; NOSDWA-LABEL: mul_v8i16: +; NOSDWA: ; %bb.0: ; %entry +; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 4, v0 +; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) +; NOSDWA-NEXT: v_mov_b32_e32 v1, s7 +; NOSDWA-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; NOSDWA-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; NOSDWA-NEXT: v_mov_b32_e32 v3, s1 +; NOSDWA-NEXT: v_add_u32_e32 v4, vcc, s0, v2 +; NOSDWA-NEXT: v_addc_u32_e32 v5, vcc, 0, v3, vcc +; NOSDWA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; NOSDWA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] +; NOSDWA-NEXT: v_mov_b32_e32 v8, s4 +; NOSDWA-NEXT: v_mov_b32_e32 v9, s5 
+; NOSDWA-NEXT: s_waitcnt vmcnt(0) +; NOSDWA-NEXT: v_mul_lo_u16_e32 v10, v3, v7 +; NOSDWA-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; NOSDWA-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; NOSDWA-NEXT: v_mul_lo_u16_e32 v11, v2, v6 +; NOSDWA-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; NOSDWA-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; NOSDWA-NEXT: v_mul_lo_u16_e32 v12, v1, v5 +; NOSDWA-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; NOSDWA-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; NOSDWA-NEXT: v_mul_lo_u16_e32 v13, v0, v4 +; NOSDWA-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; NOSDWA-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; NOSDWA-NEXT: v_mul_lo_u16_e32 v3, v3, v7 +; NOSDWA-NEXT: v_mul_lo_u16_e32 v2, v2, v6 +; NOSDWA-NEXT: v_mul_lo_u16_e32 v1, v1, v5 +; NOSDWA-NEXT: v_mul_lo_u16_e32 v0, v0, v4 +; NOSDWA-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; NOSDWA-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; NOSDWA-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; NOSDWA-NEXT: v_or_b32_e32 v3, v10, v3 +; NOSDWA-NEXT: v_or_b32_e32 v2, v11, v2 +; NOSDWA-NEXT: v_or_b32_e32 v1, v12, v1 +; NOSDWA-NEXT: v_or_b32_e32 v0, v13, v0 +; NOSDWA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; NOSDWA-NEXT: s_endpgm +; +; VI-LABEL: mul_v8i16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: v_lshlrev_b32_e32 v2, 4, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v3, vcc +; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] +; VI-NEXT: v_mov_b32_e32 v8, s4 +; VI-NEXT: v_mov_b32_e32 v9, s5 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mul_lo_u16_e32 v10, v3, v7 +; VI-NEXT: v_mul_lo_u16_sdwa v3, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: 
v_mul_lo_u16_e32 v7, v2, v6 +; VI-NEXT: v_mul_lo_u16_sdwa v2, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_mul_lo_u16_e32 v6, v1, v5 +; VI-NEXT: v_mul_lo_u16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_mul_lo_u16_e32 v5, v0, v4 +; VI-NEXT: v_mul_lo_u16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v3, v10, v3 +; VI-NEXT: v_or_b32_e32 v2, v7, v2 +; VI-NEXT: v_or_b32_e32 v1, v6, v1 +; VI-NEXT: v_or_b32_e32 v0, v5, v0 +; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: mul_v8i16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 4, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[6:7] +; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v8, 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_mul_lo_u16 v3, v3, v7 +; GFX9-NEXT: v_pk_mul_lo_u16 v2, v2, v6 +; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v5 +; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, v4 +; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: mul_v8i16: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: v_lshlrev_b32_e32 v8, 4, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dwordx4 v[0:3], v8, s[6:7] +; GFX10-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] +; GFX10-NEXT: v_mov_b32_e32 v8, 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_pk_mul_lo_u16 v3, v3, v7 +; GFX10-NEXT: v_pk_mul_lo_u16 v2, v2, v6 +; GFX10-NEXT: v_pk_mul_lo_u16 v1, v1, v5 +; GFX10-NEXT: v_pk_mul_lo_u16 v0, v0, v4 +; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[4:5] +; 
GFX10-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() %gepa = getelementptr <8 x i16>, ptr addrspace(1) %ina, i32 %idx @@ -170,13 +604,70 @@ ret void } -; GCN-LABEL: {{^}}mul_half: -; NOSDWA: v_mul_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; NOSDWA-NOT: v_mul_f16_sdwa -; SDWA: v_mul_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; SDWA-NOT: v_mul_f16_sdwa - define amdgpu_kernel void @mul_half(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 { +; NOSDWA-LABEL: mul_half: +; NOSDWA: ; %bb.0: ; %entry +; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) +; NOSDWA-NEXT: v_mov_b32_e32 v0, s6 +; NOSDWA-NEXT: v_mov_b32_e32 v1, s7 +; NOSDWA-NEXT: v_mov_b32_e32 v2, s0 +; NOSDWA-NEXT: v_mov_b32_e32 v3, s1 +; NOSDWA-NEXT: flat_load_ushort v4, v[0:1] +; NOSDWA-NEXT: flat_load_ushort v2, v[2:3] +; NOSDWA-NEXT: v_mov_b32_e32 v0, s4 +; NOSDWA-NEXT: v_mov_b32_e32 v1, s5 +; NOSDWA-NEXT: s_waitcnt vmcnt(0) +; NOSDWA-NEXT: v_mul_f16_e32 v2, v4, v2 +; NOSDWA-NEXT: flat_store_short v[0:1], v2 +; NOSDWA-NEXT: s_endpgm +; +; VI-LABEL: mul_half: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: flat_load_ushort v4, v[0:1] +; VI-NEXT: flat_load_ushort v2, v[2:3] +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mul_f16_e32 v2, v4, v2 +; VI-NEXT: flat_store_short v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: mul_half: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_ushort v1, v0, 
s[6:7] +; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mul_f16_e32 v1, v1, v2 +; GFX9-NEXT: global_store_short v0, v1, s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: mul_half: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] +; GFX10-NEXT: global_load_ushort v2, v0, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mul_f16_e32 v1, v1, v2 +; GFX10-NEXT: global_store_short v0, v1, s[4:5] +; GFX10-NEXT: s_endpgm entry: %a = load half, ptr addrspace(1) %ina, align 4 %b = load half, ptr addrspace(1) %inb, align 4 @@ -185,21 +676,78 @@ ret void } -; GCN-LABEL: {{^}}mul_v2half: -; NOSDWA: v_lshrrev_b32_e32 v[[DST1:[0-9]+]], 16, v{{[0-9]+}} -; NOSDWA: v_lshrrev_b32_e32 v[[DST0:[0-9]+]], 16, v{{[0-9]+}} -; NOSDWA: v_mul_f16_e32 v[[DST_MUL:[0-9]+]], v[[DST0]], v[[DST1]] -; NOSDWA: v_lshlrev_b32_e32 v[[DST_SHL:[0-9]+]], 16, v[[DST_MUL]] -; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[DST_SHL]] -; NOSDWA-NOT: v_mul_f16_sdwa - -; VI-DAG: v_mul_f16_sdwa v[[DST_MUL_HI:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-DAG: v_mul_f16_e32 v[[DST_MUL_LO:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} -; VI: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL_LO]], v[[DST_MUL_HI]] - -; GFX9_10: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} - define amdgpu_kernel void @mul_v2half(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 { +; NOSDWA-LABEL: mul_v2half: +; NOSDWA: ; %bb.0: ; %entry +; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) +; NOSDWA-NEXT: v_mov_b32_e32 v0, s6 +; NOSDWA-NEXT: v_mov_b32_e32 v2, s0 +; 
NOSDWA-NEXT: v_mov_b32_e32 v3, s1 +; NOSDWA-NEXT: v_mov_b32_e32 v1, s7 +; NOSDWA-NEXT: flat_load_dword v2, v[2:3] +; NOSDWA-NEXT: flat_load_dword v3, v[0:1] +; NOSDWA-NEXT: v_mov_b32_e32 v0, s4 +; NOSDWA-NEXT: v_mov_b32_e32 v1, s5 +; NOSDWA-NEXT: s_waitcnt vmcnt(1) +; NOSDWA-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; NOSDWA-NEXT: s_waitcnt vmcnt(0) +; NOSDWA-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; NOSDWA-NEXT: v_mul_f16_e32 v4, v5, v4 +; NOSDWA-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; NOSDWA-NEXT: v_mul_f16_e32 v2, v3, v2 +; NOSDWA-NEXT: v_or_b32_e32 v2, v2, v4 +; NOSDWA-NEXT: flat_store_dword v[0:1], v2 +; NOSDWA-NEXT: s_endpgm +; +; VI-LABEL: mul_v2half: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: flat_load_dword v4, v[0:1] +; VI-NEXT: flat_load_dword v2, v[2:3] +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mul_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_mul_f16_e32 v2, v4, v2 +; VI-NEXT: v_or_b32_e32 v2, v2, v3 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: mul_v2half: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_mul_f16 v1, v1, v2 +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: mul_v2half: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], 
s[0:1], 0x34 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dword v1, v0, s[6:7] +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_pk_mul_f16 v1, v1, v2 +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: s_endpgm entry: %a = load <2 x half>, ptr addrspace(1) %ina, align 4 %b = load <2 x half>, ptr addrspace(1) %inb, align 4 @@ -208,23 +756,89 @@ ret void } -; GCN-LABEL: {{^}}mul_v4half: -; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} -; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} -; NOSDWA: v_mul_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; NOSDWA: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} -; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; NOSDWA-NOT: v_mul_f16_sdwa - -; VI-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} - -; GFX9_10-DAG: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; GFX9_10-DAG: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} - define amdgpu_kernel void @mul_v4half(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 { +; NOSDWA-LABEL: mul_v4half: +; NOSDWA: ; %bb.0: ; %entry +; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) +; NOSDWA-NEXT: v_mov_b32_e32 v0, s6 +; NOSDWA-NEXT: v_mov_b32_e32 v1, s7 +; NOSDWA-NEXT: v_mov_b32_e32 v2, s0 +; NOSDWA-NEXT: v_mov_b32_e32 v3, s1 +; NOSDWA-NEXT: flat_load_dwordx2 v[2:3], v[2:3] +; NOSDWA-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; NOSDWA-NEXT: v_mov_b32_e32 v4, s4 +; NOSDWA-NEXT: 
v_mov_b32_e32 v5, s5 +; NOSDWA-NEXT: s_waitcnt vmcnt(1) +; NOSDWA-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; NOSDWA-NEXT: s_waitcnt vmcnt(0) +; NOSDWA-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; NOSDWA-NEXT: v_mul_f16_e32 v1, v1, v3 +; NOSDWA-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; NOSDWA-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; NOSDWA-NEXT: v_mul_f16_e32 v0, v0, v2 +; NOSDWA-NEXT: v_mul_f16_e32 v2, v7, v6 +; NOSDWA-NEXT: v_mul_f16_e32 v3, v8, v3 +; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; NOSDWA-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; NOSDWA-NEXT: v_or_b32_e32 v1, v1, v2 +; NOSDWA-NEXT: v_or_b32_e32 v0, v0, v3 +; NOSDWA-NEXT: flat_store_dwordx2 v[4:5], v[0:1] +; NOSDWA-NEXT: s_endpgm +; +; VI-LABEL: mul_v4half: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mul_f16_sdwa v6, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_mul_f16_e32 v1, v1, v3 +; VI-NEXT: v_mul_f16_sdwa v3, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_mul_f16_e32 v0, v0, v2 +; VI-NEXT: v_or_b32_e32 v1, v1, v6 +; VI-NEXT: v_or_b32_e32 v0, v0, v3 +; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: mul_v4half: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_mul_f16 
v1, v1, v3 +; GFX9-NEXT: v_pk_mul_f16 v0, v0, v2 +; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: mul_v4half: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_pk_mul_f16 v1, v1, v3 +; GFX10-NEXT: v_pk_mul_f16 v0, v0, v2 +; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] +; GFX10-NEXT: s_endpgm entry: %a = load <4 x half>, ptr addrspace(1) %ina, align 4 %b = load <4 x half>, ptr addrspace(1) %inb, align 4 @@ -233,29 +847,111 @@ ret void } -; GCN-LABEL: {{^}}mul_v8half: -; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} -; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} -; NOSDWA: v_mul_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; NOSDWA: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} -; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; NOSDWA-NOT: v_mul_f16_sdwa - -; VI-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, 
v{{[0-9]+}} - -; GFX9_10-DAG: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; GFX9_10-DAG: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; GFX9_10-DAG: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; GFX9_10-DAG: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} - define amdgpu_kernel void @mul_v8half(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 { +; NOSDWA-LABEL: mul_v8half: +; NOSDWA: ; %bb.0: ; %entry +; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) +; NOSDWA-NEXT: v_mov_b32_e32 v4, s6 +; NOSDWA-NEXT: v_mov_b32_e32 v5, s7 +; NOSDWA-NEXT: v_mov_b32_e32 v0, s0 +; NOSDWA-NEXT: v_mov_b32_e32 v1, s1 +; NOSDWA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; NOSDWA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] +; NOSDWA-NEXT: v_mov_b32_e32 v8, s4 +; NOSDWA-NEXT: v_mov_b32_e32 v9, s5 +; NOSDWA-NEXT: s_waitcnt vmcnt(1) +; NOSDWA-NEXT: v_lshrrev_b32_e32 v10, 16, v3 +; NOSDWA-NEXT: s_waitcnt vmcnt(0) +; NOSDWA-NEXT: v_lshrrev_b32_e32 v11, 16, v7 +; NOSDWA-NEXT: v_mul_f16_e32 v3, v7, v3 +; NOSDWA-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; NOSDWA-NEXT: v_lshrrev_b32_e32 v12, 16, v6 +; NOSDWA-NEXT: v_mul_f16_e32 v2, v6, v2 +; NOSDWA-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; NOSDWA-NEXT: v_lshrrev_b32_e32 v13, 16, v5 +; NOSDWA-NEXT: v_mul_f16_e32 v1, v5, v1 +; NOSDWA-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; NOSDWA-NEXT: v_lshrrev_b32_e32 v14, 16, v4 +; NOSDWA-NEXT: v_mul_f16_e32 v0, v4, v0 +; NOSDWA-NEXT: v_mul_f16_e32 v4, v11, v10 +; NOSDWA-NEXT: v_mul_f16_e32 v7, v12, v7 +; NOSDWA-NEXT: v_mul_f16_e32 v6, v13, v6 +; NOSDWA-NEXT: v_mul_f16_e32 v5, v14, v5 +; NOSDWA-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; NOSDWA-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; NOSDWA-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; NOSDWA-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; NOSDWA-NEXT: v_or_b32_e32 v3, v3, v4 +; NOSDWA-NEXT: v_or_b32_e32 v2, v2, v7 +; NOSDWA-NEXT: v_or_b32_e32 v1, v1, v6 +; NOSDWA-NEXT: 
v_or_b32_e32 v0, v0, v5 +; NOSDWA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; NOSDWA-NEXT: s_endpgm +; +; VI-LABEL: mul_v8half: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] +; VI-NEXT: v_mov_b32_e32 v8, s4 +; VI-NEXT: v_mov_b32_e32 v9, s5 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mul_f16_sdwa v10, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_mul_f16_e32 v3, v3, v7 +; VI-NEXT: v_mul_f16_sdwa v7, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_mul_f16_e32 v2, v2, v6 +; VI-NEXT: v_mul_f16_sdwa v6, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_mul_f16_e32 v1, v1, v5 +; VI-NEXT: v_mul_f16_sdwa v5, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_mul_f16_e32 v0, v0, v4 +; VI-NEXT: v_or_b32_e32 v3, v3, v10 +; VI-NEXT: v_or_b32_e32 v2, v2, v7 +; VI-NEXT: v_or_b32_e32 v1, v1, v6 +; VI-NEXT: v_or_b32_e32 v0, v0, v5 +; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: mul_v8half: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: v_mov_b32_e32 v8, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[6:7] +; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_mul_f16 v3, v3, v7 +; GFX9-NEXT: v_pk_mul_f16 v2, v2, v6 +; GFX9-NEXT: v_pk_mul_f16 v1, v1, v5 +; GFX9-NEXT: v_pk_mul_f16 v0, v0, v4 +; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[4:5] +; GFX9-NEXT: s_endpgm +; +; 
GFX10-LABEL: mul_v8half: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: v_mov_b32_e32 v8, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dwordx4 v[0:3], v8, s[6:7] +; GFX10-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_pk_mul_f16 v3, v3, v7 +; GFX10-NEXT: v_pk_mul_f16 v2, v2, v6 +; GFX10-NEXT: v_pk_mul_f16 v1, v1, v5 +; GFX10-NEXT: v_pk_mul_f16 v0, v0, v4 +; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[4:5] +; GFX10-NEXT: s_endpgm entry: %a = load <8 x half>, ptr addrspace(1) %ina, align 4 %b = load <8 x half>, ptr addrspace(1) %inb, align 4 @@ -264,14 +960,74 @@ ret void } -; GCN-LABEL: {{^}}mul_i8: -; NOSDWA: v_mul_lo_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; NOSDWA-NOT: v_mul_u32_u24_sdwa -; GFX89: v_mul_lo_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; GFX10: v_mul_lo_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; SDWA-NOT: v_mul_u32_u24_sdwa - define amdgpu_kernel void @mul_i8(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 { +; NOSDWA-LABEL: mul_i8: +; NOSDWA: ; %bb.0: ; %entry +; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) +; NOSDWA-NEXT: v_mov_b32_e32 v2, s7 +; NOSDWA-NEXT: v_add_u32_e32 v1, vcc, s6, v0 +; NOSDWA-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; NOSDWA-NEXT: v_mov_b32_e32 v4, s1 +; NOSDWA-NEXT: v_add_u32_e32 v3, vcc, s0, v0 +; NOSDWA-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; NOSDWA-NEXT: flat_load_ubyte v2, v[1:2] +; NOSDWA-NEXT: flat_load_ubyte v3, v[3:4] +; NOSDWA-NEXT: v_mov_b32_e32 v0, s4 +; NOSDWA-NEXT: v_mov_b32_e32 v1, s5 +; NOSDWA-NEXT: s_waitcnt vmcnt(0) +; NOSDWA-NEXT: v_mul_lo_u16_e32 v2, v2, v3 +; NOSDWA-NEXT: flat_store_byte v[0:1], v2 +; NOSDWA-NEXT: s_endpgm +; +; VI-LABEL: mul_i8: +; 
VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v0 +; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; VI-NEXT: v_mov_b32_e32 v4, s1 +; VI-NEXT: v_add_u32_e32 v3, vcc, s0, v0 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; VI-NEXT: flat_load_ubyte v2, v[1:2] +; VI-NEXT: flat_load_ubyte v3, v[3:4] +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mul_lo_u16_e32 v2, v2, v3 +; VI-NEXT: flat_store_byte v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: mul_i8: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_ubyte v1, v0, s[6:7] +; GFX9-NEXT: global_load_ubyte v2, v0, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mul_lo_u16_e32 v1, v1, v2 +; GFX9-NEXT: global_store_byte v0, v1, s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: mul_i8: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_ubyte v1, v0, s[6:7] +; GFX10-NEXT: global_load_ubyte v2, v0, s[2:3] +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mul_lo_u16 v1, v1, v2 +; GFX10-NEXT: global_store_byte v0, v1, s[4:5] +; GFX10-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() %gepa = getelementptr i8, ptr addrspace(1) %ina, i32 %idx @@ -283,27 +1039,95 @@ ret void } -; GCN-LABEL: {{^}}mul_v2i8: -; NOSDWA: v_lshrrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}} -; NOSDWA: v_lshrrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}} -; NOSDWA: v_mul_lo_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, 
v{{[0-9]+}} -; NOSDWA: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}} -; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; NOSDWA-NOT: v_mul_u32_u24_sdwa - -; VI: v_mul_lo_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 - -; GFX9-DAG: v_mul_lo_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 -; GFX9-DAG: v_mul_lo_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} - -; GFX10-DAG: v_mul_lo_u16 -; GFX10-DAG: v_mul_lo_u16 - -; GFX9: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD - -; GFX10: v_lshlrev_b16 v{{[0-9]+}}, 8, v -; GFX10: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD define amdgpu_kernel void @mul_v2i8(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 { +; NOSDWA-LABEL: mul_v2i8: +; NOSDWA: ; %bb.0: ; %entry +; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 1, v0 +; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) +; NOSDWA-NEXT: v_mov_b32_e32 v1, s7 +; NOSDWA-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; NOSDWA-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; NOSDWA-NEXT: v_mov_b32_e32 v3, s1 +; NOSDWA-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; NOSDWA-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; NOSDWA-NEXT: flat_load_ushort v4, v[0:1] +; NOSDWA-NEXT: flat_load_ushort v2, v[2:3] +; NOSDWA-NEXT: v_mov_b32_e32 v0, s4 +; NOSDWA-NEXT: v_mov_b32_e32 v1, s5 +; NOSDWA-NEXT: s_waitcnt vmcnt(1) +; NOSDWA-NEXT: v_lshrrev_b16_e32 v3, 8, v4 +; NOSDWA-NEXT: s_waitcnt vmcnt(0) +; NOSDWA-NEXT: v_lshrrev_b16_e32 v5, 8, v2 +; NOSDWA-NEXT: v_mul_lo_u16_e32 v2, v4, v2 +; NOSDWA-NEXT: v_mul_lo_u16_e32 v3, v3, v5 +; NOSDWA-NEXT: v_and_b32_e32 v2, 0xff, v2 +; NOSDWA-NEXT: v_lshlrev_b16_e32 v3, 8, v3 +; NOSDWA-NEXT: 
v_or_b32_e32 v2, v2, v3 +; NOSDWA-NEXT: flat_store_short v[0:1], v2 +; NOSDWA-NEXT: s_endpgm +; +; VI-LABEL: mul_v2i8: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: v_lshlrev_b32_e32 v2, 1, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_load_ushort v4, v[0:1] +; VI-NEXT: flat_load_ushort v2, v[2:3] +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mul_lo_u16_e32 v3, v4, v2 +; VI-NEXT: v_mul_lo_u16_sdwa v2, v4, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: flat_store_short v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: mul_v2i8: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] +; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mul_lo_u16_e32 v3, v1, v2 +; GFX9-NEXT: v_mul_lo_u16_sdwa v1, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 +; GFX9-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: global_store_short v0, v1, s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: mul_v2i8: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 
1, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] +; GFX10-NEXT: global_load_ushort v2, v0, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_lshrrev_b16 v0, 8, v1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b16 v3, 8, v2 +; GFX10-NEXT: v_mul_lo_u16 v1, v1, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: v_mul_lo_u16 v0, v0, v3 +; GFX10-NEXT: v_lshlrev_b16 v0, 8, v0 +; GFX10-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: global_store_short v2, v0, s[4:5] +; GFX10-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() %gepa = getelementptr <2 x i8>, ptr addrspace(1) %ina, i32 %idx @@ -315,28 +1139,124 @@ ret void } -; GCN-LABEL: {{^}}mul_v4i8: -; NOSDWA: v_lshrrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}} -; NOSDWA: v_lshrrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}} -; NOSDWA: v_mul_lo_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; NOSDWA: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}} -; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; NOSDWA-NOT: v_mul_u32_u24_sdwa - -; VI-DAG: v_mul_lo_u16_sdwa -; VI-DAG: v_mul_lo_u16_sdwa -; VI-DAG: v_mul_lo_u16_sdwa - -; GFX9-DAG: v_mul_lo_u16_sdwa -; GFX9-DAG: v_mul_lo_u16_sdwa -; GFX9-DAG: v_mul_lo_u16_sdwa - -; GFX10-DAG: v_mul_lo_u16 -; GFX10-DAG: v_mul_lo_u16 -; GFX10-DAG: v_mul_lo_u16 -; GFX10-DAG: v_mul_lo_u16 - define amdgpu_kernel void @mul_v4i8(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 { +; NOSDWA-LABEL: mul_v4i8: +; NOSDWA: ; %bb.0: ; %entry +; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) +; NOSDWA-NEXT: v_mov_b32_e32 v1, s7 +; NOSDWA-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; NOSDWA-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; NOSDWA-NEXT: v_mov_b32_e32 v3, s1 +; NOSDWA-NEXT: 
flat_load_dword v4, v[0:1] +; NOSDWA-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; NOSDWA-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; NOSDWA-NEXT: flat_load_dword v2, v[0:1] +; NOSDWA-NEXT: v_mov_b32_e32 v0, s4 +; NOSDWA-NEXT: v_mov_b32_e32 v1, s5 +; NOSDWA-NEXT: s_waitcnt vmcnt(1) +; NOSDWA-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; NOSDWA-NEXT: v_lshrrev_b32_e32 v5, 24, v4 +; NOSDWA-NEXT: v_lshrrev_b16_e32 v6, 8, v4 +; NOSDWA-NEXT: s_waitcnt vmcnt(0) +; NOSDWA-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; NOSDWA-NEXT: v_lshrrev_b32_e32 v8, 24, v2 +; NOSDWA-NEXT: v_lshrrev_b16_e32 v9, 8, v2 +; NOSDWA-NEXT: v_mul_lo_u16_e32 v2, v4, v2 +; NOSDWA-NEXT: v_mul_lo_u16_e32 v4, v6, v9 +; NOSDWA-NEXT: v_mul_lo_u16_e32 v5, v5, v8 +; NOSDWA-NEXT: v_mul_lo_u16_e32 v3, v3, v7 +; NOSDWA-NEXT: v_and_b32_e32 v2, 0xff, v2 +; NOSDWA-NEXT: v_lshlrev_b16_e32 v4, 8, v4 +; NOSDWA-NEXT: v_lshlrev_b16_e32 v5, 8, v5 +; NOSDWA-NEXT: v_and_b32_e32 v3, 0xff, v3 +; NOSDWA-NEXT: v_or_b32_e32 v2, v2, v4 +; NOSDWA-NEXT: v_or_b32_e32 v3, v3, v5 +; NOSDWA-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; NOSDWA-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; NOSDWA-NEXT: v_or_b32_e32 v2, v2, v3 +; NOSDWA-NEXT: flat_store_dword v[0:1], v2 +; NOSDWA-NEXT: s_endpgm +; +; VI-LABEL: mul_v4i8: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_load_dword v4, v[0:1] +; VI-NEXT: flat_load_dword v2, v[2:3] +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mul_lo_u16_e32 v3, v4, v2 +; VI-NEXT: v_mul_lo_u16_sdwa v5, v4, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 +; 
VI-NEXT: v_mul_lo_u16_sdwa v6, v4, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 +; VI-NEXT: v_mul_lo_u16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: mul_v4i8: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mul_lo_u16_e32 v3, v1, v2 +; GFX9-NEXT: v_mul_lo_u16_sdwa v4, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 +; GFX9-NEXT: v_mul_lo_u16_sdwa v5, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 +; GFX9-NEXT: v_mul_lo_u16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: mul_v4i8: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: 
s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dword v1, v0, s[6:7] +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v1 +; GFX10-NEXT: v_lshrrev_b16 v3, 8, v1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b16 v4, 8, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; GFX10-NEXT: v_mul_lo_u16 v1, v1, v2 +; GFX10-NEXT: v_mul_lo_u16 v3, v3, v4 +; GFX10-NEXT: v_mul_lo_u16 v0, v0, v5 +; GFX10-NEXT: v_mul_lo_u16 v2, v6, v7 +; GFX10-NEXT: v_lshlrev_b16 v3, 8, v3 +; GFX10-NEXT: v_lshlrev_b16 v0, 8, v0 +; GFX10-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-NEXT: global_store_dword v2, v0, s[4:5] +; GFX10-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() %gepa = getelementptr <4 x i8>, ptr addrspace(1) %ina, i32 %idx @@ -348,38 +1268,172 @@ ret void } -; GCN-LABEL: {{^}}mul_v8i8: -; NOSDWA: v_lshrrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}} -; NOSDWA: v_lshrrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}} -; NOSDWA: v_mul_lo_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; NOSDWA: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}} -; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; NOSDWA-NOT: v_mul_u32_u24_sdwa - -; VI-DAG: v_mul_lo_u16_sdwa -; VI-DAG: v_mul_lo_u16_sdwa -; VI-DAG: v_mul_lo_u16_sdwa -; VI-DAG: v_mul_lo_u16_sdwa -; VI-DAG: v_mul_lo_u16_sdwa -; VI-DAG: v_mul_lo_u16_sdwa - -; GFX9-DAG: v_mul_lo_u16_sdwa -; GFX9-DAG: v_mul_lo_u16_sdwa -; GFX9-DAG: v_mul_lo_u16_sdwa -; GFX9-DAG: v_mul_lo_u16_sdwa -; GFX9-DAG: v_mul_lo_u16_sdwa -; GFX9-DAG: 
v_mul_lo_u16_sdwa - -; GFX10-DAG: v_mul_lo_u16 -; GFX10-DAG: v_mul_lo_u16 -; GFX10-DAG: v_mul_lo_u16 -; GFX10-DAG: v_mul_lo_u16 -; GFX10-DAG: v_mul_lo_u16 -; GFX10-DAG: v_mul_lo_u16 -; GFX10-DAG: v_mul_lo_u16 -; GFX10-DAG: v_mul_lo_u16 - define amdgpu_kernel void @mul_v8i8(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 { +; NOSDWA-LABEL: mul_v8i8: +; NOSDWA: ; %bb.0: ; %entry +; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) +; NOSDWA-NEXT: v_mov_b32_e32 v1, s7 +; NOSDWA-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; NOSDWA-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; NOSDWA-NEXT: v_mov_b32_e32 v3, s1 +; NOSDWA-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; NOSDWA-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; NOSDWA-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; NOSDWA-NEXT: flat_load_dwordx2 v[2:3], v[2:3] +; NOSDWA-NEXT: v_mov_b32_e32 v4, s4 +; NOSDWA-NEXT: v_mov_b32_e32 v5, s5 +; NOSDWA-NEXT: s_waitcnt vmcnt(1) +; NOSDWA-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; NOSDWA-NEXT: v_lshrrev_b32_e32 v7, 24, v0 +; NOSDWA-NEXT: v_lshrrev_b16_e32 v8, 8, v0 +; NOSDWA-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; NOSDWA-NEXT: v_lshrrev_b32_e32 v10, 24, v1 +; NOSDWA-NEXT: v_lshrrev_b16_e32 v11, 8, v1 +; NOSDWA-NEXT: s_waitcnt vmcnt(0) +; NOSDWA-NEXT: v_lshrrev_b32_e32 v12, 16, v2 +; NOSDWA-NEXT: v_lshrrev_b32_e32 v13, 24, v2 +; NOSDWA-NEXT: v_lshrrev_b16_e32 v14, 8, v2 +; NOSDWA-NEXT: v_lshrrev_b32_e32 v15, 16, v3 +; NOSDWA-NEXT: v_lshrrev_b32_e32 v16, 24, v3 +; NOSDWA-NEXT: v_lshrrev_b16_e32 v17, 8, v3 +; NOSDWA-NEXT: v_mul_lo_u16_e32 v1, v1, v3 +; NOSDWA-NEXT: v_mul_lo_u16_e32 v0, v0, v2 +; NOSDWA-NEXT: v_mul_lo_u16_e32 v2, v11, v17 +; NOSDWA-NEXT: v_mul_lo_u16_e32 v3, v10, v16 +; NOSDWA-NEXT: v_mul_lo_u16_e32 v9, v9, v15 +; NOSDWA-NEXT: v_mul_lo_u16_e32 v8, v8, v14 +; NOSDWA-NEXT: v_mul_lo_u16_e32 v7, v7, v13 +; NOSDWA-NEXT: v_mul_lo_u16_e32 v6, v6, 
v12 +; NOSDWA-NEXT: v_and_b32_e32 v1, 0xff, v1 +; NOSDWA-NEXT: v_and_b32_e32 v0, 0xff, v0 +; NOSDWA-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; NOSDWA-NEXT: v_lshlrev_b16_e32 v3, 8, v3 +; NOSDWA-NEXT: v_and_b32_e32 v9, 0xff, v9 +; NOSDWA-NEXT: v_lshlrev_b16_e32 v8, 8, v8 +; NOSDWA-NEXT: v_lshlrev_b16_e32 v7, 8, v7 +; NOSDWA-NEXT: v_and_b32_e32 v6, 0xff, v6 +; NOSDWA-NEXT: v_or_b32_e32 v1, v1, v2 +; NOSDWA-NEXT: v_or_b32_e32 v2, v9, v3 +; NOSDWA-NEXT: v_or_b32_e32 v0, v0, v8 +; NOSDWA-NEXT: v_or_b32_e32 v3, v6, v7 +; NOSDWA-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; NOSDWA-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; NOSDWA-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; NOSDWA-NEXT: v_or_b32_e32 v1, v1, v2 +; NOSDWA-NEXT: v_or_b32_e32 v0, v0, v3 +; NOSDWA-NEXT: flat_store_dwordx2 v[4:5], v[0:1] +; NOSDWA-NEXT: s_endpgm +; +; VI-LABEL: mul_v8i8: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mul_lo_u16_e32 v6, v1, v3 +; VI-NEXT: v_mul_lo_u16_sdwa v7, v1, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 +; VI-NEXT: v_mul_lo_u16_sdwa v8, v1, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 +; VI-NEXT: v_mul_lo_u16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_mul_lo_u16_e32 v3, v0, v2 +; VI-NEXT: v_mul_lo_u16_sdwa v9, v0, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD 
src0_sel:BYTE_1 src1_sel:BYTE_1 +; VI-NEXT: v_mul_lo_u16_sdwa v10, v0, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 +; VI-NEXT: v_mul_lo_u16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_sdwa v2, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: mul_v8i8: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mul_lo_u16_e32 v5, v1, v3 +; GFX9-NEXT: v_mul_lo_u16_sdwa v6, v1, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 +; GFX9-NEXT: v_mul_lo_u16_sdwa v7, v1, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 +; GFX9-NEXT: v_mul_lo_u16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: v_mul_lo_u16_e32 v3, v0, v2 +; GFX9-NEXT: v_mul_lo_u16_sdwa v8, v0, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 +; GFX9-NEXT: v_mul_lo_u16_sdwa v9, v0, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 +; 
GFX9-NEXT: v_mul_lo_u16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: v_or_b32_sdwa v2, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: mul_v8i8: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v0 +; GFX10-NEXT: v_lshrrev_b16 v6, 8, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v8, 24, v1 +; GFX10-NEXT: v_lshrrev_b16 v9, 8, v1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v11, 24, v2 +; GFX10-NEXT: v_lshrrev_b16 v12, 8, v2 +; GFX10-NEXT: v_lshrrev_b16 v13, 8, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v14, 24, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v15, 16, v3 +; GFX10-NEXT: v_mul_lo_u16 v1, v1, v3 +; GFX10-NEXT: v_mul_lo_u16 v0, v0, v2 +; GFX10-NEXT: v_mul_lo_u16 v2, v9, v13 +; GFX10-NEXT: v_mul_lo_u16 v3, v8, v14 +; GFX10-NEXT: v_mul_lo_u16 v6, 
v6, v12 +; GFX10-NEXT: v_mul_lo_u16 v5, v5, v11 +; GFX10-NEXT: v_mul_lo_u16 v7, v7, v15 +; GFX10-NEXT: v_mul_lo_u16 v4, v4, v10 +; GFX10-NEXT: v_lshlrev_b16 v2, 8, v2 +; GFX10-NEXT: v_lshlrev_b16 v3, 8, v3 +; GFX10-NEXT: v_lshlrev_b16 v6, 8, v6 +; GFX10-NEXT: v_lshlrev_b16 v5, 8, v5 +; GFX10-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v2, v7, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v3, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] +; GFX10-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() %gepa = getelementptr <8 x i8>, ptr addrspace(1) %ina, i32 %idx @@ -391,17 +1445,67 @@ ret void } -; GCN-LABEL: {{^}}sitofp_v2i16_to_v2f16: -; NOSDWA-DAG: v_cvt_f16_i16_e32 v{{[0-9]+}}, v{{[0-9]+}} -; NOSDWA-DAG: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} -; NOSDWA-DAG: v_cvt_f16_i16_e32 v{{[0-9]+}}, v{{[0-9]+}} -; NOSDWA-NOT: v_cvt_f16_i16_sdwa - -; SDWA-DAG: v_cvt_f16_i16_e32 v{{[0-9]+}}, v{{[0-9]+}} -; SDWA-DAG: v_cvt_f16_i16_sdwa v{{[0-9]+}}, v{{[0-9]+}} dst_sel:{{(WORD_1|DWORD)?}} dst_unused:UNUSED_PAD src0_sel:WORD_1 - -; FIXME: Should be able to avoid or define amdgpu_kernel void @sitofp_v2i16_to_v2f16( +; NOSDWA-LABEL: sitofp_v2i16_to_v2f16: +; NOSDWA: ; %bb.0: ; %entry +; NOSDWA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) +; NOSDWA-NEXT: v_mov_b32_e32 v0, s2 +; NOSDWA-NEXT: v_mov_b32_e32 v1, s3 +; NOSDWA-NEXT: flat_load_dword v2, v[0:1] +; 
NOSDWA-NEXT: v_mov_b32_e32 v0, s0 +; NOSDWA-NEXT: v_mov_b32_e32 v1, s1 +; NOSDWA-NEXT: s_waitcnt vmcnt(0) +; NOSDWA-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; NOSDWA-NEXT: v_cvt_f16_i16_e32 v3, v3 +; NOSDWA-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; NOSDWA-NEXT: v_cvt_f16_i16_e32 v2, v2 +; NOSDWA-NEXT: v_or_b32_e32 v2, v2, v3 +; NOSDWA-NEXT: flat_store_dword v[0:1], v2 +; NOSDWA-NEXT: s_endpgm +; +; VI-LABEL: sitofp_v2i16_to_v2f16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: flat_load_dword v2, v[0:1] +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cvt_f16_i16_sdwa v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f16_i16_e32 v2, v2 +; VI-NEXT: v_or_b32_e32 v2, v2, v3 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: sitofp_v2i16_to_v2f16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cvt_f16_i16_e32 v2, v1 +; GFX9-NEXT: v_cvt_f16_i16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: sitofp_v2i16_to_v2f16: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cvt_f16_i16_e32 v2, v1 +; GFX10-NEXT: v_cvt_f16_i16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v2 +; GFX10-NEXT: global_store_dword 
v0, v1, s[0:1] +; GFX10-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) #0 { entry: @@ -411,22 +1515,83 @@ ret void } - -; GCN-LABEL: {{^}}mac_v2half: -; NOSDWA: v_lshrrev_b32_e32 v[[DST1:[0-9]+]], 16, v{{[0-9]+}} -; NOSDWA: v_lshrrev_b32_e32 v[[DST0:[0-9]+]], 16, v{{[0-9]+}} -; NOSDWA: v_mac_f16_e32 v[[DST_MAC:[0-9]+]], v[[DST0]], v[[DST1]] -; NOSDWA: v_lshlrev_b32_e32 v[[DST_SHL:[0-9]+]], 16, v[[DST_MAC]] -; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[DST_SHL]] -; NOSDWA-NOT: v_mac_f16_sdwa - -; VI: v_mac_f16_sdwa v[[DST_MAC:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI: v_lshlrev_b32_e32 v[[DST_SHL:[0-9]+]], 16, v[[DST_MAC]] - -; GFX9_10: v_pk_mul_f16 v[[DST_MUL:[0-9]+]], v{{[0-9]+}}, v[[SRC:[0-9]+]] -; GFX9_10: v_pk_add_f16 v{{[0-9]+}}, v[[DST_MUL]], v[[SRC]] - define amdgpu_kernel void @mac_v2half(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 { +; NOSDWA-LABEL: mac_v2half: +; NOSDWA: ; %bb.0: ; %entry +; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) +; NOSDWA-NEXT: v_mov_b32_e32 v0, s6 +; NOSDWA-NEXT: v_mov_b32_e32 v2, s0 +; NOSDWA-NEXT: v_mov_b32_e32 v3, s1 +; NOSDWA-NEXT: v_mov_b32_e32 v1, s7 +; NOSDWA-NEXT: flat_load_dword v2, v[2:3] +; NOSDWA-NEXT: flat_load_dword v3, v[0:1] +; NOSDWA-NEXT: v_mov_b32_e32 v0, s4 +; NOSDWA-NEXT: v_mov_b32_e32 v1, s5 +; NOSDWA-NEXT: s_waitcnt vmcnt(1) +; NOSDWA-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; NOSDWA-NEXT: s_waitcnt vmcnt(0) +; NOSDWA-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; NOSDWA-NEXT: v_mac_f16_e32 v4, v5, v4 +; NOSDWA-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; NOSDWA-NEXT: v_mac_f16_e32 v2, v3, v2 +; NOSDWA-NEXT: v_or_b32_e32 v2, v2, v4 +; NOSDWA-NEXT: flat_store_dword v[0:1], v2 +; NOSDWA-NEXT: s_endpgm +; +; VI-LABEL: mac_v2half: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 
s[0:1], s[0:1], 0x34 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: flat_load_dword v2, v[2:3] +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mac_f16_sdwa v4, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; VI-NEXT: v_mac_f16_e32 v2, v3, v2 +; VI-NEXT: v_or_b32_e32 v2, v2, v4 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: mac_v2half: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_mul_f16 v1, v1, v2 +; GFX9-NEXT: v_pk_add_f16 v1, v1, v2 +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: mac_v2half: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dword v1, v0, s[6:7] +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_pk_mul_f16 v1, v1, v2 +; GFX10-NEXT: v_pk_add_f16 v1, v1, v2 +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: s_endpgm entry: %a = load <2 x half>, ptr addrspace(1) %ina, align 4 %b = load <2 x half>, ptr addrspace(1) %inb, align 4 @@ -436,18 +1601,70 @@ ret void } -; GCN-LABEL: {{^}}immediate_mul_v2i16: -; NOSDWA-NOT: 
v_mul_u32_u24_sdwa -; VI-DAG: v_mov_b32_e32 v[[M321:[0-9]+]], 0x141 -; VI-DAG: v_mul_lo_u16_e32 v{{[0-9]+}}, 0x7b, v{{[0-9]+}} -; VI-DAG: v_mul_lo_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[M321]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD - -; GFX9: s_mov_b32 s[[IMM:[0-9]+]], 0x141007b -; GFX9: v_pk_mul_lo_u16 v{{[0-9]+}}, v{{[0-9]+}}, s[[IMM]] - -; GFX10: v_pk_mul_lo_u16 v{{[0-9]+}}, 0x141007b, v{{[0-9]+}} - define amdgpu_kernel void @immediate_mul_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; NOSDWA-LABEL: immediate_mul_v2i16: +; NOSDWA: ; %bb.0: ; %entry +; NOSDWA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; NOSDWA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) +; NOSDWA-NEXT: v_mov_b32_e32 v1, s3 +; NOSDWA-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; NOSDWA-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; NOSDWA-NEXT: flat_load_dword v2, v[0:1] +; NOSDWA-NEXT: v_mov_b32_e32 v0, s0 +; NOSDWA-NEXT: v_mov_b32_e32 v1, s1 +; NOSDWA-NEXT: s_waitcnt vmcnt(0) +; NOSDWA-NEXT: v_mul_lo_u16_e32 v3, 0x7b, v2 +; NOSDWA-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; NOSDWA-NEXT: v_mul_lo_u16_e32 v2, 0x141, v2 +; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; NOSDWA-NEXT: v_or_b32_e32 v2, v3, v2 +; NOSDWA-NEXT: flat_store_dword v[0:1], v2 +; NOSDWA-NEXT: s_endpgm +; +; VI-LABEL: immediate_mul_v2i16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: v_mov_b32_e32 v3, 0x141 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v2, v[0:1] +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mul_lo_u16_e32 v4, 0x7b, v2 +; VI-NEXT: v_mul_lo_u16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v4, v2 +; VI-NEXT: 
flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: immediate_mul_v2i16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v0, v0, s[2:3] +; GFX9-NEXT: s_mov_b32 s2, 0x141007b +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, s2 +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: immediate_mul_v2i16: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_pk_mul_lo_u16 v0, 0x141007b, v0 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <2 x i16>, ptr addrspace(1) %in, i32 %idx @@ -457,21 +1674,94 @@ ret void } -; Double use of same src - should not convert it -; GCN-LABEL: {{^}}mulmul_v2i16: -; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} -; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} -; NOSDWA: v_mul_lo_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; NOSDWA: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} -; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; NOSDWA-NOT: v_mul_u32_u24_sdwa - -; VI: v_mul_lo_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD - -; GFX9_10: v_pk_mul_lo_u16 v[[DST1:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} -; GFX9_10: v_pk_mul_lo_u16 v{{[0-9]+}}, v[[DST1]], v{{[0-9]+}} - define amdgpu_kernel void @mulmul_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 { +; NOSDWA-LABEL: mulmul_v2i16: +; NOSDWA: ; %bb.0: ; %entry +; NOSDWA-NEXT: s_load_dwordx4 s[4:7], 
s[0:1], 0x24 +; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) +; NOSDWA-NEXT: v_mov_b32_e32 v1, s7 +; NOSDWA-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; NOSDWA-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; NOSDWA-NEXT: v_mov_b32_e32 v3, s1 +; NOSDWA-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; NOSDWA-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; NOSDWA-NEXT: flat_load_dword v2, v[2:3] +; NOSDWA-NEXT: flat_load_dword v3, v[0:1] +; NOSDWA-NEXT: v_mov_b32_e32 v0, s4 +; NOSDWA-NEXT: v_mov_b32_e32 v1, s5 +; NOSDWA-NEXT: s_waitcnt vmcnt(1) +; NOSDWA-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; NOSDWA-NEXT: s_waitcnt vmcnt(0) +; NOSDWA-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; NOSDWA-NEXT: v_mul_lo_u16_e32 v3, v3, v2 +; NOSDWA-NEXT: v_mul_lo_u16_e32 v5, v5, v4 +; NOSDWA-NEXT: v_mul_lo_u16_e32 v2, v3, v2 +; NOSDWA-NEXT: v_mul_lo_u16_e32 v3, v5, v4 +; NOSDWA-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; NOSDWA-NEXT: v_or_b32_e32 v2, v2, v3 +; NOSDWA-NEXT: flat_store_dword v[0:1], v2 +; NOSDWA-NEXT: s_endpgm +; +; VI-LABEL: mulmul_v2i16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_load_dword v2, v[2:3] +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mul_lo_u16_e32 v5, v3, v2 +; VI-NEXT: v_mul_lo_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mul_lo_u16_e32 v2, v5, v2 +; VI-NEXT: v_mul_lo_u16_sdwa 
v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v2, v3 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: mulmul_v2i16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v2 +; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v2 +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: mulmul_v2i16: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dword v1, v0, s[6:7] +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_pk_mul_lo_u16 v0, v1, v2 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_pk_mul_lo_u16 v0, v0, v2 +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() %gepa = getelementptr <2 x i16>, ptr addrspace(1) %ina, i32 %idx @@ -484,16 +1774,79 @@ ret void } -; GCN-LABEL: {{^}}add_bb_v2i16: -; NOSDWA-NOT: v_add_{{(_co)?}}_u32_sdwa - -; VI: v_add_u32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_add_u32_e32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} -; VI-NEXT: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD - -; GFX9_10: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} - define amdgpu_kernel void 
@add_bb_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 { +; NOSDWA-LABEL: add_bb_v2i16: +; NOSDWA: ; %bb.0: ; %entry +; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) +; NOSDWA-NEXT: v_mov_b32_e32 v0, s6 +; NOSDWA-NEXT: v_mov_b32_e32 v1, s7 +; NOSDWA-NEXT: v_mov_b32_e32 v2, s0 +; NOSDWA-NEXT: v_mov_b32_e32 v3, s1 +; NOSDWA-NEXT: flat_load_dword v1, v[0:1] +; NOSDWA-NEXT: flat_load_dword v2, v[2:3] +; NOSDWA-NEXT: v_mov_b32_e32 v0, s4 +; NOSDWA-NEXT: s_waitcnt vmcnt(1) +; NOSDWA-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; NOSDWA-NEXT: s_waitcnt vmcnt(0) +; NOSDWA-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; NOSDWA-NEXT: v_add_u32_e32 v1, vcc, v1, v2 +; NOSDWA-NEXT: v_add_u32_e32 v2, vcc, v3, v4 +; NOSDWA-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; NOSDWA-NEXT: v_or_b32_e32 v2, v1, v2 +; NOSDWA-NEXT: v_mov_b32_e32 v1, s5 +; NOSDWA-NEXT: flat_store_dword v[0:1], v2 +; NOSDWA-NEXT: s_endpgm +; +; VI-LABEL: add_bb_v2i16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: flat_load_dword v1, v[0:1] +; VI-NEXT: flat_load_dword v2, v[2:3] +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_sdwa v3, vcc, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v2 +; VI-NEXT: v_or_b32_sdwa v2, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: add_bb_v2i16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 
s[2:3], s[0:1], 0x34 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_add_u16 v1, v1, v2 +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: add_bb_v2i16: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dword v1, v0, s[6:7] +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_pk_add_u16 v1, v1, v2 +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: s_endpgm entry: %a = load <2 x i16>, ptr addrspace(1) %ina, align 4 %b = load <2 x i16>, ptr addrspace(1) %inb, align 4 @@ -506,39 +1859,124 @@ ret void } - -; Check that "pulling out" SDWA operands works correctly. 
-; GCN-LABEL: {{^}}pulled_out_test: -; NOSDWA-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xff, v{{[0-9]+}} -; NOSDWA-DAG: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}} -; NOSDWA-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xff, v{{[0-9]+}} -; NOSDWA-DAG: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}} -; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; NOSDWA-NOT: v_and_b32_sdwa -; NOSDWA-NOT: v_or_b32_sdwa - -; VI-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX89-DAG: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}} -; -; GFX10-DAG: v_lshrrev_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; -; VI-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX89-DAG: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}} +define amdgpu_kernel void @pulled_out_test(ptr addrspace(1) %sourceA, ptr addrspace(1) %destValues) #0 { +; NOSDWA-LABEL: pulled_out_test: +; NOSDWA: ; %bb.0: ; %entry +; NOSDWA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) +; NOSDWA-NEXT: v_mov_b32_e32 v0, s0 +; NOSDWA-NEXT: v_mov_b32_e32 v1, s1 +; NOSDWA-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; NOSDWA-NEXT: v_mov_b32_e32 v2, s2 +; NOSDWA-NEXT: v_mov_b32_e32 v3, s3 +; NOSDWA-NEXT: s_waitcnt vmcnt(0) +; NOSDWA-NEXT: v_lshrrev_b64 v[4:5], 24, v[0:1] 
+; NOSDWA-NEXT: v_and_b32_e32 v6, 0xff, v0 +; NOSDWA-NEXT: v_lshrrev_b32_e32 v7, 8, v0 +; NOSDWA-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; NOSDWA-NEXT: v_and_b32_e32 v5, 0xff, v1 +; NOSDWA-NEXT: v_lshrrev_b32_e32 v8, 8, v1 +; NOSDWA-NEXT: v_lshrrev_b32_e32 v9, 24, v1 +; NOSDWA-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; NOSDWA-NEXT: v_lshlrev_b16_e32 v7, 8, v7 +; NOSDWA-NEXT: v_and_b32_e32 v0, 0xff, v0 +; NOSDWA-NEXT: v_lshlrev_b16_e32 v8, 8, v8 +; NOSDWA-NEXT: v_lshlrev_b16_e32 v9, 8, v9 +; NOSDWA-NEXT: v_and_b32_e32 v1, 0xff, v1 +; NOSDWA-NEXT: v_lshlrev_b16_e32 v4, 8, v4 +; NOSDWA-NEXT: v_or_b32_e32 v6, v6, v7 +; NOSDWA-NEXT: v_or_b32_e32 v5, v5, v8 +; NOSDWA-NEXT: v_or_b32_e32 v1, v1, v9 +; NOSDWA-NEXT: v_or_b32_e32 v0, v0, v4 +; NOSDWA-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; NOSDWA-NEXT: v_and_b32_e32 v4, 0xffff, v5 +; NOSDWA-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; NOSDWA-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; NOSDWA-NEXT: v_or_b32_e32 v0, v6, v0 +; NOSDWA-NEXT: v_or_b32_e32 v1, v4, v1 +; NOSDWA-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; NOSDWA-NEXT: s_endpgm ; -; GFX10-DAG: v_lshrrev_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-LABEL: pulled_out_test: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: v_mov_b32_e32 v6, 8 +; VI-NEXT: v_mov_b32_e32 v7, 0xff +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b64 v[4:5], 24, v[0:1] +; VI-NEXT: v_lshrrev_b32_sdwa v8, v6, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_lshrrev_b32_sdwa v6, v6, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_lshrrev_b32_e32 v9, 24, v1 +; VI-NEXT: v_and_b32_sdwa v5, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_sdwa v7, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v6, 8, v9 +; VI-NEXT: v_lshlrev_b16_e32 v4, 8, v4 +; VI-NEXT: v_or_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_endpgm ; -; GFX89: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-LABEL: pulled_out_test: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] +; GFX9-NEXT: s_movk_i32 s0, 0xff +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b64 v[2:3], 24, v[0:1] +; GFX9-NEXT: v_lshrrev_b32_sdwa v6, v5, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b32_sdwa v5, v5, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX9-NEXT: v_and_b32_sdwa v3, v0, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v8, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 
src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v5, 8, v7 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v8, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[2:3] +; GFX9-NEXT: s_endpgm ; -; GFX10: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD - -define amdgpu_kernel void @pulled_out_test(ptr addrspace(1) %sourceA, ptr addrspace(1) %destValues) #0 { +; GFX10-LABEL: pulled_out_test: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: v_mov_b32_e32 v5, 8 +; GFX10-NEXT: v_mov_b32_e32 v6, 0xff +; GFX10-NEXT: v_mov_b32_e32 v7, 24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b64 v[2:3], 24, v[0:1] +; GFX10-NEXT: v_lshrrev_b32_sdwa v3, v5, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v8, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: 
v_lshrrev_b32_sdwa v5, v5, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_sdwa v7, v7, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v6, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b16 v2, 8, v2 +; GFX10-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v3, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v2, v8, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[2:3] +; GFX10-NEXT: s_endpgm entry: %idxprom = ashr exact i64 15, 32 %arrayidx = getelementptr inbounds <8 x i8>, ptr addrspace(1) %sourceA, i64 %idxprom @@ -571,15 +2009,73 @@ ret void } -; GCN-LABEL: {{^}}sdwa_crash_inlineasm_def: -; GCN: s_mov_b32 s{{[0-9]+}}, 0xffff -; GCN: v_and_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} +define amdgpu_kernel void @sdwa_crash_inlineasm_def() #0 { +; NOSDWA-LABEL: sdwa_crash_inlineasm_def: +; NOSDWA: ; %bb.0: ; %bb +; NOSDWA-NEXT: s_mov_b32 s0, 0xffff +; NOSDWA-NEXT: ;;#ASMSTART +; NOSDWA-NEXT: v_and_b32_e32 v0, s0, v0 +; NOSDWA-NEXT: ;;#ASMEND +; NOSDWA-NEXT: v_or_b32_e32 v0, 0x10000, v0 +; NOSDWA-NEXT: s_and_b64 vcc, exec, -1 +; NOSDWA-NEXT: .LBB21_1: ; %bb1 +; NOSDWA-NEXT: ; =>This Inner Loop Header: Depth=1 +; NOSDWA-NEXT: flat_store_dwordx2 v[0:1], v[0:1] +; NOSDWA-NEXT: s_waitcnt vmcnt(0) +; NOSDWA-NEXT: s_mov_b64 vcc, vcc +; NOSDWA-NEXT: s_cbranch_vccnz .LBB21_1 +; NOSDWA-NEXT: ; %bb.2: ; 
%DummyReturnBlock +; NOSDWA-NEXT: s_endpgm ; -; TODO: Why is the constant not peepholed into the v_or_b32_e32? +; VI-LABEL: sdwa_crash_inlineasm_def: +; VI: ; %bb.0: ; %bb +; VI-NEXT: s_mov_b32 s0, 0xffff +; VI-NEXT: ;;#ASMSTART +; VI-NEXT: v_and_b32_e32 v0, s0, v0 +; VI-NEXT: ;;#ASMEND +; VI-NEXT: v_or_b32_e32 v0, 0x10000, v0 +; VI-NEXT: s_and_b64 vcc, exec, -1 +; VI-NEXT: .LBB21_1: ; %bb1 +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: flat_store_dwordx2 v[0:1], v[0:1] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_mov_b64 vcc, vcc +; VI-NEXT: s_cbranch_vccnz .LBB21_1 +; VI-NEXT: ; %bb.2: ; %DummyReturnBlock +; VI-NEXT: s_endpgm ; -; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, 0x10000, -; SDWA: v_or_b32_e32 v{{[0-9]+}}, 0x10000, -define amdgpu_kernel void @sdwa_crash_inlineasm_def() #0 { +; GFX9-LABEL: sdwa_crash_inlineasm_def: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_mov_b32 s0, 0xffff +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: v_or_b32_e32 v0, 0x10000, v0 +; GFX9-NEXT: s_and_b64 vcc, exec, -1 +; GFX9-NEXT: .LBB21_1: ; %bb1 +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: global_store_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_mov_b64 vcc, vcc +; GFX9-NEXT: s_cbranch_vccnz .LBB21_1 +; GFX9-NEXT: ; %bb.2: ; %DummyReturnBlock +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: sdwa_crash_inlineasm_def: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_mov_b32 s0, 0xffff +; GFX10-NEXT: ;;#ASMSTART +; GFX10-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX10-NEXT: ;;#ASMEND +; GFX10-NEXT: v_or_b32_e32 v0, 0x10000, v0 +; GFX10-NEXT: s_mov_b32 vcc_lo, exec_lo +; GFX10-NEXT: .LBB21_1: ; %bb1 +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: global_store_dwordx2 v[0:1], v[0:1], off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_cbranch_vccnz .LBB21_1 +; GFX10-NEXT: ; %bb.2: ; %DummyReturnBlock +; GFX10-NEXT: s_endpgm bb: br label %bb1 Index: 
llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll +++ llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll @@ -1283,12 +1283,10 @@ ; GCN-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NEXT: ; implicit-def: $sgpr4 ; GCN-NEXT: ; implicit-def: $sgpr4 -; GCN-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $exec ; GCN-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GCN-NEXT: v_mov_b32_e32 v1, v3 ; GCN-NEXT: ; implicit-def: $sgpr4 ; GCN-NEXT: ; implicit-def: $sgpr4 -; GCN-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $exec ; GCN-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GCN-NEXT: v_mov_b32_e32 v3, v5 ; GCN-NEXT: ; implicit-def: $sgpr4_sgpr5 Index: llvm/test/CodeGen/AMDGPU/shl.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/shl.ll +++ llvm/test/CodeGen/AMDGPU/shl.ll @@ -83,9 +83,9 @@ ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, v7, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, v6, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, v5, v1 +; SI-NEXT: v_lshl_b32_e32 v3, v3, v7 +; SI-NEXT: v_lshl_b32_e32 v2, v2, v6 +; SI-NEXT: v_lshl_b32_e32 v1, v1, v5 ; SI-NEXT: v_lshl_b32_e32 v0, v0, v4 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm @@ -153,7 +153,7 @@ ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshl_b32_e32 v0, v0, v1 ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; Index: llvm/test/CodeGen/AMDGPU/sra.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/sra.ll +++ llvm/test/CodeGen/AMDGPU/sra.ll @@ -85,9 +85,9 @@ ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: 
v_ashrrev_i32_e32 v3, v7, v3 -; SI-NEXT: v_ashrrev_i32_e32 v2, v6, v2 -; SI-NEXT: v_ashrrev_i32_e32 v1, v5, v1 +; SI-NEXT: v_ashr_i32_e32 v3, v3, v7 +; SI-NEXT: v_ashr_i32_e32 v2, v2, v6 +; SI-NEXT: v_ashr_i32_e32 v1, v1, v5 ; SI-NEXT: v_ashr_i32_e32 v0, v0, v4 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm Index: llvm/test/CodeGen/AMDGPU/srem64.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/srem64.ll +++ llvm/test/CodeGen/AMDGPU/srem64.ll @@ -28,8 +28,8 @@ ; GCN-NEXT: v_mul_hi_u32 v3, s0, v0 ; GCN-NEXT: v_mul_lo_u32 v5, s1, v0 ; GCN-NEXT: v_mul_lo_u32 v4, s0, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; GCN-NEXT: v_mul_hi_u32 v3, v0, v4 ; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 ; GCN-NEXT: v_mul_hi_u32 v7, v0, v2 @@ -49,7 +49,7 @@ ; GCN-NEXT: v_mul_lo_u32 v2, s0, v1 ; GCN-NEXT: v_mul_hi_u32 v3, s0, v0 ; GCN-NEXT: v_mul_lo_u32 v4, s1, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GCN-NEXT: v_mul_lo_u32 v3, s0, v0 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GCN-NEXT: v_mul_lo_u32 v6, v0, v2 @@ -909,8 +909,8 @@ ; GCN-NEXT: v_mul_hi_u32 v3, s0, v0 ; GCN-NEXT: v_mul_lo_u32 v5, s1, v0 ; GCN-NEXT: v_mul_lo_u32 v4, s0, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; GCN-NEXT: v_mul_hi_u32 v3, v0, v4 ; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 ; GCN-NEXT: v_mul_hi_u32 v6, v0, v2 @@ -930,7 +930,7 @@ ; GCN-NEXT: v_mul_lo_u32 v2, s0, v1 ; GCN-NEXT: v_mul_hi_u32 v3, s0, v0 ; GCN-NEXT: v_mul_lo_u32 v4, s1, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GCN-NEXT: v_mul_lo_u32 v3, s0, v0 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GCN-NEXT: v_mul_lo_u32 v6, v0, v2 @@ 
-1148,7 +1148,7 @@ ; GCN-NEXT: v_add_i32_e32 v1, vcc, v4, v1 ; GCN-NEXT: v_mul_lo_u32 v0, v1, v0 ; GCN-NEXT: s_mov_b32 s1, s5 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, v2, v0 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -1303,8 +1303,8 @@ ; GCN-NEXT: v_mul_hi_u32 v3, s2, v0 ; GCN-NEXT: v_mul_lo_u32 v5, s3, v0 ; GCN-NEXT: v_mul_lo_u32 v4, s2, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; GCN-NEXT: v_mul_hi_u32 v3, v0, v4 ; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 ; GCN-NEXT: v_mul_hi_u32 v7, v0, v2 @@ -1324,7 +1324,7 @@ ; GCN-NEXT: v_mul_lo_u32 v2, s2, v1 ; GCN-NEXT: v_mul_hi_u32 v3, s2, v0 ; GCN-NEXT: v_mul_lo_u32 v4, s3, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GCN-NEXT: v_mul_lo_u32 v3, s2, v0 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GCN-NEXT: v_mul_lo_u32 v6, v0, v2 Index: llvm/test/CodeGen/AMDGPU/srl.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/srl.ll +++ llvm/test/CodeGen/AMDGPU/srl.ll @@ -20,7 +20,7 @@ ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshr_b32_e32 v0, v0, v1 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -138,9 +138,9 @@ ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshr_b32_e32 v3, v3, v7 -; SI-NEXT: v_lshrrev_b32_e32 v2, v6, v2 -; SI-NEXT: v_lshrrev_b32_e32 v1, v5, v1 -; SI-NEXT: v_lshrrev_b32_e32 v0, v4, v0 +; SI-NEXT: v_lshr_b32_e32 v2, v2, v6 +; SI-NEXT: v_lshr_b32_e32 v1, v1, v5 +; SI-NEXT: v_lshr_b32_e32 v0, v0, v4 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; Index: llvm/test/CodeGen/AMDGPU/sub.ll 
=================================================================== --- llvm/test/CodeGen/AMDGPU/sub.ll +++ llvm/test/CodeGen/AMDGPU/sub.ll @@ -1,6 +1,7 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GFX89 %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN1 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN2 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN3 %s declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone speculatable @@ -8,6 +9,39 @@ ; GCN: s_load_dwordx4 s[[[#LOAD:]]:{{[0-9]+}}] ; GCN: s_sub_i32 s{{[0-9]+}}, s[[#LOAD + 2]], s[[#LOAD + 3]] define amdgpu_kernel void @s_sub_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { +; GCN1-LABEL: s_sub_i32: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_mov_b32 s4, s0 +; GCN1-NEXT: s_sub_i32 s0, s2, s3 +; GCN1-NEXT: s_mov_b32 s5, s1 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN1-NEXT: s_endpgm +; +; GCN2-LABEL: s_sub_i32: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) 
+; GCN2-NEXT: s_sub_i32 s2, s2, s3 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: flat_store_dword v[0:1], v2 +; GCN2-NEXT: s_endpgm +; +; GCN3-LABEL: s_sub_i32: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN3-NEXT: v_mov_b32_e32 v0, 0 +; GCN3-NEXT: s_waitcnt lgkmcnt(0) +; GCN3-NEXT: s_sub_i32 s2, s2, s3 +; GCN3-NEXT: v_mov_b32_e32 v1, s2 +; GCN3-NEXT: global_store_dword v0, v1, s[0:1] +; GCN3-NEXT: s_endpgm %result = sub i32 %a, %b store i32 %result, ptr addrspace(1) %out ret void @@ -17,6 +51,40 @@ ; GCN: s_load_dword [[A:s[0-9]+]] ; GCN: s_sub_i32 s{{[0-9]+}}, 0x4d2, [[A]] define amdgpu_kernel void @s_sub_imm_i32(ptr addrspace(1) %out, i32 %a) { +; GCN1-LABEL: s_sub_imm_i32: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN1-NEXT: s_mov_b32 s3, 0xf000 +; GCN1-NEXT: s_mov_b32 s2, -1 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_sub_i32 s4, 0x4d2, s4 +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN1-NEXT: s_endpgm +; +; GCN2-LABEL: s_sub_imm_i32: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_load_dword s2, s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_sub_i32 s2, 0x4d2, s2 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: flat_store_dword v[0:1], v2 +; GCN2-NEXT: s_endpgm +; +; GCN3-LABEL: s_sub_imm_i32: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: v_mov_b32_e32 v0, 0 +; GCN3-NEXT: s_waitcnt lgkmcnt(0) +; GCN3-NEXT: s_sub_i32 s0, 0x4d2, s4 +; GCN3-NEXT: v_mov_b32_e32 v1, s0 +; GCN3-NEXT: global_store_dword v0, v1, s[2:3] +; GCN3-NEXT: s_endpgm %result = sub i32 1234, %a store i32 %result, ptr addrspace(1) %out ret void @@ -26,6 +94,48 @@ ; SI: 
v_subrev_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}} ; GFX9: v_sub_u32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} define amdgpu_kernel void @test_sub_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; GCN1-LABEL: test_sub_i32: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_mov_b32 s10, s6 +; GCN1-NEXT: s_mov_b32 s11, s7 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_mov_b32 s8, s2 +; GCN1-NEXT: s_mov_b32 s9, s3 +; GCN1-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; GCN1-NEXT: s_mov_b32 s4, s0 +; GCN1-NEXT: s_mov_b32 s5, s1 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 +; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN1-NEXT: s_endpgm +; +; GCN2-LABEL: test_sub_i32: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_sub_u32_e32 v0, vcc, v0, v1 +; GCN2-NEXT: flat_store_dword v[2:3], v0 +; GCN2-NEXT: s_endpgm +; +; GCN3-LABEL: test_sub_i32: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN3-NEXT: v_mov_b32_e32 v2, 0 +; GCN3-NEXT: s_waitcnt lgkmcnt(0) +; GCN3-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_sub_u32_e32 v0, v0, v1 +; GCN3-NEXT: global_store_dword v2, v0, s[0:1] +; GCN3-NEXT: s_endpgm %b_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1 %a = load i32, ptr addrspace(1) %in %b = load i32, ptr addrspace(1) %b_ptr @@ -38,6 +148,48 @@ ; SI: v_sub_i32_e32 v{{[0-9]+}}, vcc, 0x7b, v{{[0-9]+}} ; GFX9: v_sub_u32_e32 v{{[0-9]+}}, 0x7b, v{{[0-9]+}} define amdgpu_kernel void @test_sub_imm_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; GCN1-LABEL: test_sub_imm_i32: 
+; GCN1: ; %bb.0: +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_mov_b32 s10, s6 +; GCN1-NEXT: s_mov_b32 s11, s7 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_mov_b32 s8, s2 +; GCN1-NEXT: s_mov_b32 s9, s3 +; GCN1-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GCN1-NEXT: s_mov_b32 s4, s0 +; GCN1-NEXT: s_mov_b32 s5, s1 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_sub_i32_e32 v0, vcc, 0x7b, v0 +; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN1-NEXT: s_endpgm +; +; GCN2-LABEL: test_sub_imm_i32: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: flat_load_dword v2, v[0:1] +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_sub_u32_e32 v2, vcc, 0x7b, v2 +; GCN2-NEXT: flat_store_dword v[0:1], v2 +; GCN2-NEXT: s_endpgm +; +; GCN3-LABEL: test_sub_imm_i32: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN3-NEXT: v_mov_b32_e32 v0, 0 +; GCN3-NEXT: s_waitcnt lgkmcnt(0) +; GCN3-NEXT: global_load_dword v1, v0, s[2:3] +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_sub_u32_e32 v1, 0x7b, v1 +; GCN3-NEXT: global_store_dword v0, v1, s[0:1] +; GCN3-NEXT: s_endpgm %a = load i32, ptr addrspace(1) %in %result = sub i32 123, %a store i32 %result, ptr addrspace(1) %out @@ -51,6 +203,51 @@ ; GFX9: v_sub_u32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} ; GFX9: v_sub_u32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} define amdgpu_kernel void @test_sub_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; GCN1-LABEL: test_sub_v2i32: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_mov_b32 s10, s6 +; GCN1-NEXT: s_mov_b32 s11, s7 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_mov_b32 s8, 
s2 +; GCN1-NEXT: s_mov_b32 s9, s3 +; GCN1-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; GCN1-NEXT: s_mov_b32 s4, s0 +; GCN1-NEXT: s_mov_b32 s5, s1 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_sub_i32_e32 v1, vcc, v1, v3 +; GCN1-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 +; GCN1-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN1-NEXT: s_endpgm +; +; GCN2-LABEL: test_sub_v2i32: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GCN2-NEXT: v_mov_b32_e32 v4, s0 +; GCN2-NEXT: v_mov_b32_e32 v5, s1 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_sub_u32_e32 v1, vcc, v1, v3 +; GCN2-NEXT: v_sub_u32_e32 v0, vcc, v0, v2 +; GCN2-NEXT: flat_store_dwordx2 v[4:5], v[0:1] +; GCN2-NEXT: s_endpgm +; +; GCN3-LABEL: test_sub_v2i32: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN3-NEXT: v_mov_b32_e32 v4, 0 +; GCN3-NEXT: s_waitcnt lgkmcnt(0) +; GCN3-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_sub_u32_e32 v1, v1, v3 +; GCN3-NEXT: v_sub_u32_e32 v0, v0, v2 +; GCN3-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] +; GCN3-NEXT: s_endpgm %b_ptr = getelementptr <2 x i32>, ptr addrspace(1) %in, i32 1 %a = load <2 x i32>, ptr addrspace(1) %in %b = load <2 x i32>, ptr addrspace(1) %b_ptr @@ -70,6 +267,64 @@ ; GFX9: v_sub_u32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} ; GFX9: v_sub_u32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} define amdgpu_kernel void @test_sub_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; GCN1-LABEL: test_sub_v4i32: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_mov_b32 s10, s6 +; GCN1-NEXT: s_mov_b32 s11, s7 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_mov_b32 s8, s2 +; GCN1-NEXT: s_mov_b32 s9, s3 +; GCN1-NEXT: 
buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; GCN1-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 +; GCN1-NEXT: s_mov_b32 s4, s0 +; GCN1-NEXT: s_mov_b32 s5, s1 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_sub_i32_e32 v3, vcc, v3, v7 +; GCN1-NEXT: v_sub_i32_e32 v2, vcc, v2, v6 +; GCN1-NEXT: v_sub_i32_e32 v1, vcc, v1, v5 +; GCN1-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 +; GCN1-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GCN1-NEXT: s_endpgm +; +; GCN2-LABEL: test_sub_v4i32: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: s_add_u32 s2, s2, 16 +; GCN2-NEXT: s_addc_u32 s3, s3, 0 +; GCN2-NEXT: v_mov_b32_e32 v5, s3 +; GCN2-NEXT: v_mov_b32_e32 v4, s2 +; GCN2-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GCN2-NEXT: flat_load_dwordx4 v[4:7], v[4:5] +; GCN2-NEXT: v_mov_b32_e32 v8, s0 +; GCN2-NEXT: v_mov_b32_e32 v9, s1 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_sub_u32_e32 v3, vcc, v3, v7 +; GCN2-NEXT: v_sub_u32_e32 v2, vcc, v2, v6 +; GCN2-NEXT: v_sub_u32_e32 v1, vcc, v1, v5 +; GCN2-NEXT: v_sub_u32_e32 v0, vcc, v0, v4 +; GCN2-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; GCN2-NEXT: s_endpgm +; +; GCN3-LABEL: test_sub_v4i32: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN3-NEXT: v_mov_b32_e32 v8, 0 +; GCN3-NEXT: s_waitcnt lgkmcnt(0) +; GCN3-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] offset:16 +; GCN3-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_sub_u32_e32 v3, v7, v3 +; GCN3-NEXT: v_sub_u32_e32 v2, v6, v2 +; GCN3-NEXT: v_sub_u32_e32 v1, v5, v1 +; GCN3-NEXT: v_sub_u32_e32 v0, v4, v0 +; GCN3-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GCN3-NEXT: s_endpgm %b_ptr = getelementptr <4 x i32>, ptr addrspace(1) %in, i32 1 %a = load <4 x i32>, ptr addrspace(1) %in %b = load <4 x i32>, ptr addrspace(1) %b_ptr @@ -82,6 +337,60 @@ ; SI: v_sub_i32_e32 
v{{[0-9]+}}, vcc, ; GFX89: v_sub_u16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} define amdgpu_kernel void @test_sub_i16(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; GCN1-LABEL: test_sub_i16: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s10, 0 +; GCN1-NEXT: s_mov_b32 s11, s7 +; GCN1-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[8:9], s[2:3] +; GCN1-NEXT: v_mov_b32_e32 v1, 0 +; GCN1-NEXT: buffer_load_ushort v2, v[0:1], s[8:11], 0 addr64 glc +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: buffer_load_ushort v0, v[0:1], s[8:11], 0 addr64 offset:2 glc +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_mov_b32 s4, s0 +; GCN1-NEXT: s_mov_b32 s5, s1 +; GCN1-NEXT: v_sub_i32_e32 v0, vcc, v2, v0 +; GCN1-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GCN1-NEXT: s_endpgm +; +; GCN2-LABEL: test_sub_i16: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: v_add_u32_e32 v2, vcc, 2, v0 +; GCN2-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_ushort v4, v[0:1] glc +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: flat_load_ushort v2, v[2:3] glc +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_sub_u16_e32 v2, v4, v2 +; GCN2-NEXT: flat_store_short v[0:1], v2 +; GCN2-NEXT: s_endpgm +; +; GCN3-LABEL: test_sub_i16: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN3-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GCN3-NEXT: s_waitcnt lgkmcnt(0) +; GCN3-NEXT: global_load_ushort v1, v0, s[2:3] glc +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc +; GCN3-NEXT: s_waitcnt 
vmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v0, 0 +; GCN3-NEXT: v_sub_u16_e32 v1, v1, v2 +; GCN3-NEXT: global_store_short v0, v1, s[0:1] +; GCN3-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr i16, ptr addrspace(1) %in, i32 %tid %b_ptr = getelementptr i16, ptr addrspace(1) %gep, i32 1 @@ -98,6 +407,60 @@ ; GFX9: v_pk_sub_i16 define amdgpu_kernel void @test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; GCN1-LABEL: test_sub_v2i16: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s10, 0 +; GCN1-NEXT: s_mov_b32 s11, s7 +; GCN1-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[8:9], s[2:3] +; GCN1-NEXT: v_mov_b32_e32 v1, 0 +; GCN1-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[8:11], 0 addr64 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_mov_b32 s4, s0 +; GCN1-NEXT: s_mov_b32 s5, s1 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GCN1-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GCN1-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 +; GCN1-NEXT: v_sub_i32_e32 v1, vcc, v2, v3 +; GCN1-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN1-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN1-NEXT: v_or_b32_e32 v0, v0, v1 +; GCN1-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN1-NEXT: s_endpgm +; +; GCN2-LABEL: test_sub_v2i16: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_sub_u16_e32 v4, v0, v1 +; GCN2-NEXT: v_sub_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GCN2-NEXT: v_or_b32_e32 v0, v4, v0 
+; GCN2-NEXT: flat_store_dword v[2:3], v0 +; GCN2-NEXT: s_endpgm +; +; GCN3-LABEL: test_sub_v2i16: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN3-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN3-NEXT: v_mov_b32_e32 v2, 0 +; GCN3-NEXT: s_waitcnt lgkmcnt(0) +; GCN3-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_pk_sub_i16 v0, v0, v1 +; GCN3-NEXT: global_store_dword v2, v0, s[0:1] +; GCN3-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <2 x i16>, ptr addrspace(1) %in, i32 %tid %b_ptr = getelementptr <2 x i16>, ptr addrspace(1) %gep, i16 1 @@ -117,6 +480,71 @@ ; GFX9: v_pk_sub_i16 ; GFX9: v_pk_sub_i16 define amdgpu_kernel void @test_sub_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; GCN1-LABEL: test_sub_v4i16: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_mov_b32 s7, 0xf000 +; GCN1-NEXT: s_mov_b32 s10, 0 +; GCN1-NEXT: s_mov_b32 s11, s7 +; GCN1-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[8:9], s[2:3] +; GCN1-NEXT: v_mov_b32_e32 v1, 0 +; GCN1-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 +; GCN1-NEXT: s_mov_b32 s6, -1 +; GCN1-NEXT: s_mov_b32 s4, s0 +; GCN1-NEXT: s_mov_b32 s5, s1 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GCN1-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GCN1-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GCN1-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; GCN1-NEXT: v_sub_i32_e32 v1, vcc, v1, v3 +; GCN1-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 +; GCN1-NEXT: v_sub_i32_e32 v2, vcc, v5, v7 +; GCN1-NEXT: v_sub_i32_e32 v3, vcc, v4, v6 +; GCN1-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GCN1-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN1-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN1-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN1-NEXT: v_or_b32_e32 v1, v1, v2 +; GCN1-NEXT: v_or_b32_e32 v0, v0, v3 +; GCN1-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; 
GCN1-NEXT: s_endpgm +; +; GCN2-LABEL: test_sub_v4i16: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GCN2-NEXT: v_mov_b32_e32 v4, s0 +; GCN2-NEXT: v_mov_b32_e32 v5, s1 +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_sub_u16_e32 v6, v1, v3 +; GCN2-NEXT: v_sub_u16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GCN2-NEXT: v_sub_u16_e32 v3, v0, v2 +; GCN2-NEXT: v_sub_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GCN2-NEXT: v_or_b32_e32 v1, v6, v1 +; GCN2-NEXT: v_or_b32_e32 v0, v3, v0 +; GCN2-NEXT: flat_store_dwordx2 v[4:5], v[0:1] +; GCN2-NEXT: s_endpgm +; +; GCN3-LABEL: test_sub_v4i16: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN3-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GCN3-NEXT: v_mov_b32_e32 v4, 0 +; GCN3-NEXT: s_waitcnt lgkmcnt(0) +; GCN3-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_pk_sub_i16 v1, v1, v3 +; GCN3-NEXT: v_pk_sub_i16 v0, v0, v2 +; GCN3-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] +; GCN3-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <4 x i16>, ptr addrspace(1) %in, i32 %tid %b_ptr = getelementptr <4 x i16>, ptr addrspace(1) %gep, i16 1 @@ -131,6 +559,46 @@ ; GCN: s_sub_u32 ; GCN: s_subb_u32 define amdgpu_kernel void @s_sub_i64(ptr addrspace(1) noalias %out, i64 %a, i64 %b) nounwind { +; GCN1-LABEL: s_sub_i64: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN1-NEXT: s_mov_b32 s3, 0xf000 +; GCN1-NEXT: s_mov_b32 s2, -1 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_sub_u32 s4, s4, s6 +; GCN1-NEXT: s_subb_u32 
s5, s5, s7 +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN1-NEXT: s_endpgm +; +; GCN2-LABEL: s_sub_i64: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_sub_u32 s2, s4, s6 +; GCN2-NEXT: s_subb_u32 s3, s5, s7 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GCN2-NEXT: s_endpgm +; +; GCN3-LABEL: s_sub_i64: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN3-NEXT: v_mov_b32_e32 v2, 0 +; GCN3-NEXT: s_waitcnt lgkmcnt(0) +; GCN3-NEXT: s_sub_u32 s0, s4, s6 +; GCN3-NEXT: s_subb_u32 s1, s5, s7 +; GCN3-NEXT: v_mov_b32_e32 v0, s0 +; GCN3-NEXT: v_mov_b32_e32 v1, s1 +; GCN3-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GCN3-NEXT: s_endpgm %result = sub i64 %a, %b store i64 %result, ptr addrspace(1) %out, align 8 ret void @@ -146,6 +614,65 @@ ; GFX9: v_sub_co_u32_e32 ; GFX9: v_subb_co_u32_e32 define amdgpu_kernel void @v_sub_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %inA, ptr addrspace(1) noalias %inB) nounwind { +; GCN1-LABEL: v_sub_i64: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GCN1-NEXT: s_mov_b32 s11, 0xf000 +; GCN1-NEXT: s_mov_b32 s14, 0 +; GCN1-NEXT: s_mov_b32 s15, s11 +; GCN1-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GCN1-NEXT: v_mov_b32_e32 v1, 0 +; GCN1-NEXT: s_mov_b64 s[2:3], s[14:15] +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[12:13], s[6:7] +; GCN1-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 +; GCN1-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[12:15], 0 addr64 +; GCN1-NEXT: s_mov_b32 s10, -1 +; GCN1-NEXT: s_mov_b32 s8, s4 +; 
GCN1-NEXT: s_mov_b32 s9, s5 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 +; GCN1-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc +; GCN1-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GCN1-NEXT: s_endpgm +; +; GCN2-LABEL: v_sub_i64: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GCN2-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v1, s7 +; GCN2-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: v_mov_b32_e32 v3, s1 +; GCN2-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; GCN2-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[2:3] +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_sub_u32_e32 v0, vcc, v0, v2 +; GCN2-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc +; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN2-NEXT: s_endpgm +; +; GCN3-LABEL: v_sub_i64: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GCN3-NEXT: s_waitcnt lgkmcnt(0) +; GCN3-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] +; GCN3-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] +; GCN3-NEXT: v_mov_b32_e32 v4, 0 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2 +; GCN3-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc +; GCN3-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] +; GCN3-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() readnone %a_ptr = getelementptr i64, ptr addrspace(1) %inA, i32 %tid %b_ptr = getelementptr i64, ptr addrspace(1) %inB, i32 %tid @@ -172,6 +699,71 @@ ; GFX9: v_sub_co_u32_e32 v{{[0-9]+}}, vcc, ; GFX9: v_subb_co_u32_e32 v{{[0-9]+}}, vcc, define amdgpu_kernel void @v_test_sub_v2i64(ptr addrspace(1) 
%out, ptr addrspace(1) noalias %inA, ptr addrspace(1) noalias %inB) { +; GCN1-LABEL: v_test_sub_v2i64: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GCN1-NEXT: s_mov_b32 s11, 0xf000 +; GCN1-NEXT: s_mov_b32 s14, 0 +; GCN1-NEXT: s_mov_b32 s15, s11 +; GCN1-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; GCN1-NEXT: v_mov_b32_e32 v5, 0 +; GCN1-NEXT: s_mov_b64 s[2:3], s[14:15] +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[12:13], s[6:7] +; GCN1-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[0:3], 0 addr64 +; GCN1-NEXT: buffer_load_dwordx4 v[4:7], v[4:5], s[12:15], 0 addr64 +; GCN1-NEXT: s_mov_b32 s10, -1 +; GCN1-NEXT: s_mov_b32 s8, s4 +; GCN1-NEXT: s_mov_b32 s9, s5 +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_sub_i32_e32 v2, vcc, v6, v2 +; GCN1-NEXT: v_subb_u32_e32 v3, vcc, v7, v3, vcc +; GCN1-NEXT: v_sub_i32_e32 v0, vcc, v4, v0 +; GCN1-NEXT: v_subb_u32_e32 v1, vcc, v5, v1, vcc +; GCN1-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 +; GCN1-NEXT: s_endpgm +; +; GCN2-LABEL: v_test_sub_v2i64: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GCN2-NEXT: v_lshlrev_b32_e32 v2, 4, v0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v1, s7 +; GCN2-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN2-NEXT: v_mov_b32_e32 v3, s1 +; GCN2-NEXT: v_add_u32_e32 v4, vcc, s0, v2 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v3, vcc +; GCN2-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GCN2-NEXT: flat_load_dwordx4 v[4:7], v[4:5] +; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_sub_u32_e32 v2, vcc, v2, v6 +; GCN2-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc +; GCN2-NEXT: v_sub_u32_e32 v0, vcc, v0, v4 +; GCN2-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc +; GCN2-NEXT: v_mov_b32_e32 v4, s4 +; GCN2-NEXT: v_mov_b32_e32 v5, s5 +; GCN2-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN2-NEXT: s_endpgm +; +; 
GCN3-LABEL: v_test_sub_v2i64: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: v_lshlrev_b32_e32 v8, 4, v0 +; GCN3-NEXT: s_waitcnt lgkmcnt(0) +; GCN3-NEXT: global_load_dwordx4 v[0:3], v8, s[6:7] +; GCN3-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] +; GCN3-NEXT: v_mov_b32_e32 v8, 0 +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v6 +; GCN3-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v7, vcc +; GCN3-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v4 +; GCN3-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v5, vcc +; GCN3-NEXT: global_store_dwordx4 v8, v[0:3], s[4:5] +; GCN3-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() readnone %a_ptr = getelementptr <2 x i64>, ptr addrspace(1) %inA, i32 %tid %b_ptr = getelementptr <2 x i64>, ptr addrspace(1) %inB, i32 %tid @@ -210,6 +802,103 @@ ; GFX9: v_sub_co_u32_e32 v{{[0-9]+}}, vcc, ; GFX9: v_subb_co_u32_e32 v{{[0-9]+}}, vcc, define amdgpu_kernel void @v_test_sub_v4i64(ptr addrspace(1) %out, ptr addrspace(1) noalias %inA, ptr addrspace(1) noalias %inB) { +; GCN1-LABEL: v_test_sub_v4i64: +; GCN1: ; %bb.0: +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GCN1-NEXT: s_mov_b32 s11, 0xf000 +; GCN1-NEXT: s_mov_b32 s14, 0 +; GCN1-NEXT: s_mov_b32 s15, s11 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_mov_b64 s[12:13], s[6:7] +; GCN1-NEXT: v_lshlrev_b32_e32 v12, 5, v0 +; GCN1-NEXT: v_mov_b32_e32 v13, 0 +; GCN1-NEXT: s_mov_b64 s[2:3], s[14:15] +; GCN1-NEXT: buffer_load_dwordx4 v[0:3], v[12:13], s[12:15], 0 addr64 +; GCN1-NEXT: buffer_load_dwordx4 v[4:7], v[12:13], s[0:3], 0 addr64 +; GCN1-NEXT: buffer_load_dwordx4 v[8:11], v[12:13], s[0:3], 0 addr64 offset:16 +; GCN1-NEXT: buffer_load_dwordx4 v[12:15], v[12:13], s[12:15], 0 addr64 offset:16 +; GCN1-NEXT: s_mov_b32 s10, -1 +; GCN1-NEXT: s_mov_b32 s8, s4 +; GCN1-NEXT: s_mov_b32 s9, s5 +; GCN1-NEXT: s_waitcnt vmcnt(2) +; GCN1-NEXT: 
v_sub_i32_e32 v2, vcc, v2, v6 +; GCN1-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc +; GCN1-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 +; GCN1-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc +; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_sub_i32_e32 v6, vcc, v14, v10 +; GCN1-NEXT: v_subb_u32_e32 v7, vcc, v15, v11, vcc +; GCN1-NEXT: v_sub_i32_e32 v4, vcc, v12, v8 +; GCN1-NEXT: v_subb_u32_e32 v5, vcc, v13, v9, vcc +; GCN1-NEXT: buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:16 +; GCN1-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 +; GCN1-NEXT: s_endpgm +; +; GCN2-LABEL: v_test_sub_v4i64: +; GCN2: ; %bb.0: +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GCN2-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v1, s7 +; GCN2-NEXT: v_add_u32_e32 v8, vcc, s6, v0 +; GCN2-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_add_u32_e32 v12, vcc, s0, v0 +; GCN2-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dwordx4 v[0:3], v[8:9] +; GCN2-NEXT: flat_load_dwordx4 v[4:7], v[12:13] +; GCN2-NEXT: v_add_u32_e32 v8, vcc, 16, v8 +; GCN2-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; GCN2-NEXT: v_add_u32_e32 v12, vcc, 16, v12 +; GCN2-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; GCN2-NEXT: flat_load_dwordx4 v[8:11], v[8:9] +; GCN2-NEXT: flat_load_dwordx4 v[12:15], v[12:13] +; GCN2-NEXT: v_mov_b32_e32 v17, s5 +; GCN2-NEXT: v_mov_b32_e32 v16, s4 +; GCN2-NEXT: s_add_u32 s0, s4, 16 +; GCN2-NEXT: s_addc_u32 s1, s5, 0 +; GCN2-NEXT: s_waitcnt vmcnt(2) +; GCN2-NEXT: v_sub_u32_e32 v2, vcc, v2, v6 +; GCN2-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc +; GCN2-NEXT: v_sub_u32_e32 v0, vcc, v0, v4 +; GCN2-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc +; GCN2-NEXT: flat_store_dwordx4 v[16:17], v[0:3] +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_sub_u32_e32 v6, vcc, v10, v14 +; GCN2-NEXT: v_subb_u32_e32 v7, vcc, v11, v15, vcc +; GCN2-NEXT: v_sub_u32_e32 
v4, vcc, v8, v12 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_subb_u32_e32 v5, vcc, v9, v13, vcc +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: flat_store_dwordx4 v[0:1], v[4:7] +; GCN2-NEXT: s_endpgm +; +; GCN3-LABEL: v_test_sub_v4i64: +; GCN3: ; %bb.0: +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN3-NEXT: v_lshlrev_b32_e32 v16, 5, v0 +; GCN3-NEXT: s_waitcnt lgkmcnt(0) +; GCN3-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7] +; GCN3-NEXT: global_load_dwordx4 v[4:7], v16, s[2:3] +; GCN3-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:16 +; GCN3-NEXT: global_load_dwordx4 v[12:15], v16, s[2:3] offset:16 +; GCN3-NEXT: v_mov_b32_e32 v16, 0 +; GCN3-NEXT: s_waitcnt vmcnt(2) +; GCN3-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v6 +; GCN3-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v7, vcc +; GCN3-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v4 +; GCN3-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v5, vcc +; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_sub_co_u32_e32 v6, vcc, v10, v14 +; GCN3-NEXT: v_subb_co_u32_e32 v7, vcc, v11, v15, vcc +; GCN3-NEXT: v_sub_co_u32_e32 v4, vcc, v8, v12 +; GCN3-NEXT: v_subb_co_u32_e32 v5, vcc, v9, v13, vcc +; GCN3-NEXT: global_store_dwordx4 v16, v[4:7], s[4:5] offset:16 +; GCN3-NEXT: global_store_dwordx4 v16, v[0:3], s[4:5] +; GCN3-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() readnone %a_ptr = getelementptr <4 x i64>, ptr addrspace(1) %inA, i32 %tid %b_ptr = getelementptr <4 x i64>, ptr addrspace(1) %inB, i32 %tid @@ -232,6 +921,43 @@ ; GCN: ds_write_b32 ; GCN: ; use vcc define amdgpu_ps void @sub_select_vop3(i32 inreg %s, i32 %v) { +; GCN1-LABEL: sub_select_vop3: +; GCN1: ; %bb.0: +; GCN1-NEXT: v_subrev_i32_e64 v0, s[0:1], s0, v0 +; GCN1-NEXT: s_mov_b32 m0, -1 +; GCN1-NEXT: ;;#ASMSTART +; GCN1-NEXT: ; def vcc +; GCN1-NEXT: ;;#ASMEND +; GCN1-NEXT: ds_write_b32 v0, v0 +; GCN1-NEXT: ;;#ASMSTART +; GCN1-NEXT: ; use vcc +; GCN1-NEXT: ;;#ASMEND +; GCN1-NEXT: s_endpgm +; +; GCN2-LABEL: 
sub_select_vop3: +; GCN2: ; %bb.0: +; GCN2-NEXT: v_subrev_u32_e64 v0, s[0:1], s0, v0 +; GCN2-NEXT: s_mov_b32 m0, -1 +; GCN2-NEXT: ;;#ASMSTART +; GCN2-NEXT: ; def vcc +; GCN2-NEXT: ;;#ASMEND +; GCN2-NEXT: ds_write_b32 v0, v0 +; GCN2-NEXT: ;;#ASMSTART +; GCN2-NEXT: ; use vcc +; GCN2-NEXT: ;;#ASMEND +; GCN2-NEXT: s_endpgm +; +; GCN3-LABEL: sub_select_vop3: +; GCN3: ; %bb.0: +; GCN3-NEXT: v_subrev_u32_e32 v0, s0, v0 +; GCN3-NEXT: ;;#ASMSTART +; GCN3-NEXT: ; def vcc +; GCN3-NEXT: ;;#ASMEND +; GCN3-NEXT: ds_write_b32 v0, v0 +; GCN3-NEXT: ;;#ASMSTART +; GCN3-NEXT: ; use vcc +; GCN3-NEXT: ;;#ASMEND +; GCN3-NEXT: s_endpgm %vcc = call i64 asm sideeffect "; def vcc", "={vcc}"() %sub = sub i32 %v, %s store i32 %sub, ptr addrspace(3) undef Index: llvm/test/CodeGen/AMDGPU/udiv.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/udiv.ll +++ llvm/test/CodeGen/AMDGPU/udiv.ll @@ -32,7 +32,7 @@ ; SI-NEXT: v_mul_lo_u32 v3, v2, v1 ; SI-NEXT: v_add_i32_e32 v4, vcc, 1, v2 ; SI-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 -; SI-NEXT: v_subrev_i32_e32 v3, vcc, v1, v0 +; SI-NEXT: v_sub_i32_e32 v3, vcc, v0, v1 ; SI-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 ; SI-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc @@ -68,7 +68,7 @@ ; VI-NEXT: v_mul_lo_u32 v3, v2, v1 ; VI-NEXT: v_add_u32_e32 v4, vcc, 1, v2 ; VI-NEXT: v_sub_u32_e32 v0, vcc, v0, v3 -; VI-NEXT: v_subrev_u32_e32 v3, vcc, v1, v0 +; VI-NEXT: v_sub_u32_e32 v3, vcc, v0, v1 ; VI-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 ; VI-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc @@ -100,7 +100,7 @@ ; GCN-NEXT: v_mul_lo_u32 v5, v4, v1 ; GCN-NEXT: v_add_u32_e32 v6, vcc, 1, v4 ; GCN-NEXT: v_sub_u32_e32 v0, vcc, v0, v5 -; GCN-NEXT: v_subrev_u32_e32 v5, vcc, v1, v0 +; GCN-NEXT: v_sub_u32_e32 v5, vcc, v0, v1 ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 ; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc @@ -198,7 +198,7 
@@ ; SI-NEXT: v_mul_lo_u32 v1, s4, v0 ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: v_mul_hi_u32 v1, v0, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; SI-NEXT: v_mul_hi_u32 v0, s2, v0 ; SI-NEXT: v_readfirstlane_b32 s0, v0 ; SI-NEXT: s_mul_i32 s0, s0, s3 @@ -231,7 +231,7 @@ ; VI-NEXT: v_mul_lo_u32 v1, s4, v0 ; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: v_mul_hi_u32 v1, v0, v1 -; VI-NEXT: v_add_u32_e32 v0, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; VI-NEXT: v_mul_hi_u32 v0, s2, v0 ; VI-NEXT: v_readfirstlane_b32 s0, v0 ; VI-NEXT: s_mul_i32 s0, s0, s3 @@ -260,7 +260,7 @@ ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_mul_lo_u32 v1, s4, v0 ; GCN-NEXT: v_mul_hi_u32 v1, v0, v1 -; GCN-NEXT: v_add_u32_e32 v0, vcc, v1, v0 +; GCN-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GCN-NEXT: v_mul_hi_u32 v0, s2, v0 ; GCN-NEXT: v_readfirstlane_b32 s4, v0 ; GCN-NEXT: s_mul_i32 s4, s4, s3 @@ -372,7 +372,7 @@ ; SI-NEXT: v_mul_lo_u32 v7, v7, v5 ; SI-NEXT: v_mul_hi_u32 v6, v4, v6 ; SI-NEXT: v_mul_hi_u32 v7, v5, v7 -; SI-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; SI-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; SI-NEXT: v_mul_hi_u32 v4, v0, v4 ; SI-NEXT: v_mul_hi_u32 v5, v1, v5 @@ -384,9 +384,9 @@ ; SI-NEXT: v_add_i32_e32 v9, vcc, 1, v5 ; SI-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v2 ; SI-NEXT: v_cmp_ge_u32_e64 s[2:3], v1, v3 -; SI-NEXT: v_subrev_i32_e32 v6, vcc, v2, v0 +; SI-NEXT: v_sub_i32_e32 v6, vcc, v0, v2 ; SI-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[0:1] -; SI-NEXT: v_subrev_i32_e32 v7, vcc, v3, v1 +; SI-NEXT: v_sub_i32_e32 v7, vcc, v1, v3 ; SI-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[2:3] ; SI-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[0:1] ; SI-NEXT: v_add_i32_e32 v6, vcc, 1, v4 @@ -427,7 +427,7 @@ ; VI-NEXT: v_mul_lo_u32 v7, v7, v5 ; VI-NEXT: v_mul_hi_u32 v6, v4, v6 ; VI-NEXT: v_mul_hi_u32 v7, v5, v7 -; VI-NEXT: v_add_u32_e32 v4, vcc, v6, v4 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v6 ; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v7 ; 
VI-NEXT: v_mul_hi_u32 v4, v0, v4 ; VI-NEXT: v_mul_hi_u32 v5, v1, v5 @@ -439,9 +439,9 @@ ; VI-NEXT: v_add_u32_e32 v9, vcc, 1, v5 ; VI-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v2 ; VI-NEXT: v_cmp_ge_u32_e64 s[2:3], v1, v3 -; VI-NEXT: v_subrev_u32_e32 v6, vcc, v2, v0 +; VI-NEXT: v_sub_u32_e32 v6, vcc, v0, v2 ; VI-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[0:1] -; VI-NEXT: v_subrev_u32_e32 v7, vcc, v3, v1 +; VI-NEXT: v_sub_u32_e32 v7, vcc, v1, v3 ; VI-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[2:3] ; VI-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[0:1] ; VI-NEXT: v_add_u32_e32 v6, vcc, 1, v4 @@ -478,7 +478,7 @@ ; GCN-NEXT: v_mov_b32_e32 v4, s0 ; GCN-NEXT: v_mov_b32_e32 v5, s1 ; GCN-NEXT: v_mul_hi_u32 v8, v7, v8 -; GCN-NEXT: v_add_u32_e32 v6, vcc, v9, v6 +; GCN-NEXT: v_add_u32_e32 v6, vcc, v6, v9 ; GCN-NEXT: v_mul_hi_u32 v6, v0, v6 ; GCN-NEXT: v_add_u32_e32 v7, vcc, v7, v8 ; GCN-NEXT: v_mul_hi_u32 v7, v1, v7 @@ -490,9 +490,9 @@ ; GCN-NEXT: v_sub_u32_e32 v1, vcc, v1, v10 ; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v2 ; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v1, v3 -; GCN-NEXT: v_subrev_u32_e32 v8, vcc, v2, v0 +; GCN-NEXT: v_sub_u32_e32 v8, vcc, v0, v2 ; GCN-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[0:1] -; GCN-NEXT: v_subrev_u32_e32 v9, vcc, v3, v1 +; GCN-NEXT: v_sub_u32_e32 v9, vcc, v1, v3 ; GCN-NEXT: v_cndmask_b32_e64 v7, v7, v11, s[2:3] ; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[0:1] ; GCN-NEXT: v_add_u32_e32 v8, vcc, 1, v6 @@ -661,9 +661,9 @@ ; SI-NEXT: v_mul_hi_u32 v13, v12, v13 ; SI-NEXT: v_mul_hi_u32 v15, v14, v15 ; SI-NEXT: v_add_i32_e32 v8, vcc, v8, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, v11, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, v10, v11 ; SI-NEXT: v_add_i32_e32 v10, vcc, v12, v13 -; SI-NEXT: v_add_i32_e32 v11, vcc, v15, v14 +; SI-NEXT: v_add_i32_e32 v11, vcc, v14, v15 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_hi_u32 v8, v4, v8 ; SI-NEXT: v_mul_hi_u32 v9, v5, v9 @@ -685,13 +685,13 @@ ; SI-NEXT: v_cmp_ge_u32_e64 s[2:3], v5, v1 ; SI-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v2 ; SI-NEXT: 
v_cmp_ge_u32_e64 s[6:7], v7, v3 -; SI-NEXT: v_subrev_i32_e32 v12, vcc, v0, v4 +; SI-NEXT: v_sub_i32_e32 v12, vcc, v4, v0 ; SI-NEXT: v_cndmask_b32_e64 v8, v8, v13, s[0:1] -; SI-NEXT: v_subrev_i32_e32 v13, vcc, v1, v5 +; SI-NEXT: v_sub_i32_e32 v13, vcc, v5, v1 ; SI-NEXT: v_cndmask_b32_e64 v9, v9, v15, s[2:3] -; SI-NEXT: v_subrev_i32_e32 v14, vcc, v2, v6 +; SI-NEXT: v_sub_i32_e32 v14, vcc, v6, v2 ; SI-NEXT: v_cndmask_b32_e64 v10, v10, v17, s[4:5] -; SI-NEXT: v_subrev_i32_e32 v15, vcc, v3, v7 +; SI-NEXT: v_sub_i32_e32 v15, vcc, v7, v3 ; SI-NEXT: v_cndmask_b32_e64 v11, v11, v19, s[6:7] ; SI-NEXT: v_cndmask_b32_e64 v4, v4, v12, s[0:1] ; SI-NEXT: v_add_i32_e32 v12, vcc, 1, v8 @@ -756,9 +756,9 @@ ; VI-NEXT: v_mul_hi_u32 v13, v12, v13 ; VI-NEXT: v_mul_hi_u32 v15, v14, v15 ; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v9 -; VI-NEXT: v_add_u32_e32 v9, vcc, v11, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, v10, v11 ; VI-NEXT: v_add_u32_e32 v10, vcc, v12, v13 -; VI-NEXT: v_add_u32_e32 v11, vcc, v15, v14 +; VI-NEXT: v_add_u32_e32 v11, vcc, v14, v15 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mul_hi_u32 v8, v4, v8 ; VI-NEXT: v_mul_hi_u32 v9, v5, v9 @@ -780,13 +780,13 @@ ; VI-NEXT: v_cmp_ge_u32_e64 s[2:3], v5, v1 ; VI-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v2 ; VI-NEXT: v_cmp_ge_u32_e64 s[6:7], v7, v3 -; VI-NEXT: v_subrev_u32_e32 v12, vcc, v0, v4 +; VI-NEXT: v_sub_u32_e32 v12, vcc, v4, v0 ; VI-NEXT: v_cndmask_b32_e64 v8, v8, v13, s[0:1] -; VI-NEXT: v_subrev_u32_e32 v13, vcc, v1, v5 +; VI-NEXT: v_sub_u32_e32 v13, vcc, v5, v1 ; VI-NEXT: v_cndmask_b32_e64 v9, v9, v15, s[2:3] -; VI-NEXT: v_subrev_u32_e32 v14, vcc, v2, v6 +; VI-NEXT: v_sub_u32_e32 v14, vcc, v6, v2 ; VI-NEXT: v_cndmask_b32_e64 v10, v10, v17, s[4:5] -; VI-NEXT: v_subrev_u32_e32 v15, vcc, v3, v7 +; VI-NEXT: v_sub_u32_e32 v15, vcc, v7, v3 ; VI-NEXT: v_cndmask_b32_e64 v11, v11, v19, s[6:7] ; VI-NEXT: v_cndmask_b32_e64 v4, v4, v12, s[0:1] ; VI-NEXT: v_add_u32_e32 v12, vcc, 1, v8 @@ -851,9 +851,9 @@ ; GCN-NEXT: v_mul_hi_u32 v15, v14, v15 ; 
GCN-NEXT: v_mul_hi_u32 v17, v16, v17 ; GCN-NEXT: v_add_u32_e32 v10, vcc, v10, v11 -; GCN-NEXT: v_add_u32_e32 v11, vcc, v13, v12 +; GCN-NEXT: v_add_u32_e32 v11, vcc, v12, v13 ; GCN-NEXT: v_add_u32_e32 v12, vcc, v14, v15 -; GCN-NEXT: v_add_u32_e32 v13, vcc, v17, v16 +; GCN-NEXT: v_add_u32_e32 v13, vcc, v16, v17 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mul_hi_u32 v10, v4, v10 ; GCN-NEXT: v_mul_hi_u32 v11, v5, v11 @@ -875,13 +875,13 @@ ; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v5, v1 ; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v2 ; GCN-NEXT: v_cmp_ge_u32_e64 s[6:7], v7, v3 -; GCN-NEXT: v_subrev_u32_e32 v14, vcc, v0, v4 +; GCN-NEXT: v_sub_u32_e32 v14, vcc, v4, v0 ; GCN-NEXT: v_cndmask_b32_e64 v10, v10, v15, s[0:1] -; GCN-NEXT: v_subrev_u32_e32 v15, vcc, v1, v5 +; GCN-NEXT: v_sub_u32_e32 v15, vcc, v5, v1 ; GCN-NEXT: v_cndmask_b32_e64 v11, v11, v17, s[2:3] -; GCN-NEXT: v_subrev_u32_e32 v16, vcc, v2, v6 +; GCN-NEXT: v_sub_u32_e32 v16, vcc, v6, v2 ; GCN-NEXT: v_cndmask_b32_e64 v12, v12, v19, s[4:5] -; GCN-NEXT: v_subrev_u32_e32 v17, vcc, v3, v7 +; GCN-NEXT: v_sub_u32_e32 v17, vcc, v7, v3 ; GCN-NEXT: v_cndmask_b32_e64 v13, v13, v21, s[6:7] ; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v14, s[0:1] ; GCN-NEXT: v_add_u32_e32 v14, vcc, 1, v10 @@ -1891,7 +1891,7 @@ ; SI-NEXT: v_mul_lo_u32 v3, v1, v0 ; SI-NEXT: v_add_i32_e32 v4, vcc, 1, v1 ; SI-NEXT: v_sub_i32_e32 v2, vcc, v2, v3 -; SI-NEXT: v_subrev_i32_e32 v3, vcc, v0, v2 +; SI-NEXT: v_sub_i32_e32 v3, vcc, v2, v0 ; SI-NEXT: v_cmp_ge_u32_e32 vcc, v2, v0 ; SI-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; SI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc @@ -1938,7 +1938,7 @@ ; VI-NEXT: v_mul_lo_u32 v3, v1, v0 ; VI-NEXT: v_add_u32_e32 v4, vcc, 1, v1 ; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v3 -; VI-NEXT: v_subrev_u32_e32 v3, vcc, v0, v2 +; VI-NEXT: v_sub_u32_e32 v3, vcc, v2, v0 ; VI-NEXT: v_cmp_ge_u32_e32 vcc, v2, v0 ; VI-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; VI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc @@ -1993,7 +1993,7 @@ ; GCN-NEXT: v_mul_lo_u32 v5, 
v4, v3 ; GCN-NEXT: v_add_u32_e32 v6, vcc, 1, v4 ; GCN-NEXT: v_sub_u32_e32 v2, vcc, v2, v5 -; GCN-NEXT: v_subrev_u32_e32 v5, vcc, v3, v2 +; GCN-NEXT: v_sub_u32_e32 v5, vcc, v2, v3 ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v2, v3 ; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc ; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc @@ -2641,7 +2641,7 @@ ; VI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, s6, 0 ; VI-NEXT: v_mul_lo_u32 v4, v7, s6 ; VI-NEXT: v_sub_u32_e32 v3, vcc, v3, v6 -; VI-NEXT: v_add_u32_e32 v8, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v8, vcc, v3, v4 ; VI-NEXT: v_mul_hi_u32 v5, v6, v2 ; VI-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v8, 0 ; VI-NEXT: v_add_u32_e32 v9, vcc, v5, v3 @@ -2728,7 +2728,7 @@ ; GCN-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, s6, 0 ; GCN-NEXT: v_mul_lo_u32 v4, v7, s6 ; GCN-NEXT: v_sub_u32_e32 v3, vcc, v3, v6 -; GCN-NEXT: v_add_u32_e32 v8, vcc, v4, v3 +; GCN-NEXT: v_add_u32_e32 v8, vcc, v3, v4 ; GCN-NEXT: v_mul_hi_u32 v5, v6, v2 ; GCN-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v8, 0 ; GCN-NEXT: v_add_u32_e32 v9, vcc, v5, v3 Index: llvm/test/CodeGen/AMDGPU/udiv64.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/udiv64.ll +++ llvm/test/CodeGen/AMDGPU/udiv64.ll @@ -26,8 +26,8 @@ ; GCN-NEXT: v_mul_hi_u32 v3, s4, v0 ; GCN-NEXT: v_mul_lo_u32 v5, s5, v0 ; GCN-NEXT: v_mul_lo_u32 v4, s4, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; GCN-NEXT: v_mul_hi_u32 v3, v0, v4 ; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 ; GCN-NEXT: v_mul_hi_u32 v7, v0, v2 @@ -48,7 +48,7 @@ ; GCN-NEXT: v_mul_hi_u32 v3, s4, v0 ; GCN-NEXT: v_mul_lo_u32 v4, s5, v0 ; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GCN-NEXT: v_mul_lo_u32 v3, s4, v0 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GCN-NEXT: v_mul_lo_u32 v6, v0, v2 @@ -698,8 +698,8 @@ ; GCN-NEXT: v_mul_lo_u32 
v5, s9, v1 ; GCN-NEXT: v_mul_lo_u32 v6, s8, v1 ; GCN-NEXT: s_mov_b32 s1, s5 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GCN-NEXT: v_mul_lo_u32 v4, v1, v3 ; GCN-NEXT: v_mul_hi_u32 v5, v1, v6 ; GCN-NEXT: v_mul_hi_u32 v7, v1, v3 @@ -721,7 +721,7 @@ ; GCN-NEXT: v_mul_lo_u32 v5, s9, v1 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; GCN-NEXT: v_mul_lo_u32 v4, s8, v1 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GCN-NEXT: v_mul_lo_u32 v7, v1, v3 ; GCN-NEXT: v_mul_hi_u32 v8, v1, v4 ; GCN-NEXT: v_mul_hi_u32 v9, v1, v3 @@ -758,7 +758,7 @@ ; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v2, vcc ; GCN-NEXT: v_add_i32_e32 v8, vcc, 2, v1 ; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v2, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6 ; GCN-NEXT: v_sub_i32_e32 v3, vcc, v3, v10 ; GCN-NEXT: v_subb_u32_e32 v6, vcc, 0, v6, vcc ; GCN-NEXT: v_sub_i32_e32 v7, vcc, v3, v0 @@ -898,8 +898,8 @@ ; GCN-NEXT: v_mul_hi_u32 v3, s4, v0 ; GCN-NEXT: v_mul_lo_u32 v5, s5, v0 ; GCN-NEXT: v_mul_lo_u32 v4, s4, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; GCN-NEXT: v_mul_hi_u32 v3, v0, v4 ; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 ; GCN-NEXT: v_mul_hi_u32 v7, v0, v2 @@ -922,7 +922,7 @@ ; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GCN-NEXT: v_mul_lo_u32 v3, s4, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GCN-NEXT: v_mul_lo_u32 v6, v0, v2 ; GCN-NEXT: v_mul_hi_u32 v7, v0, v3 ; GCN-NEXT: v_mul_hi_u32 v8, v0, v2 @@ -1366,8 +1366,8 @@ ; GCN-NEXT: v_mul_hi_u32 v2, v0, s8 ; GCN-NEXT: v_mul_lo_u32 v3, v1, s8 ; GCN-NEXT: v_mul_lo_u32 v4, v0, s8 -; GCN-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 
v3, v2 +; GCN-NEXT: v_sub_i32_e32 v2, vcc, v2, v0 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GCN-NEXT: v_mul_lo_u32 v3, v0, v2 ; GCN-NEXT: v_mul_hi_u32 v5, v0, v4 ; GCN-NEXT: v_mul_hi_u32 v6, v0, v2 @@ -1536,7 +1536,7 @@ ; GCN-NEXT: v_mul_hi_u32 v4, v2, s4 ; GCN-NEXT: v_mul_lo_u32 v5, v3, s4 ; GCN-NEXT: v_mul_lo_u32 v6, v2, s4 -; GCN-NEXT: v_subrev_i32_e32 v4, vcc, v2, v4 +; GCN-NEXT: v_sub_i32_e32 v4, vcc, v4, v2 ; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; GCN-NEXT: v_mul_lo_u32 v5, v2, v4 ; GCN-NEXT: v_mul_hi_u32 v7, v2, v6 Index: llvm/test/CodeGen/AMDGPU/udivrem.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/udivrem.ll +++ llvm/test/CodeGen/AMDGPU/udivrem.ll @@ -175,7 +175,7 @@ ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_lo_u32 v1, s2, v0 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 @@ -193,7 +193,7 @@ ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mul_hi_u32 v0, v1, v0 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0 ; GFX6-NEXT: v_readfirstlane_b32 s6, v0 ; GFX6-NEXT: s_mul_i32 s6, s6, s7 @@ -223,7 +223,7 @@ ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_lo_u32 v1, s2, v0 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1 @@ -241,7 +241,7 @@ ; GFX8-NEXT: v_mul_lo_u32 v0, s3, v1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: v_mul_hi_u32 v0, v1, v0 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0 ; GFX8-NEXT: v_mul_hi_u32 v1, s5, v0 ; GFX8-NEXT: 
v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_readfirstlane_b32 s2, v1 @@ -362,7 +362,7 @@ ; GFX6-NEXT: v_mul_lo_u32 v0, s2, v1 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_hi_u32 v0, v1, v0 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 @@ -380,7 +380,7 @@ ; GFX6-NEXT: v_mul_lo_u32 v0, s2, v1 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_hi_u32 v0, v1, v0 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 @@ -450,7 +450,7 @@ ; GFX8-NEXT: v_mul_lo_u32 v0, s3, v1 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX8-NEXT: v_mul_hi_u32 v0, v1, v0 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0 ; GFX8-NEXT: v_mul_hi_u32 v0, s5, v0 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1 @@ -468,7 +468,7 @@ ; GFX8-NEXT: v_mul_lo_u32 v0, s4, v1 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX8-NEXT: v_mul_hi_u32 v0, v1, v0 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0 ; GFX8-NEXT: v_mul_hi_u32 v0, s6, v0 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1 Index: llvm/test/CodeGen/AMDGPU/urem.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/urem.ll +++ llvm/test/CodeGen/AMDGPU/urem.ll @@ -1,15 +1,118 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc 
-amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC1 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC2 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s ; The code generated by urem is long and complex and may frequently ; change. The goal of this test is to make sure the ISel doesn't fail ; when it gets a v2i32/v4i32 urem -; FUNC-LABEL: {{^}}test_urem_i32: -; SI: s_endpgm -; EG: CF_END define amdgpu_kernel void @test_urem_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; FUNC1-LABEL: test_urem_i32: +; FUNC1: ; %bb.0: +; FUNC1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; FUNC1-NEXT: s_mov_b32 s7, 0xf000 +; FUNC1-NEXT: s_mov_b32 s6, -1 +; FUNC1-NEXT: s_mov_b32 s10, s6 +; FUNC1-NEXT: s_mov_b32 s11, s7 +; FUNC1-NEXT: s_waitcnt lgkmcnt(0) +; FUNC1-NEXT: s_mov_b32 s8, s2 +; FUNC1-NEXT: s_mov_b32 s9, s3 +; FUNC1-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; FUNC1-NEXT: s_mov_b32 s4, s0 +; FUNC1-NEXT: s_mov_b32 s5, s1 +; FUNC1-NEXT: s_waitcnt vmcnt(0) +; FUNC1-NEXT: v_readfirstlane_b32 s2, v1 +; FUNC1-NEXT: v_cvt_f32_u32_e32 v1, s2 +; FUNC1-NEXT: s_sub_i32 s3, 0, s2 +; FUNC1-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; FUNC1-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 +; FUNC1-NEXT: v_cvt_u32_f32_e32 v1, v1 +; FUNC1-NEXT: v_mul_lo_u32 v2, s3, v1 +; FUNC1-NEXT: v_readfirstlane_b32 s3, v0 +; FUNC1-NEXT: v_mul_hi_u32 v2, v1, v2 +; FUNC1-NEXT: v_add_i32_e32 v0, vcc, v1, v2 +; FUNC1-NEXT: v_mul_hi_u32 v0, s3, v0 +; FUNC1-NEXT: v_readfirstlane_b32 s0, v0 +; FUNC1-NEXT: s_mul_i32 s0, s0, s2 +; FUNC1-NEXT: s_sub_i32 s0, s3, s0 +; 
FUNC1-NEXT: s_sub_i32 s1, s0, s2 +; FUNC1-NEXT: s_cmp_ge_u32 s0, s2 +; FUNC1-NEXT: s_cselect_b32 s0, s1, s0 +; FUNC1-NEXT: s_sub_i32 s1, s0, s2 +; FUNC1-NEXT: s_cmp_ge_u32 s0, s2 +; FUNC1-NEXT: s_cselect_b32 s0, s1, s0 +; FUNC1-NEXT: v_mov_b32_e32 v0, s0 +; FUNC1-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; FUNC1-NEXT: s_endpgm +; +; FUNC2-LABEL: test_urem_i32: +; FUNC2: ; %bb.0: +; FUNC2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; FUNC2-NEXT: s_mov_b32 s7, 0xf000 +; FUNC2-NEXT: s_mov_b32 s6, -1 +; FUNC2-NEXT: s_mov_b32 s10, s6 +; FUNC2-NEXT: s_mov_b32 s11, s7 +; FUNC2-NEXT: s_waitcnt lgkmcnt(0) +; FUNC2-NEXT: s_mov_b32 s8, s2 +; FUNC2-NEXT: s_mov_b32 s9, s3 +; FUNC2-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; FUNC2-NEXT: s_mov_b32 s4, s0 +; FUNC2-NEXT: s_mov_b32 s5, s1 +; FUNC2-NEXT: s_waitcnt vmcnt(0) +; FUNC2-NEXT: v_readfirstlane_b32 s2, v1 +; FUNC2-NEXT: v_cvt_f32_u32_e32 v1, s2 +; FUNC2-NEXT: s_sub_i32 s3, 0, s2 +; FUNC2-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; FUNC2-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 +; FUNC2-NEXT: v_cvt_u32_f32_e32 v1, v1 +; FUNC2-NEXT: v_mul_lo_u32 v2, s3, v1 +; FUNC2-NEXT: v_readfirstlane_b32 s3, v0 +; FUNC2-NEXT: v_mul_hi_u32 v2, v1, v2 +; FUNC2-NEXT: v_add_u32_e32 v0, vcc, v1, v2 +; FUNC2-NEXT: v_mul_hi_u32 v0, s3, v0 +; FUNC2-NEXT: v_readfirstlane_b32 s0, v0 +; FUNC2-NEXT: s_mul_i32 s0, s0, s2 +; FUNC2-NEXT: s_sub_i32 s0, s3, s0 +; FUNC2-NEXT: s_sub_i32 s1, s0, s2 +; FUNC2-NEXT: s_cmp_ge_u32 s0, s2 +; FUNC2-NEXT: s_cselect_b32 s0, s1, s0 +; FUNC2-NEXT: s_sub_i32 s1, s0, s2 +; FUNC2-NEXT: s_cmp_ge_u32 s0, s2 +; FUNC2-NEXT: s_cselect_b32 s0, s1, s0 +; FUNC2-NEXT: v_mov_b32_e32 v0, s0 +; FUNC2-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; FUNC2-NEXT: s_endpgm +; +; EG-LABEL: test_urem_i32: +; EG: ; %bb.0: +; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 0 @6 +; EG-NEXT: ALU 15, @9, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause 
starting at 6: +; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 8: +; EG-NEXT: MOV * T0.X, KC0[2].Z, +; EG-NEXT: ALU clause starting at 9: +; EG-NEXT: SUB_INT T0.W, 0.0, T0.Y, +; EG-NEXT: RECIP_UINT * T0.Z, T0.Y, +; EG-NEXT: MULLO_INT * T0.W, PV.W, PS, +; EG-NEXT: MULHI * T0.W, T0.Z, PS, +; EG-NEXT: ADD_INT * T0.W, T0.Z, PS, +; EG-NEXT: MULHI * T0.Z, T0.X, PV.W, +; EG-NEXT: MULLO_INT * T0.Z, PS, T0.Y, +; EG-NEXT: SUB_INT * T0.W, T0.X, PS, +; EG-NEXT: SETGE_UINT T1.W, PV.W, T0.Y, +; EG-NEXT: SUB_INT * T2.W, PV.W, T0.Y, +; EG-NEXT: CNDE_INT * T0.W, PV.W, T0.W, PS, +; EG-NEXT: SETGE_UINT T1.W, PV.W, T0.Y, +; EG-NEXT: SUB_INT * T2.W, PV.W, T0.Y, +; EG-NEXT: CNDE_INT T0.X, PV.W, T0.W, PS, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %b_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1 %a = load i32, ptr addrspace(1) %in %b = load i32, ptr addrspace(1) %b_ptr @@ -18,25 +121,250 @@ ret void } -; FUNC-LABEL: {{^}}test_urem_i32_7: -; SI: s_mov_b32 [[MAGIC:s[0-9]+]], 0x24924925 -; SI: v_mul_hi_u32 {{v[0-9]+}}, {{v[0-9]+}}, [[MAGIC]] -; SI: v_sub_{{[iu]}}32 -; SI: v_mul_lo_u32 -; SI: v_subrev_{{[iu]}}32 -; SI: buffer_store_dword -; SI: s_endpgm define amdgpu_kernel void @test_urem_i32_7(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; FUNC1-LABEL: test_urem_i32_7: +; FUNC1: ; %bb.0: +; FUNC1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; FUNC1-NEXT: s_mov_b32 s7, 0xf000 +; FUNC1-NEXT: s_mov_b32 s6, -1 +; FUNC1-NEXT: s_mov_b32 s10, s6 +; FUNC1-NEXT: s_mov_b32 s11, s7 +; FUNC1-NEXT: s_waitcnt lgkmcnt(0) +; FUNC1-NEXT: s_mov_b32 s8, s2 +; FUNC1-NEXT: s_mov_b32 s9, s3 +; FUNC1-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; FUNC1-NEXT: s_mov_b32 s2, 0x24924925 +; FUNC1-NEXT: s_mov_b32 s4, s0 +; FUNC1-NEXT: s_mov_b32 s5, s1 +; FUNC1-NEXT: s_waitcnt vmcnt(0) +; FUNC1-NEXT: v_mul_hi_u32 v1, v0, s2 +; FUNC1-NEXT: v_sub_i32_e32 v2, vcc, v0, v1 +; FUNC1-NEXT: v_lshrrev_b32_e32 v2, 1, v2 +; FUNC1-NEXT: 
v_add_i32_e32 v1, vcc, v2, v1 +; FUNC1-NEXT: v_lshrrev_b32_e32 v1, 2, v1 +; FUNC1-NEXT: v_mul_lo_u32 v1, v1, 7 +; FUNC1-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 +; FUNC1-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; FUNC1-NEXT: s_endpgm +; +; FUNC2-LABEL: test_urem_i32_7: +; FUNC2: ; %bb.0: +; FUNC2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; FUNC2-NEXT: s_mov_b32 s7, 0xf000 +; FUNC2-NEXT: s_mov_b32 s6, -1 +; FUNC2-NEXT: s_mov_b32 s10, s6 +; FUNC2-NEXT: s_mov_b32 s11, s7 +; FUNC2-NEXT: s_waitcnt lgkmcnt(0) +; FUNC2-NEXT: s_mov_b32 s8, s2 +; FUNC2-NEXT: s_mov_b32 s9, s3 +; FUNC2-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; FUNC2-NEXT: s_mov_b32 s2, 0x24924925 +; FUNC2-NEXT: s_mov_b32 s4, s0 +; FUNC2-NEXT: s_mov_b32 s5, s1 +; FUNC2-NEXT: s_waitcnt vmcnt(0) +; FUNC2-NEXT: v_mul_hi_u32 v1, v0, s2 +; FUNC2-NEXT: v_sub_u32_e32 v2, vcc, v0, v1 +; FUNC2-NEXT: v_lshrrev_b32_e32 v2, 1, v2 +; FUNC2-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; FUNC2-NEXT: v_lshrrev_b32_e32 v1, 2, v1 +; FUNC2-NEXT: v_mul_lo_u32 v1, v1, 7 +; FUNC2-NEXT: v_sub_u32_e32 v0, vcc, v0, v1 +; FUNC2-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; FUNC2-NEXT: s_endpgm +; +; EG-LABEL: test_urem_i32_7: +; EG: ; %bb.0: +; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 0 @6 +; EG-NEXT: ALU 11, @9, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 8: +; EG-NEXT: MOV * T0.X, KC0[2].Z, +; EG-NEXT: ALU clause starting at 9: +; EG-NEXT: MULHI * T0.Y, T0.X, literal.x, +; EG-NEXT: 613566757(6.344132e-17), 0(0.000000e+00) +; EG-NEXT: SUB_INT * T0.W, T0.X, PS, +; EG-NEXT: LSHR * T0.W, PV.W, 1, +; EG-NEXT: ADD_INT * T0.W, PV.W, T0.Y, +; EG-NEXT: LSHR * T0.W, PV.W, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; EG-NEXT: MULLO_INT * T0.Y, PV.W, literal.x, +; EG-NEXT: 7(9.809089e-45), 0(0.000000e+00) +; EG-NEXT: SUB_INT T0.X, T0.X, 
PS, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %num = load i32, ptr addrspace(1) %in %result = urem i32 %num, 7 store i32 %result, ptr addrspace(1) %out ret void } -; FUNC-LABEL: {{^}}test_urem_v2i32: -; SI: s_endpgm -; EG: CF_END define amdgpu_kernel void @test_urem_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; FUNC1-LABEL: test_urem_v2i32: +; FUNC1: ; %bb.0: +; FUNC1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; FUNC1-NEXT: s_mov_b32 s3, 0xf000 +; FUNC1-NEXT: s_mov_b32 s2, -1 +; FUNC1-NEXT: s_mov_b32 s10, s2 +; FUNC1-NEXT: s_mov_b32 s11, s3 +; FUNC1-NEXT: s_waitcnt lgkmcnt(0) +; FUNC1-NEXT: s_mov_b32 s8, s6 +; FUNC1-NEXT: s_mov_b32 s9, s7 +; FUNC1-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; FUNC1-NEXT: s_waitcnt vmcnt(0) +; FUNC1-NEXT: v_readfirstlane_b32 s0, v2 +; FUNC1-NEXT: v_cvt_f32_u32_e32 v2, s0 +; FUNC1-NEXT: s_sub_i32 s1, 0, s0 +; FUNC1-NEXT: v_readfirstlane_b32 s6, v3 +; FUNC1-NEXT: v_cvt_f32_u32_e32 v3, s6 +; FUNC1-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; FUNC1-NEXT: v_readfirstlane_b32 s8, v1 +; FUNC1-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 +; FUNC1-NEXT: v_cvt_u32_f32_e32 v2, v2 +; FUNC1-NEXT: v_mul_lo_u32 v4, s1, v2 +; FUNC1-NEXT: v_readfirstlane_b32 s1, v0 +; FUNC1-NEXT: v_rcp_iflag_f32_e32 v0, v3 +; FUNC1-NEXT: v_mul_hi_u32 v4, v2, v4 +; FUNC1-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; FUNC1-NEXT: v_cvt_u32_f32_e32 v0, v0 +; FUNC1-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; FUNC1-NEXT: v_mul_hi_u32 v2, s1, v2 +; FUNC1-NEXT: v_readfirstlane_b32 s7, v2 +; FUNC1-NEXT: s_mul_i32 s7, s7, s0 +; FUNC1-NEXT: s_sub_i32 s1, s1, s7 +; FUNC1-NEXT: s_sub_i32 s7, s1, s0 +; FUNC1-NEXT: s_cmp_ge_u32 s1, s0 +; FUNC1-NEXT: s_cselect_b32 s1, s7, s1 +; FUNC1-NEXT: s_sub_i32 s7, s1, s0 +; FUNC1-NEXT: s_cmp_ge_u32 s1, s0 +; FUNC1-NEXT: s_cselect_b32 s7, s7, s1 +; FUNC1-NEXT: s_sub_i32 s0, 0, s6 +; FUNC1-NEXT: v_mul_lo_u32 v2, s0, v0 +; FUNC1-NEXT: s_mov_b32 s0, s4 +; FUNC1-NEXT: s_mov_b32 s1, s5 +; FUNC1-NEXT: 
v_mul_hi_u32 v2, v0, v2 +; FUNC1-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; FUNC1-NEXT: v_mul_hi_u32 v0, s8, v0 +; FUNC1-NEXT: v_readfirstlane_b32 s4, v0 +; FUNC1-NEXT: s_mul_i32 s4, s4, s6 +; FUNC1-NEXT: s_sub_i32 s4, s8, s4 +; FUNC1-NEXT: s_sub_i32 s5, s4, s6 +; FUNC1-NEXT: s_cmp_ge_u32 s4, s6 +; FUNC1-NEXT: s_cselect_b32 s4, s5, s4 +; FUNC1-NEXT: s_sub_i32 s5, s4, s6 +; FUNC1-NEXT: s_cmp_ge_u32 s4, s6 +; FUNC1-NEXT: s_cselect_b32 s4, s5, s4 +; FUNC1-NEXT: v_mov_b32_e32 v0, s7 +; FUNC1-NEXT: v_mov_b32_e32 v1, s4 +; FUNC1-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; FUNC1-NEXT: s_endpgm +; +; FUNC2-LABEL: test_urem_v2i32: +; FUNC2: ; %bb.0: +; FUNC2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; FUNC2-NEXT: s_mov_b32 s3, 0xf000 +; FUNC2-NEXT: s_mov_b32 s2, -1 +; FUNC2-NEXT: s_mov_b32 s10, s2 +; FUNC2-NEXT: s_mov_b32 s11, s3 +; FUNC2-NEXT: s_waitcnt lgkmcnt(0) +; FUNC2-NEXT: s_mov_b32 s8, s6 +; FUNC2-NEXT: s_mov_b32 s9, s7 +; FUNC2-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; FUNC2-NEXT: s_waitcnt vmcnt(0) +; FUNC2-NEXT: v_readfirstlane_b32 s0, v2 +; FUNC2-NEXT: v_cvt_f32_u32_e32 v2, s0 +; FUNC2-NEXT: s_sub_i32 s1, 0, s0 +; FUNC2-NEXT: v_readfirstlane_b32 s6, v3 +; FUNC2-NEXT: v_cvt_f32_u32_e32 v3, s6 +; FUNC2-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; FUNC2-NEXT: v_readfirstlane_b32 s8, v1 +; FUNC2-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 +; FUNC2-NEXT: v_cvt_u32_f32_e32 v2, v2 +; FUNC2-NEXT: v_mul_lo_u32 v4, s1, v2 +; FUNC2-NEXT: v_readfirstlane_b32 s1, v0 +; FUNC2-NEXT: v_rcp_iflag_f32_e32 v0, v3 +; FUNC2-NEXT: v_mul_hi_u32 v4, v2, v4 +; FUNC2-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; FUNC2-NEXT: v_cvt_u32_f32_e32 v0, v0 +; FUNC2-NEXT: v_add_u32_e32 v2, vcc, v2, v4 +; FUNC2-NEXT: v_mul_hi_u32 v2, s1, v2 +; FUNC2-NEXT: v_readfirstlane_b32 s7, v2 +; FUNC2-NEXT: s_mul_i32 s7, s7, s0 +; FUNC2-NEXT: s_sub_i32 s1, s1, s7 +; FUNC2-NEXT: s_sub_i32 s7, s1, s0 +; FUNC2-NEXT: s_cmp_ge_u32 s1, s0 +; FUNC2-NEXT: s_cselect_b32 s1, s7, s1 +; FUNC2-NEXT: s_sub_i32 s7, s1, s0 
+; FUNC2-NEXT: s_cmp_ge_u32 s1, s0 +; FUNC2-NEXT: s_cselect_b32 s7, s7, s1 +; FUNC2-NEXT: s_sub_i32 s0, 0, s6 +; FUNC2-NEXT: v_mul_lo_u32 v2, s0, v0 +; FUNC2-NEXT: s_mov_b32 s0, s4 +; FUNC2-NEXT: s_mov_b32 s1, s5 +; FUNC2-NEXT: v_mul_hi_u32 v2, v0, v2 +; FUNC2-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; FUNC2-NEXT: v_mul_hi_u32 v0, s8, v0 +; FUNC2-NEXT: v_readfirstlane_b32 s4, v0 +; FUNC2-NEXT: s_mul_i32 s4, s4, s6 +; FUNC2-NEXT: s_sub_i32 s4, s8, s4 +; FUNC2-NEXT: s_sub_i32 s5, s4, s6 +; FUNC2-NEXT: s_cmp_ge_u32 s4, s6 +; FUNC2-NEXT: s_cselect_b32 s4, s5, s4 +; FUNC2-NEXT: s_sub_i32 s5, s4, s6 +; FUNC2-NEXT: s_cmp_ge_u32 s4, s6 +; FUNC2-NEXT: s_cselect_b32 s4, s5, s4 +; FUNC2-NEXT: v_mov_b32_e32 v0, s7 +; FUNC2-NEXT: v_mov_b32_e32 v1, s4 +; FUNC2-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; FUNC2-NEXT: s_endpgm +; +; EG-LABEL: test_urem_v2i32: +; EG: ; %bb.0: +; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 0 @6 +; EG-NEXT: ALU 29, @9, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 8: +; EG-NEXT: MOV * T0.X, KC0[2].Z, +; EG-NEXT: ALU clause starting at 9: +; EG-NEXT: SUB_INT T1.W, 0.0, T0.W, +; EG-NEXT: RECIP_UINT * T1.X, T0.W, +; EG-NEXT: MULLO_INT * T1.Y, PV.W, PS, +; EG-NEXT: SUB_INT T1.W, 0.0, T0.Z, +; EG-NEXT: RECIP_UINT * T1.Z, T0.Z, +; EG-NEXT: MULLO_INT * T1.W, PV.W, PS, +; EG-NEXT: MULHI * T1.W, T1.Z, PS, +; EG-NEXT: ADD_INT T1.W, T1.Z, PS, +; EG-NEXT: MULHI * T1.Y, T1.X, T1.Y, +; EG-NEXT: ADD_INT T2.W, T1.X, PS, +; EG-NEXT: MULHI * T1.X, T0.X, PV.W, +; EG-NEXT: MULHI * T1.Y, T0.Y, PV.W, +; EG-NEXT: MULLO_INT * T1.Y, PS, T0.W, +; EG-NEXT: SUB_INT T1.W, T0.Y, PS, +; EG-NEXT: MULLO_INT * T0.Y, T1.X, T0.Z, +; EG-NEXT: SUB_INT T1.Z, T0.X, PS, +; EG-NEXT: SETGE_UINT T2.W, PV.W, T0.W, +; EG-NEXT: SUB_INT * T3.W, PV.W, T0.W, +; EG-NEXT: CNDE_INT T2.Z, PV.W, T1.W, PS, 
+; EG-NEXT: SETGE_UINT T1.W, PV.Z, T0.Z, +; EG-NEXT: SUB_INT * T2.W, PV.Z, T0.Z, +; EG-NEXT: CNDE_INT T1.Z, PV.W, T1.Z, PS, +; EG-NEXT: SETGE_UINT T1.W, PV.Z, T0.W, +; EG-NEXT: SUB_INT * T0.W, PV.Z, T0.W, +; EG-NEXT: CNDE_INT T0.Y, PV.W, T2.Z, PS, BS:VEC_021/SCL_122 +; EG-NEXT: SETGE_UINT T0.W, PV.Z, T0.Z, +; EG-NEXT: SUB_INT * T1.W, PV.Z, T0.Z, +; EG-NEXT: CNDE_INT T0.X, PV.W, T1.Z, PS, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %b_ptr = getelementptr <2 x i32>, ptr addrspace(1) %in, i32 1 %a = load <2 x i32>, ptr addrspace(1) %in %b = load <2 x i32>, ptr addrspace(1) %b_ptr @@ -45,10 +373,285 @@ ret void } -; FUNC-LABEL: {{^}}test_urem_v4i32: -; SI: s_endpgm -; EG: CF_END define amdgpu_kernel void @test_urem_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; FUNC1-LABEL: test_urem_v4i32: +; FUNC1: ; %bb.0: +; FUNC1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; FUNC1-NEXT: s_mov_b32 s3, 0xf000 +; FUNC1-NEXT: s_mov_b32 s2, -1 +; FUNC1-NEXT: s_mov_b32 s10, s2 +; FUNC1-NEXT: s_mov_b32 s11, s3 +; FUNC1-NEXT: s_waitcnt lgkmcnt(0) +; FUNC1-NEXT: s_mov_b32 s8, s6 +; FUNC1-NEXT: s_mov_b32 s9, s7 +; FUNC1-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16 +; FUNC1-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 +; FUNC1-NEXT: s_waitcnt vmcnt(1) +; FUNC1-NEXT: v_readfirstlane_b32 s0, v0 +; FUNC1-NEXT: v_cvt_f32_u32_e32 v0, s0 +; FUNC1-NEXT: s_sub_i32 s1, 0, s0 +; FUNC1-NEXT: s_waitcnt vmcnt(0) +; FUNC1-NEXT: v_readfirstlane_b32 s6, v4 +; FUNC1-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; FUNC1-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; FUNC1-NEXT: v_cvt_u32_f32_e32 v0, v0 +; FUNC1-NEXT: v_mul_lo_u32 v8, s1, v0 +; FUNC1-NEXT: v_readfirstlane_b32 s1, v1 +; FUNC1-NEXT: v_cvt_f32_u32_e32 v1, s1 +; FUNC1-NEXT: v_mul_hi_u32 v8, v0, v8 +; FUNC1-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; FUNC1-NEXT: v_add_i32_e32 v0, vcc, v0, v8 +; FUNC1-NEXT: v_mul_hi_u32 v0, s6, v0 +; FUNC1-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 +; FUNC1-NEXT: 
v_cvt_u32_f32_e32 v1, v1 +; FUNC1-NEXT: v_readfirstlane_b32 s7, v0 +; FUNC1-NEXT: s_mul_i32 s7, s7, s0 +; FUNC1-NEXT: s_sub_i32 s6, s6, s7 +; FUNC1-NEXT: s_sub_i32 s7, s6, s0 +; FUNC1-NEXT: s_cmp_ge_u32 s6, s0 +; FUNC1-NEXT: s_cselect_b32 s6, s7, s6 +; FUNC1-NEXT: s_sub_i32 s7, s6, s0 +; FUNC1-NEXT: s_cmp_ge_u32 s6, s0 +; FUNC1-NEXT: s_cselect_b32 s6, s7, s6 +; FUNC1-NEXT: s_sub_i32 s0, 0, s1 +; FUNC1-NEXT: v_mul_lo_u32 v0, s0, v1 +; FUNC1-NEXT: v_readfirstlane_b32 s0, v2 +; FUNC1-NEXT: v_readfirstlane_b32 s7, v5 +; FUNC1-NEXT: v_cvt_f32_u32_e32 v2, s0 +; FUNC1-NEXT: v_mul_hi_u32 v0, v1, v0 +; FUNC1-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; FUNC1-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; FUNC1-NEXT: v_mul_hi_u32 v0, s7, v0 +; FUNC1-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 +; FUNC1-NEXT: v_cvt_u32_f32_e32 v1, v1 +; FUNC1-NEXT: v_readfirstlane_b32 s8, v0 +; FUNC1-NEXT: s_mul_i32 s8, s8, s1 +; FUNC1-NEXT: s_sub_i32 s7, s7, s8 +; FUNC1-NEXT: s_sub_i32 s8, s7, s1 +; FUNC1-NEXT: s_cmp_ge_u32 s7, s1 +; FUNC1-NEXT: s_cselect_b32 s7, s8, s7 +; FUNC1-NEXT: s_sub_i32 s8, s7, s1 +; FUNC1-NEXT: s_cmp_ge_u32 s7, s1 +; FUNC1-NEXT: s_cselect_b32 s7, s8, s7 +; FUNC1-NEXT: s_sub_i32 s1, 0, s0 +; FUNC1-NEXT: v_mul_lo_u32 v0, s1, v1 +; FUNC1-NEXT: v_readfirstlane_b32 s8, v3 +; FUNC1-NEXT: v_readfirstlane_b32 s1, v6 +; FUNC1-NEXT: v_cvt_f32_u32_e32 v2, s8 +; FUNC1-NEXT: v_mul_hi_u32 v0, v1, v0 +; FUNC1-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; FUNC1-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; FUNC1-NEXT: v_mul_hi_u32 v0, s1, v0 +; FUNC1-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 +; FUNC1-NEXT: v_cvt_u32_f32_e32 v1, v1 +; FUNC1-NEXT: v_readfirstlane_b32 s9, v0 +; FUNC1-NEXT: s_mul_i32 s9, s9, s0 +; FUNC1-NEXT: s_sub_i32 s1, s1, s9 +; FUNC1-NEXT: s_sub_i32 s9, s1, s0 +; FUNC1-NEXT: s_cmp_ge_u32 s1, s0 +; FUNC1-NEXT: s_cselect_b32 s1, s9, s1 +; FUNC1-NEXT: s_sub_i32 s9, s1, s0 +; FUNC1-NEXT: s_cmp_ge_u32 s1, s0 +; FUNC1-NEXT: s_cselect_b32 s9, s9, s1 +; FUNC1-NEXT: s_sub_i32 s0, 0, s8 +; FUNC1-NEXT: v_mul_lo_u32 
v0, s0, v1 +; FUNC1-NEXT: s_mov_b32 s0, s4 +; FUNC1-NEXT: v_readfirstlane_b32 s4, v7 +; FUNC1-NEXT: s_mov_b32 s1, s5 +; FUNC1-NEXT: v_mul_hi_u32 v0, v1, v0 +; FUNC1-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; FUNC1-NEXT: v_mul_hi_u32 v2, s4, v0 +; FUNC1-NEXT: v_mov_b32_e32 v0, s6 +; FUNC1-NEXT: v_mov_b32_e32 v1, s7 +; FUNC1-NEXT: v_readfirstlane_b32 s5, v2 +; FUNC1-NEXT: s_mul_i32 s5, s5, s8 +; FUNC1-NEXT: s_sub_i32 s4, s4, s5 +; FUNC1-NEXT: s_sub_i32 s5, s4, s8 +; FUNC1-NEXT: s_cmp_ge_u32 s4, s8 +; FUNC1-NEXT: s_cselect_b32 s4, s5, s4 +; FUNC1-NEXT: s_sub_i32 s5, s4, s8 +; FUNC1-NEXT: s_cmp_ge_u32 s4, s8 +; FUNC1-NEXT: s_cselect_b32 s4, s5, s4 +; FUNC1-NEXT: v_mov_b32_e32 v2, s9 +; FUNC1-NEXT: v_mov_b32_e32 v3, s4 +; FUNC1-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; FUNC1-NEXT: s_endpgm +; +; FUNC2-LABEL: test_urem_v4i32: +; FUNC2: ; %bb.0: +; FUNC2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; FUNC2-NEXT: s_mov_b32 s3, 0xf000 +; FUNC2-NEXT: s_mov_b32 s2, -1 +; FUNC2-NEXT: s_mov_b32 s10, s2 +; FUNC2-NEXT: s_mov_b32 s11, s3 +; FUNC2-NEXT: s_waitcnt lgkmcnt(0) +; FUNC2-NEXT: s_mov_b32 s8, s6 +; FUNC2-NEXT: s_mov_b32 s9, s7 +; FUNC2-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16 +; FUNC2-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 +; FUNC2-NEXT: s_waitcnt vmcnt(1) +; FUNC2-NEXT: v_readfirstlane_b32 s0, v0 +; FUNC2-NEXT: v_cvt_f32_u32_e32 v0, s0 +; FUNC2-NEXT: s_sub_i32 s1, 0, s0 +; FUNC2-NEXT: s_waitcnt vmcnt(0) +; FUNC2-NEXT: v_readfirstlane_b32 s6, v4 +; FUNC2-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; FUNC2-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; FUNC2-NEXT: v_cvt_u32_f32_e32 v0, v0 +; FUNC2-NEXT: v_mul_lo_u32 v8, s1, v0 +; FUNC2-NEXT: v_readfirstlane_b32 s1, v1 +; FUNC2-NEXT: v_cvt_f32_u32_e32 v1, s1 +; FUNC2-NEXT: v_mul_hi_u32 v8, v0, v8 +; FUNC2-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; FUNC2-NEXT: v_add_u32_e32 v0, vcc, v0, v8 +; FUNC2-NEXT: v_mul_hi_u32 v0, s6, v0 +; FUNC2-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 +; FUNC2-NEXT: v_cvt_u32_f32_e32 v1, 
v1 +; FUNC2-NEXT: v_readfirstlane_b32 s7, v0 +; FUNC2-NEXT: s_mul_i32 s7, s7, s0 +; FUNC2-NEXT: s_sub_i32 s6, s6, s7 +; FUNC2-NEXT: s_sub_i32 s7, s6, s0 +; FUNC2-NEXT: s_cmp_ge_u32 s6, s0 +; FUNC2-NEXT: s_cselect_b32 s6, s7, s6 +; FUNC2-NEXT: s_sub_i32 s7, s6, s0 +; FUNC2-NEXT: s_cmp_ge_u32 s6, s0 +; FUNC2-NEXT: s_cselect_b32 s6, s7, s6 +; FUNC2-NEXT: s_sub_i32 s0, 0, s1 +; FUNC2-NEXT: v_mul_lo_u32 v0, s0, v1 +; FUNC2-NEXT: v_readfirstlane_b32 s0, v2 +; FUNC2-NEXT: v_readfirstlane_b32 s7, v5 +; FUNC2-NEXT: v_cvt_f32_u32_e32 v2, s0 +; FUNC2-NEXT: v_mul_hi_u32 v0, v1, v0 +; FUNC2-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; FUNC2-NEXT: v_add_u32_e32 v0, vcc, v1, v0 +; FUNC2-NEXT: v_mul_hi_u32 v0, s7, v0 +; FUNC2-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 +; FUNC2-NEXT: v_cvt_u32_f32_e32 v1, v1 +; FUNC2-NEXT: v_readfirstlane_b32 s8, v0 +; FUNC2-NEXT: s_mul_i32 s8, s8, s1 +; FUNC2-NEXT: s_sub_i32 s7, s7, s8 +; FUNC2-NEXT: s_sub_i32 s8, s7, s1 +; FUNC2-NEXT: s_cmp_ge_u32 s7, s1 +; FUNC2-NEXT: s_cselect_b32 s7, s8, s7 +; FUNC2-NEXT: s_sub_i32 s8, s7, s1 +; FUNC2-NEXT: s_cmp_ge_u32 s7, s1 +; FUNC2-NEXT: s_cselect_b32 s7, s8, s7 +; FUNC2-NEXT: s_sub_i32 s1, 0, s0 +; FUNC2-NEXT: v_mul_lo_u32 v0, s1, v1 +; FUNC2-NEXT: v_readfirstlane_b32 s8, v3 +; FUNC2-NEXT: v_readfirstlane_b32 s1, v6 +; FUNC2-NEXT: v_cvt_f32_u32_e32 v2, s8 +; FUNC2-NEXT: v_mul_hi_u32 v0, v1, v0 +; FUNC2-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; FUNC2-NEXT: v_add_u32_e32 v0, vcc, v1, v0 +; FUNC2-NEXT: v_mul_hi_u32 v0, s1, v0 +; FUNC2-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 +; FUNC2-NEXT: v_cvt_u32_f32_e32 v1, v1 +; FUNC2-NEXT: v_readfirstlane_b32 s9, v0 +; FUNC2-NEXT: s_mul_i32 s9, s9, s0 +; FUNC2-NEXT: s_sub_i32 s1, s1, s9 +; FUNC2-NEXT: s_sub_i32 s9, s1, s0 +; FUNC2-NEXT: s_cmp_ge_u32 s1, s0 +; FUNC2-NEXT: s_cselect_b32 s1, s9, s1 +; FUNC2-NEXT: s_sub_i32 s9, s1, s0 +; FUNC2-NEXT: s_cmp_ge_u32 s1, s0 +; FUNC2-NEXT: s_cselect_b32 s9, s9, s1 +; FUNC2-NEXT: s_sub_i32 s0, 0, s8 +; FUNC2-NEXT: v_mul_lo_u32 v0, s0, v1 +; 
FUNC2-NEXT: s_mov_b32 s0, s4 +; FUNC2-NEXT: v_readfirstlane_b32 s4, v7 +; FUNC2-NEXT: s_mov_b32 s1, s5 +; FUNC2-NEXT: v_mul_hi_u32 v0, v1, v0 +; FUNC2-NEXT: v_add_u32_e32 v0, vcc, v1, v0 +; FUNC2-NEXT: v_mul_hi_u32 v2, s4, v0 +; FUNC2-NEXT: v_mov_b32_e32 v0, s6 +; FUNC2-NEXT: v_mov_b32_e32 v1, s7 +; FUNC2-NEXT: v_readfirstlane_b32 s5, v2 +; FUNC2-NEXT: s_mul_i32 s5, s5, s8 +; FUNC2-NEXT: s_sub_i32 s4, s4, s5 +; FUNC2-NEXT: s_sub_i32 s5, s4, s8 +; FUNC2-NEXT: s_cmp_ge_u32 s4, s8 +; FUNC2-NEXT: s_cselect_b32 s4, s5, s4 +; FUNC2-NEXT: s_sub_i32 s5, s4, s8 +; FUNC2-NEXT: s_cmp_ge_u32 s4, s8 +; FUNC2-NEXT: s_cselect_b32 s4, s5, s4 +; FUNC2-NEXT: v_mov_b32_e32 v2, s9 +; FUNC2-NEXT: v_mov_b32_e32 v3, s4 +; FUNC2-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; FUNC2-NEXT: s_endpgm +; +; EG-LABEL: test_urem_v4i32: +; EG: ; %bb.0: +; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 1 @6 +; EG-NEXT: ALU 57, @11, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T0.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 16, #1 +; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 10: +; EG-NEXT: MOV * T0.X, KC0[2].Z, +; EG-NEXT: ALU clause starting at 11: +; EG-NEXT: SUB_INT T2.W, 0.0, T1.Z, +; EG-NEXT: RECIP_UINT * T2.X, T1.Z, +; EG-NEXT: MULLO_INT * T2.Y, PV.W, PS, +; EG-NEXT: SUB_INT T2.W, 0.0, T1.Y, +; EG-NEXT: RECIP_UINT * T2.Z, T1.Y, +; EG-NEXT: MULLO_INT * T2.W, PV.W, PS, +; EG-NEXT: MULHI * T2.W, T2.Z, PS, +; EG-NEXT: ADD_INT T2.W, T2.Z, PS, +; EG-NEXT: MULHI * T2.Y, T2.X, T2.Y, +; EG-NEXT: ADD_INT T3.W, T2.X, PS, +; EG-NEXT: MULHI * T2.X, T0.Y, PV.W, +; EG-NEXT: MULHI * T2.Y, T0.Z, PV.W, +; EG-NEXT: SUB_INT T2.W, 0.0, T1.W, +; EG-NEXT: RECIP_UINT * T2.Z, T1.W, +; EG-NEXT: MULLO_INT * T2.W, PV.W, PS, +; EG-NEXT: MULHI * T2.W, T2.Z, PS, +; EG-NEXT: ADD_INT T2.W, T2.Z, PS, +; EG-NEXT: MULLO_INT * T2.Y, T2.Y, T1.Z, +; EG-NEXT: MULHI * T2.Z, T0.W, 
PV.W, +; EG-NEXT: MULLO_INT * T2.Z, PS, T1.W, +; EG-NEXT: SUB_INT T2.W, 0.0, T1.X, +; EG-NEXT: RECIP_UINT * T3.X, T1.X, +; EG-NEXT: MULLO_INT * T2.W, PV.W, PS, +; EG-NEXT: MULHI * T2.W, T3.X, PS, +; EG-NEXT: ADD_INT T3.Y, T3.X, PS, +; EG-NEXT: SUB_INT T2.Z, T0.W, T2.Z, +; EG-NEXT: SUB_INT T0.W, T0.Z, T2.Y, BS:VEC_021/SCL_122 +; EG-NEXT: MULLO_INT * T0.Z, T2.X, T1.Y, +; EG-NEXT: SETGE_UINT T2.X, PV.W, T1.Z, +; EG-NEXT: SUB_INT T0.Y, T0.Y, PS, +; EG-NEXT: SETGE_UINT T0.Z, PV.Z, T1.W, +; EG-NEXT: SUB_INT T2.W, PV.Z, T1.W, +; EG-NEXT: MULHI * T2.Y, T0.X, PV.Y, +; EG-NEXT: SUB_INT T3.X, T0.W, T1.Z, +; EG-NEXT: CNDE_INT T3.Y, PV.Z, T2.Z, PV.W, BS:VEC_021/SCL_122 +; EG-NEXT: SETGE_UINT T0.Z, PV.Y, T1.Y, +; EG-NEXT: SUB_INT T2.W, PV.Y, T1.Y, +; EG-NEXT: MULLO_INT * T2.Y, PS, T1.X, +; EG-NEXT: CNDE_INT T4.X, PV.Z, T0.Y, PV.W, +; EG-NEXT: SETGE_UINT T0.Y, PV.Y, T1.W, BS:VEC_021/SCL_122 +; EG-NEXT: SUB_INT T0.Z, PV.Y, T1.W, BS:VEC_021/SCL_122 +; EG-NEXT: SUB_INT T1.W, T0.X, PS, +; EG-NEXT: CNDE_INT * T0.W, T2.X, T0.W, PV.X, +; EG-NEXT: SETGE_UINT T0.X, PS, T1.Z, +; EG-NEXT: SUB_INT T2.Y, PS, T1.Z, +; EG-NEXT: SETGE_UINT T1.Z, PV.W, T1.X, +; EG-NEXT: SUB_INT T2.W, PV.W, T1.X, +; EG-NEXT: CNDE_INT * T3.W, PV.Y, T3.Y, PV.Z, +; EG-NEXT: CNDE_INT T0.Y, PV.Z, T1.W, PV.W, +; EG-NEXT: CNDE_INT T3.Z, PV.X, T0.W, PV.Y, BS:VEC_021/SCL_122 +; EG-NEXT: SETGE_UINT T0.W, T4.X, T1.Y, +; EG-NEXT: SUB_INT * T1.W, T4.X, T1.Y, +; EG-NEXT: CNDE_INT T3.Y, PV.W, T4.X, PS, BS:VEC_021/SCL_122 +; EG-NEXT: SETGE_UINT T0.W, PV.Y, T1.X, +; EG-NEXT: SUB_INT * T1.W, PV.Y, T1.X, +; EG-NEXT: CNDE_INT T3.X, PV.W, T0.Y, PS, +; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %b_ptr = getelementptr <4 x i32>, ptr addrspace(1) %in, i32 1 %a = load <4 x i32>, ptr addrspace(1) %in %b = load <4 x i32>, ptr addrspace(1) %b_ptr @@ -57,10 +660,862 @@ ret void } -; FUNC-LABEL: {{^}}test_urem_i64: -; SI: s_endpgm -; EG: CF_END define amdgpu_kernel void @test_urem_i64(ptr 
addrspace(1) %out, ptr addrspace(1) %in) { +; FUNC1-LABEL: test_urem_i64: +; FUNC1: ; %bb.0: +; FUNC1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; FUNC1-NEXT: s_mov_b32 s3, 0xf000 +; FUNC1-NEXT: s_mov_b32 s2, -1 +; FUNC1-NEXT: v_mov_b32_e32 v4, 0 +; FUNC1-NEXT: s_waitcnt lgkmcnt(0) +; FUNC1-NEXT: s_mov_b32 s0, s6 +; FUNC1-NEXT: s_mov_b32 s1, s7 +; FUNC1-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 +; FUNC1-NEXT: s_waitcnt vmcnt(0) +; FUNC1-NEXT: v_or_b32_e32 v5, v1, v3 +; FUNC1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; FUNC1-NEXT: s_cbranch_vccz .LBB4_4 +; FUNC1-NEXT: ; %bb.1: +; FUNC1-NEXT: v_cvt_f32_u32_e32 v4, v2 +; FUNC1-NEXT: v_cvt_f32_u32_e32 v5, v3 +; FUNC1-NEXT: v_sub_i32_e32 v6, vcc, 0, v2 +; FUNC1-NEXT: v_subb_u32_e32 v7, vcc, 0, v3, vcc +; FUNC1-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 +; FUNC1-NEXT: v_rcp_f32_e32 v4, v4 +; FUNC1-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 +; FUNC1-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 +; FUNC1-NEXT: v_trunc_f32_e32 v5, v5 +; FUNC1-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 +; FUNC1-NEXT: v_cvt_u32_f32_e32 v5, v5 +; FUNC1-NEXT: v_cvt_u32_f32_e32 v4, v4 +; FUNC1-NEXT: v_mul_lo_u32 v8, v6, v5 +; FUNC1-NEXT: v_mul_hi_u32 v9, v6, v4 +; FUNC1-NEXT: v_mul_lo_u32 v11, v7, v4 +; FUNC1-NEXT: v_mul_lo_u32 v10, v6, v4 +; FUNC1-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; FUNC1-NEXT: v_add_i32_e32 v8, vcc, v8, v11 +; FUNC1-NEXT: v_mul_hi_u32 v9, v4, v10 +; FUNC1-NEXT: v_mul_lo_u32 v11, v4, v8 +; FUNC1-NEXT: v_mul_hi_u32 v12, v4, v8 +; FUNC1-NEXT: v_mul_hi_u32 v13, v5, v8 +; FUNC1-NEXT: v_mul_lo_u32 v8, v5, v8 +; FUNC1-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; FUNC1-NEXT: v_addc_u32_e32 v11, vcc, 0, v12, vcc +; FUNC1-NEXT: v_mul_lo_u32 v12, v5, v10 +; FUNC1-NEXT: v_mul_hi_u32 v10, v5, v10 +; FUNC1-NEXT: v_add_i32_e32 v9, vcc, v9, v12 +; FUNC1-NEXT: v_addc_u32_e32 v9, vcc, v11, v10, vcc +; FUNC1-NEXT: v_addc_u32_e32 v10, vcc, 0, v13, vcc +; FUNC1-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; FUNC1-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc +; FUNC1-NEXT: 
v_add_i32_e32 v4, vcc, v4, v8 +; FUNC1-NEXT: v_addc_u32_e32 v5, vcc, v5, v9, vcc +; FUNC1-NEXT: v_mul_lo_u32 v8, v6, v5 +; FUNC1-NEXT: v_mul_hi_u32 v9, v6, v4 +; FUNC1-NEXT: v_mul_lo_u32 v7, v7, v4 +; FUNC1-NEXT: v_mul_lo_u32 v6, v6, v4 +; FUNC1-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; FUNC1-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; FUNC1-NEXT: v_mul_lo_u32 v10, v4, v7 +; FUNC1-NEXT: v_mul_hi_u32 v11, v4, v6 +; FUNC1-NEXT: v_mul_hi_u32 v12, v4, v7 +; FUNC1-NEXT: v_mul_hi_u32 v9, v5, v6 +; FUNC1-NEXT: v_mul_lo_u32 v6, v5, v6 +; FUNC1-NEXT: v_mul_hi_u32 v8, v5, v7 +; FUNC1-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; FUNC1-NEXT: v_addc_u32_e32 v11, vcc, 0, v12, vcc +; FUNC1-NEXT: v_mul_lo_u32 v7, v5, v7 +; FUNC1-NEXT: v_add_i32_e32 v6, vcc, v10, v6 +; FUNC1-NEXT: v_addc_u32_e32 v6, vcc, v11, v9, vcc +; FUNC1-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc +; FUNC1-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; FUNC1-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc +; FUNC1-NEXT: v_add_i32_e32 v4, vcc, v4, v6 +; FUNC1-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc +; FUNC1-NEXT: v_mul_lo_u32 v6, v0, v5 +; FUNC1-NEXT: v_mul_hi_u32 v7, v0, v4 +; FUNC1-NEXT: v_mul_hi_u32 v8, v0, v5 +; FUNC1-NEXT: v_mul_hi_u32 v9, v1, v5 +; FUNC1-NEXT: v_mul_lo_u32 v5, v1, v5 +; FUNC1-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; FUNC1-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc +; FUNC1-NEXT: v_mul_lo_u32 v8, v1, v4 +; FUNC1-NEXT: v_mul_hi_u32 v4, v1, v4 +; FUNC1-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; FUNC1-NEXT: v_addc_u32_e32 v4, vcc, v7, v4, vcc +; FUNC1-NEXT: v_addc_u32_e32 v6, vcc, 0, v9, vcc +; FUNC1-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; FUNC1-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc +; FUNC1-NEXT: v_mul_lo_u32 v5, v2, v5 +; FUNC1-NEXT: v_mul_hi_u32 v6, v2, v4 +; FUNC1-NEXT: v_mul_lo_u32 v7, v3, v4 +; FUNC1-NEXT: v_mul_lo_u32 v4, v2, v4 +; FUNC1-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; FUNC1-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; FUNC1-NEXT: v_sub_i32_e32 v6, vcc, v1, v5 +; FUNC1-NEXT: v_sub_i32_e32 v7, vcc, v0, v4 +; 
FUNC1-NEXT: v_subb_u32_e64 v4, s[0:1], v6, v3, vcc +; FUNC1-NEXT: v_sub_i32_e64 v6, s[0:1], v7, v2 +; FUNC1-NEXT: v_subbrev_u32_e64 v8, s[2:3], 0, v4, s[0:1] +; FUNC1-NEXT: v_cmp_ge_u32_e64 s[2:3], v8, v3 +; FUNC1-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[2:3] +; FUNC1-NEXT: v_cmp_ge_u32_e64 s[2:3], v6, v2 +; FUNC1-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[2:3] +; FUNC1-NEXT: v_cmp_eq_u32_e64 s[2:3], v8, v3 +; FUNC1-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v3, s[0:1] +; FUNC1-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[2:3] +; FUNC1-NEXT: v_sub_i32_e64 v10, s[0:1], v6, v2 +; FUNC1-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc +; FUNC1-NEXT: v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1] +; FUNC1-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 +; FUNC1-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v9 +; FUNC1-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; FUNC1-NEXT: v_cmp_ge_u32_e32 vcc, v7, v2 +; FUNC1-NEXT: v_cndmask_b32_e64 v4, v8, v4, s[0:1] +; FUNC1-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc +; FUNC1-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 +; FUNC1-NEXT: v_cndmask_b32_e32 v3, v5, v8, vcc +; FUNC1-NEXT: v_cndmask_b32_e64 v6, v6, v10, s[0:1] +; FUNC1-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; FUNC1-NEXT: v_cndmask_b32_e32 v4, v1, v4, vcc +; FUNC1-NEXT: v_cndmask_b32_e32 v3, v7, v6, vcc +; FUNC1-NEXT: s_cbranch_execnz .LBB4_3 +; FUNC1-NEXT: .LBB4_2: +; FUNC1-NEXT: v_cvt_f32_u32_e32 v1, v2 +; FUNC1-NEXT: v_sub_i32_e32 v3, vcc, 0, v2 +; FUNC1-NEXT: v_mov_b32_e32 v4, 0 +; FUNC1-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; FUNC1-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 +; FUNC1-NEXT: v_cvt_u32_f32_e32 v1, v1 +; FUNC1-NEXT: v_mul_lo_u32 v3, v3, v1 +; FUNC1-NEXT: v_mul_hi_u32 v3, v1, v3 +; FUNC1-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; FUNC1-NEXT: v_mul_hi_u32 v1, v0, v1 +; FUNC1-NEXT: v_mul_lo_u32 v1, v1, v2 +; FUNC1-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 +; FUNC1-NEXT: v_sub_i32_e32 v1, vcc, v0, v2 +; FUNC1-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 +; FUNC1-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; FUNC1-NEXT: v_sub_i32_e32 v1, vcc, v0, v2 +; FUNC1-NEXT: 
v_cmp_ge_u32_e32 vcc, v0, v2 +; FUNC1-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc +; FUNC1-NEXT: .LBB4_3: +; FUNC1-NEXT: s_mov_b32 s7, 0xf000 +; FUNC1-NEXT: s_mov_b32 s6, -1 +; FUNC1-NEXT: buffer_store_dwordx2 v[3:4], off, s[4:7], 0 +; FUNC1-NEXT: s_endpgm +; FUNC1-NEXT: .LBB4_4: +; FUNC1-NEXT: ; implicit-def: $vgpr3_vgpr4 +; FUNC1-NEXT: s_branch .LBB4_2 +; +; FUNC2-LABEL: test_urem_i64: +; FUNC2: ; %bb.0: +; FUNC2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; FUNC2-NEXT: s_mov_b32 s3, 0xf000 +; FUNC2-NEXT: s_mov_b32 s2, -1 +; FUNC2-NEXT: v_mov_b32_e32 v4, 0 +; FUNC2-NEXT: s_waitcnt lgkmcnt(0) +; FUNC2-NEXT: s_mov_b32 s0, s6 +; FUNC2-NEXT: s_mov_b32 s1, s7 +; FUNC2-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 +; FUNC2-NEXT: s_waitcnt vmcnt(0) +; FUNC2-NEXT: v_or_b32_e32 v5, v1, v3 +; FUNC2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; FUNC2-NEXT: s_cbranch_vccz .LBB4_4 +; FUNC2-NEXT: ; %bb.1: +; FUNC2-NEXT: v_cvt_f32_u32_e32 v4, v2 +; FUNC2-NEXT: v_cvt_f32_u32_e32 v5, v3 +; FUNC2-NEXT: v_sub_u32_e32 v10, vcc, 0, v2 +; FUNC2-NEXT: v_subb_u32_e32 v11, vcc, 0, v3, vcc +; FUNC2-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 +; FUNC2-NEXT: v_rcp_f32_e32 v4, v4 +; FUNC2-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 +; FUNC2-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 +; FUNC2-NEXT: v_trunc_f32_e32 v5, v5 +; FUNC2-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 +; FUNC2-NEXT: v_cvt_u32_f32_e32 v8, v5 +; FUNC2-NEXT: v_cvt_u32_f32_e32 v9, v4 +; FUNC2-NEXT: v_mul_lo_u32 v6, v10, v8 +; FUNC2-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v10, v9, 0 +; FUNC2-NEXT: v_mul_lo_u32 v7, v11, v9 +; FUNC2-NEXT: v_add_u32_e32 v5, vcc, v5, v6 +; FUNC2-NEXT: v_add_u32_e32 v7, vcc, v5, v7 +; FUNC2-NEXT: v_mul_hi_u32 v12, v9, v4 +; FUNC2-NEXT: v_mad_u64_u32 v[5:6], s[0:1], v9, v7, 0 +; FUNC2-NEXT: v_add_u32_e32 v12, vcc, v12, v5 +; FUNC2-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v8, v4, 0 +; FUNC2-NEXT: v_addc_u32_e32 v13, vcc, 0, v6, vcc +; FUNC2-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v8, v7, 0 +; FUNC2-NEXT: v_add_u32_e32 v4, vcc, v12, v4 +; 
FUNC2-NEXT: v_addc_u32_e32 v4, vcc, v13, v5, vcc +; FUNC2-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; FUNC2-NEXT: v_add_u32_e32 v4, vcc, v4, v6 +; FUNC2-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; FUNC2-NEXT: v_add_u32_e32 v12, vcc, v9, v4 +; FUNC2-NEXT: v_addc_u32_e32 v13, vcc, v8, v5, vcc +; FUNC2-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v10, v12, 0 +; FUNC2-NEXT: v_mul_lo_u32 v8, v10, v13 +; FUNC2-NEXT: v_mul_lo_u32 v9, v11, v12 +; FUNC2-NEXT: v_mul_hi_u32 v10, v12, v4 +; FUNC2-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v13, v4, 0 +; FUNC2-NEXT: v_add_u32_e32 v5, vcc, v5, v8 +; FUNC2-NEXT: v_add_u32_e32 v5, vcc, v5, v9 +; FUNC2-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v12, v5, 0 +; FUNC2-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v13, v5, 0 +; FUNC2-NEXT: v_add_u32_e32 v8, vcc, v10, v8 +; FUNC2-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; FUNC2-NEXT: v_add_u32_e32 v6, vcc, v8, v6 +; FUNC2-NEXT: v_addc_u32_e32 v6, vcc, v9, v7, vcc +; FUNC2-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; FUNC2-NEXT: v_add_u32_e32 v4, vcc, v6, v4 +; FUNC2-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; FUNC2-NEXT: v_add_u32_e32 v6, vcc, v12, v4 +; FUNC2-NEXT: v_addc_u32_e32 v7, vcc, v13, v5, vcc +; FUNC2-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v0, v7, 0 +; FUNC2-NEXT: v_mul_hi_u32 v8, v0, v6 +; FUNC2-NEXT: v_add_u32_e32 v8, vcc, v8, v4 +; FUNC2-NEXT: v_addc_u32_e32 v9, vcc, 0, v5, vcc +; FUNC2-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v1, v6, 0 +; FUNC2-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v1, v7, 0 +; FUNC2-NEXT: v_add_u32_e32 v4, vcc, v8, v4 +; FUNC2-NEXT: v_addc_u32_e32 v4, vcc, v9, v5, vcc +; FUNC2-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; FUNC2-NEXT: v_add_u32_e32 v6, vcc, v4, v6 +; FUNC2-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc +; FUNC2-NEXT: v_mul_lo_u32 v7, v2, v4 +; FUNC2-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v2, v6, 0 +; FUNC2-NEXT: v_mul_lo_u32 v6, v3, v6 +; FUNC2-NEXT: v_add_u32_e32 v5, vcc, v5, v7 +; FUNC2-NEXT: v_add_u32_e32 v5, vcc, v5, v6 +; FUNC2-NEXT: v_sub_u32_e32 v6, vcc, v1, v5 +; FUNC2-NEXT: 
v_sub_u32_e32 v7, vcc, v0, v4 +; FUNC2-NEXT: v_subb_u32_e64 v4, s[0:1], v6, v3, vcc +; FUNC2-NEXT: v_sub_u32_e64 v6, s[0:1], v7, v2 +; FUNC2-NEXT: v_subbrev_u32_e64 v8, s[2:3], 0, v4, s[0:1] +; FUNC2-NEXT: v_cmp_ge_u32_e64 s[2:3], v8, v3 +; FUNC2-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[2:3] +; FUNC2-NEXT: v_cmp_ge_u32_e64 s[2:3], v6, v2 +; FUNC2-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[2:3] +; FUNC2-NEXT: v_cmp_eq_u32_e64 s[2:3], v8, v3 +; FUNC2-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v3, s[0:1] +; FUNC2-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[2:3] +; FUNC2-NEXT: v_sub_u32_e64 v10, s[0:1], v6, v2 +; FUNC2-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc +; FUNC2-NEXT: v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1] +; FUNC2-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 +; FUNC2-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v9 +; FUNC2-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; FUNC2-NEXT: v_cmp_ge_u32_e32 vcc, v7, v2 +; FUNC2-NEXT: v_cndmask_b32_e64 v4, v8, v4, s[0:1] +; FUNC2-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc +; FUNC2-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 +; FUNC2-NEXT: v_cndmask_b32_e32 v3, v5, v8, vcc +; FUNC2-NEXT: v_cndmask_b32_e64 v6, v6, v10, s[0:1] +; FUNC2-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; FUNC2-NEXT: v_cndmask_b32_e32 v4, v1, v4, vcc +; FUNC2-NEXT: v_cndmask_b32_e32 v3, v7, v6, vcc +; FUNC2-NEXT: s_cbranch_execnz .LBB4_3 +; FUNC2-NEXT: .LBB4_2: +; FUNC2-NEXT: v_cvt_f32_u32_e32 v1, v2 +; FUNC2-NEXT: v_sub_u32_e32 v3, vcc, 0, v2 +; FUNC2-NEXT: v_mov_b32_e32 v4, 0 +; FUNC2-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; FUNC2-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 +; FUNC2-NEXT: v_cvt_u32_f32_e32 v1, v1 +; FUNC2-NEXT: v_mul_lo_u32 v3, v3, v1 +; FUNC2-NEXT: v_mul_hi_u32 v3, v1, v3 +; FUNC2-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; FUNC2-NEXT: v_mul_hi_u32 v1, v0, v1 +; FUNC2-NEXT: v_mul_lo_u32 v1, v1, v2 +; FUNC2-NEXT: v_sub_u32_e32 v0, vcc, v0, v1 +; FUNC2-NEXT: v_sub_u32_e32 v1, vcc, v0, v2 +; FUNC2-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 +; FUNC2-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; FUNC2-NEXT: v_sub_u32_e32 
v1, vcc, v0, v2 +; FUNC2-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 +; FUNC2-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc +; FUNC2-NEXT: .LBB4_3: +; FUNC2-NEXT: s_mov_b32 s7, 0xf000 +; FUNC2-NEXT: s_mov_b32 s6, -1 +; FUNC2-NEXT: buffer_store_dwordx2 v[3:4], off, s[4:7], 0 +; FUNC2-NEXT: s_endpgm +; FUNC2-NEXT: .LBB4_4: +; FUNC2-NEXT: ; implicit-def: $vgpr3_vgpr4 +; FUNC2-NEXT: s_branch .LBB4_2 +; +; EG-LABEL: test_urem_i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 0, @14, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 0 @12 +; EG-NEXT: ALU_PUSH_BEFORE 6, @15, KC0[], KC1[] +; EG-NEXT: JUMP @9 POP:1 +; EG-NEXT: ALU 114, @22, KC0[], KC1[] +; EG-NEXT: ALU 115, @137, KC0[], KC1[] +; EG-NEXT: ALU 116, @253, KC0[], KC1[] +; EG-NEXT: ALU 115, @370, KC0[], KC1[] +; EG-NEXT: ALU_POP_AFTER 57, @486, KC0[], KC1[] +; EG-NEXT: ALU 18, @544, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XY, T0.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: Fetch clause starting at 12: +; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 14: +; EG-NEXT: MOV * T0.X, KC0[2].Z, +; EG-NEXT: ALU clause starting at 15: +; EG-NEXT: OR_INT * T1.W, T0.Y, T0.W, +; EG-NEXT: SUB_INT T1.Y, 0.0, T0.Z, +; EG-NEXT: MOV T1.Z, literal.x, +; EG-NEXT: SETNE_INT T1.W, PV.W, 0.0, +; EG-NEXT: RECIP_UINT * T1.X, T0.Z, +; EG-NEXT: 1(1.401298e-45), 0(0.000000e+00) +; EG-NEXT: PRED_SETNE_INT * ExecMask,PredicateBit (MASKED), PV.W, 0.0, +; EG-NEXT: ALU clause starting at 22: +; EG-NEXT: MULLO_INT * T1.Z, T1.Y, T1.X, +; EG-NEXT: MULHI * T1.Z, T1.X, PS, +; EG-NEXT: ADD_INT * T1.W, T1.X, PS, +; EG-NEXT: MULHI * T1.Z, T0.Y, PV.W, +; EG-NEXT: MULLO_INT * T1.Z, PS, T0.Z, +; EG-NEXT: SUB_INT * T1.W, T0.Y, PS, +; EG-NEXT: SETGE_UINT T2.W, PV.W, T0.Z, +; EG-NEXT: SUB_INT * T3.W, PV.W, T0.Z, +; EG-NEXT: CNDE_INT * T1.W, PV.W, T1.W, PS, +; EG-NEXT: SETGE_UINT T2.W, PV.W, T0.Z, +; EG-NEXT: SUB_INT * T3.W, PV.W, T0.Z, +; EG-NEXT: CNDE_INT * T1.W, PV.W, T1.W, PS, +; EG-NEXT: CNDE_INT * T1.W, T0.W, PV.W, T0.Y, +; EG-NEXT: BIT_ALIGN_INT 
T2.W, PV.W, T0.X, literal.x, +; EG-NEXT: LSHR * T1.W, PV.W, literal.x, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SETE_INT T1.Z, PS, T0.W, +; EG-NEXT: SETGE_UINT T3.W, PS, T0.W, +; EG-NEXT: SETGE_UINT * T4.W, PV.W, T0.Z, +; EG-NEXT: CNDE_INT T0.Y, PV.Z, PV.W, PS, +; EG-NEXT: SUB_INT T1.Z, T2.W, T0.Z, BS:VEC_201 +; EG-NEXT: SUB_INT T3.W, T1.W, T0.W, +; EG-NEXT: SUBB_UINT * T4.W, T2.W, T0.Z, +; EG-NEXT: SUB_INT T3.W, PV.W, PS, +; EG-NEXT: CNDE_INT * T2.W, PV.Y, T2.W, PV.Z, +; EG-NEXT: LSHL T1.Z, PS, 1, +; EG-NEXT: BFE_UINT T4.W, T0.X, literal.x, 1, +; EG-NEXT: CNDE_INT * T1.W, T0.Y, T1.W, PV.W, +; EG-NEXT: 30(4.203895e-44), 0(0.000000e+00) +; EG-NEXT: BIT_ALIGN_INT T1.W, PS, T2.W, literal.x, +; EG-NEXT: OR_INT * T2.W, PV.Z, PV.W, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SETGE_UINT T1.Z, PS, T0.Z, +; EG-NEXT: SETE_INT T3.W, PV.W, T0.W, +; EG-NEXT: SETGE_UINT * T4.W, PV.W, T0.W, +; EG-NEXT: CNDE_INT T0.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T1.Z, T2.W, T0.Z, +; EG-NEXT: SUBB_UINT T3.W, T2.W, T0.Z, +; EG-NEXT: SUB_INT * T4.W, T1.W, T0.W, +; EG-NEXT: SUB_INT T3.W, PS, PV.W, +; EG-NEXT: CNDE_INT * T2.W, PV.Y, T2.W, PV.Z, +; EG-NEXT: LSHL T1.Z, PS, 1, +; EG-NEXT: BFE_UINT T4.W, T0.X, literal.x, 1, +; EG-NEXT: CNDE_INT * T1.W, T0.Y, T1.W, PV.W, +; EG-NEXT: 29(4.063766e-44), 0(0.000000e+00) +; EG-NEXT: BIT_ALIGN_INT T1.W, PS, T2.W, literal.x, +; EG-NEXT: OR_INT * T2.W, PV.Z, PV.W, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SETGE_UINT T1.Z, PS, T0.Z, +; EG-NEXT: SETE_INT T3.W, PV.W, T0.W, +; EG-NEXT: SETGE_UINT * T4.W, PV.W, T0.W, +; EG-NEXT: CNDE_INT T0.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T1.Z, T2.W, T0.Z, +; EG-NEXT: SUBB_UINT T3.W, T2.W, T0.Z, +; EG-NEXT: SUB_INT * T4.W, T1.W, T0.W, +; EG-NEXT: SUB_INT T3.W, PS, PV.W, +; EG-NEXT: CNDE_INT * T2.W, PV.Y, T2.W, PV.Z, +; EG-NEXT: LSHL T1.Z, PS, 1, +; EG-NEXT: BFE_UINT T4.W, T0.X, literal.x, 1, +; EG-NEXT: CNDE_INT * T1.W, T0.Y, T1.W, PV.W, +; EG-NEXT: 28(3.923636e-44), 
0(0.000000e+00) +; EG-NEXT: BIT_ALIGN_INT T1.W, PS, T2.W, literal.x, +; EG-NEXT: OR_INT * T2.W, PV.Z, PV.W, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SETGE_UINT T1.Z, PS, T0.Z, +; EG-NEXT: SETE_INT T3.W, PV.W, T0.W, +; EG-NEXT: SETGE_UINT * T4.W, PV.W, T0.W, +; EG-NEXT: CNDE_INT T0.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T1.Z, T2.W, T0.Z, +; EG-NEXT: SUBB_UINT T3.W, T2.W, T0.Z, +; EG-NEXT: SUB_INT * T4.W, T1.W, T0.W, +; EG-NEXT: SUB_INT T3.W, PS, PV.W, +; EG-NEXT: CNDE_INT * T2.W, PV.Y, T2.W, PV.Z, +; EG-NEXT: LSHL T1.Z, PS, 1, +; EG-NEXT: BFE_UINT T4.W, T0.X, literal.x, 1, +; EG-NEXT: CNDE_INT * T1.W, T0.Y, T1.W, PV.W, +; EG-NEXT: 27(3.783506e-44), 0(0.000000e+00) +; EG-NEXT: BIT_ALIGN_INT T1.W, PS, T2.W, literal.x, +; EG-NEXT: OR_INT * T2.W, PV.Z, PV.W, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SETGE_UINT T1.Z, PS, T0.Z, +; EG-NEXT: SETE_INT T3.W, PV.W, T0.W, +; EG-NEXT: SETGE_UINT * T4.W, PV.W, T0.W, +; EG-NEXT: CNDE_INT T0.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T1.Z, T2.W, T0.Z, +; EG-NEXT: SUBB_UINT T3.W, T2.W, T0.Z, +; EG-NEXT: SUB_INT * T4.W, T1.W, T0.W, +; EG-NEXT: SUB_INT T3.W, PS, PV.W, +; EG-NEXT: CNDE_INT * T2.W, PV.Y, T2.W, PV.Z, +; EG-NEXT: LSHL T1.Z, PS, 1, +; EG-NEXT: BFE_UINT T4.W, T0.X, literal.x, 1, +; EG-NEXT: CNDE_INT * T1.W, T0.Y, T1.W, PV.W, +; EG-NEXT: 26(3.643376e-44), 0(0.000000e+00) +; EG-NEXT: BIT_ALIGN_INT T1.W, PS, T2.W, literal.x, +; EG-NEXT: OR_INT * T2.W, PV.Z, PV.W, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SETGE_UINT T1.Z, PS, T0.Z, +; EG-NEXT: SETE_INT T3.W, PV.W, T0.W, +; EG-NEXT: SETGE_UINT * T4.W, PV.W, T0.W, +; EG-NEXT: CNDE_INT T0.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T1.Z, T2.W, T0.Z, +; EG-NEXT: SUBB_UINT T3.W, T2.W, T0.Z, +; EG-NEXT: SUB_INT * T4.W, T1.W, T0.W, +; EG-NEXT: SUB_INT T3.W, PS, PV.W, +; EG-NEXT: CNDE_INT * T2.W, PV.Y, T2.W, PV.Z, +; EG-NEXT: LSHL T1.Z, PS, 1, +; EG-NEXT: BFE_UINT T4.W, T0.X, literal.x, 1, +; EG-NEXT: CNDE_INT * T1.W, T0.Y, T1.W, PV.W, +; 
EG-NEXT: 25(3.503246e-44), 0(0.000000e+00) +; EG-NEXT: BIT_ALIGN_INT T1.W, PS, T2.W, literal.x, +; EG-NEXT: OR_INT * T2.W, PV.Z, PV.W, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SETGE_UINT T1.Z, PS, T0.Z, +; EG-NEXT: SETE_INT T3.W, PV.W, T0.W, +; EG-NEXT: SETGE_UINT * T4.W, PV.W, T0.W, +; EG-NEXT: ALU clause starting at 137: +; EG-NEXT: CNDE_INT T0.Y, T3.W, T4.W, T1.Z, +; EG-NEXT: SUB_INT T1.Z, T2.W, T0.Z, BS:VEC_201 +; EG-NEXT: SUBB_UINT * T3.W, T2.W, T0.Z, BS:VEC_201 +; EG-NEXT: SUB_INT * T4.W, T1.W, T0.W, +; EG-NEXT: SUB_INT T3.W, PV.W, T3.W, BS:VEC_021/SCL_122 +; EG-NEXT: CNDE_INT * T2.W, T0.Y, T2.W, T1.Z, +; EG-NEXT: LSHL T1.Z, PS, 1, +; EG-NEXT: BFE_UINT T4.W, T0.X, literal.x, 1, +; EG-NEXT: CNDE_INT * T1.W, T0.Y, T1.W, PV.W, +; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; EG-NEXT: BIT_ALIGN_INT T1.W, PS, T2.W, literal.x, +; EG-NEXT: OR_INT * T2.W, PV.Z, PV.W, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SETGE_UINT T1.Z, PS, T0.Z, +; EG-NEXT: SETE_INT T3.W, PV.W, T0.W, +; EG-NEXT: SETGE_UINT * T4.W, PV.W, T0.W, +; EG-NEXT: CNDE_INT T0.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T1.Z, T2.W, T0.Z, +; EG-NEXT: SUBB_UINT T3.W, T2.W, T0.Z, +; EG-NEXT: SUB_INT * T4.W, T1.W, T0.W, +; EG-NEXT: SUB_INT T3.W, PS, PV.W, +; EG-NEXT: CNDE_INT * T2.W, PV.Y, T2.W, PV.Z, +; EG-NEXT: LSHL T1.Z, PS, 1, +; EG-NEXT: BFE_UINT T4.W, T0.X, literal.x, 1, +; EG-NEXT: CNDE_INT * T1.W, T0.Y, T1.W, PV.W, +; EG-NEXT: 23(3.222986e-44), 0(0.000000e+00) +; EG-NEXT: BIT_ALIGN_INT T1.W, PS, T2.W, literal.x, +; EG-NEXT: OR_INT * T2.W, PV.Z, PV.W, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SETGE_UINT T1.Z, PS, T0.Z, +; EG-NEXT: SETE_INT T3.W, PV.W, T0.W, +; EG-NEXT: SETGE_UINT * T4.W, PV.W, T0.W, +; EG-NEXT: CNDE_INT T0.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T1.Z, T2.W, T0.Z, +; EG-NEXT: SUBB_UINT T3.W, T2.W, T0.Z, +; EG-NEXT: SUB_INT * T4.W, T1.W, T0.W, +; EG-NEXT: SUB_INT T3.W, PS, PV.W, +; EG-NEXT: CNDE_INT * T2.W, PV.Y, T2.W, PV.Z, +; EG-NEXT: LSHL 
T1.Z, PS, 1, +; EG-NEXT: BFE_UINT T4.W, T0.X, literal.x, 1, +; EG-NEXT: CNDE_INT * T1.W, T0.Y, T1.W, PV.W, +; EG-NEXT: 22(3.082857e-44), 0(0.000000e+00) +; EG-NEXT: BIT_ALIGN_INT T1.W, PS, T2.W, literal.x, +; EG-NEXT: OR_INT * T2.W, PV.Z, PV.W, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SETGE_UINT T1.Z, PS, T0.Z, +; EG-NEXT: SETE_INT T3.W, PV.W, T0.W, +; EG-NEXT: SETGE_UINT * T4.W, PV.W, T0.W, +; EG-NEXT: CNDE_INT T0.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T1.Z, T2.W, T0.Z, +; EG-NEXT: SUBB_UINT T3.W, T2.W, T0.Z, +; EG-NEXT: SUB_INT * T4.W, T1.W, T0.W, +; EG-NEXT: SUB_INT T3.W, PS, PV.W, +; EG-NEXT: CNDE_INT * T2.W, PV.Y, T2.W, PV.Z, +; EG-NEXT: LSHL T1.Z, PS, 1, +; EG-NEXT: BFE_UINT T4.W, T0.X, literal.x, 1, +; EG-NEXT: CNDE_INT * T1.W, T0.Y, T1.W, PV.W, +; EG-NEXT: 21(2.942727e-44), 0(0.000000e+00) +; EG-NEXT: BIT_ALIGN_INT T1.W, PS, T2.W, literal.x, +; EG-NEXT: OR_INT * T2.W, PV.Z, PV.W, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SETGE_UINT T1.Z, PS, T0.Z, +; EG-NEXT: SETE_INT T3.W, PV.W, T0.W, +; EG-NEXT: SETGE_UINT * T4.W, PV.W, T0.W, +; EG-NEXT: CNDE_INT T0.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T1.Z, T2.W, T0.Z, +; EG-NEXT: SUBB_UINT T3.W, T2.W, T0.Z, +; EG-NEXT: SUB_INT * T4.W, T1.W, T0.W, +; EG-NEXT: SUB_INT T3.W, PS, PV.W, +; EG-NEXT: CNDE_INT * T2.W, PV.Y, T2.W, PV.Z, +; EG-NEXT: LSHL T1.Z, PS, 1, +; EG-NEXT: BFE_UINT T4.W, T0.X, literal.x, 1, +; EG-NEXT: CNDE_INT * T1.W, T0.Y, T1.W, PV.W, +; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00) +; EG-NEXT: BIT_ALIGN_INT T1.W, PS, T2.W, literal.x, +; EG-NEXT: OR_INT * T2.W, PV.Z, PV.W, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SETGE_UINT T1.Z, PS, T0.Z, +; EG-NEXT: SETE_INT T3.W, PV.W, T0.W, +; EG-NEXT: SETGE_UINT * T4.W, PV.W, T0.W, +; EG-NEXT: CNDE_INT T0.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T1.Z, T2.W, T0.Z, +; EG-NEXT: SUBB_UINT T3.W, T2.W, T0.Z, +; EG-NEXT: SUB_INT * T4.W, T1.W, T0.W, +; EG-NEXT: SUB_INT T3.W, PS, PV.W, +; EG-NEXT: CNDE_INT * T2.W, PV.Y, T2.W, 
PV.Z, +; EG-NEXT: LSHL T1.Z, PS, 1, +; EG-NEXT: BFE_UINT T4.W, T0.X, literal.x, 1, +; EG-NEXT: CNDE_INT * T1.W, T0.Y, T1.W, PV.W, +; EG-NEXT: 19(2.662467e-44), 0(0.000000e+00) +; EG-NEXT: BIT_ALIGN_INT T1.W, PS, T2.W, literal.x, +; EG-NEXT: OR_INT * T2.W, PV.Z, PV.W, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SETGE_UINT T1.Z, PS, T0.Z, +; EG-NEXT: SETE_INT T3.W, PV.W, T0.W, +; EG-NEXT: SETGE_UINT * T4.W, PV.W, T0.W, +; EG-NEXT: CNDE_INT T0.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T1.Z, T2.W, T0.Z, +; EG-NEXT: SUBB_UINT T3.W, T2.W, T0.Z, +; EG-NEXT: SUB_INT * T4.W, T1.W, T0.W, +; EG-NEXT: SUB_INT T3.W, PS, PV.W, +; EG-NEXT: CNDE_INT * T2.W, PV.Y, T2.W, PV.Z, +; EG-NEXT: LSHL T1.Z, PS, 1, +; EG-NEXT: BFE_UINT T4.W, T0.X, literal.x, 1, +; EG-NEXT: CNDE_INT * T1.W, T0.Y, T1.W, PV.W, +; EG-NEXT: 18(2.522337e-44), 0(0.000000e+00) +; EG-NEXT: BIT_ALIGN_INT T1.W, PS, T2.W, literal.x, +; EG-NEXT: OR_INT * T2.W, PV.Z, PV.W, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SETGE_UINT T1.Z, PS, T0.Z, +; EG-NEXT: SETE_INT T3.W, PV.W, T0.W, +; EG-NEXT: SETGE_UINT * T4.W, PV.W, T0.W, +; EG-NEXT: CNDE_INT T0.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T1.Z, T2.W, T0.Z, +; EG-NEXT: SUBB_UINT T3.W, T2.W, T0.Z, +; EG-NEXT: SUB_INT * T4.W, T1.W, T0.W, +; EG-NEXT: ALU clause starting at 253: +; EG-NEXT: SUB_INT T3.W, T4.W, T3.W, BS:VEC_021/SCL_122 +; EG-NEXT: CNDE_INT * T2.W, T0.Y, T2.W, T1.Z, +; EG-NEXT: LSHL T1.Z, PS, 1, +; EG-NEXT: BFE_UINT T4.W, T0.X, literal.x, 1, +; EG-NEXT: CNDE_INT * T1.W, T0.Y, T1.W, PV.W, +; EG-NEXT: 17(2.382207e-44), 0(0.000000e+00) +; EG-NEXT: BIT_ALIGN_INT T1.W, PS, T2.W, literal.x, +; EG-NEXT: OR_INT * T2.W, PV.Z, PV.W, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SETGE_UINT T1.Z, PS, T0.Z, +; EG-NEXT: SETE_INT T3.W, PV.W, T0.W, +; EG-NEXT: SETGE_UINT * T4.W, PV.W, T0.W, +; EG-NEXT: CNDE_INT T0.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T1.Z, T2.W, T0.Z, +; EG-NEXT: SUBB_UINT T3.W, T2.W, T0.Z, +; EG-NEXT: SUB_INT * T4.W, T1.W, 
T0.W, +; EG-NEXT: SUB_INT T3.W, PS, PV.W, +; EG-NEXT: CNDE_INT * T2.W, PV.Y, T2.W, PV.Z, +; EG-NEXT: LSHL T1.Z, PS, 1, +; EG-NEXT: BFE_UINT T4.W, T0.X, literal.x, 1, +; EG-NEXT: CNDE_INT * T1.W, T0.Y, T1.W, PV.W, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: BIT_ALIGN_INT T1.W, PS, T2.W, literal.x, +; EG-NEXT: OR_INT * T2.W, PV.Z, PV.W, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SETGE_UINT T1.Z, PS, T0.Z, +; EG-NEXT: SETE_INT T3.W, PV.W, T0.W, +; EG-NEXT: SETGE_UINT * T4.W, PV.W, T0.W, +; EG-NEXT: CNDE_INT T0.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T1.Z, T2.W, T0.Z, +; EG-NEXT: SUBB_UINT T3.W, T2.W, T0.Z, +; EG-NEXT: SUB_INT * T4.W, T1.W, T0.W, +; EG-NEXT: SUB_INT T3.W, PS, PV.W, +; EG-NEXT: CNDE_INT * T2.W, PV.Y, T2.W, PV.Z, +; EG-NEXT: LSHL T1.Z, PS, 1, +; EG-NEXT: BFE_UINT T4.W, T0.X, literal.x, 1, +; EG-NEXT: CNDE_INT * T1.W, T0.Y, T1.W, PV.W, +; EG-NEXT: 15(2.101948e-44), 0(0.000000e+00) +; EG-NEXT: BIT_ALIGN_INT T1.W, PS, T2.W, literal.x, +; EG-NEXT: OR_INT * T2.W, PV.Z, PV.W, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SETGE_UINT T1.Z, PS, T0.Z, +; EG-NEXT: SETE_INT T3.W, PV.W, T0.W, +; EG-NEXT: SETGE_UINT * T4.W, PV.W, T0.W, +; EG-NEXT: CNDE_INT T0.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T1.Z, T2.W, T0.Z, +; EG-NEXT: SUBB_UINT T3.W, T2.W, T0.Z, +; EG-NEXT: SUB_INT * T4.W, T1.W, T0.W, +; EG-NEXT: SUB_INT T3.W, PS, PV.W, +; EG-NEXT: CNDE_INT * T2.W, PV.Y, T2.W, PV.Z, +; EG-NEXT: LSHL T1.Z, PS, 1, +; EG-NEXT: BFE_UINT T4.W, T0.X, literal.x, 1, +; EG-NEXT: CNDE_INT * T1.W, T0.Y, T1.W, PV.W, +; EG-NEXT: 14(1.961818e-44), 0(0.000000e+00) +; EG-NEXT: BIT_ALIGN_INT T1.W, PS, T2.W, literal.x, +; EG-NEXT: OR_INT * T2.W, PV.Z, PV.W, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SETGE_UINT T1.Z, PS, T0.Z, +; EG-NEXT: SETE_INT T3.W, PV.W, T0.W, +; EG-NEXT: SETGE_UINT * T4.W, PV.W, T0.W, +; EG-NEXT: CNDE_INT T0.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T1.Z, T2.W, T0.Z, +; EG-NEXT: SUBB_UINT T3.W, T2.W, T0.Z, +; EG-NEXT: 
SUB_INT * T4.W, T1.W, T0.W, +; EG-NEXT: SUB_INT T3.W, PS, PV.W, +; EG-NEXT: CNDE_INT * T2.W, PV.Y, T2.W, PV.Z, +; EG-NEXT: LSHL T1.Z, PS, 1, +; EG-NEXT: BFE_UINT T4.W, T0.X, literal.x, 1, +; EG-NEXT: CNDE_INT * T1.W, T0.Y, T1.W, PV.W, +; EG-NEXT: 13(1.821688e-44), 0(0.000000e+00) +; EG-NEXT: BIT_ALIGN_INT T1.W, PS, T2.W, literal.x, +; EG-NEXT: OR_INT * T2.W, PV.Z, PV.W, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SETGE_UINT T1.Z, PS, T0.Z, +; EG-NEXT: SETE_INT T3.W, PV.W, T0.W, +; EG-NEXT: SETGE_UINT * T4.W, PV.W, T0.W, +; EG-NEXT: CNDE_INT T0.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T1.Z, T2.W, T0.Z, +; EG-NEXT: SUBB_UINT T3.W, T2.W, T0.Z, +; EG-NEXT: SUB_INT * T4.W, T1.W, T0.W, +; EG-NEXT: SUB_INT T3.W, PS, PV.W, +; EG-NEXT: CNDE_INT * T2.W, PV.Y, T2.W, PV.Z, +; EG-NEXT: LSHL T1.Z, PS, 1, +; EG-NEXT: BFE_UINT T4.W, T0.X, literal.x, 1, +; EG-NEXT: CNDE_INT * T1.W, T0.Y, T1.W, PV.W, +; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00) +; EG-NEXT: BIT_ALIGN_INT T1.W, PS, T2.W, literal.x, +; EG-NEXT: OR_INT * T2.W, PV.Z, PV.W, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SETGE_UINT T1.Z, PS, T0.Z, +; EG-NEXT: SETE_INT T3.W, PV.W, T0.W, +; EG-NEXT: SETGE_UINT * T4.W, PV.W, T0.W, +; EG-NEXT: CNDE_INT T0.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T1.Z, T2.W, T0.Z, +; EG-NEXT: SUBB_UINT T3.W, T2.W, T0.Z, +; EG-NEXT: SUB_INT * T4.W, T1.W, T0.W, +; EG-NEXT: SUB_INT T3.W, PS, PV.W, +; EG-NEXT: CNDE_INT * T2.W, PV.Y, T2.W, PV.Z, +; EG-NEXT: LSHL T1.Z, PS, 1, +; EG-NEXT: BFE_UINT T4.W, T0.X, literal.x, 1, +; EG-NEXT: CNDE_INT * T1.W, T0.Y, T1.W, PV.W, +; EG-NEXT: 11(1.541428e-44), 0(0.000000e+00) +; EG-NEXT: BIT_ALIGN_INT T1.W, PS, T2.W, literal.x, +; EG-NEXT: OR_INT * T2.W, PV.Z, PV.W, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SETGE_UINT T1.Z, PS, T0.Z, +; EG-NEXT: SETE_INT T3.W, PV.W, T0.W, +; EG-NEXT: SETGE_UINT * T4.W, PV.W, T0.W, +; EG-NEXT: CNDE_INT T0.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T1.Z, T2.W, T0.Z, +; EG-NEXT: SUBB_UINT T3.W, 
T2.W, T0.Z, +; EG-NEXT: SUB_INT * T4.W, T1.W, T0.W, +; EG-NEXT: SUB_INT T3.W, PS, PV.W, +; EG-NEXT: CNDE_INT * T2.W, PV.Y, T2.W, PV.Z, +; EG-NEXT: LSHL T1.Z, PS, 1, +; EG-NEXT: BFE_UINT * T4.W, T0.X, literal.x, 1, +; EG-NEXT: 10(1.401298e-44), 0(0.000000e+00) +; EG-NEXT: ALU clause starting at 370: +; EG-NEXT: CNDE_INT * T1.W, T0.Y, T1.W, T3.W, +; EG-NEXT: BIT_ALIGN_INT T1.W, PV.W, T2.W, literal.x, BS:VEC_021/SCL_122 +; EG-NEXT: OR_INT * T2.W, T1.Z, T4.W, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SETGE_UINT T1.Z, PS, T0.Z, +; EG-NEXT: SETE_INT T3.W, PV.W, T0.W, +; EG-NEXT: SETGE_UINT * T4.W, PV.W, T0.W, +; EG-NEXT: CNDE_INT T0.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T1.Z, T2.W, T0.Z, +; EG-NEXT: SUBB_UINT T3.W, T2.W, T0.Z, +; EG-NEXT: SUB_INT * T4.W, T1.W, T0.W, +; EG-NEXT: SUB_INT T3.W, PS, PV.W, +; EG-NEXT: CNDE_INT * T2.W, PV.Y, T2.W, PV.Z, +; EG-NEXT: LSHL T1.Z, PS, 1, +; EG-NEXT: BFE_UINT T4.W, T0.X, literal.x, 1, +; EG-NEXT: CNDE_INT * T1.W, T0.Y, T1.W, PV.W, +; EG-NEXT: 9(1.261169e-44), 0(0.000000e+00) +; EG-NEXT: BIT_ALIGN_INT T1.W, PS, T2.W, literal.x, +; EG-NEXT: OR_INT * T2.W, PV.Z, PV.W, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SETGE_UINT T1.Z, PS, T0.Z, +; EG-NEXT: SETE_INT T3.W, PV.W, T0.W, +; EG-NEXT: SETGE_UINT * T4.W, PV.W, T0.W, +; EG-NEXT: CNDE_INT T0.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T1.Z, T2.W, T0.Z, +; EG-NEXT: SUBB_UINT T3.W, T2.W, T0.Z, +; EG-NEXT: SUB_INT * T4.W, T1.W, T0.W, +; EG-NEXT: SUB_INT T3.W, PS, PV.W, +; EG-NEXT: CNDE_INT * T2.W, PV.Y, T2.W, PV.Z, +; EG-NEXT: LSHL T1.Z, PS, 1, +; EG-NEXT: BFE_UINT T4.W, T0.X, literal.x, 1, +; EG-NEXT: CNDE_INT * T1.W, T0.Y, T1.W, PV.W, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: BIT_ALIGN_INT T1.W, PS, T2.W, literal.x, +; EG-NEXT: OR_INT * T2.W, PV.Z, PV.W, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SETGE_UINT T1.Z, PS, T0.Z, +; EG-NEXT: SETE_INT T3.W, PV.W, T0.W, +; EG-NEXT: SETGE_UINT * T4.W, PV.W, T0.W, +; EG-NEXT: CNDE_INT T0.Y, 
PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T1.Z, T2.W, T0.Z, +; EG-NEXT: SUBB_UINT T3.W, T2.W, T0.Z, +; EG-NEXT: SUB_INT * T4.W, T1.W, T0.W, +; EG-NEXT: SUB_INT T3.W, PS, PV.W, +; EG-NEXT: CNDE_INT * T2.W, PV.Y, T2.W, PV.Z, +; EG-NEXT: LSHL T1.Z, PS, 1, +; EG-NEXT: BFE_UINT T4.W, T0.X, literal.x, 1, +; EG-NEXT: CNDE_INT * T1.W, T0.Y, T1.W, PV.W, +; EG-NEXT: 7(9.809089e-45), 0(0.000000e+00) +; EG-NEXT: BIT_ALIGN_INT T1.W, PS, T2.W, literal.x, +; EG-NEXT: OR_INT * T2.W, PV.Z, PV.W, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SETGE_UINT T1.Z, PS, T0.Z, +; EG-NEXT: SETE_INT T3.W, PV.W, T0.W, +; EG-NEXT: SETGE_UINT * T4.W, PV.W, T0.W, +; EG-NEXT: CNDE_INT T0.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T1.Z, T2.W, T0.Z, +; EG-NEXT: SUBB_UINT T3.W, T2.W, T0.Z, +; EG-NEXT: SUB_INT * T4.W, T1.W, T0.W, +; EG-NEXT: SUB_INT T3.W, PS, PV.W, +; EG-NEXT: CNDE_INT * T2.W, PV.Y, T2.W, PV.Z, +; EG-NEXT: LSHL T1.Z, PS, 1, +; EG-NEXT: BFE_UINT T4.W, T0.X, literal.x, 1, +; EG-NEXT: CNDE_INT * T1.W, T0.Y, T1.W, PV.W, +; EG-NEXT: 6(8.407791e-45), 0(0.000000e+00) +; EG-NEXT: BIT_ALIGN_INT T1.W, PS, T2.W, literal.x, +; EG-NEXT: OR_INT * T2.W, PV.Z, PV.W, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SETGE_UINT T1.Z, PS, T0.Z, +; EG-NEXT: SETE_INT T3.W, PV.W, T0.W, +; EG-NEXT: SETGE_UINT * T4.W, PV.W, T0.W, +; EG-NEXT: CNDE_INT T0.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T1.Z, T2.W, T0.Z, +; EG-NEXT: SUBB_UINT T3.W, T2.W, T0.Z, +; EG-NEXT: SUB_INT * T4.W, T1.W, T0.W, +; EG-NEXT: SUB_INT T3.W, PS, PV.W, +; EG-NEXT: CNDE_INT * T2.W, PV.Y, T2.W, PV.Z, +; EG-NEXT: LSHL T1.Z, PS, 1, +; EG-NEXT: BFE_UINT T4.W, T0.X, literal.x, 1, +; EG-NEXT: CNDE_INT * T1.W, T0.Y, T1.W, PV.W, +; EG-NEXT: 5(7.006492e-45), 0(0.000000e+00) +; EG-NEXT: BIT_ALIGN_INT T1.W, PS, T2.W, literal.x, +; EG-NEXT: OR_INT * T2.W, PV.Z, PV.W, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SETGE_UINT T1.Z, PS, T0.Z, +; EG-NEXT: SETE_INT T3.W, PV.W, T0.W, +; EG-NEXT: SETGE_UINT * T4.W, PV.W, T0.W, +; 
EG-NEXT: CNDE_INT T0.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T1.Z, T2.W, T0.Z, +; EG-NEXT: SUBB_UINT T3.W, T2.W, T0.Z, +; EG-NEXT: SUB_INT * T4.W, T1.W, T0.W, +; EG-NEXT: SUB_INT T3.W, PS, PV.W, +; EG-NEXT: CNDE_INT * T2.W, PV.Y, T2.W, PV.Z, +; EG-NEXT: LSHL T1.Z, PS, 1, +; EG-NEXT: BFE_UINT T4.W, T0.X, literal.x, 1, +; EG-NEXT: CNDE_INT * T1.W, T0.Y, T1.W, PV.W, +; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) +; EG-NEXT: BIT_ALIGN_INT T1.W, PS, T2.W, literal.x, +; EG-NEXT: OR_INT * T2.W, PV.Z, PV.W, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SETGE_UINT T1.Z, PS, T0.Z, +; EG-NEXT: SETE_INT T3.W, PV.W, T0.W, +; EG-NEXT: SETGE_UINT * T4.W, PV.W, T0.W, +; EG-NEXT: CNDE_INT T0.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T1.Z, T2.W, T0.Z, +; EG-NEXT: SUBB_UINT T3.W, T2.W, T0.Z, +; EG-NEXT: SUB_INT * T4.W, T1.W, T0.W, +; EG-NEXT: SUB_INT T3.W, PS, PV.W, +; EG-NEXT: CNDE_INT * T2.W, PV.Y, T2.W, PV.Z, +; EG-NEXT: LSHL T1.Z, PS, 1, +; EG-NEXT: BFE_UINT T4.W, T0.X, literal.x, 1, +; EG-NEXT: CNDE_INT * T1.W, T0.Y, T1.W, PV.W, +; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) +; EG-NEXT: BIT_ALIGN_INT T1.W, PS, T2.W, literal.x, +; EG-NEXT: OR_INT * T2.W, PV.Z, PV.W, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: ALU clause starting at 486: +; EG-NEXT: SETGE_UINT T1.Z, T2.W, T0.Z, +; EG-NEXT: SETE_INT T3.W, T1.W, T0.W, BS:VEC_210 +; EG-NEXT: SETGE_UINT * T4.W, T1.W, T0.W, +; EG-NEXT: CNDE_INT T0.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T1.Z, T2.W, T0.Z, +; EG-NEXT: SUBB_UINT T3.W, T2.W, T0.Z, +; EG-NEXT: SUB_INT * T4.W, T1.W, T0.W, +; EG-NEXT: SUB_INT T3.W, PS, PV.W, +; EG-NEXT: CNDE_INT * T2.W, PV.Y, T2.W, PV.Z, +; EG-NEXT: LSHL T1.Z, PS, 1, +; EG-NEXT: BFE_UINT T4.W, T0.X, literal.x, 1, +; EG-NEXT: CNDE_INT * T1.W, T0.Y, T1.W, PV.W, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; EG-NEXT: BIT_ALIGN_INT T1.W, PS, T2.W, literal.x, +; EG-NEXT: OR_INT * T2.W, PV.Z, PV.W, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SETGE_UINT T1.Z, PS, T0.Z, +; EG-NEXT: 
SETE_INT T3.W, PV.W, T0.W, +; EG-NEXT: SETGE_UINT * T4.W, PV.W, T0.W, +; EG-NEXT: CNDE_INT T0.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T1.Z, T2.W, T0.Z, +; EG-NEXT: SUBB_UINT T3.W, T2.W, T0.Z, +; EG-NEXT: SUB_INT * T4.W, T1.W, T0.W, +; EG-NEXT: SUB_INT T3.W, PS, PV.W, +; EG-NEXT: CNDE_INT * T2.W, PV.Y, T2.W, PV.Z, +; EG-NEXT: LSHL T1.Z, PS, 1, +; EG-NEXT: BFE_UINT T4.W, T0.X, 1, 1, +; EG-NEXT: CNDE_INT * T1.W, T0.Y, T1.W, PV.W, +; EG-NEXT: BIT_ALIGN_INT T1.W, PS, T2.W, literal.x, +; EG-NEXT: OR_INT * T2.W, PV.Z, PV.W, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SETGE_UINT T1.Z, PS, T0.Z, +; EG-NEXT: SETE_INT T3.W, PV.W, T0.W, +; EG-NEXT: SETGE_UINT * T4.W, PV.W, T0.W, +; EG-NEXT: CNDE_INT T0.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T1.Z, T2.W, T0.Z, +; EG-NEXT: SUBB_UINT T3.W, T2.W, T0.Z, +; EG-NEXT: SUB_INT * T4.W, T1.W, T0.W, +; EG-NEXT: SUB_INT T3.W, PS, PV.W, +; EG-NEXT: CNDE_INT * T2.W, PV.Y, T2.W, PV.Z, +; EG-NEXT: LSHL T1.Z, PS, 1, +; EG-NEXT: AND_INT T4.W, T0.X, 1, +; EG-NEXT: CNDE_INT * T1.W, T0.Y, T1.W, PV.W, +; EG-NEXT: BIT_ALIGN_INT T1.W, PS, T2.W, literal.x, +; EG-NEXT: OR_INT * T2.W, PV.Z, PV.W, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SETGE_UINT T2.X, PS, T0.Z, +; EG-NEXT: SETE_INT T0.Y, PV.W, T0.W, +; EG-NEXT: SETGE_UINT T1.Z, PV.W, T0.W, +; EG-NEXT: SUBB_UINT T3.W, PS, T0.Z, +; EG-NEXT: SUB_INT * T0.W, PV.W, T0.W, +; EG-NEXT: SUB_INT T0.W, PS, PV.W, +; EG-NEXT: CNDE_INT * T3.W, PV.Y, PV.Z, PV.X, +; EG-NEXT: CNDE_INT T2.Y, PS, T1.W, PV.W, +; EG-NEXT: SUB_INT * T0.W, T2.W, T0.Z, +; EG-NEXT: CNDE_INT T2.X, T3.W, T2.W, PV.W, +; EG-NEXT: MOV * T1.Z, literal.x, +; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00) +; EG-NEXT: ALU clause starting at 544: +; EG-NEXT: MOV T0.W, KC0[2].Y, +; EG-NEXT: SETE_INT * T1.W, T1.Z, 0.0, +; EG-NEXT: PRED_SETNE_INT * Pred,PredicateBit (MASKED), PS, 0.0, +; EG-NEXT: MULLO_INT * T0.Y, T1.Y, T1.X, Pred_sel_zero +; EG-NEXT: MULHI * T0.Y, T1.X, T0.Y, Pred_sel_zero +; EG-NEXT: ADD_INT * T1.W, T1.X, T0.Y, 
Pred_sel_zero +; EG-NEXT: MULHI * T0.Y, T0.X, T1.W, Pred_sel_zero +; EG-NEXT: MULLO_INT * T0.Y, T0.Y, T0.Z, Pred_sel_zero +; EG-NEXT: SUB_INT * T1.W, T0.X, T0.Y, Pred_sel_zero +; EG-NEXT: SETGE_UINT T2.W, T1.W, T0.Z, Pred_sel_zero +; EG-NEXT: SUB_INT * T3.W, T1.W, T0.Z, Pred_sel_zero +; EG-NEXT: CNDE_INT * T1.W, T2.W, T1.W, T3.W, Pred_sel_zero +; EG-NEXT: SETGE_UINT T2.W, T1.W, T0.Z, Pred_sel_zero +; EG-NEXT: SUB_INT * T3.W, T1.W, T0.Z, Pred_sel_zero +; EG-NEXT: CNDE_INT T2.X, T2.W, T1.W, T3.W, Pred_sel_zero +; EG-NEXT: MOV * T2.Y, literal.x, Pred_sel_zero +; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00) +; EG-NEXT: LSHR * T0.X, T0.W, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %b_ptr = getelementptr i64, ptr addrspace(1) %in, i64 1 %a = load i64, ptr addrspace(1) %in %b = load i64, ptr addrspace(1) %b_ptr @@ -69,10 +1524,1618 @@ ret void } -; FUNC-LABEL: {{^}}test_urem_v2i64: -; SI: s_endpgm -; EG: CF_END define amdgpu_kernel void @test_urem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; FUNC1-LABEL: test_urem_v2i64: +; FUNC1: ; %bb.0: +; FUNC1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; FUNC1-NEXT: s_mov_b32 s3, 0xf000 +; FUNC1-NEXT: s_mov_b32 s2, -1 +; FUNC1-NEXT: v_mov_b32_e32 v8, 0 +; FUNC1-NEXT: s_waitcnt lgkmcnt(0) +; FUNC1-NEXT: s_mov_b32 s0, s6 +; FUNC1-NEXT: s_mov_b32 s1, s7 +; FUNC1-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 +; FUNC1-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; FUNC1-NEXT: s_waitcnt vmcnt(0) +; FUNC1-NEXT: v_or_b32_e32 v9, v5, v1 +; FUNC1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] +; FUNC1-NEXT: s_cbranch_vccz .LBB5_7 +; FUNC1-NEXT: ; %bb.1: +; FUNC1-NEXT: v_cvt_f32_u32_e32 v8, v0 +; FUNC1-NEXT: v_cvt_f32_u32_e32 v9, v1 +; FUNC1-NEXT: v_sub_i32_e32 v10, vcc, 0, v0 +; FUNC1-NEXT: v_subb_u32_e32 v11, vcc, 0, v1, vcc +; FUNC1-NEXT: v_mac_f32_e32 v8, 0x4f800000, v9 +; FUNC1-NEXT: v_rcp_f32_e32 v8, v8 +; FUNC1-NEXT: v_mul_f32_e32 v8, 0x5f7ffffc, v8 +; FUNC1-NEXT: v_mul_f32_e32 v9, 0x2f800000, v8 +; 
FUNC1-NEXT: v_trunc_f32_e32 v9, v9 +; FUNC1-NEXT: v_mac_f32_e32 v8, 0xcf800000, v9 +; FUNC1-NEXT: v_cvt_u32_f32_e32 v9, v9 +; FUNC1-NEXT: v_cvt_u32_f32_e32 v8, v8 +; FUNC1-NEXT: v_mul_lo_u32 v12, v10, v9 +; FUNC1-NEXT: v_mul_hi_u32 v13, v10, v8 +; FUNC1-NEXT: v_mul_lo_u32 v15, v11, v8 +; FUNC1-NEXT: v_mul_lo_u32 v14, v10, v8 +; FUNC1-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; FUNC1-NEXT: v_add_i32_e32 v12, vcc, v12, v15 +; FUNC1-NEXT: v_mul_hi_u32 v13, v8, v14 +; FUNC1-NEXT: v_mul_lo_u32 v15, v8, v12 +; FUNC1-NEXT: v_mul_hi_u32 v16, v8, v12 +; FUNC1-NEXT: v_mul_hi_u32 v17, v9, v12 +; FUNC1-NEXT: v_mul_lo_u32 v12, v9, v12 +; FUNC1-NEXT: v_add_i32_e32 v13, vcc, v13, v15 +; FUNC1-NEXT: v_addc_u32_e32 v15, vcc, 0, v16, vcc +; FUNC1-NEXT: v_mul_lo_u32 v16, v9, v14 +; FUNC1-NEXT: v_mul_hi_u32 v14, v9, v14 +; FUNC1-NEXT: v_add_i32_e32 v13, vcc, v13, v16 +; FUNC1-NEXT: v_addc_u32_e32 v13, vcc, v15, v14, vcc +; FUNC1-NEXT: v_addc_u32_e32 v14, vcc, 0, v17, vcc +; FUNC1-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; FUNC1-NEXT: v_addc_u32_e32 v13, vcc, 0, v14, vcc +; FUNC1-NEXT: v_add_i32_e32 v8, vcc, v8, v12 +; FUNC1-NEXT: v_addc_u32_e32 v9, vcc, v9, v13, vcc +; FUNC1-NEXT: v_mul_lo_u32 v12, v10, v9 +; FUNC1-NEXT: v_mul_hi_u32 v13, v10, v8 +; FUNC1-NEXT: v_mul_lo_u32 v11, v11, v8 +; FUNC1-NEXT: v_mul_lo_u32 v10, v10, v8 +; FUNC1-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; FUNC1-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; FUNC1-NEXT: v_mul_lo_u32 v14, v8, v11 +; FUNC1-NEXT: v_mul_hi_u32 v15, v8, v10 +; FUNC1-NEXT: v_mul_hi_u32 v16, v8, v11 +; FUNC1-NEXT: v_mul_hi_u32 v13, v9, v10 +; FUNC1-NEXT: v_mul_lo_u32 v10, v9, v10 +; FUNC1-NEXT: v_mul_hi_u32 v12, v9, v11 +; FUNC1-NEXT: v_add_i32_e32 v14, vcc, v15, v14 +; FUNC1-NEXT: v_addc_u32_e32 v15, vcc, 0, v16, vcc +; FUNC1-NEXT: v_mul_lo_u32 v11, v9, v11 +; FUNC1-NEXT: v_add_i32_e32 v10, vcc, v14, v10 +; FUNC1-NEXT: v_addc_u32_e32 v10, vcc, v15, v13, vcc +; FUNC1-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc +; FUNC1-NEXT: v_add_i32_e32 v10, 
vcc, v10, v11 +; FUNC1-NEXT: v_addc_u32_e32 v11, vcc, 0, v12, vcc +; FUNC1-NEXT: v_add_i32_e32 v8, vcc, v8, v10 +; FUNC1-NEXT: v_addc_u32_e32 v9, vcc, v9, v11, vcc +; FUNC1-NEXT: v_mul_lo_u32 v10, v4, v9 +; FUNC1-NEXT: v_mul_hi_u32 v11, v4, v8 +; FUNC1-NEXT: v_mul_hi_u32 v12, v4, v9 +; FUNC1-NEXT: v_mul_hi_u32 v13, v5, v9 +; FUNC1-NEXT: v_mul_lo_u32 v9, v5, v9 +; FUNC1-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; FUNC1-NEXT: v_addc_u32_e32 v11, vcc, 0, v12, vcc +; FUNC1-NEXT: v_mul_lo_u32 v12, v5, v8 +; FUNC1-NEXT: v_mul_hi_u32 v8, v5, v8 +; FUNC1-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; FUNC1-NEXT: v_addc_u32_e32 v8, vcc, v11, v8, vcc +; FUNC1-NEXT: v_addc_u32_e32 v10, vcc, 0, v13, vcc +; FUNC1-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; FUNC1-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc +; FUNC1-NEXT: v_mul_lo_u32 v9, v0, v9 +; FUNC1-NEXT: v_mul_hi_u32 v10, v0, v8 +; FUNC1-NEXT: v_mul_lo_u32 v11, v1, v8 +; FUNC1-NEXT: v_mul_lo_u32 v8, v0, v8 +; FUNC1-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; FUNC1-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; FUNC1-NEXT: v_sub_i32_e32 v10, vcc, v5, v9 +; FUNC1-NEXT: v_sub_i32_e32 v8, vcc, v4, v8 +; FUNC1-NEXT: v_subb_u32_e64 v10, s[0:1], v10, v1, vcc +; FUNC1-NEXT: v_sub_i32_e64 v11, s[0:1], v8, v0 +; FUNC1-NEXT: v_subbrev_u32_e64 v12, s[2:3], 0, v10, s[0:1] +; FUNC1-NEXT: v_cmp_ge_u32_e64 s[2:3], v12, v1 +; FUNC1-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[2:3] +; FUNC1-NEXT: v_cmp_ge_u32_e64 s[2:3], v11, v0 +; FUNC1-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[2:3] +; FUNC1-NEXT: v_cmp_eq_u32_e64 s[2:3], v12, v1 +; FUNC1-NEXT: v_subb_u32_e64 v10, s[0:1], v10, v1, s[0:1] +; FUNC1-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[2:3] +; FUNC1-NEXT: v_sub_i32_e64 v14, s[0:1], v11, v0 +; FUNC1-NEXT: v_subb_u32_e32 v5, vcc, v5, v9, vcc +; FUNC1-NEXT: v_subbrev_u32_e64 v10, s[0:1], 0, v10, s[0:1] +; FUNC1-NEXT: v_cmp_ge_u32_e32 vcc, v5, v1 +; FUNC1-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v13 +; FUNC1-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc +; FUNC1-NEXT: v_cmp_ge_u32_e32 vcc, 
v8, v0 +; FUNC1-NEXT: v_cndmask_b32_e64 v10, v12, v10, s[0:1] +; FUNC1-NEXT: v_cndmask_b32_e64 v12, 0, -1, vcc +; FUNC1-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; FUNC1-NEXT: v_cndmask_b32_e32 v1, v9, v12, vcc +; FUNC1-NEXT: v_cndmask_b32_e64 v11, v11, v14, s[0:1] +; FUNC1-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; FUNC1-NEXT: v_cndmask_b32_e32 v9, v5, v10, vcc +; FUNC1-NEXT: v_cndmask_b32_e32 v8, v8, v11, vcc +; FUNC1-NEXT: s_cbranch_execnz .LBB5_3 +; FUNC1-NEXT: .LBB5_2: +; FUNC1-NEXT: v_cvt_f32_u32_e32 v1, v0 +; FUNC1-NEXT: v_sub_i32_e32 v5, vcc, 0, v0 +; FUNC1-NEXT: v_mov_b32_e32 v9, 0 +; FUNC1-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; FUNC1-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 +; FUNC1-NEXT: v_cvt_u32_f32_e32 v1, v1 +; FUNC1-NEXT: v_mul_lo_u32 v5, v5, v1 +; FUNC1-NEXT: v_mul_hi_u32 v5, v1, v5 +; FUNC1-NEXT: v_add_i32_e32 v1, vcc, v1, v5 +; FUNC1-NEXT: v_mul_hi_u32 v1, v4, v1 +; FUNC1-NEXT: v_mul_lo_u32 v1, v1, v0 +; FUNC1-NEXT: v_sub_i32_e32 v1, vcc, v4, v1 +; FUNC1-NEXT: v_sub_i32_e32 v4, vcc, v1, v0 +; FUNC1-NEXT: v_cmp_ge_u32_e32 vcc, v1, v0 +; FUNC1-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; FUNC1-NEXT: v_sub_i32_e32 v4, vcc, v1, v0 +; FUNC1-NEXT: v_cmp_ge_u32_e32 vcc, v1, v0 +; FUNC1-NEXT: v_cndmask_b32_e32 v8, v1, v4, vcc +; FUNC1-NEXT: .LBB5_3: +; FUNC1-NEXT: v_or_b32_e32 v1, v7, v3 +; FUNC1-NEXT: v_mov_b32_e32 v0, 0 +; FUNC1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; FUNC1-NEXT: s_cbranch_vccz .LBB5_8 +; FUNC1-NEXT: ; %bb.4: +; FUNC1-NEXT: v_cvt_f32_u32_e32 v0, v2 +; FUNC1-NEXT: v_cvt_f32_u32_e32 v1, v3 +; FUNC1-NEXT: v_sub_i32_e32 v4, vcc, 0, v2 +; FUNC1-NEXT: v_subb_u32_e32 v5, vcc, 0, v3, vcc +; FUNC1-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 +; FUNC1-NEXT: v_rcp_f32_e32 v0, v0 +; FUNC1-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 +; FUNC1-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; FUNC1-NEXT: v_trunc_f32_e32 v1, v1 +; FUNC1-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 +; FUNC1-NEXT: v_cvt_u32_f32_e32 v1, v1 +; FUNC1-NEXT: v_cvt_u32_f32_e32 v0, v0 +; FUNC1-NEXT: v_mul_lo_u32 v10, 
v4, v1 +; FUNC1-NEXT: v_mul_hi_u32 v11, v4, v0 +; FUNC1-NEXT: v_mul_lo_u32 v13, v5, v0 +; FUNC1-NEXT: v_mul_lo_u32 v12, v4, v0 +; FUNC1-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; FUNC1-NEXT: v_add_i32_e32 v10, vcc, v10, v13 +; FUNC1-NEXT: v_mul_hi_u32 v11, v0, v12 +; FUNC1-NEXT: v_mul_lo_u32 v13, v0, v10 +; FUNC1-NEXT: v_mul_hi_u32 v14, v0, v10 +; FUNC1-NEXT: v_mul_hi_u32 v15, v1, v10 +; FUNC1-NEXT: v_mul_lo_u32 v10, v1, v10 +; FUNC1-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; FUNC1-NEXT: v_addc_u32_e32 v13, vcc, 0, v14, vcc +; FUNC1-NEXT: v_mul_lo_u32 v14, v1, v12 +; FUNC1-NEXT: v_mul_hi_u32 v12, v1, v12 +; FUNC1-NEXT: v_add_i32_e32 v11, vcc, v11, v14 +; FUNC1-NEXT: v_addc_u32_e32 v11, vcc, v13, v12, vcc +; FUNC1-NEXT: v_addc_u32_e32 v12, vcc, 0, v15, vcc +; FUNC1-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; FUNC1-NEXT: v_addc_u32_e32 v11, vcc, 0, v12, vcc +; FUNC1-NEXT: v_add_i32_e32 v0, vcc, v0, v10 +; FUNC1-NEXT: v_addc_u32_e32 v1, vcc, v1, v11, vcc +; FUNC1-NEXT: v_mul_lo_u32 v10, v4, v1 +; FUNC1-NEXT: v_mul_hi_u32 v11, v4, v0 +; FUNC1-NEXT: v_mul_lo_u32 v5, v5, v0 +; FUNC1-NEXT: v_mul_lo_u32 v4, v4, v0 +; FUNC1-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; FUNC1-NEXT: v_add_i32_e32 v5, vcc, v10, v5 +; FUNC1-NEXT: v_mul_lo_u32 v12, v0, v5 +; FUNC1-NEXT: v_mul_hi_u32 v13, v0, v4 +; FUNC1-NEXT: v_mul_hi_u32 v14, v0, v5 +; FUNC1-NEXT: v_mul_hi_u32 v11, v1, v4 +; FUNC1-NEXT: v_mul_lo_u32 v4, v1, v4 +; FUNC1-NEXT: v_mul_hi_u32 v10, v1, v5 +; FUNC1-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; FUNC1-NEXT: v_addc_u32_e32 v13, vcc, 0, v14, vcc +; FUNC1-NEXT: v_mul_lo_u32 v5, v1, v5 +; FUNC1-NEXT: v_add_i32_e32 v4, vcc, v12, v4 +; FUNC1-NEXT: v_addc_u32_e32 v4, vcc, v13, v11, vcc +; FUNC1-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc +; FUNC1-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; FUNC1-NEXT: v_addc_u32_e32 v5, vcc, 0, v10, vcc +; FUNC1-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; FUNC1-NEXT: v_addc_u32_e32 v1, vcc, v1, v5, vcc +; FUNC1-NEXT: v_mul_lo_u32 v4, v6, v1 +; FUNC1-NEXT: 
v_mul_hi_u32 v5, v6, v0 +; FUNC1-NEXT: v_mul_hi_u32 v10, v6, v1 +; FUNC1-NEXT: v_mul_hi_u32 v11, v7, v1 +; FUNC1-NEXT: v_mul_lo_u32 v1, v7, v1 +; FUNC1-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; FUNC1-NEXT: v_addc_u32_e32 v5, vcc, 0, v10, vcc +; FUNC1-NEXT: v_mul_lo_u32 v10, v7, v0 +; FUNC1-NEXT: v_mul_hi_u32 v0, v7, v0 +; FUNC1-NEXT: v_add_i32_e32 v4, vcc, v4, v10 +; FUNC1-NEXT: v_addc_u32_e32 v0, vcc, v5, v0, vcc +; FUNC1-NEXT: v_addc_u32_e32 v4, vcc, 0, v11, vcc +; FUNC1-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; FUNC1-NEXT: v_addc_u32_e32 v1, vcc, 0, v4, vcc +; FUNC1-NEXT: v_mul_lo_u32 v1, v2, v1 +; FUNC1-NEXT: v_mul_hi_u32 v4, v2, v0 +; FUNC1-NEXT: v_mul_lo_u32 v5, v3, v0 +; FUNC1-NEXT: v_mul_lo_u32 v0, v2, v0 +; FUNC1-NEXT: v_add_i32_e32 v1, vcc, v4, v1 +; FUNC1-NEXT: v_add_i32_e32 v1, vcc, v1, v5 +; FUNC1-NEXT: v_sub_i32_e32 v4, vcc, v7, v1 +; FUNC1-NEXT: v_sub_i32_e32 v0, vcc, v6, v0 +; FUNC1-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v3, vcc +; FUNC1-NEXT: v_sub_i32_e64 v5, s[0:1], v0, v2 +; FUNC1-NEXT: v_subbrev_u32_e64 v10, s[2:3], 0, v4, s[0:1] +; FUNC1-NEXT: v_cmp_ge_u32_e64 s[2:3], v10, v3 +; FUNC1-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[2:3] +; FUNC1-NEXT: v_cmp_ge_u32_e64 s[2:3], v5, v2 +; FUNC1-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[2:3] +; FUNC1-NEXT: v_cmp_eq_u32_e64 s[2:3], v10, v3 +; FUNC1-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v3, s[0:1] +; FUNC1-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[2:3] +; FUNC1-NEXT: v_sub_i32_e64 v12, s[0:1], v5, v2 +; FUNC1-NEXT: v_subb_u32_e32 v1, vcc, v7, v1, vcc +; FUNC1-NEXT: v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1] +; FUNC1-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 +; FUNC1-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v11 +; FUNC1-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc +; FUNC1-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 +; FUNC1-NEXT: v_cndmask_b32_e64 v4, v10, v4, s[0:1] +; FUNC1-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc +; FUNC1-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 +; FUNC1-NEXT: v_cndmask_b32_e32 v3, v7, v10, vcc +; FUNC1-NEXT: v_cndmask_b32_e64 v5, v5, 
v12, s[0:1] +; FUNC1-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; FUNC1-NEXT: v_cndmask_b32_e32 v11, v1, v4, vcc +; FUNC1-NEXT: v_cndmask_b32_e32 v10, v0, v5, vcc +; FUNC1-NEXT: s_cbranch_execnz .LBB5_6 +; FUNC1-NEXT: .LBB5_5: +; FUNC1-NEXT: v_cvt_f32_u32_e32 v0, v2 +; FUNC1-NEXT: v_sub_i32_e32 v1, vcc, 0, v2 +; FUNC1-NEXT: v_mov_b32_e32 v11, 0 +; FUNC1-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; FUNC1-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; FUNC1-NEXT: v_cvt_u32_f32_e32 v0, v0 +; FUNC1-NEXT: v_mul_lo_u32 v1, v1, v0 +; FUNC1-NEXT: v_mul_hi_u32 v1, v0, v1 +; FUNC1-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; FUNC1-NEXT: v_mul_hi_u32 v0, v6, v0 +; FUNC1-NEXT: v_mul_lo_u32 v0, v0, v2 +; FUNC1-NEXT: v_sub_i32_e32 v0, vcc, v6, v0 +; FUNC1-NEXT: v_sub_i32_e32 v1, vcc, v0, v2 +; FUNC1-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 +; FUNC1-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; FUNC1-NEXT: v_sub_i32_e32 v1, vcc, v0, v2 +; FUNC1-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 +; FUNC1-NEXT: v_cndmask_b32_e32 v10, v0, v1, vcc +; FUNC1-NEXT: .LBB5_6: +; FUNC1-NEXT: s_mov_b32 s7, 0xf000 +; FUNC1-NEXT: s_mov_b32 s6, -1 +; FUNC1-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; FUNC1-NEXT: s_endpgm +; FUNC1-NEXT: .LBB5_7: +; FUNC1-NEXT: ; implicit-def: $vgpr8_vgpr9 +; FUNC1-NEXT: s_branch .LBB5_2 +; FUNC1-NEXT: .LBB5_8: +; FUNC1-NEXT: s_branch .LBB5_5 +; +; FUNC2-LABEL: test_urem_v2i64: +; FUNC2: ; %bb.0: +; FUNC2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; FUNC2-NEXT: s_mov_b32 s3, 0xf000 +; FUNC2-NEXT: s_mov_b32 s2, -1 +; FUNC2-NEXT: v_mov_b32_e32 v8, 0 +; FUNC2-NEXT: s_waitcnt lgkmcnt(0) +; FUNC2-NEXT: s_mov_b32 s0, s6 +; FUNC2-NEXT: s_mov_b32 s1, s7 +; FUNC2-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 +; FUNC2-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; FUNC2-NEXT: s_waitcnt vmcnt(0) +; FUNC2-NEXT: v_or_b32_e32 v9, v5, v1 +; FUNC2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] +; FUNC2-NEXT: s_cbranch_vccz .LBB5_7 +; FUNC2-NEXT: ; %bb.1: +; FUNC2-NEXT: v_cvt_f32_u32_e32 v8, v0 +; FUNC2-NEXT: 
v_cvt_f32_u32_e32 v9, v1 +; FUNC2-NEXT: v_sub_u32_e32 v14, vcc, 0, v0 +; FUNC2-NEXT: v_subb_u32_e32 v15, vcc, 0, v1, vcc +; FUNC2-NEXT: v_mac_f32_e32 v8, 0x4f800000, v9 +; FUNC2-NEXT: v_rcp_f32_e32 v8, v8 +; FUNC2-NEXT: v_mul_f32_e32 v8, 0x5f7ffffc, v8 +; FUNC2-NEXT: v_mul_f32_e32 v9, 0x2f800000, v8 +; FUNC2-NEXT: v_trunc_f32_e32 v9, v9 +; FUNC2-NEXT: v_mac_f32_e32 v8, 0xcf800000, v9 +; FUNC2-NEXT: v_cvt_u32_f32_e32 v12, v9 +; FUNC2-NEXT: v_cvt_u32_f32_e32 v13, v8 +; FUNC2-NEXT: v_mul_lo_u32 v10, v14, v12 +; FUNC2-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v14, v13, 0 +; FUNC2-NEXT: v_mul_lo_u32 v11, v15, v13 +; FUNC2-NEXT: v_add_u32_e32 v9, vcc, v9, v10 +; FUNC2-NEXT: v_add_u32_e32 v11, vcc, v9, v11 +; FUNC2-NEXT: v_mul_hi_u32 v16, v13, v8 +; FUNC2-NEXT: v_mad_u64_u32 v[9:10], s[0:1], v13, v11, 0 +; FUNC2-NEXT: v_add_u32_e32 v16, vcc, v16, v9 +; FUNC2-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v12, v8, 0 +; FUNC2-NEXT: v_addc_u32_e32 v17, vcc, 0, v10, vcc +; FUNC2-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v12, v11, 0 +; FUNC2-NEXT: v_add_u32_e32 v8, vcc, v16, v8 +; FUNC2-NEXT: v_addc_u32_e32 v8, vcc, v17, v9, vcc +; FUNC2-NEXT: v_addc_u32_e32 v9, vcc, 0, v11, vcc +; FUNC2-NEXT: v_add_u32_e32 v8, vcc, v8, v10 +; FUNC2-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; FUNC2-NEXT: v_add_u32_e32 v16, vcc, v13, v8 +; FUNC2-NEXT: v_addc_u32_e32 v17, vcc, v12, v9, vcc +; FUNC2-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v14, v16, 0 +; FUNC2-NEXT: v_mul_lo_u32 v12, v14, v17 +; FUNC2-NEXT: v_mul_lo_u32 v13, v15, v16 +; FUNC2-NEXT: v_mul_hi_u32 v14, v16, v8 +; FUNC2-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v17, v8, 0 +; FUNC2-NEXT: v_add_u32_e32 v9, vcc, v9, v12 +; FUNC2-NEXT: v_add_u32_e32 v9, vcc, v9, v13 +; FUNC2-NEXT: v_mad_u64_u32 v[12:13], s[0:1], v16, v9, 0 +; FUNC2-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v17, v9, 0 +; FUNC2-NEXT: v_add_u32_e32 v12, vcc, v14, v12 +; FUNC2-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; FUNC2-NEXT: v_add_u32_e32 v10, vcc, v12, v10 +; FUNC2-NEXT: v_addc_u32_e32 v10, vcc, v13, 
v11, vcc +; FUNC2-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; FUNC2-NEXT: v_add_u32_e32 v8, vcc, v10, v8 +; FUNC2-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; FUNC2-NEXT: v_add_u32_e32 v10, vcc, v16, v8 +; FUNC2-NEXT: v_addc_u32_e32 v11, vcc, v17, v9, vcc +; FUNC2-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v4, v11, 0 +; FUNC2-NEXT: v_mul_hi_u32 v12, v4, v10 +; FUNC2-NEXT: v_add_u32_e32 v12, vcc, v12, v8 +; FUNC2-NEXT: v_addc_u32_e32 v13, vcc, 0, v9, vcc +; FUNC2-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v5, v10, 0 +; FUNC2-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v5, v11, 0 +; FUNC2-NEXT: v_add_u32_e32 v8, vcc, v12, v8 +; FUNC2-NEXT: v_addc_u32_e32 v8, vcc, v13, v9, vcc +; FUNC2-NEXT: v_addc_u32_e32 v9, vcc, 0, v11, vcc +; FUNC2-NEXT: v_add_u32_e32 v10, vcc, v8, v10 +; FUNC2-NEXT: v_addc_u32_e32 v8, vcc, 0, v9, vcc +; FUNC2-NEXT: v_mul_lo_u32 v11, v0, v8 +; FUNC2-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v0, v10, 0 +; FUNC2-NEXT: v_mul_lo_u32 v10, v1, v10 +; FUNC2-NEXT: v_add_u32_e32 v9, vcc, v9, v11 +; FUNC2-NEXT: v_add_u32_e32 v9, vcc, v9, v10 +; FUNC2-NEXT: v_sub_u32_e32 v10, vcc, v5, v9 +; FUNC2-NEXT: v_sub_u32_e32 v8, vcc, v4, v8 +; FUNC2-NEXT: v_subb_u32_e64 v10, s[0:1], v10, v1, vcc +; FUNC2-NEXT: v_sub_u32_e64 v11, s[0:1], v8, v0 +; FUNC2-NEXT: v_subbrev_u32_e64 v12, s[2:3], 0, v10, s[0:1] +; FUNC2-NEXT: v_cmp_ge_u32_e64 s[2:3], v12, v1 +; FUNC2-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[2:3] +; FUNC2-NEXT: v_cmp_ge_u32_e64 s[2:3], v11, v0 +; FUNC2-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[2:3] +; FUNC2-NEXT: v_cmp_eq_u32_e64 s[2:3], v12, v1 +; FUNC2-NEXT: v_subb_u32_e64 v10, s[0:1], v10, v1, s[0:1] +; FUNC2-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[2:3] +; FUNC2-NEXT: v_sub_u32_e64 v14, s[0:1], v11, v0 +; FUNC2-NEXT: v_subb_u32_e32 v5, vcc, v5, v9, vcc +; FUNC2-NEXT: v_subbrev_u32_e64 v10, s[0:1], 0, v10, s[0:1] +; FUNC2-NEXT: v_cmp_ge_u32_e32 vcc, v5, v1 +; FUNC2-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v13 +; FUNC2-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc +; FUNC2-NEXT: v_cmp_ge_u32_e32 vcc, v8, 
v0 +; FUNC2-NEXT: v_cndmask_b32_e64 v10, v12, v10, s[0:1] +; FUNC2-NEXT: v_cndmask_b32_e64 v12, 0, -1, vcc +; FUNC2-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; FUNC2-NEXT: v_cndmask_b32_e32 v1, v9, v12, vcc +; FUNC2-NEXT: v_cndmask_b32_e64 v11, v11, v14, s[0:1] +; FUNC2-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; FUNC2-NEXT: v_cndmask_b32_e32 v9, v5, v10, vcc +; FUNC2-NEXT: v_cndmask_b32_e32 v8, v8, v11, vcc +; FUNC2-NEXT: s_cbranch_execnz .LBB5_3 +; FUNC2-NEXT: .LBB5_2: +; FUNC2-NEXT: v_cvt_f32_u32_e32 v1, v0 +; FUNC2-NEXT: v_sub_u32_e32 v5, vcc, 0, v0 +; FUNC2-NEXT: v_mov_b32_e32 v9, 0 +; FUNC2-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; FUNC2-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 +; FUNC2-NEXT: v_cvt_u32_f32_e32 v1, v1 +; FUNC2-NEXT: v_mul_lo_u32 v5, v5, v1 +; FUNC2-NEXT: v_mul_hi_u32 v5, v1, v5 +; FUNC2-NEXT: v_add_u32_e32 v1, vcc, v1, v5 +; FUNC2-NEXT: v_mul_hi_u32 v1, v4, v1 +; FUNC2-NEXT: v_mul_lo_u32 v1, v1, v0 +; FUNC2-NEXT: v_sub_u32_e32 v1, vcc, v4, v1 +; FUNC2-NEXT: v_sub_u32_e32 v4, vcc, v1, v0 +; FUNC2-NEXT: v_cmp_ge_u32_e32 vcc, v1, v0 +; FUNC2-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; FUNC2-NEXT: v_sub_u32_e32 v4, vcc, v1, v0 +; FUNC2-NEXT: v_cmp_ge_u32_e32 vcc, v1, v0 +; FUNC2-NEXT: v_cndmask_b32_e32 v8, v1, v4, vcc +; FUNC2-NEXT: .LBB5_3: +; FUNC2-NEXT: v_or_b32_e32 v1, v7, v3 +; FUNC2-NEXT: v_mov_b32_e32 v0, 0 +; FUNC2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; FUNC2-NEXT: s_cbranch_vccz .LBB5_8 +; FUNC2-NEXT: ; %bb.4: +; FUNC2-NEXT: v_cvt_f32_u32_e32 v0, v2 +; FUNC2-NEXT: v_cvt_f32_u32_e32 v1, v3 +; FUNC2-NEXT: v_sub_u32_e32 v12, vcc, 0, v2 +; FUNC2-NEXT: v_subb_u32_e32 v13, vcc, 0, v3, vcc +; FUNC2-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 +; FUNC2-NEXT: v_rcp_f32_e32 v0, v0 +; FUNC2-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 +; FUNC2-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; FUNC2-NEXT: v_trunc_f32_e32 v1, v1 +; FUNC2-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 +; FUNC2-NEXT: v_cvt_u32_f32_e32 v10, v1 +; FUNC2-NEXT: v_cvt_u32_f32_e32 v11, v0 +; FUNC2-NEXT: v_mul_lo_u32 v4, 
v12, v10 +; FUNC2-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v12, v11, 0 +; FUNC2-NEXT: v_mul_lo_u32 v5, v13, v11 +; FUNC2-NEXT: v_add_u32_e32 v1, vcc, v1, v4 +; FUNC2-NEXT: v_add_u32_e32 v15, vcc, v1, v5 +; FUNC2-NEXT: v_mul_hi_u32 v14, v11, v0 +; FUNC2-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v11, v15, 0 +; FUNC2-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v10, v0, 0 +; FUNC2-NEXT: v_add_u32_e32 v14, vcc, v14, v4 +; FUNC2-NEXT: v_addc_u32_e32 v16, vcc, 0, v5, vcc +; FUNC2-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v10, v15, 0 +; FUNC2-NEXT: v_add_u32_e32 v0, vcc, v14, v0 +; FUNC2-NEXT: v_addc_u32_e32 v0, vcc, v16, v1, vcc +; FUNC2-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; FUNC2-NEXT: v_add_u32_e32 v0, vcc, v0, v4 +; FUNC2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; FUNC2-NEXT: v_add_u32_e32 v14, vcc, v11, v0 +; FUNC2-NEXT: v_addc_u32_e32 v15, vcc, v10, v1, vcc +; FUNC2-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v12, v14, 0 +; FUNC2-NEXT: v_mul_lo_u32 v10, v12, v15 +; FUNC2-NEXT: v_mul_lo_u32 v11, v13, v14 +; FUNC2-NEXT: v_mul_hi_u32 v12, v14, v0 +; FUNC2-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v15, v0, 0 +; FUNC2-NEXT: v_add_u32_e32 v1, vcc, v1, v10 +; FUNC2-NEXT: v_add_u32_e32 v1, vcc, v1, v11 +; FUNC2-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v14, v1, 0 +; FUNC2-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v15, v1, 0 +; FUNC2-NEXT: v_add_u32_e32 v10, vcc, v12, v10 +; FUNC2-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; FUNC2-NEXT: v_add_u32_e32 v4, vcc, v10, v4 +; FUNC2-NEXT: v_addc_u32_e32 v4, vcc, v11, v5, vcc +; FUNC2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; FUNC2-NEXT: v_add_u32_e32 v0, vcc, v4, v0 +; FUNC2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; FUNC2-NEXT: v_add_u32_e32 v4, vcc, v14, v0 +; FUNC2-NEXT: v_addc_u32_e32 v5, vcc, v15, v1, vcc +; FUNC2-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v6, v5, 0 +; FUNC2-NEXT: v_mul_hi_u32 v10, v6, v4 +; FUNC2-NEXT: v_add_u32_e32 v10, vcc, v10, v0 +; FUNC2-NEXT: v_addc_u32_e32 v11, vcc, 0, v1, vcc +; FUNC2-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v7, v4, 0 +; FUNC2-NEXT: 
v_mad_u64_u32 v[4:5], s[0:1], v7, v5, 0 +; FUNC2-NEXT: v_add_u32_e32 v0, vcc, v10, v0 +; FUNC2-NEXT: v_addc_u32_e32 v0, vcc, v11, v1, vcc +; FUNC2-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; FUNC2-NEXT: v_add_u32_e32 v4, vcc, v0, v4 +; FUNC2-NEXT: v_addc_u32_e32 v0, vcc, 0, v1, vcc +; FUNC2-NEXT: v_mul_lo_u32 v5, v2, v0 +; FUNC2-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, v4, 0 +; FUNC2-NEXT: v_mul_lo_u32 v4, v3, v4 +; FUNC2-NEXT: v_add_u32_e32 v1, vcc, v1, v5 +; FUNC2-NEXT: v_add_u32_e32 v1, vcc, v1, v4 +; FUNC2-NEXT: v_sub_u32_e32 v4, vcc, v7, v1 +; FUNC2-NEXT: v_sub_u32_e32 v0, vcc, v6, v0 +; FUNC2-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v3, vcc +; FUNC2-NEXT: v_sub_u32_e64 v5, s[0:1], v0, v2 +; FUNC2-NEXT: v_subbrev_u32_e64 v10, s[2:3], 0, v4, s[0:1] +; FUNC2-NEXT: v_cmp_ge_u32_e64 s[2:3], v10, v3 +; FUNC2-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[2:3] +; FUNC2-NEXT: v_cmp_ge_u32_e64 s[2:3], v5, v2 +; FUNC2-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[2:3] +; FUNC2-NEXT: v_cmp_eq_u32_e64 s[2:3], v10, v3 +; FUNC2-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v3, s[0:1] +; FUNC2-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[2:3] +; FUNC2-NEXT: v_sub_u32_e64 v12, s[0:1], v5, v2 +; FUNC2-NEXT: v_subb_u32_e32 v1, vcc, v7, v1, vcc +; FUNC2-NEXT: v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1] +; FUNC2-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 +; FUNC2-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v11 +; FUNC2-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc +; FUNC2-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 +; FUNC2-NEXT: v_cndmask_b32_e64 v4, v10, v4, s[0:1] +; FUNC2-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc +; FUNC2-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 +; FUNC2-NEXT: v_cndmask_b32_e32 v3, v7, v10, vcc +; FUNC2-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[0:1] +; FUNC2-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; FUNC2-NEXT: v_cndmask_b32_e32 v11, v1, v4, vcc +; FUNC2-NEXT: v_cndmask_b32_e32 v10, v0, v5, vcc +; FUNC2-NEXT: s_cbranch_execnz .LBB5_6 +; FUNC2-NEXT: .LBB5_5: +; FUNC2-NEXT: v_cvt_f32_u32_e32 v0, v2 +; FUNC2-NEXT: v_sub_u32_e32 v1, vcc, 0, v2 
+; FUNC2-NEXT: v_mov_b32_e32 v11, 0 +; FUNC2-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; FUNC2-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; FUNC2-NEXT: v_cvt_u32_f32_e32 v0, v0 +; FUNC2-NEXT: v_mul_lo_u32 v1, v1, v0 +; FUNC2-NEXT: v_mul_hi_u32 v1, v0, v1 +; FUNC2-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; FUNC2-NEXT: v_mul_hi_u32 v0, v6, v0 +; FUNC2-NEXT: v_mul_lo_u32 v0, v0, v2 +; FUNC2-NEXT: v_sub_u32_e32 v0, vcc, v6, v0 +; FUNC2-NEXT: v_sub_u32_e32 v1, vcc, v0, v2 +; FUNC2-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 +; FUNC2-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; FUNC2-NEXT: v_sub_u32_e32 v1, vcc, v0, v2 +; FUNC2-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 +; FUNC2-NEXT: v_cndmask_b32_e32 v10, v0, v1, vcc +; FUNC2-NEXT: .LBB5_6: +; FUNC2-NEXT: s_mov_b32 s7, 0xf000 +; FUNC2-NEXT: s_mov_b32 s6, -1 +; FUNC2-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; FUNC2-NEXT: s_endpgm +; FUNC2-NEXT: .LBB5_7: +; FUNC2-NEXT: ; implicit-def: $vgpr8_vgpr9 +; FUNC2-NEXT: s_branch .LBB5_2 +; FUNC2-NEXT: .LBB5_8: +; FUNC2-NEXT: s_branch .LBB5_5 +; +; EG-LABEL: test_urem_v2i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 0, @18, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 1 @14 +; EG-NEXT: ALU 113, @19, KC0[], KC1[] +; EG-NEXT: ALU 116, @133, KC0[], KC1[] +; EG-NEXT: ALU 115, @250, KC0[], KC1[] +; EG-NEXT: ALU 116, @366, KC0[], KC1[] +; EG-NEXT: ALU 115, @483, KC0[], KC1[] +; EG-NEXT: ALU 115, @599, KC0[], KC1[] +; EG-NEXT: ALU 115, @715, KC0[], KC1[] +; EG-NEXT: ALU 115, @831, KC0[], KC1[] +; EG-NEXT: ALU 115, @947, KC0[], KC1[] +; EG-NEXT: ALU 1, @1063, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T0.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: Fetch clause starting at 14: +; EG-NEXT: VTX_READ_128 T0.XYZW, T1.X, 16, #1 +; EG-NEXT: VTX_READ_128 T1.XYZW, T1.X, 0, #1 +; EG-NEXT: ALU clause starting at 18: +; EG-NEXT: MOV * T1.X, KC0[2].Z, +; EG-NEXT: ALU clause starting at 19: +; EG-NEXT: RECIP_UINT * T2.X, T0.X, +; EG-NEXT: SUB_INT T2.W, 0.0, T0.Z, +; EG-NEXT: RECIP_UINT * T2.Y, T0.Z, +; EG-NEXT: MULLO_INT * 
T2.Z, PV.W, PS, +; EG-NEXT: SUB_INT T2.W, 0.0, T0.X, +; EG-NEXT: MULHI * T2.Z, T2.Y, PS, +; EG-NEXT: ADD_INT T3.W, T2.Y, PS, +; EG-NEXT: MULLO_INT * T2.Y, PV.W, T2.X, +; EG-NEXT: MULHI * T2.Z, T1.W, PV.W, +; EG-NEXT: MULLO_INT * T2.Z, PS, T0.Z, +; EG-NEXT: SUB_INT T2.W, T1.W, PS, +; EG-NEXT: MULHI * T2.Y, T2.X, T2.Y, +; EG-NEXT: ADD_INT T2.Z, T2.X, PS, +; EG-NEXT: SETGE_UINT T3.W, PV.W, T0.Z, +; EG-NEXT: SUB_INT * T4.W, PV.W, T0.Z, +; EG-NEXT: CNDE_INT T2.W, PV.W, T2.W, PS, +; EG-NEXT: MULHI * T2.X, T1.Y, PV.Z, +; EG-NEXT: SETGE_UINT T2.Z, PV.W, T0.Z, +; EG-NEXT: SUB_INT T3.W, PV.W, T0.Z, +; EG-NEXT: MULLO_INT * T2.X, PS, T0.X, +; EG-NEXT: CNDE_INT T2.W, PV.Z, T2.W, PV.W, +; EG-NEXT: SUB_INT * T3.W, T1.Y, PS, +; EG-NEXT: SETGE_UINT T2.Z, PS, T0.X, +; EG-NEXT: SUB_INT T4.W, PS, T0.X, +; EG-NEXT: CNDE_INT * T1.W, T0.W, PV.W, T1.W, +; EG-NEXT: BIT_ALIGN_INT T3.Z, PS, T1.Z, literal.x, +; EG-NEXT: LSHR T1.W, PS, literal.x, +; EG-NEXT: CNDE_INT * T2.W, PV.Z, T3.W, PV.W, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SETGE_UINT T2.X, PS, T0.X, +; EG-NEXT: SUB_INT T2.Y, PS, T0.X, +; EG-NEXT: SETE_INT T2.Z, PV.W, T0.W, +; EG-NEXT: SETGE_UINT T3.W, PV.W, T0.W, +; EG-NEXT: SETGE_UINT * T4.W, PV.Z, T0.Z, +; EG-NEXT: CNDE_INT T2.Z, PV.Z, PV.W, PS, +; EG-NEXT: SUB_INT T3.W, T3.Z, T0.Z, +; EG-NEXT: CNDE_INT * T2.W, PV.X, T2.W, PV.Y, +; EG-NEXT: SUB_INT T2.Y, T1.W, T0.W, +; EG-NEXT: SUBB_UINT T4.Z, T3.Z, T0.Z, BS:VEC_021/SCL_122 +; EG-NEXT: CNDE_INT T2.W, T0.Y, PS, T1.Y, +; EG-NEXT: CNDE_INT * T3.W, PV.Z, T3.Z, PV.W, +; EG-NEXT: LSHL T1.Y, PS, 1, +; EG-NEXT: BIT_ALIGN_INT T3.Z, PV.W, T1.X, literal.x, +; EG-NEXT: LSHR T2.W, PV.W, literal.x, +; EG-NEXT: SUB_INT * T4.W, PV.Y, PV.Z, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: BFE_UINT T2.X, T1.Z, literal.x, 1, +; EG-NEXT: CNDE_INT T2.Y, T2.Z, T1.W, PS, BS:VEC_120/SCL_212 +; EG-NEXT: SETE_INT T2.Z, PV.W, T0.Y, +; EG-NEXT: SETGE_UINT T1.W, PV.W, T0.Y, +; EG-NEXT: SETGE_UINT * T4.W, PV.Z, T0.X, +; EG-NEXT: 
30(4.203895e-44), 0(0.000000e+00) +; EG-NEXT: CNDE_INT T3.Y, PV.Z, PV.W, PS, +; EG-NEXT: SUB_INT T2.Z, T3.Z, T0.X, +; EG-NEXT: BIT_ALIGN_INT T1.W, PV.Y, T3.W, literal.x, +; EG-NEXT: OR_INT * T3.W, T1.Y, PV.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SUB_INT T2.X, T2.W, T0.Y, +; EG-NEXT: SUBB_UINT T1.Y, T3.Z, T0.X, +; EG-NEXT: SETGE_UINT T4.Z, PS, T0.Z, +; EG-NEXT: SETE_INT T4.W, PV.W, T0.W, +; EG-NEXT: SETGE_UINT * T5.W, PV.W, T0.W, +; EG-NEXT: SUBB_UINT T3.X, T3.W, T0.Z, BS:VEC_021/SCL_122 +; EG-NEXT: CNDE_INT T2.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T4.Z, T3.W, T0.Z, BS:VEC_021/SCL_122 +; EG-NEXT: SUB_INT T4.W, PV.X, PV.Y, +; EG-NEXT: CNDE_INT * T5.W, T3.Y, T3.Z, T2.Z, +; EG-NEXT: SUB_INT T2.X, T1.W, T0.W, +; EG-NEXT: LSHL T1.Y, PS, 1, +; EG-NEXT: BFE_UINT T2.Z, T1.X, literal.x, 1, +; EG-NEXT: CNDE_INT * T2.W, T3.Y, T2.W, PV.W, BS:VEC_021/SCL_122 +; EG-NEXT: 30(4.203895e-44), 0(0.000000e+00) +; EG-NEXT: CNDE_INT * T3.W, T2.Y, T3.W, T4.Z, +; EG-NEXT: LSHL T3.Y, PV.W, 1, +; EG-NEXT: BIT_ALIGN_INT T3.Z, T2.W, T5.W, literal.x, +; EG-NEXT: OR_INT T2.W, T1.Y, T2.Z, +; EG-NEXT: SUB_INT * T4.W, T2.X, T3.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: BFE_UINT T2.X, T1.Z, literal.x, 1, +; EG-NEXT: CNDE_INT T1.Y, T2.Y, T1.W, PS, +; EG-NEXT: SETGE_UINT T2.Z, PV.W, T0.X, +; EG-NEXT: SETE_INT T1.W, PV.Z, T0.Y, +; EG-NEXT: SETGE_UINT * T4.W, PV.Z, T0.Y, +; EG-NEXT: 29(4.063766e-44), 0(0.000000e+00) +; EG-NEXT: CNDE_INT T2.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T2.Z, T2.W, T0.X, +; EG-NEXT: BIT_ALIGN_INT T1.W, PV.Y, T3.W, literal.x, +; EG-NEXT: OR_INT * T3.W, T3.Y, PV.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SUBB_UINT T2.X, T2.W, T0.X, +; EG-NEXT: SUB_INT T1.Y, T3.Z, T0.Y, +; EG-NEXT: SETGE_UINT T4.Z, PS, T0.Z, +; EG-NEXT: SETE_INT T4.W, PV.W, T0.W, +; EG-NEXT: SETGE_UINT * T5.W, PV.W, T0.W, +; EG-NEXT: SUBB_UINT T3.X, T3.W, T0.Z, +; EG-NEXT: CNDE_INT T3.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T4.Z, T3.W, T0.Z, +; EG-NEXT: 
SUB_INT T4.W, PV.Y, PV.X, +; EG-NEXT: CNDE_INT * T2.W, T2.Y, T2.W, T2.Z, +; EG-NEXT: SUB_INT T2.X, T1.W, T0.W, BS:VEC_021/SCL_122 +; EG-NEXT: LSHL T1.Y, PS, 1, +; EG-NEXT: BFE_UINT T2.Z, T1.X, literal.x, 1, +; EG-NEXT: CNDE_INT T4.W, T2.Y, T3.Z, PV.W, +; EG-NEXT: CNDE_INT * T3.W, PV.Y, T3.W, PV.Z, +; EG-NEXT: 29(4.063766e-44), 0(0.000000e+00) +; EG-NEXT: LSHL T2.Y, PS, 1, +; EG-NEXT: BIT_ALIGN_INT T3.Z, PV.W, T2.W, literal.x, +; EG-NEXT: OR_INT T2.W, PV.Y, PV.Z, +; EG-NEXT: SUB_INT * T4.W, PV.X, T3.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: BFE_UINT T2.X, T1.Z, literal.x, 1, +; EG-NEXT: CNDE_INT T1.Y, T3.Y, T1.W, PS, +; EG-NEXT: SETGE_UINT * T2.Z, PV.W, T0.X, +; EG-NEXT: 28(3.923636e-44), 0(0.000000e+00) +; EG-NEXT: ALU clause starting at 133: +; EG-NEXT: SETE_INT T1.W, T3.Z, T0.Y, +; EG-NEXT: SETGE_UINT * T4.W, T3.Z, T0.Y, +; EG-NEXT: CNDE_INT T3.Y, PV.W, PS, T2.Z, +; EG-NEXT: SUB_INT T2.Z, T2.W, T0.X, BS:VEC_021/SCL_122 +; EG-NEXT: BIT_ALIGN_INT T1.W, T1.Y, T3.W, literal.x, +; EG-NEXT: OR_INT * T3.W, T2.Y, T2.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SUBB_UINT T2.X, T2.W, T0.X, +; EG-NEXT: SUB_INT T1.Y, T3.Z, T0.Y, +; EG-NEXT: SETGE_UINT T4.Z, PS, T0.Z, +; EG-NEXT: SETE_INT T4.W, PV.W, T0.W, +; EG-NEXT: SETGE_UINT * T5.W, PV.W, T0.W, +; EG-NEXT: SUBB_UINT T3.X, T3.W, T0.Z, +; EG-NEXT: CNDE_INT T2.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T4.Z, T3.W, T0.Z, +; EG-NEXT: SUB_INT T4.W, PV.Y, PV.X, +; EG-NEXT: CNDE_INT * T2.W, T3.Y, T2.W, T2.Z, +; EG-NEXT: SUB_INT T2.X, T1.W, T0.W, BS:VEC_021/SCL_122 +; EG-NEXT: LSHL T1.Y, PS, 1, +; EG-NEXT: BFE_UINT T2.Z, T1.X, literal.x, 1, +; EG-NEXT: CNDE_INT T4.W, T3.Y, T3.Z, PV.W, +; EG-NEXT: CNDE_INT * T3.W, PV.Y, T3.W, PV.Z, +; EG-NEXT: 28(3.923636e-44), 0(0.000000e+00) +; EG-NEXT: LSHL T3.Y, PS, 1, +; EG-NEXT: BIT_ALIGN_INT T3.Z, PV.W, T2.W, literal.x, +; EG-NEXT: OR_INT T2.W, PV.Y, PV.Z, +; EG-NEXT: SUB_INT * T4.W, PV.X, T3.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: BFE_UINT 
T2.X, T1.Z, literal.x, 1, +; EG-NEXT: CNDE_INT T1.Y, T2.Y, T1.W, PS, +; EG-NEXT: SETGE_UINT T2.Z, PV.W, T0.X, +; EG-NEXT: SETE_INT T1.W, PV.Z, T0.Y, +; EG-NEXT: SETGE_UINT * T4.W, PV.Z, T0.Y, +; EG-NEXT: 27(3.783506e-44), 0(0.000000e+00) +; EG-NEXT: CNDE_INT T2.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T2.Z, T2.W, T0.X, +; EG-NEXT: BIT_ALIGN_INT T1.W, PV.Y, T3.W, literal.x, +; EG-NEXT: OR_INT * T3.W, T3.Y, PV.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SUBB_UINT T2.X, T2.W, T0.X, +; EG-NEXT: SUB_INT T1.Y, T3.Z, T0.Y, +; EG-NEXT: SETGE_UINT T4.Z, PS, T0.Z, +; EG-NEXT: SETE_INT T4.W, PV.W, T0.W, +; EG-NEXT: SETGE_UINT * T5.W, PV.W, T0.W, +; EG-NEXT: SUBB_UINT T3.X, T3.W, T0.Z, +; EG-NEXT: CNDE_INT T3.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T4.Z, T3.W, T0.Z, +; EG-NEXT: SUB_INT T4.W, PV.Y, PV.X, +; EG-NEXT: CNDE_INT * T2.W, T2.Y, T2.W, T2.Z, +; EG-NEXT: SUB_INT T2.X, T1.W, T0.W, BS:VEC_021/SCL_122 +; EG-NEXT: LSHL T1.Y, PS, 1, +; EG-NEXT: BFE_UINT T2.Z, T1.X, literal.x, 1, +; EG-NEXT: CNDE_INT T4.W, T2.Y, T3.Z, PV.W, +; EG-NEXT: CNDE_INT * T3.W, PV.Y, T3.W, PV.Z, +; EG-NEXT: 27(3.783506e-44), 0(0.000000e+00) +; EG-NEXT: LSHL T2.Y, PS, 1, +; EG-NEXT: BIT_ALIGN_INT T3.Z, PV.W, T2.W, literal.x, +; EG-NEXT: OR_INT T2.W, PV.Y, PV.Z, +; EG-NEXT: SUB_INT * T4.W, PV.X, T3.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: BFE_UINT T2.X, T1.Z, literal.x, 1, +; EG-NEXT: CNDE_INT T1.Y, T3.Y, T1.W, PS, +; EG-NEXT: SETGE_UINT T2.Z, PV.W, T0.X, +; EG-NEXT: SETE_INT T1.W, PV.Z, T0.Y, +; EG-NEXT: SETGE_UINT * T4.W, PV.Z, T0.Y, +; EG-NEXT: 26(3.643376e-44), 0(0.000000e+00) +; EG-NEXT: CNDE_INT T3.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T2.Z, T2.W, T0.X, +; EG-NEXT: BIT_ALIGN_INT T1.W, PV.Y, T3.W, literal.x, +; EG-NEXT: OR_INT * T3.W, T2.Y, PV.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SUBB_UINT T2.X, T2.W, T0.X, +; EG-NEXT: SUB_INT T1.Y, T3.Z, T0.Y, +; EG-NEXT: SETGE_UINT T4.Z, PS, T0.Z, +; EG-NEXT: SETE_INT T4.W, PV.W, T0.W, +; EG-NEXT: 
SETGE_UINT * T5.W, PV.W, T0.W, +; EG-NEXT: SUBB_UINT T3.X, T3.W, T0.Z, +; EG-NEXT: CNDE_INT T2.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T4.Z, T3.W, T0.Z, +; EG-NEXT: SUB_INT T4.W, PV.Y, PV.X, +; EG-NEXT: CNDE_INT * T2.W, T3.Y, T2.W, T2.Z, +; EG-NEXT: SUB_INT T2.X, T1.W, T0.W, BS:VEC_021/SCL_122 +; EG-NEXT: LSHL T1.Y, PS, 1, +; EG-NEXT: BFE_UINT T2.Z, T1.X, literal.x, 1, +; EG-NEXT: CNDE_INT T4.W, T3.Y, T3.Z, PV.W, +; EG-NEXT: CNDE_INT * T3.W, PV.Y, T3.W, PV.Z, +; EG-NEXT: 26(3.643376e-44), 0(0.000000e+00) +; EG-NEXT: LSHL T3.Y, PS, 1, +; EG-NEXT: BIT_ALIGN_INT T3.Z, PV.W, T2.W, literal.x, +; EG-NEXT: OR_INT T2.W, PV.Y, PV.Z, +; EG-NEXT: SUB_INT * T4.W, PV.X, T3.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: BFE_UINT T2.X, T1.Z, literal.x, 1, +; EG-NEXT: CNDE_INT T1.Y, T2.Y, T1.W, PS, +; EG-NEXT: SETGE_UINT T2.Z, PV.W, T0.X, +; EG-NEXT: SETE_INT T1.W, PV.Z, T0.Y, +; EG-NEXT: SETGE_UINT * T4.W, PV.Z, T0.Y, +; EG-NEXT: 25(3.503246e-44), 0(0.000000e+00) +; EG-NEXT: CNDE_INT T2.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T2.Z, T2.W, T0.X, +; EG-NEXT: BIT_ALIGN_INT T1.W, PV.Y, T3.W, literal.x, +; EG-NEXT: OR_INT * T3.W, T3.Y, PV.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SUBB_UINT T2.X, T2.W, T0.X, +; EG-NEXT: SUB_INT T1.Y, T3.Z, T0.Y, +; EG-NEXT: SETGE_UINT T4.Z, PS, T0.Z, +; EG-NEXT: SETE_INT T4.W, PV.W, T0.W, +; EG-NEXT: SETGE_UINT * T5.W, PV.W, T0.W, +; EG-NEXT: SUBB_UINT T3.X, T3.W, T0.Z, +; EG-NEXT: CNDE_INT T3.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T4.Z, T3.W, T0.Z, +; EG-NEXT: SUB_INT T4.W, PV.Y, PV.X, +; EG-NEXT: CNDE_INT * T2.W, T2.Y, T2.W, T2.Z, +; EG-NEXT: SUB_INT T2.X, T1.W, T0.W, +; EG-NEXT: LSHL T1.Y, PS, 1, +; EG-NEXT: BFE_UINT * T2.Z, T1.X, literal.x, 1, +; EG-NEXT: 25(3.503246e-44), 0(0.000000e+00) +; EG-NEXT: ALU clause starting at 250: +; EG-NEXT: CNDE_INT T4.W, T2.Y, T3.Z, T4.W, +; EG-NEXT: CNDE_INT * T3.W, T3.Y, T3.W, T4.Z, +; EG-NEXT: LSHL T2.Y, PS, 1, +; EG-NEXT: BIT_ALIGN_INT T3.Z, PV.W, T2.W, literal.x, +; EG-NEXT: 
OR_INT T2.W, T1.Y, T2.Z, +; EG-NEXT: SUB_INT * T4.W, T2.X, T3.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: BFE_UINT T2.X, T1.Z, literal.x, 1, +; EG-NEXT: CNDE_INT T1.Y, T3.Y, T1.W, PS, +; EG-NEXT: SETGE_UINT T2.Z, PV.W, T0.X, +; EG-NEXT: SETE_INT T1.W, PV.Z, T0.Y, +; EG-NEXT: SETGE_UINT * T4.W, PV.Z, T0.Y, +; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; EG-NEXT: CNDE_INT T3.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T2.Z, T2.W, T0.X, +; EG-NEXT: BIT_ALIGN_INT T1.W, PV.Y, T3.W, literal.x, +; EG-NEXT: OR_INT * T3.W, T2.Y, PV.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SUBB_UINT T2.X, T2.W, T0.X, +; EG-NEXT: SUB_INT T1.Y, T3.Z, T0.Y, +; EG-NEXT: SETGE_UINT T4.Z, PS, T0.Z, +; EG-NEXT: SETE_INT T4.W, PV.W, T0.W, +; EG-NEXT: SETGE_UINT * T5.W, PV.W, T0.W, +; EG-NEXT: SUBB_UINT T3.X, T3.W, T0.Z, +; EG-NEXT: CNDE_INT T2.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T4.Z, T3.W, T0.Z, +; EG-NEXT: SUB_INT T4.W, PV.Y, PV.X, +; EG-NEXT: CNDE_INT * T2.W, T3.Y, T2.W, T2.Z, +; EG-NEXT: SUB_INT T2.X, T1.W, T0.W, BS:VEC_021/SCL_122 +; EG-NEXT: LSHL T1.Y, PS, 1, +; EG-NEXT: BFE_UINT T2.Z, T1.X, literal.x, 1, +; EG-NEXT: CNDE_INT T4.W, T3.Y, T3.Z, PV.W, +; EG-NEXT: CNDE_INT * T3.W, PV.Y, T3.W, PV.Z, +; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; EG-NEXT: LSHL T3.Y, PS, 1, +; EG-NEXT: BIT_ALIGN_INT T3.Z, PV.W, T2.W, literal.x, +; EG-NEXT: OR_INT T2.W, PV.Y, PV.Z, +; EG-NEXT: SUB_INT * T4.W, PV.X, T3.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: BFE_UINT T2.X, T1.Z, literal.x, 1, +; EG-NEXT: CNDE_INT T1.Y, T2.Y, T1.W, PS, +; EG-NEXT: SETGE_UINT T2.Z, PV.W, T0.X, +; EG-NEXT: SETE_INT T1.W, PV.Z, T0.Y, +; EG-NEXT: SETGE_UINT * T4.W, PV.Z, T0.Y, +; EG-NEXT: 23(3.222986e-44), 0(0.000000e+00) +; EG-NEXT: CNDE_INT T2.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T2.Z, T2.W, T0.X, +; EG-NEXT: BIT_ALIGN_INT T1.W, PV.Y, T3.W, literal.x, +; EG-NEXT: OR_INT * T3.W, T3.Y, PV.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SUBB_UINT T2.X, T2.W, T0.X, +; 
EG-NEXT: SUB_INT T1.Y, T3.Z, T0.Y, +; EG-NEXT: SETGE_UINT T4.Z, PS, T0.Z, +; EG-NEXT: SETE_INT T4.W, PV.W, T0.W, +; EG-NEXT: SETGE_UINT * T5.W, PV.W, T0.W, +; EG-NEXT: SUBB_UINT T3.X, T3.W, T0.Z, +; EG-NEXT: CNDE_INT T3.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T4.Z, T3.W, T0.Z, +; EG-NEXT: SUB_INT T4.W, PV.Y, PV.X, +; EG-NEXT: CNDE_INT * T2.W, T2.Y, T2.W, T2.Z, +; EG-NEXT: SUB_INT T2.X, T1.W, T0.W, BS:VEC_021/SCL_122 +; EG-NEXT: LSHL T1.Y, PS, 1, +; EG-NEXT: BFE_UINT T2.Z, T1.X, literal.x, 1, +; EG-NEXT: CNDE_INT T4.W, T2.Y, T3.Z, PV.W, +; EG-NEXT: CNDE_INT * T3.W, PV.Y, T3.W, PV.Z, +; EG-NEXT: 23(3.222986e-44), 0(0.000000e+00) +; EG-NEXT: LSHL T2.Y, PS, 1, +; EG-NEXT: BIT_ALIGN_INT T3.Z, PV.W, T2.W, literal.x, +; EG-NEXT: OR_INT T2.W, PV.Y, PV.Z, +; EG-NEXT: SUB_INT * T4.W, PV.X, T3.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: BFE_UINT T2.X, T1.Z, literal.x, 1, +; EG-NEXT: CNDE_INT T1.Y, T3.Y, T1.W, PS, +; EG-NEXT: SETGE_UINT T2.Z, PV.W, T0.X, +; EG-NEXT: SETE_INT T1.W, PV.Z, T0.Y, +; EG-NEXT: SETGE_UINT * T4.W, PV.Z, T0.Y, +; EG-NEXT: 22(3.082857e-44), 0(0.000000e+00) +; EG-NEXT: CNDE_INT T3.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T2.Z, T2.W, T0.X, +; EG-NEXT: BIT_ALIGN_INT T1.W, PV.Y, T3.W, literal.x, +; EG-NEXT: OR_INT * T3.W, T2.Y, PV.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SUBB_UINT T2.X, T2.W, T0.X, +; EG-NEXT: SUB_INT T1.Y, T3.Z, T0.Y, +; EG-NEXT: SETGE_UINT T4.Z, PS, T0.Z, +; EG-NEXT: SETE_INT T4.W, PV.W, T0.W, +; EG-NEXT: SETGE_UINT * T5.W, PV.W, T0.W, +; EG-NEXT: SUBB_UINT T3.X, T3.W, T0.Z, +; EG-NEXT: CNDE_INT T2.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T4.Z, T3.W, T0.Z, +; EG-NEXT: SUB_INT T4.W, PV.Y, PV.X, +; EG-NEXT: CNDE_INT * T2.W, T3.Y, T2.W, T2.Z, +; EG-NEXT: SUB_INT T2.X, T1.W, T0.W, BS:VEC_021/SCL_122 +; EG-NEXT: LSHL T1.Y, PS, 1, +; EG-NEXT: BFE_UINT T2.Z, T1.X, literal.x, 1, +; EG-NEXT: CNDE_INT T4.W, T3.Y, T3.Z, PV.W, +; EG-NEXT: CNDE_INT * T3.W, PV.Y, T3.W, PV.Z, +; EG-NEXT: 22(3.082857e-44), 0(0.000000e+00) 
+; EG-NEXT: LSHL T3.Y, PS, 1, +; EG-NEXT: BIT_ALIGN_INT T3.Z, PV.W, T2.W, literal.x, +; EG-NEXT: OR_INT T2.W, PV.Y, PV.Z, +; EG-NEXT: SUB_INT * T4.W, PV.X, T3.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: BFE_UINT T2.X, T1.Z, literal.x, 1, +; EG-NEXT: CNDE_INT T1.Y, T2.Y, T1.W, PS, +; EG-NEXT: SETGE_UINT T2.Z, PV.W, T0.X, +; EG-NEXT: SETE_INT T1.W, PV.Z, T0.Y, +; EG-NEXT: SETGE_UINT * T4.W, PV.Z, T0.Y, +; EG-NEXT: 21(2.942727e-44), 0(0.000000e+00) +; EG-NEXT: CNDE_INT T2.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T2.Z, T2.W, T0.X, +; EG-NEXT: BIT_ALIGN_INT T1.W, PV.Y, T3.W, literal.x, +; EG-NEXT: OR_INT * T3.W, T3.Y, PV.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SUBB_UINT T2.X, T2.W, T0.X, +; EG-NEXT: SUB_INT * T1.Y, T3.Z, T0.Y, +; EG-NEXT: ALU clause starting at 366: +; EG-NEXT: SETGE_UINT T4.Z, T3.W, T0.Z, +; EG-NEXT: SETE_INT T4.W, T1.W, T0.W, BS:VEC_210 +; EG-NEXT: SETGE_UINT * T5.W, T1.W, T0.W, +; EG-NEXT: SUBB_UINT T3.X, T3.W, T0.Z, +; EG-NEXT: CNDE_INT T3.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T4.Z, T3.W, T0.Z, +; EG-NEXT: SUB_INT T4.W, T1.Y, T2.X, +; EG-NEXT: CNDE_INT * T2.W, T2.Y, T2.W, T2.Z, +; EG-NEXT: SUB_INT T2.X, T1.W, T0.W, BS:VEC_021/SCL_122 +; EG-NEXT: LSHL T1.Y, PS, 1, +; EG-NEXT: BFE_UINT T2.Z, T1.X, literal.x, 1, +; EG-NEXT: CNDE_INT T4.W, T2.Y, T3.Z, PV.W, +; EG-NEXT: CNDE_INT * T3.W, PV.Y, T3.W, PV.Z, +; EG-NEXT: 21(2.942727e-44), 0(0.000000e+00) +; EG-NEXT: LSHL T2.Y, PS, 1, +; EG-NEXT: BIT_ALIGN_INT T3.Z, PV.W, T2.W, literal.x, +; EG-NEXT: OR_INT T2.W, PV.Y, PV.Z, +; EG-NEXT: SUB_INT * T4.W, PV.X, T3.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: BFE_UINT T2.X, T1.Z, literal.x, 1, +; EG-NEXT: CNDE_INT T1.Y, T3.Y, T1.W, PS, +; EG-NEXT: SETGE_UINT T2.Z, PV.W, T0.X, +; EG-NEXT: SETE_INT T1.W, PV.Z, T0.Y, +; EG-NEXT: SETGE_UINT * T4.W, PV.Z, T0.Y, +; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00) +; EG-NEXT: CNDE_INT T3.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T2.Z, T2.W, T0.X, +; EG-NEXT: BIT_ALIGN_INT T1.W, 
PV.Y, T3.W, literal.x, +; EG-NEXT: OR_INT * T3.W, T2.Y, PV.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SUBB_UINT T2.X, T2.W, T0.X, +; EG-NEXT: SUB_INT T1.Y, T3.Z, T0.Y, +; EG-NEXT: SETGE_UINT T4.Z, PS, T0.Z, +; EG-NEXT: SETE_INT T4.W, PV.W, T0.W, +; EG-NEXT: SETGE_UINT * T5.W, PV.W, T0.W, +; EG-NEXT: SUBB_UINT T3.X, T3.W, T0.Z, +; EG-NEXT: CNDE_INT T2.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T4.Z, T3.W, T0.Z, +; EG-NEXT: SUB_INT T4.W, PV.Y, PV.X, +; EG-NEXT: CNDE_INT * T2.W, T3.Y, T2.W, T2.Z, +; EG-NEXT: SUB_INT T2.X, T1.W, T0.W, BS:VEC_021/SCL_122 +; EG-NEXT: LSHL T1.Y, PS, 1, +; EG-NEXT: BFE_UINT T2.Z, T1.X, literal.x, 1, +; EG-NEXT: CNDE_INT T4.W, T3.Y, T3.Z, PV.W, +; EG-NEXT: CNDE_INT * T3.W, PV.Y, T3.W, PV.Z, +; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00) +; EG-NEXT: LSHL T3.Y, PS, 1, +; EG-NEXT: BIT_ALIGN_INT T3.Z, PV.W, T2.W, literal.x, +; EG-NEXT: OR_INT T2.W, PV.Y, PV.Z, +; EG-NEXT: SUB_INT * T4.W, PV.X, T3.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: BFE_UINT T2.X, T1.Z, literal.x, 1, +; EG-NEXT: CNDE_INT T1.Y, T2.Y, T1.W, PS, +; EG-NEXT: SETGE_UINT T2.Z, PV.W, T0.X, +; EG-NEXT: SETE_INT T1.W, PV.Z, T0.Y, +; EG-NEXT: SETGE_UINT * T4.W, PV.Z, T0.Y, +; EG-NEXT: 19(2.662467e-44), 0(0.000000e+00) +; EG-NEXT: CNDE_INT T2.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T2.Z, T2.W, T0.X, +; EG-NEXT: BIT_ALIGN_INT T1.W, PV.Y, T3.W, literal.x, +; EG-NEXT: OR_INT * T3.W, T3.Y, PV.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SUBB_UINT T2.X, T2.W, T0.X, +; EG-NEXT: SUB_INT T1.Y, T3.Z, T0.Y, +; EG-NEXT: SETGE_UINT T4.Z, PS, T0.Z, +; EG-NEXT: SETE_INT T4.W, PV.W, T0.W, +; EG-NEXT: SETGE_UINT * T5.W, PV.W, T0.W, +; EG-NEXT: SUBB_UINT T3.X, T3.W, T0.Z, +; EG-NEXT: CNDE_INT T3.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T4.Z, T3.W, T0.Z, +; EG-NEXT: SUB_INT T4.W, PV.Y, PV.X, +; EG-NEXT: CNDE_INT * T2.W, T2.Y, T2.W, T2.Z, +; EG-NEXT: SUB_INT T2.X, T1.W, T0.W, BS:VEC_021/SCL_122 +; EG-NEXT: LSHL T1.Y, PS, 1, +; EG-NEXT: BFE_UINT T2.Z, T1.X, 
literal.x, 1, +; EG-NEXT: CNDE_INT T4.W, T2.Y, T3.Z, PV.W, +; EG-NEXT: CNDE_INT * T3.W, PV.Y, T3.W, PV.Z, +; EG-NEXT: 19(2.662467e-44), 0(0.000000e+00) +; EG-NEXT: LSHL T2.Y, PS, 1, +; EG-NEXT: BIT_ALIGN_INT T3.Z, PV.W, T2.W, literal.x, +; EG-NEXT: OR_INT T2.W, PV.Y, PV.Z, +; EG-NEXT: SUB_INT * T4.W, PV.X, T3.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: BFE_UINT T2.X, T1.Z, literal.x, 1, +; EG-NEXT: CNDE_INT T1.Y, T3.Y, T1.W, PS, +; EG-NEXT: SETGE_UINT T2.Z, PV.W, T0.X, +; EG-NEXT: SETE_INT T1.W, PV.Z, T0.Y, +; EG-NEXT: SETGE_UINT * T4.W, PV.Z, T0.Y, +; EG-NEXT: 18(2.522337e-44), 0(0.000000e+00) +; EG-NEXT: CNDE_INT T3.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T2.Z, T2.W, T0.X, +; EG-NEXT: BIT_ALIGN_INT T1.W, PV.Y, T3.W, literal.x, +; EG-NEXT: OR_INT * T3.W, T2.Y, PV.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SUBB_UINT T2.X, T2.W, T0.X, +; EG-NEXT: SUB_INT T1.Y, T3.Z, T0.Y, +; EG-NEXT: SETGE_UINT T4.Z, PS, T0.Z, +; EG-NEXT: SETE_INT T4.W, PV.W, T0.W, +; EG-NEXT: SETGE_UINT * T5.W, PV.W, T0.W, +; EG-NEXT: SUBB_UINT T3.X, T3.W, T0.Z, +; EG-NEXT: CNDE_INT T2.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T4.Z, T3.W, T0.Z, +; EG-NEXT: SUB_INT T4.W, PV.Y, PV.X, +; EG-NEXT: CNDE_INT * T2.W, T3.Y, T2.W, T2.Z, +; EG-NEXT: SUB_INT T2.X, T1.W, T0.W, BS:VEC_021/SCL_122 +; EG-NEXT: LSHL T1.Y, PS, 1, +; EG-NEXT: BFE_UINT T2.Z, T1.X, literal.x, 1, +; EG-NEXT: CNDE_INT T4.W, T3.Y, T3.Z, PV.W, +; EG-NEXT: CNDE_INT * T3.W, PV.Y, T3.W, PV.Z, +; EG-NEXT: 18(2.522337e-44), 0(0.000000e+00) +; EG-NEXT: LSHL T3.Y, PS, 1, +; EG-NEXT: BIT_ALIGN_INT T3.Z, PV.W, T2.W, literal.x, +; EG-NEXT: OR_INT T2.W, PV.Y, PV.Z, +; EG-NEXT: SUB_INT * T4.W, PV.X, T3.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: BFE_UINT * T2.X, T1.Z, literal.x, 1, +; EG-NEXT: 17(2.382207e-44), 0(0.000000e+00) +; EG-NEXT: ALU clause starting at 483: +; EG-NEXT: CNDE_INT T1.Y, T2.Y, T1.W, T4.W, +; EG-NEXT: SETGE_UINT T2.Z, T2.W, T0.X, +; EG-NEXT: SETE_INT T1.W, T3.Z, T0.Y, +; EG-NEXT: 
SETGE_UINT * T4.W, T3.Z, T0.Y, +; EG-NEXT: CNDE_INT T2.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T2.Z, T2.W, T0.X, BS:VEC_021/SCL_122 +; EG-NEXT: BIT_ALIGN_INT T1.W, PV.Y, T3.W, literal.x, +; EG-NEXT: OR_INT * T3.W, T3.Y, T2.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SUBB_UINT T2.X, T2.W, T0.X, +; EG-NEXT: SUB_INT T1.Y, T3.Z, T0.Y, +; EG-NEXT: SETGE_UINT T4.Z, PS, T0.Z, +; EG-NEXT: SETE_INT T4.W, PV.W, T0.W, +; EG-NEXT: SETGE_UINT * T5.W, PV.W, T0.W, +; EG-NEXT: SUBB_UINT T3.X, T3.W, T0.Z, +; EG-NEXT: CNDE_INT T3.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T4.Z, T3.W, T0.Z, +; EG-NEXT: SUB_INT T4.W, PV.Y, PV.X, +; EG-NEXT: CNDE_INT * T2.W, T2.Y, T2.W, T2.Z, +; EG-NEXT: SUB_INT T2.X, T1.W, T0.W, BS:VEC_021/SCL_122 +; EG-NEXT: LSHL T1.Y, PS, 1, +; EG-NEXT: BFE_UINT T2.Z, T1.X, literal.x, 1, +; EG-NEXT: CNDE_INT T4.W, T2.Y, T3.Z, PV.W, +; EG-NEXT: CNDE_INT * T3.W, PV.Y, T3.W, PV.Z, +; EG-NEXT: 17(2.382207e-44), 0(0.000000e+00) +; EG-NEXT: LSHL T2.Y, PS, 1, +; EG-NEXT: BIT_ALIGN_INT T3.Z, PV.W, T2.W, literal.x, +; EG-NEXT: OR_INT T2.W, PV.Y, PV.Z, +; EG-NEXT: SUB_INT * T4.W, PV.X, T3.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: BFE_UINT T2.X, T1.Z, literal.x, 1, +; EG-NEXT: CNDE_INT T1.Y, T3.Y, T1.W, PS, +; EG-NEXT: SETGE_UINT T2.Z, PV.W, T0.X, +; EG-NEXT: SETE_INT T1.W, PV.Z, T0.Y, +; EG-NEXT: SETGE_UINT * T4.W, PV.Z, T0.Y, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: CNDE_INT T3.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T2.Z, T2.W, T0.X, +; EG-NEXT: BIT_ALIGN_INT T1.W, PV.Y, T3.W, literal.x, +; EG-NEXT: OR_INT * T3.W, T2.Y, PV.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SUBB_UINT T2.X, T2.W, T0.X, +; EG-NEXT: SUB_INT T1.Y, T3.Z, T0.Y, +; EG-NEXT: SETGE_UINT T4.Z, PS, T0.Z, +; EG-NEXT: SETE_INT T4.W, PV.W, T0.W, +; EG-NEXT: SETGE_UINT * T5.W, PV.W, T0.W, +; EG-NEXT: SUBB_UINT T3.X, T3.W, T0.Z, +; EG-NEXT: CNDE_INT T2.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T4.Z, T3.W, T0.Z, +; EG-NEXT: SUB_INT T4.W, PV.Y, PV.X, +; 
EG-NEXT: CNDE_INT * T2.W, T3.Y, T2.W, T2.Z, +; EG-NEXT: SUB_INT T2.X, T1.W, T0.W, BS:VEC_021/SCL_122 +; EG-NEXT: LSHL T1.Y, PS, 1, +; EG-NEXT: BFE_UINT T2.Z, T1.X, literal.x, 1, +; EG-NEXT: CNDE_INT T4.W, T3.Y, T3.Z, PV.W, +; EG-NEXT: CNDE_INT * T3.W, PV.Y, T3.W, PV.Z, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: LSHL T3.Y, PS, 1, +; EG-NEXT: BIT_ALIGN_INT T3.Z, PV.W, T2.W, literal.x, +; EG-NEXT: OR_INT T2.W, PV.Y, PV.Z, +; EG-NEXT: SUB_INT * T4.W, PV.X, T3.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: BFE_UINT T2.X, T1.Z, literal.x, 1, +; EG-NEXT: CNDE_INT T1.Y, T2.Y, T1.W, PS, +; EG-NEXT: SETGE_UINT T2.Z, PV.W, T0.X, +; EG-NEXT: SETE_INT T1.W, PV.Z, T0.Y, +; EG-NEXT: SETGE_UINT * T4.W, PV.Z, T0.Y, +; EG-NEXT: 15(2.101948e-44), 0(0.000000e+00) +; EG-NEXT: CNDE_INT T2.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T2.Z, T2.W, T0.X, +; EG-NEXT: BIT_ALIGN_INT T1.W, PV.Y, T3.W, literal.x, +; EG-NEXT: OR_INT * T3.W, T3.Y, PV.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SUBB_UINT T2.X, T2.W, T0.X, +; EG-NEXT: SUB_INT T1.Y, T3.Z, T0.Y, +; EG-NEXT: SETGE_UINT T4.Z, PS, T0.Z, +; EG-NEXT: SETE_INT T4.W, PV.W, T0.W, +; EG-NEXT: SETGE_UINT * T5.W, PV.W, T0.W, +; EG-NEXT: SUBB_UINT T3.X, T3.W, T0.Z, +; EG-NEXT: CNDE_INT T3.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T4.Z, T3.W, T0.Z, +; EG-NEXT: SUB_INT T4.W, PV.Y, PV.X, +; EG-NEXT: CNDE_INT * T2.W, T2.Y, T2.W, T2.Z, +; EG-NEXT: SUB_INT T2.X, T1.W, T0.W, BS:VEC_021/SCL_122 +; EG-NEXT: LSHL T1.Y, PS, 1, +; EG-NEXT: BFE_UINT T2.Z, T1.X, literal.x, 1, +; EG-NEXT: CNDE_INT T4.W, T2.Y, T3.Z, PV.W, +; EG-NEXT: CNDE_INT * T3.W, PV.Y, T3.W, PV.Z, +; EG-NEXT: 15(2.101948e-44), 0(0.000000e+00) +; EG-NEXT: LSHL T2.Y, PS, 1, +; EG-NEXT: BIT_ALIGN_INT T3.Z, PV.W, T2.W, literal.x, +; EG-NEXT: OR_INT T2.W, PV.Y, PV.Z, +; EG-NEXT: SUB_INT * T4.W, PV.X, T3.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: BFE_UINT T2.X, T1.Z, literal.x, 1, +; EG-NEXT: CNDE_INT T1.Y, T3.Y, T1.W, PS, +; EG-NEXT: 
SETGE_UINT T2.Z, PV.W, T0.X, +; EG-NEXT: SETE_INT T1.W, PV.Z, T0.Y, +; EG-NEXT: SETGE_UINT * T4.W, PV.Z, T0.Y, +; EG-NEXT: 14(1.961818e-44), 0(0.000000e+00) +; EG-NEXT: CNDE_INT T3.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T2.Z, T2.W, T0.X, +; EG-NEXT: BIT_ALIGN_INT T1.W, PV.Y, T3.W, literal.x, +; EG-NEXT: OR_INT * T3.W, T2.Y, PV.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SUBB_UINT T2.X, T2.W, T0.X, +; EG-NEXT: SUB_INT T1.Y, T3.Z, T0.Y, +; EG-NEXT: SETGE_UINT T4.Z, PS, T0.Z, +; EG-NEXT: SETE_INT T4.W, PV.W, T0.W, +; EG-NEXT: SETGE_UINT * T5.W, PV.W, T0.W, +; EG-NEXT: SUBB_UINT T3.X, T3.W, T0.Z, +; EG-NEXT: CNDE_INT T2.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T4.Z, T3.W, T0.Z, +; EG-NEXT: SUB_INT T4.W, PV.Y, PV.X, +; EG-NEXT: CNDE_INT * T2.W, T3.Y, T2.W, T2.Z, +; EG-NEXT: SUB_INT * T2.X, T1.W, T0.W, +; EG-NEXT: ALU clause starting at 599: +; EG-NEXT: LSHL T1.Y, T2.W, 1, +; EG-NEXT: BFE_UINT T2.Z, T1.X, literal.x, 1, +; EG-NEXT: CNDE_INT T4.W, T3.Y, T3.Z, T4.W, +; EG-NEXT: CNDE_INT * T3.W, T2.Y, T3.W, T4.Z, +; EG-NEXT: 14(1.961818e-44), 0(0.000000e+00) +; EG-NEXT: LSHL T3.Y, PS, 1, +; EG-NEXT: BIT_ALIGN_INT T3.Z, PV.W, T2.W, literal.x, +; EG-NEXT: OR_INT T2.W, PV.Y, PV.Z, +; EG-NEXT: SUB_INT * T4.W, T2.X, T3.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: BFE_UINT T2.X, T1.Z, literal.x, 1, +; EG-NEXT: CNDE_INT T1.Y, T2.Y, T1.W, PS, +; EG-NEXT: SETGE_UINT T2.Z, PV.W, T0.X, +; EG-NEXT: SETE_INT T1.W, PV.Z, T0.Y, +; EG-NEXT: SETGE_UINT * T4.W, PV.Z, T0.Y, +; EG-NEXT: 13(1.821688e-44), 0(0.000000e+00) +; EG-NEXT: CNDE_INT T2.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T2.Z, T2.W, T0.X, +; EG-NEXT: BIT_ALIGN_INT T1.W, PV.Y, T3.W, literal.x, +; EG-NEXT: OR_INT * T3.W, T3.Y, PV.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SUBB_UINT T2.X, T2.W, T0.X, +; EG-NEXT: SUB_INT T1.Y, T3.Z, T0.Y, +; EG-NEXT: SETGE_UINT T4.Z, PS, T0.Z, +; EG-NEXT: SETE_INT T4.W, PV.W, T0.W, +; EG-NEXT: SETGE_UINT * T5.W, PV.W, T0.W, +; EG-NEXT: SUBB_UINT T3.X, 
T3.W, T0.Z, +; EG-NEXT: CNDE_INT T3.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T4.Z, T3.W, T0.Z, +; EG-NEXT: SUB_INT T4.W, PV.Y, PV.X, +; EG-NEXT: CNDE_INT * T2.W, T2.Y, T2.W, T2.Z, +; EG-NEXT: SUB_INT T2.X, T1.W, T0.W, BS:VEC_021/SCL_122 +; EG-NEXT: LSHL T1.Y, PS, 1, +; EG-NEXT: BFE_UINT T2.Z, T1.X, literal.x, 1, +; EG-NEXT: CNDE_INT T4.W, T2.Y, T3.Z, PV.W, +; EG-NEXT: CNDE_INT * T3.W, PV.Y, T3.W, PV.Z, +; EG-NEXT: 13(1.821688e-44), 0(0.000000e+00) +; EG-NEXT: LSHL T2.Y, PS, 1, +; EG-NEXT: BIT_ALIGN_INT T3.Z, PV.W, T2.W, literal.x, +; EG-NEXT: OR_INT T2.W, PV.Y, PV.Z, +; EG-NEXT: SUB_INT * T4.W, PV.X, T3.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: BFE_UINT T2.X, T1.Z, literal.x, 1, +; EG-NEXT: CNDE_INT T1.Y, T3.Y, T1.W, PS, +; EG-NEXT: SETGE_UINT T2.Z, PV.W, T0.X, +; EG-NEXT: SETE_INT T1.W, PV.Z, T0.Y, +; EG-NEXT: SETGE_UINT * T4.W, PV.Z, T0.Y, +; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00) +; EG-NEXT: CNDE_INT T3.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T2.Z, T2.W, T0.X, +; EG-NEXT: BIT_ALIGN_INT T1.W, PV.Y, T3.W, literal.x, +; EG-NEXT: OR_INT * T3.W, T2.Y, PV.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SUBB_UINT T2.X, T2.W, T0.X, +; EG-NEXT: SUB_INT T1.Y, T3.Z, T0.Y, +; EG-NEXT: SETGE_UINT T4.Z, PS, T0.Z, +; EG-NEXT: SETE_INT T4.W, PV.W, T0.W, +; EG-NEXT: SETGE_UINT * T5.W, PV.W, T0.W, +; EG-NEXT: SUBB_UINT T3.X, T3.W, T0.Z, +; EG-NEXT: CNDE_INT T2.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T4.Z, T3.W, T0.Z, +; EG-NEXT: SUB_INT T4.W, PV.Y, PV.X, +; EG-NEXT: CNDE_INT * T2.W, T3.Y, T2.W, T2.Z, +; EG-NEXT: SUB_INT T2.X, T1.W, T0.W, BS:VEC_021/SCL_122 +; EG-NEXT: LSHL T1.Y, PS, 1, +; EG-NEXT: BFE_UINT T2.Z, T1.X, literal.x, 1, +; EG-NEXT: CNDE_INT T4.W, T3.Y, T3.Z, PV.W, +; EG-NEXT: CNDE_INT * T3.W, PV.Y, T3.W, PV.Z, +; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00) +; EG-NEXT: LSHL T3.Y, PS, 1, +; EG-NEXT: BIT_ALIGN_INT T3.Z, PV.W, T2.W, literal.x, +; EG-NEXT: OR_INT T2.W, PV.Y, PV.Z, +; EG-NEXT: SUB_INT * T4.W, PV.X, T3.X, +; EG-NEXT: 
31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: BFE_UINT T2.X, T1.Z, literal.x, 1, +; EG-NEXT: CNDE_INT T1.Y, T2.Y, T1.W, PS, +; EG-NEXT: SETGE_UINT T2.Z, PV.W, T0.X, +; EG-NEXT: SETE_INT T1.W, PV.Z, T0.Y, +; EG-NEXT: SETGE_UINT * T4.W, PV.Z, T0.Y, +; EG-NEXT: 11(1.541428e-44), 0(0.000000e+00) +; EG-NEXT: CNDE_INT T2.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T2.Z, T2.W, T0.X, +; EG-NEXT: BIT_ALIGN_INT T1.W, PV.Y, T3.W, literal.x, +; EG-NEXT: OR_INT * T3.W, T3.Y, PV.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SUBB_UINT T2.X, T2.W, T0.X, +; EG-NEXT: SUB_INT T1.Y, T3.Z, T0.Y, +; EG-NEXT: SETGE_UINT T4.Z, PS, T0.Z, +; EG-NEXT: SETE_INT T4.W, PV.W, T0.W, +; EG-NEXT: SETGE_UINT * T5.W, PV.W, T0.W, +; EG-NEXT: SUBB_UINT T3.X, T3.W, T0.Z, +; EG-NEXT: CNDE_INT T3.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T4.Z, T3.W, T0.Z, +; EG-NEXT: SUB_INT T4.W, PV.Y, PV.X, +; EG-NEXT: CNDE_INT * T2.W, T2.Y, T2.W, T2.Z, +; EG-NEXT: SUB_INT T2.X, T1.W, T0.W, BS:VEC_021/SCL_122 +; EG-NEXT: LSHL T1.Y, PS, 1, +; EG-NEXT: BFE_UINT T2.Z, T1.X, literal.x, 1, +; EG-NEXT: CNDE_INT T4.W, T2.Y, T3.Z, PV.W, +; EG-NEXT: CNDE_INT * T3.W, PV.Y, T3.W, PV.Z, +; EG-NEXT: 11(1.541428e-44), 0(0.000000e+00) +; EG-NEXT: LSHL T2.Y, PS, 1, +; EG-NEXT: BIT_ALIGN_INT T3.Z, PV.W, T2.W, literal.x, +; EG-NEXT: OR_INT T2.W, PV.Y, PV.Z, +; EG-NEXT: SUB_INT * T4.W, PV.X, T3.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: BFE_UINT T2.X, T1.Z, literal.x, 1, +; EG-NEXT: CNDE_INT T1.Y, T3.Y, T1.W, PS, +; EG-NEXT: SETGE_UINT T2.Z, PV.W, T0.X, +; EG-NEXT: SETE_INT T1.W, PV.Z, T0.Y, +; EG-NEXT: SETGE_UINT * T4.W, PV.Z, T0.Y, +; EG-NEXT: 10(1.401298e-44), 0(0.000000e+00) +; EG-NEXT: CNDE_INT T3.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T2.Z, T2.W, T0.X, +; EG-NEXT: BIT_ALIGN_INT * T1.W, PV.Y, T3.W, literal.x, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: ALU clause starting at 715: +; EG-NEXT: OR_INT * T3.W, T2.Y, T2.X, +; EG-NEXT: SUBB_UINT T2.X, T2.W, T0.X, +; EG-NEXT: SUB_INT T1.Y, T3.Z, T0.Y, 
+; EG-NEXT: SETGE_UINT T4.Z, PV.W, T0.Z, +; EG-NEXT: SETE_INT T4.W, T1.W, T0.W, BS:VEC_210 +; EG-NEXT: SETGE_UINT * T5.W, T1.W, T0.W, +; EG-NEXT: SUBB_UINT T3.X, T3.W, T0.Z, +; EG-NEXT: CNDE_INT T2.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T4.Z, T3.W, T0.Z, +; EG-NEXT: SUB_INT T4.W, PV.Y, PV.X, +; EG-NEXT: CNDE_INT * T2.W, T3.Y, T2.W, T2.Z, +; EG-NEXT: SUB_INT T2.X, T1.W, T0.W, BS:VEC_021/SCL_122 +; EG-NEXT: LSHL T1.Y, PS, 1, +; EG-NEXT: BFE_UINT T2.Z, T1.X, literal.x, 1, +; EG-NEXT: CNDE_INT T4.W, T3.Y, T3.Z, PV.W, +; EG-NEXT: CNDE_INT * T3.W, PV.Y, T3.W, PV.Z, +; EG-NEXT: 10(1.401298e-44), 0(0.000000e+00) +; EG-NEXT: LSHL T3.Y, PS, 1, +; EG-NEXT: BIT_ALIGN_INT T3.Z, PV.W, T2.W, literal.x, +; EG-NEXT: OR_INT T2.W, PV.Y, PV.Z, +; EG-NEXT: SUB_INT * T4.W, PV.X, T3.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: BFE_UINT T2.X, T1.Z, literal.x, 1, +; EG-NEXT: CNDE_INT T1.Y, T2.Y, T1.W, PS, +; EG-NEXT: SETGE_UINT T2.Z, PV.W, T0.X, +; EG-NEXT: SETE_INT T1.W, PV.Z, T0.Y, +; EG-NEXT: SETGE_UINT * T4.W, PV.Z, T0.Y, +; EG-NEXT: 9(1.261169e-44), 0(0.000000e+00) +; EG-NEXT: CNDE_INT T2.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T2.Z, T2.W, T0.X, +; EG-NEXT: BIT_ALIGN_INT T1.W, PV.Y, T3.W, literal.x, +; EG-NEXT: OR_INT * T3.W, T3.Y, PV.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SUBB_UINT T2.X, T2.W, T0.X, +; EG-NEXT: SUB_INT T1.Y, T3.Z, T0.Y, +; EG-NEXT: SETGE_UINT T4.Z, PS, T0.Z, +; EG-NEXT: SETE_INT T4.W, PV.W, T0.W, +; EG-NEXT: SETGE_UINT * T5.W, PV.W, T0.W, +; EG-NEXT: SUBB_UINT T3.X, T3.W, T0.Z, +; EG-NEXT: CNDE_INT T3.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T4.Z, T3.W, T0.Z, +; EG-NEXT: SUB_INT T4.W, PV.Y, PV.X, +; EG-NEXT: CNDE_INT * T2.W, T2.Y, T2.W, T2.Z, +; EG-NEXT: SUB_INT T2.X, T1.W, T0.W, BS:VEC_021/SCL_122 +; EG-NEXT: LSHL T1.Y, PS, 1, +; EG-NEXT: BFE_UINT T2.Z, T1.X, literal.x, 1, +; EG-NEXT: CNDE_INT T4.W, T2.Y, T3.Z, PV.W, +; EG-NEXT: CNDE_INT * T3.W, PV.Y, T3.W, PV.Z, +; EG-NEXT: 9(1.261169e-44), 0(0.000000e+00) +; EG-NEXT: LSHL T2.Y, 
PS, 1, +; EG-NEXT: BIT_ALIGN_INT T3.Z, PV.W, T2.W, literal.x, +; EG-NEXT: OR_INT T2.W, PV.Y, PV.Z, +; EG-NEXT: SUB_INT * T4.W, PV.X, T3.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: BFE_UINT T2.X, T1.Z, literal.x, 1, +; EG-NEXT: CNDE_INT T1.Y, T3.Y, T1.W, PS, +; EG-NEXT: SETGE_UINT T2.Z, PV.W, T0.X, +; EG-NEXT: SETE_INT T1.W, PV.Z, T0.Y, +; EG-NEXT: SETGE_UINT * T4.W, PV.Z, T0.Y, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: CNDE_INT T3.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T2.Z, T2.W, T0.X, +; EG-NEXT: BIT_ALIGN_INT T1.W, PV.Y, T3.W, literal.x, +; EG-NEXT: OR_INT * T3.W, T2.Y, PV.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SUBB_UINT T2.X, T2.W, T0.X, +; EG-NEXT: SUB_INT T1.Y, T3.Z, T0.Y, +; EG-NEXT: SETGE_UINT T4.Z, PS, T0.Z, +; EG-NEXT: SETE_INT T4.W, PV.W, T0.W, +; EG-NEXT: SETGE_UINT * T5.W, PV.W, T0.W, +; EG-NEXT: SUBB_UINT T3.X, T3.W, T0.Z, +; EG-NEXT: CNDE_INT T2.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T4.Z, T3.W, T0.Z, +; EG-NEXT: SUB_INT T4.W, PV.Y, PV.X, +; EG-NEXT: CNDE_INT * T2.W, T3.Y, T2.W, T2.Z, +; EG-NEXT: SUB_INT T2.X, T1.W, T0.W, BS:VEC_021/SCL_122 +; EG-NEXT: LSHL T1.Y, PS, 1, +; EG-NEXT: BFE_UINT T2.Z, T1.X, literal.x, 1, +; EG-NEXT: CNDE_INT T4.W, T3.Y, T3.Z, PV.W, +; EG-NEXT: CNDE_INT * T3.W, PV.Y, T3.W, PV.Z, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: LSHL T3.Y, PS, 1, +; EG-NEXT: BIT_ALIGN_INT T3.Z, PV.W, T2.W, literal.x, +; EG-NEXT: OR_INT T2.W, PV.Y, PV.Z, +; EG-NEXT: SUB_INT * T4.W, PV.X, T3.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: BFE_UINT T2.X, T1.Z, literal.x, 1, +; EG-NEXT: CNDE_INT T1.Y, T2.Y, T1.W, PS, +; EG-NEXT: SETGE_UINT T2.Z, PV.W, T0.X, +; EG-NEXT: SETE_INT T1.W, PV.Z, T0.Y, +; EG-NEXT: SETGE_UINT * T4.W, PV.Z, T0.Y, +; EG-NEXT: 7(9.809089e-45), 0(0.000000e+00) +; EG-NEXT: CNDE_INT T2.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T2.Z, T2.W, T0.X, +; EG-NEXT: BIT_ALIGN_INT T1.W, PV.Y, T3.W, literal.x, +; EG-NEXT: OR_INT * T3.W, T3.Y, PV.X, +; EG-NEXT: 
31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SUBB_UINT T2.X, T2.W, T0.X, +; EG-NEXT: SUB_INT T1.Y, T3.Z, T0.Y, +; EG-NEXT: SETGE_UINT T4.Z, PS, T0.Z, +; EG-NEXT: SETE_INT T4.W, PV.W, T0.W, +; EG-NEXT: SETGE_UINT * T5.W, PV.W, T0.W, +; EG-NEXT: SUBB_UINT T3.X, T3.W, T0.Z, +; EG-NEXT: CNDE_INT T3.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T4.Z, T3.W, T0.Z, +; EG-NEXT: SUB_INT T4.W, PV.Y, PV.X, +; EG-NEXT: CNDE_INT * T2.W, T2.Y, T2.W, T2.Z, +; EG-NEXT: SUB_INT T2.X, T1.W, T0.W, BS:VEC_021/SCL_122 +; EG-NEXT: LSHL T1.Y, PS, 1, +; EG-NEXT: BFE_UINT T2.Z, T1.X, literal.x, 1, +; EG-NEXT: CNDE_INT T4.W, T2.Y, T3.Z, PV.W, +; EG-NEXT: CNDE_INT * T3.W, PV.Y, T3.W, PV.Z, +; EG-NEXT: 7(9.809089e-45), 0(0.000000e+00) +; EG-NEXT: LSHL T2.Y, PS, 1, +; EG-NEXT: BIT_ALIGN_INT * T3.Z, PV.W, T2.W, literal.x, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: ALU clause starting at 831: +; EG-NEXT: OR_INT T2.W, T1.Y, T2.Z, +; EG-NEXT: SUB_INT * T4.W, T2.X, T3.X, +; EG-NEXT: BFE_UINT T2.X, T1.Z, literal.x, 1, +; EG-NEXT: CNDE_INT T1.Y, T3.Y, T1.W, PS, +; EG-NEXT: SETGE_UINT T2.Z, PV.W, T0.X, +; EG-NEXT: SETE_INT T1.W, T3.Z, T0.Y, BS:VEC_120/SCL_212 +; EG-NEXT: SETGE_UINT * T4.W, T3.Z, T0.Y, +; EG-NEXT: 6(8.407791e-45), 0(0.000000e+00) +; EG-NEXT: CNDE_INT T3.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T2.Z, T2.W, T0.X, +; EG-NEXT: BIT_ALIGN_INT T1.W, PV.Y, T3.W, literal.x, +; EG-NEXT: OR_INT * T3.W, T2.Y, PV.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SUBB_UINT T2.X, T2.W, T0.X, +; EG-NEXT: SUB_INT T1.Y, T3.Z, T0.Y, +; EG-NEXT: SETGE_UINT T4.Z, PS, T0.Z, +; EG-NEXT: SETE_INT T4.W, PV.W, T0.W, +; EG-NEXT: SETGE_UINT * T5.W, PV.W, T0.W, +; EG-NEXT: SUBB_UINT T3.X, T3.W, T0.Z, +; EG-NEXT: CNDE_INT T2.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T4.Z, T3.W, T0.Z, +; EG-NEXT: SUB_INT T4.W, PV.Y, PV.X, +; EG-NEXT: CNDE_INT * T2.W, T3.Y, T2.W, T2.Z, +; EG-NEXT: SUB_INT T2.X, T1.W, T0.W, BS:VEC_021/SCL_122 +; EG-NEXT: LSHL T1.Y, PS, 1, +; EG-NEXT: BFE_UINT T2.Z, T1.X, literal.x, 1, +; 
EG-NEXT: CNDE_INT T4.W, T3.Y, T3.Z, PV.W, +; EG-NEXT: CNDE_INT * T3.W, PV.Y, T3.W, PV.Z, +; EG-NEXT: 6(8.407791e-45), 0(0.000000e+00) +; EG-NEXT: LSHL T3.Y, PS, 1, +; EG-NEXT: BIT_ALIGN_INT T3.Z, PV.W, T2.W, literal.x, +; EG-NEXT: OR_INT T2.W, PV.Y, PV.Z, +; EG-NEXT: SUB_INT * T4.W, PV.X, T3.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: BFE_UINT T2.X, T1.Z, literal.x, 1, +; EG-NEXT: CNDE_INT T1.Y, T2.Y, T1.W, PS, +; EG-NEXT: SETGE_UINT T2.Z, PV.W, T0.X, +; EG-NEXT: SETE_INT T1.W, PV.Z, T0.Y, +; EG-NEXT: SETGE_UINT * T4.W, PV.Z, T0.Y, +; EG-NEXT: 5(7.006492e-45), 0(0.000000e+00) +; EG-NEXT: CNDE_INT T2.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T2.Z, T2.W, T0.X, +; EG-NEXT: BIT_ALIGN_INT T1.W, PV.Y, T3.W, literal.x, +; EG-NEXT: OR_INT * T3.W, T3.Y, PV.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SUBB_UINT T2.X, T2.W, T0.X, +; EG-NEXT: SUB_INT T1.Y, T3.Z, T0.Y, +; EG-NEXT: SETGE_UINT T4.Z, PS, T0.Z, +; EG-NEXT: SETE_INT T4.W, PV.W, T0.W, +; EG-NEXT: SETGE_UINT * T5.W, PV.W, T0.W, +; EG-NEXT: SUBB_UINT T3.X, T3.W, T0.Z, +; EG-NEXT: CNDE_INT T3.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T4.Z, T3.W, T0.Z, +; EG-NEXT: SUB_INT T4.W, PV.Y, PV.X, +; EG-NEXT: CNDE_INT * T2.W, T2.Y, T2.W, T2.Z, +; EG-NEXT: SUB_INT T2.X, T1.W, T0.W, BS:VEC_021/SCL_122 +; EG-NEXT: LSHL T1.Y, PS, 1, +; EG-NEXT: BFE_UINT T2.Z, T1.X, literal.x, 1, +; EG-NEXT: CNDE_INT T4.W, T2.Y, T3.Z, PV.W, +; EG-NEXT: CNDE_INT * T3.W, PV.Y, T3.W, PV.Z, +; EG-NEXT: 5(7.006492e-45), 0(0.000000e+00) +; EG-NEXT: LSHL T2.Y, PS, 1, +; EG-NEXT: BIT_ALIGN_INT T3.Z, PV.W, T2.W, literal.x, +; EG-NEXT: OR_INT T2.W, PV.Y, PV.Z, +; EG-NEXT: SUB_INT * T4.W, PV.X, T3.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: BFE_UINT T2.X, T1.Z, literal.x, 1, +; EG-NEXT: CNDE_INT T1.Y, T3.Y, T1.W, PS, +; EG-NEXT: SETGE_UINT T2.Z, PV.W, T0.X, +; EG-NEXT: SETE_INT T1.W, PV.Z, T0.Y, +; EG-NEXT: SETGE_UINT * T4.W, PV.Z, T0.Y, +; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) +; EG-NEXT: CNDE_INT T3.Y, PV.W, PS, 
PV.Z, +; EG-NEXT: SUB_INT T2.Z, T2.W, T0.X, +; EG-NEXT: BIT_ALIGN_INT T1.W, PV.Y, T3.W, literal.x, +; EG-NEXT: OR_INT * T3.W, T2.Y, PV.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SUBB_UINT T2.X, T2.W, T0.X, +; EG-NEXT: SUB_INT T1.Y, T3.Z, T0.Y, +; EG-NEXT: SETGE_UINT T4.Z, PS, T0.Z, +; EG-NEXT: SETE_INT T4.W, PV.W, T0.W, +; EG-NEXT: SETGE_UINT * T5.W, PV.W, T0.W, +; EG-NEXT: SUBB_UINT T3.X, T3.W, T0.Z, +; EG-NEXT: CNDE_INT T2.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T4.Z, T3.W, T0.Z, +; EG-NEXT: SUB_INT T4.W, PV.Y, PV.X, +; EG-NEXT: CNDE_INT * T2.W, T3.Y, T2.W, T2.Z, +; EG-NEXT: SUB_INT T2.X, T1.W, T0.W, BS:VEC_021/SCL_122 +; EG-NEXT: LSHL T1.Y, PS, 1, +; EG-NEXT: BFE_UINT T2.Z, T1.X, literal.x, 1, +; EG-NEXT: CNDE_INT T4.W, T3.Y, T3.Z, PV.W, +; EG-NEXT: CNDE_INT * T3.W, PV.Y, T3.W, PV.Z, +; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) +; EG-NEXT: LSHL T3.Y, PS, 1, +; EG-NEXT: BIT_ALIGN_INT T3.Z, PV.W, T2.W, literal.x, +; EG-NEXT: OR_INT T2.W, PV.Y, PV.Z, +; EG-NEXT: SUB_INT * T4.W, PV.X, T3.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: BFE_UINT T2.X, T1.Z, literal.x, 1, +; EG-NEXT: CNDE_INT T1.Y, T2.Y, T1.W, PS, +; EG-NEXT: SETGE_UINT T2.Z, PV.W, T0.X, +; EG-NEXT: SETE_INT T1.W, PV.Z, T0.Y, +; EG-NEXT: SETGE_UINT * T4.W, PV.Z, T0.Y, +; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) +; EG-NEXT: CNDE_INT T2.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T2.Z, T2.W, T0.X, +; EG-NEXT: BIT_ALIGN_INT T1.W, PV.Y, T3.W, literal.x, +; EG-NEXT: OR_INT * T3.W, T3.Y, PV.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SUBB_UINT T2.X, T2.W, T0.X, +; EG-NEXT: SUB_INT T1.Y, T3.Z, T0.Y, +; EG-NEXT: SETGE_UINT T4.Z, PS, T0.Z, +; EG-NEXT: SETE_INT T4.W, PV.W, T0.W, +; EG-NEXT: SETGE_UINT * T5.W, PV.W, T0.W, +; EG-NEXT: SUBB_UINT T3.X, T3.W, T0.Z, +; EG-NEXT: CNDE_INT * T3.Y, PV.W, PS, PV.Z, +; EG-NEXT: ALU clause starting at 947: +; EG-NEXT: SUB_INT T4.Z, T3.W, T0.Z, +; EG-NEXT: SUB_INT T4.W, T1.Y, T2.X, +; EG-NEXT: CNDE_INT * T2.W, T2.Y, T2.W, T2.Z, +; 
EG-NEXT: SUB_INT T2.X, T1.W, T0.W, BS:VEC_021/SCL_122 +; EG-NEXT: LSHL T1.Y, PS, 1, +; EG-NEXT: BFE_UINT T2.Z, T1.X, literal.x, 1, +; EG-NEXT: CNDE_INT T4.W, T2.Y, T3.Z, PV.W, +; EG-NEXT: CNDE_INT * T3.W, T3.Y, T3.W, PV.Z, +; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) +; EG-NEXT: LSHL T2.Y, PS, 1, +; EG-NEXT: BIT_ALIGN_INT T3.Z, PV.W, T2.W, literal.x, +; EG-NEXT: OR_INT T2.W, PV.Y, PV.Z, +; EG-NEXT: SUB_INT * T4.W, PV.X, T3.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: BFE_UINT T2.X, T1.Z, literal.x, 1, +; EG-NEXT: CNDE_INT T1.Y, T3.Y, T1.W, PS, +; EG-NEXT: SETGE_UINT T2.Z, PV.W, T0.X, +; EG-NEXT: SETE_INT T1.W, PV.Z, T0.Y, +; EG-NEXT: SETGE_UINT * T4.W, PV.Z, T0.Y, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; EG-NEXT: CNDE_INT T3.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T2.Z, T2.W, T0.X, +; EG-NEXT: BIT_ALIGN_INT T1.W, PV.Y, T3.W, literal.x, +; EG-NEXT: OR_INT * T3.W, T2.Y, PV.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SUBB_UINT T2.X, T2.W, T0.X, +; EG-NEXT: SUB_INT T1.Y, T3.Z, T0.Y, +; EG-NEXT: SETGE_UINT T4.Z, PS, T0.Z, +; EG-NEXT: SETE_INT T4.W, PV.W, T0.W, +; EG-NEXT: SETGE_UINT * T5.W, PV.W, T0.W, +; EG-NEXT: SUBB_UINT T3.X, T3.W, T0.Z, +; EG-NEXT: CNDE_INT T2.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T4.Z, T3.W, T0.Z, +; EG-NEXT: SUB_INT T4.W, PV.Y, PV.X, +; EG-NEXT: CNDE_INT * T2.W, T3.Y, T2.W, T2.Z, +; EG-NEXT: SUB_INT T2.X, T1.W, T0.W, BS:VEC_021/SCL_122 +; EG-NEXT: LSHL T1.Y, PS, 1, +; EG-NEXT: BFE_UINT T2.Z, T1.X, literal.x, 1, +; EG-NEXT: CNDE_INT T4.W, T3.Y, T3.Z, PV.W, +; EG-NEXT: CNDE_INT * T3.W, PV.Y, T3.W, PV.Z, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; EG-NEXT: LSHL T3.Y, PS, 1, +; EG-NEXT: BIT_ALIGN_INT T3.Z, PV.W, T2.W, literal.x, +; EG-NEXT: OR_INT T2.W, PV.Y, PV.Z, +; EG-NEXT: SUB_INT * T4.W, PV.X, T3.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: BFE_UINT T2.X, T1.Z, 1, 1, +; EG-NEXT: CNDE_INT T1.Y, T2.Y, T1.W, PS, +; EG-NEXT: SETGE_UINT T2.Z, PV.W, T0.X, +; EG-NEXT: SETE_INT T1.W, PV.Z, 
T0.Y, +; EG-NEXT: SETGE_UINT * T4.W, PV.Z, T0.Y, +; EG-NEXT: CNDE_INT T2.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T2.Z, T2.W, T0.X, +; EG-NEXT: BIT_ALIGN_INT T1.W, PV.Y, T3.W, literal.x, +; EG-NEXT: OR_INT * T3.W, T3.Y, PV.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SUBB_UINT T2.X, T2.W, T0.X, +; EG-NEXT: SUB_INT T1.Y, T3.Z, T0.Y, +; EG-NEXT: SETGE_UINT T4.Z, PS, T0.Z, +; EG-NEXT: SETE_INT T4.W, PV.W, T0.W, +; EG-NEXT: SETGE_UINT * T5.W, PV.W, T0.W, +; EG-NEXT: CNDE_INT T3.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T4.Z, T3.W, T0.Z, +; EG-NEXT: SUB_INT T4.W, PV.Y, PV.X, +; EG-NEXT: CNDE_INT * T2.W, T2.Y, T2.W, T2.Z, +; EG-NEXT: LSHL T1.Y, PS, 1, +; EG-NEXT: BFE_UINT T2.Z, T1.X, 1, 1, +; EG-NEXT: CNDE_INT T4.W, T2.Y, T3.Z, PV.W, +; EG-NEXT: CNDE_INT * T5.W, PV.Y, T3.W, PV.Z, +; EG-NEXT: LSHL T2.X, PS, 1, +; EG-NEXT: SUBB_UINT T2.Y, T3.W, T0.Z, +; EG-NEXT: SUB_INT * T3.Z, T1.W, T0.W, BS:VEC_120/SCL_212 +; EG-NEXT: BIT_ALIGN_INT T2.W, T4.W, T2.W, literal.x, +; EG-NEXT: OR_INT * T3.W, T1.Y, T2.Z, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T3.X, T1.Z, 1, +; EG-NEXT: SETGE_UINT T1.Y, PS, T0.X, +; EG-NEXT: SETE_INT T1.Z, PV.W, T0.Y, BS:VEC_021/SCL_122 +; EG-NEXT: SETGE_UINT T4.W, PV.W, T0.Y, BS:VEC_021/SCL_122 +; EG-NEXT: SUB_INT * T6.W, T3.Z, T2.Y, +; EG-NEXT: CNDE_INT T4.X, T3.Y, T1.W, PS, +; EG-NEXT: CNDE_INT T1.Y, PV.Z, PV.W, PV.Y, +; EG-NEXT: SUB_INT T1.Z, T3.W, T0.X, +; EG-NEXT: SUBB_UINT T1.W, T3.W, T0.X, +; EG-NEXT: SUB_INT * T4.W, T2.W, T0.Y, +; EG-NEXT: SUB_INT T2.Y, PS, PV.W, +; EG-NEXT: CNDE_INT T1.Z, PV.Y, T3.W, PV.Z, +; EG-NEXT: BIT_ALIGN_INT T1.W, PV.X, T5.W, literal.x, BS:VEC_021/SCL_122 +; EG-NEXT: OR_INT * T3.W, T2.X, T3.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SETGE_UINT T2.X, PS, T0.Z, +; EG-NEXT: SETE_INT T3.Y, PV.W, T0.W, BS:VEC_021/SCL_122 +; EG-NEXT: LSHL T2.Z, PV.Z, 1, +; EG-NEXT: AND_INT T4.W, T1.X, 1, +; EG-NEXT: CNDE_INT * T2.W, T1.Y, T2.W, PV.Y, +; EG-NEXT: SETGE_UINT T1.X, T1.W, T0.W, +; 
EG-NEXT: SUBB_UINT T1.Y, T3.W, T0.Z, BS:VEC_201 +; EG-NEXT: SUB_INT T3.Z, T1.W, T0.W, +; EG-NEXT: BIT_ALIGN_INT T0.W, PS, T1.Z, literal.x, +; EG-NEXT: OR_INT * T2.W, PV.Z, PV.W, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SETGE_UINT T2.Y, PS, T0.X, +; EG-NEXT: SETE_INT T1.Z, PV.W, T0.Y, +; EG-NEXT: SUB_INT T4.W, PV.Z, PV.Y, +; EG-NEXT: CNDE_INT * T5.W, T3.Y, PV.X, T2.X, +; EG-NEXT: SETGE_UINT T1.X, T0.W, T0.Y, +; EG-NEXT: SUBB_UINT T1.Y, T2.W, T0.X, BS:VEC_120/SCL_212 +; EG-NEXT: SUB_INT T2.Z, T0.W, T0.Y, +; EG-NEXT: SUB_INT * T6.W, T3.W, T0.Z, BS:VEC_201 +; EG-NEXT: CNDE_INT * T4.W, T5.W, T1.W, T4.W, +; EG-NEXT: CNDE_INT T4.Z, T5.W, T3.W, T6.W, +; EG-NEXT: SUB_INT T1.W, T2.Z, T1.Y, +; EG-NEXT: CNDE_INT * T3.W, T1.Z, T1.X, T2.Y, +; EG-NEXT: CNDE_INT T4.Y, PS, T0.W, PV.W, +; EG-NEXT: SUB_INT * T0.W, T2.W, T0.X, +; EG-NEXT: CNDE_INT * T4.X, T3.W, T2.W, PV.W, +; EG-NEXT: ALU clause starting at 1063: +; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %b_ptr = getelementptr <2 x i64>, ptr addrspace(1) %in, i64 1 %a = load <2 x i64>, ptr addrspace(1) %in %b = load <2 x i64>, ptr addrspace(1) %b_ptr @@ -81,10 +3144,3180 @@ ret void } -; FUNC-LABEL: {{^}}test_urem_v4i64: -; SI: s_endpgm -; EG: CF_END define amdgpu_kernel void @test_urem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; FUNC1-LABEL: test_urem_v4i64: +; FUNC1: ; %bb.0: +; FUNC1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; FUNC1-NEXT: s_mov_b32 s3, 0xf000 +; FUNC1-NEXT: s_mov_b32 s2, -1 +; FUNC1-NEXT: v_mov_b32_e32 v8, 0 +; FUNC1-NEXT: s_waitcnt lgkmcnt(0) +; FUNC1-NEXT: s_mov_b32 s0, s6 +; FUNC1-NEXT: s_mov_b32 s1, s7 +; FUNC1-NEXT: buffer_load_dwordx4 v[10:13], off, s[0:3], 0 offset:32 +; FUNC1-NEXT: buffer_load_dwordx4 v[14:17], off, s[0:3], 0 +; FUNC1-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; FUNC1-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; FUNC1-NEXT: s_waitcnt vmcnt(2) +; FUNC1-NEXT: v_or_b32_e32 v9, v15, 
v11 +; FUNC1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] +; FUNC1-NEXT: s_cbranch_vccz .LBB6_13 +; FUNC1-NEXT: ; %bb.1: +; FUNC1-NEXT: v_cvt_f32_u32_e32 v8, v10 +; FUNC1-NEXT: v_cvt_f32_u32_e32 v9, v11 +; FUNC1-NEXT: v_sub_i32_e32 v18, vcc, 0, v10 +; FUNC1-NEXT: v_subb_u32_e32 v19, vcc, 0, v11, vcc +; FUNC1-NEXT: v_mac_f32_e32 v8, 0x4f800000, v9 +; FUNC1-NEXT: v_rcp_f32_e32 v8, v8 +; FUNC1-NEXT: v_mul_f32_e32 v8, 0x5f7ffffc, v8 +; FUNC1-NEXT: v_mul_f32_e32 v9, 0x2f800000, v8 +; FUNC1-NEXT: v_trunc_f32_e32 v9, v9 +; FUNC1-NEXT: v_mac_f32_e32 v8, 0xcf800000, v9 +; FUNC1-NEXT: v_cvt_u32_f32_e32 v9, v9 +; FUNC1-NEXT: v_cvt_u32_f32_e32 v8, v8 +; FUNC1-NEXT: v_mul_lo_u32 v20, v18, v9 +; FUNC1-NEXT: v_mul_hi_u32 v21, v18, v8 +; FUNC1-NEXT: v_mul_lo_u32 v23, v19, v8 +; FUNC1-NEXT: v_mul_lo_u32 v22, v18, v8 +; FUNC1-NEXT: v_add_i32_e32 v20, vcc, v21, v20 +; FUNC1-NEXT: v_add_i32_e32 v20, vcc, v20, v23 +; FUNC1-NEXT: v_mul_hi_u32 v21, v8, v22 +; FUNC1-NEXT: v_mul_lo_u32 v23, v8, v20 +; FUNC1-NEXT: v_mul_hi_u32 v24, v8, v20 +; FUNC1-NEXT: v_mul_hi_u32 v25, v9, v20 +; FUNC1-NEXT: v_mul_lo_u32 v20, v9, v20 +; FUNC1-NEXT: v_add_i32_e32 v21, vcc, v21, v23 +; FUNC1-NEXT: v_addc_u32_e32 v23, vcc, 0, v24, vcc +; FUNC1-NEXT: v_mul_lo_u32 v24, v9, v22 +; FUNC1-NEXT: v_mul_hi_u32 v22, v9, v22 +; FUNC1-NEXT: v_add_i32_e32 v21, vcc, v21, v24 +; FUNC1-NEXT: v_addc_u32_e32 v21, vcc, v23, v22, vcc +; FUNC1-NEXT: v_addc_u32_e32 v22, vcc, 0, v25, vcc +; FUNC1-NEXT: v_add_i32_e32 v20, vcc, v21, v20 +; FUNC1-NEXT: v_addc_u32_e32 v21, vcc, 0, v22, vcc +; FUNC1-NEXT: v_add_i32_e32 v8, vcc, v8, v20 +; FUNC1-NEXT: v_addc_u32_e32 v9, vcc, v9, v21, vcc +; FUNC1-NEXT: v_mul_lo_u32 v20, v18, v9 +; FUNC1-NEXT: v_mul_hi_u32 v21, v18, v8 +; FUNC1-NEXT: v_mul_lo_u32 v19, v19, v8 +; FUNC1-NEXT: v_mul_lo_u32 v18, v18, v8 +; FUNC1-NEXT: v_add_i32_e32 v20, vcc, v21, v20 +; FUNC1-NEXT: v_add_i32_e32 v19, vcc, v20, v19 +; FUNC1-NEXT: v_mul_lo_u32 v22, v8, v19 +; FUNC1-NEXT: v_mul_hi_u32 v23, v8, v18 +; FUNC1-NEXT: 
v_mul_hi_u32 v24, v8, v19 +; FUNC1-NEXT: v_mul_hi_u32 v21, v9, v18 +; FUNC1-NEXT: v_mul_lo_u32 v18, v9, v18 +; FUNC1-NEXT: v_mul_hi_u32 v20, v9, v19 +; FUNC1-NEXT: v_add_i32_e32 v22, vcc, v23, v22 +; FUNC1-NEXT: v_addc_u32_e32 v23, vcc, 0, v24, vcc +; FUNC1-NEXT: v_mul_lo_u32 v19, v9, v19 +; FUNC1-NEXT: v_add_i32_e32 v18, vcc, v22, v18 +; FUNC1-NEXT: v_addc_u32_e32 v18, vcc, v23, v21, vcc +; FUNC1-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc +; FUNC1-NEXT: v_add_i32_e32 v18, vcc, v18, v19 +; FUNC1-NEXT: v_addc_u32_e32 v19, vcc, 0, v20, vcc +; FUNC1-NEXT: v_add_i32_e32 v8, vcc, v8, v18 +; FUNC1-NEXT: v_addc_u32_e32 v9, vcc, v9, v19, vcc +; FUNC1-NEXT: v_mul_lo_u32 v18, v14, v9 +; FUNC1-NEXT: v_mul_hi_u32 v19, v14, v8 +; FUNC1-NEXT: v_mul_hi_u32 v20, v14, v9 +; FUNC1-NEXT: v_mul_hi_u32 v21, v15, v9 +; FUNC1-NEXT: v_mul_lo_u32 v9, v15, v9 +; FUNC1-NEXT: v_add_i32_e32 v18, vcc, v19, v18 +; FUNC1-NEXT: v_addc_u32_e32 v19, vcc, 0, v20, vcc +; FUNC1-NEXT: v_mul_lo_u32 v20, v15, v8 +; FUNC1-NEXT: v_mul_hi_u32 v8, v15, v8 +; FUNC1-NEXT: v_add_i32_e32 v18, vcc, v18, v20 +; FUNC1-NEXT: v_addc_u32_e32 v8, vcc, v19, v8, vcc +; FUNC1-NEXT: v_addc_u32_e32 v18, vcc, 0, v21, vcc +; FUNC1-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; FUNC1-NEXT: v_addc_u32_e32 v9, vcc, 0, v18, vcc +; FUNC1-NEXT: v_mul_lo_u32 v9, v10, v9 +; FUNC1-NEXT: v_mul_hi_u32 v18, v10, v8 +; FUNC1-NEXT: v_mul_lo_u32 v19, v11, v8 +; FUNC1-NEXT: v_mul_lo_u32 v8, v10, v8 +; FUNC1-NEXT: v_add_i32_e32 v9, vcc, v18, v9 +; FUNC1-NEXT: v_add_i32_e32 v9, vcc, v9, v19 +; FUNC1-NEXT: v_sub_i32_e32 v18, vcc, v15, v9 +; FUNC1-NEXT: v_sub_i32_e32 v8, vcc, v14, v8 +; FUNC1-NEXT: v_subb_u32_e64 v18, s[0:1], v18, v11, vcc +; FUNC1-NEXT: v_sub_i32_e64 v19, s[0:1], v8, v10 +; FUNC1-NEXT: v_subbrev_u32_e64 v20, s[2:3], 0, v18, s[0:1] +; FUNC1-NEXT: v_cmp_ge_u32_e64 s[2:3], v20, v11 +; FUNC1-NEXT: v_cndmask_b32_e64 v21, 0, -1, s[2:3] +; FUNC1-NEXT: v_cmp_ge_u32_e64 s[2:3], v19, v10 +; FUNC1-NEXT: v_cndmask_b32_e64 v22, 0, -1, s[2:3] +; 
FUNC1-NEXT: v_cmp_eq_u32_e64 s[2:3], v20, v11 +; FUNC1-NEXT: v_subb_u32_e64 v18, s[0:1], v18, v11, s[0:1] +; FUNC1-NEXT: v_cndmask_b32_e64 v21, v21, v22, s[2:3] +; FUNC1-NEXT: v_sub_i32_e64 v22, s[0:1], v19, v10 +; FUNC1-NEXT: v_subb_u32_e32 v9, vcc, v15, v9, vcc +; FUNC1-NEXT: v_subbrev_u32_e64 v18, s[0:1], 0, v18, s[0:1] +; FUNC1-NEXT: v_cmp_ge_u32_e32 vcc, v9, v11 +; FUNC1-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v21 +; FUNC1-NEXT: v_cndmask_b32_e64 v15, 0, -1, vcc +; FUNC1-NEXT: v_cmp_ge_u32_e32 vcc, v8, v10 +; FUNC1-NEXT: v_cndmask_b32_e64 v18, v20, v18, s[0:1] +; FUNC1-NEXT: v_cndmask_b32_e64 v20, 0, -1, vcc +; FUNC1-NEXT: v_cmp_eq_u32_e32 vcc, v9, v11 +; FUNC1-NEXT: v_cndmask_b32_e32 v11, v15, v20, vcc +; FUNC1-NEXT: v_cndmask_b32_e64 v19, v19, v22, s[0:1] +; FUNC1-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; FUNC1-NEXT: v_cndmask_b32_e32 v9, v9, v18, vcc +; FUNC1-NEXT: v_cndmask_b32_e32 v8, v8, v19, vcc +; FUNC1-NEXT: s_cbranch_execnz .LBB6_3 +; FUNC1-NEXT: .LBB6_2: +; FUNC1-NEXT: v_cvt_f32_u32_e32 v8, v10 +; FUNC1-NEXT: v_sub_i32_e32 v9, vcc, 0, v10 +; FUNC1-NEXT: v_rcp_iflag_f32_e32 v8, v8 +; FUNC1-NEXT: v_mul_f32_e32 v8, 0x4f7ffffe, v8 +; FUNC1-NEXT: v_cvt_u32_f32_e32 v8, v8 +; FUNC1-NEXT: v_mul_lo_u32 v9, v9, v8 +; FUNC1-NEXT: v_mul_hi_u32 v9, v8, v9 +; FUNC1-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; FUNC1-NEXT: v_mul_hi_u32 v8, v14, v8 +; FUNC1-NEXT: v_mul_lo_u32 v8, v8, v10 +; FUNC1-NEXT: v_sub_i32_e32 v8, vcc, v14, v8 +; FUNC1-NEXT: v_sub_i32_e32 v9, vcc, v8, v10 +; FUNC1-NEXT: v_cmp_ge_u32_e32 vcc, v8, v10 +; FUNC1-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc +; FUNC1-NEXT: v_sub_i32_e32 v9, vcc, v8, v10 +; FUNC1-NEXT: v_cmp_ge_u32_e32 vcc, v8, v10 +; FUNC1-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc +; FUNC1-NEXT: v_mov_b32_e32 v9, 0 +; FUNC1-NEXT: .LBB6_3: +; FUNC1-NEXT: v_or_b32_e32 v11, v17, v13 +; FUNC1-NEXT: v_mov_b32_e32 v10, 0 +; FUNC1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] +; FUNC1-NEXT: s_cbranch_vccz .LBB6_14 +; FUNC1-NEXT: ; %bb.4: +; FUNC1-NEXT: 
v_cvt_f32_u32_e32 v10, v12 +; FUNC1-NEXT: v_cvt_f32_u32_e32 v11, v13 +; FUNC1-NEXT: v_sub_i32_e32 v14, vcc, 0, v12 +; FUNC1-NEXT: v_subb_u32_e32 v15, vcc, 0, v13, vcc +; FUNC1-NEXT: v_mac_f32_e32 v10, 0x4f800000, v11 +; FUNC1-NEXT: v_rcp_f32_e32 v10, v10 +; FUNC1-NEXT: v_mul_f32_e32 v10, 0x5f7ffffc, v10 +; FUNC1-NEXT: v_mul_f32_e32 v11, 0x2f800000, v10 +; FUNC1-NEXT: v_trunc_f32_e32 v11, v11 +; FUNC1-NEXT: v_mac_f32_e32 v10, 0xcf800000, v11 +; FUNC1-NEXT: v_cvt_u32_f32_e32 v11, v11 +; FUNC1-NEXT: v_cvt_u32_f32_e32 v10, v10 +; FUNC1-NEXT: v_mul_lo_u32 v18, v14, v11 +; FUNC1-NEXT: v_mul_hi_u32 v19, v14, v10 +; FUNC1-NEXT: v_mul_lo_u32 v21, v15, v10 +; FUNC1-NEXT: v_mul_lo_u32 v20, v14, v10 +; FUNC1-NEXT: v_add_i32_e32 v18, vcc, v19, v18 +; FUNC1-NEXT: v_add_i32_e32 v18, vcc, v18, v21 +; FUNC1-NEXT: v_mul_hi_u32 v19, v10, v20 +; FUNC1-NEXT: v_mul_lo_u32 v21, v10, v18 +; FUNC1-NEXT: v_mul_hi_u32 v22, v10, v18 +; FUNC1-NEXT: v_mul_hi_u32 v23, v11, v18 +; FUNC1-NEXT: v_mul_lo_u32 v18, v11, v18 +; FUNC1-NEXT: v_add_i32_e32 v19, vcc, v19, v21 +; FUNC1-NEXT: v_addc_u32_e32 v21, vcc, 0, v22, vcc +; FUNC1-NEXT: v_mul_lo_u32 v22, v11, v20 +; FUNC1-NEXT: v_mul_hi_u32 v20, v11, v20 +; FUNC1-NEXT: v_add_i32_e32 v19, vcc, v19, v22 +; FUNC1-NEXT: v_addc_u32_e32 v19, vcc, v21, v20, vcc +; FUNC1-NEXT: v_addc_u32_e32 v20, vcc, 0, v23, vcc +; FUNC1-NEXT: v_add_i32_e32 v18, vcc, v19, v18 +; FUNC1-NEXT: v_addc_u32_e32 v19, vcc, 0, v20, vcc +; FUNC1-NEXT: v_add_i32_e32 v10, vcc, v10, v18 +; FUNC1-NEXT: v_addc_u32_e32 v11, vcc, v11, v19, vcc +; FUNC1-NEXT: v_mul_lo_u32 v18, v14, v11 +; FUNC1-NEXT: v_mul_hi_u32 v19, v14, v10 +; FUNC1-NEXT: v_mul_lo_u32 v15, v15, v10 +; FUNC1-NEXT: v_mul_lo_u32 v14, v14, v10 +; FUNC1-NEXT: v_add_i32_e32 v18, vcc, v19, v18 +; FUNC1-NEXT: v_add_i32_e32 v15, vcc, v18, v15 +; FUNC1-NEXT: v_mul_lo_u32 v20, v10, v15 +; FUNC1-NEXT: v_mul_hi_u32 v21, v10, v14 +; FUNC1-NEXT: v_mul_hi_u32 v22, v10, v15 +; FUNC1-NEXT: v_mul_hi_u32 v19, v11, v14 +; FUNC1-NEXT: 
v_mul_lo_u32 v14, v11, v14 +; FUNC1-NEXT: v_mul_hi_u32 v18, v11, v15 +; FUNC1-NEXT: v_add_i32_e32 v20, vcc, v21, v20 +; FUNC1-NEXT: v_addc_u32_e32 v21, vcc, 0, v22, vcc +; FUNC1-NEXT: v_mul_lo_u32 v15, v11, v15 +; FUNC1-NEXT: v_add_i32_e32 v14, vcc, v20, v14 +; FUNC1-NEXT: v_addc_u32_e32 v14, vcc, v21, v19, vcc +; FUNC1-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc +; FUNC1-NEXT: v_add_i32_e32 v14, vcc, v14, v15 +; FUNC1-NEXT: v_addc_u32_e32 v15, vcc, 0, v18, vcc +; FUNC1-NEXT: v_add_i32_e32 v10, vcc, v10, v14 +; FUNC1-NEXT: v_addc_u32_e32 v11, vcc, v11, v15, vcc +; FUNC1-NEXT: v_mul_lo_u32 v14, v16, v11 +; FUNC1-NEXT: v_mul_hi_u32 v15, v16, v10 +; FUNC1-NEXT: v_mul_hi_u32 v18, v16, v11 +; FUNC1-NEXT: v_mul_hi_u32 v19, v17, v11 +; FUNC1-NEXT: v_mul_lo_u32 v11, v17, v11 +; FUNC1-NEXT: v_add_i32_e32 v14, vcc, v15, v14 +; FUNC1-NEXT: v_addc_u32_e32 v15, vcc, 0, v18, vcc +; FUNC1-NEXT: v_mul_lo_u32 v18, v17, v10 +; FUNC1-NEXT: v_mul_hi_u32 v10, v17, v10 +; FUNC1-NEXT: v_add_i32_e32 v14, vcc, v14, v18 +; FUNC1-NEXT: v_addc_u32_e32 v10, vcc, v15, v10, vcc +; FUNC1-NEXT: v_addc_u32_e32 v14, vcc, 0, v19, vcc +; FUNC1-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; FUNC1-NEXT: v_addc_u32_e32 v11, vcc, 0, v14, vcc +; FUNC1-NEXT: v_mul_lo_u32 v11, v12, v11 +; FUNC1-NEXT: v_mul_hi_u32 v14, v12, v10 +; FUNC1-NEXT: v_mul_lo_u32 v15, v13, v10 +; FUNC1-NEXT: v_mul_lo_u32 v10, v12, v10 +; FUNC1-NEXT: v_add_i32_e32 v11, vcc, v14, v11 +; FUNC1-NEXT: v_add_i32_e32 v11, vcc, v11, v15 +; FUNC1-NEXT: v_sub_i32_e32 v14, vcc, v17, v11 +; FUNC1-NEXT: v_sub_i32_e32 v10, vcc, v16, v10 +; FUNC1-NEXT: v_subb_u32_e64 v14, s[0:1], v14, v13, vcc +; FUNC1-NEXT: v_sub_i32_e64 v15, s[0:1], v10, v12 +; FUNC1-NEXT: v_subbrev_u32_e64 v18, s[2:3], 0, v14, s[0:1] +; FUNC1-NEXT: v_cmp_ge_u32_e64 s[2:3], v18, v13 +; FUNC1-NEXT: v_cndmask_b32_e64 v19, 0, -1, s[2:3] +; FUNC1-NEXT: v_cmp_ge_u32_e64 s[2:3], v15, v12 +; FUNC1-NEXT: v_cndmask_b32_e64 v20, 0, -1, s[2:3] +; FUNC1-NEXT: v_cmp_eq_u32_e64 s[2:3], v18, v13 +; 
FUNC1-NEXT: v_subb_u32_e64 v14, s[0:1], v14, v13, s[0:1] +; FUNC1-NEXT: v_cndmask_b32_e64 v19, v19, v20, s[2:3] +; FUNC1-NEXT: v_sub_i32_e64 v20, s[0:1], v15, v12 +; FUNC1-NEXT: v_subb_u32_e32 v11, vcc, v17, v11, vcc +; FUNC1-NEXT: v_subbrev_u32_e64 v14, s[0:1], 0, v14, s[0:1] +; FUNC1-NEXT: v_cmp_ge_u32_e32 vcc, v11, v13 +; FUNC1-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v19 +; FUNC1-NEXT: v_cndmask_b32_e64 v17, 0, -1, vcc +; FUNC1-NEXT: v_cmp_ge_u32_e32 vcc, v10, v12 +; FUNC1-NEXT: v_cndmask_b32_e64 v14, v18, v14, s[0:1] +; FUNC1-NEXT: v_cndmask_b32_e64 v18, 0, -1, vcc +; FUNC1-NEXT: v_cmp_eq_u32_e32 vcc, v11, v13 +; FUNC1-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc +; FUNC1-NEXT: v_cndmask_b32_e64 v15, v15, v20, s[0:1] +; FUNC1-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; FUNC1-NEXT: v_cndmask_b32_e32 v11, v11, v14, vcc +; FUNC1-NEXT: v_cndmask_b32_e32 v10, v10, v15, vcc +; FUNC1-NEXT: s_cbranch_execnz .LBB6_6 +; FUNC1-NEXT: .LBB6_5: +; FUNC1-NEXT: v_cvt_f32_u32_e32 v10, v12 +; FUNC1-NEXT: v_sub_i32_e32 v11, vcc, 0, v12 +; FUNC1-NEXT: v_rcp_iflag_f32_e32 v10, v10 +; FUNC1-NEXT: v_mul_f32_e32 v10, 0x4f7ffffe, v10 +; FUNC1-NEXT: v_cvt_u32_f32_e32 v10, v10 +; FUNC1-NEXT: v_mul_lo_u32 v11, v11, v10 +; FUNC1-NEXT: v_mul_hi_u32 v11, v10, v11 +; FUNC1-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; FUNC1-NEXT: v_mul_hi_u32 v10, v16, v10 +; FUNC1-NEXT: v_mul_lo_u32 v10, v10, v12 +; FUNC1-NEXT: v_sub_i32_e32 v10, vcc, v16, v10 +; FUNC1-NEXT: v_sub_i32_e32 v11, vcc, v10, v12 +; FUNC1-NEXT: v_cmp_ge_u32_e32 vcc, v10, v12 +; FUNC1-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc +; FUNC1-NEXT: v_sub_i32_e32 v11, vcc, v10, v12 +; FUNC1-NEXT: v_cmp_ge_u32_e32 vcc, v10, v12 +; FUNC1-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc +; FUNC1-NEXT: v_mov_b32_e32 v11, 0 +; FUNC1-NEXT: .LBB6_6: +; FUNC1-NEXT: s_waitcnt vmcnt(0) +; FUNC1-NEXT: v_or_b32_e32 v13, v5, v1 +; FUNC1-NEXT: v_mov_b32_e32 v12, 0 +; FUNC1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[12:13] +; FUNC1-NEXT: s_cbranch_vccz .LBB6_15 +; FUNC1-NEXT: ; %bb.7: +; 
FUNC1-NEXT: v_cvt_f32_u32_e32 v12, v0 +; FUNC1-NEXT: v_cvt_f32_u32_e32 v13, v1 +; FUNC1-NEXT: v_sub_i32_e32 v14, vcc, 0, v0 +; FUNC1-NEXT: v_subb_u32_e32 v15, vcc, 0, v1, vcc +; FUNC1-NEXT: v_mac_f32_e32 v12, 0x4f800000, v13 +; FUNC1-NEXT: v_rcp_f32_e32 v12, v12 +; FUNC1-NEXT: v_mul_f32_e32 v12, 0x5f7ffffc, v12 +; FUNC1-NEXT: v_mul_f32_e32 v13, 0x2f800000, v12 +; FUNC1-NEXT: v_trunc_f32_e32 v13, v13 +; FUNC1-NEXT: v_mac_f32_e32 v12, 0xcf800000, v13 +; FUNC1-NEXT: v_cvt_u32_f32_e32 v13, v13 +; FUNC1-NEXT: v_cvt_u32_f32_e32 v12, v12 +; FUNC1-NEXT: v_mul_lo_u32 v16, v14, v13 +; FUNC1-NEXT: v_mul_hi_u32 v17, v14, v12 +; FUNC1-NEXT: v_mul_lo_u32 v19, v15, v12 +; FUNC1-NEXT: v_mul_lo_u32 v18, v14, v12 +; FUNC1-NEXT: v_add_i32_e32 v16, vcc, v17, v16 +; FUNC1-NEXT: v_add_i32_e32 v16, vcc, v16, v19 +; FUNC1-NEXT: v_mul_hi_u32 v17, v12, v18 +; FUNC1-NEXT: v_mul_lo_u32 v19, v12, v16 +; FUNC1-NEXT: v_mul_hi_u32 v20, v12, v16 +; FUNC1-NEXT: v_mul_hi_u32 v21, v13, v16 +; FUNC1-NEXT: v_mul_lo_u32 v16, v13, v16 +; FUNC1-NEXT: v_add_i32_e32 v17, vcc, v17, v19 +; FUNC1-NEXT: v_addc_u32_e32 v19, vcc, 0, v20, vcc +; FUNC1-NEXT: v_mul_lo_u32 v20, v13, v18 +; FUNC1-NEXT: v_mul_hi_u32 v18, v13, v18 +; FUNC1-NEXT: v_add_i32_e32 v17, vcc, v17, v20 +; FUNC1-NEXT: v_addc_u32_e32 v17, vcc, v19, v18, vcc +; FUNC1-NEXT: v_addc_u32_e32 v18, vcc, 0, v21, vcc +; FUNC1-NEXT: v_add_i32_e32 v16, vcc, v17, v16 +; FUNC1-NEXT: v_addc_u32_e32 v17, vcc, 0, v18, vcc +; FUNC1-NEXT: v_add_i32_e32 v12, vcc, v12, v16 +; FUNC1-NEXT: v_addc_u32_e32 v13, vcc, v13, v17, vcc +; FUNC1-NEXT: v_mul_lo_u32 v16, v14, v13 +; FUNC1-NEXT: v_mul_hi_u32 v17, v14, v12 +; FUNC1-NEXT: v_mul_lo_u32 v15, v15, v12 +; FUNC1-NEXT: v_mul_lo_u32 v14, v14, v12 +; FUNC1-NEXT: v_add_i32_e32 v16, vcc, v17, v16 +; FUNC1-NEXT: v_add_i32_e32 v15, vcc, v16, v15 +; FUNC1-NEXT: v_mul_lo_u32 v18, v12, v15 +; FUNC1-NEXT: v_mul_hi_u32 v19, v12, v14 +; FUNC1-NEXT: v_mul_hi_u32 v20, v12, v15 +; FUNC1-NEXT: v_mul_hi_u32 v17, v13, v14 +; FUNC1-NEXT: 
v_mul_lo_u32 v14, v13, v14 +; FUNC1-NEXT: v_mul_hi_u32 v16, v13, v15 +; FUNC1-NEXT: v_add_i32_e32 v18, vcc, v19, v18 +; FUNC1-NEXT: v_addc_u32_e32 v19, vcc, 0, v20, vcc +; FUNC1-NEXT: v_mul_lo_u32 v15, v13, v15 +; FUNC1-NEXT: v_add_i32_e32 v14, vcc, v18, v14 +; FUNC1-NEXT: v_addc_u32_e32 v14, vcc, v19, v17, vcc +; FUNC1-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc +; FUNC1-NEXT: v_add_i32_e32 v14, vcc, v14, v15 +; FUNC1-NEXT: v_addc_u32_e32 v15, vcc, 0, v16, vcc +; FUNC1-NEXT: v_add_i32_e32 v12, vcc, v12, v14 +; FUNC1-NEXT: v_addc_u32_e32 v13, vcc, v13, v15, vcc +; FUNC1-NEXT: v_mul_lo_u32 v14, v4, v13 +; FUNC1-NEXT: v_mul_hi_u32 v15, v4, v12 +; FUNC1-NEXT: v_mul_hi_u32 v16, v4, v13 +; FUNC1-NEXT: v_mul_hi_u32 v17, v5, v13 +; FUNC1-NEXT: v_mul_lo_u32 v13, v5, v13 +; FUNC1-NEXT: v_add_i32_e32 v14, vcc, v15, v14 +; FUNC1-NEXT: v_addc_u32_e32 v15, vcc, 0, v16, vcc +; FUNC1-NEXT: v_mul_lo_u32 v16, v5, v12 +; FUNC1-NEXT: v_mul_hi_u32 v12, v5, v12 +; FUNC1-NEXT: v_add_i32_e32 v14, vcc, v14, v16 +; FUNC1-NEXT: v_addc_u32_e32 v12, vcc, v15, v12, vcc +; FUNC1-NEXT: v_addc_u32_e32 v14, vcc, 0, v17, vcc +; FUNC1-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; FUNC1-NEXT: v_addc_u32_e32 v13, vcc, 0, v14, vcc +; FUNC1-NEXT: v_mul_lo_u32 v13, v0, v13 +; FUNC1-NEXT: v_mul_hi_u32 v14, v0, v12 +; FUNC1-NEXT: v_mul_lo_u32 v15, v1, v12 +; FUNC1-NEXT: v_mul_lo_u32 v12, v0, v12 +; FUNC1-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; FUNC1-NEXT: v_add_i32_e32 v13, vcc, v13, v15 +; FUNC1-NEXT: v_sub_i32_e32 v14, vcc, v5, v13 +; FUNC1-NEXT: v_sub_i32_e32 v12, vcc, v4, v12 +; FUNC1-NEXT: v_subb_u32_e64 v14, s[0:1], v14, v1, vcc +; FUNC1-NEXT: v_sub_i32_e64 v15, s[0:1], v12, v0 +; FUNC1-NEXT: v_subbrev_u32_e64 v16, s[2:3], 0, v14, s[0:1] +; FUNC1-NEXT: v_cmp_ge_u32_e64 s[2:3], v16, v1 +; FUNC1-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[2:3] +; FUNC1-NEXT: v_cmp_ge_u32_e64 s[2:3], v15, v0 +; FUNC1-NEXT: v_cndmask_b32_e64 v18, 0, -1, s[2:3] +; FUNC1-NEXT: v_cmp_eq_u32_e64 s[2:3], v16, v1 +; FUNC1-NEXT: 
v_subb_u32_e64 v14, s[0:1], v14, v1, s[0:1] +; FUNC1-NEXT: v_cndmask_b32_e64 v17, v17, v18, s[2:3] +; FUNC1-NEXT: v_sub_i32_e64 v18, s[0:1], v15, v0 +; FUNC1-NEXT: v_subb_u32_e32 v5, vcc, v5, v13, vcc +; FUNC1-NEXT: v_subbrev_u32_e64 v14, s[0:1], 0, v14, s[0:1] +; FUNC1-NEXT: v_cmp_ge_u32_e32 vcc, v5, v1 +; FUNC1-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v17 +; FUNC1-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc +; FUNC1-NEXT: v_cmp_ge_u32_e32 vcc, v12, v0 +; FUNC1-NEXT: v_cndmask_b32_e64 v14, v16, v14, s[0:1] +; FUNC1-NEXT: v_cndmask_b32_e64 v16, 0, -1, vcc +; FUNC1-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; FUNC1-NEXT: v_cndmask_b32_e32 v1, v13, v16, vcc +; FUNC1-NEXT: v_cndmask_b32_e64 v15, v15, v18, s[0:1] +; FUNC1-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; FUNC1-NEXT: v_cndmask_b32_e32 v13, v5, v14, vcc +; FUNC1-NEXT: v_cndmask_b32_e32 v12, v12, v15, vcc +; FUNC1-NEXT: s_cbranch_execnz .LBB6_9 +; FUNC1-NEXT: .LBB6_8: +; FUNC1-NEXT: v_cvt_f32_u32_e32 v1, v0 +; FUNC1-NEXT: v_sub_i32_e32 v5, vcc, 0, v0 +; FUNC1-NEXT: v_mov_b32_e32 v13, 0 +; FUNC1-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; FUNC1-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 +; FUNC1-NEXT: v_cvt_u32_f32_e32 v1, v1 +; FUNC1-NEXT: v_mul_lo_u32 v5, v5, v1 +; FUNC1-NEXT: v_mul_hi_u32 v5, v1, v5 +; FUNC1-NEXT: v_add_i32_e32 v1, vcc, v1, v5 +; FUNC1-NEXT: v_mul_hi_u32 v1, v4, v1 +; FUNC1-NEXT: v_mul_lo_u32 v1, v1, v0 +; FUNC1-NEXT: v_sub_i32_e32 v1, vcc, v4, v1 +; FUNC1-NEXT: v_sub_i32_e32 v4, vcc, v1, v0 +; FUNC1-NEXT: v_cmp_ge_u32_e32 vcc, v1, v0 +; FUNC1-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; FUNC1-NEXT: v_sub_i32_e32 v4, vcc, v1, v0 +; FUNC1-NEXT: v_cmp_ge_u32_e32 vcc, v1, v0 +; FUNC1-NEXT: v_cndmask_b32_e32 v12, v1, v4, vcc +; FUNC1-NEXT: .LBB6_9: +; FUNC1-NEXT: v_or_b32_e32 v1, v7, v3 +; FUNC1-NEXT: v_mov_b32_e32 v0, 0 +; FUNC1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; FUNC1-NEXT: s_cbranch_vccz .LBB6_16 +; FUNC1-NEXT: ; %bb.10: +; FUNC1-NEXT: v_cvt_f32_u32_e32 v0, v2 +; FUNC1-NEXT: v_cvt_f32_u32_e32 v1, v3 +; FUNC1-NEXT: 
v_sub_i32_e32 v4, vcc, 0, v2 +; FUNC1-NEXT: v_subb_u32_e32 v5, vcc, 0, v3, vcc +; FUNC1-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 +; FUNC1-NEXT: v_rcp_f32_e32 v0, v0 +; FUNC1-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 +; FUNC1-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; FUNC1-NEXT: v_trunc_f32_e32 v1, v1 +; FUNC1-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 +; FUNC1-NEXT: v_cvt_u32_f32_e32 v1, v1 +; FUNC1-NEXT: v_cvt_u32_f32_e32 v0, v0 +; FUNC1-NEXT: v_mul_lo_u32 v14, v4, v1 +; FUNC1-NEXT: v_mul_hi_u32 v15, v4, v0 +; FUNC1-NEXT: v_mul_lo_u32 v17, v5, v0 +; FUNC1-NEXT: v_mul_lo_u32 v16, v4, v0 +; FUNC1-NEXT: v_add_i32_e32 v14, vcc, v15, v14 +; FUNC1-NEXT: v_add_i32_e32 v14, vcc, v14, v17 +; FUNC1-NEXT: v_mul_hi_u32 v15, v0, v16 +; FUNC1-NEXT: v_mul_lo_u32 v17, v0, v14 +; FUNC1-NEXT: v_mul_hi_u32 v18, v0, v14 +; FUNC1-NEXT: v_mul_hi_u32 v19, v1, v14 +; FUNC1-NEXT: v_mul_lo_u32 v14, v1, v14 +; FUNC1-NEXT: v_add_i32_e32 v15, vcc, v15, v17 +; FUNC1-NEXT: v_addc_u32_e32 v17, vcc, 0, v18, vcc +; FUNC1-NEXT: v_mul_lo_u32 v18, v1, v16 +; FUNC1-NEXT: v_mul_hi_u32 v16, v1, v16 +; FUNC1-NEXT: v_add_i32_e32 v15, vcc, v15, v18 +; FUNC1-NEXT: v_addc_u32_e32 v15, vcc, v17, v16, vcc +; FUNC1-NEXT: v_addc_u32_e32 v16, vcc, 0, v19, vcc +; FUNC1-NEXT: v_add_i32_e32 v14, vcc, v15, v14 +; FUNC1-NEXT: v_addc_u32_e32 v15, vcc, 0, v16, vcc +; FUNC1-NEXT: v_add_i32_e32 v0, vcc, v0, v14 +; FUNC1-NEXT: v_addc_u32_e32 v1, vcc, v1, v15, vcc +; FUNC1-NEXT: v_mul_lo_u32 v14, v4, v1 +; FUNC1-NEXT: v_mul_hi_u32 v15, v4, v0 +; FUNC1-NEXT: v_mul_lo_u32 v5, v5, v0 +; FUNC1-NEXT: v_mul_lo_u32 v4, v4, v0 +; FUNC1-NEXT: v_add_i32_e32 v14, vcc, v15, v14 +; FUNC1-NEXT: v_add_i32_e32 v5, vcc, v14, v5 +; FUNC1-NEXT: v_mul_lo_u32 v16, v0, v5 +; FUNC1-NEXT: v_mul_hi_u32 v17, v0, v4 +; FUNC1-NEXT: v_mul_hi_u32 v18, v0, v5 +; FUNC1-NEXT: v_mul_hi_u32 v15, v1, v4 +; FUNC1-NEXT: v_mul_lo_u32 v4, v1, v4 +; FUNC1-NEXT: v_mul_hi_u32 v14, v1, v5 +; FUNC1-NEXT: v_add_i32_e32 v16, vcc, v17, v16 +; FUNC1-NEXT: v_addc_u32_e32 v17, vcc, 
0, v18, vcc +; FUNC1-NEXT: v_mul_lo_u32 v5, v1, v5 +; FUNC1-NEXT: v_add_i32_e32 v4, vcc, v16, v4 +; FUNC1-NEXT: v_addc_u32_e32 v4, vcc, v17, v15, vcc +; FUNC1-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc +; FUNC1-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; FUNC1-NEXT: v_addc_u32_e32 v5, vcc, 0, v14, vcc +; FUNC1-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; FUNC1-NEXT: v_addc_u32_e32 v1, vcc, v1, v5, vcc +; FUNC1-NEXT: v_mul_lo_u32 v4, v6, v1 +; FUNC1-NEXT: v_mul_hi_u32 v5, v6, v0 +; FUNC1-NEXT: v_mul_hi_u32 v14, v6, v1 +; FUNC1-NEXT: v_mul_hi_u32 v15, v7, v1 +; FUNC1-NEXT: v_mul_lo_u32 v1, v7, v1 +; FUNC1-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; FUNC1-NEXT: v_addc_u32_e32 v5, vcc, 0, v14, vcc +; FUNC1-NEXT: v_mul_lo_u32 v14, v7, v0 +; FUNC1-NEXT: v_mul_hi_u32 v0, v7, v0 +; FUNC1-NEXT: v_add_i32_e32 v4, vcc, v4, v14 +; FUNC1-NEXT: v_addc_u32_e32 v0, vcc, v5, v0, vcc +; FUNC1-NEXT: v_addc_u32_e32 v4, vcc, 0, v15, vcc +; FUNC1-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; FUNC1-NEXT: v_addc_u32_e32 v1, vcc, 0, v4, vcc +; FUNC1-NEXT: v_mul_lo_u32 v1, v2, v1 +; FUNC1-NEXT: v_mul_hi_u32 v4, v2, v0 +; FUNC1-NEXT: v_mul_lo_u32 v5, v3, v0 +; FUNC1-NEXT: v_mul_lo_u32 v0, v2, v0 +; FUNC1-NEXT: v_add_i32_e32 v1, vcc, v4, v1 +; FUNC1-NEXT: v_add_i32_e32 v1, vcc, v1, v5 +; FUNC1-NEXT: v_sub_i32_e32 v4, vcc, v7, v1 +; FUNC1-NEXT: v_sub_i32_e32 v0, vcc, v6, v0 +; FUNC1-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v3, vcc +; FUNC1-NEXT: v_sub_i32_e64 v5, s[0:1], v0, v2 +; FUNC1-NEXT: v_subbrev_u32_e64 v14, s[2:3], 0, v4, s[0:1] +; FUNC1-NEXT: v_cmp_ge_u32_e64 s[2:3], v14, v3 +; FUNC1-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[2:3] +; FUNC1-NEXT: v_cmp_ge_u32_e64 s[2:3], v5, v2 +; FUNC1-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[2:3] +; FUNC1-NEXT: v_cmp_eq_u32_e64 s[2:3], v14, v3 +; FUNC1-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v3, s[0:1] +; FUNC1-NEXT: v_cndmask_b32_e64 v15, v15, v16, s[2:3] +; FUNC1-NEXT: v_sub_i32_e64 v16, s[0:1], v5, v2 +; FUNC1-NEXT: v_subb_u32_e32 v1, vcc, v7, v1, vcc +; FUNC1-NEXT: v_subbrev_u32_e64 
v4, s[0:1], 0, v4, s[0:1] +; FUNC1-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 +; FUNC1-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v15 +; FUNC1-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc +; FUNC1-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 +; FUNC1-NEXT: v_cndmask_b32_e64 v4, v14, v4, s[0:1] +; FUNC1-NEXT: v_cndmask_b32_e64 v14, 0, -1, vcc +; FUNC1-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 +; FUNC1-NEXT: v_cndmask_b32_e32 v3, v7, v14, vcc +; FUNC1-NEXT: v_cndmask_b32_e64 v5, v5, v16, s[0:1] +; FUNC1-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; FUNC1-NEXT: v_cndmask_b32_e32 v15, v1, v4, vcc +; FUNC1-NEXT: v_cndmask_b32_e32 v14, v0, v5, vcc +; FUNC1-NEXT: s_cbranch_execnz .LBB6_12 +; FUNC1-NEXT: .LBB6_11: +; FUNC1-NEXT: v_cvt_f32_u32_e32 v0, v2 +; FUNC1-NEXT: v_sub_i32_e32 v1, vcc, 0, v2 +; FUNC1-NEXT: v_mov_b32_e32 v15, 0 +; FUNC1-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; FUNC1-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; FUNC1-NEXT: v_cvt_u32_f32_e32 v0, v0 +; FUNC1-NEXT: v_mul_lo_u32 v1, v1, v0 +; FUNC1-NEXT: v_mul_hi_u32 v1, v0, v1 +; FUNC1-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; FUNC1-NEXT: v_mul_hi_u32 v0, v6, v0 +; FUNC1-NEXT: v_mul_lo_u32 v0, v0, v2 +; FUNC1-NEXT: v_sub_i32_e32 v0, vcc, v6, v0 +; FUNC1-NEXT: v_sub_i32_e32 v1, vcc, v0, v2 +; FUNC1-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 +; FUNC1-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; FUNC1-NEXT: v_sub_i32_e32 v1, vcc, v0, v2 +; FUNC1-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 +; FUNC1-NEXT: v_cndmask_b32_e32 v14, v0, v1, vcc +; FUNC1-NEXT: .LBB6_12: +; FUNC1-NEXT: s_mov_b32 s7, 0xf000 +; FUNC1-NEXT: s_mov_b32 s6, -1 +; FUNC1-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:16 +; FUNC1-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; FUNC1-NEXT: s_endpgm +; FUNC1-NEXT: .LBB6_13: +; FUNC1-NEXT: ; implicit-def: $vgpr8_vgpr9 +; FUNC1-NEXT: s_branch .LBB6_2 +; FUNC1-NEXT: .LBB6_14: +; FUNC1-NEXT: s_branch .LBB6_5 +; FUNC1-NEXT: .LBB6_15: +; FUNC1-NEXT: ; implicit-def: $vgpr12_vgpr13 +; FUNC1-NEXT: s_branch .LBB6_8 +; FUNC1-NEXT: .LBB6_16: +; FUNC1-NEXT: s_branch 
.LBB6_11 +; +; FUNC2-LABEL: test_urem_v4i64: +; FUNC2: ; %bb.0: +; FUNC2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; FUNC2-NEXT: s_mov_b32 s3, 0xf000 +; FUNC2-NEXT: s_mov_b32 s2, -1 +; FUNC2-NEXT: v_mov_b32_e32 v8, 0 +; FUNC2-NEXT: s_waitcnt lgkmcnt(0) +; FUNC2-NEXT: s_mov_b32 s0, s6 +; FUNC2-NEXT: s_mov_b32 s1, s7 +; FUNC2-NEXT: buffer_load_dwordx4 v[10:13], off, s[0:3], 0 offset:32 +; FUNC2-NEXT: buffer_load_dwordx4 v[14:17], off, s[0:3], 0 +; FUNC2-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; FUNC2-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; FUNC2-NEXT: s_waitcnt vmcnt(2) +; FUNC2-NEXT: v_or_b32_e32 v9, v15, v11 +; FUNC2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] +; FUNC2-NEXT: s_cbranch_vccz .LBB6_13 +; FUNC2-NEXT: ; %bb.1: +; FUNC2-NEXT: v_cvt_f32_u32_e32 v8, v10 +; FUNC2-NEXT: v_cvt_f32_u32_e32 v9, v11 +; FUNC2-NEXT: v_sub_u32_e32 v22, vcc, 0, v10 +; FUNC2-NEXT: v_subb_u32_e32 v23, vcc, 0, v11, vcc +; FUNC2-NEXT: v_mac_f32_e32 v8, 0x4f800000, v9 +; FUNC2-NEXT: v_rcp_f32_e32 v8, v8 +; FUNC2-NEXT: v_mul_f32_e32 v8, 0x5f7ffffc, v8 +; FUNC2-NEXT: v_mul_f32_e32 v9, 0x2f800000, v8 +; FUNC2-NEXT: v_trunc_f32_e32 v9, v9 +; FUNC2-NEXT: v_mac_f32_e32 v8, 0xcf800000, v9 +; FUNC2-NEXT: v_cvt_u32_f32_e32 v20, v9 +; FUNC2-NEXT: v_cvt_u32_f32_e32 v21, v8 +; FUNC2-NEXT: v_mul_lo_u32 v18, v22, v20 +; FUNC2-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v22, v21, 0 +; FUNC2-NEXT: v_mul_lo_u32 v19, v23, v21 +; FUNC2-NEXT: v_add_u32_e32 v9, vcc, v9, v18 +; FUNC2-NEXT: v_add_u32_e32 v25, vcc, v9, v19 +; FUNC2-NEXT: v_mul_hi_u32 v24, v21, v8 +; FUNC2-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v21, v25, 0 +; FUNC2-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v20, v8, 0 +; FUNC2-NEXT: v_add_u32_e32 v24, vcc, v24, v18 +; FUNC2-NEXT: v_addc_u32_e32 v26, vcc, 0, v19, vcc +; FUNC2-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v20, v25, 0 +; FUNC2-NEXT: v_add_u32_e32 v8, vcc, v24, v8 +; FUNC2-NEXT: v_addc_u32_e32 v8, vcc, v26, v9, vcc +; FUNC2-NEXT: v_addc_u32_e32 v9, vcc, 0, v19, vcc +; 
FUNC2-NEXT: v_add_u32_e32 v8, vcc, v8, v18 +; FUNC2-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; FUNC2-NEXT: v_add_u32_e32 v24, vcc, v21, v8 +; FUNC2-NEXT: v_addc_u32_e32 v25, vcc, v20, v9, vcc +; FUNC2-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v22, v24, 0 +; FUNC2-NEXT: v_mul_lo_u32 v20, v22, v25 +; FUNC2-NEXT: v_mul_lo_u32 v21, v23, v24 +; FUNC2-NEXT: v_mul_hi_u32 v22, v24, v8 +; FUNC2-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v25, v8, 0 +; FUNC2-NEXT: v_add_u32_e32 v9, vcc, v9, v20 +; FUNC2-NEXT: v_add_u32_e32 v9, vcc, v9, v21 +; FUNC2-NEXT: v_mad_u64_u32 v[20:21], s[0:1], v24, v9, 0 +; FUNC2-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v25, v9, 0 +; FUNC2-NEXT: v_add_u32_e32 v20, vcc, v22, v20 +; FUNC2-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; FUNC2-NEXT: v_add_u32_e32 v18, vcc, v20, v18 +; FUNC2-NEXT: v_addc_u32_e32 v18, vcc, v21, v19, vcc +; FUNC2-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; FUNC2-NEXT: v_add_u32_e32 v8, vcc, v18, v8 +; FUNC2-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; FUNC2-NEXT: v_add_u32_e32 v18, vcc, v24, v8 +; FUNC2-NEXT: v_addc_u32_e32 v19, vcc, v25, v9, vcc +; FUNC2-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v14, v19, 0 +; FUNC2-NEXT: v_mul_hi_u32 v20, v14, v18 +; FUNC2-NEXT: v_add_u32_e32 v20, vcc, v20, v8 +; FUNC2-NEXT: v_addc_u32_e32 v21, vcc, 0, v9, vcc +; FUNC2-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v15, v18, 0 +; FUNC2-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v15, v19, 0 +; FUNC2-NEXT: v_add_u32_e32 v8, vcc, v20, v8 +; FUNC2-NEXT: v_addc_u32_e32 v8, vcc, v21, v9, vcc +; FUNC2-NEXT: v_addc_u32_e32 v9, vcc, 0, v19, vcc +; FUNC2-NEXT: v_add_u32_e32 v18, vcc, v8, v18 +; FUNC2-NEXT: v_addc_u32_e32 v8, vcc, 0, v9, vcc +; FUNC2-NEXT: v_mul_lo_u32 v19, v10, v8 +; FUNC2-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v10, v18, 0 +; FUNC2-NEXT: v_mul_lo_u32 v18, v11, v18 +; FUNC2-NEXT: v_add_u32_e32 v9, vcc, v9, v19 +; FUNC2-NEXT: v_add_u32_e32 v9, vcc, v9, v18 +; FUNC2-NEXT: v_sub_u32_e32 v18, vcc, v15, v9 +; FUNC2-NEXT: v_sub_u32_e32 v8, vcc, v14, v8 +; FUNC2-NEXT: 
v_subb_u32_e64 v18, s[0:1], v18, v11, vcc +; FUNC2-NEXT: v_sub_u32_e64 v19, s[0:1], v8, v10 +; FUNC2-NEXT: v_subbrev_u32_e64 v20, s[2:3], 0, v18, s[0:1] +; FUNC2-NEXT: v_cmp_ge_u32_e64 s[2:3], v20, v11 +; FUNC2-NEXT: v_cndmask_b32_e64 v21, 0, -1, s[2:3] +; FUNC2-NEXT: v_cmp_ge_u32_e64 s[2:3], v19, v10 +; FUNC2-NEXT: v_cndmask_b32_e64 v22, 0, -1, s[2:3] +; FUNC2-NEXT: v_cmp_eq_u32_e64 s[2:3], v20, v11 +; FUNC2-NEXT: v_subb_u32_e64 v18, s[0:1], v18, v11, s[0:1] +; FUNC2-NEXT: v_cndmask_b32_e64 v21, v21, v22, s[2:3] +; FUNC2-NEXT: v_sub_u32_e64 v22, s[0:1], v19, v10 +; FUNC2-NEXT: v_subb_u32_e32 v9, vcc, v15, v9, vcc +; FUNC2-NEXT: v_subbrev_u32_e64 v18, s[0:1], 0, v18, s[0:1] +; FUNC2-NEXT: v_cmp_ge_u32_e32 vcc, v9, v11 +; FUNC2-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v21 +; FUNC2-NEXT: v_cndmask_b32_e64 v15, 0, -1, vcc +; FUNC2-NEXT: v_cmp_ge_u32_e32 vcc, v8, v10 +; FUNC2-NEXT: v_cndmask_b32_e64 v18, v20, v18, s[0:1] +; FUNC2-NEXT: v_cndmask_b32_e64 v20, 0, -1, vcc +; FUNC2-NEXT: v_cmp_eq_u32_e32 vcc, v9, v11 +; FUNC2-NEXT: v_cndmask_b32_e32 v11, v15, v20, vcc +; FUNC2-NEXT: v_cndmask_b32_e64 v19, v19, v22, s[0:1] +; FUNC2-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; FUNC2-NEXT: v_cndmask_b32_e32 v9, v9, v18, vcc +; FUNC2-NEXT: v_cndmask_b32_e32 v8, v8, v19, vcc +; FUNC2-NEXT: s_cbranch_execnz .LBB6_3 +; FUNC2-NEXT: .LBB6_2: +; FUNC2-NEXT: v_cvt_f32_u32_e32 v8, v10 +; FUNC2-NEXT: v_sub_u32_e32 v9, vcc, 0, v10 +; FUNC2-NEXT: v_rcp_iflag_f32_e32 v8, v8 +; FUNC2-NEXT: v_mul_f32_e32 v8, 0x4f7ffffe, v8 +; FUNC2-NEXT: v_cvt_u32_f32_e32 v8, v8 +; FUNC2-NEXT: v_mul_lo_u32 v9, v9, v8 +; FUNC2-NEXT: v_mul_hi_u32 v9, v8, v9 +; FUNC2-NEXT: v_add_u32_e32 v8, vcc, v8, v9 +; FUNC2-NEXT: v_mul_hi_u32 v8, v14, v8 +; FUNC2-NEXT: v_mul_lo_u32 v8, v8, v10 +; FUNC2-NEXT: v_sub_u32_e32 v8, vcc, v14, v8 +; FUNC2-NEXT: v_sub_u32_e32 v9, vcc, v8, v10 +; FUNC2-NEXT: v_cmp_ge_u32_e32 vcc, v8, v10 +; FUNC2-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc +; FUNC2-NEXT: v_sub_u32_e32 v9, vcc, v8, v10 +; FUNC2-NEXT: 
v_cmp_ge_u32_e32 vcc, v8, v10 +; FUNC2-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc +; FUNC2-NEXT: v_mov_b32_e32 v9, 0 +; FUNC2-NEXT: .LBB6_3: +; FUNC2-NEXT: v_or_b32_e32 v11, v17, v13 +; FUNC2-NEXT: v_mov_b32_e32 v10, 0 +; FUNC2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] +; FUNC2-NEXT: s_cbranch_vccz .LBB6_14 +; FUNC2-NEXT: ; %bb.4: +; FUNC2-NEXT: v_cvt_f32_u32_e32 v10, v12 +; FUNC2-NEXT: v_cvt_f32_u32_e32 v11, v13 +; FUNC2-NEXT: v_sub_u32_e32 v20, vcc, 0, v12 +; FUNC2-NEXT: v_subb_u32_e32 v21, vcc, 0, v13, vcc +; FUNC2-NEXT: v_mac_f32_e32 v10, 0x4f800000, v11 +; FUNC2-NEXT: v_rcp_f32_e32 v10, v10 +; FUNC2-NEXT: v_mul_f32_e32 v10, 0x5f7ffffc, v10 +; FUNC2-NEXT: v_mul_f32_e32 v11, 0x2f800000, v10 +; FUNC2-NEXT: v_trunc_f32_e32 v11, v11 +; FUNC2-NEXT: v_mac_f32_e32 v10, 0xcf800000, v11 +; FUNC2-NEXT: v_cvt_u32_f32_e32 v18, v11 +; FUNC2-NEXT: v_cvt_u32_f32_e32 v19, v10 +; FUNC2-NEXT: v_mul_lo_u32 v14, v20, v18 +; FUNC2-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v20, v19, 0 +; FUNC2-NEXT: v_mul_lo_u32 v15, v21, v19 +; FUNC2-NEXT: v_add_u32_e32 v11, vcc, v11, v14 +; FUNC2-NEXT: v_add_u32_e32 v23, vcc, v11, v15 +; FUNC2-NEXT: v_mul_hi_u32 v22, v19, v10 +; FUNC2-NEXT: v_mad_u64_u32 v[14:15], s[0:1], v19, v23, 0 +; FUNC2-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v18, v10, 0 +; FUNC2-NEXT: v_add_u32_e32 v22, vcc, v22, v14 +; FUNC2-NEXT: v_addc_u32_e32 v24, vcc, 0, v15, vcc +; FUNC2-NEXT: v_mad_u64_u32 v[14:15], s[0:1], v18, v23, 0 +; FUNC2-NEXT: v_add_u32_e32 v10, vcc, v22, v10 +; FUNC2-NEXT: v_addc_u32_e32 v10, vcc, v24, v11, vcc +; FUNC2-NEXT: v_addc_u32_e32 v11, vcc, 0, v15, vcc +; FUNC2-NEXT: v_add_u32_e32 v10, vcc, v10, v14 +; FUNC2-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; FUNC2-NEXT: v_add_u32_e32 v22, vcc, v19, v10 +; FUNC2-NEXT: v_addc_u32_e32 v23, vcc, v18, v11, vcc +; FUNC2-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v20, v22, 0 +; FUNC2-NEXT: v_mul_lo_u32 v18, v20, v23 +; FUNC2-NEXT: v_mul_lo_u32 v19, v21, v22 +; FUNC2-NEXT: v_mul_hi_u32 v20, v22, v10 +; FUNC2-NEXT: v_mad_u64_u32 
v[14:15], s[0:1], v23, v10, 0 +; FUNC2-NEXT: v_add_u32_e32 v11, vcc, v11, v18 +; FUNC2-NEXT: v_add_u32_e32 v11, vcc, v11, v19 +; FUNC2-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v22, v11, 0 +; FUNC2-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v23, v11, 0 +; FUNC2-NEXT: v_add_u32_e32 v18, vcc, v20, v18 +; FUNC2-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; FUNC2-NEXT: v_add_u32_e32 v14, vcc, v18, v14 +; FUNC2-NEXT: v_addc_u32_e32 v14, vcc, v19, v15, vcc +; FUNC2-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; FUNC2-NEXT: v_add_u32_e32 v10, vcc, v14, v10 +; FUNC2-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; FUNC2-NEXT: v_add_u32_e32 v14, vcc, v22, v10 +; FUNC2-NEXT: v_addc_u32_e32 v15, vcc, v23, v11, vcc +; FUNC2-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v16, v15, 0 +; FUNC2-NEXT: v_mul_hi_u32 v18, v16, v14 +; FUNC2-NEXT: v_add_u32_e32 v18, vcc, v18, v10 +; FUNC2-NEXT: v_addc_u32_e32 v19, vcc, 0, v11, vcc +; FUNC2-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v17, v14, 0 +; FUNC2-NEXT: v_mad_u64_u32 v[14:15], s[0:1], v17, v15, 0 +; FUNC2-NEXT: v_add_u32_e32 v10, vcc, v18, v10 +; FUNC2-NEXT: v_addc_u32_e32 v10, vcc, v19, v11, vcc +; FUNC2-NEXT: v_addc_u32_e32 v11, vcc, 0, v15, vcc +; FUNC2-NEXT: v_add_u32_e32 v14, vcc, v10, v14 +; FUNC2-NEXT: v_addc_u32_e32 v10, vcc, 0, v11, vcc +; FUNC2-NEXT: v_mul_lo_u32 v15, v12, v10 +; FUNC2-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v12, v14, 0 +; FUNC2-NEXT: v_mul_lo_u32 v14, v13, v14 +; FUNC2-NEXT: v_add_u32_e32 v11, vcc, v11, v15 +; FUNC2-NEXT: v_add_u32_e32 v11, vcc, v11, v14 +; FUNC2-NEXT: v_sub_u32_e32 v14, vcc, v17, v11 +; FUNC2-NEXT: v_sub_u32_e32 v10, vcc, v16, v10 +; FUNC2-NEXT: v_subb_u32_e64 v14, s[0:1], v14, v13, vcc +; FUNC2-NEXT: v_sub_u32_e64 v15, s[0:1], v10, v12 +; FUNC2-NEXT: v_subbrev_u32_e64 v18, s[2:3], 0, v14, s[0:1] +; FUNC2-NEXT: v_cmp_ge_u32_e64 s[2:3], v18, v13 +; FUNC2-NEXT: v_cndmask_b32_e64 v19, 0, -1, s[2:3] +; FUNC2-NEXT: v_cmp_ge_u32_e64 s[2:3], v15, v12 +; FUNC2-NEXT: v_cndmask_b32_e64 v20, 0, -1, s[2:3] +; FUNC2-NEXT: 
v_cmp_eq_u32_e64 s[2:3], v18, v13 +; FUNC2-NEXT: v_subb_u32_e64 v14, s[0:1], v14, v13, s[0:1] +; FUNC2-NEXT: v_cndmask_b32_e64 v19, v19, v20, s[2:3] +; FUNC2-NEXT: v_sub_u32_e64 v20, s[0:1], v15, v12 +; FUNC2-NEXT: v_subb_u32_e32 v11, vcc, v17, v11, vcc +; FUNC2-NEXT: v_subbrev_u32_e64 v14, s[0:1], 0, v14, s[0:1] +; FUNC2-NEXT: v_cmp_ge_u32_e32 vcc, v11, v13 +; FUNC2-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v19 +; FUNC2-NEXT: v_cndmask_b32_e64 v17, 0, -1, vcc +; FUNC2-NEXT: v_cmp_ge_u32_e32 vcc, v10, v12 +; FUNC2-NEXT: v_cndmask_b32_e64 v14, v18, v14, s[0:1] +; FUNC2-NEXT: v_cndmask_b32_e64 v18, 0, -1, vcc +; FUNC2-NEXT: v_cmp_eq_u32_e32 vcc, v11, v13 +; FUNC2-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc +; FUNC2-NEXT: v_cndmask_b32_e64 v15, v15, v20, s[0:1] +; FUNC2-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; FUNC2-NEXT: v_cndmask_b32_e32 v11, v11, v14, vcc +; FUNC2-NEXT: v_cndmask_b32_e32 v10, v10, v15, vcc +; FUNC2-NEXT: s_cbranch_execnz .LBB6_6 +; FUNC2-NEXT: .LBB6_5: +; FUNC2-NEXT: v_cvt_f32_u32_e32 v10, v12 +; FUNC2-NEXT: v_sub_u32_e32 v11, vcc, 0, v12 +; FUNC2-NEXT: v_rcp_iflag_f32_e32 v10, v10 +; FUNC2-NEXT: v_mul_f32_e32 v10, 0x4f7ffffe, v10 +; FUNC2-NEXT: v_cvt_u32_f32_e32 v10, v10 +; FUNC2-NEXT: v_mul_lo_u32 v11, v11, v10 +; FUNC2-NEXT: v_mul_hi_u32 v11, v10, v11 +; FUNC2-NEXT: v_add_u32_e32 v10, vcc, v10, v11 +; FUNC2-NEXT: v_mul_hi_u32 v10, v16, v10 +; FUNC2-NEXT: v_mul_lo_u32 v10, v10, v12 +; FUNC2-NEXT: v_sub_u32_e32 v10, vcc, v16, v10 +; FUNC2-NEXT: v_sub_u32_e32 v11, vcc, v10, v12 +; FUNC2-NEXT: v_cmp_ge_u32_e32 vcc, v10, v12 +; FUNC2-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc +; FUNC2-NEXT: v_sub_u32_e32 v11, vcc, v10, v12 +; FUNC2-NEXT: v_cmp_ge_u32_e32 vcc, v10, v12 +; FUNC2-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc +; FUNC2-NEXT: v_mov_b32_e32 v11, 0 +; FUNC2-NEXT: .LBB6_6: +; FUNC2-NEXT: s_waitcnt vmcnt(0) +; FUNC2-NEXT: v_or_b32_e32 v13, v5, v1 +; FUNC2-NEXT: v_mov_b32_e32 v12, 0 +; FUNC2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[12:13] +; FUNC2-NEXT: 
s_cbranch_vccz .LBB6_15 +; FUNC2-NEXT: ; %bb.7: +; FUNC2-NEXT: v_cvt_f32_u32_e32 v12, v0 +; FUNC2-NEXT: v_cvt_f32_u32_e32 v13, v1 +; FUNC2-NEXT: v_sub_u32_e32 v18, vcc, 0, v0 +; FUNC2-NEXT: v_subb_u32_e32 v19, vcc, 0, v1, vcc +; FUNC2-NEXT: v_mac_f32_e32 v12, 0x4f800000, v13 +; FUNC2-NEXT: v_rcp_f32_e32 v12, v12 +; FUNC2-NEXT: v_mul_f32_e32 v12, 0x5f7ffffc, v12 +; FUNC2-NEXT: v_mul_f32_e32 v13, 0x2f800000, v12 +; FUNC2-NEXT: v_trunc_f32_e32 v13, v13 +; FUNC2-NEXT: v_mac_f32_e32 v12, 0xcf800000, v13 +; FUNC2-NEXT: v_cvt_u32_f32_e32 v16, v13 +; FUNC2-NEXT: v_cvt_u32_f32_e32 v17, v12 +; FUNC2-NEXT: v_mul_lo_u32 v14, v18, v16 +; FUNC2-NEXT: v_mad_u64_u32 v[12:13], s[0:1], v18, v17, 0 +; FUNC2-NEXT: v_mul_lo_u32 v15, v19, v17 +; FUNC2-NEXT: v_add_u32_e32 v13, vcc, v13, v14 +; FUNC2-NEXT: v_add_u32_e32 v15, vcc, v13, v15 +; FUNC2-NEXT: v_mul_hi_u32 v20, v17, v12 +; FUNC2-NEXT: v_mad_u64_u32 v[13:14], s[0:1], v17, v15, 0 +; FUNC2-NEXT: v_add_u32_e32 v20, vcc, v20, v13 +; FUNC2-NEXT: v_mad_u64_u32 v[12:13], s[0:1], v16, v12, 0 +; FUNC2-NEXT: v_addc_u32_e32 v21, vcc, 0, v14, vcc +; FUNC2-NEXT: v_mad_u64_u32 v[14:15], s[0:1], v16, v15, 0 +; FUNC2-NEXT: v_add_u32_e32 v12, vcc, v20, v12 +; FUNC2-NEXT: v_addc_u32_e32 v12, vcc, v21, v13, vcc +; FUNC2-NEXT: v_addc_u32_e32 v13, vcc, 0, v15, vcc +; FUNC2-NEXT: v_add_u32_e32 v12, vcc, v12, v14 +; FUNC2-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; FUNC2-NEXT: v_add_u32_e32 v20, vcc, v17, v12 +; FUNC2-NEXT: v_addc_u32_e32 v21, vcc, v16, v13, vcc +; FUNC2-NEXT: v_mad_u64_u32 v[12:13], s[0:1], v18, v20, 0 +; FUNC2-NEXT: v_mul_lo_u32 v16, v18, v21 +; FUNC2-NEXT: v_mul_lo_u32 v17, v19, v20 +; FUNC2-NEXT: v_mul_hi_u32 v18, v20, v12 +; FUNC2-NEXT: v_mad_u64_u32 v[14:15], s[0:1], v21, v12, 0 +; FUNC2-NEXT: v_add_u32_e32 v13, vcc, v13, v16 +; FUNC2-NEXT: v_add_u32_e32 v13, vcc, v13, v17 +; FUNC2-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v20, v13, 0 +; FUNC2-NEXT: v_mad_u64_u32 v[12:13], s[0:1], v21, v13, 0 +; FUNC2-NEXT: v_add_u32_e32 v16, vcc, 
v18, v16 +; FUNC2-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; FUNC2-NEXT: v_add_u32_e32 v14, vcc, v16, v14 +; FUNC2-NEXT: v_addc_u32_e32 v14, vcc, v17, v15, vcc +; FUNC2-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; FUNC2-NEXT: v_add_u32_e32 v12, vcc, v14, v12 +; FUNC2-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; FUNC2-NEXT: v_add_u32_e32 v14, vcc, v20, v12 +; FUNC2-NEXT: v_addc_u32_e32 v15, vcc, v21, v13, vcc +; FUNC2-NEXT: v_mad_u64_u32 v[12:13], s[0:1], v4, v15, 0 +; FUNC2-NEXT: v_mul_hi_u32 v16, v4, v14 +; FUNC2-NEXT: v_add_u32_e32 v16, vcc, v16, v12 +; FUNC2-NEXT: v_addc_u32_e32 v17, vcc, 0, v13, vcc +; FUNC2-NEXT: v_mad_u64_u32 v[12:13], s[0:1], v5, v14, 0 +; FUNC2-NEXT: v_mad_u64_u32 v[14:15], s[0:1], v5, v15, 0 +; FUNC2-NEXT: v_add_u32_e32 v12, vcc, v16, v12 +; FUNC2-NEXT: v_addc_u32_e32 v12, vcc, v17, v13, vcc +; FUNC2-NEXT: v_addc_u32_e32 v13, vcc, 0, v15, vcc +; FUNC2-NEXT: v_add_u32_e32 v14, vcc, v12, v14 +; FUNC2-NEXT: v_addc_u32_e32 v12, vcc, 0, v13, vcc +; FUNC2-NEXT: v_mul_lo_u32 v15, v0, v12 +; FUNC2-NEXT: v_mad_u64_u32 v[12:13], s[0:1], v0, v14, 0 +; FUNC2-NEXT: v_mul_lo_u32 v14, v1, v14 +; FUNC2-NEXT: v_add_u32_e32 v13, vcc, v13, v15 +; FUNC2-NEXT: v_add_u32_e32 v13, vcc, v13, v14 +; FUNC2-NEXT: v_sub_u32_e32 v14, vcc, v5, v13 +; FUNC2-NEXT: v_sub_u32_e32 v12, vcc, v4, v12 +; FUNC2-NEXT: v_subb_u32_e64 v14, s[0:1], v14, v1, vcc +; FUNC2-NEXT: v_sub_u32_e64 v15, s[0:1], v12, v0 +; FUNC2-NEXT: v_subbrev_u32_e64 v16, s[2:3], 0, v14, s[0:1] +; FUNC2-NEXT: v_cmp_ge_u32_e64 s[2:3], v16, v1 +; FUNC2-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[2:3] +; FUNC2-NEXT: v_cmp_ge_u32_e64 s[2:3], v15, v0 +; FUNC2-NEXT: v_cndmask_b32_e64 v18, 0, -1, s[2:3] +; FUNC2-NEXT: v_cmp_eq_u32_e64 s[2:3], v16, v1 +; FUNC2-NEXT: v_subb_u32_e64 v14, s[0:1], v14, v1, s[0:1] +; FUNC2-NEXT: v_cndmask_b32_e64 v17, v17, v18, s[2:3] +; FUNC2-NEXT: v_sub_u32_e64 v18, s[0:1], v15, v0 +; FUNC2-NEXT: v_subb_u32_e32 v5, vcc, v5, v13, vcc +; FUNC2-NEXT: v_subbrev_u32_e64 v14, s[0:1], 0, 
v14, s[0:1] +; FUNC2-NEXT: v_cmp_ge_u32_e32 vcc, v5, v1 +; FUNC2-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v17 +; FUNC2-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc +; FUNC2-NEXT: v_cmp_ge_u32_e32 vcc, v12, v0 +; FUNC2-NEXT: v_cndmask_b32_e64 v14, v16, v14, s[0:1] +; FUNC2-NEXT: v_cndmask_b32_e64 v16, 0, -1, vcc +; FUNC2-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; FUNC2-NEXT: v_cndmask_b32_e32 v1, v13, v16, vcc +; FUNC2-NEXT: v_cndmask_b32_e64 v15, v15, v18, s[0:1] +; FUNC2-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; FUNC2-NEXT: v_cndmask_b32_e32 v13, v5, v14, vcc +; FUNC2-NEXT: v_cndmask_b32_e32 v12, v12, v15, vcc +; FUNC2-NEXT: s_cbranch_execnz .LBB6_9 +; FUNC2-NEXT: .LBB6_8: +; FUNC2-NEXT: v_cvt_f32_u32_e32 v1, v0 +; FUNC2-NEXT: v_sub_u32_e32 v5, vcc, 0, v0 +; FUNC2-NEXT: v_mov_b32_e32 v13, 0 +; FUNC2-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; FUNC2-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 +; FUNC2-NEXT: v_cvt_u32_f32_e32 v1, v1 +; FUNC2-NEXT: v_mul_lo_u32 v5, v5, v1 +; FUNC2-NEXT: v_mul_hi_u32 v5, v1, v5 +; FUNC2-NEXT: v_add_u32_e32 v1, vcc, v1, v5 +; FUNC2-NEXT: v_mul_hi_u32 v1, v4, v1 +; FUNC2-NEXT: v_mul_lo_u32 v1, v1, v0 +; FUNC2-NEXT: v_sub_u32_e32 v1, vcc, v4, v1 +; FUNC2-NEXT: v_sub_u32_e32 v4, vcc, v1, v0 +; FUNC2-NEXT: v_cmp_ge_u32_e32 vcc, v1, v0 +; FUNC2-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; FUNC2-NEXT: v_sub_u32_e32 v4, vcc, v1, v0 +; FUNC2-NEXT: v_cmp_ge_u32_e32 vcc, v1, v0 +; FUNC2-NEXT: v_cndmask_b32_e32 v12, v1, v4, vcc +; FUNC2-NEXT: .LBB6_9: +; FUNC2-NEXT: v_or_b32_e32 v1, v7, v3 +; FUNC2-NEXT: v_mov_b32_e32 v0, 0 +; FUNC2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; FUNC2-NEXT: s_cbranch_vccz .LBB6_16 +; FUNC2-NEXT: ; %bb.10: +; FUNC2-NEXT: v_cvt_f32_u32_e32 v0, v2 +; FUNC2-NEXT: v_cvt_f32_u32_e32 v1, v3 +; FUNC2-NEXT: v_sub_u32_e32 v16, vcc, 0, v2 +; FUNC2-NEXT: v_subb_u32_e32 v17, vcc, 0, v3, vcc +; FUNC2-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 +; FUNC2-NEXT: v_rcp_f32_e32 v0, v0 +; FUNC2-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 +; FUNC2-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 
+; FUNC2-NEXT: v_trunc_f32_e32 v1, v1 +; FUNC2-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 +; FUNC2-NEXT: v_cvt_u32_f32_e32 v14, v1 +; FUNC2-NEXT: v_cvt_u32_f32_e32 v15, v0 +; FUNC2-NEXT: v_mul_lo_u32 v4, v16, v14 +; FUNC2-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v16, v15, 0 +; FUNC2-NEXT: v_mul_lo_u32 v5, v17, v15 +; FUNC2-NEXT: v_add_u32_e32 v1, vcc, v1, v4 +; FUNC2-NEXT: v_add_u32_e32 v19, vcc, v1, v5 +; FUNC2-NEXT: v_mul_hi_u32 v18, v15, v0 +; FUNC2-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v15, v19, 0 +; FUNC2-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v14, v0, 0 +; FUNC2-NEXT: v_add_u32_e32 v18, vcc, v18, v4 +; FUNC2-NEXT: v_addc_u32_e32 v20, vcc, 0, v5, vcc +; FUNC2-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v14, v19, 0 +; FUNC2-NEXT: v_add_u32_e32 v0, vcc, v18, v0 +; FUNC2-NEXT: v_addc_u32_e32 v0, vcc, v20, v1, vcc +; FUNC2-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; FUNC2-NEXT: v_add_u32_e32 v0, vcc, v0, v4 +; FUNC2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; FUNC2-NEXT: v_add_u32_e32 v18, vcc, v15, v0 +; FUNC2-NEXT: v_addc_u32_e32 v19, vcc, v14, v1, vcc +; FUNC2-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v16, v18, 0 +; FUNC2-NEXT: v_mul_lo_u32 v14, v16, v19 +; FUNC2-NEXT: v_mul_lo_u32 v15, v17, v18 +; FUNC2-NEXT: v_mul_hi_u32 v16, v18, v0 +; FUNC2-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v19, v0, 0 +; FUNC2-NEXT: v_add_u32_e32 v1, vcc, v1, v14 +; FUNC2-NEXT: v_add_u32_e32 v1, vcc, v1, v15 +; FUNC2-NEXT: v_mad_u64_u32 v[14:15], s[0:1], v18, v1, 0 +; FUNC2-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v19, v1, 0 +; FUNC2-NEXT: v_add_u32_e32 v14, vcc, v16, v14 +; FUNC2-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; FUNC2-NEXT: v_add_u32_e32 v4, vcc, v14, v4 +; FUNC2-NEXT: v_addc_u32_e32 v4, vcc, v15, v5, vcc +; FUNC2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; FUNC2-NEXT: v_add_u32_e32 v0, vcc, v4, v0 +; FUNC2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; FUNC2-NEXT: v_add_u32_e32 v4, vcc, v18, v0 +; FUNC2-NEXT: v_addc_u32_e32 v5, vcc, v19, v1, vcc +; FUNC2-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v6, v5, 0 +; 
FUNC2-NEXT: v_mul_hi_u32 v14, v6, v4 +; FUNC2-NEXT: v_add_u32_e32 v14, vcc, v14, v0 +; FUNC2-NEXT: v_addc_u32_e32 v15, vcc, 0, v1, vcc +; FUNC2-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v7, v4, 0 +; FUNC2-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v7, v5, 0 +; FUNC2-NEXT: v_add_u32_e32 v0, vcc, v14, v0 +; FUNC2-NEXT: v_addc_u32_e32 v0, vcc, v15, v1, vcc +; FUNC2-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; FUNC2-NEXT: v_add_u32_e32 v4, vcc, v0, v4 +; FUNC2-NEXT: v_addc_u32_e32 v0, vcc, 0, v1, vcc +; FUNC2-NEXT: v_mul_lo_u32 v5, v2, v0 +; FUNC2-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, v4, 0 +; FUNC2-NEXT: v_mul_lo_u32 v4, v3, v4 +; FUNC2-NEXT: v_add_u32_e32 v1, vcc, v1, v5 +; FUNC2-NEXT: v_add_u32_e32 v1, vcc, v1, v4 +; FUNC2-NEXT: v_sub_u32_e32 v4, vcc, v7, v1 +; FUNC2-NEXT: v_sub_u32_e32 v0, vcc, v6, v0 +; FUNC2-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v3, vcc +; FUNC2-NEXT: v_sub_u32_e64 v5, s[0:1], v0, v2 +; FUNC2-NEXT: v_subbrev_u32_e64 v14, s[2:3], 0, v4, s[0:1] +; FUNC2-NEXT: v_cmp_ge_u32_e64 s[2:3], v14, v3 +; FUNC2-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[2:3] +; FUNC2-NEXT: v_cmp_ge_u32_e64 s[2:3], v5, v2 +; FUNC2-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[2:3] +; FUNC2-NEXT: v_cmp_eq_u32_e64 s[2:3], v14, v3 +; FUNC2-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v3, s[0:1] +; FUNC2-NEXT: v_cndmask_b32_e64 v15, v15, v16, s[2:3] +; FUNC2-NEXT: v_sub_u32_e64 v16, s[0:1], v5, v2 +; FUNC2-NEXT: v_subb_u32_e32 v1, vcc, v7, v1, vcc +; FUNC2-NEXT: v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1] +; FUNC2-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 +; FUNC2-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v15 +; FUNC2-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc +; FUNC2-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 +; FUNC2-NEXT: v_cndmask_b32_e64 v4, v14, v4, s[0:1] +; FUNC2-NEXT: v_cndmask_b32_e64 v14, 0, -1, vcc +; FUNC2-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 +; FUNC2-NEXT: v_cndmask_b32_e32 v3, v7, v14, vcc +; FUNC2-NEXT: v_cndmask_b32_e64 v5, v5, v16, s[0:1] +; FUNC2-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; FUNC2-NEXT: v_cndmask_b32_e32 v15, v1, 
v4, vcc +; FUNC2-NEXT: v_cndmask_b32_e32 v14, v0, v5, vcc +; FUNC2-NEXT: s_cbranch_execnz .LBB6_12 +; FUNC2-NEXT: .LBB6_11: +; FUNC2-NEXT: v_cvt_f32_u32_e32 v0, v2 +; FUNC2-NEXT: v_sub_u32_e32 v1, vcc, 0, v2 +; FUNC2-NEXT: v_mov_b32_e32 v15, 0 +; FUNC2-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; FUNC2-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; FUNC2-NEXT: v_cvt_u32_f32_e32 v0, v0 +; FUNC2-NEXT: v_mul_lo_u32 v1, v1, v0 +; FUNC2-NEXT: v_mul_hi_u32 v1, v0, v1 +; FUNC2-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; FUNC2-NEXT: v_mul_hi_u32 v0, v6, v0 +; FUNC2-NEXT: v_mul_lo_u32 v0, v0, v2 +; FUNC2-NEXT: v_sub_u32_e32 v0, vcc, v6, v0 +; FUNC2-NEXT: v_sub_u32_e32 v1, vcc, v0, v2 +; FUNC2-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 +; FUNC2-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; FUNC2-NEXT: v_sub_u32_e32 v1, vcc, v0, v2 +; FUNC2-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 +; FUNC2-NEXT: v_cndmask_b32_e32 v14, v0, v1, vcc +; FUNC2-NEXT: .LBB6_12: +; FUNC2-NEXT: s_mov_b32 s7, 0xf000 +; FUNC2-NEXT: s_mov_b32 s6, -1 +; FUNC2-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:16 +; FUNC2-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; FUNC2-NEXT: s_endpgm +; FUNC2-NEXT: .LBB6_13: +; FUNC2-NEXT: ; implicit-def: $vgpr8_vgpr9 +; FUNC2-NEXT: s_branch .LBB6_2 +; FUNC2-NEXT: .LBB6_14: +; FUNC2-NEXT: s_branch .LBB6_5 +; FUNC2-NEXT: .LBB6_15: +; FUNC2-NEXT: ; implicit-def: $vgpr12_vgpr13 +; FUNC2-NEXT: s_branch .LBB6_8 +; FUNC2-NEXT: .LBB6_16: +; FUNC2-NEXT: s_branch .LBB6_11 +; +; EG-LABEL: test_urem_v4i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 0, @34, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 1 @26 +; EG-NEXT: ALU 114, @35, KC0[], KC1[] +; EG-NEXT: ALU 115, @150, KC0[], KC1[] +; EG-NEXT: ALU 115, @266, KC0[], KC1[] +; EG-NEXT: ALU 115, @382, KC0[], KC1[] +; EG-NEXT: ALU 22, @498, KC0[], KC1[] +; EG-NEXT: TEX 1 @30 +; EG-NEXT: ALU 112, @521, KC0[], KC1[] +; EG-NEXT: ALU 114, @634, KC0[], KC1[] +; EG-NEXT: ALU 114, @749, KC0[], KC1[] +; EG-NEXT: ALU 113, @864, KC0[], KC1[] +; EG-NEXT: ALU 114, @978, KC0[], KC1[] 
+; EG-NEXT: ALU 113, @1093, KC0[], KC1[] +; EG-NEXT: ALU 114, @1207, KC0[], KC1[] +; EG-NEXT: ALU 113, @1322, KC0[], KC1[] +; EG-NEXT: ALU 114, @1436, KC0[], KC1[] +; EG-NEXT: ALU 113, @1551, KC0[], KC1[] +; EG-NEXT: ALU 114, @1665, KC0[], KC1[] +; EG-NEXT: ALU 113, @1780, KC0[], KC1[] +; EG-NEXT: ALU 112, @1894, KC0[], KC1[] +; EG-NEXT: ALU 101, @2007, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T0.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 26: +; EG-NEXT: VTX_READ_128 T0.XYZW, T2.X, 32, #1 +; EG-NEXT: VTX_READ_128 T1.XYZW, T2.X, 0, #1 +; EG-NEXT: Fetch clause starting at 30: +; EG-NEXT: VTX_READ_128 T4.XYZW, T2.X, 16, #1 +; EG-NEXT: VTX_READ_128 T2.XYZW, T2.X, 48, #1 +; EG-NEXT: ALU clause starting at 34: +; EG-NEXT: MOV * T2.X, KC0[2].Z, +; EG-NEXT: ALU clause starting at 35: +; EG-NEXT: SUB_INT T2.W, 0.0, T0.X, +; EG-NEXT: RECIP_UINT * T2.Y, T0.X, +; EG-NEXT: MULLO_INT * T2.Z, PV.W, PS, +; EG-NEXT: MULHI * T2.Z, T2.Y, PS, +; EG-NEXT: ADD_INT * T2.W, T2.Y, PS, +; EG-NEXT: MULHI * T2.Y, T1.Y, PV.W, +; EG-NEXT: MULLO_INT * T2.Y, PS, T0.X, +; EG-NEXT: SUB_INT * T2.W, T1.Y, PS, +; EG-NEXT: SETGE_UINT T3.W, PV.W, T0.X, +; EG-NEXT: SUB_INT * T4.W, PV.W, T0.X, +; EG-NEXT: CNDE_INT * T2.W, PV.W, T2.W, PS, +; EG-NEXT: SETGE_UINT T3.W, PV.W, T0.X, +; EG-NEXT: SUB_INT * T4.W, PV.W, T0.X, +; EG-NEXT: CNDE_INT * T2.W, PV.W, T2.W, PS, +; EG-NEXT: CNDE_INT * T2.W, T0.Y, PV.W, T1.Y, +; EG-NEXT: BIT_ALIGN_INT T3.W, PV.W, T1.X, literal.x, +; EG-NEXT: LSHR * T2.W, PV.W, literal.x, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SETE_INT T2.Z, PS, T0.Y, +; EG-NEXT: SETGE_UINT T4.W, PS, T0.Y, +; EG-NEXT: SETGE_UINT * T5.W, PV.W, T0.X, +; EG-NEXT: CNDE_INT T1.Y, PV.Z, PV.W, PS, +; EG-NEXT: SUB_INT T2.Z, T3.W, T0.X, +; EG-NEXT: SUB_INT T4.W, T2.W, T0.Y, BS:VEC_120/SCL_212 +; EG-NEXT: SUBB_UINT * T5.W, T3.W, T0.X, +; EG-NEXT: SUB_INT T4.W, PV.W, PS, +; EG-NEXT: 
CNDE_INT * T3.W, PV.Y, T3.W, PV.Z, +; EG-NEXT: LSHL T2.Z, PS, 1, +; EG-NEXT: BFE_UINT T5.W, T1.X, literal.x, 1, +; EG-NEXT: CNDE_INT * T2.W, T1.Y, T2.W, PV.W, +; EG-NEXT: 30(4.203895e-44), 0(0.000000e+00) +; EG-NEXT: BIT_ALIGN_INT T2.W, PS, T3.W, literal.x, +; EG-NEXT: OR_INT * T3.W, PV.Z, PV.W, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SETGE_UINT T2.Z, PS, T0.X, +; EG-NEXT: SETE_INT T4.W, PV.W, T0.Y, +; EG-NEXT: SETGE_UINT * T5.W, PV.W, T0.Y, +; EG-NEXT: CNDE_INT T1.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T2.Z, T3.W, T0.X, +; EG-NEXT: SUBB_UINT T4.W, T3.W, T0.X, +; EG-NEXT: SUB_INT * T5.W, T2.W, T0.Y, +; EG-NEXT: SUB_INT T4.W, PS, PV.W, +; EG-NEXT: CNDE_INT * T3.W, PV.Y, T3.W, PV.Z, +; EG-NEXT: LSHL T2.Z, PS, 1, +; EG-NEXT: BFE_UINT T5.W, T1.X, literal.x, 1, +; EG-NEXT: CNDE_INT * T2.W, T1.Y, T2.W, PV.W, +; EG-NEXT: 29(4.063766e-44), 0(0.000000e+00) +; EG-NEXT: BIT_ALIGN_INT T2.W, PS, T3.W, literal.x, +; EG-NEXT: OR_INT * T3.W, PV.Z, PV.W, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SETGE_UINT T2.Z, PS, T0.X, +; EG-NEXT: SETE_INT T4.W, PV.W, T0.Y, +; EG-NEXT: SETGE_UINT * T5.W, PV.W, T0.Y, +; EG-NEXT: CNDE_INT T1.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T2.Z, T3.W, T0.X, +; EG-NEXT: SUBB_UINT T4.W, T3.W, T0.X, +; EG-NEXT: SUB_INT * T5.W, T2.W, T0.Y, +; EG-NEXT: SUB_INT T4.W, PS, PV.W, +; EG-NEXT: CNDE_INT * T3.W, PV.Y, T3.W, PV.Z, +; EG-NEXT: LSHL T2.Z, PS, 1, +; EG-NEXT: BFE_UINT T5.W, T1.X, literal.x, 1, +; EG-NEXT: CNDE_INT * T2.W, T1.Y, T2.W, PV.W, +; EG-NEXT: 28(3.923636e-44), 0(0.000000e+00) +; EG-NEXT: BIT_ALIGN_INT T2.W, PS, T3.W, literal.x, +; EG-NEXT: OR_INT * T3.W, PV.Z, PV.W, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SETGE_UINT T2.Z, PS, T0.X, +; EG-NEXT: SETE_INT T4.W, PV.W, T0.Y, +; EG-NEXT: SETGE_UINT * T5.W, PV.W, T0.Y, +; EG-NEXT: CNDE_INT T1.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T2.Z, T3.W, T0.X, +; EG-NEXT: SUBB_UINT T4.W, T3.W, T0.X, +; EG-NEXT: SUB_INT * T5.W, T2.W, T0.Y, +; EG-NEXT: SUB_INT T4.W, 
PS, PV.W, +; EG-NEXT: CNDE_INT * T3.W, PV.Y, T3.W, PV.Z, +; EG-NEXT: LSHL T2.Z, PS, 1, +; EG-NEXT: BFE_UINT T5.W, T1.X, literal.x, 1, +; EG-NEXT: CNDE_INT * T2.W, T1.Y, T2.W, PV.W, +; EG-NEXT: 27(3.783506e-44), 0(0.000000e+00) +; EG-NEXT: BIT_ALIGN_INT T2.W, PS, T3.W, literal.x, +; EG-NEXT: OR_INT * T3.W, PV.Z, PV.W, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SETGE_UINT T2.Z, PS, T0.X, +; EG-NEXT: SETE_INT T4.W, PV.W, T0.Y, +; EG-NEXT: SETGE_UINT * T5.W, PV.W, T0.Y, +; EG-NEXT: CNDE_INT T1.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T2.Z, T3.W, T0.X, +; EG-NEXT: SUBB_UINT T4.W, T3.W, T0.X, +; EG-NEXT: SUB_INT * T5.W, T2.W, T0.Y, +; EG-NEXT: SUB_INT T4.W, PS, PV.W, +; EG-NEXT: CNDE_INT * T3.W, PV.Y, T3.W, PV.Z, +; EG-NEXT: LSHL T2.Z, PS, 1, +; EG-NEXT: BFE_UINT T5.W, T1.X, literal.x, 1, +; EG-NEXT: CNDE_INT * T2.W, T1.Y, T2.W, PV.W, +; EG-NEXT: 26(3.643376e-44), 0(0.000000e+00) +; EG-NEXT: BIT_ALIGN_INT T2.W, PS, T3.W, literal.x, +; EG-NEXT: OR_INT * T3.W, PV.Z, PV.W, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SETGE_UINT T2.Z, PS, T0.X, +; EG-NEXT: SETE_INT T4.W, PV.W, T0.Y, +; EG-NEXT: SETGE_UINT * T5.W, PV.W, T0.Y, +; EG-NEXT: CNDE_INT T1.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T2.Z, T3.W, T0.X, +; EG-NEXT: SUBB_UINT T4.W, T3.W, T0.X, +; EG-NEXT: SUB_INT * T5.W, T2.W, T0.Y, +; EG-NEXT: SUB_INT T4.W, PS, PV.W, +; EG-NEXT: CNDE_INT * T3.W, PV.Y, T3.W, PV.Z, +; EG-NEXT: LSHL T2.Z, PS, 1, +; EG-NEXT: BFE_UINT T5.W, T1.X, literal.x, 1, +; EG-NEXT: CNDE_INT * T2.W, T1.Y, T2.W, PV.W, +; EG-NEXT: 25(3.503246e-44), 0(0.000000e+00) +; EG-NEXT: BIT_ALIGN_INT T2.W, PS, T3.W, literal.x, +; EG-NEXT: OR_INT * T3.W, PV.Z, PV.W, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SETGE_UINT * T2.Z, PS, T0.X, +; EG-NEXT: ALU clause starting at 150: +; EG-NEXT: SETE_INT T4.W, T2.W, T0.Y, +; EG-NEXT: SETGE_UINT * T5.W, T2.W, T0.Y, +; EG-NEXT: CNDE_INT T1.Y, PV.W, PS, T2.Z, +; EG-NEXT: SUB_INT T2.Z, T3.W, T0.X, +; EG-NEXT: SUBB_UINT T4.W, T3.W, T0.X, +; 
EG-NEXT: SUB_INT * T5.W, T2.W, T0.Y, +; EG-NEXT: SUB_INT T4.W, PS, PV.W, +; EG-NEXT: CNDE_INT * T3.W, PV.Y, T3.W, PV.Z, +; EG-NEXT: LSHL T2.Z, PS, 1, +; EG-NEXT: BFE_UINT T5.W, T1.X, literal.x, 1, +; EG-NEXT: CNDE_INT * T2.W, T1.Y, T2.W, PV.W, +; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; EG-NEXT: BIT_ALIGN_INT T2.W, PS, T3.W, literal.x, +; EG-NEXT: OR_INT * T3.W, PV.Z, PV.W, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SETGE_UINT T2.Z, PS, T0.X, +; EG-NEXT: SETE_INT T4.W, PV.W, T0.Y, +; EG-NEXT: SETGE_UINT * T5.W, PV.W, T0.Y, +; EG-NEXT: CNDE_INT T1.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T2.Z, T3.W, T0.X, +; EG-NEXT: SUBB_UINT T4.W, T3.W, T0.X, +; EG-NEXT: SUB_INT * T5.W, T2.W, T0.Y, +; EG-NEXT: SUB_INT T4.W, PS, PV.W, +; EG-NEXT: CNDE_INT * T3.W, PV.Y, T3.W, PV.Z, +; EG-NEXT: LSHL T2.Z, PS, 1, +; EG-NEXT: BFE_UINT T5.W, T1.X, literal.x, 1, +; EG-NEXT: CNDE_INT * T2.W, T1.Y, T2.W, PV.W, +; EG-NEXT: 23(3.222986e-44), 0(0.000000e+00) +; EG-NEXT: BIT_ALIGN_INT T2.W, PS, T3.W, literal.x, +; EG-NEXT: OR_INT * T3.W, PV.Z, PV.W, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SETGE_UINT T2.Z, PS, T0.X, +; EG-NEXT: SETE_INT T4.W, PV.W, T0.Y, +; EG-NEXT: SETGE_UINT * T5.W, PV.W, T0.Y, +; EG-NEXT: CNDE_INT T1.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T2.Z, T3.W, T0.X, +; EG-NEXT: SUBB_UINT T4.W, T3.W, T0.X, +; EG-NEXT: SUB_INT * T5.W, T2.W, T0.Y, +; EG-NEXT: SUB_INT T4.W, PS, PV.W, +; EG-NEXT: CNDE_INT * T3.W, PV.Y, T3.W, PV.Z, +; EG-NEXT: LSHL T2.Z, PS, 1, +; EG-NEXT: BFE_UINT T5.W, T1.X, literal.x, 1, +; EG-NEXT: CNDE_INT * T2.W, T1.Y, T2.W, PV.W, +; EG-NEXT: 22(3.082857e-44), 0(0.000000e+00) +; EG-NEXT: BIT_ALIGN_INT T2.W, PS, T3.W, literal.x, +; EG-NEXT: OR_INT * T3.W, PV.Z, PV.W, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SETGE_UINT T2.Z, PS, T0.X, +; EG-NEXT: SETE_INT T4.W, PV.W, T0.Y, +; EG-NEXT: SETGE_UINT * T5.W, PV.W, T0.Y, +; EG-NEXT: CNDE_INT T1.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T2.Z, T3.W, T0.X, +; EG-NEXT: SUBB_UINT 
T4.W, T3.W, T0.X, +; EG-NEXT: SUB_INT * T5.W, T2.W, T0.Y, +; EG-NEXT: SUB_INT T4.W, PS, PV.W, +; EG-NEXT: CNDE_INT * T3.W, PV.Y, T3.W, PV.Z, +; EG-NEXT: LSHL T2.Z, PS, 1, +; EG-NEXT: BFE_UINT T5.W, T1.X, literal.x, 1, +; EG-NEXT: CNDE_INT * T2.W, T1.Y, T2.W, PV.W, +; EG-NEXT: 21(2.942727e-44), 0(0.000000e+00) +; EG-NEXT: BIT_ALIGN_INT T2.W, PS, T3.W, literal.x, +; EG-NEXT: OR_INT * T3.W, PV.Z, PV.W, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SETGE_UINT T2.Z, PS, T0.X, +; EG-NEXT: SETE_INT T4.W, PV.W, T0.Y, +; EG-NEXT: SETGE_UINT * T5.W, PV.W, T0.Y, +; EG-NEXT: CNDE_INT T1.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T2.Z, T3.W, T0.X, +; EG-NEXT: SUBB_UINT T4.W, T3.W, T0.X, +; EG-NEXT: SUB_INT * T5.W, T2.W, T0.Y, +; EG-NEXT: SUB_INT T4.W, PS, PV.W, +; EG-NEXT: CNDE_INT * T3.W, PV.Y, T3.W, PV.Z, +; EG-NEXT: LSHL T2.Z, PS, 1, +; EG-NEXT: BFE_UINT T5.W, T1.X, literal.x, 1, +; EG-NEXT: CNDE_INT * T2.W, T1.Y, T2.W, PV.W, +; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00) +; EG-NEXT: BIT_ALIGN_INT T2.W, PS, T3.W, literal.x, +; EG-NEXT: OR_INT * T3.W, PV.Z, PV.W, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SETGE_UINT T2.Z, PS, T0.X, +; EG-NEXT: SETE_INT T4.W, PV.W, T0.Y, +; EG-NEXT: SETGE_UINT * T5.W, PV.W, T0.Y, +; EG-NEXT: CNDE_INT T1.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T2.Z, T3.W, T0.X, +; EG-NEXT: SUBB_UINT T4.W, T3.W, T0.X, +; EG-NEXT: SUB_INT * T5.W, T2.W, T0.Y, +; EG-NEXT: SUB_INT T4.W, PS, PV.W, +; EG-NEXT: CNDE_INT * T3.W, PV.Y, T3.W, PV.Z, +; EG-NEXT: LSHL T2.Z, PS, 1, +; EG-NEXT: BFE_UINT T5.W, T1.X, literal.x, 1, +; EG-NEXT: CNDE_INT * T2.W, T1.Y, T2.W, PV.W, +; EG-NEXT: 19(2.662467e-44), 0(0.000000e+00) +; EG-NEXT: BIT_ALIGN_INT T2.W, PS, T3.W, literal.x, +; EG-NEXT: OR_INT * T3.W, PV.Z, PV.W, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SETGE_UINT T2.Z, PS, T0.X, +; EG-NEXT: SETE_INT T4.W, PV.W, T0.Y, +; EG-NEXT: SETGE_UINT * T5.W, PV.W, T0.Y, +; EG-NEXT: CNDE_INT T1.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T2.Z, T3.W, T0.X, +; 
EG-NEXT: SUBB_UINT T4.W, T3.W, T0.X, +; EG-NEXT: SUB_INT * T5.W, T2.W, T0.Y, +; EG-NEXT: SUB_INT T4.W, PS, PV.W, +; EG-NEXT: CNDE_INT * T3.W, PV.Y, T3.W, PV.Z, +; EG-NEXT: LSHL T2.Z, PS, 1, +; EG-NEXT: BFE_UINT T5.W, T1.X, literal.x, 1, +; EG-NEXT: CNDE_INT * T2.W, T1.Y, T2.W, PV.W, +; EG-NEXT: 18(2.522337e-44), 0(0.000000e+00) +; EG-NEXT: BIT_ALIGN_INT T2.W, PS, T3.W, literal.x, +; EG-NEXT: OR_INT * T3.W, PV.Z, PV.W, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SETGE_UINT T2.Z, PS, T0.X, +; EG-NEXT: SETE_INT T4.W, PV.W, T0.Y, +; EG-NEXT: SETGE_UINT * T5.W, PV.W, T0.Y, +; EG-NEXT: CNDE_INT T1.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT * T2.Z, T3.W, T0.X, +; EG-NEXT: ALU clause starting at 266: +; EG-NEXT: SUBB_UINT T4.W, T3.W, T0.X, +; EG-NEXT: SUB_INT * T5.W, T2.W, T0.Y, +; EG-NEXT: SUB_INT T4.W, PS, PV.W, +; EG-NEXT: CNDE_INT * T3.W, T1.Y, T3.W, T2.Z, +; EG-NEXT: LSHL T2.Z, PS, 1, +; EG-NEXT: BFE_UINT T5.W, T1.X, literal.x, 1, +; EG-NEXT: CNDE_INT * T2.W, T1.Y, T2.W, PV.W, +; EG-NEXT: 17(2.382207e-44), 0(0.000000e+00) +; EG-NEXT: BIT_ALIGN_INT T2.W, PS, T3.W, literal.x, +; EG-NEXT: OR_INT * T3.W, PV.Z, PV.W, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SETGE_UINT T2.Z, PS, T0.X, +; EG-NEXT: SETE_INT T4.W, PV.W, T0.Y, +; EG-NEXT: SETGE_UINT * T5.W, PV.W, T0.Y, +; EG-NEXT: CNDE_INT T1.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T2.Z, T3.W, T0.X, +; EG-NEXT: SUBB_UINT T4.W, T3.W, T0.X, +; EG-NEXT: SUB_INT * T5.W, T2.W, T0.Y, +; EG-NEXT: SUB_INT T4.W, PS, PV.W, +; EG-NEXT: CNDE_INT * T3.W, PV.Y, T3.W, PV.Z, +; EG-NEXT: LSHL T2.Z, PS, 1, +; EG-NEXT: BFE_UINT T5.W, T1.X, literal.x, 1, +; EG-NEXT: CNDE_INT * T2.W, T1.Y, T2.W, PV.W, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: BIT_ALIGN_INT T2.W, PS, T3.W, literal.x, +; EG-NEXT: OR_INT * T3.W, PV.Z, PV.W, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SETGE_UINT T2.Z, PS, T0.X, +; EG-NEXT: SETE_INT T4.W, PV.W, T0.Y, +; EG-NEXT: SETGE_UINT * T5.W, PV.W, T0.Y, +; EG-NEXT: CNDE_INT 
T1.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T2.Z, T3.W, T0.X, +; EG-NEXT: SUBB_UINT T4.W, T3.W, T0.X, +; EG-NEXT: SUB_INT * T5.W, T2.W, T0.Y, +; EG-NEXT: SUB_INT T4.W, PS, PV.W, +; EG-NEXT: CNDE_INT * T3.W, PV.Y, T3.W, PV.Z, +; EG-NEXT: LSHL T2.Z, PS, 1, +; EG-NEXT: BFE_UINT T5.W, T1.X, literal.x, 1, +; EG-NEXT: CNDE_INT * T2.W, T1.Y, T2.W, PV.W, +; EG-NEXT: 15(2.101948e-44), 0(0.000000e+00) +; EG-NEXT: BIT_ALIGN_INT T2.W, PS, T3.W, literal.x, +; EG-NEXT: OR_INT * T3.W, PV.Z, PV.W, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SETGE_UINT T2.Z, PS, T0.X, +; EG-NEXT: SETE_INT T4.W, PV.W, T0.Y, +; EG-NEXT: SETGE_UINT * T5.W, PV.W, T0.Y, +; EG-NEXT: CNDE_INT T1.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T2.Z, T3.W, T0.X, +; EG-NEXT: SUBB_UINT T4.W, T3.W, T0.X, +; EG-NEXT: SUB_INT * T5.W, T2.W, T0.Y, +; EG-NEXT: SUB_INT T4.W, PS, PV.W, +; EG-NEXT: CNDE_INT * T3.W, PV.Y, T3.W, PV.Z, +; EG-NEXT: LSHL T2.Z, PS, 1, +; EG-NEXT: BFE_UINT T5.W, T1.X, literal.x, 1, +; EG-NEXT: CNDE_INT * T2.W, T1.Y, T2.W, PV.W, +; EG-NEXT: 14(1.961818e-44), 0(0.000000e+00) +; EG-NEXT: BIT_ALIGN_INT T2.W, PS, T3.W, literal.x, +; EG-NEXT: OR_INT * T3.W, PV.Z, PV.W, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SETGE_UINT T2.Z, PS, T0.X, +; EG-NEXT: SETE_INT T4.W, PV.W, T0.Y, +; EG-NEXT: SETGE_UINT * T5.W, PV.W, T0.Y, +; EG-NEXT: CNDE_INT T1.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T2.Z, T3.W, T0.X, +; EG-NEXT: SUBB_UINT T4.W, T3.W, T0.X, +; EG-NEXT: SUB_INT * T5.W, T2.W, T0.Y, +; EG-NEXT: SUB_INT T4.W, PS, PV.W, +; EG-NEXT: CNDE_INT * T3.W, PV.Y, T3.W, PV.Z, +; EG-NEXT: LSHL T2.Z, PS, 1, +; EG-NEXT: BFE_UINT T5.W, T1.X, literal.x, 1, +; EG-NEXT: CNDE_INT * T2.W, T1.Y, T2.W, PV.W, +; EG-NEXT: 13(1.821688e-44), 0(0.000000e+00) +; EG-NEXT: BIT_ALIGN_INT T2.W, PS, T3.W, literal.x, +; EG-NEXT: OR_INT * T3.W, PV.Z, PV.W, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SETGE_UINT T2.Z, PS, T0.X, +; EG-NEXT: SETE_INT T4.W, PV.W, T0.Y, +; EG-NEXT: SETGE_UINT * T5.W, PV.W, T0.Y, 
+; EG-NEXT: CNDE_INT T1.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T2.Z, T3.W, T0.X, +; EG-NEXT: SUBB_UINT T4.W, T3.W, T0.X, +; EG-NEXT: SUB_INT * T5.W, T2.W, T0.Y, +; EG-NEXT: SUB_INT T4.W, PS, PV.W, +; EG-NEXT: CNDE_INT * T3.W, PV.Y, T3.W, PV.Z, +; EG-NEXT: LSHL T2.Z, PS, 1, +; EG-NEXT: BFE_UINT T5.W, T1.X, literal.x, 1, +; EG-NEXT: CNDE_INT * T2.W, T1.Y, T2.W, PV.W, +; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00) +; EG-NEXT: BIT_ALIGN_INT T2.W, PS, T3.W, literal.x, +; EG-NEXT: OR_INT * T3.W, PV.Z, PV.W, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SETGE_UINT T2.Z, PS, T0.X, +; EG-NEXT: SETE_INT T4.W, PV.W, T0.Y, +; EG-NEXT: SETGE_UINT * T5.W, PV.W, T0.Y, +; EG-NEXT: CNDE_INT T1.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T2.Z, T3.W, T0.X, +; EG-NEXT: SUBB_UINT T4.W, T3.W, T0.X, +; EG-NEXT: SUB_INT * T5.W, T2.W, T0.Y, +; EG-NEXT: SUB_INT T4.W, PS, PV.W, +; EG-NEXT: CNDE_INT * T3.W, PV.Y, T3.W, PV.Z, +; EG-NEXT: LSHL T2.Z, PS, 1, +; EG-NEXT: BFE_UINT T5.W, T1.X, literal.x, 1, +; EG-NEXT: CNDE_INT * T2.W, T1.Y, T2.W, PV.W, +; EG-NEXT: 11(1.541428e-44), 0(0.000000e+00) +; EG-NEXT: BIT_ALIGN_INT T2.W, PS, T3.W, literal.x, +; EG-NEXT: OR_INT * T3.W, PV.Z, PV.W, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SETGE_UINT T2.Z, PS, T0.X, +; EG-NEXT: SETE_INT T4.W, PV.W, T0.Y, +; EG-NEXT: SETGE_UINT * T5.W, PV.W, T0.Y, +; EG-NEXT: CNDE_INT T1.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T2.Z, T3.W, T0.X, +; EG-NEXT: SUBB_UINT T4.W, T3.W, T0.X, +; EG-NEXT: SUB_INT * T5.W, T2.W, T0.Y, +; EG-NEXT: SUB_INT T4.W, PS, PV.W, +; EG-NEXT: CNDE_INT * T3.W, PV.Y, T3.W, PV.Z, +; EG-NEXT: ALU clause starting at 382: +; EG-NEXT: LSHL T2.Z, T3.W, 1, BS:VEC_201 +; EG-NEXT: BFE_UINT T5.W, T1.X, literal.x, 1, +; EG-NEXT: CNDE_INT * T2.W, T1.Y, T2.W, T4.W, +; EG-NEXT: 10(1.401298e-44), 0(0.000000e+00) +; EG-NEXT: BIT_ALIGN_INT T2.W, PS, T3.W, literal.x, +; EG-NEXT: OR_INT * T3.W, PV.Z, PV.W, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SETGE_UINT T2.Z, PS, T0.X, +; 
EG-NEXT: SETE_INT T4.W, PV.W, T0.Y, +; EG-NEXT: SETGE_UINT * T5.W, PV.W, T0.Y, +; EG-NEXT: CNDE_INT T1.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T2.Z, T3.W, T0.X, +; EG-NEXT: SUBB_UINT T4.W, T3.W, T0.X, +; EG-NEXT: SUB_INT * T5.W, T2.W, T0.Y, +; EG-NEXT: SUB_INT T4.W, PS, PV.W, +; EG-NEXT: CNDE_INT * T3.W, PV.Y, T3.W, PV.Z, +; EG-NEXT: LSHL T2.Z, PS, 1, +; EG-NEXT: BFE_UINT T5.W, T1.X, literal.x, 1, +; EG-NEXT: CNDE_INT * T2.W, T1.Y, T2.W, PV.W, +; EG-NEXT: 9(1.261169e-44), 0(0.000000e+00) +; EG-NEXT: BIT_ALIGN_INT T2.W, PS, T3.W, literal.x, +; EG-NEXT: OR_INT * T3.W, PV.Z, PV.W, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SETGE_UINT T2.Z, PS, T0.X, +; EG-NEXT: SETE_INT T4.W, PV.W, T0.Y, +; EG-NEXT: SETGE_UINT * T5.W, PV.W, T0.Y, +; EG-NEXT: CNDE_INT T1.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T2.Z, T3.W, T0.X, +; EG-NEXT: SUBB_UINT T4.W, T3.W, T0.X, +; EG-NEXT: SUB_INT * T5.W, T2.W, T0.Y, +; EG-NEXT: SUB_INT T4.W, PS, PV.W, +; EG-NEXT: CNDE_INT * T3.W, PV.Y, T3.W, PV.Z, +; EG-NEXT: LSHL T2.Z, PS, 1, +; EG-NEXT: BFE_UINT T5.W, T1.X, literal.x, 1, +; EG-NEXT: CNDE_INT * T2.W, T1.Y, T2.W, PV.W, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: BIT_ALIGN_INT T2.W, PS, T3.W, literal.x, +; EG-NEXT: OR_INT * T3.W, PV.Z, PV.W, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SETGE_UINT T2.Z, PS, T0.X, +; EG-NEXT: SETE_INT T4.W, PV.W, T0.Y, +; EG-NEXT: SETGE_UINT * T5.W, PV.W, T0.Y, +; EG-NEXT: CNDE_INT T1.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T2.Z, T3.W, T0.X, +; EG-NEXT: SUBB_UINT T4.W, T3.W, T0.X, +; EG-NEXT: SUB_INT * T5.W, T2.W, T0.Y, +; EG-NEXT: SUB_INT T4.W, PS, PV.W, +; EG-NEXT: CNDE_INT * T3.W, PV.Y, T3.W, PV.Z, +; EG-NEXT: LSHL T2.Z, PS, 1, +; EG-NEXT: BFE_UINT T5.W, T1.X, literal.x, 1, +; EG-NEXT: CNDE_INT * T2.W, T1.Y, T2.W, PV.W, +; EG-NEXT: 7(9.809089e-45), 0(0.000000e+00) +; EG-NEXT: BIT_ALIGN_INT T2.W, PS, T3.W, literal.x, +; EG-NEXT: OR_INT * T3.W, PV.Z, PV.W, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SETGE_UINT 
T2.Z, PS, T0.X, +; EG-NEXT: SETE_INT T4.W, PV.W, T0.Y, +; EG-NEXT: SETGE_UINT * T5.W, PV.W, T0.Y, +; EG-NEXT: CNDE_INT T1.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T2.Z, T3.W, T0.X, +; EG-NEXT: SUBB_UINT T4.W, T3.W, T0.X, +; EG-NEXT: SUB_INT * T5.W, T2.W, T0.Y, +; EG-NEXT: SUB_INT T4.W, PS, PV.W, +; EG-NEXT: CNDE_INT * T3.W, PV.Y, T3.W, PV.Z, +; EG-NEXT: LSHL T2.Z, PS, 1, +; EG-NEXT: BFE_UINT T5.W, T1.X, literal.x, 1, +; EG-NEXT: CNDE_INT * T2.W, T1.Y, T2.W, PV.W, +; EG-NEXT: 6(8.407791e-45), 0(0.000000e+00) +; EG-NEXT: BIT_ALIGN_INT T2.W, PS, T3.W, literal.x, +; EG-NEXT: OR_INT * T3.W, PV.Z, PV.W, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SETGE_UINT T2.Z, PS, T0.X, +; EG-NEXT: SETE_INT T4.W, PV.W, T0.Y, +; EG-NEXT: SETGE_UINT * T5.W, PV.W, T0.Y, +; EG-NEXT: CNDE_INT T1.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T2.Z, T3.W, T0.X, +; EG-NEXT: SUBB_UINT T4.W, T3.W, T0.X, +; EG-NEXT: SUB_INT * T5.W, T2.W, T0.Y, +; EG-NEXT: SUB_INT T4.W, PS, PV.W, +; EG-NEXT: CNDE_INT * T3.W, PV.Y, T3.W, PV.Z, +; EG-NEXT: LSHL T2.Z, PS, 1, +; EG-NEXT: BFE_UINT T5.W, T1.X, literal.x, 1, +; EG-NEXT: CNDE_INT * T2.W, T1.Y, T2.W, PV.W, +; EG-NEXT: 5(7.006492e-45), 0(0.000000e+00) +; EG-NEXT: BIT_ALIGN_INT T2.W, PS, T3.W, literal.x, +; EG-NEXT: OR_INT * T3.W, PV.Z, PV.W, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SETGE_UINT T2.Z, PS, T0.X, +; EG-NEXT: SETE_INT T4.W, PV.W, T0.Y, +; EG-NEXT: SETGE_UINT * T5.W, PV.W, T0.Y, +; EG-NEXT: CNDE_INT T1.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T2.Z, T3.W, T0.X, +; EG-NEXT: SUBB_UINT T4.W, T3.W, T0.X, +; EG-NEXT: SUB_INT * T5.W, T2.W, T0.Y, +; EG-NEXT: SUB_INT T4.W, PS, PV.W, +; EG-NEXT: CNDE_INT * T3.W, PV.Y, T3.W, PV.Z, +; EG-NEXT: LSHL T2.Z, PS, 1, +; EG-NEXT: BFE_UINT T5.W, T1.X, literal.x, 1, +; EG-NEXT: CNDE_INT * T2.W, T1.Y, T2.W, PV.W, +; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) +; EG-NEXT: BIT_ALIGN_INT T2.W, PS, T3.W, literal.x, +; EG-NEXT: OR_INT * T3.W, PV.Z, PV.W, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; 
EG-NEXT: SETGE_UINT T2.Z, PS, T0.X, +; EG-NEXT: SETE_INT T4.W, PV.W, T0.Y, +; EG-NEXT: SETGE_UINT * T5.W, PV.W, T0.Y, +; EG-NEXT: CNDE_INT T1.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T2.Z, T3.W, T0.X, +; EG-NEXT: SUBB_UINT T4.W, T3.W, T0.X, +; EG-NEXT: SUB_INT * T5.W, T2.W, T0.Y, +; EG-NEXT: SUB_INT T4.W, PS, PV.W, +; EG-NEXT: CNDE_INT * T3.W, PV.Y, T3.W, PV.Z, +; EG-NEXT: LSHL T2.Z, PS, 1, +; EG-NEXT: BFE_UINT T5.W, T1.X, literal.x, 1, +; EG-NEXT: CNDE_INT * T2.W, T1.Y, T2.W, PV.W, +; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) +; EG-NEXT: ALU clause starting at 498: +; EG-NEXT: BIT_ALIGN_INT T2.W, T2.W, T3.W, literal.x, BS:VEC_021/SCL_122 +; EG-NEXT: OR_INT * T3.W, T2.Z, T5.W, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SETGE_UINT T2.Z, PS, T0.X, +; EG-NEXT: SETE_INT T4.W, PV.W, T0.Y, +; EG-NEXT: SETGE_UINT * T5.W, PV.W, T0.Y, +; EG-NEXT: CNDE_INT T1.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T2.Z, T3.W, T0.X, +; EG-NEXT: SUBB_UINT T4.W, T3.W, T0.X, +; EG-NEXT: SUB_INT * T5.W, T2.W, T0.Y, +; EG-NEXT: SUB_INT T4.W, PS, PV.W, +; EG-NEXT: CNDE_INT * T3.W, PV.Y, T3.W, PV.Z, +; EG-NEXT: LSHL T2.Z, PS, 1, +; EG-NEXT: BFE_UINT T5.W, T1.X, literal.x, 1, +; EG-NEXT: CNDE_INT * T2.W, T1.Y, T2.W, PV.W, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; EG-NEXT: BIT_ALIGN_INT T3.W, PS, T3.W, literal.x, +; EG-NEXT: OR_INT * T5.W, PV.Z, PV.W, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SETGE_UINT T1.Y, PS, T0.X, +; EG-NEXT: SETE_INT T3.Z, PV.W, T0.Y, +; EG-NEXT: SUB_INT T6.W, 0.0, T0.Z, +; EG-NEXT: RECIP_UINT * T3.X, T0.Z, +; EG-NEXT: ALU clause starting at 521: +; EG-NEXT: SETGE_UINT T7.W, T3.W, T0.Y, +; EG-NEXT: MULLO_INT * T3.Y, T6.W, T3.X, +; EG-NEXT: CNDE_INT T5.X, T3.Z, PV.W, T1.Y, +; EG-NEXT: SUB_INT T1.Y, T5.W, T0.X, +; EG-NEXT: SUBB_UINT T3.Z, T5.W, T0.X, +; EG-NEXT: SUB_INT T6.W, T3.W, T0.Y, BS:VEC_102/SCL_221 +; EG-NEXT: MULHI * T3.Y, T3.X, PS, +; EG-NEXT: ADD_INT T3.Y, T3.X, PS, +; EG-NEXT: SUB_INT T3.Z, PV.W, PV.Z, +; EG-NEXT: CNDE_INT T5.W, PV.X, 
T5.W, PV.Y, +; EG-NEXT: RECIP_UINT * T1.Y, T2.Z, +; EG-NEXT: LSHL T5.Y, PV.W, 1, +; EG-NEXT: BFE_UINT T5.Z, T1.X, 1, 1, +; EG-NEXT: CNDE_INT T3.W, T5.X, T3.W, PV.Z, BS:VEC_102/SCL_221 +; EG-NEXT: MULHI * T3.X, T1.W, PV.Y, +; EG-NEXT: SUB_INT T3.Y, 0.0, T2.Z, BS:VEC_021/SCL_122 +; EG-NEXT: BIT_ALIGN_INT T3.Z, PV.W, T5.W, literal.x, +; EG-NEXT: OR_INT T5.W, PV.Y, PV.Z, +; EG-NEXT: MULLO_INT * T3.X, PS, T0.Z, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SUB_INT T3.X, T1.W, PS, +; EG-NEXT: SETGE_UINT T5.Y, PV.W, T0.X, +; EG-NEXT: SETE_INT T5.Z, PV.Z, T0.Y, BS:VEC_021/SCL_122 +; EG-NEXT: SETGE_UINT T3.W, PV.Z, T0.Y, BS:VEC_021/SCL_122 +; EG-NEXT: MULLO_INT * T3.Y, PV.Y, T1.Y, +; EG-NEXT: CNDE_INT T5.X, PV.Z, PV.W, PV.Y, +; EG-NEXT: SUB_INT T5.Y, T5.W, T0.X, +; EG-NEXT: SETGE_UINT T5.Z, PV.X, T0.Z, +; EG-NEXT: SUB_INT T3.W, PV.X, T0.Z, +; EG-NEXT: MULHI * T3.Y, T1.Y, PS, +; EG-NEXT: ADD_INT T6.Z, T1.Y, PS, +; EG-NEXT: CNDE_INT T3.W, PV.Z, T3.X, PV.W, +; EG-NEXT: CNDE_INT * T6.W, PV.X, T5.W, PV.Y, +; EG-NEXT: LSHL T3.X, PS, 1, +; EG-NEXT: AND_INT T1.Y, T1.X, 1, +; EG-NEXT: SETGE_UINT T5.Z, PV.W, T0.Z, +; EG-NEXT: SUB_INT T7.W, PV.W, T0.Z, +; EG-NEXT: MULHI * T1.X, T4.W, PV.Z, +; EG-NEXT: CNDE_INT T5.Z, PV.Z, T3.W, PV.W, +; EG-NEXT: OR_INT T3.W, PV.X, PV.Y, +; EG-NEXT: MULLO_INT * T1.Y, PS, T2.Z, +; EG-NEXT: SETGE_UINT T1.X, PV.W, T0.X, +; EG-NEXT: SUB_INT T3.Y, 0.0, T2.X, BS:VEC_021/SCL_122 +; EG-NEXT: CNDE_INT T5.Z, T0.W, PV.Z, T1.W, +; EG-NEXT: SUB_INT T1.W, T4.W, PS, BS:VEC_120/SCL_212 +; EG-NEXT: RECIP_UINT * T1.Y, T2.X, +; EG-NEXT: SETGE_UINT T3.X, PV.W, T2.Z, +; EG-NEXT: SUB_INT T5.Y, PV.W, T2.Z, +; EG-NEXT: BIT_ALIGN_INT T6.Z, PV.Z, T1.Z, literal.x, BS:VEC_021/SCL_122 +; EG-NEXT: LSHR T7.W, PV.Z, literal.x, +; EG-NEXT: MULLO_INT * T3.Y, PV.Y, PS, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SETE_INT T6.X, PV.W, T0.W, +; EG-NEXT: SETGE_UINT T6.Y, PV.W, T0.W, +; EG-NEXT: SETGE_UINT T5.Z, PV.Z, T0.Z, +; EG-NEXT: CNDE_INT T8.W, PV.X, T1.W, 
PV.Y, BS:VEC_021/SCL_122 +; EG-NEXT: MULHI * T1.W, T1.Y, PS, +; EG-NEXT: SETGE_UINT T7.X, PV.W, T2.Z, +; EG-NEXT: CNDE_INT T3.Y, PV.X, PV.Y, PV.Z, +; EG-NEXT: ADD_INT T5.Z, T1.Y, PS, +; EG-NEXT: SUBB_UINT T1.W, T5.W, T0.X, +; EG-NEXT: SUB_INT * T5.W, T3.Z, T0.Y, +; EG-NEXT: SUB_INT T3.X, T6.Z, T0.Z, +; EG-NEXT: SUB_INT T1.Y, T7.W, T0.W, +; EG-NEXT: SUBB_UINT T7.Z, T6.Z, T0.Z, +; EG-NEXT: SUB_INT T1.W, PS, PV.W, +; EG-NEXT: MULHI * T5.Y, T4.Y, PV.Z, +; EG-NEXT: SUB_INT T6.X, T8.W, T2.Z, +; EG-NEXT: CNDE_INT T6.Y, T5.X, T3.Z, PV.W, BS:VEC_021/SCL_122 +; EG-NEXT: SUB_INT T3.Z, PV.Y, PV.Z, +; EG-NEXT: CNDE_INT T5.W, T3.Y, T6.Z, PV.X, BS:VEC_102/SCL_221 +; EG-NEXT: MULLO_INT * T1.Y, PS, T2.X, +; EG-NEXT: LSHL T5.X, PV.W, 1, +; EG-NEXT: BFE_UINT T5.Y, T1.Z, literal.x, 1, +; EG-NEXT: CNDE_INT T3.Z, T3.Y, T7.W, PV.Z, +; EG-NEXT: BIT_ALIGN_INT T1.W, PV.Y, T6.W, literal.y, BS:VEC_021/SCL_122 +; EG-NEXT: SUB_INT * T6.W, T4.Y, PS, +; EG-NEXT: 30(4.203895e-44), 31(4.344025e-44) +; EG-NEXT: SETE_INT T3.X, PV.W, T0.Y, +; EG-NEXT: SETGE_UINT T1.Y, PS, T2.X, +; EG-NEXT: BIT_ALIGN_INT T3.Z, PV.Z, T5.W, literal.x, BS:VEC_021/SCL_122 +; EG-NEXT: OR_INT T5.W, PV.X, PV.Y, +; EG-NEXT: CNDE_INT * T7.W, T7.X, T8.W, T6.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SUB_INT T5.X, T6.W, T2.X, +; EG-NEXT: CNDE_INT T3.Y, T2.W, PS, T4.W, BS:VEC_102/SCL_221 +; EG-NEXT: SETGE_UINT * T5.Z, PV.W, T0.Z, +; EG-NEXT: SETE_INT T4.W, T3.Z, T0.W, +; EG-NEXT: SETGE_UINT * T7.W, T3.Z, T0.W, +; EG-NEXT: CNDE_INT T6.X, PV.W, PS, T5.Z, +; EG-NEXT: SUB_INT T5.Y, T5.W, T0.Z, +; EG-NEXT: BIT_ALIGN_INT T5.Z, T3.Y, T4.Z, literal.x, BS:VEC_102/SCL_221 +; EG-NEXT: LSHR T4.W, T3.Y, literal.x, +; EG-NEXT: CNDE_INT * T6.W, T1.Y, T6.W, T5.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SETGE_UINT T7.X, PS, T2.X, +; EG-NEXT: SUB_INT T1.Y, PS, T2.X, +; EG-NEXT: SETE_INT T6.Z, PV.W, T2.W, +; EG-NEXT: SETGE_UINT T7.W, PV.W, T2.W, +; EG-NEXT: SETGE_UINT * T8.W, PV.Z, T2.Z, +; EG-NEXT: SETGE_UINT 
T5.X, T1.W, T0.Y, +; EG-NEXT: CNDE_INT T3.Y, PV.Z, PV.W, PS, +; EG-NEXT: SUB_INT T6.Z, T5.Z, T2.Z, +; EG-NEXT: CNDE_INT T6.W, PV.X, T6.W, PV.Y, BS:VEC_021/SCL_122 +; EG-NEXT: CNDE_INT * T7.W, T6.X, T5.W, T5.Y, +; EG-NEXT: LSHL T7.X, PS, 1, +; EG-NEXT: SUB_INT T1.Y, T4.W, T2.W, +; EG-NEXT: SUBB_UINT T7.Z, T5.Z, T2.Z, BS:VEC_021/SCL_122 +; EG-NEXT: CNDE_INT T6.W, T2.Y, PV.W, T4.Y, +; EG-NEXT: CNDE_INT * T8.W, PV.Y, T5.Z, PV.Z, +; EG-NEXT: BFE_UINT T8.X, T1.Z, literal.x, 1, +; EG-NEXT: LSHL * T4.Y, PS, 1, +; EG-NEXT: 29(4.063766e-44), 0(0.000000e+00) +; EG-NEXT: ALU clause starting at 634: +; EG-NEXT: BIT_ALIGN_INT T5.Z, T6.W, T4.X, literal.x, +; EG-NEXT: LSHR T6.W, T6.W, literal.x, +; EG-NEXT: SUB_INT * T9.W, T1.Y, T7.Z, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: BFE_UINT T9.X, T4.Z, literal.x, 1, +; EG-NEXT: CNDE_INT T1.Y, T3.Y, T4.W, PS, +; EG-NEXT: SETE_INT T6.Z, PV.W, T2.Y, +; EG-NEXT: SETGE_UINT T4.W, PV.W, T2.Y, +; EG-NEXT: SETGE_UINT * T9.W, PV.Z, T2.X, +; EG-NEXT: 30(4.203895e-44), 0(0.000000e+00) +; EG-NEXT: SUBB_UINT T10.X, T5.W, T0.Z, +; EG-NEXT: CNDE_INT T3.Y, PV.Z, PV.W, PS, +; EG-NEXT: SUB_INT T6.Z, T5.Z, T2.X, +; EG-NEXT: BIT_ALIGN_INT T4.W, PV.Y, T8.W, literal.x, +; EG-NEXT: OR_INT * T5.W, T4.Y, PV.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SUB_INT T9.X, T6.W, T2.Y, +; EG-NEXT: SUBB_UINT T1.Y, T5.Z, T2.X, +; EG-NEXT: SETGE_UINT T7.Z, PS, T2.Z, +; EG-NEXT: SETE_INT T8.W, PV.W, T2.W, +; EG-NEXT: SETGE_UINT * T9.W, PV.W, T2.W, +; EG-NEXT: SUBB_UINT T11.X, T5.W, T2.Z, BS:VEC_021/SCL_122 +; EG-NEXT: CNDE_INT T4.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T7.Z, T5.W, T2.Z, BS:VEC_021/SCL_122 +; EG-NEXT: SUB_INT T8.W, PV.X, PV.Y, +; EG-NEXT: CNDE_INT * T9.W, T3.Y, T5.Z, T6.Z, +; EG-NEXT: SUB_INT T9.X, T4.W, T2.W, +; EG-NEXT: LSHL T1.Y, PS, 1, +; EG-NEXT: BFE_UINT T5.Z, T4.X, literal.x, 1, +; EG-NEXT: CNDE_INT * T6.W, T3.Y, T6.W, PV.W, BS:VEC_021/SCL_122 +; EG-NEXT: 30(4.203895e-44), 0(0.000000e+00) +; EG-NEXT: CNDE_INT * T5.W, 
T4.Y, T5.W, T7.Z, +; EG-NEXT: SUB_INT T12.X, T3.Z, T0.W, +; EG-NEXT: LSHL T3.Y, PV.W, 1, +; EG-NEXT: BIT_ALIGN_INT T6.Z, T6.W, T9.W, literal.x, BS:VEC_021/SCL_122 +; EG-NEXT: OR_INT T6.W, T1.Y, T5.Z, +; EG-NEXT: SUB_INT * T8.W, T9.X, T11.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: BFE_UINT T9.X, T4.Z, literal.x, 1, +; EG-NEXT: CNDE_INT T1.Y, T4.Y, T4.W, PS, +; EG-NEXT: SETGE_UINT T5.Z, PV.W, T2.X, +; EG-NEXT: SETE_INT T4.W, PV.Z, T2.Y, +; EG-NEXT: SETGE_UINT * T8.W, PV.Z, T2.Y, +; EG-NEXT: 29(4.063766e-44), 0(0.000000e+00) +; EG-NEXT: CNDE_INT T11.X, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T4.Y, T6.W, T2.X, BS:VEC_102/SCL_221 +; EG-NEXT: BIT_ALIGN_INT T5.Z, PV.Y, T5.W, literal.x, BS:VEC_021/SCL_122 +; EG-NEXT: OR_INT T4.W, T3.Y, PV.X, +; EG-NEXT: SUB_INT * T5.W, T12.X, T10.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: CNDE_INT T6.X, T6.X, T3.Z, PS, +; EG-NEXT: SETGE_UINT T1.Y, PV.W, T2.Z, BS:VEC_021/SCL_122 +; EG-NEXT: SETE_INT T3.Z, PV.Z, T2.W, BS:VEC_021/SCL_122 +; EG-NEXT: SETGE_UINT T5.W, PV.Z, T2.W, BS:VEC_021/SCL_122 +; EG-NEXT: CNDE_INT * T8.W, PV.X, T6.W, PV.Y, +; EG-NEXT: LSHL T9.X, PS, 1, +; EG-NEXT: CNDE_INT T1.Y, PV.Z, PV.W, PV.Y, +; EG-NEXT: SUB_INT T3.Z, T4.W, T2.Z, +; EG-NEXT: BIT_ALIGN_INT T5.W, PV.X, T7.W, literal.x, +; EG-NEXT: OR_INT * T7.W, T7.X, T8.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SUBB_UINT T6.X, T4.W, T2.Z, +; EG-NEXT: SUB_INT T3.Y, T5.Z, T2.W, BS:VEC_021/SCL_122 +; EG-NEXT: SETGE_UINT T7.Z, PS, T0.Z, BS:VEC_021/SCL_122 +; EG-NEXT: SETE_INT T9.W, PV.W, T0.W, +; EG-NEXT: SETGE_UINT * T10.W, PV.W, T0.W, +; EG-NEXT: SUBB_UINT T7.X, T7.W, T0.Z, +; EG-NEXT: CNDE_INT T4.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T7.Z, T7.W, T0.Z, +; EG-NEXT: SUB_INT T9.W, PV.Y, PV.X, +; EG-NEXT: CNDE_INT * T4.W, T1.Y, T4.W, T3.Z, +; EG-NEXT: SUB_INT T6.X, T5.W, T0.W, BS:VEC_021/SCL_122 +; EG-NEXT: LSHL T3.Y, PS, 1, +; EG-NEXT: BFE_UINT T3.Z, T4.Z, literal.x, 1, +; EG-NEXT: CNDE_INT T9.W, T1.Y, T5.Z, PV.W, +; 
EG-NEXT: CNDE_INT * T7.W, PV.Y, T7.W, PV.Z, +; EG-NEXT: 28(3.923636e-44), 0(0.000000e+00) +; EG-NEXT: BFE_UINT T8.X, T4.X, literal.x, 1, +; EG-NEXT: LSHL T1.Y, PS, 1, +; EG-NEXT: BIT_ALIGN_INT T5.Z, PV.W, T4.W, literal.y, +; EG-NEXT: OR_INT T4.W, PV.Y, PV.Z, +; EG-NEXT: SUB_INT * T9.W, PV.X, T7.X, +; EG-NEXT: 29(4.063766e-44), 31(4.344025e-44) +; EG-NEXT: BFE_UINT T6.X, T1.Z, literal.x, 1, +; EG-NEXT: CNDE_INT T3.Y, T4.Y, T5.W, PS, BS:VEC_021/SCL_122 +; EG-NEXT: SETGE_UINT T3.Z, PV.W, T2.Z, +; EG-NEXT: SETE_INT T5.W, PV.Z, T2.W, +; EG-NEXT: SETGE_UINT * T9.W, PV.Z, T2.W, +; EG-NEXT: 28(3.923636e-44), 0(0.000000e+00) +; EG-NEXT: SUBB_UINT T7.X, T6.W, T2.X, +; EG-NEXT: CNDE_INT T4.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T3.Z, T4.W, T2.Z, BS:VEC_120/SCL_212 +; EG-NEXT: BIT_ALIGN_INT T5.W, PV.Y, T7.W, literal.x, BS:VEC_021/SCL_122 +; EG-NEXT: OR_INT * T6.W, T1.Y, PV.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SUBB_UINT T6.X, T4.W, T2.Z, +; EG-NEXT: SUB_INT T1.Y, T5.Z, T2.W, BS:VEC_021/SCL_122 +; EG-NEXT: SETGE_UINT T7.Z, PS, T0.Z, BS:VEC_021/SCL_122 +; EG-NEXT: SETE_INT T7.W, PV.W, T0.W, +; EG-NEXT: SETGE_UINT * T9.W, PV.W, T0.W, +; EG-NEXT: SUBB_UINT T10.X, T6.W, T0.Z, +; EG-NEXT: CNDE_INT T3.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T7.Z, T6.W, T0.Z, +; EG-NEXT: SUB_INT T7.W, PV.Y, PV.X, +; EG-NEXT: CNDE_INT * T4.W, T4.Y, T4.W, T3.Z, +; EG-NEXT: SUB_INT T6.X, T5.W, T0.W, BS:VEC_021/SCL_122 +; EG-NEXT: LSHL T1.Y, PS, 1, +; EG-NEXT: BFE_UINT T3.Z, T4.Z, literal.x, 1, +; EG-NEXT: CNDE_INT T7.W, T4.Y, T5.Z, PV.W, +; EG-NEXT: CNDE_INT * T6.W, PV.Y, T6.W, PV.Z, +; EG-NEXT: 27(3.783506e-44), 0(0.000000e+00) +; EG-NEXT: SUB_INT T12.X, T6.Z, T2.Y, +; EG-NEXT: LSHL T4.Y, PS, 1, +; EG-NEXT: BIT_ALIGN_INT * T5.Z, PV.W, T4.W, literal.x, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: ALU clause starting at 749: +; EG-NEXT: OR_INT T4.W, T1.Y, T3.Z, +; EG-NEXT: SUB_INT * T7.W, T6.X, T10.X, +; EG-NEXT: BFE_UINT T6.X, T1.Z, literal.x, 1, +; EG-NEXT: CNDE_INT 
T1.Y, T3.Y, T5.W, PS, BS:VEC_021/SCL_122 +; EG-NEXT: SETGE_UINT T3.Z, PV.W, T2.Z, +; EG-NEXT: SETE_INT T5.W, T5.Z, T2.W, BS:VEC_201 +; EG-NEXT: SETGE_UINT * T7.W, T5.Z, T2.W, +; EG-NEXT: 27(3.783506e-44), 0(0.000000e+00) +; EG-NEXT: CNDE_INT T10.X, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T3.Y, T4.W, T2.Z, +; EG-NEXT: BIT_ALIGN_INT T3.Z, PV.Y, T6.W, literal.x, +; EG-NEXT: OR_INT T5.W, T4.Y, PV.X, +; EG-NEXT: SUB_INT * T6.W, T12.X, T7.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: CNDE_INT T6.X, T11.X, T6.Z, PS, +; EG-NEXT: SETGE_UINT T1.Y, PV.W, T0.Z, BS:VEC_021/SCL_122 +; EG-NEXT: SETE_INT T6.Z, PV.Z, T0.W, BS:VEC_021/SCL_122 +; EG-NEXT: SETGE_UINT T6.W, PV.Z, T0.W, BS:VEC_021/SCL_122 +; EG-NEXT: CNDE_INT * T7.W, PV.X, T4.W, PV.Y, +; EG-NEXT: LSHL T7.X, PS, 1, +; EG-NEXT: CNDE_INT T1.Y, PV.Z, PV.W, PV.Y, +; EG-NEXT: SUB_INT T6.Z, T5.W, T0.Z, +; EG-NEXT: BIT_ALIGN_INT T6.W, PV.X, T8.W, literal.x, +; EG-NEXT: OR_INT * T8.W, T9.X, T8.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SUBB_UINT T6.X, T5.W, T0.Z, +; EG-NEXT: SUB_INT T3.Y, T3.Z, T0.W, +; EG-NEXT: SETGE_UINT T7.Z, PS, T2.X, +; EG-NEXT: SETE_INT T9.W, PV.W, T2.Y, +; EG-NEXT: SETGE_UINT * T10.W, PV.W, T2.Y, +; EG-NEXT: SUBB_UINT T8.X, T8.W, T2.X, +; EG-NEXT: CNDE_INT T4.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T7.Z, T8.W, T2.X, +; EG-NEXT: SUB_INT T9.W, PV.Y, PV.X, +; EG-NEXT: CNDE_INT * T5.W, T1.Y, T5.W, T6.Z, +; EG-NEXT: SUB_INT T6.X, T6.W, T2.Y, +; EG-NEXT: LSHL T3.Y, PS, 1, +; EG-NEXT: BFE_UINT T6.Z, T1.Z, literal.x, 1, +; EG-NEXT: CNDE_INT T9.W, T1.Y, T3.Z, PV.W, +; EG-NEXT: CNDE_INT * T8.W, PV.Y, T8.W, PV.Z, +; EG-NEXT: 26(3.643376e-44), 0(0.000000e+00) +; EG-NEXT: BFE_UINT T9.X, T4.Z, literal.x, 1, +; EG-NEXT: LSHL T1.Y, PS, 1, +; EG-NEXT: BIT_ALIGN_INT T3.Z, PV.W, T5.W, literal.y, +; EG-NEXT: OR_INT T5.W, PV.Y, PV.Z, +; EG-NEXT: SUB_INT * T9.W, PV.X, T8.X, +; EG-NEXT: 26(3.643376e-44), 31(4.344025e-44) +; EG-NEXT: BFE_UINT T6.X, T4.X, literal.x, 1, +; EG-NEXT: CNDE_INT T3.Y, T4.Y, 
T6.W, PS, BS:VEC_021/SCL_122 +; EG-NEXT: SETGE_UINT T6.Z, PV.W, T0.Z, +; EG-NEXT: SETE_INT T6.W, PV.Z, T0.W, +; EG-NEXT: SETGE_UINT * T9.W, PV.Z, T0.W, +; EG-NEXT: 28(3.923636e-44), 0(0.000000e+00) +; EG-NEXT: SUBB_UINT T8.X, T4.W, T2.Z, +; EG-NEXT: CNDE_INT T4.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T6.Z, T5.W, T0.Z, BS:VEC_120/SCL_212 +; EG-NEXT: BIT_ALIGN_INT T4.W, PV.Y, T8.W, literal.x, BS:VEC_021/SCL_122 +; EG-NEXT: OR_INT * T6.W, T1.Y, PV.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SUBB_UINT T6.X, T5.W, T0.Z, +; EG-NEXT: SUB_INT T1.Y, T3.Z, T0.W, +; EG-NEXT: SETGE_UINT T7.Z, PS, T2.X, +; EG-NEXT: SETE_INT T8.W, PV.W, T2.Y, +; EG-NEXT: SETGE_UINT * T9.W, PV.W, T2.Y, +; EG-NEXT: SUBB_UINT T11.X, T6.W, T2.X, +; EG-NEXT: CNDE_INT T3.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T7.Z, T6.W, T2.X, +; EG-NEXT: SUB_INT T8.W, PV.Y, PV.X, +; EG-NEXT: CNDE_INT * T5.W, T4.Y, T5.W, T6.Z, +; EG-NEXT: SUB_INT T6.X, T4.W, T2.Y, +; EG-NEXT: LSHL T1.Y, PS, 1, +; EG-NEXT: BFE_UINT T6.Z, T1.Z, literal.x, 1, +; EG-NEXT: CNDE_INT T8.W, T4.Y, T3.Z, PV.W, +; EG-NEXT: CNDE_INT * T6.W, PV.Y, T6.W, PV.Z, +; EG-NEXT: 25(3.503246e-44), 0(0.000000e+00) +; EG-NEXT: SUB_INT T12.X, T5.Z, T2.W, +; EG-NEXT: LSHL T4.Y, PS, 1, +; EG-NEXT: BIT_ALIGN_INT T3.Z, PV.W, T5.W, literal.x, BS:VEC_021/SCL_122 +; EG-NEXT: OR_INT T5.W, PV.Y, PV.Z, +; EG-NEXT: SUB_INT * T8.W, PV.X, T11.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: BFE_UINT T6.X, T4.X, literal.x, 1, +; EG-NEXT: CNDE_INT T1.Y, T3.Y, T4.W, PS, BS:VEC_021/SCL_122 +; EG-NEXT: SETGE_UINT T6.Z, PV.W, T0.Z, +; EG-NEXT: SETE_INT T4.W, PV.Z, T0.W, +; EG-NEXT: SETGE_UINT * T8.W, PV.Z, T0.W, +; EG-NEXT: 27(3.783506e-44), 0(0.000000e+00) +; EG-NEXT: CNDE_INT T11.X, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T3.Y, T5.W, T0.Z, +; EG-NEXT: BIT_ALIGN_INT T6.Z, PV.Y, T6.W, literal.x, +; EG-NEXT: OR_INT T4.W, T4.Y, PV.X, +; EG-NEXT: SUB_INT * T6.W, T12.X, T8.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: CNDE_INT T6.X, T10.X, 
T5.Z, PS, +; EG-NEXT: SETGE_UINT T1.Y, PV.W, T2.X, +; EG-NEXT: SETE_INT T5.Z, PV.Z, T2.Y, +; EG-NEXT: SETGE_UINT T6.W, PV.Z, T2.Y, +; EG-NEXT: CNDE_INT * T8.W, PV.X, T5.W, PV.Y, +; EG-NEXT: LSHL T8.X, PS, 1, +; EG-NEXT: CNDE_INT T1.Y, PV.Z, PV.W, PV.Y, +; EG-NEXT: SUB_INT T5.Z, T4.W, T2.X, BS:VEC_102/SCL_221 +; EG-NEXT: BIT_ALIGN_INT T6.W, PV.X, T7.W, literal.x, BS:VEC_021/SCL_122 +; EG-NEXT: OR_INT * T7.W, T7.X, T9.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SUBB_UINT T6.X, T4.W, T2.X, +; EG-NEXT: SUB_INT T3.Y, T6.Z, T2.Y, +; EG-NEXT: SETGE_UINT T7.Z, PS, T2.Z, +; EG-NEXT: SETE_INT T9.W, PV.W, T2.W, +; EG-NEXT: SETGE_UINT * T10.W, PV.W, T2.W, +; EG-NEXT: SUBB_UINT T7.X, T7.W, T2.Z, +; EG-NEXT: CNDE_INT T4.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T7.Z, T7.W, T2.Z, +; EG-NEXT: SUB_INT T9.W, PV.Y, PV.X, +; EG-NEXT: CNDE_INT * T4.W, T1.Y, T4.W, T5.Z, +; EG-NEXT: SUB_INT * T6.X, T6.W, T2.W, +; EG-NEXT: ALU clause starting at 864: +; EG-NEXT: LSHL T3.Y, T4.W, 1, +; EG-NEXT: BFE_UINT T5.Z, T4.X, literal.x, 1, +; EG-NEXT: CNDE_INT T9.W, T1.Y, T6.Z, T9.W, +; EG-NEXT: CNDE_INT * T7.W, T4.Y, T7.W, T7.Z, +; EG-NEXT: 26(3.643376e-44), 0(0.000000e+00) +; EG-NEXT: BFE_UINT T9.X, T1.Z, literal.x, 1, +; EG-NEXT: LSHL T1.Y, PS, 1, +; EG-NEXT: BIT_ALIGN_INT T6.Z, PV.W, T4.W, literal.y, +; EG-NEXT: OR_INT T4.W, PV.Y, PV.Z, +; EG-NEXT: SUB_INT * T9.W, T6.X, T7.X, +; EG-NEXT: 24(3.363116e-44), 31(4.344025e-44) +; EG-NEXT: BFE_UINT T6.X, T4.Z, literal.x, 1, +; EG-NEXT: CNDE_INT T3.Y, T4.Y, T6.W, PS, +; EG-NEXT: SETGE_UINT T5.Z, PV.W, T2.X, +; EG-NEXT: SETE_INT T6.W, PV.Z, T2.Y, +; EG-NEXT: SETGE_UINT * T9.W, PV.Z, T2.Y, +; EG-NEXT: 25(3.503246e-44), 0(0.000000e+00) +; EG-NEXT: SUBB_UINT T7.X, T5.W, T0.Z, +; EG-NEXT: CNDE_INT T4.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T5.Z, T4.W, T2.X, BS:VEC_120/SCL_212 +; EG-NEXT: BIT_ALIGN_INT T5.W, PV.Y, T7.W, literal.x, BS:VEC_021/SCL_122 +; EG-NEXT: OR_INT * T6.W, T1.Y, PV.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: 
SUBB_UINT T6.X, T4.W, T2.X, +; EG-NEXT: SUB_INT T1.Y, T6.Z, T2.Y, +; EG-NEXT: SETGE_UINT T7.Z, PS, T2.Z, +; EG-NEXT: SETE_INT T7.W, PV.W, T2.W, +; EG-NEXT: SETGE_UINT * T9.W, PV.W, T2.W, +; EG-NEXT: SUBB_UINT T10.X, T6.W, T2.Z, +; EG-NEXT: CNDE_INT T3.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T7.Z, T6.W, T2.Z, +; EG-NEXT: SUB_INT T7.W, PV.Y, PV.X, +; EG-NEXT: CNDE_INT * T4.W, T4.Y, T4.W, T5.Z, +; EG-NEXT: SUB_INT T6.X, T5.W, T2.W, BS:VEC_021/SCL_122 +; EG-NEXT: LSHL T1.Y, PS, 1, +; EG-NEXT: BFE_UINT T5.Z, T4.X, literal.x, 1, +; EG-NEXT: CNDE_INT T7.W, T4.Y, T6.Z, PV.W, +; EG-NEXT: CNDE_INT * T6.W, PV.Y, T6.W, PV.Z, +; EG-NEXT: 25(3.503246e-44), 0(0.000000e+00) +; EG-NEXT: SUB_INT T12.X, T3.Z, T0.W, +; EG-NEXT: LSHL T4.Y, PS, 1, +; EG-NEXT: BIT_ALIGN_INT T6.Z, PV.W, T4.W, literal.x, BS:VEC_021/SCL_122 +; EG-NEXT: OR_INT T4.W, PV.Y, PV.Z, +; EG-NEXT: SUB_INT * T7.W, PV.X, T10.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: BFE_UINT T6.X, T4.Z, literal.x, 1, +; EG-NEXT: CNDE_INT T1.Y, T3.Y, T5.W, PS, +; EG-NEXT: SETGE_UINT T5.Z, PV.W, T2.X, +; EG-NEXT: SETE_INT T5.W, PV.Z, T2.Y, +; EG-NEXT: SETGE_UINT * T7.W, PV.Z, T2.Y, +; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; EG-NEXT: CNDE_INT T10.X, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T3.Y, T4.W, T2.X, BS:VEC_102/SCL_221 +; EG-NEXT: BIT_ALIGN_INT T5.Z, PV.Y, T6.W, literal.x, BS:VEC_021/SCL_122 +; EG-NEXT: OR_INT T5.W, T4.Y, PV.X, +; EG-NEXT: SUB_INT * T6.W, T12.X, T7.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: CNDE_INT T6.X, T11.X, T3.Z, PS, +; EG-NEXT: SETGE_UINT T1.Y, PV.W, T2.Z, BS:VEC_021/SCL_122 +; EG-NEXT: SETE_INT T3.Z, PV.Z, T2.W, BS:VEC_021/SCL_122 +; EG-NEXT: SETGE_UINT T6.W, PV.Z, T2.W, BS:VEC_021/SCL_122 +; EG-NEXT: CNDE_INT * T7.W, PV.X, T4.W, PV.Y, +; EG-NEXT: LSHL T7.X, PS, 1, +; EG-NEXT: CNDE_INT T1.Y, PV.Z, PV.W, PV.Y, +; EG-NEXT: SUB_INT T3.Z, T5.W, T2.Z, +; EG-NEXT: BIT_ALIGN_INT T6.W, PV.X, T8.W, literal.x, +; EG-NEXT: OR_INT * T8.W, T8.X, T9.X, +; EG-NEXT: 31(4.344025e-44), 
0(0.000000e+00) +; EG-NEXT: SUBB_UINT T6.X, T5.W, T2.Z, +; EG-NEXT: SUB_INT T3.Y, T5.Z, T2.W, BS:VEC_021/SCL_122 +; EG-NEXT: SETGE_UINT T7.Z, PS, T0.Z, BS:VEC_021/SCL_122 +; EG-NEXT: SETE_INT T9.W, PV.W, T0.W, +; EG-NEXT: SETGE_UINT * T10.W, PV.W, T0.W, +; EG-NEXT: SUBB_UINT T8.X, T8.W, T0.Z, +; EG-NEXT: CNDE_INT T4.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T7.Z, T8.W, T0.Z, +; EG-NEXT: SUB_INT T9.W, PV.Y, PV.X, +; EG-NEXT: CNDE_INT * T5.W, T1.Y, T5.W, T3.Z, +; EG-NEXT: SUB_INT T6.X, T6.W, T0.W, BS:VEC_021/SCL_122 +; EG-NEXT: LSHL T3.Y, PS, 1, +; EG-NEXT: BFE_UINT T3.Z, T4.Z, literal.x, 1, +; EG-NEXT: CNDE_INT T9.W, T1.Y, T5.Z, PV.W, +; EG-NEXT: CNDE_INT * T8.W, PV.Y, T8.W, PV.Z, +; EG-NEXT: 23(3.222986e-44), 0(0.000000e+00) +; EG-NEXT: BFE_UINT T9.X, T4.X, literal.x, 1, +; EG-NEXT: LSHL T1.Y, PS, 1, +; EG-NEXT: BIT_ALIGN_INT T5.Z, PV.W, T5.W, literal.y, +; EG-NEXT: OR_INT T5.W, PV.Y, PV.Z, +; EG-NEXT: SUB_INT * T9.W, PV.X, T8.X, +; EG-NEXT: 24(3.363116e-44), 31(4.344025e-44) +; EG-NEXT: BFE_UINT T6.X, T1.Z, literal.x, 1, +; EG-NEXT: CNDE_INT T3.Y, T4.Y, T6.W, PS, BS:VEC_021/SCL_122 +; EG-NEXT: SETGE_UINT T3.Z, PV.W, T2.Z, +; EG-NEXT: SETE_INT T6.W, PV.Z, T2.W, +; EG-NEXT: SETGE_UINT * T9.W, PV.Z, T2.W, +; EG-NEXT: 23(3.222986e-44), 0(0.000000e+00) +; EG-NEXT: SUBB_UINT T8.X, T4.W, T2.X, +; EG-NEXT: CNDE_INT T4.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T3.Z, T5.W, T2.Z, BS:VEC_120/SCL_212 +; EG-NEXT: BIT_ALIGN_INT T4.W, PV.Y, T8.W, literal.x, BS:VEC_021/SCL_122 +; EG-NEXT: OR_INT * T6.W, T1.Y, PV.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SUBB_UINT T6.X, T5.W, T2.Z, +; EG-NEXT: SUB_INT T1.Y, T5.Z, T2.W, BS:VEC_021/SCL_122 +; EG-NEXT: SETGE_UINT T7.Z, PS, T0.Z, BS:VEC_021/SCL_122 +; EG-NEXT: SETE_INT T8.W, PV.W, T0.W, +; EG-NEXT: SETGE_UINT * T9.W, PV.W, T0.W, +; EG-NEXT: SUBB_UINT T11.X, T6.W, T0.Z, +; EG-NEXT: CNDE_INT T3.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T7.Z, T6.W, T0.Z, +; EG-NEXT: SUB_INT T8.W, PV.Y, PV.X, +; EG-NEXT: CNDE_INT * T5.W, T4.Y, 
T5.W, T3.Z, +; EG-NEXT: SUB_INT T6.X, T4.W, T0.W, +; EG-NEXT: LSHL * T1.Y, PS, 1, +; EG-NEXT: ALU clause starting at 978: +; EG-NEXT: BFE_UINT T3.Z, T4.Z, literal.x, 1, BS:VEC_120/SCL_212 +; EG-NEXT: CNDE_INT T8.W, T4.Y, T5.Z, T8.W, BS:VEC_120/SCL_212 +; EG-NEXT: CNDE_INT * T6.W, T3.Y, T6.W, T7.Z, +; EG-NEXT: 22(3.082857e-44), 0(0.000000e+00) +; EG-NEXT: SUB_INT T12.X, T6.Z, T2.Y, +; EG-NEXT: LSHL T4.Y, PS, 1, +; EG-NEXT: BIT_ALIGN_INT T5.Z, PV.W, T5.W, literal.x, +; EG-NEXT: OR_INT T5.W, T1.Y, PV.Z, +; EG-NEXT: SUB_INT * T8.W, T6.X, T11.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: BFE_UINT T6.X, T1.Z, literal.x, 1, +; EG-NEXT: CNDE_INT T1.Y, T3.Y, T4.W, PS, BS:VEC_021/SCL_122 +; EG-NEXT: SETGE_UINT T3.Z, PV.W, T2.Z, +; EG-NEXT: SETE_INT T4.W, PV.Z, T2.W, +; EG-NEXT: SETGE_UINT * T8.W, PV.Z, T2.W, +; EG-NEXT: 22(3.082857e-44), 0(0.000000e+00) +; EG-NEXT: CNDE_INT T11.X, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T3.Y, T5.W, T2.Z, +; EG-NEXT: BIT_ALIGN_INT T3.Z, PV.Y, T6.W, literal.x, +; EG-NEXT: OR_INT T4.W, T4.Y, PV.X, +; EG-NEXT: SUB_INT * T6.W, T12.X, T8.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: CNDE_INT T6.X, T10.X, T6.Z, PS, +; EG-NEXT: SETGE_UINT T1.Y, PV.W, T0.Z, BS:VEC_021/SCL_122 +; EG-NEXT: SETE_INT T6.Z, PV.Z, T0.W, BS:VEC_021/SCL_122 +; EG-NEXT: SETGE_UINT T6.W, PV.Z, T0.W, BS:VEC_021/SCL_122 +; EG-NEXT: CNDE_INT * T8.W, PV.X, T5.W, PV.Y, +; EG-NEXT: LSHL T8.X, PS, 1, +; EG-NEXT: CNDE_INT T1.Y, PV.Z, PV.W, PV.Y, +; EG-NEXT: SUB_INT T6.Z, T4.W, T0.Z, +; EG-NEXT: BIT_ALIGN_INT T6.W, PV.X, T7.W, literal.x, +; EG-NEXT: OR_INT * T7.W, T7.X, T9.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SUBB_UINT T6.X, T4.W, T0.Z, +; EG-NEXT: SUB_INT T3.Y, T3.Z, T0.W, +; EG-NEXT: SETGE_UINT T7.Z, PS, T2.X, +; EG-NEXT: SETE_INT T9.W, PV.W, T2.Y, +; EG-NEXT: SETGE_UINT * T10.W, PV.W, T2.Y, +; EG-NEXT: SUBB_UINT T7.X, T7.W, T2.X, +; EG-NEXT: CNDE_INT T4.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T7.Z, T7.W, T2.X, +; EG-NEXT: SUB_INT 
T9.W, PV.Y, PV.X, +; EG-NEXT: CNDE_INT * T4.W, T1.Y, T4.W, T6.Z, +; EG-NEXT: SUB_INT T6.X, T6.W, T2.Y, +; EG-NEXT: LSHL T3.Y, PS, 1, +; EG-NEXT: BFE_UINT T6.Z, T1.Z, literal.x, 1, +; EG-NEXT: CNDE_INT T9.W, T1.Y, T3.Z, PV.W, +; EG-NEXT: CNDE_INT * T7.W, PV.Y, T7.W, PV.Z, +; EG-NEXT: 21(2.942727e-44), 0(0.000000e+00) +; EG-NEXT: BFE_UINT T9.X, T4.Z, literal.x, 1, +; EG-NEXT: LSHL T1.Y, PS, 1, +; EG-NEXT: BIT_ALIGN_INT T3.Z, PV.W, T4.W, literal.y, +; EG-NEXT: OR_INT T4.W, PV.Y, PV.Z, +; EG-NEXT: SUB_INT * T9.W, PV.X, T7.X, +; EG-NEXT: 21(2.942727e-44), 31(4.344025e-44) +; EG-NEXT: BFE_UINT T6.X, T4.X, literal.x, 1, +; EG-NEXT: CNDE_INT T3.Y, T4.Y, T6.W, PS, BS:VEC_021/SCL_122 +; EG-NEXT: SETGE_UINT T6.Z, PV.W, T0.Z, +; EG-NEXT: SETE_INT T6.W, PV.Z, T0.W, +; EG-NEXT: SETGE_UINT * T9.W, PV.Z, T0.W, +; EG-NEXT: 23(3.222986e-44), 0(0.000000e+00) +; EG-NEXT: SUBB_UINT T7.X, T5.W, T2.Z, +; EG-NEXT: CNDE_INT T4.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T6.Z, T4.W, T0.Z, BS:VEC_120/SCL_212 +; EG-NEXT: BIT_ALIGN_INT T5.W, PV.Y, T7.W, literal.x, BS:VEC_021/SCL_122 +; EG-NEXT: OR_INT * T6.W, T1.Y, PV.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SUBB_UINT T6.X, T4.W, T0.Z, +; EG-NEXT: SUB_INT T1.Y, T3.Z, T0.W, +; EG-NEXT: SETGE_UINT T7.Z, PS, T2.X, +; EG-NEXT: SETE_INT T7.W, PV.W, T2.Y, +; EG-NEXT: SETGE_UINT * T9.W, PV.W, T2.Y, +; EG-NEXT: SUBB_UINT T10.X, T6.W, T2.X, +; EG-NEXT: CNDE_INT T3.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T7.Z, T6.W, T2.X, +; EG-NEXT: SUB_INT T7.W, PV.Y, PV.X, +; EG-NEXT: CNDE_INT * T4.W, T4.Y, T4.W, T6.Z, +; EG-NEXT: SUB_INT T6.X, T5.W, T2.Y, +; EG-NEXT: LSHL T1.Y, PS, 1, +; EG-NEXT: BFE_UINT T6.Z, T1.Z, literal.x, 1, +; EG-NEXT: CNDE_INT T7.W, T4.Y, T3.Z, PV.W, +; EG-NEXT: CNDE_INT * T6.W, PV.Y, T6.W, PV.Z, +; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00) +; EG-NEXT: SUB_INT T12.X, T5.Z, T2.W, +; EG-NEXT: LSHL T4.Y, PS, 1, +; EG-NEXT: BIT_ALIGN_INT T3.Z, PV.W, T4.W, literal.x, BS:VEC_021/SCL_122 +; EG-NEXT: OR_INT T4.W, PV.Y, PV.Z, +; 
EG-NEXT: SUB_INT * T7.W, PV.X, T10.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: BFE_UINT T6.X, T4.X, literal.x, 1, +; EG-NEXT: CNDE_INT T1.Y, T3.Y, T5.W, PS, BS:VEC_021/SCL_122 +; EG-NEXT: SETGE_UINT T6.Z, PV.W, T0.Z, +; EG-NEXT: SETE_INT T5.W, PV.Z, T0.W, +; EG-NEXT: SETGE_UINT * T7.W, PV.Z, T0.W, +; EG-NEXT: 22(3.082857e-44), 0(0.000000e+00) +; EG-NEXT: CNDE_INT T10.X, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T3.Y, T4.W, T0.Z, +; EG-NEXT: BIT_ALIGN_INT T6.Z, PV.Y, T6.W, literal.x, +; EG-NEXT: OR_INT T5.W, T4.Y, PV.X, +; EG-NEXT: SUB_INT * T6.W, T12.X, T7.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: CNDE_INT T6.X, T11.X, T5.Z, PS, +; EG-NEXT: SETGE_UINT T1.Y, PV.W, T2.X, +; EG-NEXT: SETE_INT T5.Z, PV.Z, T2.Y, +; EG-NEXT: SETGE_UINT T6.W, PV.Z, T2.Y, +; EG-NEXT: CNDE_INT * T7.W, PV.X, T4.W, PV.Y, +; EG-NEXT: LSHL T7.X, PS, 1, +; EG-NEXT: CNDE_INT T1.Y, PV.Z, PV.W, PV.Y, +; EG-NEXT: SUB_INT T5.Z, T5.W, T2.X, BS:VEC_102/SCL_221 +; EG-NEXT: BIT_ALIGN_INT T6.W, PV.X, T8.W, literal.x, BS:VEC_021/SCL_122 +; EG-NEXT: OR_INT * T8.W, T8.X, T9.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SUBB_UINT T6.X, T5.W, T2.X, +; EG-NEXT: SUB_INT T3.Y, T6.Z, T2.Y, +; EG-NEXT: SETGE_UINT * T7.Z, PS, T2.Z, +; EG-NEXT: ALU clause starting at 1093: +; EG-NEXT: SETE_INT T9.W, T6.W, T2.W, +; EG-NEXT: SETGE_UINT * T10.W, T6.W, T2.W, +; EG-NEXT: SUBB_UINT T8.X, T8.W, T2.Z, +; EG-NEXT: CNDE_INT T4.Y, PV.W, PS, T7.Z, +; EG-NEXT: SUB_INT T7.Z, T8.W, T2.Z, +; EG-NEXT: SUB_INT T9.W, T3.Y, T6.X, +; EG-NEXT: CNDE_INT * T5.W, T1.Y, T5.W, T5.Z, +; EG-NEXT: SUB_INT T6.X, T6.W, T2.W, BS:VEC_021/SCL_122 +; EG-NEXT: LSHL T3.Y, PS, 1, +; EG-NEXT: BFE_UINT T5.Z, T4.X, literal.x, 1, +; EG-NEXT: CNDE_INT T9.W, T1.Y, T6.Z, PV.W, +; EG-NEXT: CNDE_INT * T8.W, PV.Y, T8.W, PV.Z, +; EG-NEXT: 21(2.942727e-44), 0(0.000000e+00) +; EG-NEXT: BFE_UINT T9.X, T1.Z, literal.x, 1, +; EG-NEXT: LSHL T1.Y, PS, 1, +; EG-NEXT: BIT_ALIGN_INT T6.Z, PV.W, T5.W, literal.y, +; EG-NEXT: OR_INT 
T5.W, PV.Y, PV.Z, +; EG-NEXT: SUB_INT * T9.W, PV.X, T8.X, +; EG-NEXT: 19(2.662467e-44), 31(4.344025e-44) +; EG-NEXT: BFE_UINT T6.X, T4.Z, literal.x, 1, +; EG-NEXT: CNDE_INT T3.Y, T4.Y, T6.W, PS, +; EG-NEXT: SETGE_UINT T5.Z, PV.W, T2.X, +; EG-NEXT: SETE_INT T6.W, PV.Z, T2.Y, +; EG-NEXT: SETGE_UINT * T9.W, PV.Z, T2.Y, +; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00) +; EG-NEXT: SUBB_UINT T8.X, T4.W, T0.Z, +; EG-NEXT: CNDE_INT T4.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T5.Z, T5.W, T2.X, BS:VEC_120/SCL_212 +; EG-NEXT: BIT_ALIGN_INT T4.W, PV.Y, T8.W, literal.x, BS:VEC_021/SCL_122 +; EG-NEXT: OR_INT * T6.W, T1.Y, PV.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SUBB_UINT T6.X, T5.W, T2.X, +; EG-NEXT: SUB_INT T1.Y, T6.Z, T2.Y, +; EG-NEXT: SETGE_UINT T7.Z, PS, T2.Z, +; EG-NEXT: SETE_INT T8.W, PV.W, T2.W, +; EG-NEXT: SETGE_UINT * T9.W, PV.W, T2.W, +; EG-NEXT: SUBB_UINT T11.X, T6.W, T2.Z, +; EG-NEXT: CNDE_INT T3.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T7.Z, T6.W, T2.Z, +; EG-NEXT: SUB_INT T8.W, PV.Y, PV.X, +; EG-NEXT: CNDE_INT * T5.W, T4.Y, T5.W, T5.Z, +; EG-NEXT: SUB_INT T6.X, T4.W, T2.W, BS:VEC_021/SCL_122 +; EG-NEXT: LSHL T1.Y, PS, 1, +; EG-NEXT: BFE_UINT T5.Z, T4.X, literal.x, 1, +; EG-NEXT: CNDE_INT T8.W, T4.Y, T6.Z, PV.W, +; EG-NEXT: CNDE_INT * T6.W, PV.Y, T6.W, PV.Z, +; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00) +; EG-NEXT: SUB_INT T12.X, T3.Z, T0.W, +; EG-NEXT: LSHL T4.Y, PS, 1, +; EG-NEXT: BIT_ALIGN_INT T6.Z, PV.W, T5.W, literal.x, BS:VEC_021/SCL_122 +; EG-NEXT: OR_INT T5.W, PV.Y, PV.Z, +; EG-NEXT: SUB_INT * T8.W, PV.X, T11.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: BFE_UINT T6.X, T4.Z, literal.x, 1, +; EG-NEXT: CNDE_INT T1.Y, T3.Y, T4.W, PS, +; EG-NEXT: SETGE_UINT T5.Z, PV.W, T2.X, +; EG-NEXT: SETE_INT T4.W, PV.Z, T2.Y, +; EG-NEXT: SETGE_UINT * T8.W, PV.Z, T2.Y, +; EG-NEXT: 19(2.662467e-44), 0(0.000000e+00) +; EG-NEXT: CNDE_INT T11.X, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T3.Y, T5.W, T2.X, BS:VEC_102/SCL_221 +; EG-NEXT: BIT_ALIGN_INT T5.Z, 
PV.Y, T6.W, literal.x, BS:VEC_021/SCL_122 +; EG-NEXT: OR_INT T4.W, T4.Y, PV.X, +; EG-NEXT: SUB_INT * T6.W, T12.X, T8.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: CNDE_INT T6.X, T10.X, T3.Z, PS, +; EG-NEXT: SETGE_UINT T1.Y, PV.W, T2.Z, BS:VEC_021/SCL_122 +; EG-NEXT: SETE_INT T3.Z, PV.Z, T2.W, BS:VEC_021/SCL_122 +; EG-NEXT: SETGE_UINT T6.W, PV.Z, T2.W, BS:VEC_021/SCL_122 +; EG-NEXT: CNDE_INT * T8.W, PV.X, T5.W, PV.Y, +; EG-NEXT: LSHL T8.X, PS, 1, +; EG-NEXT: CNDE_INT T1.Y, PV.Z, PV.W, PV.Y, +; EG-NEXT: SUB_INT T3.Z, T4.W, T2.Z, +; EG-NEXT: BIT_ALIGN_INT T6.W, PV.X, T7.W, literal.x, +; EG-NEXT: OR_INT * T7.W, T7.X, T9.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SUBB_UINT T6.X, T4.W, T2.Z, +; EG-NEXT: SUB_INT T3.Y, T5.Z, T2.W, BS:VEC_021/SCL_122 +; EG-NEXT: SETGE_UINT T7.Z, PS, T0.Z, BS:VEC_021/SCL_122 +; EG-NEXT: SETE_INT T9.W, PV.W, T0.W, +; EG-NEXT: SETGE_UINT * T10.W, PV.W, T0.W, +; EG-NEXT: SUBB_UINT T7.X, T7.W, T0.Z, +; EG-NEXT: CNDE_INT T4.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T7.Z, T7.W, T0.Z, +; EG-NEXT: SUB_INT T9.W, PV.Y, PV.X, +; EG-NEXT: CNDE_INT * T4.W, T1.Y, T4.W, T3.Z, +; EG-NEXT: SUB_INT T6.X, T6.W, T0.W, BS:VEC_021/SCL_122 +; EG-NEXT: LSHL T3.Y, PS, 1, +; EG-NEXT: BFE_UINT T3.Z, T4.Z, literal.x, 1, +; EG-NEXT: CNDE_INT T9.W, T1.Y, T5.Z, PV.W, +; EG-NEXT: CNDE_INT * T7.W, PV.Y, T7.W, PV.Z, +; EG-NEXT: 18(2.522337e-44), 0(0.000000e+00) +; EG-NEXT: BFE_UINT T9.X, T4.X, literal.x, 1, +; EG-NEXT: LSHL T1.Y, PS, 1, +; EG-NEXT: BIT_ALIGN_INT T5.Z, PV.W, T4.W, literal.y, +; EG-NEXT: OR_INT T4.W, PV.Y, PV.Z, +; EG-NEXT: SUB_INT * T9.W, PV.X, T7.X, +; EG-NEXT: 19(2.662467e-44), 31(4.344025e-44) +; EG-NEXT: BFE_UINT T6.X, T1.Z, literal.x, 1, +; EG-NEXT: CNDE_INT T3.Y, T4.Y, T6.W, PS, BS:VEC_021/SCL_122 +; EG-NEXT: SETGE_UINT T3.Z, PV.W, T2.Z, +; EG-NEXT: SETE_INT T6.W, PV.Z, T2.W, +; EG-NEXT: SETGE_UINT * T9.W, PV.Z, T2.W, +; EG-NEXT: 18(2.522337e-44), 0(0.000000e+00) +; EG-NEXT: SUBB_UINT T7.X, T5.W, T2.X, +; EG-NEXT: CNDE_INT 
T4.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T3.Z, T4.W, T2.Z, BS:VEC_120/SCL_212 +; EG-NEXT: BIT_ALIGN_INT T5.W, PV.Y, T7.W, literal.x, BS:VEC_021/SCL_122 +; EG-NEXT: OR_INT * T6.W, T1.Y, PV.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SUBB_UINT T6.X, T4.W, T2.Z, +; EG-NEXT: SUB_INT T1.Y, T5.Z, T2.W, +; EG-NEXT: SETGE_UINT T7.Z, PS, T0.Z, BS:VEC_021/SCL_122 +; EG-NEXT: SETE_INT * T7.W, PV.W, T0.W, BS:VEC_021/SCL_122 +; EG-NEXT: ALU clause starting at 1207: +; EG-NEXT: SETGE_UINT * T9.W, T5.W, T0.W, +; EG-NEXT: SUBB_UINT T10.X, T6.W, T0.Z, BS:VEC_021/SCL_122 +; EG-NEXT: CNDE_INT T3.Y, T7.W, PV.W, T7.Z, BS:VEC_201 +; EG-NEXT: SUB_INT T7.Z, T6.W, T0.Z, BS:VEC_021/SCL_122 +; EG-NEXT: SUB_INT T7.W, T1.Y, T6.X, +; EG-NEXT: CNDE_INT * T4.W, T4.Y, T4.W, T3.Z, +; EG-NEXT: SUB_INT T6.X, T5.W, T0.W, BS:VEC_021/SCL_122 +; EG-NEXT: LSHL T1.Y, PS, 1, +; EG-NEXT: BFE_UINT T3.Z, T4.Z, literal.x, 1, +; EG-NEXT: CNDE_INT T7.W, T4.Y, T5.Z, PV.W, +; EG-NEXT: CNDE_INT * T6.W, PV.Y, T6.W, PV.Z, +; EG-NEXT: 17(2.382207e-44), 0(0.000000e+00) +; EG-NEXT: SUB_INT T12.X, T6.Z, T2.Y, +; EG-NEXT: LSHL T4.Y, PS, 1, +; EG-NEXT: BIT_ALIGN_INT T5.Z, PV.W, T4.W, literal.x, +; EG-NEXT: OR_INT T4.W, PV.Y, PV.Z, +; EG-NEXT: SUB_INT * T7.W, PV.X, T10.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: BFE_UINT T6.X, T1.Z, literal.x, 1, +; EG-NEXT: CNDE_INT T1.Y, T3.Y, T5.W, PS, BS:VEC_021/SCL_122 +; EG-NEXT: SETGE_UINT T3.Z, PV.W, T2.Z, +; EG-NEXT: SETE_INT T5.W, PV.Z, T2.W, +; EG-NEXT: SETGE_UINT * T7.W, PV.Z, T2.W, +; EG-NEXT: 17(2.382207e-44), 0(0.000000e+00) +; EG-NEXT: CNDE_INT T10.X, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T3.Y, T4.W, T2.Z, +; EG-NEXT: BIT_ALIGN_INT T3.Z, PV.Y, T6.W, literal.x, +; EG-NEXT: OR_INT T5.W, T4.Y, PV.X, +; EG-NEXT: SUB_INT * T6.W, T12.X, T7.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: CNDE_INT T6.X, T11.X, T6.Z, PS, +; EG-NEXT: SETGE_UINT T1.Y, PV.W, T0.Z, BS:VEC_021/SCL_122 +; EG-NEXT: SETE_INT T6.Z, PV.Z, T0.W, BS:VEC_021/SCL_122 +; 
EG-NEXT: SETGE_UINT T6.W, PV.Z, T0.W, BS:VEC_021/SCL_122 +; EG-NEXT: CNDE_INT * T7.W, PV.X, T4.W, PV.Y, +; EG-NEXT: LSHL T7.X, PS, 1, +; EG-NEXT: CNDE_INT T1.Y, PV.Z, PV.W, PV.Y, +; EG-NEXT: SUB_INT T6.Z, T5.W, T0.Z, +; EG-NEXT: BIT_ALIGN_INT T6.W, PV.X, T8.W, literal.x, +; EG-NEXT: OR_INT * T8.W, T8.X, T9.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SUBB_UINT T6.X, T5.W, T0.Z, +; EG-NEXT: SUB_INT T3.Y, T3.Z, T0.W, +; EG-NEXT: SETGE_UINT T7.Z, PS, T2.X, +; EG-NEXT: SETE_INT T9.W, PV.W, T2.Y, +; EG-NEXT: SETGE_UINT * T10.W, PV.W, T2.Y, +; EG-NEXT: SUBB_UINT T8.X, T8.W, T2.X, +; EG-NEXT: CNDE_INT T4.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T7.Z, T8.W, T2.X, +; EG-NEXT: SUB_INT T9.W, PV.Y, PV.X, +; EG-NEXT: CNDE_INT * T5.W, T1.Y, T5.W, T6.Z, +; EG-NEXT: SUB_INT T6.X, T6.W, T2.Y, +; EG-NEXT: LSHL T3.Y, PS, 1, +; EG-NEXT: BFE_UINT T6.Z, T1.Z, literal.x, 1, +; EG-NEXT: CNDE_INT T9.W, T1.Y, T3.Z, PV.W, +; EG-NEXT: CNDE_INT * T8.W, PV.Y, T8.W, PV.Z, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: BFE_UINT T9.X, T4.Z, literal.x, 1, +; EG-NEXT: LSHL T1.Y, PS, 1, +; EG-NEXT: BIT_ALIGN_INT T3.Z, PV.W, T5.W, literal.y, +; EG-NEXT: OR_INT T5.W, PV.Y, PV.Z, +; EG-NEXT: SUB_INT * T9.W, PV.X, T8.X, +; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44) +; EG-NEXT: BFE_UINT T6.X, T4.X, literal.x, 1, +; EG-NEXT: CNDE_INT T3.Y, T4.Y, T6.W, PS, BS:VEC_021/SCL_122 +; EG-NEXT: SETGE_UINT T6.Z, PV.W, T0.Z, +; EG-NEXT: SETE_INT T6.W, PV.Z, T0.W, +; EG-NEXT: SETGE_UINT * T9.W, PV.Z, T0.W, +; EG-NEXT: 18(2.522337e-44), 0(0.000000e+00) +; EG-NEXT: SUBB_UINT T8.X, T4.W, T2.Z, +; EG-NEXT: CNDE_INT T4.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T6.Z, T5.W, T0.Z, BS:VEC_120/SCL_212 +; EG-NEXT: BIT_ALIGN_INT T4.W, PV.Y, T8.W, literal.x, BS:VEC_021/SCL_122 +; EG-NEXT: OR_INT * T6.W, T1.Y, PV.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SUBB_UINT T6.X, T5.W, T0.Z, +; EG-NEXT: SUB_INT T1.Y, T3.Z, T0.W, +; EG-NEXT: SETGE_UINT T7.Z, PS, T2.X, +; EG-NEXT: SETE_INT T8.W, PV.W, 
T2.Y, +; EG-NEXT: SETGE_UINT * T9.W, PV.W, T2.Y, +; EG-NEXT: SUBB_UINT T11.X, T6.W, T2.X, +; EG-NEXT: CNDE_INT T3.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T7.Z, T6.W, T2.X, +; EG-NEXT: SUB_INT T8.W, PV.Y, PV.X, +; EG-NEXT: CNDE_INT * T5.W, T4.Y, T5.W, T6.Z, +; EG-NEXT: SUB_INT T6.X, T4.W, T2.Y, +; EG-NEXT: LSHL T1.Y, PS, 1, +; EG-NEXT: BFE_UINT T6.Z, T1.Z, literal.x, 1, +; EG-NEXT: CNDE_INT T8.W, T4.Y, T3.Z, PV.W, +; EG-NEXT: CNDE_INT * T6.W, PV.Y, T6.W, PV.Z, +; EG-NEXT: 15(2.101948e-44), 0(0.000000e+00) +; EG-NEXT: SUB_INT T12.X, T5.Z, T2.W, +; EG-NEXT: LSHL T4.Y, PS, 1, +; EG-NEXT: BIT_ALIGN_INT T3.Z, PV.W, T5.W, literal.x, BS:VEC_021/SCL_122 +; EG-NEXT: OR_INT T5.W, PV.Y, PV.Z, +; EG-NEXT: SUB_INT * T8.W, PV.X, T11.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: BFE_UINT T6.X, T4.X, literal.x, 1, +; EG-NEXT: CNDE_INT T1.Y, T3.Y, T4.W, PS, BS:VEC_021/SCL_122 +; EG-NEXT: SETGE_UINT T6.Z, PV.W, T0.Z, +; EG-NEXT: SETE_INT T4.W, PV.Z, T0.W, +; EG-NEXT: SETGE_UINT * T8.W, PV.Z, T0.W, +; EG-NEXT: 17(2.382207e-44), 0(0.000000e+00) +; EG-NEXT: CNDE_INT T11.X, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T3.Y, T5.W, T0.Z, +; EG-NEXT: BIT_ALIGN_INT T6.Z, PV.Y, T6.W, literal.x, +; EG-NEXT: OR_INT T4.W, T4.Y, PV.X, +; EG-NEXT: SUB_INT * T6.W, T12.X, T8.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: CNDE_INT T6.X, T10.X, T5.Z, PS, +; EG-NEXT: SETGE_UINT T1.Y, PV.W, T2.X, +; EG-NEXT: SETE_INT T5.Z, PV.Z, T2.Y, +; EG-NEXT: SETGE_UINT T6.W, PV.Z, T2.Y, +; EG-NEXT: CNDE_INT * T8.W, PV.X, T5.W, PV.Y, +; EG-NEXT: LSHL * T8.X, PS, 1, +; EG-NEXT: ALU clause starting at 1322: +; EG-NEXT: CNDE_INT T1.Y, T5.Z, T6.W, T1.Y, +; EG-NEXT: SUB_INT T5.Z, T4.W, T2.X, +; EG-NEXT: BIT_ALIGN_INT * T6.W, T6.X, T7.W, literal.x, BS:VEC_021/SCL_122 +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: OR_INT * T7.W, T7.X, T9.X, +; EG-NEXT: SUBB_UINT T6.X, T4.W, T2.X, +; EG-NEXT: SUB_INT T3.Y, T6.Z, T2.Y, +; EG-NEXT: SETGE_UINT T7.Z, PV.W, T2.Z, +; EG-NEXT: SETE_INT T9.W, T6.W, 
T2.W, BS:VEC_210 +; EG-NEXT: SETGE_UINT * T10.W, T6.W, T2.W, +; EG-NEXT: SUBB_UINT T7.X, T7.W, T2.Z, +; EG-NEXT: CNDE_INT T4.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T7.Z, T7.W, T2.Z, +; EG-NEXT: SUB_INT T9.W, PV.Y, PV.X, +; EG-NEXT: CNDE_INT * T4.W, T1.Y, T4.W, T5.Z, +; EG-NEXT: SUB_INT T6.X, T6.W, T2.W, BS:VEC_021/SCL_122 +; EG-NEXT: LSHL T3.Y, PS, 1, +; EG-NEXT: BFE_UINT T5.Z, T4.X, literal.x, 1, +; EG-NEXT: CNDE_INT T9.W, T1.Y, T6.Z, PV.W, +; EG-NEXT: CNDE_INT * T7.W, PV.Y, T7.W, PV.Z, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: BFE_UINT T9.X, T1.Z, literal.x, 1, +; EG-NEXT: LSHL T1.Y, PS, 1, +; EG-NEXT: BIT_ALIGN_INT T6.Z, PV.W, T4.W, literal.y, +; EG-NEXT: OR_INT T4.W, PV.Y, PV.Z, +; EG-NEXT: SUB_INT * T9.W, PV.X, T7.X, +; EG-NEXT: 14(1.961818e-44), 31(4.344025e-44) +; EG-NEXT: BFE_UINT T6.X, T4.Z, literal.x, 1, +; EG-NEXT: CNDE_INT T3.Y, T4.Y, T6.W, PS, +; EG-NEXT: SETGE_UINT T5.Z, PV.W, T2.X, +; EG-NEXT: SETE_INT T6.W, PV.Z, T2.Y, +; EG-NEXT: SETGE_UINT * T9.W, PV.Z, T2.Y, +; EG-NEXT: 15(2.101948e-44), 0(0.000000e+00) +; EG-NEXT: SUBB_UINT T7.X, T5.W, T0.Z, +; EG-NEXT: CNDE_INT T4.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T5.Z, T4.W, T2.X, BS:VEC_120/SCL_212 +; EG-NEXT: BIT_ALIGN_INT T5.W, PV.Y, T7.W, literal.x, BS:VEC_021/SCL_122 +; EG-NEXT: OR_INT * T6.W, T1.Y, PV.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SUBB_UINT T6.X, T4.W, T2.X, +; EG-NEXT: SUB_INT T1.Y, T6.Z, T2.Y, +; EG-NEXT: SETGE_UINT T7.Z, PS, T2.Z, +; EG-NEXT: SETE_INT T7.W, PV.W, T2.W, +; EG-NEXT: SETGE_UINT * T9.W, PV.W, T2.W, +; EG-NEXT: SUBB_UINT T10.X, T6.W, T2.Z, +; EG-NEXT: CNDE_INT T3.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T7.Z, T6.W, T2.Z, +; EG-NEXT: SUB_INT T7.W, PV.Y, PV.X, +; EG-NEXT: CNDE_INT * T4.W, T4.Y, T4.W, T5.Z, +; EG-NEXT: SUB_INT T6.X, T5.W, T2.W, BS:VEC_021/SCL_122 +; EG-NEXT: LSHL T1.Y, PS, 1, +; EG-NEXT: BFE_UINT T5.Z, T4.X, literal.x, 1, +; EG-NEXT: CNDE_INT T7.W, T4.Y, T6.Z, PV.W, +; EG-NEXT: CNDE_INT * T6.W, PV.Y, T6.W, PV.Z, +; EG-NEXT: 
15(2.101948e-44), 0(0.000000e+00) +; EG-NEXT: SUB_INT T12.X, T3.Z, T0.W, +; EG-NEXT: LSHL T4.Y, PS, 1, +; EG-NEXT: BIT_ALIGN_INT T6.Z, PV.W, T4.W, literal.x, BS:VEC_021/SCL_122 +; EG-NEXT: OR_INT T4.W, PV.Y, PV.Z, +; EG-NEXT: SUB_INT * T7.W, PV.X, T10.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: BFE_UINT T6.X, T4.Z, literal.x, 1, +; EG-NEXT: CNDE_INT T1.Y, T3.Y, T5.W, PS, +; EG-NEXT: SETGE_UINT T5.Z, PV.W, T2.X, +; EG-NEXT: SETE_INT T5.W, PV.Z, T2.Y, +; EG-NEXT: SETGE_UINT * T7.W, PV.Z, T2.Y, +; EG-NEXT: 14(1.961818e-44), 0(0.000000e+00) +; EG-NEXT: CNDE_INT T10.X, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T3.Y, T4.W, T2.X, BS:VEC_102/SCL_221 +; EG-NEXT: BIT_ALIGN_INT T5.Z, PV.Y, T6.W, literal.x, BS:VEC_021/SCL_122 +; EG-NEXT: OR_INT T5.W, T4.Y, PV.X, +; EG-NEXT: SUB_INT * T6.W, T12.X, T7.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: CNDE_INT T6.X, T11.X, T3.Z, PS, +; EG-NEXT: SETGE_UINT T1.Y, PV.W, T2.Z, BS:VEC_021/SCL_122 +; EG-NEXT: SETE_INT T3.Z, PV.Z, T2.W, BS:VEC_021/SCL_122 +; EG-NEXT: SETGE_UINT T6.W, PV.Z, T2.W, BS:VEC_021/SCL_122 +; EG-NEXT: CNDE_INT * T7.W, PV.X, T4.W, PV.Y, +; EG-NEXT: LSHL T7.X, PS, 1, +; EG-NEXT: CNDE_INT T1.Y, PV.Z, PV.W, PV.Y, +; EG-NEXT: SUB_INT T3.Z, T5.W, T2.Z, +; EG-NEXT: BIT_ALIGN_INT T6.W, PV.X, T8.W, literal.x, +; EG-NEXT: OR_INT * T8.W, T8.X, T9.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SUBB_UINT T6.X, T5.W, T2.Z, +; EG-NEXT: SUB_INT T3.Y, T5.Z, T2.W, BS:VEC_021/SCL_122 +; EG-NEXT: SETGE_UINT T7.Z, PS, T0.Z, BS:VEC_021/SCL_122 +; EG-NEXT: SETE_INT T9.W, PV.W, T0.W, +; EG-NEXT: SETGE_UINT * T10.W, PV.W, T0.W, +; EG-NEXT: SUBB_UINT T8.X, T8.W, T0.Z, +; EG-NEXT: CNDE_INT T4.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T7.Z, T8.W, T0.Z, +; EG-NEXT: SUB_INT T9.W, PV.Y, PV.X, +; EG-NEXT: CNDE_INT * T5.W, T1.Y, T5.W, T3.Z, +; EG-NEXT: SUB_INT T6.X, T6.W, T0.W, BS:VEC_021/SCL_122 +; EG-NEXT: LSHL T3.Y, PS, 1, +; EG-NEXT: BFE_UINT T3.Z, T4.Z, literal.x, 1, +; EG-NEXT: CNDE_INT T9.W, T1.Y, T5.Z, 
PV.W, +; EG-NEXT: CNDE_INT * T8.W, PV.Y, T8.W, PV.Z, +; EG-NEXT: 13(1.821688e-44), 0(0.000000e+00) +; EG-NEXT: BFE_UINT T9.X, T4.X, literal.x, 1, +; EG-NEXT: LSHL T1.Y, PS, 1, +; EG-NEXT: BIT_ALIGN_INT T5.Z, PV.W, T5.W, literal.y, +; EG-NEXT: OR_INT T5.W, PV.Y, PV.Z, +; EG-NEXT: SUB_INT * T9.W, PV.X, T8.X, +; EG-NEXT: 14(1.961818e-44), 31(4.344025e-44) +; EG-NEXT: BFE_UINT T6.X, T1.Z, literal.x, 1, +; EG-NEXT: CNDE_INT T3.Y, T4.Y, T6.W, PS, BS:VEC_021/SCL_122 +; EG-NEXT: SETGE_UINT T3.Z, PV.W, T2.Z, +; EG-NEXT: SETE_INT T6.W, PV.Z, T2.W, +; EG-NEXT: SETGE_UINT * T9.W, PV.Z, T2.W, +; EG-NEXT: 13(1.821688e-44), 0(0.000000e+00) +; EG-NEXT: SUBB_UINT T8.X, T4.W, T2.X, +; EG-NEXT: CNDE_INT * T4.Y, PV.W, PS, PV.Z, +; EG-NEXT: ALU clause starting at 1436: +; EG-NEXT: SUB_INT T3.Z, T5.W, T2.Z, +; EG-NEXT: BIT_ALIGN_INT T4.W, T3.Y, T8.W, literal.x, +; EG-NEXT: OR_INT * T6.W, T1.Y, T6.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SUBB_UINT T6.X, T5.W, T2.Z, +; EG-NEXT: SUB_INT T1.Y, T5.Z, T2.W, BS:VEC_021/SCL_122 +; EG-NEXT: SETGE_UINT T7.Z, PS, T0.Z, BS:VEC_021/SCL_122 +; EG-NEXT: SETE_INT T8.W, PV.W, T0.W, +; EG-NEXT: SETGE_UINT * T9.W, PV.W, T0.W, +; EG-NEXT: SUBB_UINT T11.X, T6.W, T0.Z, +; EG-NEXT: CNDE_INT T3.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T7.Z, T6.W, T0.Z, +; EG-NEXT: SUB_INT T8.W, PV.Y, PV.X, +; EG-NEXT: CNDE_INT * T5.W, T4.Y, T5.W, T3.Z, +; EG-NEXT: SUB_INT T6.X, T4.W, T0.W, BS:VEC_021/SCL_122 +; EG-NEXT: LSHL T1.Y, PS, 1, +; EG-NEXT: BFE_UINT T3.Z, T4.Z, literal.x, 1, +; EG-NEXT: CNDE_INT T8.W, T4.Y, T5.Z, PV.W, +; EG-NEXT: CNDE_INT * T6.W, PV.Y, T6.W, PV.Z, +; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00) +; EG-NEXT: SUB_INT T12.X, T6.Z, T2.Y, +; EG-NEXT: LSHL T4.Y, PS, 1, +; EG-NEXT: BIT_ALIGN_INT T5.Z, PV.W, T5.W, literal.x, +; EG-NEXT: OR_INT T5.W, PV.Y, PV.Z, +; EG-NEXT: SUB_INT * T8.W, PV.X, T11.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: BFE_UINT T6.X, T1.Z, literal.x, 1, +; EG-NEXT: CNDE_INT T1.Y, T3.Y, T4.W, PS, 
BS:VEC_021/SCL_122 +; EG-NEXT: SETGE_UINT T3.Z, PV.W, T2.Z, +; EG-NEXT: SETE_INT T4.W, PV.Z, T2.W, +; EG-NEXT: SETGE_UINT * T8.W, PV.Z, T2.W, +; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00) +; EG-NEXT: CNDE_INT T11.X, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T3.Y, T5.W, T2.Z, +; EG-NEXT: BIT_ALIGN_INT T3.Z, PV.Y, T6.W, literal.x, +; EG-NEXT: OR_INT T4.W, T4.Y, PV.X, +; EG-NEXT: SUB_INT * T6.W, T12.X, T8.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: CNDE_INT T6.X, T10.X, T6.Z, PS, +; EG-NEXT: SETGE_UINT T1.Y, PV.W, T0.Z, BS:VEC_021/SCL_122 +; EG-NEXT: SETE_INT T6.Z, PV.Z, T0.W, BS:VEC_021/SCL_122 +; EG-NEXT: SETGE_UINT T6.W, PV.Z, T0.W, BS:VEC_021/SCL_122 +; EG-NEXT: CNDE_INT * T8.W, PV.X, T5.W, PV.Y, +; EG-NEXT: LSHL T8.X, PS, 1, +; EG-NEXT: CNDE_INT T1.Y, PV.Z, PV.W, PV.Y, +; EG-NEXT: SUB_INT T6.Z, T4.W, T0.Z, +; EG-NEXT: BIT_ALIGN_INT T6.W, PV.X, T7.W, literal.x, +; EG-NEXT: OR_INT * T7.W, T7.X, T9.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SUBB_UINT T6.X, T4.W, T0.Z, +; EG-NEXT: SUB_INT T3.Y, T3.Z, T0.W, +; EG-NEXT: SETGE_UINT T7.Z, PS, T2.X, +; EG-NEXT: SETE_INT T9.W, PV.W, T2.Y, +; EG-NEXT: SETGE_UINT * T10.W, PV.W, T2.Y, +; EG-NEXT: SUBB_UINT T7.X, T7.W, T2.X, +; EG-NEXT: CNDE_INT T4.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T7.Z, T7.W, T2.X, +; EG-NEXT: SUB_INT T9.W, PV.Y, PV.X, +; EG-NEXT: CNDE_INT * T4.W, T1.Y, T4.W, T6.Z, +; EG-NEXT: SUB_INT T6.X, T6.W, T2.Y, +; EG-NEXT: LSHL T3.Y, PS, 1, +; EG-NEXT: BFE_UINT T6.Z, T1.Z, literal.x, 1, +; EG-NEXT: CNDE_INT T9.W, T1.Y, T3.Z, PV.W, +; EG-NEXT: CNDE_INT * T7.W, PV.Y, T7.W, PV.Z, +; EG-NEXT: 11(1.541428e-44), 0(0.000000e+00) +; EG-NEXT: BFE_UINT T9.X, T4.Z, literal.x, 1, +; EG-NEXT: LSHL T1.Y, PS, 1, +; EG-NEXT: BIT_ALIGN_INT T3.Z, PV.W, T4.W, literal.y, +; EG-NEXT: OR_INT T4.W, PV.Y, PV.Z, +; EG-NEXT: SUB_INT * T9.W, PV.X, T7.X, +; EG-NEXT: 11(1.541428e-44), 31(4.344025e-44) +; EG-NEXT: BFE_UINT T6.X, T4.X, literal.x, 1, +; EG-NEXT: CNDE_INT T3.Y, T4.Y, T6.W, PS, BS:VEC_021/SCL_122 +; 
EG-NEXT: SETGE_UINT T6.Z, PV.W, T0.Z, +; EG-NEXT: SETE_INT T6.W, PV.Z, T0.W, +; EG-NEXT: SETGE_UINT * T9.W, PV.Z, T0.W, +; EG-NEXT: 13(1.821688e-44), 0(0.000000e+00) +; EG-NEXT: SUBB_UINT T7.X, T5.W, T2.Z, +; EG-NEXT: CNDE_INT T4.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T6.Z, T4.W, T0.Z, BS:VEC_120/SCL_212 +; EG-NEXT: BIT_ALIGN_INT T5.W, PV.Y, T7.W, literal.x, BS:VEC_021/SCL_122 +; EG-NEXT: OR_INT * T6.W, T1.Y, PV.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SUBB_UINT T6.X, T4.W, T0.Z, +; EG-NEXT: SUB_INT T1.Y, T3.Z, T0.W, +; EG-NEXT: SETGE_UINT T7.Z, PS, T2.X, +; EG-NEXT: SETE_INT T7.W, PV.W, T2.Y, +; EG-NEXT: SETGE_UINT * T9.W, PV.W, T2.Y, +; EG-NEXT: SUBB_UINT T10.X, T6.W, T2.X, +; EG-NEXT: CNDE_INT T3.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T7.Z, T6.W, T2.X, +; EG-NEXT: SUB_INT T7.W, PV.Y, PV.X, +; EG-NEXT: CNDE_INT * T4.W, T4.Y, T4.W, T6.Z, +; EG-NEXT: SUB_INT T6.X, T5.W, T2.Y, +; EG-NEXT: LSHL T1.Y, PS, 1, +; EG-NEXT: BFE_UINT T6.Z, T1.Z, literal.x, 1, +; EG-NEXT: CNDE_INT T7.W, T4.Y, T3.Z, PV.W, +; EG-NEXT: CNDE_INT * T6.W, PV.Y, T6.W, PV.Z, +; EG-NEXT: 10(1.401298e-44), 0(0.000000e+00) +; EG-NEXT: SUB_INT T12.X, T5.Z, T2.W, +; EG-NEXT: LSHL T4.Y, PS, 1, +; EG-NEXT: BIT_ALIGN_INT T3.Z, PV.W, T4.W, literal.x, BS:VEC_021/SCL_122 +; EG-NEXT: OR_INT T4.W, PV.Y, PV.Z, +; EG-NEXT: SUB_INT * T7.W, PV.X, T10.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: BFE_UINT T6.X, T4.X, literal.x, 1, +; EG-NEXT: CNDE_INT T1.Y, T3.Y, T5.W, PS, BS:VEC_021/SCL_122 +; EG-NEXT: SETGE_UINT T6.Z, PV.W, T0.Z, +; EG-NEXT: SETE_INT T5.W, PV.Z, T0.W, +; EG-NEXT: SETGE_UINT * T7.W, PV.Z, T0.W, +; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00) +; EG-NEXT: CNDE_INT T10.X, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T3.Y, T4.W, T0.Z, +; EG-NEXT: BIT_ALIGN_INT * T6.Z, PV.Y, T6.W, literal.x, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: ALU clause starting at 1551: +; EG-NEXT: OR_INT T5.W, T4.Y, T6.X, BS:VEC_102/SCL_221 +; EG-NEXT: SUB_INT * T6.W, T12.X, T7.X, +; 
EG-NEXT: CNDE_INT T6.X, T11.X, T5.Z, PS, +; EG-NEXT: SETGE_UINT T1.Y, PV.W, T2.X, +; EG-NEXT: SETE_INT T5.Z, T6.Z, T2.Y, +; EG-NEXT: SETGE_UINT T6.W, T6.Z, T2.Y, +; EG-NEXT: CNDE_INT * T7.W, T10.X, T4.W, T3.Y, +; EG-NEXT: LSHL T7.X, PS, 1, +; EG-NEXT: CNDE_INT T1.Y, PV.Z, PV.W, PV.Y, +; EG-NEXT: SUB_INT T5.Z, T5.W, T2.X, BS:VEC_102/SCL_221 +; EG-NEXT: BIT_ALIGN_INT T6.W, PV.X, T8.W, literal.x, BS:VEC_021/SCL_122 +; EG-NEXT: OR_INT * T8.W, T8.X, T9.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SUBB_UINT T6.X, T5.W, T2.X, +; EG-NEXT: SUB_INT T3.Y, T6.Z, T2.Y, +; EG-NEXT: SETGE_UINT T7.Z, PS, T2.Z, +; EG-NEXT: SETE_INT T9.W, PV.W, T2.W, +; EG-NEXT: SETGE_UINT * T10.W, PV.W, T2.W, +; EG-NEXT: SUBB_UINT T8.X, T8.W, T2.Z, +; EG-NEXT: CNDE_INT T4.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T7.Z, T8.W, T2.Z, +; EG-NEXT: SUB_INT T9.W, PV.Y, PV.X, +; EG-NEXT: CNDE_INT * T5.W, T1.Y, T5.W, T5.Z, +; EG-NEXT: SUB_INT T6.X, T6.W, T2.W, BS:VEC_021/SCL_122 +; EG-NEXT: LSHL T3.Y, PS, 1, +; EG-NEXT: BFE_UINT T5.Z, T4.X, literal.x, 1, +; EG-NEXT: CNDE_INT T9.W, T1.Y, T6.Z, PV.W, +; EG-NEXT: CNDE_INT * T8.W, PV.Y, T8.W, PV.Z, +; EG-NEXT: 11(1.541428e-44), 0(0.000000e+00) +; EG-NEXT: BFE_UINT T9.X, T1.Z, literal.x, 1, +; EG-NEXT: LSHL T1.Y, PS, 1, +; EG-NEXT: BIT_ALIGN_INT T6.Z, PV.W, T5.W, literal.y, +; EG-NEXT: OR_INT T5.W, PV.Y, PV.Z, +; EG-NEXT: SUB_INT * T9.W, PV.X, T8.X, +; EG-NEXT: 9(1.261169e-44), 31(4.344025e-44) +; EG-NEXT: BFE_UINT T6.X, T4.Z, literal.x, 1, +; EG-NEXT: CNDE_INT T3.Y, T4.Y, T6.W, PS, +; EG-NEXT: SETGE_UINT T5.Z, PV.W, T2.X, +; EG-NEXT: SETE_INT T6.W, PV.Z, T2.Y, +; EG-NEXT: SETGE_UINT * T9.W, PV.Z, T2.Y, +; EG-NEXT: 10(1.401298e-44), 0(0.000000e+00) +; EG-NEXT: SUBB_UINT T8.X, T4.W, T0.Z, +; EG-NEXT: CNDE_INT T4.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T5.Z, T5.W, T2.X, BS:VEC_120/SCL_212 +; EG-NEXT: BIT_ALIGN_INT T4.W, PV.Y, T8.W, literal.x, BS:VEC_021/SCL_122 +; EG-NEXT: OR_INT * T6.W, T1.Y, PV.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; 
EG-NEXT: SUBB_UINT T6.X, T5.W, T2.X, +; EG-NEXT: SUB_INT T1.Y, T6.Z, T2.Y, +; EG-NEXT: SETGE_UINT T7.Z, PS, T2.Z, +; EG-NEXT: SETE_INT T8.W, PV.W, T2.W, +; EG-NEXT: SETGE_UINT * T9.W, PV.W, T2.W, +; EG-NEXT: SUBB_UINT T11.X, T6.W, T2.Z, +; EG-NEXT: CNDE_INT T3.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T7.Z, T6.W, T2.Z, +; EG-NEXT: SUB_INT T8.W, PV.Y, PV.X, +; EG-NEXT: CNDE_INT * T5.W, T4.Y, T5.W, T5.Z, +; EG-NEXT: SUB_INT T6.X, T4.W, T2.W, BS:VEC_021/SCL_122 +; EG-NEXT: LSHL T1.Y, PS, 1, +; EG-NEXT: BFE_UINT T5.Z, T4.X, literal.x, 1, +; EG-NEXT: CNDE_INT T8.W, T4.Y, T6.Z, PV.W, +; EG-NEXT: CNDE_INT * T6.W, PV.Y, T6.W, PV.Z, +; EG-NEXT: 10(1.401298e-44), 0(0.000000e+00) +; EG-NEXT: SUB_INT T12.X, T3.Z, T0.W, +; EG-NEXT: LSHL T4.Y, PS, 1, +; EG-NEXT: BIT_ALIGN_INT T6.Z, PV.W, T5.W, literal.x, BS:VEC_021/SCL_122 +; EG-NEXT: OR_INT T5.W, PV.Y, PV.Z, +; EG-NEXT: SUB_INT * T8.W, PV.X, T11.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: BFE_UINT T6.X, T4.Z, literal.x, 1, +; EG-NEXT: CNDE_INT T1.Y, T3.Y, T4.W, PS, +; EG-NEXT: SETGE_UINT T5.Z, PV.W, T2.X, +; EG-NEXT: SETE_INT T4.W, PV.Z, T2.Y, +; EG-NEXT: SETGE_UINT * T8.W, PV.Z, T2.Y, +; EG-NEXT: 9(1.261169e-44), 0(0.000000e+00) +; EG-NEXT: CNDE_INT T11.X, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T3.Y, T5.W, T2.X, BS:VEC_102/SCL_221 +; EG-NEXT: BIT_ALIGN_INT T5.Z, PV.Y, T6.W, literal.x, BS:VEC_021/SCL_122 +; EG-NEXT: OR_INT T4.W, T4.Y, PV.X, +; EG-NEXT: SUB_INT * T6.W, T12.X, T8.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: CNDE_INT T6.X, T10.X, T3.Z, PS, +; EG-NEXT: SETGE_UINT T1.Y, PV.W, T2.Z, BS:VEC_021/SCL_122 +; EG-NEXT: SETE_INT T3.Z, PV.Z, T2.W, BS:VEC_021/SCL_122 +; EG-NEXT: SETGE_UINT T6.W, PV.Z, T2.W, BS:VEC_021/SCL_122 +; EG-NEXT: CNDE_INT * T8.W, PV.X, T5.W, PV.Y, +; EG-NEXT: LSHL T8.X, PS, 1, +; EG-NEXT: CNDE_INT T1.Y, PV.Z, PV.W, PV.Y, +; EG-NEXT: SUB_INT T3.Z, T4.W, T2.Z, +; EG-NEXT: BIT_ALIGN_INT T6.W, PV.X, T7.W, literal.x, +; EG-NEXT: OR_INT * T7.W, T7.X, T9.X, +; EG-NEXT: 
31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SUBB_UINT T6.X, T4.W, T2.Z, +; EG-NEXT: SUB_INT T3.Y, T5.Z, T2.W, BS:VEC_021/SCL_122 +; EG-NEXT: SETGE_UINT T7.Z, PS, T0.Z, BS:VEC_021/SCL_122 +; EG-NEXT: SETE_INT T9.W, PV.W, T0.W, +; EG-NEXT: SETGE_UINT * T10.W, PV.W, T0.W, +; EG-NEXT: SUBB_UINT T7.X, T7.W, T0.Z, +; EG-NEXT: CNDE_INT T4.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T7.Z, T7.W, T0.Z, +; EG-NEXT: SUB_INT T9.W, PV.Y, PV.X, +; EG-NEXT: CNDE_INT * T4.W, T1.Y, T4.W, T3.Z, +; EG-NEXT: SUB_INT T6.X, T6.W, T0.W, BS:VEC_021/SCL_122 +; EG-NEXT: LSHL T3.Y, PS, 1, +; EG-NEXT: BFE_UINT T3.Z, T4.Z, literal.x, 1, +; EG-NEXT: CNDE_INT T9.W, T1.Y, T5.Z, PV.W, +; EG-NEXT: CNDE_INT * T7.W, PV.Y, T7.W, PV.Z, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: BFE_UINT T9.X, T4.X, literal.x, 1, +; EG-NEXT: LSHL T1.Y, PS, 1, +; EG-NEXT: BIT_ALIGN_INT T5.Z, PV.W, T4.W, literal.y, +; EG-NEXT: OR_INT T4.W, PV.Y, PV.Z, +; EG-NEXT: SUB_INT * T9.W, PV.X, T7.X, +; EG-NEXT: 9(1.261169e-44), 31(4.344025e-44) +; EG-NEXT: ALU clause starting at 1665: +; EG-NEXT: BFE_UINT T6.X, T1.Z, literal.x, 1, +; EG-NEXT: CNDE_INT T3.Y, T4.Y, T6.W, T9.W, +; EG-NEXT: SETGE_UINT * T3.Z, T4.W, T2.Z, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: SETE_INT T6.W, T5.Z, T2.W, +; EG-NEXT: SETGE_UINT * T9.W, T5.Z, T2.W, +; EG-NEXT: SUBB_UINT T7.X, T5.W, T2.X, BS:VEC_021/SCL_122 +; EG-NEXT: CNDE_INT T4.Y, PV.W, PS, T3.Z, +; EG-NEXT: SUB_INT T3.Z, T4.W, T2.Z, BS:VEC_102/SCL_221 +; EG-NEXT: BIT_ALIGN_INT T5.W, T3.Y, T7.W, literal.x, BS:VEC_021/SCL_122 +; EG-NEXT: OR_INT * T6.W, T1.Y, T6.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SUBB_UINT T6.X, T4.W, T2.Z, +; EG-NEXT: SUB_INT T1.Y, T5.Z, T2.W, BS:VEC_021/SCL_122 +; EG-NEXT: SETGE_UINT T7.Z, PS, T0.Z, BS:VEC_021/SCL_122 +; EG-NEXT: SETE_INT T7.W, PV.W, T0.W, +; EG-NEXT: SETGE_UINT * T9.W, PV.W, T0.W, +; EG-NEXT: SUBB_UINT T10.X, T6.W, T0.Z, +; EG-NEXT: CNDE_INT T3.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T7.Z, T6.W, T0.Z, +; EG-NEXT: 
SUB_INT T7.W, PV.Y, PV.X, +; EG-NEXT: CNDE_INT * T4.W, T4.Y, T4.W, T3.Z, +; EG-NEXT: SUB_INT T6.X, T5.W, T0.W, BS:VEC_021/SCL_122 +; EG-NEXT: LSHL T1.Y, PS, 1, +; EG-NEXT: BFE_UINT T3.Z, T4.Z, literal.x, 1, +; EG-NEXT: CNDE_INT T7.W, T4.Y, T5.Z, PV.W, +; EG-NEXT: CNDE_INT * T6.W, PV.Y, T6.W, PV.Z, +; EG-NEXT: 7(9.809089e-45), 0(0.000000e+00) +; EG-NEXT: SUB_INT T12.X, T6.Z, T2.Y, +; EG-NEXT: LSHL T4.Y, PS, 1, +; EG-NEXT: BIT_ALIGN_INT T5.Z, PV.W, T4.W, literal.x, +; EG-NEXT: OR_INT T4.W, PV.Y, PV.Z, +; EG-NEXT: SUB_INT * T7.W, PV.X, T10.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: BFE_UINT T6.X, T1.Z, literal.x, 1, +; EG-NEXT: CNDE_INT T1.Y, T3.Y, T5.W, PS, BS:VEC_021/SCL_122 +; EG-NEXT: SETGE_UINT T3.Z, PV.W, T2.Z, +; EG-NEXT: SETE_INT T5.W, PV.Z, T2.W, +; EG-NEXT: SETGE_UINT * T7.W, PV.Z, T2.W, +; EG-NEXT: 7(9.809089e-45), 0(0.000000e+00) +; EG-NEXT: CNDE_INT T10.X, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T3.Y, T4.W, T2.Z, +; EG-NEXT: BIT_ALIGN_INT T3.Z, PV.Y, T6.W, literal.x, +; EG-NEXT: OR_INT T5.W, T4.Y, PV.X, +; EG-NEXT: SUB_INT * T6.W, T12.X, T7.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: CNDE_INT T6.X, T11.X, T6.Z, PS, +; EG-NEXT: SETGE_UINT T1.Y, PV.W, T0.Z, BS:VEC_021/SCL_122 +; EG-NEXT: SETE_INT T6.Z, PV.Z, T0.W, BS:VEC_021/SCL_122 +; EG-NEXT: SETGE_UINT T6.W, PV.Z, T0.W, BS:VEC_021/SCL_122 +; EG-NEXT: CNDE_INT * T7.W, PV.X, T4.W, PV.Y, +; EG-NEXT: LSHL T7.X, PS, 1, +; EG-NEXT: CNDE_INT T1.Y, PV.Z, PV.W, PV.Y, +; EG-NEXT: SUB_INT T6.Z, T5.W, T0.Z, +; EG-NEXT: BIT_ALIGN_INT T6.W, PV.X, T8.W, literal.x, +; EG-NEXT: OR_INT * T8.W, T8.X, T9.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SUBB_UINT T6.X, T5.W, T0.Z, +; EG-NEXT: SUB_INT T3.Y, T3.Z, T0.W, +; EG-NEXT: SETGE_UINT T7.Z, PS, T2.X, +; EG-NEXT: SETE_INT T9.W, PV.W, T2.Y, +; EG-NEXT: SETGE_UINT * T10.W, PV.W, T2.Y, +; EG-NEXT: SUBB_UINT T8.X, T8.W, T2.X, +; EG-NEXT: CNDE_INT T4.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T7.Z, T8.W, T2.X, +; EG-NEXT: SUB_INT T9.W, 
PV.Y, PV.X, +; EG-NEXT: CNDE_INT * T5.W, T1.Y, T5.W, T6.Z, +; EG-NEXT: SUB_INT T6.X, T6.W, T2.Y, +; EG-NEXT: LSHL T3.Y, PS, 1, +; EG-NEXT: BFE_UINT T6.Z, T1.Z, literal.x, 1, +; EG-NEXT: CNDE_INT T9.W, T1.Y, T3.Z, PV.W, +; EG-NEXT: CNDE_INT * T8.W, PV.Y, T8.W, PV.Z, +; EG-NEXT: 6(8.407791e-45), 0(0.000000e+00) +; EG-NEXT: BFE_UINT T9.X, T4.Z, literal.x, 1, +; EG-NEXT: LSHL T1.Y, PS, 1, +; EG-NEXT: BIT_ALIGN_INT T3.Z, PV.W, T5.W, literal.y, +; EG-NEXT: OR_INT T5.W, PV.Y, PV.Z, +; EG-NEXT: SUB_INT * T9.W, PV.X, T8.X, +; EG-NEXT: 6(8.407791e-45), 31(4.344025e-44) +; EG-NEXT: BFE_UINT T6.X, T4.X, literal.x, 1, +; EG-NEXT: CNDE_INT T3.Y, T4.Y, T6.W, PS, BS:VEC_021/SCL_122 +; EG-NEXT: SETGE_UINT T6.Z, PV.W, T0.Z, +; EG-NEXT: SETE_INT T6.W, PV.Z, T0.W, +; EG-NEXT: SETGE_UINT * T9.W, PV.Z, T0.W, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: SUBB_UINT T8.X, T4.W, T2.Z, +; EG-NEXT: CNDE_INT T4.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T6.Z, T5.W, T0.Z, BS:VEC_120/SCL_212 +; EG-NEXT: BIT_ALIGN_INT T4.W, PV.Y, T8.W, literal.x, BS:VEC_021/SCL_122 +; EG-NEXT: OR_INT * T6.W, T1.Y, PV.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SUBB_UINT T6.X, T5.W, T0.Z, +; EG-NEXT: SUB_INT T1.Y, T3.Z, T0.W, +; EG-NEXT: SETGE_UINT T7.Z, PS, T2.X, +; EG-NEXT: SETE_INT T8.W, PV.W, T2.Y, +; EG-NEXT: SETGE_UINT * T9.W, PV.W, T2.Y, +; EG-NEXT: SUBB_UINT T11.X, T6.W, T2.X, +; EG-NEXT: CNDE_INT T3.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T7.Z, T6.W, T2.X, +; EG-NEXT: SUB_INT T8.W, PV.Y, PV.X, +; EG-NEXT: CNDE_INT * T5.W, T4.Y, T5.W, T6.Z, +; EG-NEXT: SUB_INT T6.X, T4.W, T2.Y, +; EG-NEXT: LSHL T1.Y, PS, 1, +; EG-NEXT: BFE_UINT T6.Z, T1.Z, literal.x, 1, +; EG-NEXT: CNDE_INT T8.W, T4.Y, T3.Z, PV.W, +; EG-NEXT: CNDE_INT * T6.W, PV.Y, T6.W, PV.Z, +; EG-NEXT: 5(7.006492e-45), 0(0.000000e+00) +; EG-NEXT: SUB_INT T12.X, T5.Z, T2.W, +; EG-NEXT: LSHL T4.Y, PS, 1, +; EG-NEXT: BIT_ALIGN_INT T3.Z, PV.W, T5.W, literal.x, BS:VEC_021/SCL_122 +; EG-NEXT: OR_INT T5.W, PV.Y, PV.Z, +; EG-NEXT: SUB_INT 
* T8.W, PV.X, T11.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: BFE_UINT * T6.X, T4.X, literal.x, 1, +; EG-NEXT: 7(9.809089e-45), 0(0.000000e+00) +; EG-NEXT: ALU clause starting at 1780: +; EG-NEXT: CNDE_INT T1.Y, T3.Y, T4.W, T8.W, +; EG-NEXT: SETGE_UINT * T6.Z, T5.W, T0.Z, +; EG-NEXT: SETE_INT T4.W, T3.Z, T0.W, +; EG-NEXT: SETGE_UINT * T8.W, T3.Z, T0.W, +; EG-NEXT: CNDE_INT T11.X, PV.W, PS, T6.Z, +; EG-NEXT: SUB_INT T3.Y, T5.W, T0.Z, +; EG-NEXT: BIT_ALIGN_INT T6.Z, T1.Y, T6.W, literal.x, +; EG-NEXT: OR_INT T4.W, T4.Y, T6.X, BS:VEC_102/SCL_221 +; EG-NEXT: SUB_INT * T6.W, T12.X, T8.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: CNDE_INT T6.X, T10.X, T5.Z, PS, +; EG-NEXT: SETGE_UINT T1.Y, PV.W, T2.X, +; EG-NEXT: SETE_INT T5.Z, PV.Z, T2.Y, +; EG-NEXT: SETGE_UINT T6.W, PV.Z, T2.Y, +; EG-NEXT: CNDE_INT * T8.W, PV.X, T5.W, PV.Y, +; EG-NEXT: LSHL T8.X, PS, 1, +; EG-NEXT: CNDE_INT T1.Y, PV.Z, PV.W, PV.Y, +; EG-NEXT: SUB_INT T5.Z, T4.W, T2.X, BS:VEC_102/SCL_221 +; EG-NEXT: BIT_ALIGN_INT T6.W, PV.X, T7.W, literal.x, BS:VEC_021/SCL_122 +; EG-NEXT: OR_INT * T7.W, T7.X, T9.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SUBB_UINT T6.X, T4.W, T2.X, +; EG-NEXT: SUB_INT T3.Y, T6.Z, T2.Y, +; EG-NEXT: SETGE_UINT T7.Z, PS, T2.Z, +; EG-NEXT: SETE_INT T9.W, PV.W, T2.W, +; EG-NEXT: SETGE_UINT * T10.W, PV.W, T2.W, +; EG-NEXT: SUBB_UINT T7.X, T7.W, T2.Z, +; EG-NEXT: CNDE_INT T4.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T7.Z, T7.W, T2.Z, +; EG-NEXT: SUB_INT T9.W, PV.Y, PV.X, +; EG-NEXT: CNDE_INT * T4.W, T1.Y, T4.W, T5.Z, +; EG-NEXT: SUB_INT T6.X, T6.W, T2.W, BS:VEC_021/SCL_122 +; EG-NEXT: LSHL T3.Y, PS, 1, +; EG-NEXT: BFE_UINT T5.Z, T4.X, literal.x, 1, +; EG-NEXT: CNDE_INT T9.W, T1.Y, T6.Z, PV.W, +; EG-NEXT: CNDE_INT * T7.W, PV.Y, T7.W, PV.Z, +; EG-NEXT: 6(8.407791e-45), 0(0.000000e+00) +; EG-NEXT: BFE_UINT T9.X, T1.Z, literal.x, 1, +; EG-NEXT: LSHL T1.Y, PS, 1, +; EG-NEXT: BIT_ALIGN_INT T6.Z, PV.W, T4.W, literal.y, +; EG-NEXT: OR_INT T4.W, PV.Y, 
PV.Z, +; EG-NEXT: SUB_INT * T9.W, PV.X, T7.X, +; EG-NEXT: 4(5.605194e-45), 31(4.344025e-44) +; EG-NEXT: BFE_UINT T6.X, T4.Z, literal.x, 1, +; EG-NEXT: CNDE_INT T3.Y, T4.Y, T6.W, PS, +; EG-NEXT: SETGE_UINT T5.Z, PV.W, T2.X, +; EG-NEXT: SETE_INT T6.W, PV.Z, T2.Y, +; EG-NEXT: SETGE_UINT * T9.W, PV.Z, T2.Y, +; EG-NEXT: 5(7.006492e-45), 0(0.000000e+00) +; EG-NEXT: SUBB_UINT T7.X, T5.W, T0.Z, +; EG-NEXT: CNDE_INT T4.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T5.Z, T4.W, T2.X, BS:VEC_120/SCL_212 +; EG-NEXT: BIT_ALIGN_INT T5.W, PV.Y, T7.W, literal.x, BS:VEC_021/SCL_122 +; EG-NEXT: OR_INT * T6.W, T1.Y, PV.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SUBB_UINT T6.X, T4.W, T2.X, +; EG-NEXT: SUB_INT T1.Y, T6.Z, T2.Y, +; EG-NEXT: SETGE_UINT T7.Z, PS, T2.Z, +; EG-NEXT: SETE_INT T7.W, PV.W, T2.W, +; EG-NEXT: SETGE_UINT * T9.W, PV.W, T2.W, +; EG-NEXT: SUBB_UINT T10.X, T6.W, T2.Z, +; EG-NEXT: CNDE_INT T3.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T7.Z, T6.W, T2.Z, +; EG-NEXT: SUB_INT T7.W, PV.Y, PV.X, +; EG-NEXT: CNDE_INT * T4.W, T4.Y, T4.W, T5.Z, +; EG-NEXT: SUB_INT T6.X, T5.W, T2.W, BS:VEC_021/SCL_122 +; EG-NEXT: LSHL T1.Y, PS, 1, +; EG-NEXT: BFE_UINT T5.Z, T4.X, literal.x, 1, +; EG-NEXT: CNDE_INT T7.W, T4.Y, T6.Z, PV.W, +; EG-NEXT: CNDE_INT * T6.W, PV.Y, T6.W, PV.Z, +; EG-NEXT: 5(7.006492e-45), 0(0.000000e+00) +; EG-NEXT: SUB_INT T12.X, T3.Z, T0.W, +; EG-NEXT: LSHL T4.Y, PS, 1, +; EG-NEXT: BIT_ALIGN_INT T6.Z, PV.W, T4.W, literal.x, BS:VEC_021/SCL_122 +; EG-NEXT: OR_INT T4.W, PV.Y, PV.Z, +; EG-NEXT: SUB_INT * T7.W, PV.X, T10.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: BFE_UINT T6.X, T4.Z, literal.x, 1, +; EG-NEXT: CNDE_INT T1.Y, T3.Y, T5.W, PS, +; EG-NEXT: SETGE_UINT T5.Z, PV.W, T2.X, +; EG-NEXT: SETE_INT T5.W, PV.Z, T2.Y, +; EG-NEXT: SETGE_UINT * T7.W, PV.Z, T2.Y, +; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) +; EG-NEXT: CNDE_INT T10.X, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T3.Y, T4.W, T2.X, BS:VEC_102/SCL_221 +; EG-NEXT: BIT_ALIGN_INT T5.Z, PV.Y, T6.W, 
literal.x, BS:VEC_021/SCL_122 +; EG-NEXT: OR_INT T5.W, T4.Y, PV.X, +; EG-NEXT: SUB_INT * T6.W, T12.X, T7.X, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: CNDE_INT T6.X, T11.X, T3.Z, PS, +; EG-NEXT: SETGE_UINT T1.Y, PV.W, T2.Z, BS:VEC_021/SCL_122 +; EG-NEXT: SETE_INT T3.Z, PV.Z, T2.W, BS:VEC_021/SCL_122 +; EG-NEXT: SETGE_UINT T6.W, PV.Z, T2.W, BS:VEC_021/SCL_122 +; EG-NEXT: CNDE_INT * T7.W, PV.X, T4.W, PV.Y, +; EG-NEXT: LSHL T7.X, PS, 1, +; EG-NEXT: BFE_UINT T3.Y, T4.X, literal.x, 1, +; EG-NEXT: CNDE_INT T3.Z, PV.Z, PV.W, PV.Y, +; EG-NEXT: BIT_ALIGN_INT T6.W, PV.X, T8.W, literal.y, +; EG-NEXT: OR_INT * T8.W, T8.X, T9.X, +; EG-NEXT: 4(5.605194e-45), 31(4.344025e-44) +; EG-NEXT: SETGE_UINT T6.X, PS, T0.Z, +; EG-NEXT: SETE_INT T1.Y, PV.W, T0.W, +; EG-NEXT: SETGE_UINT T7.Z, PV.W, T0.W, +; EG-NEXT: SUBB_UINT T9.W, PS, T0.Z, +; EG-NEXT: SUB_INT * T10.W, PV.W, T0.W, +; EG-NEXT: SUB_INT T8.X, T5.W, T2.Z, +; EG-NEXT: SUBB_UINT T4.Y, T5.W, T2.Z, +; EG-NEXT: SUB_INT T8.Z, T5.Z, T2.W, +; EG-NEXT: SUB_INT T9.W, PS, PV.W, +; EG-NEXT: CNDE_INT * T10.W, PV.Y, PV.Z, PV.X, +; EG-NEXT: CNDE_INT T6.X, PS, T6.W, PV.W, +; EG-NEXT: SUBB_UINT T1.Y, T4.W, T2.X, +; EG-NEXT: SUB_INT T7.Z, PV.Z, PV.Y, +; EG-NEXT: CNDE_INT * T4.W, T3.Z, T5.W, PV.X, BS:VEC_021/SCL_122 +; EG-NEXT: ALU clause starting at 1894: +; EG-NEXT: SUB_INT * T5.W, T8.W, T0.Z, +; EG-NEXT: SUB_INT T8.X, T6.Z, T2.Y, +; EG-NEXT: CNDE_INT T4.Y, T10.W, T8.W, PV.W, +; EG-NEXT: LSHL T8.Z, T4.W, 1, BS:VEC_201 +; EG-NEXT: BFE_UINT * T5.W, T4.Z, literal.x, 1, BS:VEC_120/SCL_212 +; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) +; EG-NEXT: CNDE_INT * T6.W, T3.Z, T5.Z, T7.Z, +; EG-NEXT: BIT_ALIGN_INT T9.X, PV.W, T4.W, literal.x, +; EG-NEXT: OR_INT T5.Y, T8.Z, T5.W, BS:VEC_021/SCL_122 +; EG-NEXT: LSHL T3.Z, T4.Y, 1, +; EG-NEXT: BFE_UINT T4.W, T1.Z, literal.y, 1, BS:VEC_120/SCL_212 +; EG-NEXT: SUB_INT * T5.W, T8.X, T1.Y, +; EG-NEXT: 31(4.344025e-44), 3(4.203895e-45) +; EG-NEXT: CNDE_INT T8.X, T10.X, T6.Z, PS, +; EG-NEXT: OR_INT 
T1.Y, PV.Z, PV.W, +; EG-NEXT: SETGE_UINT T3.Z, PV.Y, T2.Z, BS:VEC_021/SCL_122 +; EG-NEXT: SETE_INT T4.W, PV.X, T2.W, +; EG-NEXT: SETGE_UINT * T5.W, PV.X, T2.W, +; EG-NEXT: CNDE_INT T10.X, PV.W, PS, PV.Z, +; EG-NEXT: SETGE_UINT T6.Y, PV.Y, T0.Z, +; EG-NEXT: BIT_ALIGN_INT T3.Z, PV.X, T7.W, literal.x, +; EG-NEXT: BIT_ALIGN_INT T4.W, T6.X, T4.Y, literal.x, BS:VEC_021/SCL_122 +; EG-NEXT: OR_INT * T5.W, T7.X, T3.Y, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SETE_INT T6.X, PV.W, T0.W, +; EG-NEXT: SETGE_UINT T3.Y, PV.W, T0.W, +; EG-NEXT: SETGE_UINT T5.Z, PS, T2.X, +; EG-NEXT: SETE_INT T6.W, PV.Z, T2.Y, +; EG-NEXT: SETGE_UINT * T7.W, PV.Z, T2.Y, +; EG-NEXT: CNDE_INT T7.X, PV.W, PS, PV.Z, +; EG-NEXT: CNDE_INT T3.Y, PV.X, PV.Y, T6.Y, +; EG-NEXT: SUB_INT T5.Z, T1.Y, T0.Z, +; EG-NEXT: SUBB_UINT T6.W, T1.Y, T0.Z, +; EG-NEXT: SUB_INT * T7.W, T4.W, T0.W, +; EG-NEXT: SUB_INT T6.X, T5.W, T2.X, +; EG-NEXT: SUBB_UINT T4.Y, T5.W, T2.X, +; EG-NEXT: SUB_INT T6.Z, T3.Z, T2.Y, BS:VEC_021/SCL_122 +; EG-NEXT: SUB_INT T6.W, PS, PV.W, +; EG-NEXT: CNDE_INT * T7.W, PV.Y, T1.Y, PV.Z, +; EG-NEXT: LSHL T8.X, PS, 1, +; EG-NEXT: BFE_UINT T1.Y, T1.Z, literal.x, 1, +; EG-NEXT: CNDE_INT T5.Z, T3.Y, T4.W, PV.W, BS:VEC_021/SCL_122 +; EG-NEXT: SUB_INT T4.W, PV.Z, PV.Y, +; EG-NEXT: CNDE_INT * T5.W, T7.X, T5.W, PV.X, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; EG-NEXT: LSHL T6.X, PS, 1, +; EG-NEXT: BFE_UINT T3.Y, T4.X, literal.x, 1, +; EG-NEXT: CNDE_INT T3.Z, T7.X, T3.Z, PV.W, BS:VEC_120/SCL_212 +; EG-NEXT: BIT_ALIGN_INT T4.W, PV.Z, T7.W, literal.y, +; EG-NEXT: OR_INT * T6.W, PV.X, PV.Y, +; EG-NEXT: 3(4.203895e-45), 31(4.344025e-44) +; EG-NEXT: SUB_INT T7.X, T5.Y, T2.Z, +; EG-NEXT: SETGE_UINT T1.Y, PS, T0.Z, BS:VEC_021/SCL_122 +; EG-NEXT: SETE_INT T5.Z, PV.W, T0.W, +; EG-NEXT: BIT_ALIGN_INT T5.W, PV.Z, T5.W, literal.x, BS:VEC_021/SCL_122 +; EG-NEXT: OR_INT * T7.W, PV.X, PV.Y, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SETGE_UINT T6.X, T4.W, T0.W, +; EG-NEXT: SUBB_UINT T3.Y, 
T6.W, T0.Z, BS:VEC_201 +; EG-NEXT: SETGE_UINT T3.Z, PS, T2.X, +; EG-NEXT: SETE_INT T8.W, PV.W, T2.Y, +; EG-NEXT: SETGE_UINT * T9.W, PV.W, T2.Y, +; EG-NEXT: SUB_INT T8.X, T4.W, T0.W, +; EG-NEXT: CNDE_INT T4.Y, PV.W, PS, PV.Z, +; EG-NEXT: SUB_INT T3.Z, T7.W, T2.X, BS:VEC_201 +; EG-NEXT: SUBB_UINT * T8.W, T7.W, T2.X, BS:VEC_201 +; EG-NEXT: SUB_INT * T9.W, T5.W, T2.Y, +; EG-NEXT: SUB_INT T11.X, PV.W, T8.W, +; EG-NEXT: CNDE_INT T6.Y, T4.Y, T7.W, T3.Z, BS:VEC_120/SCL_212 +; EG-NEXT: SUB_INT T3.Z, T6.W, T0.Z, +; EG-NEXT: SUB_INT T7.W, T8.X, T3.Y, BS:VEC_021/SCL_122 +; EG-NEXT: CNDE_INT * T8.W, T5.Z, T6.X, T1.Y, +; EG-NEXT: CNDE_INT T6.X, PS, T4.W, PV.W, BS:VEC_021/SCL_122 +; EG-NEXT: CNDE_INT T1.Y, PS, T6.W, PV.Z, BS:VEC_102/SCL_221 +; EG-NEXT: LSHL T3.Z, PV.Y, 1, +; EG-NEXT: BFE_UINT T4.W, T4.X, literal.x, 1, +; EG-NEXT: CNDE_INT * T5.W, T4.Y, T5.W, PV.X, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; EG-NEXT: SUBB_UINT T8.X, T5.Y, T2.Z, +; EG-NEXT: BIT_ALIGN_INT T3.Y, PS, T6.Y, literal.x, +; EG-NEXT: OR_INT T3.Z, PV.Z, PV.W, +; EG-NEXT: BFE_UINT T4.W, T1.Z, 1, 1, +; EG-NEXT: LSHL * T5.W, PV.Y, 1, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SUB_INT T11.X, T9.X, T2.W, +; EG-NEXT: OR_INT T4.Y, PS, PV.W, +; EG-NEXT: SETGE_UINT T5.Z, PV.Z, T2.X, +; EG-NEXT: SETE_INT T4.W, PV.Y, T2.Y, +; EG-NEXT: SETGE_UINT * T5.W, PV.Y, T2.Y, +; EG-NEXT: CNDE_INT T12.X, PV.W, PS, PV.Z, +; EG-NEXT: SETGE_UINT T6.Y, PV.Y, T0.Z, +; EG-NEXT: SUB_INT T5.Z, PV.X, T8.X, +; EG-NEXT: BIT_ALIGN_INT * T4.W, T6.X, T1.Y, literal.x, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: CNDE_INT * T5.W, T10.X, T5.Y, T7.X, +; EG-NEXT: SETE_INT T6.X, T4.W, T0.W, +; EG-NEXT: SETGE_UINT T1.Y, T4.W, T0.W, +; EG-NEXT: LSHL T6.Z, PV.W, 1, +; EG-NEXT: BFE_UINT T6.W, T4.Z, literal.x, 1, BS:VEC_120/SCL_212 +; EG-NEXT: CNDE_INT * T7.W, T10.X, T9.X, T5.Z, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; EG-NEXT: SUB_INT T7.X, T3.Z, T2.X, +; EG-NEXT: BIT_ALIGN_INT T5.Y, PS, T5.W, literal.x, +; 
EG-NEXT: OR_INT T5.Z, PV.Z, PV.W, +; EG-NEXT: CNDE_INT T5.W, PV.X, PV.Y, T6.Y, BS:VEC_021/SCL_122 +; EG-NEXT: SUB_INT * T6.W, T4.Y, T0.Z, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SUBB_UINT T6.X, T3.Z, T2.X, +; EG-NEXT: SUB_INT T1.Y, T3.Y, T2.Y, +; EG-NEXT: CNDE_INT T6.Z, PV.W, T4.Y, PS, BS:VEC_021/SCL_122 +; EG-NEXT: SUBB_UINT T6.W, PV.Z, T2.Z, +; EG-NEXT: SUB_INT * T7.W, PV.Y, T2.W, +; EG-NEXT: SUB_INT * T8.X, PS, PV.W, +; EG-NEXT: ALU clause starting at 2007: +; EG-NEXT: LSHL T6.Y, T6.Z, 1, +; EG-NEXT: SETGE_UINT T7.Z, T5.Z, T2.Z, BS:VEC_120/SCL_212 +; EG-NEXT: SUB_INT * T6.W, T1.Y, T6.X, +; EG-NEXT: CNDE_INT * T7.W, T12.X, T3.Z, T7.X, +; EG-NEXT: SETE_INT T6.X, T5.Y, T2.W, +; EG-NEXT: SETGE_UINT T1.Y, T5.Y, T2.W, +; EG-NEXT: LSHL T3.Z, PV.W, 1, +; EG-NEXT: BFE_UINT T8.W, T4.X, 1, 1, +; EG-NEXT: CNDE_INT * T6.W, T12.X, T3.Y, T6.W, +; EG-NEXT: AND_INT T7.X, T1.Z, 1, +; EG-NEXT: BIT_ALIGN_INT T3.Y, PS, T7.W, literal.x, +; EG-NEXT: OR_INT T1.Z, PV.Z, PV.W, +; EG-NEXT: CNDE_INT * T6.W, PV.X, PV.Y, T7.Z, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SUB_INT * T7.W, T5.Z, T2.Z, +; EG-NEXT: CNDE_INT T6.X, T6.W, T5.Z, PV.W, +; EG-NEXT: SETGE_UINT T1.Y, T1.Z, T2.X, BS:VEC_021/SCL_122 +; EG-NEXT: SETE_INT T3.Z, T3.Y, T2.Y, +; EG-NEXT: SETGE_UINT T7.W, T3.Y, T2.Y, +; EG-NEXT: OR_INT * T8.W, T6.Y, T7.X, +; EG-NEXT: SUB_INT T7.X, PS, T0.Z, +; EG-NEXT: CNDE_INT T1.Y, PV.Z, PV.W, PV.Y, +; EG-NEXT: LSHL T3.Z, PV.X, 1, +; EG-NEXT: BFE_UINT T7.W, T4.Z, 1, 1, +; EG-NEXT: CNDE_INT * T6.W, T6.W, T5.Y, T8.X, +; EG-NEXT: SUB_INT T8.X, T1.Z, T2.X, +; EG-NEXT: SUBB_UINT T5.Y, T1.Z, T2.X, +; EG-NEXT: SUB_INT T5.Z, T3.Y, T2.Y, +; EG-NEXT: BIT_ALIGN_INT T6.W, PS, T6.X, literal.x, BS:VEC_021/SCL_122 +; EG-NEXT: OR_INT * T7.W, PV.Z, PV.W, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SETGE_UINT T6.X, PS, T2.Z, BS:VEC_021/SCL_122 +; EG-NEXT: SETE_INT T6.Y, PV.W, T2.W, +; EG-NEXT: SETGE_UINT T3.Z, PV.W, T2.W, +; EG-NEXT: SUB_INT T9.W, PV.Z, PV.Y, +; 
EG-NEXT: CNDE_INT * T10.W, T1.Y, T1.Z, PV.X, +; EG-NEXT: LSHL T8.X, PS, 1, +; EG-NEXT: AND_INT T5.Y, T4.X, 1, +; EG-NEXT: CNDE_INT T1.Z, T1.Y, T3.Y, PV.W, +; EG-NEXT: CNDE_INT T9.W, PV.Y, PV.Z, PV.X, +; EG-NEXT: SUB_INT * T11.W, T7.W, T2.Z, +; EG-NEXT: SUBB_UINT T4.X, T7.W, T2.Z, +; EG-NEXT: SUB_INT T1.Y, T6.W, T2.W, BS:VEC_120/SCL_212 +; EG-NEXT: CNDE_INT * T3.Z, PV.W, T7.W, PS, BS:VEC_102/SCL_221 +; EG-NEXT: BIT_ALIGN_INT T7.W, T1.Z, T10.W, literal.x, +; EG-NEXT: OR_INT * T10.W, T8.X, T5.Y, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SETGE_UINT T6.X, PS, T2.X, BS:VEC_021/SCL_122 +; EG-NEXT: SETE_INT T3.Y, PV.W, T2.Y, +; EG-NEXT: LSHL T1.Z, T3.Z, 1, +; EG-NEXT: AND_INT T11.W, T4.Z, 1, BS:VEC_120/SCL_212 +; EG-NEXT: SUB_INT * T12.W, T1.Y, T4.X, +; EG-NEXT: SETGE_UINT T4.X, T7.W, T2.Y, +; EG-NEXT: SUBB_UINT T1.Y, T4.Y, T0.Z, +; EG-NEXT: SUB_INT * T4.Z, T4.W, T0.W, BS:VEC_120/SCL_212 +; EG-NEXT: CNDE_INT * T6.W, T9.W, T6.W, T12.W, +; EG-NEXT: OR_INT * T9.W, T1.Z, T11.W, +; EG-NEXT: SUBB_UINT T8.X, T10.W, T2.X, +; EG-NEXT: SUB_INT T2.Y, T7.W, T2.Y, BS:VEC_120/SCL_212 +; EG-NEXT: SETGE_UINT T1.Z, PV.W, T2.Z, +; EG-NEXT: BIT_ALIGN_INT T6.W, T6.W, T3.Z, literal.x, BS:VEC_201 +; EG-NEXT: SUB_INT * T11.W, T4.Z, T1.Y, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SETE_INT T9.X, PV.W, T2.W, +; EG-NEXT: SETGE_UINT T1.Y, PV.W, T2.W, +; EG-NEXT: CNDE_INT * T3.Z, T5.W, T4.W, PS, BS:VEC_021/SCL_122 +; EG-NEXT: SUBB_UINT T4.W, T9.W, T2.Z, +; EG-NEXT: SUB_INT * T2.W, T6.W, T2.W, +; EG-NEXT: SUB_INT T10.X, T9.W, T2.Z, +; EG-NEXT: SETGE_UINT T4.Y, T8.W, T0.Z, BS:VEC_120/SCL_212 +; EG-NEXT: SUB_INT * T2.Z, PS, PV.W, +; EG-NEXT: BIT_ALIGN_INT T2.W, T3.Z, T6.Z, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: CNDE_INT * T4.W, T9.X, T1.Y, T1.Z, +; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) +; EG-NEXT: SETE_INT T9.X, PV.W, T0.W, BS:VEC_021/SCL_122 +; EG-NEXT: SETGE_UINT T1.Y, PV.W, T0.W, BS:VEC_021/SCL_122 +; EG-NEXT: SUBB_UINT T0.Z, T8.W, T0.Z, +; EG-NEXT: SUB_INT 
T0.W, PV.W, T0.W, BS:VEC_021/SCL_122 +; EG-NEXT: CNDE_INT * T6.W, PS, T6.W, T2.Z, +; EG-NEXT: SUB_INT T11.X, PV.W, PV.Z, +; EG-NEXT: CNDE_INT T1.Y, PV.X, PV.Y, T4.Y, +; EG-NEXT: CNDE_INT T6.Z, T4.W, T9.W, T10.X, +; EG-NEXT: SUB_INT * T0.W, T2.Y, T8.X, +; EG-NEXT: CNDE_INT * T4.W, T3.Y, T4.X, T6.X, +; EG-NEXT: SUBB_UINT T4.X, T3.W, T0.X, +; EG-NEXT: CNDE_INT * T6.Y, PV.W, T7.W, T0.W, +; EG-NEXT: SUB_INT T0.Z, T1.W, T0.Y, +; EG-NEXT: SUB_INT T0.W, T10.W, T2.X, BS:VEC_210 +; EG-NEXT: CNDE_INT * T2.W, T1.Y, T2.W, T11.X, +; EG-NEXT: CNDE_INT T6.X, T4.W, T10.W, PV.W, +; EG-NEXT: SUB_INT T0.Y, PV.Z, T4.X, +; EG-NEXT: CNDE_INT * T2.Z, T1.Y, T8.W, T7.X, BS:VEC_120/SCL_212 +; EG-NEXT: CNDE_INT T0.W, T3.X, T5.X, T1.X, +; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: LSHR T1.X, PS, literal.x, +; EG-NEXT: CNDE_INT T2.Y, PV.W, T1.W, T0.Y, +; EG-NEXT: SUB_INT * T1.W, T3.W, T0.X, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; EG-NEXT: CNDE_INT T2.X, T0.W, T3.W, PV.W, +; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %b_ptr = getelementptr <4 x i64>, ptr addrspace(1) %in, i64 1 %a = load <4 x i64>, ptr addrspace(1) %in %b = load <4 x i64>, ptr addrspace(1) %b_ptr Index: llvm/test/CodeGen/AMDGPU/urem64.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/urem64.ll +++ llvm/test/CodeGen/AMDGPU/urem64.ll @@ -28,8 +28,8 @@ ; GCN-NEXT: v_mul_hi_u32 v3, s0, v0 ; GCN-NEXT: v_mul_lo_u32 v5, s1, v0 ; GCN-NEXT: v_mul_lo_u32 v4, s0, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; GCN-NEXT: v_mul_hi_u32 v3, v0, v4 ; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 ; GCN-NEXT: v_mul_hi_u32 v7, v0, v2 @@ -49,7 +49,7 @@ ; GCN-NEXT: v_mul_lo_u32 v2, s0, v1 ; GCN-NEXT: v_mul_hi_u32 v3, s0, v0 ; GCN-NEXT: v_mul_lo_u32 v4, s1, v0 -; GCN-NEXT: 
v_add_i32_e32 v2, vcc, v2, v3 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GCN-NEXT: v_mul_lo_u32 v3, s0, v0 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GCN-NEXT: v_mul_lo_u32 v6, v0, v2 @@ -729,8 +729,8 @@ ; GCN-NEXT: v_mul_hi_u32 v3, s0, v0 ; GCN-NEXT: v_mul_lo_u32 v5, s1, v0 ; GCN-NEXT: v_mul_lo_u32 v4, s0, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; GCN-NEXT: v_mul_hi_u32 v3, v0, v4 ; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 ; GCN-NEXT: v_mul_hi_u32 v7, v0, v2 @@ -750,7 +750,7 @@ ; GCN-NEXT: v_mul_lo_u32 v2, s0, v1 ; GCN-NEXT: v_mul_hi_u32 v3, s0, v0 ; GCN-NEXT: v_mul_lo_u32 v4, s1, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GCN-NEXT: v_mul_lo_u32 v3, s0, v0 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GCN-NEXT: v_mul_lo_u32 v6, v0, v2 @@ -912,8 +912,8 @@ ; GCN-NEXT: v_mul_hi_u32 v2, v0, s2 ; GCN-NEXT: v_mul_lo_u32 v4, v1, s2 ; GCN-NEXT: v_mul_lo_u32 v3, v0, s2 -; GCN-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GCN-NEXT: v_sub_i32_e32 v2, vcc, v2, v0 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GCN-NEXT: v_mul_hi_u32 v5, v0, v3 ; GCN-NEXT: v_mul_lo_u32 v4, v0, v2 ; GCN-NEXT: v_mul_hi_u32 v6, v0, v2 @@ -935,7 +935,7 @@ ; GCN-NEXT: v_mul_lo_u32 v4, v0, s2 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_sub_i32_e32 v2, vcc, v2, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GCN-NEXT: v_mul_lo_u32 v3, v0, v2 ; GCN-NEXT: v_mul_hi_u32 v5, v0, v4 ; GCN-NEXT: v_mul_hi_u32 v6, v0, v2 Index: llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll +++ llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll @@ -235,10 +235,10 @@ ; SI-NEXT: bb.1.Flow: ; SI-NEXT: successors: %bb.2(0x40000000), %bb.10(0x40000000) ; SI-NEXT: {{ $}} 
- ; SI-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI undef %49:vgpr_32, %bb.0, %4, %bb.9 - ; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY4]], %bb.0, undef %51:vgpr_32, %bb.9 - ; SI-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY3]], %bb.0, undef %53:vgpr_32, %bb.9 - ; SI-NEXT: [[PHI3:%[0-9]+]]:vgpr_32 = PHI [[COPY2]], %bb.0, undef %55:vgpr_32, %bb.9 + ; SI-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI undef %47:vgpr_32, %bb.0, %4, %bb.9 + ; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY4]], %bb.0, undef %49:vgpr_32, %bb.9 + ; SI-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY3]], %bb.0, undef %51:vgpr_32, %bb.9 + ; SI-NEXT: [[PHI3:%[0-9]+]]:vgpr_32 = PHI [[COPY2]], %bb.0, undef %53:vgpr_32, %bb.9 ; SI-NEXT: [[SI_ELSE:%[0-9]+]]:sreg_32 = SI_ELSE killed [[SI_IF]], %bb.10, implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; SI-NEXT: S_BRANCH %bb.2 ; SI-NEXT: {{ $}} @@ -251,8 +251,8 @@ ; SI-NEXT: bb.3: ; SI-NEXT: successors: %bb.4(0x80000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[PHI4:%[0-9]+]]:vreg_64 = PHI undef %57:vreg_64, %bb.4, [[REG_SEQUENCE]], %bb.2 - ; SI-NEXT: [[PHI5:%[0-9]+]]:vgpr_32 = PHI undef %59:vgpr_32, %bb.4, [[PHI1]], %bb.2 + ; SI-NEXT: [[PHI4:%[0-9]+]]:vreg_64 = PHI undef %55:vreg_64, %bb.4, [[REG_SEQUENCE]], %bb.2 + ; SI-NEXT: [[PHI5:%[0-9]+]]:vgpr_32 = PHI undef %57:vgpr_32, %bb.4, [[PHI1]], %bb.2 ; SI-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI4]].sub0, implicit $exec ; SI-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI4]].sub1, implicit $exec ; SI-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_1]], %subreg.sub1 @@ -288,8 +288,8 @@ ; SI-NEXT: bb.7: ; SI-NEXT: successors: %bb.8(0x80000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[PHI6:%[0-9]+]]:vreg_64 = PHI undef %61:vreg_64, %bb.8, [[REG_SEQUENCE2]], %bb.6 - ; SI-NEXT: [[PHI7:%[0-9]+]]:vgpr_32 = PHI undef %63:vgpr_32, %bb.8, [[COPY4]], %bb.6 + ; SI-NEXT: [[PHI6:%[0-9]+]]:vreg_64 = 
PHI undef %59:vreg_64, %bb.8, [[REG_SEQUENCE2]], %bb.6 + ; SI-NEXT: [[PHI7:%[0-9]+]]:vgpr_32 = PHI undef %61:vgpr_32, %bb.8, [[COPY4]], %bb.6 ; SI-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI6]].sub0, implicit $exec ; SI-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI6]].sub1, implicit $exec ; SI-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_2]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_3]], %subreg.sub1 @@ -358,9 +358,9 @@ ; SI-NEXT: bb.1.Flow: ; SI-NEXT: successors: %bb.2(0x40000000), %bb.10(0x40000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI undef %50:vgpr_32, %bb.0, %4, %bb.9 - ; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY3]], %bb.0, undef %52:vgpr_32, %bb.9 - ; SI-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY2]], %bb.0, undef %54:vgpr_32, %bb.9 + ; SI-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI undef %48:vgpr_32, %bb.0, %4, %bb.9 + ; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY3]], %bb.0, undef %50:vgpr_32, %bb.9 + ; SI-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY2]], %bb.0, undef %52:vgpr_32, %bb.9 ; SI-NEXT: [[SI_ELSE:%[0-9]+]]:sreg_32 = SI_ELSE killed [[SI_IF]], %bb.10, implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; SI-NEXT: S_BRANCH %bb.2 ; SI-NEXT: {{ $}} @@ -373,7 +373,7 @@ ; SI-NEXT: bb.3: ; SI-NEXT: successors: %bb.4(0x80000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[PHI3:%[0-9]+]]:vreg_64 = PHI undef %56:vreg_64, %bb.4, [[REG_SEQUENCE]], %bb.2 + ; SI-NEXT: [[PHI3:%[0-9]+]]:vreg_64 = PHI undef %54:vreg_64, %bb.4, [[REG_SEQUENCE]], %bb.2 ; SI-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI3]].sub0, implicit $exec ; SI-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI3]].sub1, implicit $exec ; SI-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_1]], %subreg.sub1 @@ -409,7 +409,7 @@ ; SI-NEXT: 
bb.7: ; SI-NEXT: successors: %bb.8(0x80000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[PHI4:%[0-9]+]]:vreg_64 = PHI undef %58:vreg_64, %bb.8, [[REG_SEQUENCE2]], %bb.6 + ; SI-NEXT: [[PHI4:%[0-9]+]]:vreg_64 = PHI undef %56:vreg_64, %bb.8, [[REG_SEQUENCE2]], %bb.6 ; SI-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI4]].sub0, implicit $exec ; SI-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI4]].sub1, implicit $exec ; SI-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_2]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_3]], %subreg.sub1 @@ -477,8 +477,8 @@ ; SI-NEXT: {{ $}} ; SI-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM killed [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.src1.kernarg.offset, align 4, addrspace 4) ; SI-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, killed %51, 0, implicit $exec - ; SI-NEXT: %44:vgpr_32, dead %46:sreg_32_xm0_xexec = V_ADDC_U32_e64 0, killed [[S_LOAD_DWORDX2_IMM]].sub1, killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; SI-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[V_ADD_CO_U32_e64_]], %subreg.sub0, killed %44, %subreg.sub1 + ; SI-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 0, killed [[S_LOAD_DWORDX2_IMM]].sub1, killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; SI-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[V_ADD_CO_U32_e64_]], %subreg.sub0, killed [[V_ADDC_U32_e64_]], %subreg.sub1 ; SI-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8) from %ir.i10, addrspace 1) ; SI-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec ; SI-NEXT: GLOBAL_STORE_BYTE killed [[V_MOV_B]], killed [[GLOBAL_LOAD_UBYTE]], 0, 0, implicit 
$exec :: (store (s8) into `ptr addrspace(1) null`, addrspace 1) @@ -511,7 +511,7 @@ ; SI-NEXT: bb.6.sw.bb18: ; SI-NEXT: successors: %bb.5(0x80000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI undef %36:vgpr_32, %bb.3, [[GLOBAL_LOAD_UBYTE1]], %bb.4 + ; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI undef %40:vgpr_32, %bb.3, [[GLOBAL_LOAD_UBYTE1]], %bb.4 ; SI-NEXT: [[V_MOV_B2:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec ; SI-NEXT: GLOBAL_STORE_BYTE killed [[V_MOV_B2]], killed [[PHI1]], 0, 0, implicit $exec :: (store (s8) into `ptr addrspace(1) null`, addrspace 1) ; SI-NEXT: S_BRANCH %bb.5 Index: llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll +++ llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll @@ -434,7 +434,6 @@ ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-O0-NEXT: ; implicit-def: $sgpr34 ; GFX9-O0-NEXT: ; implicit-def: $sgpr34 -; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $exec ; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-O0-NEXT: ; implicit-def: $sgpr34_sgpr35 @@ -586,7 +585,6 @@ ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-O0-NEXT: ; implicit-def: $sgpr40 ; GFX9-O0-NEXT: ; implicit-def: $sgpr40 -; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v8 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 ; GFX9-O0-NEXT: v_add_co_u32_e64 v2, s[40:41], v2, v4 @@ -723,7 +721,6 @@ ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v10 ; GFX9-O0-NEXT: ; implicit-def: $sgpr35 ; GFX9-O0-NEXT: ; implicit-def: $sgpr35 -; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 killed $exec ; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 ; GFX9-O0-NEXT: s_mov_b32 s35, 0x7fffffff @@ -742,7 +739,6 @@ ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v12 ; GFX9-O0-NEXT: ; implicit-def: $sgpr35 ; 
GFX9-O0-NEXT: ; implicit-def: $sgpr35 -; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 killed $exec ; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 @@ -770,9 +766,6 @@ ; GFX9-O0-NEXT: ; implicit-def: $sgpr35 ; GFX9-O0-NEXT: ; implicit-def: $sgpr35 ; GFX9-O0-NEXT: ; implicit-def: $sgpr35 -; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $exec -; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 killed $exec -; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 killed $exec ; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6_vgpr7_vgpr8 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v10 @@ -878,23 +871,23 @@ ; GFX9-O0: ; %bb.0: ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-O0-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:244 ; 4-byte Folded 
Spill -; GFX9-O0-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, -1 -; GFX9-O0-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O0-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill @@ -918,24 +911,24 @@ ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:20 +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 
offset:116 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_load_dword v45, off, s[0:3], s32 +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; GFX9-O0-NEXT: ; implicit-def: $sgpr34 ; GFX9-O0-NEXT: ; implicit-def: $sgpr34 ; GFX9-O0-NEXT: ; implicit-def: $sgpr34 @@ -944,140 +937,126 @@ ; GFX9-O0-NEXT: ; implicit-def: $sgpr34 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-O0-NEXT: v_mov_b32_e32 v43, s5 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s7 -; GFX9-O0-NEXT: buffer_store_dword v2, off, 
s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s8 -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s9 -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s10 -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s11 -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s12 -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s13 -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s14 -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s15 -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s17 -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s18 -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s19 -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s20 -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s21 -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s22 -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-O0-NEXT: 
v_mov_b32_e32 v2, s23 -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s24 -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s25 -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s26 -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s27 -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s28 -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v2, s29 -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-O0-NEXT: ; kill: def $vgpr45 killed $vgpr45 killed $exec -; GFX9-O0-NEXT: ; kill: def $vgpr47 killed $vgpr47 killed $exec -; GFX9-O0-NEXT: ; kill: def $vgpr46 killed $vgpr46 killed $exec -; GFX9-O0-NEXT: ; kill: def $vgpr44 killed $vgpr44 killed $exec -; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10 +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s11 +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-O0-NEXT: 
v_mov_b32_e32 v1, s12 +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s14 +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s15 +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s16 +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s18 +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s19 +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s20 +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s21 +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s22 +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s23 +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s24 +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v47, s25 +; GFX9-O0-NEXT: v_mov_b32_e32 v46, s26 +; GFX9-O0-NEXT: v_mov_b32_e32 v45, s27 +; GFX9-O0-NEXT: v_mov_b32_e32 v44, s28 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s29 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 
def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v43 ; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v43 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v43 ; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v43 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v43 ; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v43 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v43 ; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v43 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v43 ; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:180 ; 4-byte Folded 
Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v43 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v43 ; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v43 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v43 ; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v43 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v43 ; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v43 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v43 ; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v43 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v43 ; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v16, v43 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v43 ; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v17, v43 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v43 ; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v18, v43 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v43 ; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v19, v43 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v43 ; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v20, v43 +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v43 ; GFX9-O0-NEXT: buffer_load_dword 
v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v21, v43 +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v43 ; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v22, v43 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v43 ; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v23, v43 +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v43 ; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v24, v43 +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v43 ; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v25, v43 +; GFX9-O0-NEXT: v_mov_b32_e32 v20, v43 ; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-O0-NEXT: v_mov_b32_e32 v26, v45 +; GFX9-O0-NEXT: v_mov_b32_e32 v21, v47 +; GFX9-O0-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v22, v46 +; GFX9-O0-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v23, v45 ; GFX9-O0-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v24, v44 +; GFX9-O0-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_waitcnt vmcnt(4) +; GFX9-O0-NEXT: v_mov_b32_e32 v25, v43 +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: v_mov_b32_e32 v26, v43 +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; GFX9-O0-NEXT: 
v_mov_b32_e32 v27, v47 ; GFX9-O0-NEXT: v_mov_b32_e32 v28, v46 -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v29, v45 ; GFX9-O0-NEXT: v_mov_b32_e32 v30, v44 ; GFX9-O0-NEXT: ; kill: def $vgpr31 killed $vgpr43 killed $exec @@ -1100,28 +1079,23 @@ ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; GFX9-O0-NEXT: ; implicit-def: $sgpr34 ; GFX9-O0-NEXT: ; implicit-def: $sgpr34 -; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $exec ; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX9-O0-NEXT: s_waitcnt vmcnt(4) ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr34 ; GFX9-O0-NEXT: ; implicit-def: $sgpr34 -; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 killed $exec ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7 ; GFX9-O0-NEXT: ; implicit-def: $sgpr34 ; GFX9-O0-NEXT: ; implicit-def: $sgpr34 -; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 killed $exec ; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v9 ; GFX9-O0-NEXT: ; implicit-def: $sgpr34 ; GFX9-O0-NEXT: ; implicit-def: $sgpr34 -; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 killed $exec ; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v9, v11 ; GFX9-O0-NEXT: ; implicit-def: $sgpr34 ; GFX9-O0-NEXT: ; implicit-def: $sgpr34 -; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 killed $exec ; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v11, v0 @@ -1220,27 +1194,27 @@ ; GFX9-O0-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-O0-NEXT: 
buffer_load_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:240 ; 
4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, -1 -; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: s_setpc_b64 s[30:31]