Index: include/llvm/CodeGen/FunctionLoweringInfo.h
===================================================================
--- include/llvm/CodeGen/FunctionLoweringInfo.h
+++ include/llvm/CodeGen/FunctionLoweringInfo.h
@@ -13,7 +13,6 @@
 #ifndef LLVM_CODEGEN_FUNCTIONLOWERINGINFO_H
 #define LLVM_CODEGEN_FUNCTIONLOWERINGINFO_H
-
 #include "llvm/ADT/APInt.h"
 #include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/DenseMap.h"
@@ -21,6 +20,7 @@
 #include "llvm/ADT/Optional.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
 #include "llvm/CodeGen/ISDOpcodes.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/TargetRegisterInfo.h"
@@ -57,6 +57,7 @@
   const TargetLowering *TLI;
   MachineRegisterInfo *RegInfo;
   BranchProbabilityInfo *BPI;
+  const LegacyDivergenceAnalysis *DA;
   /// CanLowerReturn - true iff the function's return value can be lowered to
   /// registers.
   bool CanLowerReturn;
@@ -240,9 +241,11 @@
     return ValueMap.count(V);
   }

-  unsigned CreateReg(MVT VT);
+  unsigned CreateReg(MVT VT, bool isDivergent = false);
+
+  unsigned CreateRegs(const Value *V);

-  unsigned CreateRegs(Type *Ty);
+  unsigned CreateRegs(Type *Ty, bool isDivergent = false);

   unsigned InitializeRegForValue(const Value *V) {
     // Tokens never live in vregs.
@@ -251,7 +254,7 @@
     unsigned &R = ValueMap[V];
     assert(R == 0 && "Already initialized this value register!");
     assert(VirtReg2Value.empty());
-    return R = CreateRegs(V->getType());
+    return R = CreateRegs(V);
   }

   /// GetLiveOutRegInfo - Gets LiveOutInfo for a register, returning NULL if the
Index: include/llvm/CodeGen/SelectionDAG.h
===================================================================
--- include/llvm/CodeGen/SelectionDAG.h
+++ include/llvm/CodeGen/SelectionDAG.h
@@ -406,6 +406,7 @@
   const TargetLowering &getTargetLoweringInfo() const { return *TLI; }
   const TargetLibraryInfo &getLibInfo() const { return *LibInfo; }
   const SelectionDAGTargetInfo &getSelectionDAGInfo() const { return *TSI; }
+  const LegacyDivergenceAnalysis *getDivergenceAnalysis() const { return DA; }
   LLVMContext *getContext() const { return Context; }
   OptimizationRemarkEmitter &getORE() const { return *ORE; }
Index: include/llvm/CodeGen/TargetLowering.h
===================================================================
--- include/llvm/CodeGen/TargetLowering.h
+++ include/llvm/CodeGen/TargetLowering.h
@@ -625,12 +625,21 @@
   /// Return the register class that should be used for the specified value
   /// type.
-  virtual const TargetRegisterClass *getRegClassFor(MVT VT) const {
+  virtual const TargetRegisterClass *
+  getRegClassFor(MVT VT, bool isDivergent = false) const {
+    (void)isDivergent;
     const TargetRegisterClass *RC = RegClassForVT[VT.SimpleTy];
     assert(RC && "This value type is not natively supported!");
     return RC;
   }

+  /// Allows the target to decide about the register class of a specific
+  /// value that is live outside of its defining block.
+  /// Returns true if the value needs a uniform register class.
+  virtual bool requiresUniformRegister(MachineFunction &MF,
+                                       const Value *) const {
+    return false;
+  }
+
   /// Return the 'representative' register class for the specified value
   /// type.
   ///
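The two hooks above are meant to be overridden together: getRegClassFor consumes the per-value divergence bit, while requiresUniformRegister lets a target veto divergence for values that must stay in uniform registers. A minimal sketch of the intended override pattern, using hypothetical names (MyTargetLowering, getEquivalentDivergentClass, isControlFlowMask are illustrative, not part of this patch); the real AMDGPU implementation is in SIISelLowering.cpp further down:

    const TargetRegisterClass *
    MyTargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
      // Start from the ordinary per-VT mapping, then re-bank by divergence.
      const TargetRegisterClass *RC =
          TargetLoweringBase::getRegClassFor(VT, false);
      return isDivergent ? getEquivalentDivergentClass(RC) : RC;
    }

    bool MyTargetLowering::requiresUniformRegister(MachineFunction &MF,
                                                   const Value *V) const {
      // Values that feed lane-mask plumbing must stay uniform even when the
      // divergence analysis marks them divergent.
      return isControlFlowMask(V); // hypothetical target-specific predicate
    }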
Index: include/llvm/CodeGen/TargetRegisterInfo.h
===================================================================
--- include/llvm/CodeGen/TargetRegisterInfo.h
+++ include/llvm/CodeGen/TargetRegisterInfo.h
@@ -520,6 +520,11 @@
   /// function. Used by MachineRegisterInfo::isConstantPhysReg().
   virtual bool isConstantPhysReg(unsigned PhysReg) const { return false; }

+  /// Returns true if the register class is considered divergent.
+  virtual bool isDivergentRegClass(const TargetRegisterClass *RC) const {
+    return false;
+  }
+
   /// Physical registers that may be modified within a function but are
   /// guaranteed to be restored before any uses. This is useful for targets that
   /// have call sequences where a GOT register may be updated by the caller
Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -13843,9 +13843,11 @@
   assert(DAG && "Missing context");
   const TargetLowering &TLI = DAG->getTargetLoweringInfo();
   EVT ResVT = Use->getValueType(0);
-  const TargetRegisterClass *ResRC = TLI.getRegClassFor(ResVT.getSimpleVT());
+  const TargetRegisterClass *ResRC =
+      TLI.getRegClassFor(ResVT.getSimpleVT(), Use->isDivergent());
   const TargetRegisterClass *ArgRC =
-      TLI.getRegClassFor(Use->getOperand(0).getValueType().getSimpleVT());
+      TLI.getRegClassFor(Use->getOperand(0).getValueType().getSimpleVT(),
+                         Use->getOperand(0)->isDivergent());

   if (ArgRC == ResRC || !TLI.isOperationLegal(ISD::LOAD, ResVT))
     return false;
Index: lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
+++ lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
@@ -85,6 +85,7 @@
   RegInfo = &MF->getRegInfo();
   const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering();
   unsigned StackAlign = TFI->getStackAlignment();
+  DA = DAG->getDivergenceAnalysis();

   // Check whether the function can return without sret-demotion.
   SmallVector<ISD::OutputArg, 4> Outs;
@@ -345,9 +346,9 @@
 }

 /// CreateReg - Allocate a single virtual register for the given type.
-unsigned FunctionLoweringInfo::CreateReg(MVT VT) {
+unsigned FunctionLoweringInfo::CreateReg(MVT VT, bool isDivergent) {
   return RegInfo->createVirtualRegister(
-      MF->getSubtarget().getTargetLowering()->getRegClassFor(VT));
+      MF->getSubtarget().getTargetLowering()->getRegClassFor(VT, isDivergent));
 }

 /// CreateRegs - Allocate the appropriate number of virtual registers of
@@ -357,7 +358,7 @@
 /// In the case that the given value has struct or array type, this function
 /// will assign registers for each member or element.
 ///
-unsigned FunctionLoweringInfo::CreateRegs(Type *Ty) {
+unsigned FunctionLoweringInfo::CreateRegs(Type *Ty, bool isDivergent) {
   const TargetLowering *TLI = MF->getSubtarget().getTargetLowering();

   SmallVector<EVT, 4> ValueVTs;
@@ -370,13 +371,18 @@
     unsigned NumRegs = TLI->getNumRegisters(Ty->getContext(), ValueVT);
     for (unsigned i = 0; i != NumRegs; ++i) {
-      unsigned R = CreateReg(RegisterVT);
+      unsigned R = CreateReg(RegisterVT, isDivergent);
       if (!FirstReg) FirstReg = R;
     }
   }
   return FirstReg;
 }

+unsigned FunctionLoweringInfo::CreateRegs(const Value *V) {
+  return CreateRegs(V->getType(), DA && !TLI->requiresUniformRegister(*MF, V) &&
+                                      DA->isDivergent(V));
+}
+
 /// GetLiveOutRegInfo - Gets LiveOutInfo for a register, returning NULL if the
 /// register is a PHI destination and the PHI's LiveOutInfo is not valid. If
 /// the register's LiveOutInfo is for a smaller bit width, it is extended to
Index: lib/CodeGen/SelectionDAG/InstrEmitter.h
===================================================================
--- lib/CodeGen/SelectionDAG/InstrEmitter.h
+++ lib/CodeGen/SelectionDAG/InstrEmitter.h
@@ -83,7 +83,7 @@
   /// supports SubIdx sub-registers. Emit a copy if that isn't possible.
   /// Return the virtual register to use.
   unsigned ConstrainForSubReg(unsigned VReg, unsigned SubIdx, MVT VT,
-                              const DebugLoc &DL);
+                              bool isDivergent, const DebugLoc &DL);

   /// EmitSubregNode - Generate machine code for subreg nodes.
   ///
Index: lib/CodeGen/SelectionDAG/InstrEmitter.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/InstrEmitter.cpp
+++ lib/CodeGen/SelectionDAG/InstrEmitter.cpp
@@ -105,7 +105,7 @@
   // Stick to the preferred register classes for legal types.
   if (TLI->isTypeLegal(VT))
-    UseRC = TLI->getRegClassFor(VT);
+    UseRC = TLI->getRegClassFor(VT, Node->isDivergent());

   if (!IsClone && !IsCloned)
     for (SDNode *User : Node->uses()) {
@@ -164,7 +164,7 @@
            "Incompatible phys register def and uses!");
     DstRC = UseRC;
   } else {
-    DstRC = TLI->getRegClassFor(VT);
+    DstRC = TLI->getRegClassFor(VT, Node->isDivergent());
   }

   // If all uses are reading from the src physical register and copying the
@@ -225,8 +225,9 @@
   // type correctly. For example, a 64-bit float (X86::FR64) can't live in
   // the 32-bit float super-class (X86::FR32).
   if (i < NumResults && TLI->isTypeLegal(Node->getSimpleValueType(i))) {
-    const TargetRegisterClass *VTRC =
-        TLI->getRegClassFor(Node->getSimpleValueType(i));
+    const TargetRegisterClass *VTRC = TLI->getRegClassFor(
+        Node->getSimpleValueType(i),
+        Node->isDivergent() || (RC && TRI->isDivergentRegClass(RC)));
     if (RC)
       VTRC = TRI->getCommonSubClass(RC, VTRC);
     if (VTRC)
@@ -290,7 +291,8 @@
   // does not include operand register class info.
   if (!VReg) {
     const TargetRegisterClass *RC =
-        TLI->getRegClassFor(Op.getSimpleValueType());
+        TLI->getRegClassFor(Op.getSimpleValueType(),
+                            Op.getNode()->isDivergent());
     VReg = MRI->createVirtualRegister(RC);
   }
   BuildMI(*MBB, InsertPos, Op.getDebugLoc(),
@@ -395,11 +397,13 @@
   } else if (RegisterSDNode *R = dyn_cast<RegisterSDNode>(Op)) {
     unsigned VReg = R->getReg();
     MVT OpVT = Op.getSimpleValueType();
-    const TargetRegisterClass *OpRC =
-        TLI->isTypeLegal(OpVT) ? TLI->getRegClassFor(OpVT) : nullptr;
     const TargetRegisterClass *IIRC =
         II ? TRI->getAllocatableClass(TII->getRegClass(*II, IIOpNum, TRI, *MF))
            : nullptr;
+    const TargetRegisterClass *OpRC =
+        TLI->isTypeLegal(OpVT) ? TLI->getRegClassFor(OpVT,
+                                     (IIRC && TRI->isDivergentRegClass(IIRC)) ||
+                                     Op.getNode()->isDivergent()) : nullptr;

     if (OpRC && IIRC && OpRC != IIRC &&
         TargetRegisterInfo::isVirtualRegister(VReg)) {
@@ -464,7 +468,7 @@
 }

 unsigned InstrEmitter::ConstrainForSubReg(unsigned VReg, unsigned SubIdx,
-                                          MVT VT, const DebugLoc &DL) {
+                                          MVT VT, bool isDivergent, const DebugLoc &DL) {
   const TargetRegisterClass *VRC = MRI->getRegClass(VReg);
   const TargetRegisterClass *RC = TRI->getSubClassWithSubReg(VRC, SubIdx);
@@ -479,7 +483,7 @@
   // VReg couldn't be reasonably constrained. Emit a COPY to a new virtual
   // register instead.
-  RC = TRI->getSubClassWithSubReg(TLI->getRegClassFor(VT), SubIdx);
+  RC = TRI->getSubClassWithSubReg(TLI->getRegClassFor(VT, isDivergent), SubIdx);
   assert(RC && "No legal register class for VT supports that SubIdx");
   unsigned NewReg = MRI->createVirtualRegister(RC);
   BuildMI(*MBB, InsertPos, DL, TII->get(TargetOpcode::COPY), NewReg)
@@ -514,7 +518,7 @@
   // classes.
   unsigned SubIdx = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
   const TargetRegisterClass *TRC =
-      TLI->getRegClassFor(Node->getSimpleValueType(0));
+      TLI->getRegClassFor(Node->getSimpleValueType(0), Node->isDivergent());

   unsigned Reg;
   MachineInstr *DefMI;
@@ -548,8 +552,7 @@
     if (TargetRegisterInfo::isVirtualRegister(Reg))
       Reg = ConstrainForSubReg(Reg, SubIdx,
                                Node->getOperand(0).getSimpleValueType(),
-                               Node->getDebugLoc());
-
+                               Node->isDivergent(), Node->getDebugLoc());
     // Create the destreg if it is missing.
     if (VRBase == 0)
       VRBase = MRI->createVirtualRegister(TRC);
@@ -584,7 +587,8 @@
   //
   // There is no constraint on the %src register class.
   //
-  const TargetRegisterClass *SRC = TLI->getRegClassFor(Node->getSimpleValueType(0));
+  const TargetRegisterClass *SRC = TLI->getRegClassFor(Node->getSimpleValueType(0),
+                                                       Node->isDivergent());
   SRC = TRI->getSubClassWithSubReg(SRC, SubIdx);
   assert(SRC && "No register class supports VT and SubIdx for INSERT_SUBREG");
Index: lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -9691,7 +9691,7 @@
       if (const Constant *C = dyn_cast<Constant>(PHIOp)) {
         unsigned &RegOut = ConstantsOut[C];
         if (RegOut == 0) {
-          RegOut = FuncInfo.CreateRegs(C->getType());
+          RegOut = FuncInfo.CreateRegs(C);
           CopyValueToVirtualRegister(C, RegOut);
         }
         Reg = RegOut;
@@ -9704,7 +9704,7 @@
           assert(isa<AllocaInst>(PHIOp) &&
                  FuncInfo.StaticAllocaMap.count(cast<AllocaInst>(PHIOp)) &&
                  "Didn't codegen value into a register!??");
-          Reg = FuncInfo.CreateRegs(PHIOp->getType());
+          Reg = FuncInfo.CreateRegs(PHIOp);
           CopyValueToVirtualRegister(PHIOp, Reg);
         }
       }
Index: lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -723,6 +723,8 @@
   if (!(isa<ConstantSDNode>(this) || isa<ConstantFPSDNode>(this)))
     OS << " # D:" << isDivergent();

+  OS << " t" << PersistentId;
+
   if (G && !G->GetDbgValues(this).empty()) {
     OS << " [NoOfDbgValues=" << G->GetDbgValues(this).size() << ']';
     for (SDDbgValue *Dbg : G->GetDbgValues(this))
Index: lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -1741,7 +1741,7 @@
         !Inst->use_empty()) {
       unsigned &R = FuncInfo->ValueMap[Inst];
       if (!R)
-        R = FuncInfo->CreateRegs(Inst->getType());
+        R = FuncInfo->CreateRegs(Inst);
     }

     bool HadTailCall = false;
Index: lib/Target/AMDGPU/SIFixSGPRCopies.cpp
===================================================================
--- lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -621,63 +621,70 @@
       break;
     }
     case AMDGPU::PHI: {
-      unsigned Reg = MI.getOperand(0).getReg();
-      if (!TRI->isSGPRClass(MRI.getRegClass(Reg)))
-        break;
-
-      // We don't need to fix the PHI if the common dominator of the
-      // two incoming blocks terminates with a uniform branch.
-      bool HasVGPROperand = phiHasVGPROperands(MI, MRI, TRI, TII);
-      if (MI.getNumExplicitOperands() == 5 && !HasVGPROperand) {
-        MachineBasicBlock *MBB0 = MI.getOperand(2).getMBB();
-        MachineBasicBlock *MBB1 = MI.getOperand(4).getMBB();
-
-        if (!predsHasDivergentTerminator(MBB0, TRI) &&
-            !predsHasDivergentTerminator(MBB1, TRI)) {
-          LLVM_DEBUG(dbgs()
-                     << "Not fixing PHI for uniform branch: " << MI << '\n');
+      unsigned hasVGPRUses = 0;
+      SetVector<MachineInstr *> worklist;
+      worklist.insert(&MI);
+      while (!worklist.empty()) {
+        MachineInstr *Instr = worklist.pop_back_val();
+        unsigned Reg = Instr->getOperand(0).getReg();
+        for (auto &Use : MRI.use_operands(Reg)) {
+          MachineInstr *UseMI = Use.getParent();
+          if (UseMI->isCopy() || UseMI->isRegSequence()) {
+            if (UseMI->isCopy()) {
+              unsigned Reg = UseMI->getOperand(0).getReg();
+              if (TRI->isPhysicalRegister(Reg)) {
+                if (!TRI->isSGPRReg(MRI, Reg)) {
+                  hasVGPRUses++;
+                }
+              }
+            }
+            worklist.insert(UseMI);
+            continue;
+          }
+
+          unsigned OpNo = UseMI->getOperandNo(&Use);
+          const MCInstrDesc &Desc = TII->get(UseMI->getOpcode());
+          if (Desc.OpInfo && Desc.OpInfo[OpNo].RegClass != -1) {
+            const TargetRegisterClass *OpRC =
+                TRI->getRegClass(Desc.OpInfo[OpNo].RegClass);
+            if (!TRI->isSGPRClass(OpRC) && OpRC != &AMDGPU::VS_32RegClass &&
+                OpRC != &AMDGPU::VS_64RegClass) {
+              hasVGPRUses++;
+            }
+          }
+        }
+      }
+      bool hasVGPRInput = false;
+      for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
+        unsigned InputReg = MI.getOperand(i).getReg();
+        MachineInstr *Def = MRI.getVRegDef(InputReg);
+        if (TRI->isVGPR(MRI, InputReg)) {
+          if (Def->isCopy()) {
+            unsigned SrcReg = Def->getOperand(1).getReg();
+            const TargetRegisterClass *RC =
+                TRI->isVirtualRegister(SrcReg) ? MRI.getRegClass(SrcReg)
+                                               : TRI->getPhysRegClass(SrcReg);
+            if (TRI->isSGPRClass(RC))
+              continue;
+          }
+          hasVGPRInput = true;
+          break;
+        } else if (Def->isCopy() &&
+                   TRI->isVGPR(MRI, Def->getOperand(1).getReg())) {
+          hasVGPRInput = true;
           break;
         }
       }
+      unsigned PHIRes = MI.getOperand(0).getReg();
+      const TargetRegisterClass *RC0 = MRI.getRegClass(PHIRes);

-      // If a PHI node defines an SGPR and any of its operands are VGPRs,
-      // then we need to move it to the VALU.
-      //
-      // Also, if a PHI node defines an SGPR and has all SGPR operands
-      // we must move it to the VALU, because the SGPR operands will
-      // all end up being assigned the same register, which means
-      // there is a potential for a conflict if different threads take
-      // different control flow paths.
-      //
-      // For Example:
-      //
-      // sgpr0 = def;
-      // ...
-      // sgpr1 = def;
-      // ...
-      // sgpr2 = PHI sgpr0, sgpr1
-      // use sgpr2;
-      //
-      // Will Become:
-      //
-      // sgpr2 = def;
-      // ...
-      // sgpr2 = def;
-      // ...
-      // use sgpr2
-      //
-      // The one exception to this rule is when one of the operands
-      // is defined by a SI_BREAK, SI_IF_BREAK, or SI_ELSE_BREAK
-      // instruction. In this case, we know the program will
-      // never enter the second block (the loop) without entering
-      // the first block (where the condition is computed), so there
-      // is no chance for values to be over-written.
-
-      SmallSet<unsigned, 8> Visited;
-      if (HasVGPROperand || !phiHasBreakDef(MI, MRI, Visited)) {
-        LLVM_DEBUG(dbgs() << "Fixing PHI: " << MI);
-        TII->moveToVALU(MI, MDT);
+      if ((!TRI->isVGPR(MRI, PHIRes) && RC0 != &AMDGPU::VReg_1RegClass) &&
+          (hasVGPRInput || hasVGPRUses > 1)) {
+        TII->moveToVALU(MI);
+      } else {
+        TII->legalizeOperands(MI, MDT);
       }
+
       break;
     }
     case AMDGPU::REG_SEQUENCE:
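In short, the rewritten PHI handling replaces the break-def special case with a census of the PHI's neighborhood: hasVGPRUses counts users that genuinely require vector operands (looking through COPY and REG_SEQUENCE chains), hasVGPRInput checks whether an incoming value truly lives in a VGPR (again looking through copies from SGPRs), and the PHI is moved to the VALU only when it is not already a VGPR/VReg_1 definition and has a real VGPR input or more than one VGPR use; otherwise its operands are merely legalized in place. A hand-written MIR sketch of a PHI that now stays scalar (illustrative only, not one of this patch's tests):

    bb.2:
      ; Both incoming values are SGPRs and the only user is a scalar ALU
      ; instruction, so neither hasVGPRInput nor hasVGPRUses fires and the
      ; PHI keeps its SGPR register class.
      %5:sreg_32_xm0 = PHI %3, %bb.0, %4, %bb.1
      %6:sreg_32_xm0 = S_ADD_I32 %5, %5, implicit-def dead $scc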
Index: lib/Target/AMDGPU/SIISelLowering.h
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.h
+++ lib/Target/AMDGPU/SIISelLowering.h
@@ -367,7 +367,11 @@
                               bool SNaN = false,
                               unsigned Depth = 0) const override;
   AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override;
-};
+  virtual const TargetRegisterClass *
+  getRegClassFor(MVT VT, bool isDivergent) const override;
+  virtual bool requiresUniformRegister(MachineFunction &MF,
+                                       const Value *V) const override;
+};

 } // End namespace llvm
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -9547,7 +9547,8 @@
       break;
     MVT VT = Src0.getValueType().getSimpleVT();
-    const TargetRegisterClass *RC = getRegClassFor(VT);
+    const TargetRegisterClass *RC = getRegClassFor(VT,
+                                                   Src0.getNode()->isDivergent());

     MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
     SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT);
@@ -9989,28 +9990,116 @@
                                             SNaN, Depth);
 }

-TargetLowering::AtomicExpansionKind
-SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
-  switch (RMW->getOperation()) {
-  case AtomicRMWInst::FAdd: {
-    Type *Ty = RMW->getType();
+const TargetRegisterClass *
+SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
+  const TargetRegisterClass *RC = TargetLoweringBase::getRegClassFor(VT, false);
+  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
+  if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
+    return &AMDGPU::SReg_64RegClass;
+  if (!TRI->isSGPRClass(RC) && !isDivergent)
+    return TRI->getEquivalentSGPRClass(RC);
+  else if (TRI->isSGPRClass(RC) && isDivergent)
+    return TRI->getEquivalentVGPRClass(RC);

-    // We don't have a way to support 16-bit atomics now, so just leave them
-    // as-is.
-    if (Ty->isHalfTy())
-      return AtomicExpansionKind::None;
+  return RC;
+}

-    if (!Ty->isFloatTy())
-      return AtomicExpansionKind::CmpXChg;
+static bool hasIfBreakUser(const Value *V, SetVector<const Value *> &Visited) {
+  if (Visited.count(V))
+    return false;
+  Visited.insert(V);
+  bool Result = false;
+  for (auto U : V->users()) {
+    if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(U)) {
+      if ((Intrinsic->getIntrinsicID() == Intrinsic::amdgcn_if_break) &&
+          (V == U->getOperand(1)))
+        Result = true;
+    } else {
+      Result = hasIfBreakUser(U, Visited);
+    }
+    if (Result)
+      break;
+  }
+  return Result;
+}

-    // TODO: Do have these for flat. Older targets also had them for buffers.
-    unsigned AS = RMW->getPointerAddressSpace();
-    return (AS == AMDGPUAS::LOCAL_ADDRESS && Subtarget->hasLDSFPAtomics()) ?
-        AtomicExpansionKind::None : AtomicExpansionKind::CmpXChg;
+bool SITargetLowering::requiresUniformRegister(MachineFunction &MF,
+                                               const Value *V) const {
+  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
+    switch (Intrinsic->getIntrinsicID()) {
+    default:
+      return false;
+    case Intrinsic::amdgcn_if_break:
+      return true;
+    }
   }
-  default:
-    break;
+  if (const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(V)) {
+    if (const IntrinsicInst *Intrinsic =
+            dyn_cast<IntrinsicInst>(ExtValue->getOperand(0))) {
+      switch (Intrinsic->getIntrinsicID()) {
+      default:
+        return false;
+      case Intrinsic::amdgcn_if:
+      case Intrinsic::amdgcn_else: {
+        ArrayRef<unsigned> Indices = ExtValue->getIndices();
+        if (Indices.size() == 1 && Indices[0] == 1) {
+          return true;
+        }
+      }
+      }
+    }
   }
+  if (const CallInst *CI = dyn_cast<CallInst>(V)) {
+    if (isa<InlineAsm>(CI->getCalledValue())) {
+      const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
+      ImmutableCallSite CS(CI);
+      TargetLowering::AsmOperandInfoVector TargetConstraints = ParseConstraints(
+          MF.getDataLayout(), Subtarget->getRegisterInfo(), CS);
+      for (auto &TC : TargetConstraints) {
+        if (TC.Type == InlineAsm::isOutput) {
+          ComputeConstraintToUse(TC, SDValue());
+          unsigned AssignedReg;
+          const TargetRegisterClass *RC;
+          std::tie(AssignedReg, RC) = getRegForInlineAsmConstraint(
+              SIRI, TC.ConstraintCode,
+              getSimpleValueType(MF.getDataLayout(), CS.getType()));
+          if (RC) {
+            MachineRegisterInfo &MRI = MF.getRegInfo();
+            if (AssignedReg != 0 && SIRI->isSGPRReg(MRI, AssignedReg))
+              return true;
+            else if (SIRI->isSGPRClass(RC))
+              return true;
+          }
+        }
+      }
+    }
+  }
+  SetVector<const Value *> Visited;
+  return hasIfBreakUser(V, Visited);
+}

-  return AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(RMW);
+TargetLowering::AtomicExpansionKind
+SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
+  switch (RMW->getOperation()) {
+  case AtomicRMWInst::FAdd: {
+    Type *Ty = RMW->getType();
+
+    // We don't have a way to support 16-bit atomics now, so just leave them
+    // as-is.
+    if (Ty->isHalfTy())
+      return AtomicExpansionKind::None;
+
+    if (!Ty->isFloatTy())
+      return AtomicExpansionKind::CmpXChg;
+
+    // TODO: Do have these for flat. Older targets also had them for buffers.
+    unsigned AS = RMW->getPointerAddressSpace();
+    return (AS == AMDGPUAS::LOCAL_ADDRESS && Subtarget->hasLDSFPAtomics()) ?
+        AtomicExpansionKind::None : AtomicExpansionKind::CmpXChg;
+  }
+  default:
+    break;
+  }
+
+  return AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(RMW);
 }
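requiresUniformRegister pins two kinds of values to scalar registers regardless of the divergence analysis verdict: the lane-mask values produced and consumed by the structured control-flow intrinsics (llvm.amdgcn.if, llvm.amdgcn.else, llvm.amdgcn.if.break), and inline-asm outputs whose constraint resolves to an SGPR or an SGPR class. An illustrative IR fragment for the inline-asm case (hand-written, not taken from the tests below):

    ; The "=s" output constraint resolves to an SGPR class, so %v must be
    ; assigned a uniform register class even if it were marked divergent.
    define amdgpu_kernel void @asm_sgpr_out(i32 addrspace(1)* %out) {
      %v = call i32 asm "s_mov_b32 $0, 7", "=s"()
      store i32 %v, i32 addrspace(1)* %out
      ret void
    }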
Index: lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.cpp
+++ lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -5130,7 +5130,7 @@
   case AMDGPU::INSERT_SUBREG:
   case AMDGPU::WQM:
   case AMDGPU::WWM:
-    if (RI.hasVGPRs(NewDstRC))
+    if (RI.hasVGPRs(NewDstRC) || NewDstRC == &AMDGPU::VReg_1RegClass)
       return nullptr;

     NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
Index: lib/Target/AMDGPU/SIRegisterInfo.h
===================================================================
--- lib/Target/AMDGPU/SIRegisterInfo.h
+++ lib/Target/AMDGPU/SIRegisterInfo.h
@@ -195,6 +195,11 @@
                    unsigned Reg) const;
   bool isVGPR(const MachineRegisterInfo &MRI, unsigned Reg) const;

+  virtual bool isDivergentRegClass(const TargetRegisterClass *RC) const override {
+    return !isSGPRClass(RC);
+  }
+
   bool isSGPRPressureSet(unsigned SetID) const {
     return SGPRPressureSets.test(SetID) &&
            !VGPRPressureSets.test(SetID);
   }
Index: lib/Target/ARM/ARMISelLowering.h
===================================================================
--- lib/Target/ARM/ARMISelLowering.h
+++ lib/Target/ARM/ARMISelLowering.h
@@ -456,7 +456,8 @@
     /// getRegClassFor - Return the register class that should be used for the
     /// specified value type.
-    const TargetRegisterClass *getRegClassFor(MVT VT) const override;
+    const TargetRegisterClass *
+    getRegClassFor(MVT VT, bool isDivergent = false) const override;

     /// Returns true if a cast between SrcAS and DestAS is a noop.
     bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override {
Index: lib/Target/ARM/ARMISelLowering.cpp
===================================================================
--- lib/Target/ARM/ARMISelLowering.cpp
+++ lib/Target/ARM/ARMISelLowering.cpp
@@ -1429,7 +1429,9 @@
 /// getRegClassFor - Return the register class that should be used for the
 /// specified value type.
-const TargetRegisterClass *ARMTargetLowering::getRegClassFor(MVT VT) const {
+const TargetRegisterClass *
+ARMTargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
+  (void)isDivergent;
   // Map v4i64 to QQ registers but do not make the type legal. Similarly map
   // v8i64 to QQQQ registers. v4i64 and v8i64 are only used for REG_SEQUENCE to
   // load / store 4 to 8 consecutive D registers.
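The test updates that follow mostly reflect a single behavioral change: values the divergence analysis proves uniform now receive scalar register classes at selection time, so comparisons, bit operations, and loop counters that previously round-tripped through VALU copies stay on the SALU (v_cmp/v_and becoming s_cmp/s_and, s_cbranch_vccnz becoming s_cbranch_scc1, and so on). A minimal hand-written illustration of the uniform/divergent split (an assumption-level example, not one of the updated tests): kernel arguments are uniform, while a value derived from the workitem id is divergent:

    declare i32 @llvm.amdgcn.workitem.id.x()

    define amdgpu_kernel void @uniform_vs_divergent(i32 addrspace(1)* %out,
                                                    i32 %a, i32 %b) {
      ; %a and %b are uniform kernel arguments: expect a scalar s_add_i32.
      %u = add i32 %a, %b
      ; %tid differs per lane, so %d is divergent: expect a vector v_add.
      %tid = call i32 @llvm.amdgcn.workitem.id.x()
      %d = add i32 %u, %tid
      store i32 %d, i32 addrspace(1)* %out
      ret void
    }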
Index: test/CodeGen/AMDGPU/atomicrmw-nand.ll
===================================================================
--- test/CodeGen/AMDGPU/atomicrmw-nand.ll
+++ test/CodeGen/AMDGPU/atomicrmw-nand.ll
@@ -5,11 +5,12 @@
 ; GCN-LABEL: atomic_nand_i32_lds:
 ; GCN: ; %bb.0:
 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: ds_read_b32 v2, v0
+; GCN-NEXT: ds_read_b32 v1, v0
 ; GCN-NEXT: s_mov_b64 s[6:7], 0
 ; GCN-NEXT: BB0_1: ; %atomicrmw.start
 ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v2, v1
 ; GCN-NEXT: v_not_b32_e32 v1, v2
 ; GCN-NEXT: v_or_b32_e32 v1, -5, v1
 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -17,7 +18,6 @@
 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN-NEXT: buffer_wbinvl1_vol
 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GCN-NEXT: v_mov_b32_e32 v2, v1
 ; GCN-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
 ; GCN-NEXT: s_andn2_b64 exec, exec, s[6:7]
 ; GCN-NEXT: s_cbranch_execnz BB0_1
@@ -33,11 +33,12 @@
 ; GCN-LABEL: atomic_nand_i32_global:
 ; GCN: ; %bb.0:
 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: global_load_dword v3, v[0:1], off
+; GCN-NEXT: global_load_dword v2, v[0:1], off
 ; GCN-NEXT: s_mov_b64 s[6:7], 0
 ; GCN-NEXT: BB1_1: ; %atomicrmw.start
 ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v3, v2
 ; GCN-NEXT: v_not_b32_e32 v2, v3
 ; GCN-NEXT: v_or_b32_e32 v2, -5, v2
 ; GCN-NEXT: s_waitcnt vmcnt(0)
@@ -45,7 +46,6 @@
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: buffer_wbinvl1_vol
 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GCN-NEXT: v_mov_b32_e32 v3, v2
 ; GCN-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
 ; GCN-NEXT: s_andn2_b64 exec, exec, s[6:7]
 ; GCN-NEXT: s_cbranch_execnz BB1_1
@@ -61,11 +61,12 @@
 ; GCN-LABEL: atomic_nand_i32_flat:
 ; GCN: ; %bb.0:
 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: flat_load_dword v3, v[0:1]
+; GCN-NEXT: flat_load_dword v2, v[0:1]
 ; GCN-NEXT: s_mov_b64 s[6:7], 0
 ; GCN-NEXT: BB2_1: ; %atomicrmw.start
 ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v3, v2
 ; GCN-NEXT: v_not_b32_e32 v2, v3
 ; GCN-NEXT: v_or_b32_e32 v2, -5, v2
 ; GCN-NEXT: s_waitcnt vmcnt(0)
@@ -74,7 +75,6 @@
 ; GCN-NEXT: buffer_wbinvl1_vol
 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GCN-NEXT: v_mov_b32_e32 v3, v2
 ; GCN-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
 ; GCN-NEXT: s_andn2_b64 exec, exec, s[6:7]
 ; GCN-NEXT: s_cbranch_execnz BB2_1
Index: test/CodeGen/AMDGPU/branch-relaxation.ll
===================================================================
--- test/CodeGen/AMDGPU/branch-relaxation.ll
+++ test/CodeGen/AMDGPU/branch-relaxation.ll
@@ -99,7 +99,7 @@
 ; GCN-LABEL: {{^}}uniform_conditional_min_long_forward_vcnd_branch:
 ; GCN: s_load_dword [[CND:s[0-9]+]]
-; GCN-DAG: v_mov_b32_e32 [[V_CND:v[0-9]+]], [[CND]]
+
 ; GCN-DAG: v_cmp_eq_f32_e64 [[UNMASKED:s\[[0-9]+:[0-9]+\]]], [[CND]], 0
 ; GCN-DAG: s_and_b64 vcc, exec, [[UNMASKED]]
 ; GCN: s_cbranch_vccz [[LONGBB:BB[0-9]+_[0-9]+]]
@@ -117,6 +117,7 @@
 ; GCN: v_nop_e64
 ; GCN: [[ENDBB]]:
+; GCN: v_mov_b32_e32 [[V_CND:v[0-9]+]], [[CND]]
 ; GCN: buffer_store_dword [[V_CND]]
 ; GCN: s_endpgm
 define amdgpu_kernel void @uniform_conditional_min_long_forward_vcnd_branch(float addrspace(1)* %arg, float %cnd) #0 {
Index: test/CodeGen/AMDGPU/branch-uniformity.ll
===================================================================
--- test/CodeGen/AMDGPU/branch-uniformity.ll
+++ test/CodeGen/AMDGPU/branch-uniformity.ll
@@ -8,8 +8,8 @@
 ;
 ; CHECK-LABEL: {{^}}main:
 ; CHECK: ; %LOOP49
-; CHECK: v_cmp_ne_u32_e32 vcc,
-; CHECK: s_cbranch_vccnz
+; CHECK: s_cmp_lg_u32 s{{[0-9]+}}, 0
+; CHECK: s_cbranch_scc1
 ; CHECK: ; %ENDIF53
 define amdgpu_vs float @main(i32 %in) {
 main_body:
Index: test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
===================================================================
--- test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
+++ test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
@@ -89,7 +89,7 @@
 }

 ; GCN-LABEL: {{^}}divergent_loop:
-; VGPR: workitem_private_segment_byte_size = 16{{$}}
+; VGPR: workitem_private_segment_byte_size = 12{{$}}

 ; GCN: {{^}}; %bb.0:
@@ -123,10 +123,9 @@
 ; GCN: [[LOOP:BB[0-9]+_[0-9]+]]:
 ; GCN: buffer_load_dword v[[VAL_LOOP_RELOAD:[0-9]+]], off, s[0:3], s7 offset:[[LOAD0_OFFSET]] ; 4-byte Folded Reload
 ; GCN: v_subrev_i32_e32 [[VAL_LOOP:v[0-9]+]], vcc, v{{[0-9]+}}, v[[VAL_LOOP_RELOAD]]
-; GCN: v_cmp_ne_u32_e32 vcc,
-; GCN: s_and_b64 vcc, exec, vcc
+; GCN: s_cmp_lg_u32
 ; GCN: buffer_store_dword [[VAL_LOOP]], off, s[0:3], s7 offset:[[VAL_SUB_OFFSET:[0-9]+]] ; 4-byte Folded Spill
-; GCN-NEXT: s_cbranch_vccnz [[LOOP]]
+; GCN-NEXT: s_cbranch_scc1 [[LOOP]]

 ; GCN: [[END]]:
Index: test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll
===================================================================
--- test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll
+++ test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll
@@ -13,55 +13,50 @@
 ; CHECK: ; %bb.0: ; %start
 ; CHECK-NEXT: v_readfirstlane_b32 s0, v0
 ; CHECK-NEXT: s_mov_b32 m0, s0
-; CHECK-NEXT: s_mov_b64 s[4:5], 0
+; CHECK-NEXT: s_mov_b32 s0, 0
 ; CHECK-NEXT: v_interp_p1_f32_e32 v0, v1, attr0.x
-; CHECK-NEXT: v_cmp_nlt_f32_e64 s[0:1], 0, v0
-; CHECK-NEXT: v_mov_b32_e32 v1, 0
-; CHECK-NEXT: ; implicit-def: $sgpr2_sgpr3
-; CHECK-NEXT: ; implicit-def: $sgpr6_sgpr7
+; CHECK-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0
+; CHECK-NEXT: s_mov_b64 s[2:3], 0
+; CHECK-NEXT: ; implicit-def: $sgpr4_sgpr5
 ; CHECK-NEXT: BB0_1: ; %loop
 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 32, v1
-; CHECK-NEXT: s_and_b64 vcc, exec, vcc
-; CHECK-NEXT: s_or_b64 s[6:7], s[6:7], exec
-; CHECK-NEXT: s_or_b64 s[2:3], s[2:3], exec
-; CHECK-NEXT: s_cbranch_vccz BB0_5
+; CHECK-NEXT: s_or_b64 s[4:5], s[4:5], exec
+; CHECK-NEXT: s_cmp_lt_u32 s0, 32
+; CHECK-NEXT: s_mov_b64 s[6:7], -1
+; CHECK-NEXT: s_cbranch_scc0 BB0_5
 ; CHECK-NEXT: ; %bb.2: ; %endif1
 ; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT: s_mov_b64 s[6:7], -1
-; CHECK-NEXT: s_and_saveexec_b64 s[8:9], s[0:1]
-; CHECK-NEXT: s_xor_b64 s[8:9], exec, s[8:9]
+; CHECK-NEXT: s_mov_b64 s[4:5], -1
+; CHECK-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
 ; CHECK-NEXT: ; mask branch BB0_4
 ; CHECK-NEXT: BB0_3: ; %endif2
 ; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT: v_add_u32_e32 v1, 1, v1
-; CHECK-NEXT: s_xor_b64 s[6:7], exec, -1
+; CHECK-NEXT: s_add_i32 s0, s0, 1
+; CHECK-NEXT: s_xor_b64 s[4:5], exec, -1
 ; CHECK-NEXT: BB0_4: ; %Flow1
 ; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT: s_or_b64 exec, exec, s[8:9]
-; CHECK-NEXT: s_andn2_b64 s[2:3], s[2:3], exec
-; CHECK-NEXT: s_branch BB0_6
-; CHECK-NEXT: BB0_5: ; in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT: ; implicit-def: $vgpr1
-; CHECK-NEXT: BB0_6: ; %Flow
+; CHECK-NEXT: s_or_b64 exec, exec, s[6:7]
+; CHECK-NEXT: s_mov_b64 s[6:7], 0
+; CHECK-NEXT: BB0_5: ; %Flow
 ; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT: s_and_b64 s[8:9], exec, s[6:7]
-; CHECK-NEXT: s_or_b64 s[8:9], s[8:9], s[4:5]
-; CHECK-NEXT: s_mov_b64 s[4:5], s[8:9]
+; CHECK-NEXT: s_and_b64 s[8:9], exec, s[4:5]
+; CHECK-NEXT: s_or_b64 s[8:9], s[8:9], s[2:3]
+; CHECK-NEXT: s_mov_b64 s[2:3], s[8:9]
 ; CHECK-NEXT: s_andn2_b64 exec, exec, s[8:9]
 ; CHECK-NEXT: s_cbranch_execnz BB0_1
-; CHECK-NEXT: ; %bb.7: ; %Flow2
+; CHECK-NEXT: ; %bb.6: ; %Flow2
 ; CHECK-NEXT: s_or_b64 exec, exec, s[8:9]
 ; CHECK-NEXT: v_mov_b32_e32 v1, 0
-; this is the divergent branch with the condition not marked as divergent
-; CHECK-NEXT: s_and_saveexec_b64 s[0:1], s[2:3]
-; CHECK-NEXT: ; mask branch BB0_9
-; CHECK-NEXT: BB0_8: ; %if1
+; CHECK-NEXT: s_and_saveexec_b64 s[0:1], s[6:7]
+; CHECK-NEXT: ; mask branch BB0_8
+; CHECK-NEXT: BB0_7: ; %if1
 ; CHECK-NEXT: v_sqrt_f32_e32 v1, v0
-; CHECK-NEXT: BB0_9: ; %endloop
+; CHECK-NEXT: BB0_8: ; %endloop
 ; CHECK-NEXT: s_or_b64 exec, exec, s[0:1]
 ; CHECK-NEXT: exp mrt0 v1, v1, v1, v1 done vm
 ; CHECK-NEXT: s_endpgm
+; this is the divergent branch with the condition not marked as divergent
 start:
   %v0 = call float @llvm.amdgcn.interp.p1(float %1, i32 0, i32 0, i32 %0)
   br label %loop
Index: test/CodeGen/AMDGPU/fabs.ll
===================================================================
--- test/CodeGen/AMDGPU/fabs.ll
+++ test/CodeGen/AMDGPU/fabs.ll
@@ -48,8 +48,8 @@
 ; R600: |{{(PV|T[0-9])\.[XYZW]}}|
 ; R600: |{{(PV|T[0-9])\.[XYZW]}}|
-; GCN: v_and_b32
-; GCN: v_and_b32
+; GCN: s_and_b32
+; GCN: s_and_b32
 define amdgpu_kernel void @fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) {
   %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in)
   store <2 x float> %fabs, <2 x float> addrspace(1)* %out
@@ -62,10 +62,10 @@
 ; R600: |{{(PV|T[0-9])\.[XYZW]}}|
 ; R600: |{{(PV|T[0-9])\.[XYZW]}}|
-; GCN: v_and_b32
-; GCN: v_and_b32
-; GCN: v_and_b32
-; GCN: v_and_b32
+; GCN: s_and_b32
+; GCN: s_and_b32
+; GCN: s_and_b32
+; GCN: s_and_b32
 define amdgpu_kernel void @fabs_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) {
   %fabs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %in)
   store <4 x float> %fabs, <4 x float> addrspace(1)* %out
Index: test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll
===================================================================
--- test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll
+++ test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll
@@ -85,15 +85,15 @@
 ; GCN-LABEL: {{^}}div_v4_1_by_x_25ulp:
 ; GCN-DAG: s_load_dwordx4 s{{\[}}[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
-; GCN-DENORM-DAG: s_mov_b32 [[L:s[0-9]+]], 0x6f800000
+; GCN-DENORM-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000
 ; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
-; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]]
+; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
 ; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
-; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]]
+; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
 ; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
-; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]]
+; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
 ; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
-; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]]
+; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
 ; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
 ; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
@@ -121,15 +121,15 @@
 }

 ; GCN-LABEL: {{^}}div_v4_minus_1_by_x_25ulp:
-; GCN-DENORM-DAG: s_mov_b32 [[L:s[0-9]+]], 0x6f800000
+; GCN-DENORM-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000
 ; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
-; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]]
+; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
 ; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
-; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]]
+; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
 ; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
-; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]]
+; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
 ; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
-; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]]
+; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
 ; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
 ; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, -v{{[0-9]+}}
 ; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, -v{{[0-9]+}}
@@ -156,15 +156,15 @@
 }

 ; GCN-LABEL: {{^}}div_v4_1_by_minus_x_25ulp:
-; GCN-DENORM-DAG: s_mov_b32 [[L:s[0-9]+]], 0x6f800000
+; GCN-DENORM-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000
 ; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
-; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]]
+; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
 ; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
-; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]]
+; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
 ; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
-; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]]
+; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
 ; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
-; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]]
+; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
 ; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
 ; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}}, v{{[0-9]+}}
 ; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}}, v{{[0-9]+}}
@@ -194,15 +194,15 @@
 ; GCN-LABEL: {{^}}div_v4_minus_1_by_minus_x_25ulp:
 ; GCN-DAG: s_load_dwordx4 s{{\[}}[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
-; GCN-DENORM-DAG: s_mov_b32 [[L:s[0-9]+]], 0x6f800000
+; GCN-DENORM-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000
 ; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
-; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]]
+; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
 ; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
-; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]]
+; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
 ; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
-; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]]
+; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
 ; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
-; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]]
+; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
 ; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
 ; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
@@ -231,8 +231,6 @@
 }

 ; GCN-LABEL: {{^}}div_v4_c_by_x_25ulp:
-; GCN-DAG: s_mov_b32 [[L:s[0-9]+]], 0x6f800000
-; GCN-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
 ; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, 2.0{{$}}
 ; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, 2.0{{$}}
 ; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, -2.0{{$}}

 ; GCN-DENORM-DAG: v_rcp_f32_e32
 ; GCN-DENORM-DAG: v_rcp_f32_e32
@@ -240,9 +238,12 @@
-; GCN-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]]
+; GCN-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000
+; GCN-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
+
+; GCN-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
-; GCN-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]]
+; GCN-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc

 ; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
@@ -273,8 +274,6 @@
 }

 ; GCN-LABEL: {{^}}div_v4_c_by_minus_x_25ulp:
-; GCN-DAG: s_mov_b32 [[L:s[0-9]+]], 0x6f800000
-; GCN-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
 ; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, -2.0{{$}}
 ; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, -2.0{{$}}
 ; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, -2.0{{$}}

 ; GCN-DENORM-DAG: v_rcp_f32_e32
 ; GCN-DENORM-DAG: v_rcp_f32_e32
@@ -282,9 +281,12 @@
-; GCN-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]]
+; GCN-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000
+; GCN-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
+
+; GCN-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
-; GCN-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]]
+; GCN-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc

 ; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
Index: test/CodeGen/AMDGPU/fmin_legacy.ll
===================================================================
--- test/CodeGen/AMDGPU/fmin_legacy.ll
+++ test/CodeGen/AMDGPU/fmin_legacy.ll
@@ -33,9 +33,13 @@
 ; FUNC-LABEL: {{^}}s_test_fmin_legacy_ule_f32:
 ; GCN-DAG: s_load_dwordx2 s{{\[}}[[A:[0-9]+]]:[[B:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
-; GCN-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], s[[B]]
+; SI-SAFE: v_mov_b32_e32 [[VA:v[0-9]+]], s[[A]]

-; SI-SAFE: v_min_legacy_f32_e64 {{v[0-9]+}}, [[VB]], s[[A]]
+; GCN-NONAN: v_mov_b32_e32 [[VB:v[0-9]+]], s[[B]]
+
+; VI-SAFE: v_mov_b32_e32 [[VB:v[0-9]+]], s[[B]]
+
+; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, s[[B]], [[VA]]

 ; VI-SAFE: v_mov_b32_e32 [[VA:v[0-9]+]], s[[A]]
 ; VI-SAFE: v_cmp_ngt_f32_e32 vcc, s[[A]], [[VB]]
Index: test/CodeGen/AMDGPU/fneg-fabs.ll
===================================================================
--- test/CodeGen/AMDGPU/fneg-fabs.ll
+++ test/CodeGen/AMDGPU/fneg-fabs.ll
@@ -4,7 +4,7 @@
 ; FUNC-LABEL: {{^}}fneg_fabs_fadd_f32:
 ; SI-NOT: and
-; SI: v_sub_f32_e64 {{v[0-9]+}}, {{v[0-9]+}}, |{{s[0-9]+}}|
+; SI: v_sub_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, |{{v[0-9]+}}|
 define amdgpu_kernel void @fneg_fabs_fadd_f32(float addrspace(1)* %out, float %x, float %y) {
   %fabs = call float @llvm.fabs.f32(float %x)
   %fsub = fsub float -0.000000e+00, %fabs
@@ -15,7 +15,7 @@
 ; FUNC-LABEL: {{^}}fneg_fabs_fmul_f32:
 ; SI-NOT: and
-; SI: v_mul_f32_e64 {{v[0-9]+}}, {{v[0-9]+}}, -|{{s[0-9]+}}|
+; SI: v_mul_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, -|{{v[0-9]+}}|
 ; SI-NOT: and
 define amdgpu_kernel void @fneg_fabs_fmul_f32(float addrspace(1)* %out, float %x, float %y) {
   %fabs = call float @llvm.fabs.f32(float %x)
@@ -85,8 +85,8 @@
 ; FIXME: In this case two uses of the constant should be folded
 ; SI: s_brev_b32 [[SIGNBITK:s[0-9]+]], 1{{$}}
-; SI: v_or_b32_e32 v{{[0-9]+}}, [[SIGNBITK]], v{{[0-9]+}}
-; SI: v_or_b32_e32 v{{[0-9]+}}, [[SIGNBITK]], v{{[0-9]+}}
+; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[SIGNBITK]]
+; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[SIGNBITK]]
 define amdgpu_kernel void @fneg_fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) {
   %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in)
   %fsub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %fabs
@@ -96,10 +96,10 @@
 ; FUNC-LABEL: {{^}}fneg_fabs_v4f32:
 ; SI: s_brev_b32 [[SIGNBITK:s[0-9]+]], 1{{$}}
-; SI: v_or_b32_e32 v{{[0-9]+}}, [[SIGNBITK]], v{{[0-9]+}}
-; SI: v_or_b32_e32 v{{[0-9]+}}, [[SIGNBITK]], v{{[0-9]+}}
-; SI: v_or_b32_e32 v{{[0-9]+}}, [[SIGNBITK]], v{{[0-9]+}}
-; SI: v_or_b32_e32 v{{[0-9]+}}, [[SIGNBITK]], v{{[0-9]+}}
+; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[SIGNBITK]]
+; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[SIGNBITK]]
+; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[SIGNBITK]]
+; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[SIGNBITK]]
 define amdgpu_kernel void @fneg_fabs_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) {
   %fabs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %in)
   %fsub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %fabs
Index: test/CodeGen/AMDGPU/fsub.ll
===================================================================
--- test/CodeGen/AMDGPU/fsub.ll
+++ test/CodeGen/AMDGPU/fsub.ll
@@ -27,8 +27,8 @@
 ; R600-DAG: ADD {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, -KC0[3].Z
 ; R600-DAG: ADD {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, -KC0[3].Y
-; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
-; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
+; SI: v_sub_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
+; SI: v_sub_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
 define amdgpu_kernel void @fsub_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
   %sub = fsub <2 x float> %a, %b
   store <2 x float> %sub, <2 x float> addrspace(1)* %out, align 8
@@ -55,10 +55,10 @@
 }

 ; FUNC-LABEL: {{^}}s_fsub_v4f32:
-; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
-; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
-; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
-; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
+; SI: v_sub_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
+; SI: v_sub_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
+; SI: v_sub_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
+; SI: v_sub_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
 ; SI: s_endpgm
 define amdgpu_kernel void @s_fsub_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) {
   %result = fsub <4 x float> %a, %b
Index: test/CodeGen/AMDGPU/i1-copy-from-loop.ll
===================================================================
--- test/CodeGen/AMDGPU/i1-copy-from-loop.ll
+++ test/CodeGen/AMDGPU/i1-copy-from-loop.ll
@@ -4,17 +4,11 @@
 ; SI-LABEL: {{^}}i1_copy_from_loop:
 ;
 ; SI: ; %for.body
-; SI: v_cmp_gt_u32_e64 [[CC_SREG:s\[[0-9]+:[0-9]+\]]], 4,
-; SI-DAG: s_andn2_b64 [[CC_ACCUM:s\[[0-9]+:[0-9]+\]]], [[CC_ACCUM]], exec
-; SI-DAG: s_and_b64 [[CC_MASK:s\[[0-9]+:[0-9]+\]]], [[CC_SREG]], exec
-; SI: s_or_b64 [[CC_ACCUM]], [[CC_ACCUM]], [[CC_MASK]]
-
-; SI: ; %Flow1
-; SI: s_or_b64 [[CC_ACCUM]], [[CC_ACCUM]], exec
+; SI: v_cmp_lt_u32_e64 [[CC_SREG:s\[[0-9]+:[0-9]+\]]], s{{[0-9+]}}, 4

 ; SI: ; %Flow
 ; SI-DAG: s_andn2_b64 [[LCSSA_ACCUM:s\[[0-9]+:[0-9]+\]]], [[LCSSA_ACCUM]], exec
-; SI-DAG: s_and_b64 [[CC_MASK2:s\[[0-9]+:[0-9]+\]]], [[CC_ACCUM]], exec
+; SI-DAG: s_and_b64 [[CC_MASK2:s\[[0-9]+:[0-9]+\]]], [[CC_SREG]], exec
 ; SI: s_or_b64 [[LCSSA_ACCUM]], [[LCSSA_ACCUM]], [[CC_MASK2]]

 ; SI: ; %for.end
Index: test/CodeGen/AMDGPU/i1-copy-phi-uniform-branch.ll
===================================================================
--- test/CodeGen/AMDGPU/i1-copy-phi-uniform-branch.ll
+++ test/CodeGen/AMDGPU/i1-copy-phi-uniform-branch.ll
@@ -7,7 +7,6 @@
 ; GCN: s_cbranch_scc1 [[PREEXIT:BB[0-9_]+]]

 ; GCN: ; %blocka
-; GCN: s_xor_b64 s[{{[0-9:]+}}], exec, -1
 ; GCN: s_cmp_eq_u32 s1, 0
 ; GCN: s_cbranch_scc1 [[EXIT:BB[0-9_]+]]
Index: test/CodeGen/AMDGPU/insert_vector_elt.ll
===================================================================
--- test/CodeGen/AMDGPU/insert_vector_elt.ll
+++ test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -11,12 +11,12 @@
 ; GCN-LABEL: {{^}}insertelement_v4f32_0:
 ; GCN: s_load_dwordx4
+; GCN-DAG: s_mov_b32 [[CONSTREG:s[0-9]+]], 0x40a00000
+; GCN-DAG: v_mov_b32_e32 v[[LOW_REG:[0-9]+]], [[CONSTREG]]
+
 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
-; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
-; GCN-DAG: s_mov_b32 [[CONSTREG:s[0-9]+]], 0x40a00000
-; GCN-DAG: v_mov_b32_e32 v[[LOW_REG:[0-9]+]], [[CONSTREG]]
 ; GCN: buffer_store_dwordx4 v{{\[}}[[LOW_REG]]:
 define amdgpu_kernel void @insertelement_v4f32_0(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
   %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 0
Index: test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll
@@ -387,7 +387,7 @@
 ; SI-LABEL: {{^}}test_div_scale_f32_undef_undef_val:
 ; SI-NOT: v0
-; SI: v_div_scale_f32 v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, v0, v0, v0
+; SI: v_div_scale_f32 v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s0, s0, v0
 define amdgpu_kernel void @test_div_scale_f32_undef_undef_val(float addrspace(1)* %out) #0 {
   %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float undef, float undef, i1 false)
   %result0 = extractvalue { float, i1 } %result, 0
Index: test/CodeGen/AMDGPU/llvm.amdgcn.fmed3.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.fmed3.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.fmed3.ll
@@ -53,8 +53,8 @@
 }

 ; GCN-LABEL: {{^}}test_fneg_fmed3_rr_0:
-; GCN: s_brev_b32 [[NEG0:s[0-9]+]], 1
-; GCN: v_med3_f32 v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}}, [[NEG0]]
+; GCN: v_bfrev_b32_e32 [[NEG0:v[0-9]+]], 1
+; GCN: v_med3_f32 v{{[0-9]+}}, -s{{[0-9]+}}, -v{{[0-9]+}}, [[NEG0]]
 define amdgpu_kernel void @test_fneg_fmed3_rr_0(float addrspace(1)* %out, float %src0, float %src1) #1 {
   %med3 = call float @llvm.amdgcn.fmed3.f32(float %src0, float %src1, float 0.0)
   %neg.med3 = fsub float -0.0, %med3
@@ -88,8 +88,8 @@
 ; GCN-LABEL: {{^}}test_fneg_fmed3_r_inv2pi_0_foldable_user:
 ; GCN-DAG: v_bfrev_b32_e32 [[NEG0:v[0-9]+]], 1
-; GCN-DAG: s_mov_b32 [[NEG_INV:s[0-9]+]], 0xbe22f983
-; GCN: v_med3_f32 [[MED3:v[0-9]+]], -v{{[0-9]+}}, [[NEG_INV]], [[NEG0]]
+; GCN-DAG: v_mov_b32_e32 [[NEG_INV:v[0-9]+]], 0xbe22f983
+; GCN: v_med3_f32 [[MED3:v[0-9]+]], -s{{[0-9]+}}, [[NEG_INV]], [[NEG0]]
 ; GCN: v_mul_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[MED3]]
 define amdgpu_kernel void @test_fneg_fmed3_r_inv2pi_0_foldable_user(float addrspace(1)* %out, float %src0, float %mul.arg) #1 {
   %med3 = call float @llvm.amdgcn.fmed3.f32(float %src0, float 0x3FC45F3060000000, float 0.0)
Index: test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll
@@ -42,6 +42,8 @@
 ; VI-OPT: s_mov_b32
 ; VI-OPT: s_mov_b32
 ; VI-NOOPT: s_waitcnt
+; VI-NOOPT-NEXT: v_mov_b32_e32
+; VI-NOOPT-NEXT: s_nop 0
 ; VI-NOOPT-NEXT: s_nop 0
 ; VI: v_mov_b32_dpp [[VGPR0:v[0-9]+]], v{{[0-9]+}} quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0
 ; VI-OPT: s_nop 1
Index: test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.pk.u16.u8.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.pk.u16.u8.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.pk.u16.u8.ll
@@ -4,7 +4,7 @@
 declare i64 @llvm.amdgcn.mqsad.pk.u16.u8(i64, i32, i64) #0

 ; GCN-LABEL: {{^}}v_mqsad_pk_u16_u8:
-; GCN: v_mqsad_pk_u16_u8 v[0:1], v[4:5], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}]
+; GCN: v_mqsad_pk_u16_u8 v[0:1], v[4:5], s{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
 ; GCN-DAG: v_mov_b32_e32 v5, v1
 ; GCN-DAG: v_mov_b32_e32 v4, v0
 define amdgpu_kernel void @v_mqsad_pk_u16_u8(i64 addrspace(1)* %out, i64 %src) {
Index: test/CodeGen/AMDGPU/llvm.amdgcn.qsad.pk.u16.u8.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.qsad.pk.u16.u8.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.qsad.pk.u16.u8.ll
@@ -4,7 +4,7 @@
 declare i64 @llvm.amdgcn.qsad.pk.u16.u8(i64, i32, i64) #0

 ; GCN-LABEL: {{^}}v_qsad_pk_u16_u8:
-; GCN: v_qsad_pk_u16_u8 v[0:1], v[4:5], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}]
+; GCN: v_qsad_pk_u16_u8 v[0:1], v[4:5], s{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
 ; GCN-DAG: v_mov_b32_e32 v5, v1
 ; GCN-DAG: v_mov_b32_e32 v4, v0
 define amdgpu_kernel void @v_qsad_pk_u16_u8(i64 addrspace(1)* %out, i64 %src) {
Index: test/CodeGen/AMDGPU/loop_break.ll
===================================================================
--- test/CodeGen/AMDGPU/loop_break.ll
+++ test/CodeGen/AMDGPU/loop_break.ll
@@ -26,10 +26,9 @@
 ; GCN: s_mov_b64 [[OUTER_MASK:s\[[0-9]+:[0-9]+\]]], 0{{$}}

 ; GCN: [[LOOP_ENTRY:BB[0-9]+_[0-9]+]]: ; %bb1
-; GCN: v_cmp_lt_i32_e32 vcc, -1
-; GCN: s_and_b64 vcc, exec, vcc
-; GCN: s_or_b64 [[INNER_MASK:s\[[0-9]+:[0-9]+\]]], [[INNER_MASK]], exec
-; GCN: s_cbranch_vccnz [[FLOW:BB[0-9]+_[0-9]+]]
+; GCN: s_or_b64 [[INNER_MASK:s\[[0-9]+:[0-9]+\]]], [[INNER_MASK]], exec
+; GCN: s_cmp_gt_i32 s6, -1
+; GCN: s_cbranch_scc1 [[FLOW:BB[0-9]+_[0-9]+]]

 ; GCN: ; %bb4
 ; GCN: buffer_load_dword
@@ -39,6 +38,7 @@
 ; GCN: s_or_b64 [[INNER_MASK]], [[INNER_MASK]], [[TMP0]]

 ; GCN: [[FLOW]]: ; %Flow
+; GCN: ; in Loop: Header=BB0_1 Depth=1
 ; GCN: s_and_b64 [[TMP1:s\[[0-9]+:[0-9]+\]]], exec, [[INNER_MASK]]
 ; GCN: s_or_b64 [[TMP1]], [[TMP1]], [[OUTER_MASK]]
 ; GCN: s_mov_b64 [[OUTER_MASK]], [[TMP1]]
Index: test/CodeGen/AMDGPU/madak.ll
===================================================================
--- test/CodeGen/AMDGPU/madak.ll
+++ test/CodeGen/AMDGPU/madak.ll
@@ -211,10 +211,10 @@
 ; because the implicit immediate already uses the constant bus.
 ; GCN-LABEL: {{^}}madak_constant_bus_violation:
 ; GCN: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0x12|0x48}}
-; GCN: v_mov_b32_e32 [[SGPR0_VCOPY:v[0-9]+]], [[SGPR0]]
 ; GCN: {{buffer|flat|global}}_load_dword [[VGPR:v[0-9]+]]
-; GCN: v_madak_f32 [[MADAK:v[0-9]+]], 0.5, [[SGPR0_VCOPY]], 0x42280000
-; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[MADAK]], [[VGPR]]
+; GCN: v_mov_b32_e32 [[MAC:v[0-9]+]], 0x42280000
+; GCN: v_mac_f32_e64 [[MAC]], [[SGPR0]], 0.5
+; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[MAC]], [[VGPR]]
 ; GFX6: buffer_store_dword [[MUL]]
 ; GFX8_9: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[MUL]]
 define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], float %sgpr0, float %sgpr1) #0 {
Index: test/CodeGen/AMDGPU/mubuf-legalize-operands.ll
===================================================================
--- test/CodeGen/AMDGPU/mubuf-legalize-operands.ll
+++ test/CodeGen/AMDGPU/mubuf-legalize-operands.ll
@@ -155,8 +155,9 @@
 ; CHECK-O0: buffer_store_dword [[RES]], off, s[0:3], s5 offset:[[RES_OFF_TMP:[0-9]+]] ; 4-byte Folded Spill
 ; CHECK-O0: s_xor_b64 exec, exec, [[CMP]]
 ; CHECK-O0-NEXT: s_cbranch_execnz [[LOOPBB0]]
-
-; CHECK-O0: s_mov_b64 exec, [[SAVEEXEC]]
+; CHECK-O0: v_readlane_b32 s[[S1:[0-9]+]], v{{[0-9]+}}, 4
+; CHECK-O0: v_readlane_b32 s[[S2:[0-9]+]], v{{[0-9]+}}, 5
+; CHECK-O0: s_mov_b64 exec, s{{\[}}[[S1]]:[[S2]]{{\]}}
 ; CHECK-O0: buffer_load_dword [[RES:v[0-9]+]], off, s[0:3], s5 offset:[[RES_OFF_TMP]] ; 4-byte Folded Reload
 ; CHECK-O0: buffer_store_dword [[RES]], off, s[0:3], s5 offset:[[RES_OFF:[0-9]+]] ; 4-byte Folded Spill
 ; CHECK-O0: s_cbranch_execz [[TERMBB:BB[0-9]+_[0-9]+]]
Index: test/CodeGen/AMDGPU/multilevel-break.ll
===================================================================
--- test/CodeGen/AMDGPU/multilevel-break.ll
+++ test/CodeGen/AMDGPU/multilevel-break.ll
@@ -96,7 +96,6 @@
 ; GCN: s_mov_b64 [[OLD_LEFT:s\[[0-9]+:[0-9]+\]]], [[LEFT]]

 ; GCN: ; %LeafBlock1
-; GCN: s_mov_b64
 ; GCN: s_mov_b64 [[BREAK:s\[[0-9]+:[0-9]+\]]], -1{{$}}

 ; GCN: ; %case1
@@ -109,8 +108,6 @@

 ; GCN: s_mov_b64 [[BREAK]], -1{{$}}

-; GCN: [[FLOW]]: ; %Flow
-
 ; GCN: ; %case0
 ; GCN: buffer_load_dword [[LOAD1:v[0-9]+]],
 ; GCN-DAG: s_andn2_b64 [[BREAK]], [[BREAK]], exec
@@ -118,7 +115,7 @@
 ; GCN-DAG: s_and_b64 [[TMP:s\[[0-9]+:[0-9]+\]]], vcc, exec
 ; GCN: s_or_b64 [[BREAK]], [[BREAK]], [[TMP]]

-; GCN: ; %Flow4
+; GCN: [[FLOW]]: ; %Flow4
 ; GCN: s_and_b64 [[BREAK]], exec, [[BREAK]]
 ; GCN: s_or_b64 [[LEFT]], [[BREAK]], [[OLD_LEFT]]
 ; GCN: s_andn2_b64 exec, exec, [[LEFT]]
Index: test/CodeGen/AMDGPU/select-opt.ll
===================================================================
--- test/CodeGen/AMDGPU/select-opt.ll
+++ test/CodeGen/AMDGPU/select-opt.ll
@@ -135,8 +135,8 @@
 ; GCN-LABEL: {{^}}regression:
 ; GCN: v_cmp_neq_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}, 1.0
-; GCN: v_cmp_neq_f32_e32 vcc, 0, v{{[0-9]+}}
-; GCN: v_cmp_eq_f32_e32 vcc, 0, v{{[0-9]+}}
+; GCN: v_cmp_neq_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}, 0
+; GCN: v_cmp_eq_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}, 0

 define amdgpu_kernel void @regression(float addrspace(1)* %out, float %c0, float %c1) #0 {
 entry:
Index: test/CodeGen/AMDGPU/sgpr-control-flow.ll
===================================================================
--- test/CodeGen/AMDGPU/sgpr-control-flow.ll
+++ test/CodeGen/AMDGPU/sgpr-control-flow.ll
@@ -104,7 +104,8 @@
 ; SI: ; %else
 ; SI: buffer_load_dword [[AVAL:v[0-9]+]]
-; SI: v_cmp_gt_i32_e64 [[PHI:s\[[0-9]+:[0-9]+\]]], 0, [[AVAL]]
+; SI: v_cmp_gt_i32_e32 vcc, 0, [[AVAL]]
+; SI: s_and_b64 [[PHI:s\[[0-9]+:[0-9]+\]]], vcc, exec

 ; SI: ; %if
 ; SI: buffer_load_dword [[AVAL:v[0-9]+]]
Index: test/CodeGen/AMDGPU/si-fix-sgpr-copies.mir
===================================================================
--- test/CodeGen/AMDGPU/si-fix-sgpr-copies.mir
+++ test/CodeGen/AMDGPU/si-fix-sgpr-copies.mir
@@ -16,7 +16,7 @@
 body: |
   ; GCN-LABEL: name: phi_visit_order
-  ; GCN: V_ADD_I32
+  ; GCN: S_ADD_I32
   bb.0:
     liveins: $vgpr0
     %7 = COPY $vgpr0
Index: test/CodeGen/AMDGPU/smrd.ll
===================================================================
--- test/CodeGen/AMDGPU/smrd.ll
+++ test/CodeGen/AMDGPU/smrd.ll
@@ -571,7 +571,6 @@
 ;
 ; TODO: we should keep the loop counter in an SGPR
 ;
-; GCN: v_readfirstlane_b32
 ; GCN: s_buffer_load_dword
 define amdgpu_ps float @smrd_uniform_loop(<4 x i32> inreg %desc, i32 %bound) #0 {
 main_body:
Index: test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll
===================================================================
--- test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll
+++ test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll
@@ -1,28 +1,43 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -march=amdgcn -mcpu=tahiti -o - %s | FileCheck %s

 ; Don't crash when the use of an undefined value is only detected by the
 ; register coalescer because it is hidden with subregister insert/extract.
 target triple="amdgcn--"

-; CHECK-LABEL: foobar:
-; CHECK: s_load_dwordx2 s[4:5], s[0:1], 0x9
-; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
-; CHECK-NEXT: v_mbcnt_lo_u32_b32_e64
-; CHECK-NEXT: s_mov_b32 s2, -1
-; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v1, s5
-; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc

-; CHECK: BB0_1:
-; CHECK-NEXT: ; kill: def $vgpr0_vgpr1 killed $sgpr4_sgpr5 killed $exec
-; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3

-; CHECK: BB0_2:
-; CHECK: s_or_b64 exec, exec, s[4:5]
-; CHECK-NEXT: s_mov_b32 s3, 0xf000
-; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], 0
-; CHECK-NEXT: s_endpgm
 define amdgpu_kernel void @foobar(float %a0, float %a1, float addrspace(1)* %out) nounwind {
+; CHECK-LABEL: foobar:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
+; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
+; CHECK-NEXT: v_mbcnt_lo_u32_b32_e64 v0, -1, 0
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; CHECK-NEXT: s_mov_b32 s2, -1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)

+; FIXME: This change is related to the fact that the DetectDeadLanes pass hits
+; a "copy across incompatible class" (SGPR -> VGPR) in its analysis and hence
+; cannot derive the fact that the vector element is unused.
+
+; CHECK-NEXT: v_mov_b32_e32 v0, s4
+; CHECK-NEXT: v_mov_b32_e32 v1, s5
+; CHECK-NEXT: v_mov_b32_e32 v2, s6
+; CHECK-NEXT: v_mov_b32_e32 v3, s7
+
+; CHECK-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; CHECK-NEXT: ; mask branch BB0_2
+; CHECK-NEXT: BB0_1: ; %ift
+; CHECK-NEXT: s_mov_b32 s4, s5
+; CHECK-NEXT: v_mov_b32_e32 v0, s4
+; CHECK-NEXT: v_mov_b32_e32 v1, s5
+; CHECK-NEXT: v_mov_b32_e32 v2, s6
+; CHECK-NEXT: v_mov_b32_e32 v3, s7
+; CHECK-NEXT: BB0_2: ; %ife
+; CHECK-NEXT: s_or_b64 exec, exec, s[6:7]
+; CHECK-NEXT: s_mov_b32 s3, 0xf000
+; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], 0
+; CHECK-NEXT: s_endpgm
 entry:
   %v0 = insertelement <4 x float> undef, float %a0, i32 0
   %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
Index: test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll
===================================================================
--- test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll
+++ test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll
@@ -7,10 +7,9 @@
 ; CHECK: s_and_saveexec_b64
 ; CHECK-NEXT: ; mask branch
 ; CHECK-NEXT: s_cbranch_execz BB{{[0-9]+_[0-9]+}}
-; CHECK-NEXT: BB{{[0-9]+_[0-9]+}}: ; %loop_body.preheader
-; CHECK: [[LOOP_BODY_LABEL:BB[0-9]+_[0-9]+]]:
-; CHECK: s_cbranch_vccz [[LOOP_BODY_LABEL]]
+; CHECK: [[LOOP_BODY_LABEL:BB[0-9]+_[0-9]+]]: ; %loop_body
+; CHECK: s_cbranch_scc0 [[LOOP_BODY_LABEL]]
 ; CHECK: s_endpgm
 define amdgpu_ps void @test1(<8 x i32> inreg %rsrc, <2 x i32> %addr.base, i32 %y, i32 %p) {
Index: test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll
===================================================================
--- test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll
+++ test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll
@@ -226,13 +226,12 @@
 ; GCN-LABEL: {{^}}test_s0_s1_k_f32:
 ; SI-DAG: s_load_dwordx2 s{{\[}}[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
 ; VI-DAG: s_load_dwordx2 s{{\[}}[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c

-; GCN-DAG: s_mov_b32 [[SK0:s[0-9]+]], 0x44800000
+; GCN-DAG: v_mov_b32_e32 [[VK0:v[0-9]+]], 0x44800000
 ; GCN-DAG: v_mov_b32_e32 [[VS1:v[0-9]+]], s[[SGPR1]]
-; GCN-DAG: v_mov_b32_e32 [[VS0:v[0-9]+]], s[[SGPR0]]
-; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[VS0]], [[VS1]], [[SK0]]
-; GCN-DAG: s_mov_b32 [[SK1:s[0-9]+]], 0x45800000
-; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[VS0]], [[VS1]], [[SK1]]
+; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], s[[SGPR0]], [[VS1]], [[VK0]]
+; GCN-DAG: v_mov_b32_e32 [[VK1:v[0-9]+]], 0x45800000
+; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], s[[SGPR0]], [[VS1]], [[VK1]]

 ; GCN: buffer_store_dword [[RESULT0]]
 ; GCN: buffer_store_dword [[RESULT1]]
Index: test/CodeGen/AMDGPU/valu-i1.ll
===================================================================
--- test/CodeGen/AMDGPU/valu-i1.ll
+++ test/CodeGen/AMDGPU/valu-i1.ll
@@ -162,8 +162,8 @@
 ; SI: [[LABEL_LOOP:BB[0-9]+_[0-9]+]]:
 ; SI: buffer_load_dword
 ; SI-DAG: buffer_store_dword
-; SI-DAG: v_cmp_eq_u32_e32 vcc, 0x100
-; SI: s_cbranch_vccz [[LABEL_LOOP]]
+; SI-DAG: s_cmpk_eq_i32 s{{[0-9]+}}, 0x100
+; SI: s_cbranch_scc0 [[LABEL_LOOP]]
 ; SI: [[LABEL_EXIT]]:
 ; SI: s_endpgm
@@ -211,7 +211,7 @@
 ; SI-DAG: v_cmp_ne_u32_e32 [[NEG1_CHECK_1:vcc]], -1, [[B]]
 ; SI: s_and_b64 [[ORNEG1:s\[[0-9]+:[0-9]+\]]], [[NEG1_CHECK_1]], [[NEG1_CHECK_0]]
 ; SI: s_and_saveexec_b64 [[ORNEG2:s\[[0-9]+:[0-9]+\]]], [[ORNEG1]]
-; SI: s_cbranch_execz [[LABEL_FLOW:BB[0-9]+_[0-9]+]]
+; SI: ; mask branch [[LABEL_FLOW:BB[0-9]+_[0-9]+]]

 ; SI: BB{{[0-9]+_[0-9]+}}: ; %bb20
 ; SI: buffer_store_dword
Index: test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll
===================================================================
--- test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll
+++ test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll
@@ -22,25 +22,25 @@
 ; MESA-NOT: s_mov_b32 s3
 ; HSA-NOT: s_mov_b32 s7

-; GCNMESA-DAG: s_mov_b32 s16, SCRATCH_RSRC_DWORD0
-; GCNMESA-DAG: s_mov_b32 s17, SCRATCH_RSRC_DWORD1
-; GCNMESA-DAG: s_mov_b32 s18, -1
-; SIMESA-DAG: s_mov_b32 s19, 0xe8f000
-; VIMESA-DAG: s_mov_b32 s19, 0xe80000
-; GFX9MESA-DAG: s_mov_b32 s19, 0xe00000
+; GCNMESA-DAG: s_mov_b32 s20, SCRATCH_RSRC_DWORD0
+; GCNMESA-DAG: s_mov_b32 s21, SCRATCH_RSRC_DWORD1
+; GCNMESA-DAG: s_mov_b32 s22, -1
+; SIMESA-DAG: s_mov_b32 s23, 0xe8f000
+; VIMESA-DAG: s_mov_b32 s23, 0xe80000
+; GFX9MESA-DAG: s_mov_b32 s23, 0xe00000

 ; GCNMESAMESA: buffer_store_dword {{v[0-9]+}}, off, s[16:19], s3 offset:{{[0-9]+}} ; 4-byte Folded Spill

-; GCNMESA: buffer_store_dword {{v[0-9]}}, off, s[16:19], s3 offset:{{[0-9]+}}
-; GCNMESA: buffer_store_dword {{v[0-9]}}, off, s[16:19], s3 offset:{{[0-9]+}}
-; GCNMESA: buffer_store_dword {{v[0-9]}}, off, s[16:19], s3 offset:{{[0-9]+}}
-; GCNMESA: buffer_store_dword {{v[0-9]}}, off, s[16:19], s3 offset:{{[0-9]+}}
+; GCNMESA: buffer_store_dword {{v[0-9]}}, off, s[20:23], s3 offset:{{[0-9]+}}
+; GCNMESA: buffer_store_dword {{v[0-9]}}, off, s[20:23], s3 offset:{{[0-9]+}}
+; GCNMESA: buffer_store_dword {{v[0-9]}}, off, s[20:23], s3 offset:{{[0-9]+}}
+; GCNMESA: buffer_store_dword {{v[0-9]}}, off, s[20:23], s3 offset:{{[0-9]+}}

-; GCNMESA: buffer_load_dword {{v[0-9]+}}, off, s[16:19], s3 offset:{{[0-9]+}}
-; GCNMESA: buffer_load_dword {{v[0-9]+}}, off, s[16:19], s3 offset:{{[0-9]+}}
-; GCNMESA: buffer_load_dword {{v[0-9]+}}, off, s[16:19], s3 offset:{{[0-9]+}}
-; GCNMESA: buffer_load_dword {{v[0-9]+}}, off, s[16:19], s3 offset:{{[0-9]+}}
+; GCNMESA: buffer_load_dword {{v[0-9]+}}, off, s[20:23], s3 offset:{{[0-9]+}}
+; GCNMESA: buffer_load_dword {{v[0-9]+}}, off, s[20:23], s3 offset:{{[0-9]+}}
+; GCNMESA: buffer_load_dword {{v[0-9]+}}, off, s[20:23], s3 offset:{{[0-9]+}}
+; GCNMESA: buffer_load_dword {{v[0-9]+}}, off, s[20:23], s3 offset:{{[0-9]+}}
@@ -69,9 +69,11 @@
   %tmp9 = extractelement <4 x float> %arg6, i32 2
   %tmp10 = extractelement <4 x float> %arg6, i32 3
   %tmp11 = bitcast float %arg5 to i32
+  %cat = fadd float %arg3, %tmp10
   br label %bb12

 bb12: ; preds = %bb145, %bb
+  %catphi = phi float [ %cat, %bb ], [ %catsome, %bb145 ]
   %tmp13 = phi float [ 0.000000e+00, %bb ], [ %tmp338, %bb145 ]
   %tmp14 = phi float [ 0.000000e+00, %bb ], [ %tmp337, %bb145 ]
   %tmp15 = phi float [ 0.000000e+00, %bb ], [ %tmp336, %bb145 ]
@@ -339,6 +341,7 @@
   store volatile float %tmp8, float addrspace(1)* %arg
   store volatile float %tmp9, float addrspace(1)* %arg
   store volatile float %tmp10, float addrspace(1)* %arg
+  store volatile float %catphi, float addrspace(1)* %arg
   ret void

 bb145: ; preds = %bb12
@@ -604,6 +607,7 @@
   %tmp405 = extractelement <128 x float> %tmp278, i32 126
   %tmp406 = extractelement <128 x float> %tmp278, i32 127
   %tmp407 = bitcast float %tmp95 to i32
+  %catsome = extractelement <128 x float> %tmp278, i32 %tmp407
   %tmp408 = add i32 %tmp407, 1
   %tmp409 = bitcast i32 %tmp408 to float
   br label %bb12