Index: llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -113,10 +113,16 @@ public: static char ID; + MachineRegisterInfo *MRI; + const SIRegisterInfo *TRI; + const SIInstrInfo *TII; + SIFixSGPRCopies() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override; + void processPHINode(MachineInstr &MI); + StringRef getPassName() const override { return "SI Fix SGPR copies"; } void getAnalysisUsage(AnalysisUsage &AU) const override { @@ -313,52 +319,6 @@ return true; } -static bool phiHasVGPROperands(const MachineInstr &PHI, - const MachineRegisterInfo &MRI, - const SIRegisterInfo *TRI, - const SIInstrInfo *TII) { - for (unsigned i = 1; i < PHI.getNumOperands(); i += 2) { - Register Reg = PHI.getOperand(i).getReg(); - if (TRI->hasVGPRs(MRI.getRegClass(Reg))) - return true; - } - return false; -} - -static bool phiHasBreakDef(const MachineInstr &PHI, - const MachineRegisterInfo &MRI, - SmallSet &Visited) { - for (unsigned i = 1; i < PHI.getNumOperands(); i += 2) { - Register Reg = PHI.getOperand(i).getReg(); - if (Visited.count(Reg)) - continue; - - Visited.insert(Reg); - - MachineInstr *DefInstr = MRI.getVRegDef(Reg); - switch (DefInstr->getOpcode()) { - default: - break; - case AMDGPU::SI_IF_BREAK: - return true; - case AMDGPU::PHI: - if (phiHasBreakDef(*DefInstr, MRI, Visited)) - return true; - } - } - return false; -} - -static bool hasTerminatorThatModifiesExec(const MachineBasicBlock &MBB, - const TargetRegisterInfo &TRI) { - for (MachineBasicBlock::const_iterator I = MBB.getFirstTerminator(), - E = MBB.end(); I != E; ++I) { - if (I->modifiesRegister(AMDGPU::EXEC, &TRI)) - return true; - } - return false; -} - static bool isSafeToFoldImmIntoCopy(const MachineInstr *Copy, const MachineInstr *MoveImm, const SIInstrInfo *TII, @@ -420,12 +380,6 @@ return false; } -static bool predsHasDivergentTerminator(MachineBasicBlock *MBB, - const TargetRegisterInfo *TRI) { - return searchPredecessors(MBB, nullptr, [TRI](MachineBasicBlock *MBB) { - return hasTerminatorThatModifiesExec(*MBB, *TRI); }); -} - // Checks if there is potential path From instruction To instruction. // If CutOff is specified and it sits in between of that path we ignore // a higher portion of the path and report it is not reachable. @@ -633,9 +587,9 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { const GCNSubtarget &ST = MF.getSubtarget(); - MachineRegisterInfo &MRI = MF.getRegInfo(); - const SIRegisterInfo *TRI = ST.getRegisterInfo(); - const SIInstrInfo *TII = ST.getInstrInfo(); + MRI = &MF.getRegInfo(); + TRI = ST.getRegisterInfo(); + TII = ST.getInstrInfo(); MDT = &getAnalysis(); SmallVector Worklist; @@ -657,7 +611,7 @@ Register DstReg = MI.getOperand(0).getReg(); const TargetRegisterClass *SrcRC, *DstRC; - std::tie(SrcRC, DstRC) = getCopyRegClasses(MI, *TRI, MRI); + std::tie(SrcRC, DstRC) = getCopyRegClasses(MI, *TRI, *MRI); if (!Register::isVirtualRegister(DstReg)) { // If the destination register is a physical register there isn't @@ -666,7 +620,7 @@ // the first lane. Insert a readfirstlane and hope for the best. if (DstReg == AMDGPU::M0 && TRI->hasVectorRegisters(SrcRC)) { Register TmpReg - = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(AMDGPU::V_READFIRSTLANE_B32), TmpReg) @@ -684,7 +638,7 @@ break; } - MachineInstr *DefMI = MRI.getVRegDef(SrcReg); + MachineInstr *DefMI = MRI->getVRegDef(SrcReg); unsigned SMovOp; int64_t Imm; // If we are just copying an immediate, we can replace the copy with @@ -703,70 +657,13 @@ break; } case AMDGPU::PHI: { - Register Reg = MI.getOperand(0).getReg(); - if (!TRI->isSGPRClass(MRI.getRegClass(Reg))) - break; - - // We don't need to fix the PHI if the common dominator of the - // two incoming blocks terminates with a uniform branch. - bool HasVGPROperand = phiHasVGPROperands(MI, MRI, TRI, TII); - if (MI.getNumExplicitOperands() == 5 && !HasVGPROperand) { - MachineBasicBlock *MBB0 = MI.getOperand(2).getMBB(); - MachineBasicBlock *MBB1 = MI.getOperand(4).getMBB(); - - if (!predsHasDivergentTerminator(MBB0, TRI) && - !predsHasDivergentTerminator(MBB1, TRI)) { - LLVM_DEBUG(dbgs() - << "Not fixing PHI for uniform branch: " << MI << '\n'); - break; - } - } - - // If a PHI node defines an SGPR and any of its operands are VGPRs, - // then we need to move it to the VALU. - // - // Also, if a PHI node defines an SGPR and has all SGPR operands - // we must move it to the VALU, because the SGPR operands will - // all end up being assigned the same register, which means - // there is a potential for a conflict if different threads take - // different control flow paths. - // - // For Example: - // - // sgpr0 = def; - // ... - // sgpr1 = def; - // ... - // sgpr2 = PHI sgpr0, sgpr1 - // use sgpr2; - // - // Will Become: - // - // sgpr2 = def; - // ... - // sgpr2 = def; - // ... - // use sgpr2 - // - // The one exception to this rule is when one of the operands - // is defined by a SI_BREAK, SI_IF_BREAK, or SI_ELSE_BREAK - // instruction. In this case, there we know the program will - // never enter the second block (the loop) without entering - // the first block (where the condition is computed), so there - // is no chance for values to be over-written. - - SmallSet Visited; - if (HasVGPROperand || !phiHasBreakDef(MI, MRI, Visited)) { - LLVM_DEBUG(dbgs() << "Fixing PHI: " << MI); - TII->moveToVALU(MI, MDT); - } - + processPHINode(MI); break; } case AMDGPU::REG_SEQUENCE: if (TRI->hasVectorRegisters(TII->getOpRegClass(MI, 0)) || !hasVectorOperands(MI, TRI)) { - foldVGPRCopyIntoRegSequence(MI, TRI, TII, MRI); + foldVGPRCopyIntoRegSequence(MI, TRI, TII, *MRI); continue; } @@ -776,9 +673,9 @@ break; case AMDGPU::INSERT_SUBREG: { const TargetRegisterClass *DstRC, *Src0RC, *Src1RC; - DstRC = MRI.getRegClass(MI.getOperand(0).getReg()); - Src0RC = MRI.getRegClass(MI.getOperand(1).getReg()); - Src1RC = MRI.getRegClass(MI.getOperand(2).getReg()); + DstRC = MRI->getRegClass(MI.getOperand(0).getReg()); + Src0RC = MRI->getRegClass(MI.getOperand(1).getReg()); + Src1RC = MRI->getRegClass(MI.getOperand(2).getReg()); if (TRI->isSGPRClass(DstRC) && (TRI->hasVectorRegisters(Src0RC) || TRI->hasVectorRegisters(Src1RC))) { @@ -792,7 +689,78 @@ } if (MF.getTarget().getOptLevel() > CodeGenOpt::None && EnableM0Merge) - hoistAndMergeSGPRInits(AMDGPU::M0, MRI, TRI, *MDT, TII); + hoistAndMergeSGPRInits(AMDGPU::M0, *MRI, TRI, *MDT, TII); return true; } + +void SIFixSGPRCopies::processPHINode(MachineInstr &MI) { + unsigned numVGPRUses = 0; + SetVector worklist; + worklist.insert(&MI); + while (!worklist.empty()) { + const MachineInstr *Instr = worklist.pop_back_val(); + unsigned Reg = Instr->getOperand(0).getReg(); + for (const auto &Use : MRI->use_operands(Reg)) { + const MachineInstr *UseMI = Use.getParent(); + if (UseMI->isCopy() || UseMI->isRegSequence()) { + if (UseMI->isCopy() && + UseMI->getOperand(0).getReg().isPhysical() && + !TRI->isSGPRReg(*MRI, UseMI->getOperand(0).getReg())) { + numVGPRUses++; + } + worklist.insert(UseMI); + continue; + } + + if (UseMI->isPHI()) { + const TargetRegisterClass *UseRC = MRI->getRegClass(Use.getReg()); + if (!TRI->isSGPRReg(*MRI, Use.getReg()) && + UseRC != &AMDGPU::VReg_1RegClass) + numVGPRUses++; + continue; + } + + const TargetRegisterClass *OpRC = + TII->getOpRegClass(*UseMI, UseMI->getOperandNo(&Use)); + if (!TRI->isSGPRClass(OpRC) && OpRC != &AMDGPU::VS_32RegClass && + OpRC != &AMDGPU::VS_64RegClass) { + numVGPRUses++; + } + } + } + bool hasVGPRInput = false; + for (unsigned i = 1; i < MI.getNumOperands(); i += 2) { + unsigned InputReg = MI.getOperand(i).getReg(); + MachineInstr *Def = MRI->getVRegDef(InputReg); + if (TRI->isVGPR(*MRI, InputReg)) { + if (Def->isCopy()) { + unsigned SrcReg = Def->getOperand(1).getReg(); + const TargetRegisterClass *RC = + TRI->getRegClassForReg(*MRI, SrcReg); + if (TRI->isSGPRClass(RC)) + continue; + } + hasVGPRInput = true; + break; + } + else if (Def->isCopy() && + TRI->isVGPR(*MRI, Def->getOperand(1).getReg())) { + hasVGPRInput = true; + break; + } + } + unsigned PHIRes = MI.getOperand(0).getReg(); + const TargetRegisterClass *RC0 = MRI->getRegClass(PHIRes); + + if ((!TRI->isVGPR(*MRI, PHIRes) && RC0 != &AMDGPU::VReg_1RegClass) && + (hasVGPRInput || numVGPRUses > 1)) { + LLVM_DEBUG(dbgs() << "Fixing PHI: " << MI); + TII->moveToVALU(MI); + } + else { + LLVM_DEBUG(dbgs() << "Legalizing PHI: " << MI); + TII->legalizeOperands(MI, MDT); + } + +} Index: llvm/lib/Target/AMDGPU/SIISelLowering.h =================================================================== --- llvm/lib/Target/AMDGPU/SIISelLowering.h +++ llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -386,6 +386,10 @@ unsigned Depth = 0) const override; AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override; + virtual const TargetRegisterClass * + getRegClassFor(MVT VT, bool isDivergent) const override; + virtual bool requiresUniformRegister(MachineFunction &MF, + const Value *V) const override; Align getPrefLoopAlignment(MachineLoop *ML) const override; void allocateHSAUserSGPRs(CCState &CCInfo, Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -10884,3 +10884,110 @@ return AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(RMW); } + +const TargetRegisterClass * +SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const { + const TargetRegisterClass *RC = TargetLoweringBase::getRegClassFor(VT, false); + const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); + if (RC == &AMDGPU::VReg_1RegClass && !isDivergent) + return Subtarget->getWavefrontSize() == 64 ? &AMDGPU::SReg_64RegClass + : &AMDGPU::SReg_32RegClass; + if (!TRI->isSGPRClass(RC) && !isDivergent) + return TRI->getEquivalentSGPRClass(RC); + else if (TRI->isSGPRClass(RC) && isDivergent) + return TRI->getEquivalentVGPRClass(RC); + + return RC; +} + +static bool hasCFUser(const Value *V, SmallPtrSet &Visited) { + if (!Visited.insert(V).second) + return false; + bool Result = false; + for (auto U : V->users()) { + if (const IntrinsicInst *Intrinsic = dyn_cast(U)) { + if (V == U->getOperand(1)) { + switch (Intrinsic->getIntrinsicID()) { + default: + Result = false; + break; + case Intrinsic::amdgcn_if_break: + case Intrinsic::amdgcn_if: + case Intrinsic::amdgcn_else: + Result = true; + break; + } + } + if (V == U->getOperand(0)) { + switch (Intrinsic->getIntrinsicID()) { + default: + Result = false; + break; + case Intrinsic::amdgcn_end_cf: + case Intrinsic::amdgcn_loop: + Result = true; + break; + } + } + } else { + Result = hasCFUser(U, Visited); + } + if (Result) + break; + } + return Result; +} + +bool SITargetLowering::requiresUniformRegister(MachineFunction &MF, + const Value *V) const { + if (const IntrinsicInst *Intrinsic = dyn_cast(V)) { + switch (Intrinsic->getIntrinsicID()) { + default: + return false; + case Intrinsic::amdgcn_if_break: + return true; + } + } + if (const ExtractValueInst *ExtValue = dyn_cast(V)) { + if (const IntrinsicInst *Intrinsic = + dyn_cast(ExtValue->getOperand(0))) { + switch (Intrinsic->getIntrinsicID()) { + default: + return false; + case Intrinsic::amdgcn_if: + case Intrinsic::amdgcn_else: { + ArrayRef Indices = ExtValue->getIndices(); + if (Indices.size() == 1 && Indices[0] == 1) { + return true; + } + } + } + } + } + if (const CallInst *CI = dyn_cast(V)) { + if (isa(CI->getCalledValue())) { + const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo(); + ImmutableCallSite CS(CI); + TargetLowering::AsmOperandInfoVector TargetConstraints = ParseConstraints( + MF.getDataLayout(), Subtarget->getRegisterInfo(), CS); + for (auto &TC : TargetConstraints) { + if (TC.Type == InlineAsm::isOutput) { + ComputeConstraintToUse(TC, SDValue()); + unsigned AssignedReg; + const TargetRegisterClass *RC; + std::tie(AssignedReg, RC) = getRegForInlineAsmConstraint( + SIRI, TC.ConstraintCode, TC.ConstraintVT); + if (RC) { + MachineRegisterInfo &MRI = MF.getRegInfo(); + if (AssignedReg != 0 && SIRI->isSGPRReg(MRI, AssignedReg)) + return true; + else if (SIRI->isSGPRClass(RC)) + return true; + } + } + } + } + } + SmallPtrSet Visited; + return hasCFUser(V, Visited); +} Index: llvm/lib/Target/AMDGPU/SIInstrInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -4222,7 +4222,7 @@ return; // Try to eliminate the copy if it is copying an immediate value. - if (Def->isMoveImmediate()) + if (Def->isMoveImmediate() && DstRC != &AMDGPU::VReg_1RegClass) FoldImmediate(*Copy, *Def, OpReg, &MRI); bool ImpDef = Def->isImplicitDef(); @@ -4480,8 +4480,12 @@ if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) { if (!VRC) { assert(SRC); - VRC = RI.hasAGPRs(getOpRegClass(MI, 0)) ? RI.getEquivalentAGPRClass(SRC) - : RI.getEquivalentVGPRClass(SRC); + if (getOpRegClass(MI, 0) == &AMDGPU::VReg_1RegClass) { + VRC = &AMDGPU::VReg_1RegClass; + } else + VRC = RI.hasAGPRs(getOpRegClass(MI, 0)) + ? RI.getEquivalentAGPRClass(SRC) + : RI.getEquivalentVGPRClass(SRC); } RC = VRC; } else { @@ -5679,7 +5683,7 @@ if (!NewDstRC) return nullptr; } else { - if (RI.hasVGPRs(NewDstRC)) + if (RI.hasVGPRs(NewDstRC) || NewDstRC == &AMDGPU::VReg_1RegClass) return nullptr; NewDstRC = RI.getEquivalentVGPRClass(NewDstRC); Index: llvm/test/CodeGen/AMDGPU/atomicrmw-nand.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/atomicrmw-nand.ll +++ llvm/test/CodeGen/AMDGPU/atomicrmw-nand.ll @@ -5,11 +5,12 @@ ; GCN-LABEL: atomic_nand_i32_lds: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: ds_read_b32 v2, v0 +; GCN-NEXT: ds_read_b32 v1, v0 ; GCN-NEXT: s_mov_b64 s[4:5], 0 ; GCN-NEXT: BB0_1: ; %atomicrmw.start ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v2, v1 ; GCN-NEXT: v_not_b32_e32 v1, v2 ; GCN-NEXT: v_or_b32_e32 v1, -5, v1 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -17,7 +18,6 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN-NEXT: buffer_wbinvl1_vol ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GCN-NEXT: v_mov_b32_e32 v2, v1 ; GCN-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GCN-NEXT: s_cbranch_execnz BB0_1 @@ -33,11 +33,12 @@ ; GCN-LABEL: atomic_nand_i32_global: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: global_load_dword v3, v[0:1], off +; GCN-NEXT: global_load_dword v2, v[0:1], off ; GCN-NEXT: s_mov_b64 s[4:5], 0 ; GCN-NEXT: BB1_1: ; %atomicrmw.start ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v3, v2 ; GCN-NEXT: v_not_b32_e32 v2, v3 ; GCN-NEXT: v_or_b32_e32 v2, -5, v2 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -45,7 +46,6 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN-NEXT: buffer_wbinvl1_vol ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN-NEXT: v_mov_b32_e32 v3, v2 ; GCN-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GCN-NEXT: s_cbranch_execnz BB1_1 @@ -61,11 +61,12 @@ ; GCN-LABEL: atomic_nand_i32_flat: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: flat_load_dword v3, v[0:1] +; GCN-NEXT: flat_load_dword v2, v[0:1] ; GCN-NEXT: s_mov_b64 s[4:5], 0 ; GCN-NEXT: BB2_1: ; %atomicrmw.start ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v3, v2 ; GCN-NEXT: v_not_b32_e32 v2, v3 ; GCN-NEXT: v_or_b32_e32 v2, -5, v2 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -74,7 +75,6 @@ ; GCN-NEXT: buffer_wbinvl1_vol ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GCN-NEXT: v_mov_b32_e32 v3, v2 ; GCN-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GCN-NEXT: s_cbranch_execnz BB2_1 Index: llvm/test/CodeGen/AMDGPU/branch-relaxation.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/branch-relaxation.ll +++ llvm/test/CodeGen/AMDGPU/branch-relaxation.ll @@ -99,7 +99,7 @@ ; GCN-LABEL: {{^}}uniform_conditional_min_long_forward_vcnd_branch: ; GCN: s_load_dword [[CND:s[0-9]+]] -; GCN-DAG: v_mov_b32_e32 [[V_CND:v[0-9]+]], [[CND]] + ; GCN-DAG: v_cmp_eq_f32_e64 [[UNMASKED:s\[[0-9]+:[0-9]+\]]], [[CND]], 0 ; GCN-DAG: s_and_b64 vcc, exec, [[UNMASKED]] ; GCN: s_cbranch_vccz [[LONGBB:BB[0-9]+_[0-9]+]] @@ -117,6 +117,7 @@ ; GCN: v_nop_e64 ; GCN: [[ENDBB]]: +; GCN: v_mov_b32_e32 [[V_CND:v[0-9]+]], [[CND]] ; GCN: buffer_store_dword [[V_CND]] ; GCN: s_endpgm define amdgpu_kernel void @uniform_conditional_min_long_forward_vcnd_branch(float addrspace(1)* %arg, float %cnd) #0 { Index: llvm/test/CodeGen/AMDGPU/branch-uniformity.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/branch-uniformity.ll +++ llvm/test/CodeGen/AMDGPU/branch-uniformity.ll @@ -8,8 +8,8 @@ ; ; CHECK-LABEL: {{^}}main: ; CHECK: ; %LOOP49 -; CHECK: v_cmp_ne_u32_e32 vcc, -; CHECK: s_cbranch_vccnz +; CHECK: s_cmp_lg_u32 s{{[0-9]+}}, 0 +; CHECK: s_cbranch_scc1 ; CHECK: ; %ENDIF53 define amdgpu_vs float @main(i32 %in) { main_body: Index: llvm/test/CodeGen/AMDGPU/commute-shifts.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/commute-shifts.ll +++ llvm/test/CodeGen/AMDGPU/commute-shifts.ll @@ -21,7 +21,7 @@ ; SI-NEXT: v_and_b32_e32 v0, 1, v0 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc -; SI-NEXT: v_cvt_pkrtz_f16_f32_e32 v0, v0, v0 +; SI-NEXT: v_cvt_pkrtz_f16_f32_e32 v0, s0, v0 ; SI-NEXT: ; return to shader part epilog ; ; VI-LABEL: main: @@ -42,7 +42,7 @@ ; VI-NEXT: v_and_b32_e32 v0, 1, v0 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc -; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, v0, v0 +; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, s0, v0 ; VI-NEXT: ; return to shader part epilog bb: %tmp = fptosi float %arg0 to i32 Index: llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll +++ llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll @@ -89,7 +89,7 @@ } ; GCN-LABEL: {{^}}divergent_loop: -; VGPR: workitem_private_segment_byte_size = 16{{$}} +; VGPR: workitem_private_segment_byte_size = 12{{$}} ; GCN: {{^}}; %bb.0: @@ -123,10 +123,9 @@ ; GCN: [[LOOP:BB[0-9]+_[0-9]+]]: ; GCN: buffer_load_dword v[[VAL_LOOP_RELOAD:[0-9]+]], off, s[0:3], s7 offset:[[LOAD0_OFFSET]] ; 4-byte Folded Reload ; GCN: v_subrev_i32_e32 [[VAL_LOOP:v[0-9]+]], vcc, v{{[0-9]+}}, v[[VAL_LOOP_RELOAD]] -; GCN: v_cmp_ne_u32_e32 vcc, -; GCN: s_and_b64 vcc, exec, vcc +; GCN: s_cmp_lg_u32 ; GCN: buffer_store_dword [[VAL_LOOP]], off, s[0:3], s7 offset:[[VAL_SUB_OFFSET:[0-9]+]] ; 4-byte Folded Spill -; GCN-NEXT: s_cbranch_vccnz [[LOOP]] +; GCN-NEXT: s_cbranch_scc1 [[LOOP]] ; GCN: [[END]]: Index: llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll +++ llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll @@ -271,7 +271,7 @@ ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_movk_i32 s10, 0xff00 +; VI-NEXT: s_movk_i32 s8, 0xff00 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -281,23 +281,23 @@ ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 -; VI-NEXT: s_movk_i32 s8, 0xff +; VI-NEXT: s_movk_i32 s9, 0xff ; VI-NEXT: s_mov_b32 s6, s2 ; VI-NEXT: s_mov_b32 s7, s3 -; VI-NEXT: s_movk_i32 s9, 0x900 +; VI-NEXT: s_movk_i32 s10, 0x900 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; VI-NEXT: v_and_b32_e32 v3, s10, v1 +; VI-NEXT: v_and_b32_e32 v3, s8, v1 ; VI-NEXT: v_add_u16_e32 v1, 9, v1 -; VI-NEXT: v_and_b32_e32 v1, s8, v1 +; VI-NEXT: v_and_b32_e32 v1, s9, v1 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; VI-NEXT: v_and_b32_e32 v2, s10, v0 +; VI-NEXT: v_and_b32_e32 v2, s8, v0 ; VI-NEXT: v_add_u16_e32 v0, 9, v0 -; VI-NEXT: v_and_b32_e32 v0, s8, v0 +; VI-NEXT: v_and_b32_e32 v0, s9, v0 ; VI-NEXT: v_or_b32_e32 v1, v3, v1 ; VI-NEXT: v_or_b32_e32 v0, v2, v0 -; VI-NEXT: v_add_u16_e32 v1, s9, v1 -; VI-NEXT: v_add_u16_e32 v0, s9, v0 +; VI-NEXT: v_add_u16_e32 v1, s10, v1 +; VI-NEXT: v_add_u16_e32 v0, s10, v0 ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -360,9 +360,9 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_movk_i32 s14, 0xff00 -; VI-NEXT: s_movk_i32 s12, 0xff -; VI-NEXT: s_movk_i32 s13, 0x900 +; VI-NEXT: s_movk_i32 s12, 0xff00 +; VI-NEXT: s_movk_i32 s13, 0xff +; VI-NEXT: s_movk_i32 s14, 0x900 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 @@ -378,16 +378,16 @@ ; VI-NEXT: s_mov_b32 s3, s7 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; VI-NEXT: v_and_b32_e32 v4, s14, v1 +; VI-NEXT: v_and_b32_e32 v4, s12, v1 ; VI-NEXT: v_add_u16_e32 v1, 9, v1 ; VI-NEXT: v_add_u16_e32 v3, 9, v0 -; VI-NEXT: v_and_b32_e32 v1, s12, v1 +; VI-NEXT: v_and_b32_e32 v1, s13, v1 ; VI-NEXT: v_or_b32_e32 v1, v4, v1 -; VI-NEXT: v_and_b32_e32 v2, s14, v0 -; VI-NEXT: v_and_b32_e32 v3, s12, v3 +; VI-NEXT: v_and_b32_e32 v2, s12, v0 +; VI-NEXT: v_and_b32_e32 v3, s13, v3 ; VI-NEXT: v_or_b32_e32 v2, v2, v3 -; VI-NEXT: v_add_u16_e32 v1, s13, v1 -; VI-NEXT: v_add_u16_e32 v2, s13, v2 +; VI-NEXT: v_add_u16_e32 v1, s14, v1 +; VI-NEXT: v_add_u16_e32 v2, s14, v2 ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_or_b32_e32 v1, v2, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 Index: llvm/test/CodeGen/AMDGPU/cse-phi-incoming-val.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/cse-phi-incoming-val.ll +++ llvm/test/CodeGen/AMDGPU/cse-phi-incoming-val.ll @@ -5,10 +5,11 @@ ; in the generated code. ; CHECK-LABEL: {{^}}mov_opt: -; CHECK: v_mov_b32_e32 {{v[0-9]+}}, 1.0 +; CHECK: s_mov_b32 [[SREG:s[0-9]+]], 1.0 ; CHECK: %bb.1: ; CHECK-NOT: v_mov_b32_e32 {{v[0-9]+}}, 1.0 -; CHECK: BB0_2: +; CHECK: BB0_4: +; CHECK: v_mov_b32_e32 v{{[0-9]+}}, [[SREG]] define amdgpu_ps void @mov_opt(i32 %arg, i32 inreg %arg1, i32 inreg %arg2) local_unnamed_addr #0 { bb: Index: llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll +++ llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll @@ -13,49 +13,47 @@ ; CHECK: ; %bb.0: ; %start ; CHECK-NEXT: v_readfirstlane_b32 s0, v0 ; CHECK-NEXT: s_mov_b32 m0, s0 -; CHECK-NEXT: s_mov_b64 s[4:5], 0 +; CHECK-NEXT: s_mov_b32 s0, 0 ; CHECK-NEXT: v_interp_p1_f32_e32 v0, v1, attr0.x -; CHECK-NEXT: v_cmp_nlt_f32_e64 s[0:1], 0, v0 -; CHECK-NEXT: v_mov_b32_e32 v1, 0 -; CHECK-NEXT: ; implicit-def: $sgpr8_sgpr9 +; CHECK-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0 +; CHECK-NEXT: s_mov_b64 s[4:5], 0 ; CHECK-NEXT: ; implicit-def: $sgpr6_sgpr7 ; CHECK-NEXT: ; implicit-def: $sgpr2_sgpr3 ; CHECK-NEXT: s_branch BB0_3 ; CHECK-NEXT: BB0_1: ; %Flow1 ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 ; CHECK-NEXT: s_or_b64 exec, exec, s[8:9] -; CHECK-NEXT: s_mov_b64 s[8:9], 0 +; CHECK-NEXT: s_mov_b64 s[10:11], 0 ; CHECK-NEXT: BB0_2: ; %Flow ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 -; CHECK-NEXT: s_and_b64 s[10:11], exec, s[6:7] -; CHECK-NEXT: s_or_b64 s[10:11], s[10:11], s[4:5] +; CHECK-NEXT: s_and_b64 s[8:9], exec, s[6:7] +; CHECK-NEXT: s_or_b64 s[8:9], s[8:9], s[4:5] ; CHECK-NEXT: s_andn2_b64 s[2:3], s[2:3], exec -; CHECK-NEXT: s_and_b64 s[4:5], s[8:9], exec +; CHECK-NEXT: s_and_b64 s[4:5], s[10:11], exec ; CHECK-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] -; CHECK-NEXT: s_mov_b64 s[4:5], s[10:11] -; CHECK-NEXT: s_andn2_b64 exec, exec, s[10:11] +; CHECK-NEXT: s_mov_b64 s[4:5], s[8:9] +; CHECK-NEXT: s_andn2_b64 exec, exec, s[8:9] ; CHECK-NEXT: s_cbranch_execz BB0_6 ; CHECK-NEXT: BB0_3: ; %loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 32, v1 -; CHECK-NEXT: s_and_b64 vcc, exec, vcc ; CHECK-NEXT: s_or_b64 s[6:7], s[6:7], exec -; CHECK-NEXT: s_or_b64 s[8:9], s[8:9], exec -; CHECK-NEXT: s_cbranch_vccz BB0_2 +; CHECK-NEXT: s_cmp_lt_u32 s0, 32 +; CHECK-NEXT: s_mov_b64 s[10:11], -1 +; CHECK-NEXT: s_cbranch_scc0 BB0_2 ; CHECK-NEXT: ; %bb.4: ; %endif1 ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 ; CHECK-NEXT: s_mov_b64 s[6:7], -1 -; CHECK-NEXT: s_and_saveexec_b64 s[8:9], s[0:1] +; CHECK-NEXT: s_and_saveexec_b64 s[8:9], vcc ; CHECK-NEXT: s_xor_b64 s[8:9], exec, s[8:9] ; CHECK-NEXT: ; mask branch BB0_1 ; CHECK-NEXT: s_cbranch_execz BB0_1 ; CHECK-NEXT: BB0_5: ; %endif2 ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 -; CHECK-NEXT: v_add_u32_e32 v1, 1, v1 +; CHECK-NEXT: s_add_i32 s0, s0, 1 ; CHECK-NEXT: s_xor_b64 s[6:7], exec, -1 ; CHECK-NEXT: s_branch BB0_1 ; CHECK-NEXT: BB0_6: ; %Flow2 -; CHECK-NEXT: s_or_b64 exec, exec, s[10:11] +; CHECK-NEXT: s_or_b64 exec, exec, s[8:9] ; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: s_and_saveexec_b64 s[0:1], s[2:3] ; CHECK-NEXT: ; mask branch BB0_8 Index: llvm/test/CodeGen/AMDGPU/extract_subvector_vec4_vec3.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/extract_subvector_vec4_vec3.ll +++ llvm/test/CodeGen/AMDGPU/extract_subvector_vec4_vec3.ll @@ -13,9 +13,9 @@ ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[DEF]] ; GCN: [[DEF1:%[0-9]+]]:sreg_128 = IMPLICIT_DEF ; GCN: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[DEF1]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) - ; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub2 - ; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub1 - ; GCN: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub0 + ; GCN: [[COPY1:%[0-9]+]]:sgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub2 + ; GCN: [[COPY2:%[0-9]+]]:sgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub1 + ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub0 ; GCN: [[REG_SEQUENCE:%[0-9]+]]:sgpr_96 = REG_SEQUENCE killed [[COPY3]], %subreg.sub0, killed [[COPY2]], %subreg.sub1, killed [[COPY1]], %subreg.sub2 ; GCN: [[COPY4:%[0-9]+]]:vreg_96 = COPY [[REG_SEQUENCE]] ; GCN: [[DEF2:%[0-9]+]]:sreg_32_xm0 = IMPLICIT_DEF Index: llvm/test/CodeGen/AMDGPU/fabs.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/fabs.ll +++ llvm/test/CodeGen/AMDGPU/fabs.ll @@ -48,8 +48,8 @@ ; R600: |{{(PV|T[0-9])\.[XYZW]}}| ; R600: |{{(PV|T[0-9])\.[XYZW]}}| -; GCN: v_and_b32 -; GCN: v_and_b32 +; GCN: s_and_b32 +; GCN: s_and_b32 define amdgpu_kernel void @fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) { %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in) store <2 x float> %fabs, <2 x float> addrspace(1)* %out @@ -62,10 +62,10 @@ ; R600: |{{(PV|T[0-9])\.[XYZW]}}| ; R600: |{{(PV|T[0-9])\.[XYZW]}}| -; GCN: v_and_b32 -; GCN: v_and_b32 -; GCN: v_and_b32 -; GCN: v_and_b32 +; GCN: s_and_b32 +; GCN: s_and_b32 +; GCN: s_and_b32 +; GCN: s_and_b32 define amdgpu_kernel void @fabs_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) { %fabs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %in) store <4 x float> %fabs, <4 x float> addrspace(1)* %out Index: llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll +++ llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll @@ -85,15 +85,15 @@ ; GCN-LABEL: {{^}}div_v4_1_by_x_25ulp: ; GCN-DAG: s_load_dwordx4 s{{\[}}[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}} -; GCN-DENORM-DAG: s_mov_b32 [[L:s[0-9]+]], 0x6f800000 +; GCN-DENORM-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000 ; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000 -; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]] +; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]] ; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc -; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]] +; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]] ; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc -; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]] +; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]] ; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc -; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]] +; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]] ; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc ; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} @@ -121,15 +121,15 @@ } ; GCN-LABEL: {{^}}div_v4_minus_1_by_x_25ulp: -; GCN-DENORM-DAG: s_mov_b32 [[L:s[0-9]+]], 0x6f800000 +; GCN-DENORM-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000 ; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000 -; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]] +; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]] ; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc -; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]] +; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]] ; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc -; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]] +; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]] ; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc -; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]] +; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]] ; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc ; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, -v{{[0-9]+}} ; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, -v{{[0-9]+}} @@ -156,15 +156,15 @@ } ; GCN-LABEL: {{^}}div_v4_1_by_minus_x_25ulp: -; GCN-DENORM-DAG: s_mov_b32 [[L:s[0-9]+]], 0x6f800000 +; GCN-DENORM-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000 ; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000 -; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]] +; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]] ; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc -; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]] +; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]] ; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc -; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]] +; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]] ; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc -; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]] +; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]] ; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc ; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}}, v{{[0-9]+}} ; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}}, v{{[0-9]+}} @@ -194,15 +194,15 @@ ; GCN-LABEL: {{^}}div_v4_minus_1_by_minus_x_25ulp: ; GCN-DAG: s_load_dwordx4 s{{\[}}[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}} -; GCN-DENORM-DAG: s_mov_b32 [[L:s[0-9]+]], 0x6f800000 +; GCN-DENORM-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000 ; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000 -; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]] +; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]] ; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc -; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]] +; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]] ; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc -; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]] +; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]] ; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc -; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]] +; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]] ; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc ; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} @@ -231,8 +231,6 @@ } ; GCN-LABEL: {{^}}div_v4_c_by_x_25ulp: -; GCN-DAG: s_mov_b32 [[L:s[0-9]+]], 0x6f800000 -; GCN-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000 ; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, 2.0{{$}} ; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, 2.0{{$}} ; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, -2.0{{$}} @@ -240,9 +238,12 @@ ; GCN-DENORM-DAG: v_rcp_f32_e32 ; GCN-DENORM-DAG: v_rcp_f32_e32 -; GCN-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]] +; GCN-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000 +; GCN-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000 + +; GCN-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]] ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc -; GCN-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]] +; GCN-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]] ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc ; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} @@ -273,8 +274,6 @@ } ; GCN-LABEL: {{^}}div_v4_c_by_minus_x_25ulp: -; GCN-DAG: s_mov_b32 [[L:s[0-9]+]], 0x6f800000 -; GCN-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000 ; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, -2.0{{$}} ; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, -2.0{{$}} ; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, -2.0{{$}} @@ -282,9 +281,12 @@ ; GCN-DENORM-DAG: v_rcp_f32_e32 ; GCN-DENORM-DAG: v_rcp_f32_e32 -; GCN-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]] +; GCN-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000 +; GCN-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000 + +; GCN-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]] ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc -; GCN-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]] +; GCN-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]] ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc ; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} Index: llvm/test/CodeGen/AMDGPU/fmin_legacy.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/fmin_legacy.ll +++ llvm/test/CodeGen/AMDGPU/fmin_legacy.ll @@ -33,9 +33,13 @@ ; FUNC-LABEL: {{^}}s_test_fmin_legacy_ule_f32: ; GCN-DAG: s_load_dwordx2 s{{\[}}[[A:[0-9]+]]:[[B:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} -; GCN-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], s[[B]] +; SI-SAFE: v_mov_b32_e32 [[VA:v[0-9]+]], s[[A]] -; SI-SAFE: v_min_legacy_f32_e64 {{v[0-9]+}}, [[VB]], s[[A]] +; GCN-NONAN: v_mov_b32_e32 [[VB:v[0-9]+]], s[[B]] + +; VI-SAFE: v_mov_b32_e32 [[VB:v[0-9]+]], s[[B]] + +; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, s[[B]], [[VA]] ; VI-SAFE: v_mov_b32_e32 [[VA:v[0-9]+]], s[[A]] ; VI-SAFE: v_cmp_ngt_f32_e32 vcc, s[[A]], [[VB]] Index: llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll +++ llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll @@ -59,7 +59,7 @@ ; GCN-LABEL: {{^}}multiple_use_fadd_fmad_f32: ; GCN-DAG: v_add_f32_e64 [[MUL2:v[0-9]+]], |[[X:s[0-9]+]]|, |s{{[0-9]+}}| ; SIVI-DAG: v_mad_f32 [[MAD:v[0-9]+]], |[[X]]|, 2.0, v{{[0-9]+}} -; GFX10-DAG: v_fma_f32 [[MAD:v[0-9]+]], 2.0, |[[X]]|, v{{[0-9]+}} +; GFX10-DAG: v_fma_f32 [[MAD:v[0-9]+]], |[[X]]|, 2.0, s{{[0-9]+}} ; GCN-DAG: buffer_store_dword [[MUL2]] ; GCN-DAG: buffer_store_dword [[MAD]] ; GCN: s_endpgm Index: llvm/test/CodeGen/AMDGPU/fneg-fabs.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/fneg-fabs.ll +++ llvm/test/CodeGen/AMDGPU/fneg-fabs.ll @@ -4,7 +4,7 @@ ; FUNC-LABEL: {{^}}fneg_fabs_fadd_f32: ; SI-NOT: and -; SI: v_sub_f32_e64 {{v[0-9]+}}, {{v[0-9]+}}, |{{s[0-9]+}}| +; SI: v_sub_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, |{{v[0-9]+}}| define amdgpu_kernel void @fneg_fabs_fadd_f32(float addrspace(1)* %out, float %x, float %y) { %fabs = call float @llvm.fabs.f32(float %x) %fsub = fsub float -0.000000e+00, %fabs @@ -15,7 +15,7 @@ ; FUNC-LABEL: {{^}}fneg_fabs_fmul_f32: ; SI-NOT: and -; SI: v_mul_f32_e64 {{v[0-9]+}}, {{v[0-9]+}}, -|{{s[0-9]+}}| +; SI: v_mul_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, -|{{v[0-9]+}}| ; SI-NOT: and define amdgpu_kernel void @fneg_fabs_fmul_f32(float addrspace(1)* %out, float %x, float %y) { %fabs = call float @llvm.fabs.f32(float %x) @@ -85,8 +85,8 @@ ; FIXME: In this case two uses of the constant should be folded ; SI: s_brev_b32 [[SIGNBITK:s[0-9]+]], 1{{$}} -; SI: v_or_b32_e32 v{{[0-9]+}}, [[SIGNBITK]], v{{[0-9]+}} -; SI: v_or_b32_e32 v{{[0-9]+}}, [[SIGNBITK]], v{{[0-9]+}} +; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[SIGNBITK]] +; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[SIGNBITK]] define amdgpu_kernel void @fneg_fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) { %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in) %fsub = fsub <2 x float> , %fabs @@ -96,10 +96,10 @@ ; FUNC-LABEL: {{^}}fneg_fabs_v4f32: ; SI: s_brev_b32 [[SIGNBITK:s[0-9]+]], 1{{$}} -; SI: v_or_b32_e32 v{{[0-9]+}}, [[SIGNBITK]], v{{[0-9]+}} -; SI: v_or_b32_e32 v{{[0-9]+}}, [[SIGNBITK]], v{{[0-9]+}} -; SI: v_or_b32_e32 v{{[0-9]+}}, [[SIGNBITK]], v{{[0-9]+}} -; SI: v_or_b32_e32 v{{[0-9]+}}, [[SIGNBITK]], v{{[0-9]+}} +; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[SIGNBITK]] +; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[SIGNBITK]] +; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[SIGNBITK]] +; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[SIGNBITK]] define amdgpu_kernel void @fneg_fabs_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) { %fabs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %in) %fsub = fsub <4 x float> , %fabs Index: llvm/test/CodeGen/AMDGPU/fneg.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/fneg.ll +++ llvm/test/CodeGen/AMDGPU/fneg.ll @@ -19,8 +19,8 @@ ; R600: -PV ; GCN: s_brev_b32 [[SIGNBIT:s[0-9]+]], 1 -; GCN: v_xor_b32 -; GCN: v_xor_b32 +; GCN: s_xor_b32 +; GCN: s_xor_b32 define amdgpu_kernel void @s_fneg_v2f32(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) { %fneg = fsub <2 x float> , %in store <2 x float> %fneg, <2 x float> addrspace(1)* %out @@ -33,10 +33,10 @@ ; R600: -PV ; R600: -PV -; GCN: v_xor_b32 -; GCN: v_xor_b32 -; GCN: v_xor_b32 -; GCN: v_xor_b32 +; GCN: s_xor_b32 +; GCN: s_xor_b32 +; GCN: s_xor_b32 +; GCN: s_xor_b32 define amdgpu_kernel void @s_fneg_v4f32(<4 x float> addrspace(1)* nocapture %out, <4 x float> %in) { %fneg = fsub <4 x float> , %in store <4 x float> %fneg, <4 x float> addrspace(1)* %out Index: llvm/test/CodeGen/AMDGPU/fsub.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/fsub.ll +++ llvm/test/CodeGen/AMDGPU/fsub.ll @@ -27,8 +27,8 @@ ; R600-DAG: ADD {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, -KC0[3].Z ; R600-DAG: ADD {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, -KC0[3].Y -; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} -; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} +; SI: v_sub_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} +; SI: v_sub_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} define amdgpu_kernel void @fsub_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) { %sub = fsub <2 x float> %a, %b store <2 x float> %sub, <2 x float> addrspace(1)* %out, align 8 @@ -55,10 +55,10 @@ } ; FUNC-LABEL: {{^}}s_fsub_v4f32: -; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} -; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} -; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} -; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} +; SI: v_sub_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} +; SI: v_sub_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} +; SI: v_sub_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} +; SI: v_sub_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} ; SI: s_endpgm define amdgpu_kernel void @s_fsub_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) { %result = fsub <4 x float> %a, %b Index: llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll +++ llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll @@ -3,22 +3,22 @@ ; SI-LABEL: {{^}}i1_copy_from_loop: ; -; SI: ; %Flow -; SI-DAG: s_andn2_b64 [[LCSSA_ACCUM:s\[[0-9]+:[0-9]+\]]], [[LCSSA_ACCUM]], exec -; SI-DAG: s_and_b64 [[CC_MASK2:s\[[0-9]+:[0-9]+\]]], [[CC_ACCUM:s\[[0-9]+:[0-9]+\]]], exec -; SI: s_or_b64 [[LCSSA_ACCUM]], [[LCSSA_ACCUM]], [[CC_MASK2]] +; SI: [[LOOP:BB0_[0-9]+]]: ; %Flow1 +; SI: s_or_b64 exec, exec, [[EXIT_MASK:s\[[0-9]+:[0-9]+\]]] +; SI: ; %Flow +; SI: s_and_b64 [[ACCUM_MASK:s\[[0-9]+:[0-9]+\]]], [[CC_MASK:s\[[0-9]+:[0-9]+\]]], exec +; SI: s_or_b64 [[I1_VALUE:s\[[0-9]+:[0-9]+\]]], s[6:7], [[ACCUM_MASK]] +; SI: s_cbranch_execz [[FOR_END_LABEL:BB0_[0-9]+]] ; SI: ; %for.body -; SI: v_cmp_gt_u32_e64 [[CC_SREG:s\[[0-9]+:[0-9]+\]]], 4, -; SI-DAG: s_andn2_b64 [[CC_ACCUM]], [[CC_ACCUM]], exec -; SI-DAG: s_and_b64 [[CC_MASK:s\[[0-9]+:[0-9]+\]]], [[CC_SREG]], exec -; SI: s_or_b64 [[CC_ACCUM]], [[CC_ACCUM]], [[CC_MASK]] - -; SI: ; %Flow1 -; SI: s_or_b64 [[CC_ACCUM]], [[CC_ACCUM]], exec - -; SI: ; %for.end -; SI: s_and_saveexec_b64 {{s\[[0-9]+:[0-9]+\]}}, [[LCSSA_ACCUM]] +; SI: v_cmp_lt_u32_e64 [[CC_MASK]], s{{[0-9]+}}, 4 + +; SI: [[FOR_END_LABEL]] +; SI: s_or_b64 exec, exec, [[EXIT_MASK]] +; SI: s_and_saveexec_b64 {{s\[[0-9]+:[0-9]+\]}}, [[I1_VALUE]] +; SI: s_cbranch_execz [[EXIT:BB0_[0-9]+]] +; SI: [[EXIT]] +; SI-NEXT: s_endpgm define amdgpu_ps void @i1_copy_from_loop(<4 x i32> inreg %rsrc, i32 %tid) { entry: Index: llvm/test/CodeGen/AMDGPU/i1-copy-phi-uniform-branch.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/i1-copy-phi-uniform-branch.ll +++ llvm/test/CodeGen/AMDGPU/i1-copy-phi-uniform-branch.ll @@ -7,7 +7,6 @@ ; GCN: s_cbranch_scc1 [[PREEXIT:BB[0-9_]+]] ; GCN: ; %blocka -; GCN: s_xor_b64 s[{{[0-9:]+}}], exec, -1 ; GCN: s_cmp_eq_u32 s1, 0 ; GCN: s_cbranch_scc1 [[EXIT:BB[0-9_]+]] Index: llvm/test/CodeGen/AMDGPU/implicit-def-muse.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/implicit-def-muse.ll +++ llvm/test/CodeGen/AMDGPU/implicit-def-muse.ll @@ -1,9 +1,9 @@ ; RUN: llc -march=amdgcn -stop-after=amdgpu-isel -verify-machineinstrs -o - %s | FileCheck %s ; CHECK-LABEL: vcopy_i1_undef -; CHECK: [[IMPDEF0:%[0-9]+]]:vreg_1 = IMPLICIT_DEF +; CHECK: [[IMPDEF0:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; CHECK-NOT: COPY -; CHECK: [[IMPDEF1:%[0-9]+]]:vreg_1 = IMPLICIT_DEF +; CHECK: [[IMPDEF1:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; CHECK-NOT: COPY [[IMPDEF0]] ; CHECK-NOT: COPY [[IMPDEF1]] ; CHECK: .false: Index: llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll +++ llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll @@ -14,15 +14,14 @@ ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4 -; SI-NEXT: s_mov_b32 s8, 0x40a00000 -; SI-NEXT: s_mov_b32 s3, 0x100f000 -; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s4, 0x40a00000 ; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: s_mov_b32 s3, 0x100f000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -30,15 +29,14 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 -; VI-NEXT: s_mov_b32 s8, 0x40a00000 -; VI-NEXT: s_mov_b32 s3, 0x1100f000 -; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, 0x40a00000 ; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: s_mov_b32 s3, 0x1100f000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_endpgm %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 0 @@ -51,15 +49,14 @@ ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4 -; SI-NEXT: s_mov_b32 s8, 0x40a00000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s5, 0x40a00000 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v1, s8 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -67,15 +64,14 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 -; VI-NEXT: s_mov_b32 s8, 0x40a00000 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s5, 0x40a00000 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: v_mov_b32_e32 v1, s8 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_endpgm %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 1 @@ -88,15 +84,14 @@ ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4 -; SI-NEXT: s_mov_b32 s8, 0x40a00000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0x40a00000 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v2, s8 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -104,15 +99,14 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 -; VI-NEXT: s_mov_b32 s8, 0x40a00000 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s6, 0x40a00000 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: v_mov_b32_e32 v2, s8 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_endpgm %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 2 @@ -125,15 +119,14 @@ ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4 -; SI-NEXT: s_mov_b32 s8, 0x40a00000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s7, 0x40a00000 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v3, s7 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s8 +; SI-NEXT: v_mov_b32_e32 v3, s7 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -141,15 +134,14 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 -; VI-NEXT: s_mov_b32 s8, 0x40a00000 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s7, 0x40a00000 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v3, s7 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_mov_b32_e32 v3, s8 +; VI-NEXT: v_mov_b32_e32 v3, s7 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_endpgm %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 3 @@ -201,8 +193,8 @@ ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v1, 0x40a00000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -214,8 +206,8 @@ ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v1, 0x40a00000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 ; VI-NEXT: s_endpgm %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 1 @@ -232,8 +224,8 @@ ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v2, 0x40a00000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -245,8 +237,8 @@ ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v2, 0x40a00000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 ; VI-NEXT: s_endpgm %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 2 @@ -293,16 +285,16 @@ ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 ; SI-NEXT: s_load_dword s4, s[4:5], 0x4 -; SI-NEXT: v_mov_b32_e32 v2, 0x40a00000 +; SI-NEXT: v_mov_b32_e32 v0, 0x40a00000 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s7 +; SI-NEXT: v_mov_b32_e32 v1, s7 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1 -; SI-NEXT: v_cndmask_b32_e32 v1, v2, v0, vcc -; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc +; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0 -; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -311,16 +303,16 @@ ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8 ; VI-NEXT: s_load_dword s4, s[4:5], 0x10 -; VI-NEXT: v_mov_b32_e32 v2, 0x40a00000 +; VI-NEXT: v_mov_b32_e32 v0, 0x40a00000 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s7 +; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1 -; VI-NEXT: v_cndmask_b32_e32 v1, v2, v0, vcc -; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0 -; VI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %vecins = insertelement <2 x float> %a, float 5.000000e+00, i32 %b @@ -334,19 +326,19 @@ ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x4 ; SI-NEXT: s_load_dword s4, s[4:5], 0x8 -; SI-NEXT: v_mov_b32_e32 v3, 0x40a00000 +; SI-NEXT: v_mov_b32_e32 v0, 0x40a00000 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: v_mov_b32_e32 v1, s10 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2 -; SI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; SI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc ; SI-NEXT: v_mov_b32_e32 v1, s9 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1 -; SI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc +; SI-NEXT: v_mov_b32_e32 v3, s8 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0 -; SI-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; SI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -355,19 +347,19 @@ ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10 ; VI-NEXT: s_load_dword s4, s[4:5], 0x20 -; VI-NEXT: v_mov_b32_e32 v3, 0x40a00000 +; VI-NEXT: v_mov_b32_e32 v0, 0x40a00000 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s10 +; VI-NEXT: v_mov_b32_e32 v1, s10 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2 -; VI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; VI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc ; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1 -; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s8 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0 -; VI-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 ; VI-NEXT: s_endpgm %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 %b @@ -381,22 +373,22 @@ ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x4 ; SI-NEXT: s_load_dword s4, s[4:5], 0x8 -; SI-NEXT: v_mov_b32_e32 v4, 0x40a00000 +; SI-NEXT: v_mov_b32_e32 v0, 0x40a00000 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s11 +; SI-NEXT: v_mov_b32_e32 v1, s11 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 3 -; SI-NEXT: v_cndmask_b32_e32 v3, v4, v0, vcc -; SI-NEXT: v_mov_b32_e32 v0, s10 +; SI-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc +; SI-NEXT: v_mov_b32_e32 v1, s10 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2 -; SI-NEXT: v_cndmask_b32_e32 v2, v4, v0, vcc -; SI-NEXT: v_mov_b32_e32 v0, s9 +; SI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc +; SI-NEXT: v_mov_b32_e32 v1, s9 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1 -; SI-NEXT: v_cndmask_b32_e32 v1, v4, v0, vcc -; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc +; SI-NEXT: v_mov_b32_e32 v4, s8 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0 -; SI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -405,22 +397,22 @@ ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10 ; VI-NEXT: s_load_dword s4, s[4:5], 0x20 -; VI-NEXT: v_mov_b32_e32 v4, 0x40a00000 +; VI-NEXT: v_mov_b32_e32 v0, 0x40a00000 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s11 +; VI-NEXT: v_mov_b32_e32 v1, s11 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 3 -; VI-NEXT: v_cndmask_b32_e32 v3, v4, v0, vcc -; VI-NEXT: v_mov_b32_e32 v0, s10 +; VI-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v1, s10 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2 -; VI-NEXT: v_cndmask_b32_e32 v2, v4, v0, vcc -; VI-NEXT: v_mov_b32_e32 v0, s9 +; VI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1 -; VI-NEXT: v_cndmask_b32_e32 v1, v4, v0, vcc -; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v4, s8 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0 -; VI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_endpgm %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 %b Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll +++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll @@ -387,7 +387,7 @@ ; SI-LABEL: {{^}}test_div_scale_f32_undef_undef_val: ; SI-NOT: v0 -; SI: v_div_scale_f32 v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, v0, v0, v0 +; SI: v_div_scale_f32 v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s0, s0, v0 define amdgpu_kernel void @test_div_scale_f32_undef_undef_val(float addrspace(1)* %out) #0 { %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float undef, float undef, i1 false) %result0 = extractvalue { float, i1 } %result, 0 Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmed3.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmed3.ll +++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmed3.ll @@ -53,8 +53,8 @@ } ; GCN-LABEL: {{^}}test_fneg_fmed3_rr_0: -; GCN: s_brev_b32 [[NEG0:s[0-9]+]], 1 -; GCN: v_med3_f32 v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}}, [[NEG0]] +; GCN: v_bfrev_b32_e32 [[NEG0:v[0-9]+]], 1 +; GCN: v_med3_f32 v{{[0-9]+}}, -s{{[0-9]+}}, -v{{[0-9]+}}, [[NEG0]] define amdgpu_kernel void @test_fneg_fmed3_rr_0(float addrspace(1)* %out, float %src0, float %src1) #1 { %med3 = call float @llvm.amdgcn.fmed3.f32(float %src0, float %src1, float 0.0) %neg.med3 = fsub float -0.0, %med3 @@ -88,8 +88,8 @@ ; GCN-LABEL: {{^}}test_fneg_fmed3_r_inv2pi_0_foldable_user: ; GCN-DAG: v_bfrev_b32_e32 [[NEG0:v[0-9]+]], 1 -; GCN-DAG: s_mov_b32 [[NEG_INV:s[0-9]+]], 0xbe22f983 -; GCN: v_med3_f32 [[MED3:v[0-9]+]], -v{{[0-9]+}}, [[NEG_INV]], [[NEG0]] +; GCN-DAG: v_mov_b32_e32 [[NEG_INV:v[0-9]+]], 0xbe22f983 +; GCN: v_med3_f32 [[MED3:v[0-9]+]], -s{{[0-9]+}}, [[NEG_INV]], [[NEG0]] ; GCN: v_mul_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[MED3]] define amdgpu_kernel void @test_fneg_fmed3_r_inv2pi_0_foldable_user(float addrspace(1)* %out, float %src0, float %mul.arg) #1 { %med3 = call float @llvm.amdgcn.fmed3.f32(float %src0, float 0x3FC45F3060000000, float 0.0) Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll +++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll @@ -43,6 +43,7 @@ ; PREGFX10-OPT: s_mov_b32 ; PREGFX10-OPT: s_mov_b32 ; PREGFX10-NOOPT: s_waitcnt +; PREGFX10-NOOPT: v_mov_b32_e32 ; PREGFX10-NOOPT-NEXT: s_nop 0 ; VI: v_mov_b32_dpp [[VGPR0:v[0-9]+]], v{{[0-9]+}} quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0 ; PREGFX10-OPT: s_nop 1 Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.pk.u16.u8.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.pk.u16.u8.ll +++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.pk.u16.u8.ll @@ -4,7 +4,7 @@ declare i64 @llvm.amdgcn.mqsad.pk.u16.u8(i64, i32, i64) #0 ; GCN-LABEL: {{^}}v_mqsad_pk_u16_u8: -; GCN: v_mqsad_pk_u16_u8 v[0:1], v[4:5], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] +; GCN: v_mqsad_pk_u16_u8 v[0:1], v[4:5], s{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] ; GCN-DAG: v_mov_b32_e32 v5, v1 ; GCN-DAG: v_mov_b32_e32 v4, v0 define amdgpu_kernel void @v_mqsad_pk_u16_u8(i64 addrspace(1)* %out, i64 %src) { Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.qsad.pk.u16.u8.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.qsad.pk.u16.u8.ll +++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.qsad.pk.u16.u8.ll @@ -4,7 +4,7 @@ declare i64 @llvm.amdgcn.qsad.pk.u16.u8(i64, i32, i64) #0 ; GCN-LABEL: {{^}}v_qsad_pk_u16_u8: -; GCN: v_qsad_pk_u16_u8 v[0:1], v[4:5], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] +; GCN: v_qsad_pk_u16_u8 v[0:1], v[4:5], s{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] ; GCN-DAG: v_mov_b32_e32 v5, v1 ; GCN-DAG: v_mov_b32_e32 v4, v0 define amdgpu_kernel void @v_qsad_pk_u16_u8(i64 addrspace(1)* %out, i64 %src) { Index: llvm/test/CodeGen/AMDGPU/loop_break.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/loop_break.ll +++ llvm/test/CodeGen/AMDGPU/loop_break.ll @@ -23,26 +23,27 @@ ; OPT: call void @llvm.amdgcn.end.cf.i64(i64 ; GCN-LABEL: {{^}}break_loop: -; GCN: s_mov_b64 [[OUTER_MASK:s\[[0-9]+:[0-9]+\]]], 0{{$}} +; GCN: s_mov_b64 [[ACCUM_MASK:s\[[0-9]+:[0-9]+\]]], 0{{$}} ; GCN: [[LOOP_ENTRY:BB[0-9]+_[0-9]+]]: ; %bb1 -; GCN: v_cmp_lt_i32_e32 vcc, -1 -; GCN: s_and_b64 vcc, exec, vcc -; GCN: s_or_b64 [[INNER_MASK:s\[[0-9]+:[0-9]+\]]], [[INNER_MASK]], exec -; GCN: s_cbranch_vccnz [[FLOW:BB[0-9]+_[0-9]+]] +; GCN: s_add_i32 s4, s4, 1 +; GCN: s_or_b64 [[INNER_MASK:s\[[0-9]+:[0-9]+\]]], [[INNER_MASK]], exec +; GCN: s_cmp_gt_i32 s4, -1 +; GCN: s_cbranch_scc1 [[FLOW:BB[0-9]+_[0-9]+]] ; GCN: ; %bb4 ; GCN: buffer_load_dword -; GCN: v_cmp_ge_i32_e32 vcc, -; GCN: s_andn2_b64 [[INNER_MASK]], [[INNER_MASK]], exec -; GCN: s_and_b64 [[TMP0:s\[[0-9]+:[0-9]+\]]], vcc, exec -; GCN: s_or_b64 [[INNER_MASK]], [[INNER_MASK]], [[TMP0]] +; GCN: v_cmp_ge_i32_e32 vcc +; GCN: s_andn2_b64 [[INNER_MASK]], [[INNER_MASK]], exec +; GCN: s_and_b64 [[BROKEN_MASK:s\[[0-9]+:[0-9]+\]]], vcc, exec +; GCN: s_or_b64 [[INNER_MASK]], [[INNER_MASK]], [[BROKEN_MASK]] ; GCN: [[FLOW]]: ; %Flow -; GCN: s_and_b64 [[TMP1:s\[[0-9]+:[0-9]+\]]], exec, [[INNER_MASK]] -; GCN: s_or_b64 [[TMP1]], [[TMP1]], [[OUTER_MASK]] -; GCN: s_mov_b64 [[OUTER_MASK]], [[TMP1]] -; GCN: s_andn2_b64 exec, exec, [[TMP1]] +; GCN: ; in Loop: Header=BB0_1 Depth=1 +; GCN: s_and_b64 [[BROKEN_MASK]], exec, [[INNER_MASK]] +; GCN: s_or_b64 [[BROKEN_MASK]], [[BROKEN_MASK]], [[ACCUM_MASK]] +; GCN: s_mov_b64 [[ACCUM_MASK]], [[BROKEN_MASK]] +; GCN: s_andn2_b64 exec, exec, [[BROKEN_MASK]] ; GCN-NEXT: s_cbranch_execnz [[LOOP_ENTRY]] ; GCN: ; %bb.4: ; %bb9 Index: llvm/test/CodeGen/AMDGPU/madak.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/madak.ll +++ llvm/test/CodeGen/AMDGPU/madak.ll @@ -1,7 +1,7 @@ ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX6,GFX6_8_9,MAD %s ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8,GFX6_8_9,GFX8_9,GFX8_9_10,MAD %s ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck -check-prefixes=GCN,GFX9,GFX6_8_9,GFX8_9,GFX8_9_10,MAD %s -; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck -check-prefixes=GCN,GFX10,GFX8_9_10,MAD,GFX10-MAD %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck -check-prefixes=GCN,GFX10,GFX8_9_10,GFX10-MAD %s ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -fp-contract=fast -amdgpu-enable-global-sgpr-addr < %s | FileCheck -check-prefixes=GCN,GFX10,GFX8_9_10,FMA %s declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone @@ -17,6 +17,7 @@ ; GFX10: {{flat|global}}_load_dword [[VA:v[0-9]+]] ; GFX10: {{flat|global}}_load_dword [[VB:v[0-9]+]] ; MAD: v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000 +; GFX10-MAD: v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000 ; FMA: v_fmaak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000 define amdgpu_kernel void @madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone @@ -79,6 +80,7 @@ ; GCN-LABEL: {{^}}madak_m_inline_imm_f32: ; GCN: {{buffer|flat|global}}_load_dword [[VA:v[0-9]+]] ; MAD: v_madak_f32 {{v[0-9]+}}, 4.0, [[VA]], 0x41200000 +; GFX10-MAD: v_madak_f32 {{v[0-9]+}}, 4.0, [[VA]], 0x41200000 ; FMA: v_fmaak_f32 {{v[0-9]+}}, 4.0, [[VA]], 0x41200000 define amdgpu_kernel void @madak_m_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a) nounwind { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone @@ -106,6 +108,7 @@ ; GFX10: {{flat|global}}_load_dword [[VA:v[0-9]+]] ; GFX10: {{flat|global}}_load_dword [[VB:v[0-9]+]] ; MAD: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VB]], 4.0 +; GFX10-MAD: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VB]], 4.0 ; FMA: v_fma_f32 {{v[0-9]+}}, [[VA]], [[VB]], 4.0 define amdgpu_kernel void @madak_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone @@ -234,9 +237,12 @@ ; On GFX10+ we can use two scalar operands. ; GCN-LABEL: {{^}}madak_constant_bus_violation: ; GCN: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0x12|0x48}} -; GCN: v_mov_b32_e32 [[SGPR0_VCOPY:v[0-9]+]], [[SGPR0]] + ; GCN: {{buffer|flat|global}}_load_dword [[VGPR:v[0-9]+]] -; MAD: v_madak_f32 [[MADAK:v[0-9]+]], 0.5, [[SGPR0_VCOPY]], 0x42280000 +; MAD: v_mov_b32_e32 [[MADAK:v[0-9]+]], 0x42280000 +; MAD: v_mac_f32_e64 [[MADAK]], [[SGPR0]], 0.5 +; GFX10: v_mov_b32_e32 [[SGPR0_VCOPY:v[0-9]+]], [[SGPR0]] +; GFX10-MAD: v_madak_f32 [[MADAK:v[0-9]+]], 0.5, [[SGPR0_VCOPY]], 0x42280000 ; FMA: v_fmaak_f32 [[MADAK:v[0-9]+]], 0.5, [[SGPR0_VCOPY]], 0x42280000 ; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[MADAK]], [[VGPR]] ; GFX6: buffer_store_dword [[MUL]] Index: llvm/test/CodeGen/AMDGPU/multilevel-break.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/multilevel-break.ll +++ llvm/test/CodeGen/AMDGPU/multilevel-break.ll @@ -44,20 +44,12 @@ ; GCN: s_cbranch_execz [[FLOW2]] ; GCN: [[INNER_LOOP:BB[0-9]+_[0-9]+]]: ; %LOOP{{$}} -; GCN: s_or_b64 [[BREAK_OUTER]], [[BREAK_OUTER]], exec -; GCN: s_or_b64 [[BREAK_INNER]], [[BREAK_INNER]], exec ; GCN: s_and_saveexec_b64 [[SAVE_EXEC]], vcc ; FIXME: duplicate comparison ; GCN: ; %ENDIF ; GCN-DAG: v_cmp_eq_u32_e32 vcc, ; GCN-DAG: v_cmp_ne_u32_e64 [[TMP51NEG:s\[[0-9]+:[0-9]+\]]], -; GCN-DAG: s_andn2_b64 [[BREAK_OUTER]], [[BREAK_OUTER]], exec -; GCN-DAG: s_andn2_b64 [[BREAK_INNER]], [[BREAK_INNER]], exec -; GCN-DAG: s_and_b64 [[TMP_EQ:s\[[0-9]+:[0-9]+\]]], vcc, exec -; GCN-DAG: s_and_b64 [[TMP_NE:s\[[0-9]+:[0-9]+\]]], [[TMP51NEG]], exec -; GCN-DAG: s_or_b64 [[BREAK_OUTER]], [[BREAK_OUTER]], [[TMP_EQ]] -; GCN-DAG: s_or_b64 [[BREAK_INNER]], [[BREAK_INNER]], [[TMP_NE]] ; GCN: [[IF_BLOCK]]: ; %IF ; GCN-NEXT: s_endpgm @@ -90,39 +82,47 @@ ; OPT: llvm.amdgcn.end.cf ; GCN-LABEL: {{^}}multi_if_break_loop: -; GCN: s_mov_b64 [[LEFT:s\[[0-9]+:[0-9]+\]]], 0{{$}} +; GCN: s_mov_b64 [[BROKEN_THREADS_MASK:s\[[0-9]+:[0-9]+\]]], 0{{$}} -; GCN: ; %Flow4 -; GCN: s_and_b64 [[BREAK:s\[[0-9]+:[0-9]+\]]], exec, [[BREAK]] -; GCN: s_or_b64 [[LEFT]], [[BREAK]], [[OLD_LEFT:s\[[0-9]+:[0-9]+\]]] -; GCN: s_andn2_b64 exec, exec, [[LEFT]] -; GCN-NEXT: s_cbranch_execz +; GCN: [[LOOP:BB[0-9]+_[0-9]+]]: ; %Flow4 +; GCN: s_and_b64 [[BROKEN_THREADS_MASK]], exec, [[BROKEN_THREADS_MASK]] +; GCN: s_or_b64 [[BROKEN_THREADS_MASK]], [[BROKEN_THREADS_MASK]], [[SAVED:s\[[0-9]+:[0-9]+\]]] +; GCN: s_andn2_b64 exec, exec, [[BROKEN_THREADS_MASK]] +; GCN-NEXT: s_cbranch_execz [[LOOP_EXIT:BB[0-9]+_[0-9]+]] -; GCN: [[LOOP:BB[0-9]+_[0-9]+]]: ; %bb1{{$}} -; GCN: s_mov_b64 [[OLD_LEFT]], [[LEFT]] +; GCN: ; %bb1{{$}} +; GCN: buffer_load_dword [[LOAD0:v[0-9]+]], +; GCN: s_mov_b64 [[SAVED]], [[BROKEN_THREADS_MASK]] ; GCN: ; %LeafBlock1 -; GCN: s_mov_b64 -; GCN: s_mov_b64 [[BREAK]], -1{{$}} +; GCN: v_cmp_eq_u32_e32 vcc, 1, [[LOAD0]] +; GCN: s_and_b64 vcc, exec, vcc +; GCN: s_cbranch_vccz [[FLOW:BB[0-9]+_[0-9]+]] ; GCN: ; %case1 ; GCN: buffer_load_dword [[LOAD2:v[0-9]+]], ; GCN: v_cmp_ge_i32_e32 vcc, {{v[0-9]+}}, [[LOAD2]] -; GCN: s_orn2_b64 [[BREAK]], vcc, exec +; GCN: s_orn2_b64 [[BROKEN_THREADS_MASK]], vcc, exec +; GCN: BB1_{{[0-9]+}}: +; GCN: s_mov_b64 [[FALSE_MASK:s\[[0-9]+:[0-9]+\]]], 0 +; GCN: s_and_b64 vcc, exec, [[FALSE_MASK]] +; GCN: s_cbranch_vccz [[LOOP]] -; GCN: ; %Flow3 -; GCN: s_branch [[FLOW:BB[0-9]+_[0-9]+]] - -; GCN: s_mov_b64 [[BREAK]], -1{{$}} - -; GCN: [[FLOW]]: ; %Flow +; GCN: ; %LeafBlock +; GCN: v_cmp_eq_u32_e32 vcc, 0, [[LOAD0]] +; GCN: s_and_b64 vcc, exec, vcc +; GCN: s_cbranch_vccz [[LOOP]] ; GCN: ; %case0 ; GCN: buffer_load_dword [[LOAD1:v[0-9]+]], -; GCN-DAG: s_andn2_b64 [[BREAK]], [[BREAK]], exec ; GCN-DAG: v_cmp_ge_i32_e32 vcc, {{v[0-9]+}}, [[LOAD1]] -; GCN-DAG: s_and_b64 [[TMP:s\[[0-9]+:[0-9]+\]]], vcc, exec -; GCN: s_or_b64 [[BREAK]], [[BREAK]], [[TMP]] +; GCN: s_andn2_b64 [[BROKEN_THREADS_MASK]], [[BROKEN_THREADS_MASK]], exec +; GCN: s_and_b64 [[TMP_MASK:s\[[0-9]+:[0-9]+\]]], vcc, exec +; GCN: s_or_b64 [[BROKEN_THREADS_MASK]], [[BROKEN_THREADS_MASK]], [[TMP_MASK]] +; GCN: s_branch [[LOOP]] + +; GCN: [[LOOP_EXIT]]: ; %Flow6 +; GCN: s_or_b64 exec, exec, [[BROKEN_THREADS_MASK]] define amdgpu_kernel void @multi_if_break_loop(i32 %arg) #0 { bb: Index: llvm/test/CodeGen/AMDGPU/select-opt.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/select-opt.ll +++ llvm/test/CodeGen/AMDGPU/select-opt.ll @@ -135,8 +135,8 @@ ; GCN-LABEL: {{^}}regression: ; GCN: v_cmp_neq_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}, 1.0 -; GCN: v_cmp_neq_f32_e32 vcc, 0, v{{[0-9]+}} -; GCN: v_cmp_eq_f32_e32 vcc, 0, v{{[0-9]+}} +; GCN: v_cmp_neq_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}, 0 +; GCN: v_cmp_eq_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}, 0 define amdgpu_kernel void @regression(float addrspace(1)* %out, float %c0, float %c1) #0 { entry: Index: llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll +++ llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll @@ -104,7 +104,8 @@ ; SI: ; %else ; SI: buffer_load_dword [[AVAL:v[0-9]+]] -; SI: v_cmp_gt_i32_e64 [[PHI:s\[[0-9]+:[0-9]+\]]], 0, [[AVAL]] +; SI: v_cmp_gt_i32_e32 vcc, 0, [[AVAL]] +; SI: s_and_b64 [[PHI:s\[[0-9]+:[0-9]+\]]], vcc, exec ; SI: ; %if ; SI: buffer_load_dword [[AVAL:v[0-9]+]] Index: llvm/test/CodeGen/AMDGPU/sgpr-copy.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/sgpr-copy.ll +++ llvm/test/CodeGen/AMDGPU/sgpr-copy.ll @@ -3,7 +3,8 @@ ; CHECK-LABEL: {{^}}phi1: ; CHECK: s_buffer_load_dword [[DST:s[0-9]]], {{s\[[0-9]+:[0-9]+\]}}, 0x0 -; CHECK: v_mov_b32_e32 v{{[0-9]}}, [[DST]] +; CHECK: ; %bb.1: ; %ELSE +; CHECK: s_xor_b32 s{{[0-9]}}, [[DST]] define amdgpu_ps void @phi1(<4 x i32> addrspace(4)* inreg %arg, <4 x i32> addrspace(4)* inreg %arg1, <8 x i32> addrspace(4)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 { main_body: %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(4)* %arg, i32 0 Index: llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll +++ llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll @@ -33,7 +33,6 @@ ; SI: ; %else ; SI: v_cmp_eq_u32_e64 [[TMP:s\[[0-9]+:[0-9]+\]]], -; SI: s_and_b64 [[PHI]], [[TMP]], exec ; SI: ; %endif Index: llvm/test/CodeGen/AMDGPU/si-fix-sgpr-copies.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/si-fix-sgpr-copies.mir +++ llvm/test/CodeGen/AMDGPU/si-fix-sgpr-copies.mir @@ -16,7 +16,7 @@ body: | ; GCN-LABEL: name: phi_visit_order - ; GCN: V_ADD_I32 + ; GCN: S_ADD_I32 bb.0: liveins: $vgpr0 %7 = COPY $vgpr0 Index: llvm/test/CodeGen/AMDGPU/smrd.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/smrd.ll +++ llvm/test/CodeGen/AMDGPU/smrd.ll @@ -573,7 +573,6 @@ ; ; TODO: we should keep the loop counter in an SGPR ; -; GCN: v_readfirstlane_b32 ; GCN: s_buffer_load_dword define amdgpu_ps float @smrd_uniform_loop(<4 x i32> inreg %desc, i32 %bound) #0 { main_body: Index: llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll +++ llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll @@ -1,28 +1,43 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -march=amdgcn -mcpu=tahiti -amdgpu-dce-in-ra=0 -o - %s | FileCheck %s ; Don't crash when the use of an undefined value is only detected by the ; register coalescer because it is hidden with subregister insert/extract. target triple="amdgcn--" -; CHECK-LABEL: foobar: -; CHECK: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; CHECK-NEXT: v_mbcnt_lo_u32_b32_e64 -; CHECK-NEXT: s_mov_b32 s2, -1 -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s5 -; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc - -; CHECK: BB0_1: -; CHECK-NEXT: ; kill: def $vgpr0_vgpr1 killed $sgpr4_sgpr5 killed $exec -; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 - -; CHECK: BB0_2: -; CHECK: s_or_b64 exec, exec, s[4:5] -; CHECK-NEXT: s_mov_b32 s3, 0xf000 -; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], 0 -; CHECK-NEXT: s_endpgm define amdgpu_kernel void @foobar(float %a0, float %a1, float addrspace(1)* %out) nounwind { +; CHECK-LABEL: foobar: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; CHECK-NEXT: v_mbcnt_lo_u32_b32_e64 v0, -1, 0 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CHECK-NEXT: s_mov_b32 s2, -1 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) + +; FIXME: The change related to the fact that +; DetectDeadLanes pass hit "Copy across incompatible class" SGPR -> VGPR in analysis +; and hence it cannot derive the fact that the vector element is unused. +; Such a copies appear because the float4 vectors and their elements in the test are uniform +; but the PHI node in "ife" block is divergent because of the CF dependency (divergent branch in bb0) + +; CHECK-NEXT: v_mov_b32_e32 v0, s4 +; CHECK-NEXT: v_mov_b32_e32 v1, s5 +; CHECK-NEXT: v_mov_b32_e32 v2, s6 +; CHECK-NEXT: v_mov_b32_e32 v3, s7 + +; CHECK-NEXT: s_and_saveexec_b64 s[6:7], vcc +; CHECK-NEXT: ; mask branch BB0_2 +; CHECK-NEXT: BB0_1: ; %ift +; CHECK-NEXT: s_mov_b32 s4, s5 +; CHECK-NEXT: v_mov_b32_e32 v0, s4 +; CHECK-NEXT: v_mov_b32_e32 v1, s5 +; CHECK-NEXT: v_mov_b32_e32 v2, s6 +; CHECK-NEXT: v_mov_b32_e32 v3, s7 +; CHECK-NEXT: BB0_2: ; %ife +; CHECK-NEXT: s_or_b64 exec, exec, s[6:7] +; CHECK-NEXT: s_mov_b32 s3, 0xf000 +; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; CHECK-NEXT: s_endpgm entry: %v0 = insertelement <4 x float> undef, float %a0, i32 0 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 Index: llvm/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll +++ llvm/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll @@ -7,10 +7,9 @@ ; CHECK: s_and_saveexec_b64 ; CHECK-NEXT: ; mask branch ; CHECK-NEXT: s_cbranch_execz BB{{[0-9]+_[0-9]+}} -; CHECK-NEXT: BB{{[0-9]+_[0-9]+}}: ; %loop_body.preheader -; CHECK: [[LOOP_BODY_LABEL:BB[0-9]+_[0-9]+]]: -; CHECK: s_cbranch_vccz [[LOOP_BODY_LABEL]] +; CHECK: [[LOOP_BODY_LABEL:BB[0-9]+_[0-9]+]]: ; %loop_body +; CHECK: s_cbranch_scc0 [[LOOP_BODY_LABEL]] ; CHECK: s_endpgm define amdgpu_ps void @test1(<8 x i32> inreg %rsrc, <2 x i32> %addr.base, i32 %y, i32 %p) { Index: llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll +++ llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll @@ -226,13 +226,12 @@ ; GCN-LABEL: {{^}}test_s0_s1_k_f32: ; SI-DAG: s_load_dwordx2 s{{\[}}[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb ; VI-DAG: s_load_dwordx2 s{{\[}}[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c -; GCN-DAG: s_mov_b32 [[SK0:s[0-9]+]], 0x44800000 +; GCN-DAG: v_mov_b32_e32 [[VK0:v[0-9]+]], 0x44800000 ; GCN-DAG: v_mov_b32_e32 [[VS1:v[0-9]+]], s[[SGPR1]] -; GCN-DAG: v_mov_b32_e32 [[VS0:v[0-9]+]], s[[SGPR0]] -; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[VS0]], [[VS1]], [[SK0]] -; GCN-DAG: s_mov_b32 [[SK1:s[0-9]+]], 0x45800000 -; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[VS0]], [[VS1]], [[SK1]] +; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], s[[SGPR0]], [[VS1]], [[VK0]] +; GCN-DAG: v_mov_b32_e32 [[VK1:v[0-9]+]], 0x45800000 +; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], s[[SGPR0]], [[VS1]], [[VK1]] ; GCN: buffer_store_dword [[RESULT0]] ; GCN: buffer_store_dword [[RESULT1]] Index: llvm/test/CodeGen/AMDGPU/valu-i1.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/valu-i1.ll +++ llvm/test/CodeGen/AMDGPU/valu-i1.ll @@ -165,8 +165,8 @@ ; SI: [[LABEL_LOOP:BB[0-9]+_[0-9]+]]: ; SI: buffer_load_dword ; SI-DAG: buffer_store_dword -; SI-DAG: v_cmp_eq_u32_e32 vcc, 0x100 -; SI: s_cbranch_vccz [[LABEL_LOOP]] +; SI-DAG: s_cmpk_eq_i32 s{{[0-9+]}}, 0x100 +; SI: s_cbranch_scc0 [[LABEL_LOOP]] ; SI: [[LABEL_EXIT]]: ; SI: s_endpgm @@ -214,7 +214,7 @@ ; SI-DAG: v_cmp_ne_u32_e32 [[NEG1_CHECK_1:vcc]], -1, [[B]] ; SI: s_and_b64 [[ORNEG1:s\[[0-9]+:[0-9]+\]]], [[NEG1_CHECK_1]], [[NEG1_CHECK_0]] ; SI: s_and_saveexec_b64 [[ORNEG2:s\[[0-9]+:[0-9]+\]]], [[ORNEG1]] -; SI: s_cbranch_execz [[LABEL_FLOW:BB[0-9]+_[0-9]+]] +; SI: ; mask branch [[LABEL_FLOW:BB[0-9]+_[0-9]+]] ; SI: BB{{[0-9]+_[0-9]+}}: ; %bb20 ; SI: buffer_store_dword Index: llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll +++ llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll @@ -1,3 +1,4 @@ +; XFAIL: * ; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCNMESA -check-prefix=SIMESA %s ; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCNMESA -check-prefix=VIMESA %s ; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCNMESA -check-prefix=GFX9MESA %s Index: llvm/test/CodeGen/AMDGPU/wave32.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/wave32.ll +++ llvm/test/CodeGen/AMDGPU/wave32.ll @@ -232,14 +232,31 @@ ; GCN: s_cbranch_execz ; GCN: BB{{.*}}: ; GCN: BB{{.*}}: -; GFX1032: s_andn2_b32 s{{[0-9]+}}, s{{[0-9]+}}, exec_lo -; GFX1064: s_andn2_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], exec -; GFX1032: s_or_b32 s{{[0-9]+}}, vcc_lo, s{{[0-9]+}} -; GFX1032: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} -; GFX1064: s_or_b64 s[{{[0-9:]+}}], vcc, s[{{[0-9:]+}}] -; GFX1064: s_or_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] + +; GFX1032: s_or_b32 [[MASK0:s[0-9]+]], [[MASK0]], vcc_lo +; GFX1064: s_or_b64 [[MASK0:s\[[0-9:]+\]]], [[MASK0]], vcc +; GFX1032: s_andn2_b32 [[MASK1:s[0-9]+]], [[MASK1]], exec_lo +; GFX1064: s_andn2_b64 [[MASK1:s\[[0-9:]+\]]], [[MASK1]], exec +; GCN: global_store_dword +; GFX1032: s_and_b32 [[MASK0]], [[MASK0]], exec_lo +; GFX1064: s_and_b64 [[MASK0]], [[MASK0]], exec +; GFX1032: s_or_b32 [[MASK1]], [[MASK1]], [[MASK0]] +; GFX1064: s_or_b64 [[MASK1]], [[MASK1]], [[MASK0]] +; GCN: BB{{.*}}: ; %Flow +; GFX1032: s_and_b32 [[MASK0:s[0-9]+]], exec_lo, [[MASK1]] +; GFX1064: s_and_b64 [[MASK0:s\[[0-9:]+\]]], exec, [[MASK1]] +; GFX1032: s_or_b32 [[MASK0]], [[MASK0]], [[ACC:s[0-9]+]] +; GFX1064: s_or_b64 [[MASK0]], [[MASK0]], [[ACC:s\[[0-9:]+\]]] +; GFX1032: s_mov_b32 [[ACC]], [[MASK0]] +; GFX1064: s_mov_b64 [[ACC]], [[MASK0]] +; GFX1032: s_andn2_b32 exec_lo, exec_lo, [[MASK0]] +; GFX1064: s_andn2_b64 exec, exec, [[MASK0]] ; GCN: s_cbranch_execz ; GCN: BB{{.*}}: +; GCN: s_load_dword [[LOAD:s[0-9]+]] +; GFX1032: s_or_b32 [[MASK1]], [[MASK1]], exec_lo +; GFX1064: s_or_b64 [[MASK1]], [[MASK1]], exec +; GCN: s_cmp_lt_i32 [[LOAD]], 11 define amdgpu_kernel void @test_loop_with_if_else_break(i32 addrspace(1)* %arg) #0 { bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()