Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -384,6 +384,17 @@
 }
 
 void GCNPassConfig::addPreEmitPass() {
+
+  // The hazard recognizer that runs as part of the post-RA scheduler does not
+  // guarantee that it can handle all hazards correctly. This is because if
+  // there are multiple scheduling regions in a basic block, the regions are
+  // scheduled bottom up, so when we begin to schedule a region we don't know
+  // what instructions were emitted directly before it.
+  //
+  // Here we add a stand-alone hazard recognizer pass which can handle all
+  // cases.
+  addPass(&PostRAHazardRecognizerID);
+
   addPass(createSIInsertWaitsPass(), false);
   addPass(createSIShrinkInstructionsPass());
   addPass(createSILowerControlFlowPass(), false);
Index: llvm/trunk/lib/Target/AMDGPU/CMakeLists.txt
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/CMakeLists.txt
+++ llvm/trunk/lib/Target/AMDGPU/CMakeLists.txt
@@ -47,6 +47,7 @@
   AMDGPUInstrInfo.cpp
   AMDGPUPromoteAlloca.cpp
   AMDGPURegisterInfo.cpp
+  GCNHazardRecognizer.cpp
   R600ClauseMergePass.cpp
   R600ControlFlowFinalizer.cpp
   R600EmitClauseMarkers.cpp
Index: llvm/trunk/lib/Target/AMDGPU/GCNHazardRecognizer.h
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/GCNHazardRecognizer.h
+++ llvm/trunk/lib/Target/AMDGPU/GCNHazardRecognizer.h
@@ -0,0 +1,59 @@
+//===-- GCNHazardRecognizers.h - GCN Hazard Recognizers ---------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines hazard recognizers for scheduling on GCN processors.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPUHAZARDRECOGNIZERS_H
+#define LLVM_LIB_TARGET_AMDGPUHAZARDRECOGNIZERS_H
+
+#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
+#include <functional>
+#include <list>
+
+namespace llvm {
+
+class MachineFunction;
+class MachineInstr;
+class ScheduleDAG;
+class SIInstrInfo;
+
+class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
+
+  // This variable stores the instruction that has been emitted this cycle.
+  // It will be added to EmittedInstrs, when AdvanceCycle() or RecedeCycle() is
+  // called.
+  MachineInstr *CurrCycleInstr;
+  std::list<MachineInstr*> EmittedInstrs;
+  const MachineFunction &MF;
+
+  int getWaitStatesSinceDef(unsigned Reg,
+                            std::function<bool(MachineInstr*)> IsHazardDef =
+                                [](MachineInstr*) { return true; });
+
+  int checkSMRDHazards(MachineInstr *SMRD);
+  int checkVMEMHazards(MachineInstr* VMEM);
+public:
+  GCNHazardRecognizer(const MachineFunction &MF);
+  // We can only issue one instruction per cycle.
+  bool atIssueLimit() const override { return true; }
+  void EmitInstruction(SUnit *SU) override;
+  void EmitInstruction(MachineInstr *MI) override;
+  HazardType getHazardType(SUnit *SU, int Stalls) override;
+  void EmitNoop() override;
+  unsigned PreEmitNoops(SUnit *SU) override;
+  unsigned PreEmitNoops(MachineInstr *) override;
+  void AdvanceCycle() override;
+  void RecedeCycle() override;
+};
+
+} // end namespace llvm
+
+#endif //LLVM_LIB_TARGET_AMDGPUHAZARDRECOGNIZERS_H
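For context, the recognizer above is meant to be driven top down, one instruction per cycle. The sketch below is illustrative only and is not part of this patch (the actual no-op insertion into the block is left as a comment); it shows one way a stand-alone pass could consume the interface: query PreEmitNoops() before each instruction, account for the padding with EmitNoop(), then record the instruction with EmitInstruction() and AdvanceCycle().

// Illustrative driver loop (not part of this patch): one way a stand-alone
// post-RA pass could consume the GCNHazardRecognizer interface.
#include "GCNHazardRecognizer.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"

using namespace llvm;

static void runHazardRecognizerOn(MachineFunction &MF) {
  GCNHazardRecognizer HazardRec(MF);
  for (MachineBasicBlock &MBB : MF) {
    for (MachineInstr &MI : MBB) {
      // Ask how many wait states are still outstanding for MI's hazards.
      unsigned Noops = HazardRec.PreEmitNoops(&MI);
      for (unsigned I = 0; I != Noops; ++I) {
        // Insert a real s_nop before MI here (omitted in this sketch), then
        // tell the recognizer that one wait state has elapsed.
        HazardRec.EmitNoop();
        HazardRec.AdvanceCycle();
      }
      // Record MI as the instruction issued this cycle and advance; since
      // atIssueLimit() is always true, every instruction ends its cycle.
      HazardRec.EmitInstruction(&MI);
      HazardRec.AdvanceCycle();
    }
  }
}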
Index: llvm/trunk/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ llvm/trunk/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -0,0 +1,182 @@
+//===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements hazard recognizers for scheduling on GCN processors.
+//
+//===----------------------------------------------------------------------===//
+
+#include "GCNHazardRecognizer.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// Hazard Recognizer Implementation
+//===----------------------------------------------------------------------===//
+
+GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) :
+  CurrCycleInstr(nullptr),
+  MF(MF) {
+  MaxLookAhead = 5;
+}
+
+void GCNHazardRecognizer::EmitInstruction(SUnit *SU) {
+  EmitInstruction(SU->getInstr());
+}
+
+void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) {
+  CurrCycleInstr = MI;
+}
+
+ScheduleHazardRecognizer::HazardType
+GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
+  const SIInstrInfo *TII =
+      static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
+  MachineInstr *MI = SU->getInstr();
+
+  if (TII->isSMRD(*MI) && checkSMRDHazards(MI) > 0)
+    return NoopHazard;
+
+  if (TII->isVMEM(*MI) && checkVMEMHazards(MI) > 0)
+    return NoopHazard;
+
+  return NoHazard;
+}
+
+unsigned GCNHazardRecognizer::PreEmitNoops(SUnit *SU) {
+  return PreEmitNoops(SU->getInstr());
+}
+
+unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
+  const SIInstrInfo *TII =
+      static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
+
+  if (TII->isSMRD(*MI))
+    return std::max(0, checkSMRDHazards(MI));
+
+  if (TII->isVMEM(*MI))
+    return std::max(0, checkVMEMHazards(MI));
+
+  return 0;
+}
+
+void GCNHazardRecognizer::EmitNoop() {
+  EmittedInstrs.push_front(nullptr);
+}
+
+void GCNHazardRecognizer::AdvanceCycle() {
+
+  // When the scheduler detects a stall, it will call AdvanceCycle() without
+  // emitting any instructions.
+  if (!CurrCycleInstr)
+    return;
+
+  const SIInstrInfo *TII =
+      static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
+  unsigned NumWaitStates = TII->getNumWaitStates(*CurrCycleInstr);
+
+  // Keep track of emitted instructions
+  EmittedInstrs.push_front(CurrCycleInstr);
+
+  // Add a nullptr for each additional wait state after the first. Make sure
+  // not to add more than getMaxLookAhead() items to the list, since we
+  // truncate the list to that size right after this loop.
+  for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead());
+       i < e; ++i) {
+    EmittedInstrs.push_front(nullptr);
+  }
+
+  // getMaxLookahead() is the largest number of wait states we will ever need
+  // to insert, so there is no point in keeping track of more than that many
+  // wait states.
+  EmittedInstrs.resize(getMaxLookAhead());
+
+  CurrCycleInstr = nullptr;
+}
+
+void GCNHazardRecognizer::RecedeCycle() {
+  llvm_unreachable("hazard recognizer does not support bottom-up scheduling.");
+}
+
+//===----------------------------------------------------------------------===//
+// Helper Functions
+//===----------------------------------------------------------------------===//
+
+int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
+                              std::function<bool(MachineInstr*)> IsHazardDef) {
+  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+
+  int WaitStates = -1;
+  for (MachineInstr *MI : EmittedInstrs) {
+    ++WaitStates;
+    if (!MI || !IsHazardDef(MI))
+      continue;
+    if (MI->modifiesRegister(Reg, TRI))
+      return WaitStates;
+  }
+  return std::numeric_limits<int>::max();
+}
+
+//===----------------------------------------------------------------------===//
+// No-op Hazard Detection
+//===----------------------------------------------------------------------===//
+
+int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
+  const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
+  const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(ST.getInstrInfo());
+
+  // This SMRD hazard only affects SI.
+  if (ST.getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS)
+    return 0;
+
+  // A read of an SGPR by an SMRD instruction requires 4 wait states when the
+  // SGPR was written by a VALU instruction.
+  int SmrdSgprWaitStates = 4;
+  int WaitStatesNeeded = 0;
+  auto IsHazardDefFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); };
+
+  for (const MachineOperand &Use : SMRD->uses()) {
+    if (!Use.isReg())
+      continue;
+    int WaitStatesNeededForUse =
+        SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn);
+    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
+  }
+  return WaitStatesNeeded;
+}
+
+int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) {
+  const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
+  const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(ST.getInstrInfo());
+
+  if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
+    return 0;
+
+  const SIRegisterInfo &TRI = TII->getRegisterInfo();
+
+  // A read of an SGPR by a VMEM instruction requires 5 wait states when the
+  // SGPR was written by a VALU instruction.
+  int VmemSgprWaitStates = 5;
+  int WaitStatesNeeded = 0;
+  auto IsHazardDefFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); };
+
+  for (const MachineOperand &Use : VMEM->uses()) {
+    if (!Use.isReg() || TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
+      continue;
+
+    int WaitStatesNeededForUse =
+        VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn);
+    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
+  }
+  return WaitStatesNeeded;
+}
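To make the arithmetic above concrete: EmittedInstrs is kept newest-first, so the index at which getWaitStatesSinceDef() finds the hazardous def is exactly the number of wait states that have already elapsed, and check*Hazards() reports whatever is still missing. The toy program below is illustrative only (the instruction history and the substring test standing in for "is a VALU def of s0" are made up) and walks through one SI case.

// Toy model (not part of this patch) of the wait-state arithmetic used by
// checkSMRDHazards()/checkVMEMHazards().
#include <algorithm>
#include <iostream>
#include <string>
#include <vector>

int main() {
  // Newest-first history, mirroring EmittedInstrs: index i means i wait
  // states have elapsed since that instruction was issued.
  std::vector<std::string> EmittedInstrs = {
      "s_mov_b32 s4, 0",            // 0 wait states ago
      "s_nop 0",                    // 1 wait state ago
      "v_readlane_b32 s0, v1, 0"};  // 2 wait states ago, VALU def of s0

  // getWaitStatesSinceDef(s0): walk the list until the hazardous def.
  int WaitStatesSinceDef = -1;
  for (const std::string &I : EmittedInstrs) {
    ++WaitStatesSinceDef;
    if (I.find("s0,") != std::string::npos) // toy stand-in for the real check
      break;                                // "a VALU instruction defines s0"
  }

  // checkSMRDHazards(): an SMRD read of that SGPR needs 4 wait states on SI,
  // and only 4 - 2 = 2 of them are still outstanding.
  const int SmrdSgprWaitStates = 4;
  int NoopsNeeded = std::max(0, SmrdSgprWaitStates - WaitStatesSinceDef);
  std::cout << "PreEmitNoops would return " << NoopsNeeded << "\n"; // prints 2
  return 0;
}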
Index: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h
+++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h
@@ -169,6 +169,14 @@
     return get(Opcode).TSFlags & SIInstrFlags::VALU;
   }
 
+  static bool isVMEM(const MachineInstr &MI) {
+    return isMUBUF(MI) || isMTBUF(MI) || isMIMG(MI);
+  }
+
+  bool isVMEM(uint16_t Opcode) const {
+    return isMUBUF(Opcode) || isMTBUF(Opcode) || isMIMG(Opcode);
+  }
+
   static bool isSOP1(const MachineInstr &MI) {
     return MI.getDesc().TSFlags & SIInstrFlags::SOP1;
   }
@@ -440,6 +448,12 @@
   void insertWaitStates(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
                         int Count) const;
 
+  void insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const;
+
+  /// \brief Return the number of wait states that result from executing this
+  /// instruction.
+  unsigned getNumWaitStates(const MachineInstr &MI) const;
+
   /// \brief Returns the operand named \p Op. If \p MI does not have an
   /// operand named \c Op, this function returns nullptr.
   LLVM_READONLY
@@ -472,6 +486,13 @@
   ArrayRef<std::pair<unsigned, const char *>>
   getSerializableTargetIndices() const override;
 
+  ScheduleHazardRecognizer *
+  CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
+                                     const ScheduleDAG *DAG) const override;
+
+  ScheduleHazardRecognizer *
+  CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const override;
+
 };
 
 namespace AMDGPU {
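The new getNumWaitStates() hook gives the recognizer a way to credit padding that is already present in the instruction stream: an s_nop with immediate N idles for N + 1 wait states, while (per the FIXME in the implementation below) every other instruction is assumed to account for exactly one. A minimal model of that contract, using a made-up ToyInstr type:

// Minimal model (not part of this patch) of the getNumWaitStates() contract.
#include <cassert>
#include <cstdint>

struct ToyInstr {
  bool IsSNop;   // is this an s_nop?
  uint16_t Imm;  // s_nop immediate; only meaningful when IsSNop is true
};

static unsigned getNumWaitStates(const ToyInstr &MI) {
  // "s_nop N" provides N + 1 wait states; anything else counts as one.
  return MI.IsSNop ? MI.Imm + 1u : 1u;
}

int main() {
  assert(getNumWaitStates({/*IsSNop=*/true, /*Imm=*/3}) == 4);  // s_nop 3
  assert(getNumWaitStates({/*IsSNop=*/false, /*Imm=*/0}) == 1); // e.g. a VALU op
  return 0;
}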
Index: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -15,11 +15,13 @@
 
 #include "SIInstrInfo.h"
 #include "AMDGPUTargetMachine.h"
+#include "GCNHazardRecognizer.h"
 #include "SIDefines.h"
 #include "SIMachineFunctionInfo.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
 #include "llvm/IR/Function.h"
 #include "llvm/CodeGen/RegisterScavenging.h"
 #include "llvm/MC/MCInstrDesc.h"
@@ -816,6 +818,20 @@
   }
 }
 
+void SIInstrInfo::insertNoop(MachineBasicBlock &MBB,
+                             MachineBasicBlock::iterator MI) const {
+  insertWaitStates(MBB, MI, 1);
+}
+
+unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) const {
+  switch (MI.getOpcode()) {
+  default: return 1; // FIXME: Do wait states equal cycles?
+
+  case AMDGPU::S_NOP:
+    return MI.getOperand(0).getImm() + 1;
+  }
+}
+
 bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
   MachineBasicBlock &MBB = *MI->getParent();
   DebugLoc DL = MBB.findDebugLoc(MI);
@@ -1188,8 +1204,11 @@
 
   if (getMemOpBaseRegImmOfs(MIa, BaseReg0, Offset0, &RI) &&
       getMemOpBaseRegImmOfs(MIb, BaseReg1, Offset1, &RI)) {
-    assert(MIa->hasOneMemOperand() && MIb->hasOneMemOperand() &&
-           "read2 / write2 not expected here yet");
+
+    if (!MIa->hasOneMemOperand() || !MIb->hasOneMemOperand()) {
+      // FIXME: Handle ds_read2 / ds_write2.
+      return false;
+    }
     unsigned Width0 = (*MIa->memoperands_begin())->getSize();
     unsigned Width1 = (*MIb->memoperands_begin())->getSize();
     if (BaseReg0 == BaseReg1 &&
@@ -2964,3 +2983,18 @@
     {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
   return makeArrayRef(TargetIndices);
 }
+
+/// This is used by the post-RA scheduler (PostRASchedulerList.cpp). The
+/// post-RA version of misched uses CreateTargetMIHazardRecognizer.
+ScheduleHazardRecognizer *
+SIInstrInfo::CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
+                                                const ScheduleDAG *DAG) const {
+  return new GCNHazardRecognizer(DAG->MF);
+}
+
+/// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer
+/// pass.
+ScheduleHazardRecognizer *
+SIInstrInfo::CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const {
+  return new GCNHazardRecognizer(MF);
+}
Index: llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.h
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.h
+++ llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -188,6 +188,8 @@
   unsigned getSGPR32PressureSet() const { return SGPR32SetID; };
   unsigned getVGPR32PressureSet() const { return VGPR32SetID; };
 
+  bool isVGPR(const MachineRegisterInfo &MRI, unsigned Reg) const;
+
 private:
   void buildScratchLoadStore(MachineBasicBlock::iterator MI,
                              unsigned LoadStoreOp, unsigned Value,
Index: llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -596,22 +596,6 @@
       }
     }
 
-      // TODO: only do this when it is needed
-      switch (MF->getSubtarget<AMDGPUSubtarget>().getGeneration()) {
-      case AMDGPUSubtarget::SOUTHERN_ISLANDS:
-        // "VALU writes SGPR" -> "SMRD reads that SGPR" needs 4 wait states
-        // ("S_NOP 3") on SI
-        TII->insertWaitStates(*MBB, MI, 4);
-        break;
-      case AMDGPUSubtarget::SEA_ISLANDS:
-        break;
-      default: // VOLCANIC_ISLANDS and later
-        // "VALU writes SGPR -> VMEM reads that SGPR" needs 5 wait states
-        // ("S_NOP 4") on VI and later. This also applies to VALUs which write
-        // VCC, but we're unlikely to see VMEM use VCC.
- TII->insertWaitStates(*MBB, MI, 5); - } - MI->eraseFromParent(); break; } @@ -991,3 +975,14 @@ } } } + +bool SIRegisterInfo::isVGPR(const MachineRegisterInfo &MRI, + unsigned Reg) const { + const TargetRegisterClass *RC; + if (TargetRegisterInfo::isVirtualRegister(Reg)) + RC = MRI.getRegClass(Reg); + else + RC = getPhysRegClass(Reg); + + return hasVGPRs(RC); +} Index: llvm/trunk/lib/Target/AMDGPU/SISchedule.td =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SISchedule.td +++ llvm/trunk/lib/Target/AMDGPU/SISchedule.td @@ -42,6 +42,7 @@ class SISchedMachineModel : SchedMachineModel { let CompleteModel = 0; let IssueWidth = 1; + let PostRAScheduler = 1; } def SIFullSpeedModel : SISchedMachineModel; Index: llvm/trunk/test/CodeGen/AMDGPU/addrspacecast.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/addrspacecast.ll +++ llvm/trunk/test/CodeGen/AMDGPU/addrspacecast.ll @@ -155,8 +155,8 @@ } ; HSA-LABEL: {{^}}cast_neg1_group_to_flat_addrspacecast: -; HSA: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}} ; HSA: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} +; HSA: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}} ; HSA: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}} ; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, v[[K]] define void @cast_neg1_group_to_flat_addrspacecast() #0 { @@ -226,8 +226,8 @@ ; Check for prologue initializing special SGPRs pointing to scratch. ; HSA-LABEL: {{^}}store_flat_scratch: -; HSA: s_mov_b32 flat_scratch_lo, s9 -; HSA: s_add_u32 [[ADD:s[0-9]+]], s8, s11 +; HSA-DAG: s_mov_b32 flat_scratch_lo, s9 +; HSA-DAG: s_add_u32 [[ADD:s[0-9]+]], s8, s11 ; HSA: s_lshr_b32 flat_scratch_hi, [[ADD]], 8 ; HSA: flat_store_dword ; HSA: s_barrier Index: llvm/trunk/test/CodeGen/AMDGPU/and.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/and.ll +++ llvm/trunk/test/CodeGen/AMDGPU/and.ll @@ -212,10 +212,10 @@ } ; FUNC-LABEL: {{^}}s_and_multi_use_inline_imm_i64: -; SI: s_load_dwordx2 ; SI: s_load_dword [[A:s[0-9]+]] ; SI: s_load_dword [[B:s[0-9]+]] ; SI: s_load_dwordx2 +; SI: s_load_dwordx2 ; SI-NOT: and ; SI: s_lshl_b32 [[A]], [[A]], 1 ; SI: s_lshl_b32 [[B]], [[B]], 1 Index: llvm/trunk/test/CodeGen/AMDGPU/captured-frame-index.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/captured-frame-index.ll +++ llvm/trunk/test/CodeGen/AMDGPU/captured-frame-index.ll @@ -2,9 +2,9 @@ ; GCN-LABEL: {{^}}stored_fi_to_lds: ; GCN: s_load_dword [[LDSPTR:s[0-9]+]] -; GCN: v_mov_b32_e32 [[ZERO0:v[0-9]+]], 0{{$}} ; GCN: v_mov_b32_e32 [[ZERO1:v[0-9]+]], 0{{$}} ; GCN: buffer_store_dword v{{[0-9]+}}, [[ZERO1]] +; GCN: v_mov_b32_e32 [[ZERO0:v[0-9]+]], 0{{$}} ; GCN: v_mov_b32_e32 [[VLDSPTR:v[0-9]+]], [[LDSPTR]] ; GCN: ds_write_b32 [[VLDSPTR]], [[ZERO0]] define void @stored_fi_to_lds(float* addrspace(3)* %ptr) #0 { @@ -140,16 +140,16 @@ } ; GCN-LABEL: {{^}}stored_fi_to_global_huge_frame_offset: +; GCN: s_add_i32 [[BASE_1_OFF_0:s[0-9]+]], 0, 0x3ffc ; GCN: v_mov_b32_e32 [[BASE_0:v[0-9]+]], 0{{$}} -; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen +; GCN: buffer_store_dword [[BASE_0]], v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen -; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}} -; GCN-DAG: s_add_i32 [[BASE_1_OFF_0:s[0-9]+]], 0, 0x3ffc -; GCN-DAG: v_mov_b32_e32 [[V_BASE_1_OFF_0:v[0-9]+]], [[BASE_1_OFF_0]] +; GCN: v_mov_b32_e32 [[V_BASE_1_OFF_0:v[0-9]+]], [[BASE_1_OFF_0]] +; GCN: 
v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}} +; GCN: s_add_i32 [[BASE_1_OFF_1:s[0-9]+]], 0, 56 ; GCN: buffer_store_dword [[K]], [[V_BASE_1_OFF_0]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}} -; GCN-DAG: s_add_i32 [[BASE_1_OFF_1:s[0-9]+]], 0, 56 -; GCN-DAG: v_mov_b32_e32 [[V_BASE_1_OFF_1:v[0-9]+]], [[BASE_1_OFF_1]] +; GCN: v_mov_b32_e32 [[V_BASE_1_OFF_1:v[0-9]+]], [[BASE_1_OFF_1]] ; GCN: buffer_store_dword [[V_BASE_1_OFF_1]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} define void @stored_fi_to_global_huge_frame_offset(i32* addrspace(1)* %ptr) #0 { %tmp0 = alloca [4096 x i32] Index: llvm/trunk/test/CodeGen/AMDGPU/fract.f64.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/fract.f64.ll +++ llvm/trunk/test/CodeGen/AMDGPU/fract.f64.ll @@ -6,11 +6,11 @@ declare double @llvm.floor.f64(double) #0 ; FUNC-LABEL: {{^}}fract_f64: -; GCN: v_fract_f64_e32 [[FRC:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]] -; SI: v_mov_b32_e32 v[[UPLO:[0-9]+]], -1 -; SI: v_mov_b32_e32 v[[UPHI:[0-9]+]], 0x3fefffff -; SI: v_min_f64 v{{\[}}[[MINLO:[0-9]+]]:[[MINHI:[0-9]+]]], v{{\[}}[[UPLO]]:[[UPHI]]], [[FRC]] -; SI: v_cmp_class_f64_e64 [[COND:s\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO]]:[[HI]]], 3 +; GCN-DAG: v_fract_f64_e32 [[FRC:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]] +; SI-DAG: v_mov_b32_e32 v[[UPLO:[0-9]+]], -1 +; SI-DAG: v_mov_b32_e32 v[[UPHI:[0-9]+]], 0x3fefffff +; SI-DAG: v_min_f64 v{{\[}}[[MINLO:[0-9]+]]:[[MINHI:[0-9]+]]], v{{\[}}[[UPLO]]:[[UPHI]]], [[FRC]] +; SI-DAG: v_cmp_class_f64_e64 [[COND:s\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO]]:[[HI]]], 3 ; SI: v_cndmask_b32_e64 v[[RESLO:[0-9]+]], v[[MINLO]], v[[LO]], [[COND]] ; SI: v_cndmask_b32_e64 v[[RESHI:[0-9]+]], v[[MINHI]], v[[HI]], [[COND]] ; SI: buffer_store_dwordx2 v{{\[}}[[RESLO]]:[[RESHI]]] @@ -24,11 +24,11 @@ } ; FUNC-LABEL: {{^}}fract_f64_neg: -; GCN: v_fract_f64_e64 [[FRC:v\[[0-9]+:[0-9]+\]]], -v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]] -; SI: v_mov_b32_e32 v[[UPLO:[0-9]+]], -1 -; SI: v_mov_b32_e32 v[[UPHI:[0-9]+]], 0x3fefffff -; SI: v_min_f64 v{{\[}}[[MINLO:[0-9]+]]:[[MINHI:[0-9]+]]], v{{\[}}[[UPLO]]:[[UPHI]]], [[FRC]] -; SI: v_cmp_class_f64_e64 [[COND:s\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO]]:[[HI]]], 3 +; GCN-DAG: v_fract_f64_e64 [[FRC:v\[[0-9]+:[0-9]+\]]], -v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]] +; SI-DAG: v_mov_b32_e32 v[[UPLO:[0-9]+]], -1 +; SI-DAG: v_mov_b32_e32 v[[UPHI:[0-9]+]], 0x3fefffff +; SI-DAG: v_min_f64 v{{\[}}[[MINLO:[0-9]+]]:[[MINHI:[0-9]+]]], v{{\[}}[[UPLO]]:[[UPHI]]], [[FRC]] +; SI-DAG: v_cmp_class_f64_e64 [[COND:s\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO]]:[[HI]]], 3 ; SI: v_cndmask_b32_e64 v[[RESLO:[0-9]+]], v[[MINLO]], v[[LO]], [[COND]] ; SI: v_cndmask_b32_e64 v[[RESHI:[0-9]+]], v[[MINHI]], v[[HI]], [[COND]] ; SI: buffer_store_dwordx2 v{{\[}}[[RESLO]]:[[RESHI]]] @@ -43,11 +43,11 @@ } ; FUNC-LABEL: {{^}}fract_f64_neg_abs: -; GCN: v_fract_f64_e64 [[FRC:v\[[0-9]+:[0-9]+\]]], -|v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]]| -; SI: v_mov_b32_e32 v[[UPLO:[0-9]+]], -1 -; SI: v_mov_b32_e32 v[[UPHI:[0-9]+]], 0x3fefffff -; SI: v_min_f64 v{{\[}}[[MINLO:[0-9]+]]:[[MINHI:[0-9]+]]], v{{\[}}[[UPLO]]:[[UPHI]]], [[FRC]] -; SI: v_cmp_class_f64_e64 [[COND:s\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO]]:[[HI]]], 3 +; GCN-DAG: v_fract_f64_e64 [[FRC:v\[[0-9]+:[0-9]+\]]], -|v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]]| +; SI-DAG: v_mov_b32_e32 v[[UPLO:[0-9]+]], -1 +; SI-DAG: v_mov_b32_e32 v[[UPHI:[0-9]+]], 0x3fefffff +; SI-DAG: v_min_f64 v{{\[}}[[MINLO:[0-9]+]]:[[MINHI:[0-9]+]]], v{{\[}}[[UPLO]]:[[UPHI]]], [[FRC]] +; SI-DAG: v_cmp_class_f64_e64 
[[COND:s\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO]]:[[HI]]], 3 ; SI: v_cndmask_b32_e64 v[[RESLO:[0-9]+]], v[[MINLO]], v[[LO]], [[COND]] ; SI: v_cndmask_b32_e64 v[[RESHI:[0-9]+]], v[[MINHI]], v[[HI]], [[COND]] ; SI: buffer_store_dwordx2 v{{\[}}[[RESLO]]:[[RESHI]]] Index: llvm/trunk/test/CodeGen/AMDGPU/half.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/half.ll +++ llvm/trunk/test/CodeGen/AMDGPU/half.ll @@ -396,10 +396,10 @@ ; GCN-LABEL: {{^}}global_extload_v3f16_to_v3f64: ; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]] -; GCN: v_cvt_f32_f16_e32 -; GCN: v_cvt_f32_f16_e32 +; GCN-DAG: v_cvt_f32_f16_e32 ; GCN-DAG: v_lshrrev_b32_e32 {{v[0-9]+}}, 16, {{v[0-9]+}} ; GCN: v_cvt_f32_f16_e32 +; GCN: v_cvt_f32_f16_e32 ; GCN-NOT: v_cvt_f32_f16 ; GCN: v_cvt_f64_f32_e32 Index: llvm/trunk/test/CodeGen/AMDGPU/insert_vector_elt.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/insert_vector_elt.ll +++ llvm/trunk/test/CodeGen/AMDGPU/insert_vector_elt.ll @@ -208,10 +208,10 @@ ; SI-DAG: s_lshl_b32 [[SCALEDIDX:s[0-9]+]], [[IDX]], 1{{$}} ; SI-DAG: v_mov_b32_e32 [[ELT0:v[0-9]+]], 0{{$}} -; SI: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}} -; SI: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}} -; SI: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}} -; SI: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}} +; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}} +; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}} +; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}} +; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}} ; SI: s_mov_b32 m0, [[SCALEDIDX]] ; SI: v_movreld_b32_e32 v{{[0-9]+}}, [[ELT0]] Index: llvm/trunk/test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll +++ llvm/trunk/test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll @@ -7,8 +7,8 @@ ; from constant/invariant memory. 
; GCN-LABEL: {{^}}test_merge_store_constant_i16_invariant_global_pointer_load: -; GCN: buffer_load_dwordx2 [[PTR:v\[[0-9]+:[0-9]+\]]], -; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x1c8007b +; GCN-DAG: buffer_load_dwordx2 [[PTR:v\[[0-9]+:[0-9]+\]]], +; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x1c8007b ; GCN: buffer_store_dword [[K]], [[PTR]] define void @test_merge_store_constant_i16_invariant_global_pointer_load(i16 addrspace(1)* addrspace(1)* dereferenceable(4096) nonnull %in) #0 { %ptr = load i16 addrspace(1)*, i16 addrspace(1)* addrspace(1)* %in, !invariant.load !0 Index: llvm/trunk/test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.clamped.f64.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.clamped.f64.ll +++ llvm/trunk/test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.clamped.f64.ll @@ -12,8 +12,8 @@ ; VI-DAG: s_mov_b32 s[[HIGH1:[0-9+]]], 0x7fefffff ; VI-DAG: s_mov_b32 s[[HIGH2:[0-9+]]], 0xffefffff ; VI-DAG: s_mov_b32 s[[LOW1:[0-9+]]], s[[ALLBITS]] -; VI: v_min_f64 v[0:1], [[RSQ]], s{{\[}}[[LOW1]]:[[HIGH1]]] ; VI: s_mov_b32 s[[LOW2:[0-9+]]], s[[ALLBITS]] +; VI: v_min_f64 v[0:1], [[RSQ]], s{{\[}}[[LOW1]]:[[HIGH1]]] ; VI: v_max_f64 v[0:1], v[0:1], s{{\[}}[[LOW2]]:[[HIGH2]]] define void @rsq_clamped_f64(double addrspace(1)* %out, double %src) nounwind { Index: llvm/trunk/test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.clamped.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.clamped.ll +++ llvm/trunk/test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.clamped.ll @@ -10,10 +10,10 @@ ; FUNC-LABEL: {{^}}rsq_clamped_f32: ; SI: v_rsq_clamp_f32_e32 -; VI: v_rsq_f32_e32 [[RSQ:v[0-9]+]], {{s[0-9]+}} -; VI: v_min_f32_e32 [[MIN:v[0-9]+]], 0x7f7fffff, [[RSQ]] +; VI-DAG: v_rsq_f32_e32 [[RSQ:v[0-9]+]], {{s[0-9]+}} +; VI-DAG: v_min_f32_e32 [[MIN:v[0-9]+]], 0x7f7fffff, [[RSQ]] ; TODO: this constant should be folded: -; VI: v_mov_b32_e32 [[MINFLT:v[0-9]+]], 0xff7fffff +; VI-DAG: v_mov_b32_e32 [[MINFLT:v[0-9]+]], 0xff7fffff ; VI: v_max_f32_e32 {{v[0-9]+}}, [[MIN]], [[MINFLT]] ; EG: RECIPSQRT_CLAMPED Index: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.atomic.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.atomic.ll +++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.atomic.ll @@ -3,6 +3,7 @@ ;CHECK-LABEL: {{^}}test1: ;CHECK: buffer_atomic_swap v0, off, s[0:3], 0 glc +;CHECK: s_movk_i32 [[SOFS:s[0-9]+]], 0x1fff ;CHECK: s_waitcnt vmcnt(0) ;CHECK: buffer_atomic_swap v0, v1, s[0:3], 0 idxen glc ;CHECK: s_waitcnt vmcnt(0) @@ -12,7 +13,6 @@ ;CHECK: s_waitcnt vmcnt(0) ;CHECK: buffer_atomic_swap v0, v2, s[0:3], 0 offen offset:42 glc ;CHECK-DAG: s_waitcnt vmcnt(0) -;CHECK-DAG: s_movk_i32 [[SOFS:s[0-9]+]], 0x1fff ;CHECK: buffer_atomic_swap v0, off, s[0:3], [[SOFS]] offset:1 glc ;CHECK: s_waitcnt vmcnt(0) ;CHECK: buffer_atomic_swap v0, off, s[0:3], 0{{$}} @@ -70,6 +70,7 @@ ;CHECK-LABEL: {{^}}test3: ;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 glc ;CHECK: s_waitcnt vmcnt(0) +;CHECK: s_movk_i32 [[SOFS:s[0-9]+]], 0x1fff ;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, v2, s[0:3], 0 idxen glc ;CHECK: s_waitcnt vmcnt(0) ;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, v3, s[0:3], 0 offen glc @@ -78,7 +79,6 @@ ;CHECK: s_waitcnt vmcnt(0) ;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, v3, s[0:3], 0 offen offset:42 glc ;CHECK-DAG: s_waitcnt vmcnt(0) -;CHECK-DAG: s_movk_i32 [[SOFS:s[0-9]+]], 0x1fff ;CHECK: 
buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[SOFS]] offset:1 glc define amdgpu_ps float @test3(<4 x i32> inreg %rsrc, i32 %data, i32 %cmp, i32 %vindex, i32 %voffset) { main_body: Index: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll +++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll @@ -8,10 +8,10 @@ ; SI: v_rsq_clamp_f32_e32 ; VI: s_load_dword [[SRC:s[0-9]+]] -; VI: v_rsq_f32_e32 [[RSQ:v[0-9]+]], [[SRC]] -; VI: v_min_f32_e32 [[MIN:v[0-9]+]], 0x7f7fffff, [[RSQ]] +; VI-DAG: v_rsq_f32_e32 [[RSQ:v[0-9]+]], [[SRC]] +; VI-DAG: v_min_f32_e32 [[MIN:v[0-9]+]], 0x7f7fffff, [[RSQ]] ; TODO: this constant should be folded: -; VI: v_mov_b32_e32 [[K:v[0-9]+]], 0xff7fffff +; VI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0xff7fffff ; VI: v_max_f32_e32 [[RESULT:v[0-9]+]], [[MIN]], [[K]] ; VI: buffer_store_dword [[RESULT]] define void @rsq_clamp_f32(float addrspace(1)* %out, float %src) #0 { @@ -30,8 +30,8 @@ ; VI-DAG: s_mov_b32 s[[HIGH2:[0-9+]]], 0xffefffff ; VI-DAG: s_mov_b32 s[[LOW1:[0-9+]]], s[[ALLBITS]] ; VI-DAG: v_rsq_f64_e32 [[RSQ:v\[[0-9]+:[0-9]+\]]], s[{{[0-9]+:[0-9]+}} -; VI: v_min_f64 v[0:1], [[RSQ]], s{{\[}}[[LOW1]]:[[HIGH1]]] ; VI: s_mov_b32 s[[LOW2:[0-9+]]], s[[ALLBITS]] +; VI: v_min_f64 v[0:1], [[RSQ]], s{{\[}}[[LOW1]]:[[HIGH1]]] ; VI: v_max_f64 v[0:1], v[0:1], s{{\[}}[[LOW2]]:[[HIGH2]]] define void @rsq_clamp_f64(double addrspace(1)* %out, double %src) #0 { %rsq_clamp = call double @llvm.amdgcn.rsq.clamp.f64(double %src) Index: llvm/trunk/test/CodeGen/AMDGPU/llvm.round.f64.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/llvm.round.f64.ll +++ llvm/trunk/test/CodeGen/AMDGPU/llvm.round.f64.ll @@ -13,7 +13,7 @@ ; FUNC-LABEL: {{^}}v_round_f64: ; SI: buffer_load_dwordx2 -; SI: v_bfe_u32 [[EXP:v[0-9]+]], v{{[0-9]+}}, 20, 11 +; SI-DAG: v_bfe_u32 [[EXP:v[0-9]+]], v{{[0-9]+}}, 20, 11 ; SI-DAG: v_not_b32_e32 ; SI-DAG: v_not_b32_e32 Index: llvm/trunk/test/CodeGen/AMDGPU/llvm.round.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/llvm.round.ll +++ llvm/trunk/test/CodeGen/AMDGPU/llvm.round.ll @@ -5,9 +5,9 @@ ; FUNC-LABEL: {{^}}round_f32: ; SI-DAG: s_load_dword [[SX:s[0-9]+]] ; SI-DAG: s_mov_b32 [[K:s[0-9]+]], 0x7fffffff -; SI: v_trunc_f32_e32 [[TRUNC:v[0-9]+]], [[SX]] -; SI: v_sub_f32_e32 [[SUB:v[0-9]+]], [[SX]], [[TRUNC]] -; SI: v_mov_b32_e32 [[VX:v[0-9]+]], [[SX]] +; SI-DAG: v_trunc_f32_e32 [[TRUNC:v[0-9]+]], [[SX]] +; SI-DAG: v_sub_f32_e32 [[SUB:v[0-9]+]], [[SX]], [[TRUNC]] +; SI-DAG: v_mov_b32_e32 [[VX:v[0-9]+]], [[SX]] ; SI: v_bfi_b32 [[COPYSIGN:v[0-9]+]], [[K]], 1.0, [[VX]] ; SI: v_cmp_le_f32_e64 vcc, 0.5, |[[SUB]]| ; SI: v_cndmask_b32_e32 [[SEL:v[0-9]+]], 0, [[VX]] Index: llvm/trunk/test/CodeGen/AMDGPU/mad-combine.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/mad-combine.ll +++ llvm/trunk/test/CodeGen/AMDGPU/mad-combine.ll @@ -491,8 +491,8 @@ ; SI-DENORM: v_fma_f32 [[TMP:v[0-9]+]], [[D]], [[E]], -[[C]] ; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[TMP]] -; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]] -; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP1:v[0-9]+]], [[B]], [[A]] +; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]] +; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP1:v[0-9]+]], [[B]], [[A]] ; SI-DENORM-SLOWFMAF: v_add_f32_e32 
[[TMP2:v[0-9]+]], [[TMP0]], [[TMP1]] ; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP2]] @@ -538,8 +538,8 @@ ; SI-DENORM: v_fma_f32 [[TMP:v[0-9]+]], -[[D]], [[E]], [[A]] ; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], -[[B]], [[C]], [[TMP]] -; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]] -; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP1:v[0-9]+]], [[C]], [[B]] +; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]] +; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP1:v[0-9]+]], [[C]], [[B]] ; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[TMP2:v[0-9]+]], [[TMP0]], [[TMP1]] ; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP2]], [[A]] Index: llvm/trunk/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll +++ llvm/trunk/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll @@ -7,10 +7,10 @@ ; GCN-LABEL: {{^}}clobber_vgpr_pair_pointer_add: ; GCN: s_load_dwordx2 s{{\[}}[[ARG1LO:[0-9]+]]:[[ARG1HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0{{$}} -; GCN: buffer_load_dwordx2 v{{\[}}[[LDPTRLO:[0-9]+]]:[[LDPTRHI:[0-9]+]]{{\]}} +; GCN-DAG: v_mov_b32_e32 v[[VARG1HI:[0-9]+]], s[[ARG1HI]] +; GCN-DAG: buffer_load_dwordx2 v{{\[}}[[LDPTRLO:[0-9]+]]:[[LDPTRHI:[0-9]+]]{{\]}} ; GCN-NOT: v_mov_b32 -; GCN: v_mov_b32_e32 v[[VARG1HI:[0-9]+]], s[[ARG1HI]] ; GCN-NEXT: v_mov_b32_e32 v[[VARG1LO:[0-9]+]], s[[ARG1LO]] ; GCN-NOT: v_mov_b32 Index: llvm/trunk/test/CodeGen/AMDGPU/setcc-opt.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/setcc-opt.ll +++ llvm/trunk/test/CodeGen/AMDGPU/setcc-opt.ll @@ -199,10 +199,10 @@ ; SI: s_load_dword [[VALUE:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb ; VI: s_load_dword [[VALUE:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c ; GCN: s_movk_i32 [[K255:s[0-9]+]], 0xff -; GCN: s_and_b32 [[B:s[0-9]+]], [[VALUE]], [[K255]] -; GCN: v_mov_b32_e32 [[VK255:v[0-9]+]], [[K255]] +; GCN-DAG: s_and_b32 [[B:s[0-9]+]], [[VALUE]], [[K255]] +; GCN-DAG: v_mov_b32_e32 [[VK255:v[0-9]+]], [[K255]] ; GCN: v_cmp_ne_i32_e32 vcc, [[B]], [[VK255]] -; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc +; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc ; GCN: buffer_store_byte [[RESULT]] ; GCN: s_endpgm define void @cmp_zext_k_i8max(i1 addrspace(1)* %out, i8 %b) nounwind { @@ -247,10 +247,10 @@ ; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb ; VI: s_load_dword [[VAL:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c ; GCN: s_movk_i32 [[K:s[0-9]+]], 0xff -; GCN: s_and_b32 [[B:s[0-9]+]], [[VAL]], [[K]] -; GCN: v_mov_b32_e32 [[VK:v[0-9]+]], [[K]] +; GCN-DAG: s_and_b32 [[B:s[0-9]+]], [[VAL]], [[K]] +; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], [[K]] ; GCN: v_cmp_ne_i32_e32 vcc, [[B]], [[VK]]{{$}} -; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc +; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc ; GCN: buffer_store_byte [[RESULT]] ; GCN: s_endpgm define void @cmp_sext_k_neg1_i8_arg(i1 addrspace(1)* %out, i8 %b) nounwind { Index: llvm/trunk/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll +++ llvm/trunk/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll @@ -4,8 +4,8 @@ ; GCN-LABEL: {{^}}v_uextract_bit_31_i128: ; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; GCN: v_mov_b32_e32 
v[[ZERO0:[0-9]+]], 0{{$}} ; GCN: v_mov_b32_e32 v[[ZERO1:[0-9]+]], 0{{$}} +; GCN: v_mov_b32_e32 v[[ZERO0:[0-9]+]], 0{{$}} ; GCN: v_mov_b32_e32 v[[ZERO2:[0-9]+]], 0{{$}} ; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]] @@ -27,9 +27,9 @@ ; GCN-LABEL: {{^}}v_uextract_bit_63_i128: ; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} -; GCN: v_mov_b32_e32 v[[ZERO0:[0-9]+]], 0{{$}} ; GCN: v_mov_b32_e32 v[[ZERO1:[0-9]+]], 0{{$}} ; GCN: v_mov_b32_e32 v[[ZERO2:[0-9]+]], 0{{$}} +; GCN: v_mov_b32_e32 v[[ZERO0:[0-9]+]], 0{{$}} ; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]] ; GCN-DAG: buffer_store_dwordx2 v{{\[}}[[ZERO1]]:[[ZERO2]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} @@ -50,9 +50,9 @@ ; GCN-LABEL: {{^}}v_uextract_bit_95_i128: ; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; GCN: v_mov_b32_e32 v[[ZERO0:[0-9]+]], 0{{$}} ; GCN: v_mov_b32_e32 v[[ZERO1:[0-9]+]], 0{{$}} ; GCN: v_mov_b32_e32 v[[ZERO2:[0-9]+]], 0{{$}} +; GCN: v_mov_b32_e32 v[[ZERO0:[0-9]+]], 0{{$}} ; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]] ; GCN-DAG: buffer_store_dwordx2 v{{\[}}[[ZERO1]]:[[ZERO2]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} @@ -73,9 +73,9 @@ ; GCN-LABEL: {{^}}v_uextract_bit_127_i128: ; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}} -; GCN: v_mov_b32_e32 v[[ZERO0:[0-9]+]], 0{{$}} ; GCN: v_mov_b32_e32 v[[ZERO1:[0-9]+]], 0{{$}} ; GCN: v_mov_b32_e32 v[[ZERO2:[0-9]+]], 0{{$}} +; GCN: v_mov_b32_e32 v[[ZERO0:[0-9]+]], 0{{$}} ; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]] ; GCN-DAG: buffer_store_dwordx2 v{{\[}}[[ZERO1]]:[[ZERO2]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} Index: llvm/trunk/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll +++ llvm/trunk/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll @@ -300,9 +300,9 @@ ; GCN-LABEL: {{^}}and_not_mask_i64: ; GCN: buffer_load_dwordx2 v{{\[}}[[VALLO:[0-9]+]]:[[VALHI:[0-9]+]]{{\]}} +; GCN: v_mov_b32_e32 v[[SHRHI]], 0{{$}} ; GCN: v_lshrrev_b32_e32 [[SHR:v[0-9]+]], 20, v[[VALLO]] ; GCN-DAG: v_and_b32_e32 v[[SHRLO]], 4, [[SHR]] -; GCN-DAG: v_mov_b32_e32 v[[SHRHI]], 0{{$}} ; GCN-NOT: v[[SHRLO]] ; GCN-NOT: v[[SHRHI]] ; GCN: buffer_store_dwordx2 v{{\[}}[[SHRLO]]:[[SHRHI]]{{\]}} Index: llvm/trunk/test/CodeGen/AMDGPU/shift-i64-opts.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/shift-i64-opts.ll +++ llvm/trunk/test/CodeGen/AMDGPU/shift-i64-opts.ll @@ -4,9 +4,9 @@ ; lshr (i64 x), c: c > 32 => reg_sequence lshr (i32 hi_32(x)), (c - 32), 0 ; GCN-LABEL: {{^}}lshr_i64_35: -; GCN: buffer_load_dword [[VAL:v[0-9]+]] -; GCN: v_lshrrev_b32_e32 v[[LO:[0-9]+]], 3, [[VAL]] -; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}} +; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]] +; GCN-DAG: v_lshrrev_b32_e32 v[[LO:[0-9]+]], 3, [[VAL]] +; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} define void @lshr_i64_35(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { %val = load i64, i64 addrspace(1)* %in @@ -16,9 +16,9 @@ } ; GCN-LABEL: {{^}}lshr_i64_63: -; GCN: buffer_load_dword [[VAL:v[0-9]+]] -; GCN: v_lshrrev_b32_e32 v[[LO:[0-9]+]], 31, [[VAL]] -; GCN: 
v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}} +; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]] +; GCN-DAG: v_lshrrev_b32_e32 v[[LO:[0-9]+]], 31, [[VAL]] +; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} define void @lshr_i64_63(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { %val = load i64, i64 addrspace(1)* %in @@ -28,9 +28,9 @@ } ; GCN-LABEL: {{^}}lshr_i64_33: -; GCN: buffer_load_dword [[VAL:v[0-9]+]] -; GCN: v_lshrrev_b32_e32 v[[LO:[0-9]+]], 1, [[VAL]] -; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}} +; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]] +; GCN-DAG: v_lshrrev_b32_e32 v[[LO:[0-9]+]], 1, [[VAL]] +; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} define void @lshr_i64_33(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { %val = load i64, i64 addrspace(1)* %in Index: llvm/trunk/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll +++ llvm/trunk/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll @@ -2,6 +2,12 @@ ; Make sure this doesn't crash. ; CHECK: {{^}}test: +; Make sure we are handling hazards correctly. +; CHECK: buffer_load_dword [[VHI:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:12 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_readlane_b32 s[[HI:[0-9]+]], [[VHI]] +; CHECK-NEXT: s_nop 4 +; CHECK-NEXT: buffer_store_dword v0, off, s[0:[[HI]]{{\]}}, 0 ; CHECK: s_endpgm define void @test(i32 addrspace(1)* %out, i32 %in) { call void asm sideeffect "", "~{SGPR0_SGPR1_SGPR2_SGPR3_SGPR4_SGPR5_SGPR6_SGPR7}" () Index: llvm/trunk/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll +++ llvm/trunk/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll @@ -156,11 +156,9 @@ } ; FUNC-LABEL: @reorder_local_offsets -; FIXME: The scheduler doesn't think its proftible to re-order the -; loads and stores, and I'm not sure that it really is. 
-; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:12 ; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:400 ; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:404 +; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:12 ; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:400 ; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:404 ; CI: buffer_store_dword @@ -185,8 +183,8 @@ ; FUNC-LABEL: @reorder_global_offsets ; CI: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:400 -; CI: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:404 ; CI: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:12 +; CI: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:404 ; CI: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:12 ; CI: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:400 ; CI: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:404 Index: llvm/trunk/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll +++ llvm/trunk/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll @@ -7,9 +7,9 @@ ; CHECK: s_load_dword s2, s[0:1], 0x9 ; CHECK-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb ; CHECK-NEXT: v_mbcnt_lo_u32_b32_e64 +; CHECK-NEXT: v_cmp_eq_i32_e32 vcc, 0, v0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK: v_cmp_eq_i32_e32 vcc, 0, v0 -; CHECK-NEXT: s_and_saveexec_b64 s[2:3], vcc +; CHECK: s_and_saveexec_b64 s[2:3], vcc ; CHECK-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; BB0_1: ; CHECK: s_load_dword s0, s[0:1], 0xa Index: llvm/trunk/test/CodeGen/AMDGPU/trunc.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/trunc.ll +++ llvm/trunk/test/CodeGen/AMDGPU/trunc.ll @@ -37,8 +37,8 @@ ; SI: s_add_u32 s[[LO_SREG2:[0-9]+]], s[[LO_SHL]], ; SI: s_addc_u32 ; SI: v_mov_b32_e32 -; SI: v_mov_b32_e32 ; SI: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG2]] +; SI: v_mov_b32_e32 ; SI: buffer_store_dword v[[LO_VREG]], define void @trunc_shl_i64(i64 addrspace(1)* %out2, i32 addrspace(1)* %out, i64 %a) { %aa = add i64 %a, 234 ; Prevent shrinking store. Index: llvm/trunk/test/CodeGen/AMDGPU/uniform-cfg.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/uniform-cfg.ll +++ llvm/trunk/test/CodeGen/AMDGPU/uniform-cfg.ll @@ -31,9 +31,9 @@ ; SI-LABEL: {{^}}uniform_if_vcc: ; FIXME: We could use _e32 here if we re-used the 0 from [[STORE_VAL]], and ; also scheduled the write first. -; SI: v_cmp_eq_f32_e64 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], 0, s{{[0-9]+}} -; SI: s_and_b64 vcc, exec, [[COND]] -; SI: v_mov_b32_e32 [[STORE_VAL:v[0-9]+]], 0 +; SI-DAG: v_cmp_eq_f32_e64 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], 0, s{{[0-9]+}} +; SI-DAG: s_and_b64 vcc, exec, [[COND]] +; SI-DAG: v_mov_b32_e32 [[STORE_VAL:v[0-9]+]], 0 ; SI: s_cbranch_vccnz [[IF_LABEL:[0-9_A-Za-z]+]] ; Fall-through to the else @@ -88,9 +88,9 @@ ; SI-LABEL: {{^}}uniform_if_swap_br_targets_vcc: ; FIXME: We could use _e32 here if we re-used the 0 from [[STORE_VAL]], and ; also scheduled the write first. 
-; SI: v_cmp_neq_f32_e64 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], 0, s{{[0-9]+}} -; SI: s_and_b64 vcc, exec, [[COND]] -; SI: v_mov_b32_e32 [[STORE_VAL:v[0-9]+]], 0 +; SI-DAG: v_cmp_neq_f32_e64 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], 0, s{{[0-9]+}} +; SI-DAG: s_and_b64 vcc, exec, [[COND]] +; SI-DAG: v_mov_b32_e32 [[STORE_VAL:v[0-9]+]], 0 ; SI: s_cbranch_vccnz [[IF_LABEL:[0-9_A-Za-z]+]] ; Fall-through to the else Index: llvm/trunk/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll +++ llvm/trunk/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll @@ -19,12 +19,12 @@ ; GCN-NOT: flat_scr -; GCNMESA: s_mov_b32 s16, s3 -; GCNMESA: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GCNMESA-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GCNMESA-NEXT: s_mov_b32 s14, -1 -; SIMESA-NEXT: s_mov_b32 s15, 0x98f000 -; VIMESA-NEXT: s_mov_b32 s15, 0x980000 +; GCNMESA-DAG: s_mov_b32 s16, s3 +; GCNMESA-DAG: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCNMESA-DAG: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCNMESA-DAG: s_mov_b32 s14, -1 +; SIMESA-DAG: s_mov_b32 s15, 0x98f000 +; VIMESA-DAG: s_mov_b32 s15, 0x980000 ; GCN: buffer_store_dword {{v[0-9]+}}, off, s[12:15], s16 offset:{{[0-9]+}} ; 4-byte Folded Spill Index: llvm/trunk/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll +++ llvm/trunk/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll @@ -11,12 +11,11 @@ ; GCN-LABEL: {{^}}main: -; GCN: s_mov_b32 s11, s12 -; GCN: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GCN-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GCN-NEXT: s_mov_b32 s14, -1 -; SI-NEXT: s_mov_b32 s15, 0x98f000 -; VI-NEXT: s_mov_b32 s15, 0x980000 +; GCN-DAG: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GCN-DAG: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GCN-DAG: s_mov_b32 s14, -1 +; SI-DAG: s_mov_b32 s15, 0x98f000 +; VI-DAG: s_mov_b32 s15, 0x980000 ; s12 is offset user SGPR ; GCN: buffer_store_dword {{v[0-9]+}}, off, s[12:15], s11 offset:{{[0-9]+}} ; 16-byte Folded Spill