diff --git a/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h b/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h
--- a/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h
+++ b/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h
@@ -268,6 +268,11 @@
     return SU->SchedClass;
   }

+  /// IsReachable - Checks if SU is reachable from TargetSU.
+  bool IsReachable(SUnit *SU, SUnit *TargetSU) {
+    return Topo.IsReachable(SU, TargetSU);
+  }
+
   /// Returns an iterator to the top of the current scheduling region.
   MachineBasicBlock::iterator begin() const { return RegionBegin; }

diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp
--- a/llvm/lib/CodeGen/MachineScheduler.cpp
+++ b/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -1530,7 +1530,10 @@
   void apply(ScheduleDAGInstrs *DAGInstrs) override;

 protected:
-  void clusterNeighboringMemOps(ArrayRef<SUnit *> MemOps, ScheduleDAGInstrs *DAG);
+  void clusterNeighboringMemOps(SmallVector<MemOpInfo, 32> &MemOps,
+                                ScheduleDAGInstrs *DAG);
+  void collectMemOpRecords(std::vector<SUnit> &SUnits,
+                           SmallVector<MemOpInfo, 32> &MemOpRecords);
 };

 class StoreClusterMutation : public BaseMemOpClusterMutation {
@@ -1566,63 +1569,60 @@
 } // end namespace llvm

+// Sort all the loads/stores first. Then, for each load/store, check the
+// following loads/stores one by one until the first non-dependent one is
+// reached, and call the target hook to see if they can be clustered.
 void BaseMemOpClusterMutation::clusterNeighboringMemOps(
-    ArrayRef<SUnit *> MemOps, ScheduleDAGInstrs *DAG) {
-  SmallVector<MemOpInfo, 32> MemOpRecords;
-  for (SUnit *SU : MemOps) {
-    const MachineInstr &MI = *SU->getInstr();
-    SmallVector<const MachineOperand *, 4> BaseOps;
-    int64_t Offset;
-    bool OffsetIsScalable;
-    unsigned Width;
-    if (TII->getMemOperandsWithOffsetWidth(MI, BaseOps, Offset,
-                                           OffsetIsScalable, Width, TRI)) {
-      MemOpRecords.push_back(MemOpInfo(SU, BaseOps, Offset, Width));
-
-      LLVM_DEBUG(dbgs() << "Num BaseOps: " << BaseOps.size() << ", Offset: "
-                        << Offset << ", OffsetIsScalable: " << OffsetIsScalable
-                        << ", Width: " << Width << "\n");
-    }
-#ifndef NDEBUG
-    for (auto *Op : BaseOps)
-      assert(Op);
-#endif
-  }
+    SmallVector<MemOpInfo, 32> &MemOpRecords, ScheduleDAGInstrs *DAG) {
   if (MemOpRecords.size() < 2)
     return;

+  // Sort the loads/stores so that we can stop clustering as early as
+  // possible.
   llvm::sort(MemOpRecords);
+  // Keep track of the current cluster length and bytes for each SUnit.
+  DenseMap<unsigned, std::pair<unsigned, unsigned>> SUnit2ClusterInfo;
+
   // At this point, `MemOpRecords` array must hold atleast two mem ops. Try to
   // cluster mem ops collected within `MemOpRecords` array.
-  unsigned ClusterLength = 1;
-  unsigned CurrentClusterBytes = MemOpRecords[0].Width;
   for (unsigned Idx = 0, End = MemOpRecords.size(); Idx < (End - 1); ++Idx) {
     // Decision to cluster mem ops is taken based on target dependent logic
     auto MemOpa = MemOpRecords[Idx];
-    auto MemOpb = MemOpRecords[Idx + 1];
-    ++ClusterLength;
-    CurrentClusterBytes += MemOpb.Width;
-    if (!TII->shouldClusterMemOps(MemOpa.BaseOps, MemOpb.BaseOps, ClusterLength,
-                                  CurrentClusterBytes)) {
-      // Current mem ops pair could not be clustered, reset cluster length, and
-      // go to next pair
-      ClusterLength = 1;
-      CurrentClusterBytes = MemOpb.Width;
+
+    // Seek the next load/store to cluster with.
+    unsigned NextIdx = Idx + 1;
+    for (; NextIdx < End; ++NextIdx)
+      // Skip if MemOpb has already been clustered or has a dependency with
+      // MemOpa.
+      if (!SUnit2ClusterInfo.count(MemOpRecords[NextIdx].SU->NodeNum) &&
+          !DAG->IsReachable(MemOpRecords[NextIdx].SU, MemOpa.SU) &&
+          !DAG->IsReachable(MemOpa.SU, MemOpRecords[NextIdx].SU))
+        break;
+    if (NextIdx == End)
       continue;
+
+    auto MemOpb = MemOpRecords[NextIdx];
+    unsigned ClusterLength = 2;
+    unsigned CurrentClusterBytes = MemOpb.Width;
+    if (SUnit2ClusterInfo.count(MemOpa.SU->NodeNum)) {
+      ClusterLength = SUnit2ClusterInfo[MemOpa.SU->NodeNum].first + 1;
+      CurrentClusterBytes =
+          SUnit2ClusterInfo[MemOpa.SU->NodeNum].second + MemOpb.Width;
     }
+    if (!TII->shouldClusterMemOps(MemOpa.BaseOps, MemOpb.BaseOps, ClusterLength,
+                                  CurrentClusterBytes))
+      continue;
+
     SUnit *SUa = MemOpa.SU;
     SUnit *SUb = MemOpb.SU;
     if (SUa->NodeNum > SUb->NodeNum)
       std::swap(SUa, SUb);

     // FIXME: Is this check really required?
-    if (!DAG->addEdge(SUb, SDep(SUa, SDep::Cluster))) {
-      ClusterLength = 1;
-      CurrentClusterBytes = MemOpb.Width;
+    if (!DAG->addEdge(SUb, SDep(SUa, SDep::Cluster)))
       continue;
-    }

     LLVM_DEBUG(dbgs() << "Cluster ld/st SU(" << SUa->NodeNum << ") - SU("
                       << SUb->NodeNum << ")\n");
@@ -1656,42 +1656,50 @@
       }
     }

+    SUnit2ClusterInfo[MemOpb.SU->NodeNum] = {ClusterLength,
+                                             CurrentClusterBytes};
+
     LLVM_DEBUG(dbgs() << "  Curr cluster length: " << ClusterLength
                       << ", Curr cluster bytes: " << CurrentClusterBytes
                       << "\n");
   }
 }

-/// Callback from DAG postProcessing to create cluster edges for loads.
-void BaseMemOpClusterMutation::apply(ScheduleDAGInstrs *DAG) {
-  // Map DAG NodeNum to a set of dependent MemOps in store chain.
-  DenseMap<unsigned, SmallVector<SUnit *, 4>> StoreChains;
-  for (SUnit &SU : DAG->SUnits) {
+void BaseMemOpClusterMutation::collectMemOpRecords(
+    std::vector<SUnit> &SUnits, SmallVector<MemOpInfo, 32> &MemOpRecords) {
+  for (auto &SU : SUnits) {
     if ((IsLoad && !SU.getInstr()->mayLoad()) ||
         (!IsLoad && !SU.getInstr()->mayStore()))
       continue;

-    unsigned ChainPredID = DAG->SUnits.size();
-    for (const SDep &Pred : SU.Preds) {
-      // We only want to cluster the mem ops that have the same ctrl(non-data)
-      // pred so that they didn't have ctrl dependency for each other. But for
-      // store instrs, we can still cluster them if the pred is load instr.
-      if ((Pred.isCtrl() &&
-           (IsLoad ||
-            (Pred.getSUnit() && Pred.getSUnit()->getInstr()->mayStore()))) &&
-          !Pred.isArtificial()) {
-        ChainPredID = Pred.getSUnit()->NodeNum;
-        break;
-      }
+    const MachineInstr &MI = *SU.getInstr();
+    SmallVector<const MachineOperand *, 4> BaseOps;
+    int64_t Offset;
+    bool OffsetIsScalable;
+    unsigned Width;
+    if (TII->getMemOperandsWithOffsetWidth(MI, BaseOps, Offset,
+                                           OffsetIsScalable, Width, TRI)) {
+      MemOpRecords.push_back(MemOpInfo(&SU, BaseOps, Offset, Width));
+
+      LLVM_DEBUG(dbgs() << "Num BaseOps: " << BaseOps.size() << ", Offset: "
+                        << Offset << ", OffsetIsScalable: " << OffsetIsScalable
+                        << ", Width: " << Width << "\n");
     }
-    // Insert the SU to corresponding store chain.
-    auto &Chain = StoreChains.FindAndConstruct(ChainPredID).second;
-    Chain.push_back(&SU);
+#ifndef NDEBUG
+    for (auto *Op : BaseOps)
+      assert(Op);
+#endif
   }
+}
+
+/// Callback from DAG postProcessing to create cluster edges for loads/stores.
+void BaseMemOpClusterMutation::apply(ScheduleDAGInstrs *DAG) {
+  // Collect all the clusterable loads/stores.
+  SmallVector<MemOpInfo, 32> MemOpRecords;
+  collectMemOpRecords(DAG->SUnits, MemOpRecords);

-  // Iterate over the store chains.
-  for (auto &SCD : StoreChains)
-    clusterNeighboringMemOps(SCD.second, DAG);
+  // Try to cluster all the neighboring loads/stores.
+ clusterNeighboringMemOps(MemOpRecords, DAG); } //===----------------------------------------------------------------------===// diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll @@ -527,41 +527,41 @@ define amdgpu_kernel void @test_div_fmas_f64(double addrspace(1)* %out, double %a, double %b, double %c, i1 %d) { ; GFX7-LABEL: test_div_fmas_f64: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dword s8, s[0:1], 0x11 -; GFX7-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9 +; GFX7-NEXT: s_load_dword s0, s[0:1], 0x11 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: v_mov_b32_e32 v4, s6 -; GFX7-NEXT: s_and_b32 s2, 1, s8 -; GFX7-NEXT: v_mov_b32_e32 v3, s5 -; GFX7-NEXT: v_mov_b32_e32 v5, s7 -; GFX7-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-NEXT: v_mov_b32_e32 v4, s10 +; GFX7-NEXT: s_and_b32 s0, 1, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v3, s9 +; GFX7-NEXT: v_mov_b32_e32 v5, s11 +; GFX7-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 ; GFX7-NEXT: s_nop 3 ; GFX7-NEXT: v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5] -; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: test_div_fmas_f64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s8, s[0:1], 0x44 -; GFX8-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 +; GFX8-NEXT: s_load_dword s0, s[0:1], 0x44 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mov_b32_e32 v4, s6 -; GFX8-NEXT: s_and_b32 s2, 1, s8 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v2, s8 +; GFX8-NEXT: v_mov_b32_e32 v4, s10 +; GFX8-NEXT: s_and_b32 s0, 1, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v3, s9 +; GFX8-NEXT: v_mov_b32_e32 v5, s11 +; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 ; GFX8-NEXT: s_nop 3 ; GFX8-NEXT: v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll --- a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll @@ -624,12 +624,12 @@ ; FIXEDABI: v_mov_b32_e32 [[K0:v[0-9]+]], 0x3e7 +; FIXEDABI: buffer_store_dword [[K0]], off, s[0:3], 0 offset:4{{$}} ; FIXEDABI: s_movk_i32 s32, 0x400{{$}} ; FIXEDABI: v_mov_b32_e32 [[K1:v[0-9]+]], 0x140 -; FIXEDABI: buffer_store_dword [[K0]], off, s[0:3], 0 offset:4{{$}} - ; FIXEDABI: buffer_store_dword [[K1]], off, s[0:3], s32{{$}} + ; FIXME: Why this reload? 
; FIXEDABI: buffer_load_dword [[RELOAD:v[0-9]+]], off, s[0:3], 0 offset:4{{$}} @@ -669,8 +669,8 @@ ; FIXED-ABI-NOT: v31 ; FIXEDABI: v_mov_b32_e32 [[K0:v[0-9]+]], 0x3e7{{$}} -; FIXEDABI: v_mov_b32_e32 [[K1:v[0-9]+]], 0x140{{$}} ; FIXEDABI: buffer_store_dword [[K0]], off, s[0:3], s33{{$}} +; FIXEDABI: v_mov_b32_e32 [[K1:v[0-9]+]], 0x140{{$}} ; FIXEDABI: buffer_store_dword [[K1]], off, s[0:3], s32{{$}} ; FIXEDABI: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s33{{$}} diff --git a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll --- a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll +++ b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll @@ -162,67 +162,64 @@ define amdgpu_kernel void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %out3, <4 x i8> addrspace(1)* %in) nounwind { ; SI-LABEL: test_copy_v4i8_x4: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x11 -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x11 +; SI-NEXT: s_mov_b32 s15, 0xf000 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_mov_b32 s3, s15 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s14, s10 -; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: s_mov_b32 s18, s10 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s8, s0 -; SI-NEXT: s_mov_b32 s9, s1 -; SI-NEXT: s_mov_b32 s19, s11 -; SI-NEXT: s_mov_b32 s22, s10 -; SI-NEXT: s_mov_b32 s23, s11 -; SI-NEXT: s_mov_b32 s12, s2 -; SI-NEXT: s_mov_b32 s13, s3 -; SI-NEXT: s_mov_b32 s16, s4 -; SI-NEXT: s_mov_b32 s17, s5 -; SI-NEXT: s_mov_b32 s20, s6 -; SI-NEXT: s_mov_b32 s21, s7 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_mov_b32 s14, -1 +; SI-NEXT: s_mov_b32 s12, s4 +; SI-NEXT: s_mov_b32 s13, s5 +; SI-NEXT: s_mov_b32 s2, s14 +; SI-NEXT: s_mov_b32 s18, s14 +; SI-NEXT: s_mov_b32 s19, s15 +; SI-NEXT: s_mov_b32 s22, s14 +; SI-NEXT: s_mov_b32 s23, s15 +; SI-NEXT: s_mov_b32 s0, s6 +; SI-NEXT: s_mov_b32 s1, s7 +; SI-NEXT: s_mov_b32 s16, s8 +; SI-NEXT: s_mov_b32 s17, s9 +; SI-NEXT: s_mov_b32 s20, s10 +; SI-NEXT: s_mov_b32 s21, s11 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NEXT: buffer_store_dword v0, off, s[12:15], 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: buffer_store_dword v0, off, s[16:19], 0 ; SI-NEXT: buffer_store_dword v0, off, s[20:23], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_copy_v4i8_x4: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x44 +; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s14, s2 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: 
s_mov_b32 s18, s10 -; VI-NEXT: s_mov_b32 s19, s11 -; VI-NEXT: s_mov_b32 s22, s10 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s0 -; VI-NEXT: s_mov_b32 s9, s1 -; VI-NEXT: s_mov_b32 s23, s11 -; VI-NEXT: s_mov_b32 s12, s2 -; VI-NEXT: s_mov_b32 s13, s3 -; VI-NEXT: s_mov_b32 s16, s4 -; VI-NEXT: s_mov_b32 s17, s5 -; VI-NEXT: s_mov_b32 s20, s6 -; VI-NEXT: s_mov_b32 s21, s7 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: s_mov_b32 s18, s2 +; VI-NEXT: s_mov_b32 s19, s3 +; VI-NEXT: s_mov_b32 s22, s2 +; VI-NEXT: s_mov_b32 s23, s3 +; VI-NEXT: s_mov_b32 s12, s6 +; VI-NEXT: s_mov_b32 s13, s7 +; VI-NEXT: s_mov_b32 s16, s8 +; VI-NEXT: s_mov_b32 s17, s9 +; VI-NEXT: s_mov_b32 s20, s10 +; VI-NEXT: s_mov_b32 s21, s11 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; VI-NEXT: buffer_store_dword v0, off, s[16:19], 0 ; VI-NEXT: buffer_store_dword v0, off, s[20:23], 0 diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll --- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -747,17 +747,16 @@ ; SI-NEXT: v_cvt_f32_ubyte2_e32 v1, v3 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v4 -; SI-NEXT: v_or_b32_e32 v3, v9, v6 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_ubyte2_e32 v5, v5 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v8 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:24 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_or_b32_e32 v2, v9, v6 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_ubyte0_e32 v4, v7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_ubyte0_e32 v7, v8 ; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v2 +; SI-NEXT: v_cvt_f32_ubyte2_e32 v5, v5 ; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v2 +; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:24 ; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[4:7], 0 offset:16 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr.ll b/llvm/test/CodeGen/AMDGPU/global-saddr.ll --- a/llvm/test/CodeGen/AMDGPU/global-saddr.ll +++ b/llvm/test/CodeGen/AMDGPU/global-saddr.ll @@ -46,8 +46,8 @@ ; Test various offset boundaries. 
; GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:4088{{$}} -; GFX9: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:2040{{$}} ; GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:4088{{$}} +; GFX9: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:2040{{$}} %gep11 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 511 %load11 = load i64, i64 addrspace(1)* %gep11 %gep12 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 1023 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll @@ -237,15 +237,15 @@ define amdgpu_kernel void @round_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) #0 { ; SI-LABEL: round_v4f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[8:15], s[0:1], 0x11 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx8 s[8:15], s[0:1], 0x11 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_movk_i32 s18, 0xfc01 ; SI-NEXT: s_mov_b32 s3, 0xfffff +; SI-NEXT: s_mov_b32 s2, s6 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bfe_u32 s0, s11, 0xb0014 ; SI-NEXT: s_add_i32 s19, s0, s18 -; SI-NEXT: s_mov_b32 s2, s6 ; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s19 ; SI-NEXT: s_brev_b32 s20, 1 ; SI-NEXT: s_andn2_b64 s[16:17], s[10:11], s[0:1] @@ -346,47 +346,47 @@ ; ; CI-LABEL: round_v4f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x11 -; CI-NEXT: s_brev_b32 s12, -2 +; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx8 s[8:15], s[0:1], 0x11 +; CI-NEXT: s_brev_b32 s2, -2 ; CI-NEXT: v_mov_b32_e32 v12, 0x3ff00000 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_trunc_f64_e32 v[0:1], s[6:7] -; CI-NEXT: v_mov_b32_e32 v4, s7 -; CI-NEXT: v_add_f64 v[2:3], s[6:7], -v[0:1] -; CI-NEXT: v_bfi_b32 v4, s12, v12, v4 +; CI-NEXT: v_trunc_f64_e32 v[0:1], s[10:11] +; CI-NEXT: v_mov_b32_e32 v4, s11 +; CI-NEXT: v_add_f64 v[2:3], s[10:11], -v[0:1] +; CI-NEXT: v_bfi_b32 v4, s2, v12, v4 ; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5 -; CI-NEXT: v_trunc_f64_e32 v[8:9], s[4:5] +; CI-NEXT: v_trunc_f64_e32 v[8:9], s[8:9] ; CI-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc ; CI-NEXT: v_mov_b32_e32 v2, 0 ; CI-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] -; CI-NEXT: v_add_f64 v[0:1], s[4:5], -v[8:9] -; CI-NEXT: v_mov_b32_e32 v4, s5 +; CI-NEXT: v_add_f64 v[0:1], s[8:9], -v[8:9] +; CI-NEXT: v_mov_b32_e32 v4, s9 ; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[0:1]|, 0.5 -; CI-NEXT: v_bfi_b32 v4, s12, v12, v4 +; CI-NEXT: v_bfi_b32 v4, s2, v12, v4 ; CI-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc -; CI-NEXT: v_trunc_f64_e32 v[4:5], s[10:11] -; CI-NEXT: v_mov_b32_e32 v10, s11 -; CI-NEXT: v_add_f64 v[6:7], s[10:11], -v[4:5] -; CI-NEXT: v_bfi_b32 v10, s12, v12, v10 +; CI-NEXT: v_trunc_f64_e32 v[4:5], s[14:15] +; CI-NEXT: v_mov_b32_e32 v10, s15 +; CI-NEXT: v_add_f64 v[6:7], s[14:15], -v[4:5] +; CI-NEXT: v_bfi_b32 v10, s2, v12, v10 ; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5 ; CI-NEXT: v_mov_b32_e32 v6, 0 ; CI-NEXT: v_cndmask_b32_e32 v7, 0, v10, vcc -; CI-NEXT: v_trunc_f64_e32 v[10:11], s[8:9] +; CI-NEXT: v_trunc_f64_e32 v[10:11], s[12:13] ; CI-NEXT: v_add_f64 v[6:7], v[4:5], v[6:7] -; CI-NEXT: v_add_f64 v[4:5], s[8:9], -v[10:11] -; 
CI-NEXT: v_mov_b32_e32 v13, s9 +; CI-NEXT: v_add_f64 v[4:5], s[12:13], -v[10:11] +; CI-NEXT: v_mov_b32_e32 v13, s13 ; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5 -; CI-NEXT: v_bfi_b32 v12, s12, v12, v13 +; CI-NEXT: v_bfi_b32 v12, s2, v12, v13 ; CI-NEXT: v_cndmask_b32_e32 v5, 0, v12, vcc ; CI-NEXT: v_mov_b32_e32 v4, 0 ; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: v_add_f64 v[4:5], v[10:11], v[4:5] -; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_add_f64 v[0:1], v[8:9], v[0:1] -; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 -; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 +; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; CI-NEXT: s_endpgm %result = call <4 x double> @llvm.round.v4f64(<4 x double> %in) #1 store <4 x double> %result, <4 x double> addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll b/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll --- a/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll @@ -11,26 +11,26 @@ define amdgpu_kernel void @sgpr_if_else_salu_br(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) { ; SI-LABEL: sgpr_if_else_salu_br: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xb -; SI-NEXT: s_load_dword s0, s[0:1], 0xf +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; SI-NEXT: s_load_dword s2, s[0:1], 0xf +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s8, 0 +; SI-NEXT: s_cmp_lg_u32 s4, 0 ; SI-NEXT: s_cbranch_scc0 BB0_2 ; SI-NEXT: ; %bb.1: ; %else -; SI-NEXT: s_add_i32 s0, s11, s0 +; SI-NEXT: s_add_i32 s2, s7, s2 ; SI-NEXT: s_cbranch_execz BB0_3 ; SI-NEXT: s_branch BB0_4 ; SI-NEXT: BB0_2: -; SI-NEXT: ; implicit-def: $sgpr0 +; SI-NEXT: ; implicit-def: $sgpr2 ; SI-NEXT: BB0_3: ; %if -; SI-NEXT: s_sub_i32 s0, s9, s10 +; SI-NEXT: s_sub_i32 s2, s5, s6 ; SI-NEXT: BB0_4: ; %endif -; SI-NEXT: s_add_i32 s0, s0, s8 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_add_i32 s4, s2, s4 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/shift-i128.ll b/llvm/test/CodeGen/AMDGPU/shift-i128.ll --- a/llvm/test/CodeGen/AMDGPU/shift-i128.ll +++ b/llvm/test/CodeGen/AMDGPU/shift-i128.ll @@ -446,11 +446,12 @@ define amdgpu_kernel void @s_shl_v2i128ss(<2 x i128> %lhs, <2 x i128> %rhs) { ; GCN-LABEL: s_shl_v2i128ss: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x8 ; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 +; GCN-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x8 ; GCN-NEXT: v_mov_b32_e32 v10, 16 ; GCN-NEXT: v_mov_b32_e32 v8, 0 ; GCN-NEXT: v_mov_b32_e32 v11, 0 +; GCN-NEXT: v_mov_b32_e32 v9, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_sub_i32 s6, 64, s16 ; GCN-NEXT: v_cmp_lt_u64_e64 s[0:1], s[16:17], 64 @@ -503,7 +504,6 @@ ; GCN-NEXT: v_cndmask_b32_e64 v5, 0, v4, s[0:1] ; GCN-NEXT: v_mov_b32_e32 v4, s2 ; GCN-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v9, 0 ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GCN-NEXT: flat_store_dwordx4 v[10:11], v[4:7] ; GCN-NEXT: flat_store_dwordx4 v[8:9], v[0:3] @@ -516,11 +516,12 @@ define amdgpu_kernel void @s_lshr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) { ; 
GCN-LABEL: s_lshr_v2i128_ss: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x8 ; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 +; GCN-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x8 ; GCN-NEXT: v_mov_b32_e32 v10, 16 ; GCN-NEXT: v_mov_b32_e32 v8, 0 ; GCN-NEXT: v_mov_b32_e32 v11, 0 +; GCN-NEXT: v_mov_b32_e32 v9, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_sub_i32 s6, 64, s16 ; GCN-NEXT: v_cmp_lt_u64_e64 s[0:1], s[16:17], 64 @@ -573,7 +574,6 @@ ; GCN-NEXT: v_cndmask_b32_e64 v7, 0, v6, s[0:1] ; GCN-NEXT: v_mov_b32_e32 v6, s2 ; GCN-NEXT: v_cndmask_b32_e64 v6, 0, v6, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v9, 0 ; GCN-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; GCN-NEXT: flat_store_dwordx4 v[10:11], v[4:7] ; GCN-NEXT: flat_store_dwordx4 v[8:9], v[0:3] @@ -586,8 +586,8 @@ define amdgpu_kernel void @s_ashr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) { ; GCN-LABEL: s_ashr_v2i128_ss: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x8 ; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 +; GCN-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x8 ; GCN-NEXT: v_mov_b32_e32 v8, 0 ; GCN-NEXT: v_mov_b32_e32 v9, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign.ll b/llvm/test/CodeGen/AMDGPU/stack-realign.ll --- a/llvm/test/CodeGen/AMDGPU/stack-realign.ll +++ b/llvm/test/CodeGen/AMDGPU/stack-realign.ll @@ -160,16 +160,14 @@ ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: v_writelane_b32 [[VGPR_REG]], s33, 2 ; GCN-NEXT: v_writelane_b32 [[VGPR_REG]], s34, 3 -; GCN: s_mov_b32 s34, s32 ; GCN: s_add_u32 [[SCRATCH_REG:s[0-9]+]], s32, 0xffc0 ; GCN: s_and_b32 s33, [[SCRATCH_REG]], 0xffff0000 +; GCN: s_mov_b32 s34, s32 +; GCN: v_mov_b32_e32 v32, 0 +; GCN: buffer_store_dword v32, off, s[0:3], s33 offset:1024 ; GCN-NEXT: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s34 ; GCN-NEXT: s_add_u32 s32, s32, 0x30000 -; GCN: v_mov_b32_e32 v33, 0 - -; GCN: buffer_store_dword v33, off, s[0:3], s33 offset:1024 - ; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
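
Reviewer note (illustrative, not part of the patch): the standalone C++ sketch below models the new clustering walk in clusterNeighboringMemOps, with plain structs and callbacks standing in for SUnit/MemOpInfo, ScheduleDAGInstrs::IsReachable, and TargetInstrInfo::shouldClusterMemOps. The names MemOp, canReach, and shouldCluster are hypothetical; the real mutation sorts MemOpInfo records and adds SDep::Cluster edges on the scheduling DAG rather than returning pairs.

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <functional>
#include <map>
#include <utility>
#include <vector>

// Stand-in for MemOpInfo: just the fields the walk needs.
struct MemOp {
  unsigned NodeNum; // stand-in for SUnit::NodeNum
  int64_t Offset;   // sort key, as in MemOpInfo
  unsigned Width;   // access size in bytes
};

using ReachFn = std::function<bool(unsigned, unsigned)>;   // DAG reachability
using ClusterFn = std::function<bool(unsigned, unsigned)>; // target hook

// Returns the cluster edges (pairs of NodeNums) the walk would create.
std::vector<std::pair<unsigned, unsigned>>
clusterNeighboringMemOps(std::vector<MemOp> Ops, ReachFn canReach,
                         ClusterFn shouldCluster) {
  std::vector<std::pair<unsigned, unsigned>> Edges;
  if (Ops.size() < 2)
    return Edges;
  // Sort by offset so a cluster can be cut off as early as possible.
  std::sort(Ops.begin(), Ops.end(),
            [](const MemOp &A, const MemOp &B) { return A.Offset < B.Offset; });
  // Current cluster length/bytes per already-clustered op (SUnit2ClusterInfo).
  std::map<unsigned, std::pair<unsigned, unsigned>> ClusterInfo;
  for (size_t Idx = 0, End = Ops.size(); Idx + 1 < End; ++Idx) {
    const MemOp &OpA = Ops[Idx];
    // Seek the first candidate that is not clustered yet and is not reachable
    // from OpA (or vice versa); clustering a dependent pair would form a cycle.
    size_t NextIdx = Idx + 1;
    for (; NextIdx < End; ++NextIdx)
      if (!ClusterInfo.count(Ops[NextIdx].NodeNum) &&
          !canReach(Ops[NextIdx].NodeNum, OpA.NodeNum) &&
          !canReach(OpA.NodeNum, Ops[NextIdx].NodeNum))
        break;
    if (NextIdx == End)
      continue;
    const MemOp &OpB = Ops[NextIdx];
    // Grow an existing cluster ending at OpA, or start a new pair.
    unsigned Length = 2, Bytes = OpB.Width;
    if (auto It = ClusterInfo.find(OpA.NodeNum); It != ClusterInfo.end()) {
      Length = It->second.first + 1;
      Bytes = It->second.second + OpB.Width;
    }
    if (!shouldCluster(Length, Bytes))
      continue;
    Edges.emplace_back(OpA.NodeNum, OpB.NodeNum); // models an SDep::Cluster edge
    ClusterInfo[OpB.NodeNum] = {Length, Bytes};
  }
  return Edges;
}

int main() {
  // Three adjacent 4-byte loads with no dependencies; allow clusters up to 8 bytes.
  std::vector<MemOp> Ops = {{0, 0, 4}, {1, 4, 4}, {2, 8, 4}};
  auto Edges = clusterNeighboringMemOps(
      Ops, [](unsigned, unsigned) { return false; },
      [](unsigned, unsigned Bytes) { return Bytes <= 8; });
  for (auto &E : Edges)
    std::printf("cluster SU(%u) -> SU(%u)\n", E.first, E.second);
  return 0;
}

The point the sketch makes explicit: a mem op that cannot pair with its immediate neighbor no longer resets the whole cluster; the walk skips already-clustered or dependent candidates and carries the per-op cluster length/bytes forward, which is why the new DAG->IsReachable wrapper is needed.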