diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp --- a/llvm/lib/CodeGen/MachineScheduler.cpp +++ b/llvm/lib/CodeGen/MachineScheduler.cpp @@ -1624,16 +1624,32 @@ LLVM_DEBUG(dbgs() << "Cluster ld/st SU(" << SUa->NodeNum << ") - SU(" << SUb->NodeNum << ")\n"); - // Copy successor edges from SUa to SUb. Interleaving computation - // dependent on SUa can prevent load combining due to register reuse. - // Predecessor edges do not need to be copied from SUb to SUa since - // nearby loads should have effectively the same inputs. - for (const SDep &Succ : SUa->Succs) { - if (Succ.getSUnit() == SUb) - continue; - LLVM_DEBUG(dbgs() << " Copy Succ SU(" << Succ.getSUnit()->NodeNum - << ")\n"); - DAG->addEdge(Succ.getSUnit(), SDep(SUb, SDep::Artificial)); + if (IsLoad) { + // Copy successor edges from SUa to SUb. Interleaving computation + // dependent on SUa can prevent load combining due to register reuse. + // Predecessor edges do not need to be copied from SUb to SUa since + // nearby loads should have effectively the same inputs. + for (const SDep &Succ : SUa->Succs) { + if (Succ.getSUnit() == SUb) + continue; + LLVM_DEBUG(dbgs() << " Copy Succ SU(" << Succ.getSUnit()->NodeNum + << ")\n"); + DAG->addEdge(Succ.getSUnit(), SDep(SUb, SDep::Artificial)); + } + } else { + // Copy predecessor edges from SUb to SUa so that the SUnits SUb + // depends on are not scheduled in between SUa and SUb. Successor edges + // do not need to be copied from SUa to SUb since nothing depends + // on stores. + // Note that we don't need to care about memory dependencies, as + // we won't try to cluster them if they have any memory dependency. 
+ for (const SDep &Pred : SUb->Preds) { + if (Pred.getSUnit() == SUa) + continue; + LLVM_DEBUG(dbgs() << " Copy Pred SU(" << Pred.getSUnit()->NodeNum + << ")\n"); + DAG->addEdge(SUa, SDep(Pred.getSUnit(), SDep::Artificial)); + } } LLVM_DEBUG(dbgs() << " Curr cluster length: " << ClusterLength diff --git a/llvm/test/CodeGen/AArch64/aarch64-stp-cluster.ll b/llvm/test/CodeGen/AArch64/aarch64-stp-cluster.ll --- a/llvm/test/CodeGen/AArch64/aarch64-stp-cluster.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-stp-cluster.ll @@ -194,3 +194,22 @@ store i64 %add6.3, i64* %arrayidx5.3, align 8 ret void } + +; Verify that the SU(2) and SU(4) are the preds of SU(3) +; CHECK: ********** MI Scheduling ********** +; CHECK-LABEL: stp_missing_preds_edges:%bb.0 +; CHECK:Cluster ld/st SU(3) - SU(5) +; CHECK: Copy Pred SU(4) +; CHECK: Copy Pred SU(2) +; CHECK:SU(2): %0:gpr64common = COPY $x0 +; CHECK:SU(3): STRWui %1:gpr32, %0:gpr64common, 0 +; CHECK:SU(4): %3:gpr32common = nsw ADDWri %2:gpr32common, 5, 0 +; CHECK:SU(5): STRWui %3:gpr32common, %0:gpr64common, 1 +define void @stp_missing_preds_edges(i32* %p, i32 %m, i32 %n) { +entry: + store i32 %m, i32* %p, align 4 + %add = add nsw i32 %n, 5 + %arrayidx1 = getelementptr inbounds i32, i32* %p, i64 1 + store i32 %add, i32* %arrayidx1, align 4 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll @@ -40,6 +40,7 @@ ; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-NEXT: v_add_co_u32_e32 v59, vcc, v15, v11 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill ; GCN-NEXT: global_load_dwordx4 v[3:6], v[15:16], off ; GCN-NEXT: global_load_dwordx4 v[7:10], 
v[15:16], off offset:16 ; GCN-NEXT: v_addc_co_u32_e32 v60, vcc, v16, v12, vcc @@ -55,212 +56,214 @@ ; GCN-NEXT: v_lshrrev_b32_e64 v0, 6, s33 ; GCN-NEXT: v_add_u32_e32 v0, 0x100, v0 ; GCN-NEXT: v_add_u32_e32 v1, 16, v0 +; GCN-NEXT: v_add_u32_e32 v2, 20, v0 ; GCN-NEXT: s_add_u32 s32, s32, 0x10000 ; GCN-NEXT: s_sub_u32 s32, s32, 0x10000 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v35, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v35, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:588 ; 
4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:640 ; 4-byte Folded Spill ; GCN-NEXT: global_load_dwordx4 v[47:50], v[48:49], off offset:48 ; GCN-NEXT: global_load_dwordx4 v[43:46], v[59:60], off ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill 
-; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:572 ; 4-byte Folded 
Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill ; GCN-NEXT: global_load_dwordx4 v[51:54], v[59:60], off offset:16 ; GCN-NEXT: global_load_dwordx4 v[55:58], v[59:60], off offset:32 ; GCN-NEXT: global_load_dwordx4 v[59:62], v[59:60], off offset:48 ; GCN-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 20, v0 -; GCN-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 24, v0 +; GCN-NEXT: buffer_store_dword v8, v2, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v2, 28, v0 ; GCN-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 28, v0 -; GCN-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v10, v2, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 32, v0 +; GCN-NEXT: v_add_u32_e32 v2, 36, v0 ; GCN-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 36, v0 -; GCN-NEXT: buffer_store_dword v12, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 40, v0 +; GCN-NEXT: buffer_store_dword v12, v2, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v2, 44, v0 ; GCN-NEXT: buffer_store_dword v13, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 44, v0 -; GCN-NEXT: buffer_store_dword v14, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 48, v0 +; GCN-NEXT: buffer_store_dword v14, v2, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v2, 52, v0 ; GCN-NEXT: buffer_store_dword v15, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 52, v0 -; GCN-NEXT: buffer_store_dword v16, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 56, v0 +; GCN-NEXT: buffer_store_dword v16, v2, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v2, 60, v0 ; GCN-NEXT: buffer_store_dword v17, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 60, v0 -; GCN-NEXT: buffer_store_dword v18, v1, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v18, v2, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 64, v0 +; GCN-NEXT: v_add_u32_e32 v2, 0x44, v0 ; GCN-NEXT: 
buffer_store_dword v19, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x44, v0 -; GCN-NEXT: buffer_store_dword v20, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0x48, v0 +; GCN-NEXT: buffer_store_dword v20, v2, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v2, 0x4c, v0 ; GCN-NEXT: buffer_store_dword v21, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x4c, v0 -; GCN-NEXT: buffer_store_dword v22, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0x50, v0 +; GCN-NEXT: buffer_store_dword v22, v2, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v2, 0x54, v0 ; GCN-NEXT: buffer_store_dword v23, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x54, v0 -; GCN-NEXT: buffer_store_dword v24, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0x58, v0 +; GCN-NEXT: buffer_store_dword v24, v2, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v2, 0x5c, v0 ; GCN-NEXT: buffer_store_dword v25, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x5c, v0 -; GCN-NEXT: buffer_store_dword v26, v1, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v26, v2, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0x60, v0 +; GCN-NEXT: v_add_u32_e32 v2, 0x64, v0 ; GCN-NEXT: buffer_store_dword v27, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x64, v0 -; GCN-NEXT: buffer_store_dword v28, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0x68, v0 +; GCN-NEXT: buffer_store_dword v28, v2, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v2, 0x6c, v0 ; GCN-NEXT: buffer_store_dword v29, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x6c, v0 -; GCN-NEXT: buffer_store_dword v30, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0x70, v0 +; GCN-NEXT: buffer_store_dword v30, v2, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v2, 0x74, v0 ; GCN-NEXT: buffer_store_dword v31, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x74, v0 -; GCN-NEXT: buffer_store_dword v32, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0x78, v0 +; GCN-NEXT: buffer_store_dword v32, v2, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 
v2, 0x7c, v0 ; GCN-NEXT: buffer_store_dword v33, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x7c, v0 -; GCN-NEXT: buffer_store_dword v34, v1, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v34, v2, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0x80, v0 +; GCN-NEXT: v_add_u32_e32 v2, 0x84, v0 ; GCN-NEXT: buffer_store_dword v35, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x84, v0 -; GCN-NEXT: buffer_store_dword v36, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0x88, v0 +; GCN-NEXT: buffer_store_dword v36, v2, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v2, 0x8c, v0 ; GCN-NEXT: buffer_store_dword v37, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x8c, v0 -; GCN-NEXT: buffer_store_dword v38, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0x90, v0 +; GCN-NEXT: buffer_store_dword v38, v2, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v2, 0x94, v0 ; GCN-NEXT: buffer_store_dword v39, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x94, v0 -; GCN-NEXT: buffer_store_dword v40, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0x98, v0 +; GCN-NEXT: buffer_store_dword v40, v2, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v2, 0x9c, v0 ; GCN-NEXT: buffer_store_dword v41, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x9c, v0 -; GCN-NEXT: buffer_store_dword v42, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:604 ; 
4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload +; GCN-NEXT: buffer_store_dword v42, v2, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload +; 
GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:640 ; 4-byte Folded Reload ; GCN-NEXT: v_add_u32_e32 v1, 0xa0, v0 +; GCN-NEXT: v_add_u32_e32 v2, 0xa4, v0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v8, v15 -; GCN-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen ; GCN-NEXT: v_mov_b32_e32 v9, v16 -; GCN-NEXT: v_add_u32_e32 v1, 0xa4, v0 -; GCN-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen ; GCN-NEXT: v_mov_b32_e32 v10, v17 ; GCN-NEXT: v_add_u32_e32 v1, 0xa8, v0 +; GCN-NEXT: buffer_store_dword v9, v2, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen -; GCN-NEXT: v_mov_b32_e32 v11, v18 -; GCN-NEXT: v_add_u32_e32 v1, 0xac, v0 -; GCN-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0xb0, v0 +; GCN-NEXT: v_mov_b32_e32 v11, v18 +; GCN-NEXT: v_add_u32_e32 v2, 0xac, v0 +; GCN-NEXT: buffer_store_dword v11, v2, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v2, 0xb4, v0 ; GCN-NEXT: buffer_store_dword v47, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0xb4, v0 -; GCN-NEXT: buffer_store_dword v48, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0xb8, v0 +; GCN-NEXT: buffer_store_dword v48, v2, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v2, 0xbc, v0 ; GCN-NEXT: buffer_store_dword v49, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0xbc, v0 -; GCN-NEXT: buffer_store_dword v50, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:524 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:528 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v12, 
off, s[0:3], s33 offset:532 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:536 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:540 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:544 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:548 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:552 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:556 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:560 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:564 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload +; GCN-NEXT: buffer_store_dword v50, v2, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:524 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:528 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:532 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:536 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:540 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:544 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:548 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:552 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:556 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:560 ; 
4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:564 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload +; GCN-NEXT: v_add_u32_e32 v2, 0xc4, v0 ; GCN-NEXT: v_add_u32_e32 v1, 0xc0, v0 ; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v8, v2, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v2, 0xcc, v0 ; GCN-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0xc4, v0 -; GCN-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0xc8, v0 +; GCN-NEXT: buffer_store_dword v10, v2, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v2, 4, v0 +; GCN-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0xcc, v0 -; GCN-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 4, v0 -; GCN-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 8, v0 -; GCN-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 12, v0 -; GCN-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:256 ; GCN-NEXT: v_add_u32_e32 v1, 0xd0, v0 -; GCN-NEXT: v_add_u32_e32 v3, 0xd4, v0 -; GCN-NEXT: v_add_u32_e32 v4, 0xd8, v0 -; GCN-NEXT: v_add_u32_e32 v5, 0xdc, v0 +; GCN-NEXT: v_add_u32_e32 v7, 8, v0 +; GCN-NEXT: v_add_u32_e32 v2, 12, v0 +; GCN-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v6, v2, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:256 +; GCN-NEXT: v_add_u32_e32 v2, 0xd4, v0 +; GCN-NEXT: v_add_u32_e32 v3, 0xd8, v0 +; GCN-NEXT: v_add_u32_e32 v4, 0xdc, v0 ; GCN-NEXT: buffer_store_dword v51, v1, s[0:3], 0 offen -; 
GCN-NEXT: buffer_store_dword v52, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v53, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v5, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v52, v2, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v53, v3, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v54, v4, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0xe0, v0 -; GCN-NEXT: v_add_u32_e32 v3, 0xe4, v0 -; GCN-NEXT: v_add_u32_e32 v4, 0xe8, v0 -; GCN-NEXT: v_add_u32_e32 v5, 0xec, v0 -; GCN-NEXT: v_add_u32_e32 v6, 0xf0, v0 -; GCN-NEXT: v_add_u32_e32 v7, 0xf4, v0 -; GCN-NEXT: v_add_u32_e32 v8, 0xf8, v0 -; GCN-NEXT: v_add_u32_e32 v9, 0xfc, v0 +; GCN-NEXT: v_add_u32_e32 v2, 0xe4, v0 +; GCN-NEXT: v_add_u32_e32 v3, 0xe8, v0 +; GCN-NEXT: v_add_u32_e32 v4, 0xec, v0 +; GCN-NEXT: v_add_u32_e32 v5, 0xf0, v0 +; GCN-NEXT: v_add_u32_e32 v6, 0xf4, v0 +; GCN-NEXT: v_add_u32_e32 v7, 0xf8, v0 +; GCN-NEXT: v_add_u32_e32 v8, 0xfc, v0 ; GCN-NEXT: buffer_store_dword v55, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v56, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v57, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v58, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v59, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v60, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v61, v8, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v62, v9, s[0:3], 0 offen -; GCN-NEXT: v_and_b32_e32 v1, 63, v2 +; GCN-NEXT: buffer_store_dword v56, v2, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v57, v3, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v58, v4, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v59, v5, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v60, v6, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v61, v7, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v62, v8, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 63, v1 ; GCN-NEXT: 
v_lshlrev_b32_e32 v1, 2, v1 ; GCN-NEXT: v_add_u32_e32 v0, v0, v1 ; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen @@ -323,6 +326,7 @@ ; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-NEXT: v_add_co_u32_e32 v59, vcc, v15, v11 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill ; GCN-NEXT: global_load_dwordx4 v[3:6], v[15:16], off ; GCN-NEXT: global_load_dwordx4 v[7:10], v[15:16], off offset:16 ; GCN-NEXT: v_addc_co_u32_e32 v60, vcc, v16, v12, vcc @@ -338,215 +342,217 @@ ; GCN-NEXT: v_lshrrev_b32_e64 v0, 6, s33 ; GCN-NEXT: v_add_u32_e32 v0, 0x100, v0 ; GCN-NEXT: v_add_u32_e32 v1, 16, v0 +; GCN-NEXT: v_add_u32_e32 v2, 20, v0 ; GCN-NEXT: s_add_u32 s32, s32, 0x10000 ; GCN-NEXT: s_sub_u32 s32, s32, 0x10000 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v35, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 
offset:624 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v35, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:640 ; 4-byte Folded Spill ; GCN-NEXT: global_load_dwordx4 v[47:50], v[48:49], off offset:48 ; GCN-NEXT: global_load_dwordx4 v[43:46], v[59:60], off ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:516 ; 4-byte 
Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:548 ; 
4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill ; GCN-NEXT: global_load_dwordx4 v[51:54], v[59:60], off offset:16 ; GCN-NEXT: global_load_dwordx4 v[55:58], v[59:60], off offset:32 ; GCN-NEXT: global_load_dwordx4 v[59:62], v[59:60], off offset:48 ; GCN-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 20, v0 -; GCN-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 24, v0 +; GCN-NEXT: buffer_store_dword v8, v2, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v2, 28, v0 ; GCN-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 28, v0 -; GCN-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v10, v2, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 32, v0 +; GCN-NEXT: v_add_u32_e32 v2, 36, v0 ; GCN-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 36, v0 -; GCN-NEXT: buffer_store_dword v12, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 40, v0 +; GCN-NEXT: buffer_store_dword v12, v2, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v2, 44, v0 ; GCN-NEXT: buffer_store_dword v13, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 44, v0 -; GCN-NEXT: buffer_store_dword v14, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 48, v0 +; GCN-NEXT: buffer_store_dword v14, v2, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v2, 52, v0 ; GCN-NEXT: buffer_store_dword v15, v1, 
s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 52, v0 -; GCN-NEXT: buffer_store_dword v16, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 56, v0 +; GCN-NEXT: buffer_store_dword v16, v2, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v2, 60, v0 ; GCN-NEXT: buffer_store_dword v17, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 60, v0 -; GCN-NEXT: buffer_store_dword v18, v1, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v18, v2, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 64, v0 +; GCN-NEXT: v_add_u32_e32 v2, 0x44, v0 ; GCN-NEXT: buffer_store_dword v19, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x44, v0 -; GCN-NEXT: buffer_store_dword v20, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0x48, v0 +; GCN-NEXT: buffer_store_dword v20, v2, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v2, 0x4c, v0 ; GCN-NEXT: buffer_store_dword v21, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x4c, v0 -; GCN-NEXT: buffer_store_dword v22, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0x50, v0 +; GCN-NEXT: buffer_store_dword v22, v2, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v2, 0x54, v0 ; GCN-NEXT: buffer_store_dword v23, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x54, v0 -; GCN-NEXT: buffer_store_dword v24, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0x58, v0 +; GCN-NEXT: buffer_store_dword v24, v2, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v2, 0x5c, v0 ; GCN-NEXT: buffer_store_dword v25, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x5c, v0 -; GCN-NEXT: buffer_store_dword v26, v1, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v26, v2, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0x60, v0 +; GCN-NEXT: v_add_u32_e32 v2, 0x64, v0 ; GCN-NEXT: buffer_store_dword v27, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x64, v0 -; GCN-NEXT: buffer_store_dword v28, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0x68, v0 +; GCN-NEXT: buffer_store_dword v28, v2, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v2, 0x6c, v0 ; GCN-NEXT: 
buffer_store_dword v29, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x6c, v0 -; GCN-NEXT: buffer_store_dword v30, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0x70, v0 +; GCN-NEXT: buffer_store_dword v30, v2, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v2, 0x74, v0 ; GCN-NEXT: buffer_store_dword v31, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x74, v0 -; GCN-NEXT: buffer_store_dword v32, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0x78, v0 +; GCN-NEXT: buffer_store_dword v32, v2, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v2, 0x7c, v0 ; GCN-NEXT: buffer_store_dword v33, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x7c, v0 -; GCN-NEXT: buffer_store_dword v34, v1, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v34, v2, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0x80, v0 +; GCN-NEXT: v_add_u32_e32 v2, 0x84, v0 ; GCN-NEXT: buffer_store_dword v35, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x84, v0 -; GCN-NEXT: buffer_store_dword v36, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0x88, v0 +; GCN-NEXT: buffer_store_dword v36, v2, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v2, 0x8c, v0 ; GCN-NEXT: buffer_store_dword v37, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x8c, v0 -; GCN-NEXT: buffer_store_dword v38, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0x90, v0 +; GCN-NEXT: buffer_store_dword v38, v2, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v2, 0x94, v0 ; GCN-NEXT: buffer_store_dword v39, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x94, v0 -; GCN-NEXT: buffer_store_dword v40, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0x98, v0 +; GCN-NEXT: buffer_store_dword v40, v2, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v2, 0x9c, v0 ; GCN-NEXT: buffer_store_dword v41, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x9c, v0 -; GCN-NEXT: buffer_store_dword v42, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v8, 
off, s[0:3], s33 offset:580 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload +; GCN-NEXT: buffer_store_dword v42, v2, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:608 ; 
4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:640 ; 4-byte Folded Reload ; GCN-NEXT: v_add_u32_e32 v1, 0xa0, v0 +; GCN-NEXT: v_add_u32_e32 v2, 0xa4, v0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v8, v15 -; GCN-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen ; GCN-NEXT: v_mov_b32_e32 v9, v16 -; GCN-NEXT: v_add_u32_e32 v1, 0xa4, v0 -; GCN-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen ; GCN-NEXT: v_mov_b32_e32 v10, v17 ; GCN-NEXT: v_add_u32_e32 v1, 0xa8, v0 +; GCN-NEXT: buffer_store_dword v9, v2, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen -; GCN-NEXT: v_mov_b32_e32 v11, v18 -; GCN-NEXT: v_add_u32_e32 v1, 0xac, v0 -; GCN-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0xb0, v0 +; GCN-NEXT: v_mov_b32_e32 v11, v18 +; GCN-NEXT: v_add_u32_e32 v2, 0xac, v0 +; GCN-NEXT: buffer_store_dword v11, v2, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v2, 0xb4, v0 ; GCN-NEXT: buffer_store_dword v47, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0xb4, v0 -; GCN-NEXT: buffer_store_dword v48, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0xb8, v0 +; GCN-NEXT: buffer_store_dword v48, v2, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v2, 0xbc, v0 ; GCN-NEXT: buffer_store_dword v49, v1, s[0:3], 0 offen -; GCN-NEXT: 
v_add_u32_e32 v1, 0xbc, v0 -; GCN-NEXT: buffer_store_dword v50, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:524 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:528 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:532 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:536 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:540 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:544 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:548 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:552 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:556 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:560 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:564 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload +; GCN-NEXT: buffer_store_dword v50, v2, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:524 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:528 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:532 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v12, 
off, s[0:3], s33 offset:536 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:540 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:544 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:548 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:552 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:556 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:560 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:564 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload +; GCN-NEXT: v_add_u32_e32 v2, 0xc4, v0 ; GCN-NEXT: v_add_u32_e32 v1, 0xc0, v0 ; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v8, v2, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v2, 0xcc, v0 ; GCN-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0xc4, v0 -; GCN-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0xc8, v0 +; GCN-NEXT: buffer_store_dword v10, v2, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v2, 4, v0 +; GCN-NEXT: v_add_u32_e32 v7, 8, v0 ; GCN-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0xcc, v0 -; GCN-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 4, v0 -; GCN-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 8, v0 -; GCN-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 12, v0 -; GCN-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v2, 12, v0 +; GCN-NEXT: 
buffer_store_dword v5, v7, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v6, v2, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:256 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 1, v2 -; GCN-NEXT: v_and_b32_e32 v1, 63, v1 -; GCN-NEXT: v_add_u32_e32 v3, 0xd0, v0 -; GCN-NEXT: v_add_u32_e32 v4, 0xd4, v0 -; GCN-NEXT: v_add_u32_e32 v5, 0xd8, v0 -; GCN-NEXT: v_add_u32_e32 v6, 0xdc, v0 -; GCN-NEXT: buffer_store_dword v51, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v52, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v53, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v6, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v3, 0xe0, v0 +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload +; GCN-NEXT: v_add_u32_e32 v1, 0xd0, v0 +; GCN-NEXT: v_add_u32_e32 v3, 0xd4, v0 +; GCN-NEXT: v_add_u32_e32 v4, 0xd8, v0 +; GCN-NEXT: v_add_u32_e32 v5, 0xdc, v0 +; GCN-NEXT: buffer_store_dword v51, v1, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v52, v3, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v53, v4, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v54, v5, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0xe0, v0 +; GCN-NEXT: v_add_u32_e32 v3, 0xe4, v0 +; GCN-NEXT: v_add_u32_e32 v4, 0xe8, v0 +; GCN-NEXT: v_add_u32_e32 v5, 0xec, v0 +; GCN-NEXT: v_add_u32_e32 v6, 0xf0, v0 +; GCN-NEXT: v_add_u32_e32 v7, 0xf4, v0 +; GCN-NEXT: v_add_u32_e32 v8, 0xf8, v0 +; GCN-NEXT: v_add_u32_e32 v9, 0xfc, v0 +; GCN-NEXT: buffer_store_dword v55, v1, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v56, v3, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v57, v4, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v58, v5, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v59, v6, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v60, v7, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v61, v8, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v62, v9, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(12) +; GCN-NEXT: v_lshrrev_b32_e32 v2, 1, v10 +; 
GCN-NEXT: v_and_b32_e32 v1, 63, v2 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v1 -; GCN-NEXT: v_add_u32_e32 v4, 0xe4, v0 -; GCN-NEXT: v_add_u32_e32 v5, 0xe8, v0 -; GCN-NEXT: v_add_u32_e32 v6, 0xec, v0 -; GCN-NEXT: v_add_u32_e32 v7, 0xf0, v0 -; GCN-NEXT: v_add_u32_e32 v8, 0xf4, v0 -; GCN-NEXT: v_add_u32_e32 v9, 0xf8, v0 -; GCN-NEXT: v_add_u32_e32 v10, 0xfc, v0 ; GCN-NEXT: v_add_u32_e32 v0, v0, v1 -; GCN-NEXT: buffer_store_dword v55, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v56, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v57, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v58, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v59, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v60, v8, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v61, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v62, v10, s[0:3], 0 offen ; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload @@ -563,7 +569,7 @@ ; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: v_and_b32_e32 v1, 1, v2 +; GCN-NEXT: v_and_b32_e32 v1, 1, v10 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GCN-NEXT: s_mov_b32 s33, s6 ; GCN-NEXT: s_waitcnt vmcnt(15) @@ -579,22 +585,9 @@ ; GCN-LABEL: v_extract_v32i64_varidx: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v15, v0 ; GCN-NEXT: s_add_u32 s4, s32, 0x3fc0 -; GCN-NEXT: s_mov_b32 s5, 0 ; GCN-NEXT: s_mov_b32 s6, s33 ; GCN-NEXT: s_and_b32 s33, s4, 0xffffc000 -; GCN-NEXT: s_movk_i32 s4, 0x80 -; GCN-NEXT: v_mov_b32_e32 v12, s5 -; GCN-NEXT: v_mov_b32_e32 v16, v1 -; GCN-NEXT: v_add_co_u32_e32 v31, vcc, 64, v15 -; GCN-NEXT: v_mov_b32_e32 v11, 
s4 -; GCN-NEXT: v_addc_co_u32_e32 v32, vcc, 0, v16, vcc -; GCN-NEXT: v_add_co_u32_e32 v48, vcc, v15, v11 -; GCN-NEXT: v_addc_co_u32_e32 v49, vcc, v16, v12, vcc -; GCN-NEXT: s_movk_i32 s4, 0xc0 -; GCN-NEXT: v_mov_b32_e32 v12, s5 -; GCN-NEXT: v_mov_b32_e32 v11, s4 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill @@ -610,8 +603,41 @@ ; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: v_mov_b32_e32 v15, v0 +; GCN-NEXT: v_mov_b32_e32 v16, v1 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill +; GCN-NEXT: global_load_dwordx4 v[0:3], v[15:16], off +; GCN-NEXT: s_mov_b32 s5, 0 +; GCN-NEXT: v_add_co_u32_e32 v31, vcc, 64, v15 +; GCN-NEXT: s_movk_i32 s4, 0x80 +; GCN-NEXT: v_addc_co_u32_e32 v32, vcc, 0, v16, vcc +; GCN-NEXT: s_add_u32 s32, s32, 0x10000 +; GCN-NEXT: s_sub_u32 s32, s32, 0x10000 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v8, off, 
s[0:3], s33 offset:548 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill +; GCN-NEXT: v_mov_b32_e32 v12, s5 +; GCN-NEXT: v_mov_b32_e32 v11, s4 +; GCN-NEXT: v_add_co_u32_e32 v48, vcc, v15, v11 +; GCN-NEXT: v_addc_co_u32_e32 v49, vcc, v16, v12, vcc +; GCN-NEXT: s_movk_i32 s4, 0xc0 +; GCN-NEXT: v_mov_b32_e32 v12, s5 +; GCN-NEXT: v_mov_b32_e32 v11, s4 ; GCN-NEXT: v_add_co_u32_e32 v59, vcc, v15, v11 -; GCN-NEXT: global_load_dwordx4 v[3:6], v[15:16], off ; GCN-NEXT: global_load_dwordx4 v[7:10], v[15:16], off offset:16 ; GCN-NEXT: v_addc_co_u32_e32 v60, vcc, v16, v12, vcc ; GCN-NEXT: global_load_dwordx4 v[11:14], v[15:16], off offset:32 @@ -623,215 +649,198 @@ ; GCN-NEXT: global_load_dwordx4 v[35:38], v[48:49], off ; GCN-NEXT: global_load_dwordx4 v[39:42], v[48:49], off offset:16 ; GCN-NEXT: global_load_dwordx4 v[43:46], v[48:49], off offset:32 +; GCN-NEXT: global_load_dwordx4 v[47:50], v[48:49], off offset:48 +; GCN-NEXT: global_load_dwordx4 v[3:6], v[59:60], off ; GCN-NEXT: v_lshrrev_b32_e64 v0, 6, s33 ; GCN-NEXT: v_add_u32_e32 v0, 0x100, v0 ; GCN-NEXT: v_add_u32_e32 v1, 16, v0 -; GCN-NEXT: s_add_u32 s32, s32, 0x10000 -; GCN-NEXT: s_sub_u32 s32, s32, 0x10000 +; GCN-NEXT: v_add_u32_e32 v2, 24, v0 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v35, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill -; 
GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill -; GCN-NEXT: global_load_dwordx4 v[47:50], v[48:49], off offset:48 -; GCN-NEXT: global_load_dwordx4 v[43:46], v[59:60], off -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill -; GCN-NEXT: 
buffer_store_dword v50, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill +; GCN-NEXT: 
buffer_store_dword v17, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:640 ; 4-byte Folded Spill ; GCN-NEXT: global_load_dwordx4 v[51:54], v[59:60], off offset:16 ; GCN-NEXT: global_load_dwordx4 v[55:58], v[59:60], off offset:32 ; GCN-NEXT: global_load_dwordx4 v[59:62], v[59:60], off offset:48 ; GCN-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 24, v0 -; GCN-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 20, v0 -; GCN-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 28, v0 -; GCN-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 36, v0 -; GCN-NEXT: buffer_store_dword v12, v1, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v9, v2, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v2, 20, v0 ; GCN-NEXT: v_add_u32_e32 v1, 44, v0 +; GCN-NEXT: v_add_u32_e32 v7, 28, v0 +; GCN-NEXT: v_add_u32_e32 v9, 36, v0 +; GCN-NEXT: buffer_store_dword v8, v2, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v10, v7, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v12, v9, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v14, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 32, v0 -; GCN-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 40, v0 +; GCN-NEXT: v_add_u32_e32 v3, 32, v0 ; GCN-NEXT: buffer_store_dword v13, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 48, v0 -; GCN-NEXT: buffer_store_dword v15, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v2, 48, v0 ; GCN-NEXT: v_add_u32_e32 v1, 56, v0 +; GCN-NEXT: buffer_store_dword v11, v3, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v4, 52, v0 +; GCN-NEXT: v_add_u32_e32 v5, 60, v0 +; GCN-NEXT: buffer_store_dword v15, v2, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v17, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 52, v0 -; GCN-NEXT: buffer_store_dword v16, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 60, v0 
-; GCN-NEXT: buffer_store_dword v18, v1, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v16, v4, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v18, v5, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0x44, v0 +; GCN-NEXT: v_add_u32_e32 v2, 0x4c, v0 ; GCN-NEXT: buffer_store_dword v20, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x4c, v0 -; GCN-NEXT: buffer_store_dword v22, v1, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v22, v2, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 64, v0 +; GCN-NEXT: v_add_u32_e32 v2, 0x48, v0 ; GCN-NEXT: buffer_store_dword v19, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x48, v0 -; GCN-NEXT: buffer_store_dword v21, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0x50, v0 +; GCN-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v23, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x58, v0 -; GCN-NEXT: buffer_store_dword v25, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x54, v0 -; GCN-NEXT: buffer_store_dword v24, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x5c, v0 -; GCN-NEXT: buffer_store_dword v26, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x64, v0 -; GCN-NEXT: buffer_store_dword v28, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x6c, v0 -; GCN-NEXT: buffer_store_dword v30, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v2, 0x58, v0 ; GCN-NEXT: v_add_u32_e32 v1, 0x60, v0 +; GCN-NEXT: v_add_u32_e32 v3, 0x54, v0 +; GCN-NEXT: v_add_u32_e32 v4, 0x5c, v0 +; GCN-NEXT: v_add_u32_e32 v5, 0x64, v0 +; GCN-NEXT: v_add_u32_e32 v6, 0x6c, v0 +; GCN-NEXT: buffer_store_dword v25, v2, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v24, v3, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v26, v4, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v28, v5, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v30, v6, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v2, 0x68, v0 ; GCN-NEXT: buffer_store_dword v27, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x68, v0 -; GCN-NEXT: 
buffer_store_dword v29, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0x70, v0 +; GCN-NEXT: buffer_store_dword v29, v2, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v7, 0x74, v0 +; GCN-NEXT: v_add_u32_e32 v8, 0x7c, v0 +; GCN-NEXT: v_add_u32_e32 v2, 0x78, v0 ; GCN-NEXT: buffer_store_dword v31, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x78, v0 -; GCN-NEXT: buffer_store_dword v33, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x74, v0 -; GCN-NEXT: buffer_store_dword v32, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x7c, v0 -; GCN-NEXT: buffer_store_dword v34, v1, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v33, v2, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v32, v7, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v34, v8, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0x84, v0 +; GCN-NEXT: v_add_u32_e32 v2, 0x8c, v0 ; GCN-NEXT: buffer_store_dword v36, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x8c, v0 -; GCN-NEXT: buffer_store_dword v38, v1, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v38, v2, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0x80, v0 +; GCN-NEXT: v_add_u32_e32 v2, 0x88, v0 ; GCN-NEXT: buffer_store_dword v35, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x88, v0 -; GCN-NEXT: buffer_store_dword v37, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0x90, v0 +; GCN-NEXT: buffer_store_dword v37, v2, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v39, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x98, v0 -; GCN-NEXT: buffer_store_dword v41, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x94, v0 -; GCN-NEXT: buffer_store_dword v40, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x9c, v0 -; GCN-NEXT: buffer_store_dword v42, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload 
-; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload -; GCN-NEXT: v_add_u32_e32 v1, 0xa4, v0 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v8, v15 -; GCN-NEXT: v_mov_b32_e32 v9, v16 -; GCN-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen -; GCN-NEXT: v_mov_b32_e32 v11, v18 -; GCN-NEXT: v_add_u32_e32 v1, 0xac, v0 -; GCN-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v2, 0x98, v0 ; GCN-NEXT: v_add_u32_e32 v1, 0xa0, v0 -; GCN-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen -; GCN-NEXT: v_mov_b32_e32 v10, v17 -; GCN-NEXT: v_add_u32_e32 v1, 0xa8, v0 -; GCN-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v3, 0x94, v0 +; GCN-NEXT: v_add_u32_e32 v4, 0x9c, v0 +; GCN-NEXT: v_add_u32_e32 v5, 0xa4, v0 +; GCN-NEXT: v_add_u32_e32 v6, 0xac, v0 +; GCN-NEXT: buffer_store_dword v41, v2, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v40, v3, s[0:3], 0 
offen +; GCN-NEXT: buffer_store_dword v42, v4, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v44, v5, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v46, v6, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v2, 0xa8, v0 +; GCN-NEXT: buffer_store_dword v43, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0xb0, v0 +; GCN-NEXT: buffer_store_dword v45, v2, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v7, 0xb4, v0 +; GCN-NEXT: v_add_u32_e32 v8, 0xbc, v0 +; GCN-NEXT: v_add_u32_e32 v2, 0xb8, v0 ; GCN-NEXT: buffer_store_dword v47, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0xb8, v0 -; GCN-NEXT: buffer_store_dword v49, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0xb4, v0 -; GCN-NEXT: buffer_store_dword v48, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0xbc, v0 -; GCN-NEXT: buffer_store_dword v50, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:524 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:528 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:532 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:536 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:540 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:544 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:548 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:552 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:556 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:560 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword 
v20, off, s[0:3], s33 offset:564 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload +; GCN-NEXT: buffer_store_dword v49, v2, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v48, v7, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v50, v8, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:640 ; 4-byte Folded Reload +; GCN-NEXT: v_add_u32_e32 v2, 0xc8, v0 ; GCN-NEXT: v_add_u32_e32 v1, 0xc0, v0 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0xc8, v0 -; 
GCN-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0xc4, v0 -; GCN-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0xcc, v0 -; GCN-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 8, v0 -; GCN-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 4, v0 -; GCN-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 12, v0 -; GCN-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:256 +; GCN-NEXT: v_add_u32_e32 v7, 0xec, v0 +; GCN-NEXT: v_add_u32_e32 v8, 0xf4, v0 +; GCN-NEXT: v_mov_b32_e32 v12, v6 +; GCN-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen +; GCN-NEXT: v_mov_b32_e32 v10, v4 +; GCN-NEXT: v_add_u32_e32 v2, 0xc4, v0 +; GCN-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GCN-NEXT: v_mov_b32_e32 v9, v3 +; GCN-NEXT: v_mov_b32_e32 v11, v5 +; GCN-NEXT: v_add_u32_e32 v3, 0xcc, v0 +; GCN-NEXT: buffer_store_dword v10, v2, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v12, v3, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:524 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:528 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:532 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:536 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:540 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:544 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:548 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:552 ; 4-byte Folded Reload +; GCN-NEXT: 
buffer_load_dword v20, off, s[0:3], s33 offset:556 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:560 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:564 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload +; GCN-NEXT: v_add_u32_e32 v2, 8, v0 ; GCN-NEXT: v_add_u32_e32 v1, 0xd0, v0 -; GCN-NEXT: v_add_u32_e32 v4, 0xd8, v0 -; GCN-NEXT: buffer_store_dword v51, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v53, v4, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v3, 0xd4, v0 -; GCN-NEXT: v_add_u32_e32 v6, 0xe0, v0 -; GCN-NEXT: v_add_u32_e32 v1, 0xf4, v0 -; GCN-NEXT: v_add_u32_e32 v4, 0xf8, v0 +; GCN-NEXT: v_add_u32_e32 v3, 12, v0 +; GCN-NEXT: v_add_u32_e32 v4, 0xd4, v0 ; GCN-NEXT: v_add_u32_e32 v5, 0xdc, v0 -; GCN-NEXT: v_add_u32_e32 v7, 0xe4, v0 -; GCN-NEXT: v_add_u32_e32 v8, 0xe8, v0 -; GCN-NEXT: v_add_u32_e32 v10, 0xf0, v0 -; GCN-NEXT: buffer_store_dword v55, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v57, v8, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v59, v10, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v61, v4, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v9, 0xec, v0 -; GCN-NEXT: v_add_u32_e32 v4, 0xfc, v0 -; GCN-NEXT: buffer_store_dword v52, v3, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v6, 0xe4, v0 +; GCN-NEXT: v_add_u32_e32 v9, 0xfc, v0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v12, v2, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v2, 4, v0 +; GCN-NEXT: buffer_store_dword v11, v2, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v13, v3, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:256 +; GCN-NEXT: v_add_u32_e32 v2, 0xd8, v0 +; GCN-NEXT: v_add_u32_e32 v3, 0xe0, v0 +; GCN-NEXT: buffer_store_dword 
v51, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0xe8, v0 +; GCN-NEXT: buffer_store_dword v53, v2, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v55, v3, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v2, 0xf0, v0 +; GCN-NEXT: v_add_u32_e32 v3, 0xf8, v0 +; GCN-NEXT: buffer_store_dword v57, v1, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v59, v2, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v61, v3, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v52, v4, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v54, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v56, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v58, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v60, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v62, v4, s[0:3], 0 offen -; GCN-NEXT: v_and_b32_e32 v1, 31, v2 +; GCN-NEXT: buffer_store_dword v56, v6, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v58, v7, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v60, v8, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v62, v9, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 31, v1 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GCN-NEXT: v_add_u32_e32 v0, v0, v1 ; GCN-NEXT: v_add_u32_e32 v1, 4, v0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll @@ -25,15 +25,15 @@ ; GCN-NEXT: v_mov_b32_e32 v6, s15 ; GCN-NEXT: v_mov_b32_e32 v8, s16 ; GCN-NEXT: v_mov_b32_e32 v10, s17 +; GCN-NEXT: v_mov_b32_e32 v12, s18 +; GCN-NEXT: v_mov_b32_e32 v14, s19 ; GCN-NEXT: s_movk_i32 s5, 0x60 ; GCN-NEXT: v_add_u32_e32 v2, 8, v0 ; GCN-NEXT: v_add_u32_e32 v3, 12, v0 ; GCN-NEXT: v_add_u32_e32 v7, 16, v0 ; GCN-NEXT: v_add_u32_e32 v9, 20, v0 ; GCN-NEXT: v_add_u32_e32 v11, 
24, v0 -; GCN-NEXT: v_mov_b32_e32 v12, s18 ; GCN-NEXT: v_add_u32_e32 v13, 28, v0 -; GCN-NEXT: v_mov_b32_e32 v14, s19 ; GCN-NEXT: v_add_u32_e32 v15, 32, v0 ; GCN-NEXT: v_mov_b32_e32 v16, s20 ; GCN-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen @@ -71,7 +71,7 @@ ; GCN-NEXT: buffer_store_dword v28, v27, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v30, v29, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v32, v31, s[0:3], 0 offen -; GCN-NEXT: s_movk_i32 s10, 0x70 +; GCN-NEXT: s_movk_i32 s13, 0x70 ; GCN-NEXT: v_add_u32_e32 v35, 0x48, v0 ; GCN-NEXT: v_mov_b32_e32 v36, s70 ; GCN-NEXT: v_add_u32_e32 v37, 0x4c, v0 @@ -96,19 +96,19 @@ ; GCN-NEXT: v_add_u32_e32 v26, 0x64, v0 ; GCN-NEXT: v_mov_b32_e32 v14, s77 ; GCN-NEXT: v_mov_b32_e32 v4, s81 -; GCN-NEXT: s_movk_i32 s11, 0x90 -; GCN-NEXT: s_movk_i32 s13, 0xa0 +; GCN-NEXT: s_movk_i32 s14, 0x90 +; GCN-NEXT: s_movk_i32 s15, 0xa0 ; GCN-NEXT: v_add_u32_e32 v28, 0x68, v0 ; GCN-NEXT: v_mov_b32_e32 v16, s78 ; GCN-NEXT: v_add_u32_e32 v30, 0x6c, v0 ; GCN-NEXT: v_mov_b32_e32 v18, s79 +; GCN-NEXT: v_add_u32_e32 v32, s13, v0 ; GCN-NEXT: v_mov_b32_e32 v20, s80 -; GCN-NEXT: v_mov_b32_e32 v5, s82 -; GCN-NEXT: v_mov_b32_e32 v6, s83 -; GCN-NEXT: v_add_u32_e32 v32, s10, v0 ; GCN-NEXT: v_add_u32_e32 v34, 0x74, v0 ; GCN-NEXT: v_add_u32_e32 v36, 0x78, v0 +; GCN-NEXT: v_mov_b32_e32 v5, s82 ; GCN-NEXT: v_add_u32_e32 v43, 0x7c, v0 +; GCN-NEXT: v_mov_b32_e32 v6, s83 ; GCN-NEXT: v_add_u32_e32 v44, 0x80, v0 ; GCN-NEXT: v_mov_b32_e32 v8, s52 ; GCN-NEXT: buffer_store_dword v14, v26, s[0:3], 0 offen @@ -121,12 +121,12 @@ ; GCN-NEXT: buffer_store_dword v8, v44, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v45, 0x84, v0 ; GCN-NEXT: v_mov_b32_e32 v4, s53 -; GCN-NEXT: s_movk_i32 s14, 0xb0 +; GCN-NEXT: s_movk_i32 s16, 0xb0 ; GCN-NEXT: v_add_u32_e32 v46, 0x88, v0 ; GCN-NEXT: v_mov_b32_e32 v5, s54 ; GCN-NEXT: v_add_u32_e32 v47, 0x8c, v0 ; GCN-NEXT: v_mov_b32_e32 v6, s55 -; GCN-NEXT: v_add_u32_e32 v48, s11, v0 +; GCN-NEXT: v_add_u32_e32 v48, s14, v0 ; GCN-NEXT: 
v_mov_b32_e32 v8, s56 ; GCN-NEXT: v_add_u32_e32 v49, 0x94, v0 ; GCN-NEXT: v_mov_b32_e32 v10, s57 @@ -134,7 +134,7 @@ ; GCN-NEXT: v_mov_b32_e32 v12, s58 ; GCN-NEXT: v_add_u32_e32 v51, 0x9c, v0 ; GCN-NEXT: v_mov_b32_e32 v14, s59 -; GCN-NEXT: v_add_u32_e32 v52, s13, v0 +; GCN-NEXT: v_add_u32_e32 v52, s15, v0 ; GCN-NEXT: v_mov_b32_e32 v16, s60 ; GCN-NEXT: buffer_store_dword v4, v45, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v5, v46, s[0:3], 0 offen @@ -146,13 +146,13 @@ ; GCN-NEXT: buffer_store_dword v16, v52, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v53, 0xa4, v0 ; GCN-NEXT: v_mov_b32_e32 v4, s61 -; GCN-NEXT: s_movk_i32 s15, 0xd0 -; GCN-NEXT: s_movk_i32 s16, 0xe0 +; GCN-NEXT: s_movk_i32 s17, 0xd0 +; GCN-NEXT: s_movk_i32 s18, 0xe0 ; GCN-NEXT: v_add_u32_e32 v54, 0xa8, v0 ; GCN-NEXT: v_mov_b32_e32 v5, s62 ; GCN-NEXT: v_add_u32_e32 v55, 0xac, v0 ; GCN-NEXT: v_mov_b32_e32 v6, s63 -; GCN-NEXT: v_add_u32_e32 v56, s14, v0 +; GCN-NEXT: v_add_u32_e32 v56, s16, v0 ; GCN-NEXT: v_mov_b32_e32 v8, s64 ; GCN-NEXT: v_add_u32_e32 v57, 0xb4, v0 ; GCN-NEXT: v_mov_b32_e32 v10, s65 @@ -173,12 +173,12 @@ ; GCN-NEXT: v_add_u32_e32 v61, 0xc4, v0 ; GCN-NEXT: v_mov_b32_e32 v4, s37 ; GCN-NEXT: s_and_b32 s7, s7, 63 -; GCN-NEXT: s_movk_i32 s17, 0xf0 +; GCN-NEXT: s_movk_i32 s19, 0xf0 ; GCN-NEXT: v_add_u32_e32 v62, 0xc8, v0 ; GCN-NEXT: v_mov_b32_e32 v5, s38 ; GCN-NEXT: v_add_u32_e32 v63, 0xcc, v0 ; GCN-NEXT: v_mov_b32_e32 v6, s39 -; GCN-NEXT: v_add_u32_e32 v64, s15, v0 +; GCN-NEXT: v_add_u32_e32 v64, s17, v0 ; GCN-NEXT: v_mov_b32_e32 v8, s40 ; GCN-NEXT: v_add_u32_e32 v65, 0xd4, v0 ; GCN-NEXT: v_mov_b32_e32 v10, s41 @@ -186,7 +186,7 @@ ; GCN-NEXT: v_mov_b32_e32 v12, s42 ; GCN-NEXT: v_add_u32_e32 v67, 0xdc, v0 ; GCN-NEXT: v_mov_b32_e32 v14, s43 -; GCN-NEXT: v_add_u32_e32 v68, s16, v0 +; GCN-NEXT: v_add_u32_e32 v68, s18, v0 ; GCN-NEXT: v_mov_b32_e32 v16, s44 ; GCN-NEXT: buffer_store_dword v4, v61, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v5, v62, s[0:3], 0 offen @@ -202,7 +202,7 @@ ; 
GCN-NEXT: v_mov_b32_e32 v5, s46 ; GCN-NEXT: v_add_u32_e32 v71, 0xec, v0 ; GCN-NEXT: v_mov_b32_e32 v6, s47 -; GCN-NEXT: v_add_u32_e32 v72, s17, v0 +; GCN-NEXT: v_add_u32_e32 v72, s19, v0 ; GCN-NEXT: v_mov_b32_e32 v8, s48 ; GCN-NEXT: v_add_u32_e32 v73, 0xf4, v0 ; GCN-NEXT: v_mov_b32_e32 v10, s49 @@ -217,9 +217,9 @@ ; GCN-NEXT: v_mov_b32_e32 v4, s12 ; GCN-NEXT: s_lshl_b32 s7, s7, 2 ; GCN-NEXT: v_add_u32_e32 v75, 0xfc, v0 -; GCN-NEXT: v_mov_b32_e32 v5, s51 +; GCN-NEXT: v_mov_b32_e32 v14, s51 ; GCN-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:256 -; GCN-NEXT: buffer_store_dword v5, v75, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v14, v75, s[0:3], 0 offen ; GCN-NEXT: v_mov_b32_e32 v4, s6 ; GCN-NEXT: v_add_u32_e32 v0, s7, v0 ; GCN-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen @@ -289,78 +289,78 @@ ; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:256 ; GCN-NEXT: s_add_u32 s6, s8, 16 ; GCN-NEXT: s_addc_u32 s7, s9, 0 -; GCN-NEXT: v_mov_b32_e32 v65, s9 ; GCN-NEXT: v_mov_b32_e32 v67, s7 ; GCN-NEXT: v_mov_b32_e32 v66, s6 ; GCN-NEXT: s_add_u32 s6, s8, 32 -; GCN-NEXT: v_mov_b32_e32 v64, s8 ; GCN-NEXT: s_addc_u32 s7, s9, 0 +; GCN-NEXT: v_mov_b32_e32 v65, s9 +; GCN-NEXT: s_add_u32 s10, s8, 48 +; GCN-NEXT: v_mov_b32_e32 v64, s8 +; GCN-NEXT: s_addc_u32 s11, s9, 0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: global_store_dwordx4 v[64:65], v[0:3], off ; GCN-NEXT: global_store_dwordx4 v[66:67], v[4:7], off ; GCN-NEXT: v_mov_b32_e32 v0, s6 ; GCN-NEXT: v_mov_b32_e32 v1, s7 -; GCN-NEXT: s_add_u32 s6, s8, 48 -; GCN-NEXT: s_addc_u32 s7, s9, 0 -; GCN-NEXT: v_mov_b32_e32 v2, s6 -; GCN-NEXT: v_mov_b32_e32 v3, s7 ; GCN-NEXT: s_add_u32 s6, s8, 64 +; GCN-NEXT: v_mov_b32_e32 v2, s10 +; GCN-NEXT: s_addc_u32 s7, s9, 0 +; GCN-NEXT: v_mov_b32_e32 v3, s11 +; GCN-NEXT: s_add_u32 s10, s8, s4 +; GCN-NEXT: s_addc_u32 s11, s9, 0 +; GCN-NEXT: s_add_u32 s4, s8, s5 ; GCN-NEXT: global_store_dwordx4 v[0:1], v[8:11], off ; GCN-NEXT: global_store_dwordx4 v[2:3], v[12:15], off -; GCN-NEXT: 
s_addc_u32 s7, s9, 0 ; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: s_addc_u32 s5, s9, 0 ; GCN-NEXT: v_mov_b32_e32 v1, s7 -; GCN-NEXT: s_add_u32 s6, s8, s4 -; GCN-NEXT: s_addc_u32 s7, s9, 0 -; GCN-NEXT: s_add_u32 s4, s8, s5 -; GCN-NEXT: v_mov_b32_e32 v2, s6 -; GCN-NEXT: v_mov_b32_e32 v3, s7 +; GCN-NEXT: s_add_u32 s6, s8, s13 +; GCN-NEXT: v_mov_b32_e32 v2, s10 +; GCN-NEXT: v_mov_b32_e32 v3, s11 ; GCN-NEXT: global_store_dwordx4 v[0:1], v[16:19], off ; GCN-NEXT: global_store_dwordx4 v[2:3], v[20:23], off -; GCN-NEXT: s_addc_u32 s5, s9, 0 +; GCN-NEXT: s_addc_u32 s7, s9, 0 ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: s_add_u32 s4, s8, s10 -; GCN-NEXT: s_addc_u32 s5, s9, 0 -; GCN-NEXT: v_mov_b32_e32 v2, s4 -; GCN-NEXT: v_mov_b32_e32 v3, s5 ; GCN-NEXT: s_add_u32 s4, s8, 0x80 +; GCN-NEXT: v_mov_b32_e32 v2, s6 +; GCN-NEXT: s_addc_u32 s5, s9, 0 +; GCN-NEXT: v_mov_b32_e32 v3, s7 +; GCN-NEXT: s_add_u32 s6, s8, s14 ; GCN-NEXT: global_store_dwordx4 v[0:1], v[24:27], off ; GCN-NEXT: global_store_dwordx4 v[2:3], v[28:31], off -; GCN-NEXT: s_addc_u32 s5, s9, 0 +; GCN-NEXT: s_addc_u32 s7, s9, 0 ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: s_add_u32 s4, s8, s11 +; GCN-NEXT: s_add_u32 s4, s8, s15 +; GCN-NEXT: v_mov_b32_e32 v2, s6 ; GCN-NEXT: s_addc_u32 s5, s9, 0 -; GCN-NEXT: v_mov_b32_e32 v2, s4 -; GCN-NEXT: v_mov_b32_e32 v3, s5 -; GCN-NEXT: s_add_u32 s4, s8, s13 +; GCN-NEXT: v_mov_b32_e32 v3, s7 +; GCN-NEXT: s_add_u32 s6, s8, s16 ; GCN-NEXT: global_store_dwordx4 v[0:1], v[32:35], off ; GCN-NEXT: global_store_dwordx4 v[2:3], v[36:39], off -; GCN-NEXT: s_addc_u32 s5, s9, 0 +; GCN-NEXT: s_addc_u32 s7, s9, 0 ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: s_add_u32 s4, s8, s14 -; GCN-NEXT: s_addc_u32 s5, s9, 0 -; GCN-NEXT: v_mov_b32_e32 v2, s4 -; GCN-NEXT: v_mov_b32_e32 v3, s5 ; GCN-NEXT: s_add_u32 s4, s8, 0xc0 +; GCN-NEXT: v_mov_b32_e32 v2, s6 +; GCN-NEXT: v_mov_b32_e32 v3, s7 ; GCN-NEXT: 
global_store_dwordx4 v[0:1], v[40:43], off ; GCN-NEXT: global_store_dwordx4 v[2:3], v[44:47], off ; GCN-NEXT: s_addc_u32 s5, s9, 0 ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: s_add_u32 s4, s8, s15 +; GCN-NEXT: s_add_u32 s4, s8, s17 ; GCN-NEXT: s_addc_u32 s5, s9, 0 ; GCN-NEXT: v_mov_b32_e32 v2, s4 ; GCN-NEXT: v_mov_b32_e32 v3, s5 -; GCN-NEXT: s_add_u32 s4, s8, s16 +; GCN-NEXT: s_add_u32 s4, s8, s18 ; GCN-NEXT: global_store_dwordx4 v[0:1], v[48:51], off ; GCN-NEXT: global_store_dwordx4 v[2:3], v[52:55], off ; GCN-NEXT: s_addc_u32 s5, s9, 0 ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: s_add_u32 s4, s8, s17 +; GCN-NEXT: s_add_u32 s4, s8, s19 ; GCN-NEXT: s_addc_u32 s5, s9, 0 ; GCN-NEXT: v_mov_b32_e32 v2, s4 ; GCN-NEXT: v_mov_b32_e32 v3, s5 diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll --- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll @@ -744,13 +744,13 @@ ; GCN-LABEL: {{^}}tail_call_byval_align16: ; GCN-NOT: s32 -; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:12 -; GCN: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:8 +; GCN: buffer_load_dword v33, off, s[0:3], s32 offset:12 ; GCN: s_getpc_b64 -; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:4 -; GCN: buffer_store_dword v33, off, s[0:3], s32{{$}} +; GCN: buffer_store_dword v33, off, s[0:3], s32 offset:4 +; GCN: buffer_store_dword v32, off, s[0:3], s32{{$}} ; GCN-NOT: s32 ; GCN: s_setpc_b64 define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 { diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll --- a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll @@ -624,11 +624,10 @@ ; FIXEDABI: v_mov_b32_e32 
[[K0:v[0-9]+]], 0x3e7 -; FIXEDABI: buffer_store_dword [[K0]], off, s[0:3], 0 offset:4{{$}} - ; FIXEDABI: s_movk_i32 s32, 0x400{{$}} - ; FIXEDABI: v_mov_b32_e32 [[K1:v[0-9]+]], 0x140 +; FIXEDABI: buffer_store_dword [[K0]], off, s[0:3], 0 offset:4{{$}} + ; FIXEDABI: buffer_store_dword [[K1]], off, s[0:3], s32{{$}} ; FIXME: Why this reload? @@ -670,9 +669,8 @@ ; FIXED-ABI-NOT: v31 ; FIXEDABI: v_mov_b32_e32 [[K0:v[0-9]+]], 0x3e7{{$}} -; FIXEDABI: buffer_store_dword [[K0]], off, s[0:3], s33{{$}} - ; FIXEDABI: v_mov_b32_e32 [[K1:v[0-9]+]], 0x140{{$}} +; FIXEDABI: buffer_store_dword [[K0]], off, s[0:3], s33{{$}} ; FIXEDABI: buffer_store_dword [[K1]], off, s[0:3], s32{{$}} ; FIXEDABI: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s33{{$}} diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll --- a/llvm/test/CodeGen/AMDGPU/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/fshr.ll @@ -1364,11 +1364,11 @@ ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v2 -; GFX9-NEXT: buffer_store_byte_d16_hi v2, v0, s[0:3], 0 offen offset:5 -; GFX9-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:4 ; GFX9-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:3 ; GFX9-NEXT: buffer_store_byte_d16_hi v1, v0, s[0:3], 0 offen offset:2 ; GFX9-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; GFX9-NEXT: buffer_store_byte_d16_hi v2, v0, s[0:3], 0 offen offset:5 +; GFX9-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/half.ll b/llvm/test/CodeGen/AMDGPU/half.ll --- a/llvm/test/CodeGen/AMDGPU/half.ll +++ b/llvm/test/CodeGen/AMDGPU/half.ll @@ -312,6 +312,7 @@ ; SI: v_cvt_f32_f16_e32 ; SI: v_cvt_f32_f16_e32 ; SI: v_cvt_f32_f16_e32 +; SI: v_cvt_f32_f16_e32 ; GCN: flat_store_dwordx4 @@ -325,7 +326,6 @@ ; SI: v_cvt_f32_f16_e32 ; SI: v_cvt_f32_f16_e32 ; SI: v_cvt_f32_f16_e32 -; SI: 
v_cvt_f32_f16_e32 ; VI: v_cvt_f32_f16_e32 ; VI: v_cvt_f32_f16_sdwa diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign.ll b/llvm/test/CodeGen/AMDGPU/stack-realign.ll --- a/llvm/test/CodeGen/AMDGPU/stack-realign.ll +++ b/llvm/test/CodeGen/AMDGPU/stack-realign.ll @@ -160,16 +160,16 @@ ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: v_writelane_b32 [[VGPR_REG]], s33, 2 ; GCN-NEXT: v_writelane_b32 [[VGPR_REG]], s34, 3 +; GCN: s_mov_b32 s34, s32 ; GCN: s_add_u32 [[SCRATCH_REG:s[0-9]+]], s32, 0xffc0 ; GCN: s_and_b32 s33, [[SCRATCH_REG]], 0xffff0000 - -; GCN: s_mov_b32 s34, s32 -; GCN-NEXT: v_mov_b32_e32 v32, 0 - -; GCN: buffer_store_dword v32, off, s[0:3], s33 offset:1024 ; GCN-NEXT: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s34 ; GCN-NEXT: s_add_u32 s32, s32, 0x30000 +; GCN: v_mov_b32_e32 v33, 0 + +; GCN: buffer_store_dword v33, off, s[0:3], s33 offset:1024 + ; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll --- a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll +++ b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll @@ -7,11 +7,11 @@ ; CIVI-LABEL: local_store_i56: ; CIVI: ; %bb.0: ; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CIVI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; CIVI-NEXT: s_mov_b32 m0, -1 -; CIVI-NEXT: ds_write_b8 v0, v3 offset:6 -; CIVI-NEXT: ds_write_b16 v0, v2 offset:4 ; CIVI-NEXT: ds_write_b32 v0, v1 +; CIVI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; CIVI-NEXT: ds_write_b16 v0, v2 offset:4 +; CIVI-NEXT: ds_write_b8 v0, v1 offset:6 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: s_setpc_b64 s[30:31] ;