diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -283,7 +283,6 @@ ScheduleDAGMILive *DAG = new GCNScheduleDAGMILive(C, std::make_unique(C)); DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI)); - DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); DAG->addMutation(createAMDGPUMacroFusionDAGMutation()); DAG->addMutation(createAMDGPUExportClusteringDAGMutation()); return DAG; @@ -294,7 +293,6 @@ auto DAG = new GCNIterativeScheduler(C, GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY); DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI)); - DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); return DAG; } @@ -308,7 +306,6 @@ auto DAG = new GCNIterativeScheduler(C, GCNIterativeScheduler::SCHEDULE_ILP); DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI)); - DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); DAG->addMutation(createAMDGPUMacroFusionDAGMutation()); return DAG; } @@ -604,7 +601,6 @@ createMachineScheduler(MachineSchedContext *C) const override { ScheduleDAGMILive *DAG = createGenericSchedLive(C); DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI)); - DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); return DAG; } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll @@ -40,7 +40,6 @@ ; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-NEXT: v_add_co_u32_e32 v59, vcc, v15, v11 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill ; GCN-NEXT: global_load_dwordx4 v[3:6], v[15:16], off ; GCN-NEXT: global_load_dwordx4 v[7:10], v[15:16], off offset:16 ; GCN-NEXT: v_addc_co_u32_e32 v60, vcc, v16, v12, vcc @@ -56,214 +55,212 @@ ; GCN-NEXT: v_lshrrev_b32_e64 v0, 6, s33 ; GCN-NEXT: v_add_u32_e32 v0, 0x100, v0 ; GCN-NEXT: v_add_u32_e32 v1, 16, v0 -; GCN-NEXT: v_add_u32_e32 v2, 20, v0 ; GCN-NEXT: s_add_u32 s32, s32, 0x10000 ; GCN-NEXT: s_sub_u32 s32, s32, 0x10000 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v35, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:640 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v35, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill ; GCN-NEXT: global_load_dwordx4 v[47:50], v[48:49], off offset:48 ; GCN-NEXT: global_load_dwordx4 v[43:46], v[59:60], off ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill ; GCN-NEXT: global_load_dwordx4 v[51:54], v[59:60], off offset:16 ; GCN-NEXT: global_load_dwordx4 v[55:58], v[59:60], off offset:32 ; GCN-NEXT: global_load_dwordx4 v[59:62], v[59:60], off offset:48 ; GCN-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 20, v0 +; GCN-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 24, v0 -; GCN-NEXT: buffer_store_dword v8, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 28, v0 ; GCN-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v2, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 28, v0 +; GCN-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 32, v0 -; GCN-NEXT: v_add_u32_e32 v2, 36, v0 ; GCN-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 36, v0 +; GCN-NEXT: buffer_store_dword v12, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 40, v0 -; GCN-NEXT: buffer_store_dword v12, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 44, v0 ; GCN-NEXT: buffer_store_dword v13, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 44, v0 +; GCN-NEXT: buffer_store_dword v14, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 48, v0 -; GCN-NEXT: buffer_store_dword v14, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 52, v0 ; GCN-NEXT: buffer_store_dword v15, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 52, v0 +; GCN-NEXT: buffer_store_dword v16, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 56, v0 -; GCN-NEXT: buffer_store_dword v16, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 60, v0 ; GCN-NEXT: buffer_store_dword v17, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v2, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 60, v0 +; GCN-NEXT: buffer_store_dword v18, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 64, v0 -; GCN-NEXT: v_add_u32_e32 v2, 0x44, v0 ; GCN-NEXT: buffer_store_dword v19, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0x44, v0 +; GCN-NEXT: buffer_store_dword v20, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0x48, v0 -; GCN-NEXT: buffer_store_dword v20, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 0x4c, v0 ; GCN-NEXT: buffer_store_dword v21, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0x4c, v0 +; GCN-NEXT: buffer_store_dword v22, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0x50, v0 -; GCN-NEXT: buffer_store_dword v22, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 0x54, v0 ; GCN-NEXT: buffer_store_dword v23, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0x54, v0 +; GCN-NEXT: buffer_store_dword v24, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0x58, v0 -; GCN-NEXT: buffer_store_dword v24, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 0x5c, v0 ; GCN-NEXT: buffer_store_dword v25, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v2, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0x5c, v0 +; GCN-NEXT: buffer_store_dword v26, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0x60, v0 -; GCN-NEXT: v_add_u32_e32 v2, 0x64, v0 ; GCN-NEXT: buffer_store_dword v27, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0x64, v0 +; GCN-NEXT: buffer_store_dword v28, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0x68, v0 -; GCN-NEXT: buffer_store_dword v28, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 0x6c, v0 ; GCN-NEXT: buffer_store_dword v29, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0x6c, v0 +; GCN-NEXT: buffer_store_dword v30, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0x70, v0 -; GCN-NEXT: buffer_store_dword v30, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 0x74, v0 ; GCN-NEXT: buffer_store_dword v31, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0x74, v0 +; GCN-NEXT: buffer_store_dword v32, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0x78, v0 -; GCN-NEXT: buffer_store_dword v32, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 0x7c, v0 ; GCN-NEXT: buffer_store_dword v33, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v34, v2, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0x7c, v0 +; GCN-NEXT: buffer_store_dword v34, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0x80, v0 -; GCN-NEXT: v_add_u32_e32 v2, 0x84, v0 ; GCN-NEXT: buffer_store_dword v35, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0x84, v0 +; GCN-NEXT: buffer_store_dword v36, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0x88, v0 -; GCN-NEXT: buffer_store_dword v36, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 0x8c, v0 ; GCN-NEXT: buffer_store_dword v37, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0x8c, v0 +; GCN-NEXT: buffer_store_dword v38, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0x90, v0 -; GCN-NEXT: buffer_store_dword v38, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 0x94, v0 ; GCN-NEXT: buffer_store_dword v39, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0x94, v0 +; GCN-NEXT: buffer_store_dword v40, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0x98, v0 -; GCN-NEXT: buffer_store_dword v40, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 0x9c, v0 ; GCN-NEXT: buffer_store_dword v41, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v42, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:640 ; 4-byte Folded Reload +; GCN-NEXT: v_add_u32_e32 v1, 0x9c, v0 +; GCN-NEXT: buffer_store_dword v42, v1, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload ; GCN-NEXT: v_add_u32_e32 v1, 0xa0, v0 -; GCN-NEXT: v_add_u32_e32 v2, 0xa4, v0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v8, v15 -; GCN-NEXT: v_mov_b32_e32 v9, v16 ; GCN-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen +; GCN-NEXT: v_mov_b32_e32 v9, v16 +; GCN-NEXT: v_add_u32_e32 v1, 0xa4, v0 +; GCN-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen ; GCN-NEXT: v_mov_b32_e32 v10, v17 ; GCN-NEXT: v_add_u32_e32 v1, 0xa8, v0 -; GCN-NEXT: buffer_store_dword v9, v2, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0xb0, v0 ; GCN-NEXT: v_mov_b32_e32 v11, v18 -; GCN-NEXT: v_add_u32_e32 v2, 0xac, v0 -; GCN-NEXT: buffer_store_dword v11, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 0xb4, v0 +; GCN-NEXT: v_add_u32_e32 v1, 0xac, v0 +; GCN-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0xb0, v0 ; GCN-NEXT: buffer_store_dword v47, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0xb4, v0 +; GCN-NEXT: buffer_store_dword v48, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0xb8, v0 -; GCN-NEXT: buffer_store_dword v48, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 0xbc, v0 ; GCN-NEXT: buffer_store_dword v49, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v50, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:524 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:528 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:532 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:536 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:540 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:544 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:548 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:552 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:556 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:560 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:564 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload -; GCN-NEXT: v_add_u32_e32 v2, 0xc4, v0 +; GCN-NEXT: v_add_u32_e32 v1, 0xbc, v0 +; GCN-NEXT: buffer_store_dword v50, v1, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:524 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:528 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:532 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:536 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:540 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:544 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:548 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:552 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:556 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:560 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:564 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload ; GCN-NEXT: v_add_u32_e32 v1, 0xc0, v0 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v8, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 0xcc, v0 ; GCN-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0xc4, v0 +; GCN-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0xc8, v0 -; GCN-NEXT: buffer_store_dword v10, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 4, v0 -; GCN-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0xd0, v0 -; GCN-NEXT: v_add_u32_e32 v7, 8, v0 -; GCN-NEXT: v_add_u32_e32 v2, 12, v0 -; GCN-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v2, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0xcc, v0 +; GCN-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 4, v0 +; GCN-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 8, v0 +; GCN-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 12, v0 +; GCN-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:256 -; GCN-NEXT: v_add_u32_e32 v2, 0xd4, v0 -; GCN-NEXT: v_add_u32_e32 v3, 0xd8, v0 -; GCN-NEXT: v_add_u32_e32 v4, 0xdc, v0 +; GCN-NEXT: v_add_u32_e32 v1, 0xd0, v0 ; GCN-NEXT: buffer_store_dword v51, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v52, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v53, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v4, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0xd4, v0 +; GCN-NEXT: buffer_store_dword v52, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0xd8, v0 +; GCN-NEXT: buffer_store_dword v53, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0xdc, v0 +; GCN-NEXT: buffer_store_dword v54, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0xe0, v0 -; GCN-NEXT: v_add_u32_e32 v2, 0xe4, v0 -; GCN-NEXT: v_add_u32_e32 v3, 0xe8, v0 -; GCN-NEXT: v_add_u32_e32 v4, 0xec, v0 -; GCN-NEXT: v_add_u32_e32 v5, 0xf0, v0 -; GCN-NEXT: v_add_u32_e32 v6, 0xf4, v0 -; GCN-NEXT: v_add_u32_e32 v7, 0xf8, v0 -; GCN-NEXT: v_add_u32_e32 v8, 0xfc, v0 ; GCN-NEXT: buffer_store_dword v55, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v56, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v57, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v58, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v59, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v60, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v61, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v62, v8, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 63, v1 +; GCN-NEXT: v_add_u32_e32 v1, 0xe4, v0 +; GCN-NEXT: buffer_store_dword v56, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0xe8, v0 +; GCN-NEXT: buffer_store_dword v57, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0xec, v0 +; GCN-NEXT: buffer_store_dword v58, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0xf0, v0 +; GCN-NEXT: buffer_store_dword v59, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0xf4, v0 +; GCN-NEXT: buffer_store_dword v60, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0xf8, v0 +; GCN-NEXT: buffer_store_dword v61, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0xfc, v0 +; GCN-NEXT: buffer_store_dword v62, v1, s[0:3], 0 offen +; GCN-NEXT: v_and_b32_e32 v1, 63, v2 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v1 ; GCN-NEXT: v_add_u32_e32 v0, v0, v1 ; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen @@ -326,7 +323,6 @@ ; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-NEXT: v_add_co_u32_e32 v59, vcc, v15, v11 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill ; GCN-NEXT: global_load_dwordx4 v[3:6], v[15:16], off ; GCN-NEXT: global_load_dwordx4 v[7:10], v[15:16], off offset:16 ; GCN-NEXT: v_addc_co_u32_e32 v60, vcc, v16, v12, vcc @@ -342,217 +338,215 @@ ; GCN-NEXT: v_lshrrev_b32_e64 v0, 6, s33 ; GCN-NEXT: v_add_u32_e32 v0, 0x100, v0 ; GCN-NEXT: v_add_u32_e32 v1, 16, v0 -; GCN-NEXT: v_add_u32_e32 v2, 20, v0 ; GCN-NEXT: s_add_u32 s32, s32, 0x10000 ; GCN-NEXT: s_sub_u32 s32, s32, 0x10000 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v35, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:640 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v35, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill ; GCN-NEXT: global_load_dwordx4 v[47:50], v[48:49], off offset:48 ; GCN-NEXT: global_load_dwordx4 v[43:46], v[59:60], off ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill ; GCN-NEXT: global_load_dwordx4 v[51:54], v[59:60], off offset:16 ; GCN-NEXT: global_load_dwordx4 v[55:58], v[59:60], off offset:32 ; GCN-NEXT: global_load_dwordx4 v[59:62], v[59:60], off offset:48 ; GCN-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 20, v0 +; GCN-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 24, v0 -; GCN-NEXT: buffer_store_dword v8, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 28, v0 ; GCN-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v2, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 28, v0 +; GCN-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 32, v0 -; GCN-NEXT: v_add_u32_e32 v2, 36, v0 ; GCN-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 36, v0 +; GCN-NEXT: buffer_store_dword v12, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 40, v0 -; GCN-NEXT: buffer_store_dword v12, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 44, v0 ; GCN-NEXT: buffer_store_dword v13, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 44, v0 +; GCN-NEXT: buffer_store_dword v14, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 48, v0 -; GCN-NEXT: buffer_store_dword v14, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 52, v0 ; GCN-NEXT: buffer_store_dword v15, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 52, v0 +; GCN-NEXT: buffer_store_dword v16, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 56, v0 -; GCN-NEXT: buffer_store_dword v16, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 60, v0 ; GCN-NEXT: buffer_store_dword v17, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v2, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 60, v0 +; GCN-NEXT: buffer_store_dword v18, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 64, v0 -; GCN-NEXT: v_add_u32_e32 v2, 0x44, v0 ; GCN-NEXT: buffer_store_dword v19, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0x44, v0 +; GCN-NEXT: buffer_store_dword v20, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0x48, v0 -; GCN-NEXT: buffer_store_dword v20, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 0x4c, v0 ; GCN-NEXT: buffer_store_dword v21, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0x4c, v0 +; GCN-NEXT: buffer_store_dword v22, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0x50, v0 -; GCN-NEXT: buffer_store_dword v22, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 0x54, v0 ; GCN-NEXT: buffer_store_dword v23, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0x54, v0 +; GCN-NEXT: buffer_store_dword v24, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0x58, v0 -; GCN-NEXT: buffer_store_dword v24, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 0x5c, v0 ; GCN-NEXT: buffer_store_dword v25, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v2, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0x5c, v0 +; GCN-NEXT: buffer_store_dword v26, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0x60, v0 -; GCN-NEXT: v_add_u32_e32 v2, 0x64, v0 ; GCN-NEXT: buffer_store_dword v27, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0x64, v0 +; GCN-NEXT: buffer_store_dword v28, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0x68, v0 -; GCN-NEXT: buffer_store_dword v28, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 0x6c, v0 ; GCN-NEXT: buffer_store_dword v29, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0x6c, v0 +; GCN-NEXT: buffer_store_dword v30, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0x70, v0 -; GCN-NEXT: buffer_store_dword v30, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 0x74, v0 ; GCN-NEXT: buffer_store_dword v31, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0x74, v0 +; GCN-NEXT: buffer_store_dword v32, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0x78, v0 -; GCN-NEXT: buffer_store_dword v32, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 0x7c, v0 ; GCN-NEXT: buffer_store_dword v33, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v34, v2, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0x7c, v0 +; GCN-NEXT: buffer_store_dword v34, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0x80, v0 -; GCN-NEXT: v_add_u32_e32 v2, 0x84, v0 ; GCN-NEXT: buffer_store_dword v35, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0x84, v0 +; GCN-NEXT: buffer_store_dword v36, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0x88, v0 -; GCN-NEXT: buffer_store_dword v36, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 0x8c, v0 ; GCN-NEXT: buffer_store_dword v37, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0x8c, v0 +; GCN-NEXT: buffer_store_dword v38, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0x90, v0 -; GCN-NEXT: buffer_store_dword v38, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 0x94, v0 ; GCN-NEXT: buffer_store_dword v39, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0x94, v0 +; GCN-NEXT: buffer_store_dword v40, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0x98, v0 -; GCN-NEXT: buffer_store_dword v40, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 0x9c, v0 ; GCN-NEXT: buffer_store_dword v41, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v42, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:640 ; 4-byte Folded Reload +; GCN-NEXT: v_add_u32_e32 v1, 0x9c, v0 +; GCN-NEXT: buffer_store_dword v42, v1, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload ; GCN-NEXT: v_add_u32_e32 v1, 0xa0, v0 -; GCN-NEXT: v_add_u32_e32 v2, 0xa4, v0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v8, v15 -; GCN-NEXT: v_mov_b32_e32 v9, v16 ; GCN-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen +; GCN-NEXT: v_mov_b32_e32 v9, v16 +; GCN-NEXT: v_add_u32_e32 v1, 0xa4, v0 +; GCN-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen ; GCN-NEXT: v_mov_b32_e32 v10, v17 ; GCN-NEXT: v_add_u32_e32 v1, 0xa8, v0 -; GCN-NEXT: buffer_store_dword v9, v2, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0xb0, v0 ; GCN-NEXT: v_mov_b32_e32 v11, v18 -; GCN-NEXT: v_add_u32_e32 v2, 0xac, v0 -; GCN-NEXT: buffer_store_dword v11, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 0xb4, v0 +; GCN-NEXT: v_add_u32_e32 v1, 0xac, v0 +; GCN-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0xb0, v0 ; GCN-NEXT: buffer_store_dword v47, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0xb4, v0 +; GCN-NEXT: buffer_store_dword v48, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0xb8, v0 -; GCN-NEXT: buffer_store_dword v48, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 0xbc, v0 ; GCN-NEXT: buffer_store_dword v49, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v50, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:524 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:528 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:532 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:536 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:540 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:544 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:548 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:552 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:556 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:560 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:564 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload -; GCN-NEXT: v_add_u32_e32 v2, 0xc4, v0 +; GCN-NEXT: v_add_u32_e32 v1, 0xbc, v0 +; GCN-NEXT: buffer_store_dword v50, v1, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:524 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:528 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:532 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:536 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:540 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:544 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:548 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:552 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:556 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:560 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:564 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload ; GCN-NEXT: v_add_u32_e32 v1, 0xc0, v0 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v8, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 0xcc, v0 ; GCN-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0xc4, v0 +; GCN-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0xc8, v0 -; GCN-NEXT: buffer_store_dword v10, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 4, v0 -; GCN-NEXT: v_add_u32_e32 v7, 8, v0 ; GCN-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 12, v0 -; GCN-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v2, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0xcc, v0 +; GCN-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 4, v0 +; GCN-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 8, v0 +; GCN-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 12, v0 +; GCN-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:256 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload -; GCN-NEXT: v_add_u32_e32 v1, 0xd0, v0 +; GCN-NEXT: v_add_u32_e32 v3, 0xd0, v0 +; GCN-NEXT: buffer_store_dword v51, v3, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v3, 0xd4, v0 -; GCN-NEXT: v_add_u32_e32 v4, 0xd8, v0 -; GCN-NEXT: v_add_u32_e32 v5, 0xdc, v0 -; GCN-NEXT: buffer_store_dword v51, v1, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v52, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v53, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v5, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0xe0, v0 +; GCN-NEXT: v_add_u32_e32 v3, 0xd8, v0 +; GCN-NEXT: buffer_store_dword v53, v3, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v3, 0xdc, v0 +; GCN-NEXT: buffer_store_dword v54, v3, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v3, 0xe0, v0 +; GCN-NEXT: buffer_store_dword v55, v3, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v3, 0xe4, v0 -; GCN-NEXT: v_add_u32_e32 v4, 0xe8, v0 -; GCN-NEXT: v_add_u32_e32 v5, 0xec, v0 -; GCN-NEXT: v_add_u32_e32 v6, 0xf0, v0 -; GCN-NEXT: v_add_u32_e32 v7, 0xf4, v0 -; GCN-NEXT: v_add_u32_e32 v8, 0xf8, v0 -; GCN-NEXT: v_add_u32_e32 v9, 0xfc, v0 -; GCN-NEXT: buffer_store_dword v55, v1, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v56, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v57, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v58, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v59, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v60, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v61, v8, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v62, v9, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(12) -; GCN-NEXT: v_lshrrev_b32_e32 v2, 1, v10 -; GCN-NEXT: v_and_b32_e32 v1, 63, v2 +; GCN-NEXT: v_add_u32_e32 v3, 0xe8, v0 +; GCN-NEXT: buffer_store_dword v57, v3, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v3, 0xec, v0 +; GCN-NEXT: buffer_store_dword v58, v3, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v3, 0xf0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 1, v2 +; GCN-NEXT: buffer_store_dword v59, v3, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v3, 0xf4, v0 +; GCN-NEXT: v_and_b32_e32 v1, 63, v1 +; GCN-NEXT: buffer_store_dword v60, v3, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v3, 0xf8, v0 +; GCN-NEXT: buffer_store_dword v61, v3, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v3, 0xfc, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v1 ; GCN-NEXT: v_add_u32_e32 v0, v0, v1 +; GCN-NEXT: buffer_store_dword v62, v3, s[0:3], 0 offen ; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload @@ -569,7 +563,7 @@ ; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: v_and_b32_e32 v1, 1, v10 +; GCN-NEXT: v_and_b32_e32 v1, 1, v2 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GCN-NEXT: s_mov_b32 s33, s6 ; GCN-NEXT: s_waitcnt vmcnt(15) @@ -585,9 +579,22 @@ ; GCN-LABEL: v_extract_v32i64_varidx: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v15, v0 ; GCN-NEXT: s_add_u32 s4, s32, 0x3fc0 +; GCN-NEXT: s_mov_b32 s5, 0 ; GCN-NEXT: s_mov_b32 s6, s33 ; GCN-NEXT: s_and_b32 s33, s4, 0xffffc000 +; GCN-NEXT: s_movk_i32 s4, 0x80 +; GCN-NEXT: v_mov_b32_e32 v12, s5 +; GCN-NEXT: v_mov_b32_e32 v16, v1 +; GCN-NEXT: v_add_co_u32_e32 v31, vcc, 64, v15 +; GCN-NEXT: v_mov_b32_e32 v11, s4 +; GCN-NEXT: v_addc_co_u32_e32 v32, vcc, 0, v16, vcc +; GCN-NEXT: v_add_co_u32_e32 v48, vcc, v15, v11 +; GCN-NEXT: v_addc_co_u32_e32 v49, vcc, v16, v12, vcc +; GCN-NEXT: s_movk_i32 s4, 0xc0 +; GCN-NEXT: v_mov_b32_e32 v12, s5 +; GCN-NEXT: v_mov_b32_e32 v11, s4 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill @@ -603,41 +610,8 @@ ; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v15, v0 -; GCN-NEXT: v_mov_b32_e32 v16, v1 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill -; GCN-NEXT: global_load_dwordx4 v[0:3], v[15:16], off -; GCN-NEXT: s_mov_b32 s5, 0 -; GCN-NEXT: v_add_co_u32_e32 v31, vcc, 64, v15 -; GCN-NEXT: s_movk_i32 s4, 0x80 -; GCN-NEXT: v_addc_co_u32_e32 v32, vcc, 0, v16, vcc -; GCN-NEXT: s_add_u32 s32, s32, 0x10000 -; GCN-NEXT: s_sub_u32 s32, s32, 0x10000 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v12, s5 -; GCN-NEXT: v_mov_b32_e32 v11, s4 -; GCN-NEXT: v_add_co_u32_e32 v48, vcc, v15, v11 -; GCN-NEXT: v_addc_co_u32_e32 v49, vcc, v16, v12, vcc -; GCN-NEXT: s_movk_i32 s4, 0xc0 -; GCN-NEXT: v_mov_b32_e32 v12, s5 -; GCN-NEXT: v_mov_b32_e32 v11, s4 ; GCN-NEXT: v_add_co_u32_e32 v59, vcc, v15, v11 +; GCN-NEXT: global_load_dwordx4 v[3:6], v[15:16], off ; GCN-NEXT: global_load_dwordx4 v[7:10], v[15:16], off offset:16 ; GCN-NEXT: v_addc_co_u32_e32 v60, vcc, v16, v12, vcc ; GCN-NEXT: global_load_dwordx4 v[11:14], v[15:16], off offset:32 @@ -649,198 +623,215 @@ ; GCN-NEXT: global_load_dwordx4 v[35:38], v[48:49], off ; GCN-NEXT: global_load_dwordx4 v[39:42], v[48:49], off offset:16 ; GCN-NEXT: global_load_dwordx4 v[43:46], v[48:49], off offset:32 -; GCN-NEXT: global_load_dwordx4 v[47:50], v[48:49], off offset:48 -; GCN-NEXT: global_load_dwordx4 v[3:6], v[59:60], off ; GCN-NEXT: v_lshrrev_b32_e64 v0, 6, s33 ; GCN-NEXT: v_add_u32_e32 v0, 0x100, v0 ; GCN-NEXT: v_add_u32_e32 v1, 16, v0 -; GCN-NEXT: v_add_u32_e32 v2, 24, v0 +; GCN-NEXT: s_add_u32 s32, s32, 0x10000 +; GCN-NEXT: s_sub_u32 s32, s32, 0x10000 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:640 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v35, off, s[0:3], s33 offset:576 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:580 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:584 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:588 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:592 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:596 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:600 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:604 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:608 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:612 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:616 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:620 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:624 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:628 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:632 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:636 ; 4-byte Folded Spill +; GCN-NEXT: global_load_dwordx4 v[47:50], v[48:49], off offset:48 +; GCN-NEXT: global_load_dwordx4 v[43:46], v[59:60], off +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill ; GCN-NEXT: global_load_dwordx4 v[51:54], v[59:60], off offset:16 ; GCN-NEXT: global_load_dwordx4 v[55:58], v[59:60], off offset:32 ; GCN-NEXT: global_load_dwordx4 v[59:62], v[59:60], off offset:48 ; GCN-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 20, v0 -; GCN-NEXT: v_add_u32_e32 v1, 44, v0 -; GCN-NEXT: v_add_u32_e32 v7, 28, v0 -; GCN-NEXT: v_add_u32_e32 v9, 36, v0 -; GCN-NEXT: buffer_store_dword v8, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 20, v0 +; GCN-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 24, v0 +; GCN-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 28, v0 +; GCN-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 32, v0 +; GCN-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 36, v0 +; GCN-NEXT: buffer_store_dword v12, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 40, v0 -; GCN-NEXT: v_add_u32_e32 v3, 32, v0 ; GCN-NEXT: buffer_store_dword v13, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 48, v0 +; GCN-NEXT: v_add_u32_e32 v1, 44, v0 +; GCN-NEXT: buffer_store_dword v14, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 48, v0 +; GCN-NEXT: buffer_store_dword v15, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 52, v0 +; GCN-NEXT: buffer_store_dword v16, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 56, v0 -; GCN-NEXT: buffer_store_dword v11, v3, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v4, 52, v0 -; GCN-NEXT: v_add_u32_e32 v5, 60, v0 -; GCN-NEXT: buffer_store_dword v15, v2, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v17, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v5, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x44, v0 -; GCN-NEXT: v_add_u32_e32 v2, 0x4c, v0 -; GCN-NEXT: buffer_store_dword v20, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v2, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 60, v0 +; GCN-NEXT: buffer_store_dword v18, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 64, v0 -; GCN-NEXT: v_add_u32_e32 v2, 0x48, v0 ; GCN-NEXT: buffer_store_dword v19, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0x44, v0 +; GCN-NEXT: buffer_store_dword v20, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0x48, v0 +; GCN-NEXT: buffer_store_dword v21, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0x4c, v0 +; GCN-NEXT: buffer_store_dword v22, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0x50, v0 -; GCN-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v23, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 0x58, v0 +; GCN-NEXT: v_add_u32_e32 v1, 0x54, v0 +; GCN-NEXT: buffer_store_dword v24, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0x58, v0 +; GCN-NEXT: buffer_store_dword v25, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0x5c, v0 +; GCN-NEXT: buffer_store_dword v26, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0x60, v0 -; GCN-NEXT: v_add_u32_e32 v3, 0x54, v0 -; GCN-NEXT: v_add_u32_e32 v4, 0x5c, v0 -; GCN-NEXT: v_add_u32_e32 v5, 0x64, v0 -; GCN-NEXT: v_add_u32_e32 v6, 0x6c, v0 -; GCN-NEXT: buffer_store_dword v25, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v6, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 0x68, v0 ; GCN-NEXT: buffer_store_dword v27, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0x64, v0 +; GCN-NEXT: buffer_store_dword v28, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0x68, v0 +; GCN-NEXT: buffer_store_dword v29, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0x6c, v0 +; GCN-NEXT: buffer_store_dword v30, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0x70, v0 -; GCN-NEXT: buffer_store_dword v29, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v7, 0x74, v0 -; GCN-NEXT: v_add_u32_e32 v8, 0x7c, v0 -; GCN-NEXT: v_add_u32_e32 v2, 0x78, v0 ; GCN-NEXT: buffer_store_dword v31, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v33, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v32, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v34, v8, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v1, 0x84, v0 -; GCN-NEXT: v_add_u32_e32 v2, 0x8c, v0 -; GCN-NEXT: buffer_store_dword v36, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v38, v2, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0x74, v0 +; GCN-NEXT: buffer_store_dword v32, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0x78, v0 +; GCN-NEXT: buffer_store_dword v33, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0x7c, v0 +; GCN-NEXT: buffer_store_dword v34, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0x80, v0 -; GCN-NEXT: v_add_u32_e32 v2, 0x88, v0 ; GCN-NEXT: buffer_store_dword v35, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0x84, v0 +; GCN-NEXT: buffer_store_dword v36, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0x88, v0 +; GCN-NEXT: buffer_store_dword v37, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0x8c, v0 +; GCN-NEXT: buffer_store_dword v38, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0x90, v0 -; GCN-NEXT: buffer_store_dword v37, v2, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v39, v1, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 0x98, v0 +; GCN-NEXT: v_add_u32_e32 v1, 0x94, v0 +; GCN-NEXT: buffer_store_dword v40, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0x98, v0 +; GCN-NEXT: buffer_store_dword v41, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0x9c, v0 +; GCN-NEXT: buffer_store_dword v42, v1, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload ; GCN-NEXT: v_add_u32_e32 v1, 0xa0, v0 -; GCN-NEXT: v_add_u32_e32 v3, 0x94, v0 -; GCN-NEXT: v_add_u32_e32 v4, 0x9c, v0 -; GCN-NEXT: v_add_u32_e32 v5, 0xa4, v0 -; GCN-NEXT: v_add_u32_e32 v6, 0xac, v0 -; GCN-NEXT: buffer_store_dword v41, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v40, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v42, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v44, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v46, v6, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 0xa8, v0 -; GCN-NEXT: buffer_store_dword v43, v1, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v8, v15 +; GCN-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen +; GCN-NEXT: v_mov_b32_e32 v9, v16 +; GCN-NEXT: v_add_u32_e32 v1, 0xa4, v0 +; GCN-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen +; GCN-NEXT: v_mov_b32_e32 v10, v17 +; GCN-NEXT: v_add_u32_e32 v1, 0xa8, v0 +; GCN-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen +; GCN-NEXT: v_mov_b32_e32 v11, v18 +; GCN-NEXT: v_add_u32_e32 v1, 0xac, v0 +; GCN-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0xb0, v0 -; GCN-NEXT: buffer_store_dword v45, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v7, 0xb4, v0 -; GCN-NEXT: v_add_u32_e32 v8, 0xbc, v0 -; GCN-NEXT: v_add_u32_e32 v2, 0xb8, v0 ; GCN-NEXT: buffer_store_dword v47, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v49, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v48, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v50, v8, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:580 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:584 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:588 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:592 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:596 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:600 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:604 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:608 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:612 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:616 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:620 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:624 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:628 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:632 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:636 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:640 ; 4-byte Folded Reload -; GCN-NEXT: v_add_u32_e32 v2, 0xc8, v0 +; GCN-NEXT: v_add_u32_e32 v1, 0xb4, v0 +; GCN-NEXT: buffer_store_dword v48, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0xb8, v0 +; GCN-NEXT: buffer_store_dword v49, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0xbc, v0 +; GCN-NEXT: buffer_store_dword v50, v1, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:524 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:528 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:532 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:536 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:540 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:544 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:548 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:552 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:556 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:560 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:564 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload ; GCN-NEXT: v_add_u32_e32 v1, 0xc0, v0 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_u32_e32 v7, 0xec, v0 -; GCN-NEXT: v_add_u32_e32 v8, 0xf4, v0 -; GCN-NEXT: v_mov_b32_e32 v12, v6 -; GCN-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen -; GCN-NEXT: v_mov_b32_e32 v10, v4 -; GCN-NEXT: v_add_u32_e32 v2, 0xc4, v0 -; GCN-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen -; GCN-NEXT: v_mov_b32_e32 v9, v3 -; GCN-NEXT: v_mov_b32_e32 v11, v5 -; GCN-NEXT: v_add_u32_e32 v3, 0xcc, v0 -; GCN-NEXT: buffer_store_dword v10, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:524 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:528 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:532 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:536 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:540 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:544 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:548 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:552 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:556 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:560 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:564 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s33 offset:576 ; 4-byte Folded Reload -; GCN-NEXT: v_add_u32_e32 v2, 8, v0 +; GCN-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0xc4, v0 +; GCN-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0xc8, v0 +; GCN-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0xcc, v0 +; GCN-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 4, v0 +; GCN-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 8, v0 +; GCN-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 12, v0 +; GCN-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:256 ; GCN-NEXT: v_add_u32_e32 v1, 0xd0, v0 -; GCN-NEXT: v_add_u32_e32 v3, 12, v0 -; GCN-NEXT: v_add_u32_e32 v4, 0xd4, v0 -; GCN-NEXT: v_add_u32_e32 v5, 0xdc, v0 -; GCN-NEXT: v_add_u32_e32 v6, 0xe4, v0 -; GCN-NEXT: v_add_u32_e32 v9, 0xfc, v0 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v12, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 4, v0 -; GCN-NEXT: buffer_store_dword v11, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:256 -; GCN-NEXT: v_add_u32_e32 v2, 0xd8, v0 -; GCN-NEXT: v_add_u32_e32 v3, 0xe0, v0 ; GCN-NEXT: buffer_store_dword v51, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0xd4, v0 +; GCN-NEXT: buffer_store_dword v52, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0xd8, v0 +; GCN-NEXT: buffer_store_dword v53, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0xdc, v0 +; GCN-NEXT: buffer_store_dword v54, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0xe0, v0 +; GCN-NEXT: buffer_store_dword v55, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0xe4, v0 +; GCN-NEXT: buffer_store_dword v56, v1, s[0:3], 0 offen ; GCN-NEXT: v_add_u32_e32 v1, 0xe8, v0 -; GCN-NEXT: buffer_store_dword v53, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v55, v3, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v2, 0xf0, v0 -; GCN-NEXT: v_add_u32_e32 v3, 0xf8, v0 ; GCN-NEXT: buffer_store_dword v57, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v59, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v61, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v52, v4, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v54, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v56, v6, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v58, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v60, v8, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v62, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 31, v1 +; GCN-NEXT: v_add_u32_e32 v1, 0xec, v0 +; GCN-NEXT: buffer_store_dword v58, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0xf0, v0 +; GCN-NEXT: buffer_store_dword v59, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0xf4, v0 +; GCN-NEXT: buffer_store_dword v60, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0xf8, v0 +; GCN-NEXT: buffer_store_dword v61, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v1, 0xfc, v0 +; GCN-NEXT: buffer_store_dword v62, v1, s[0:3], 0 offen +; GCN-NEXT: v_and_b32_e32 v1, 31, v2 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GCN-NEXT: v_add_u32_e32 v0, v0, v1 ; GCN-NEXT: v_add_u32_e32 v1, 4, v0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll @@ -10,362 +10,364 @@ ; GCN-NEXT: s_add_u32 s0, s0, s7 ; GCN-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 ; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 -; GCN-NEXT: v_mov_b32_e32 v0, 0x100 +; GCN-NEXT: v_mov_b32_e32 v16, 0x100 ; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: v_add_u32_e32 v1, 4, v0 +; GCN-NEXT: v_add_u32_e32 v31, 64, v16 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_load_dwordx16 s[12:27], s[10:11], 0x0 -; GCN-NEXT: s_load_dwordx16 s[68:83], s[10:11], 0x40 -; GCN-NEXT: s_load_dwordx16 s[52:67], s[10:11], 0x80 -; GCN-NEXT: s_load_dwordx16 s[36:51], s[10:11], 0xc0 -; GCN-NEXT: s_movk_i32 s4, 0x50 +; GCN-NEXT: s_load_dwordx16 s[52:67], s[10:11], 0x40 +; GCN-NEXT: s_load_dwordx16 s[36:51], s[10:11], 0x80 +; GCN-NEXT: v_add_u32_e32 v32, 0x44, v16 +; GCN-NEXT: v_add_u32_e32 v33, 0x48, v16 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v4, s13 -; GCN-NEXT: v_mov_b32_e32 v5, s14 -; GCN-NEXT: v_mov_b32_e32 v6, s15 -; GCN-NEXT: v_mov_b32_e32 v8, s16 -; GCN-NEXT: v_mov_b32_e32 v10, s17 -; GCN-NEXT: v_mov_b32_e32 v12, s18 -; GCN-NEXT: v_mov_b32_e32 v14, s19 +; GCN-NEXT: v_mov_b32_e32 v0, s12 +; GCN-NEXT: v_mov_b32_e32 v1, s13 +; GCN-NEXT: v_mov_b32_e32 v2, s14 +; GCN-NEXT: v_mov_b32_e32 v3, s15 +; GCN-NEXT: v_mov_b32_e32 v4, s16 +; GCN-NEXT: v_mov_b32_e32 v5, s17 +; GCN-NEXT: v_mov_b32_e32 v6, s18 +; GCN-NEXT: v_mov_b32_e32 v7, s19 +; GCN-NEXT: v_mov_b32_e32 v8, s20 +; GCN-NEXT: v_mov_b32_e32 v9, s21 +; GCN-NEXT: v_mov_b32_e32 v10, s22 +; GCN-NEXT: v_mov_b32_e32 v11, s23 +; GCN-NEXT: v_mov_b32_e32 v12, s24 +; GCN-NEXT: v_mov_b32_e32 v13, s25 +; GCN-NEXT: v_mov_b32_e32 v14, s26 +; GCN-NEXT: v_mov_b32_e32 v15, s27 +; GCN-NEXT: s_load_dwordx16 s[12:27], s[10:11], 0xc0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:256 +; GCN-NEXT: v_add_u32_e32 v0, 4, v16 +; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GCN-NEXT: v_mov_b32_e32 v1, s52 +; GCN-NEXT: buffer_store_dword v1, v31, s[0:3], 0 offen +; GCN-NEXT: v_mov_b32_e32 v1, s53 +; GCN-NEXT: buffer_store_dword v1, v32, s[0:3], 0 offen +; GCN-NEXT: v_mov_b32_e32 v1, s54 +; GCN-NEXT: buffer_store_dword v1, v33, s[0:3], 0 offen +; GCN-NEXT: s_movk_i32 s4, 0x50 +; GCN-NEXT: v_add_u32_e32 v34, 0x4c, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s55 +; GCN-NEXT: buffer_store_dword v1, v34, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v35, s4, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s56 +; GCN-NEXT: buffer_store_dword v1, v35, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v36, 0x54, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s57 +; GCN-NEXT: buffer_store_dword v1, v36, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v37, 0x58, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s58 +; GCN-NEXT: buffer_store_dword v1, v37, s[0:3], 0 offen ; GCN-NEXT: s_movk_i32 s5, 0x60 -; GCN-NEXT: v_add_u32_e32 v2, 8, v0 -; GCN-NEXT: v_add_u32_e32 v3, 12, v0 -; GCN-NEXT: v_add_u32_e32 v7, 16, v0 -; GCN-NEXT: v_add_u32_e32 v9, 20, v0 -; GCN-NEXT: v_add_u32_e32 v11, 24, v0 -; GCN-NEXT: v_add_u32_e32 v13, 28, v0 -; GCN-NEXT: v_add_u32_e32 v15, 32, v0 -; GCN-NEXT: v_mov_b32_e32 v16, s20 -; GCN-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v11, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v13, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v15, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v17, 36, v0 -; GCN-NEXT: v_mov_b32_e32 v18, s21 -; GCN-NEXT: v_mov_b32_e32 v26, s25 -; GCN-NEXT: v_add_u32_e32 v33, 0x44, v0 -; GCN-NEXT: v_mov_b32_e32 v34, s69 -; GCN-NEXT: v_mov_b32_e32 v4, s71 -; GCN-NEXT: v_add_u32_e32 v19, 40, v0 -; GCN-NEXT: v_mov_b32_e32 v20, s22 -; GCN-NEXT: v_add_u32_e32 v21, 44, v0 -; GCN-NEXT: v_mov_b32_e32 v22, s23 -; GCN-NEXT: v_add_u32_e32 v23, 48, v0 -; GCN-NEXT: v_mov_b32_e32 v24, s24 -; GCN-NEXT: v_add_u32_e32 v25, 52, v0 -; GCN-NEXT: v_add_u32_e32 v27, 56, v0 -; GCN-NEXT: v_mov_b32_e32 v28, s26 -; GCN-NEXT: v_add_u32_e32 v29, 60, v0 -; GCN-NEXT: v_mov_b32_e32 v30, s27 -; GCN-NEXT: v_add_u32_e32 v31, 64, v0 -; GCN-NEXT: v_mov_b32_e32 v32, s68 -; GCN-NEXT: buffer_store_dword v18, v17, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v21, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v24, v23, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v25, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v30, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v32, v31, s[0:3], 0 offen -; GCN-NEXT: s_movk_i32 s13, 0x70 -; GCN-NEXT: v_add_u32_e32 v35, 0x48, v0 -; GCN-NEXT: v_mov_b32_e32 v36, s70 -; GCN-NEXT: v_add_u32_e32 v37, 0x4c, v0 -; GCN-NEXT: v_add_u32_e32 v38, s4, v0 -; GCN-NEXT: v_mov_b32_e32 v5, s72 -; GCN-NEXT: v_add_u32_e32 v39, 0x54, v0 -; GCN-NEXT: v_mov_b32_e32 v6, s73 -; GCN-NEXT: v_add_u32_e32 v40, 0x58, v0 -; GCN-NEXT: v_mov_b32_e32 v8, s74 -; GCN-NEXT: v_add_u32_e32 v41, 0x5c, v0 -; GCN-NEXT: v_mov_b32_e32 v10, s75 -; GCN-NEXT: v_add_u32_e32 v42, s5, v0 -; GCN-NEXT: v_mov_b32_e32 v12, s76 -; GCN-NEXT: buffer_store_dword v34, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v36, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v38, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v40, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v42, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v26, 0x64, v0 -; GCN-NEXT: v_mov_b32_e32 v14, s77 -; GCN-NEXT: v_mov_b32_e32 v4, s81 -; GCN-NEXT: s_movk_i32 s14, 0x90 -; GCN-NEXT: s_movk_i32 s15, 0xa0 -; GCN-NEXT: v_add_u32_e32 v28, 0x68, v0 -; GCN-NEXT: v_mov_b32_e32 v16, s78 -; GCN-NEXT: v_add_u32_e32 v30, 0x6c, v0 -; GCN-NEXT: v_mov_b32_e32 v18, s79 -; GCN-NEXT: v_add_u32_e32 v32, s13, v0 -; GCN-NEXT: v_mov_b32_e32 v20, s80 -; GCN-NEXT: v_add_u32_e32 v34, 0x74, v0 -; GCN-NEXT: v_add_u32_e32 v36, 0x78, v0 -; GCN-NEXT: v_mov_b32_e32 v5, s82 -; GCN-NEXT: v_add_u32_e32 v43, 0x7c, v0 -; GCN-NEXT: v_mov_b32_e32 v6, s83 -; GCN-NEXT: v_add_u32_e32 v44, 0x80, v0 -; GCN-NEXT: v_mov_b32_e32 v8, s52 -; GCN-NEXT: buffer_store_dword v14, v26, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v28, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v30, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v32, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v34, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v36, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v44, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v45, 0x84, v0 -; GCN-NEXT: v_mov_b32_e32 v4, s53 -; GCN-NEXT: s_movk_i32 s16, 0xb0 -; GCN-NEXT: v_add_u32_e32 v46, 0x88, v0 -; GCN-NEXT: v_mov_b32_e32 v5, s54 -; GCN-NEXT: v_add_u32_e32 v47, 0x8c, v0 -; GCN-NEXT: v_mov_b32_e32 v6, s55 -; GCN-NEXT: v_add_u32_e32 v48, s14, v0 -; GCN-NEXT: v_mov_b32_e32 v8, s56 -; GCN-NEXT: v_add_u32_e32 v49, 0x94, v0 -; GCN-NEXT: v_mov_b32_e32 v10, s57 -; GCN-NEXT: v_add_u32_e32 v50, 0x98, v0 -; GCN-NEXT: v_mov_b32_e32 v12, s58 -; GCN-NEXT: v_add_u32_e32 v51, 0x9c, v0 -; GCN-NEXT: v_mov_b32_e32 v14, s59 -; GCN-NEXT: v_add_u32_e32 v52, s15, v0 -; GCN-NEXT: v_mov_b32_e32 v16, s60 -; GCN-NEXT: buffer_store_dword v4, v45, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v46, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v47, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v52, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v53, 0xa4, v0 -; GCN-NEXT: v_mov_b32_e32 v4, s61 -; GCN-NEXT: s_movk_i32 s17, 0xd0 -; GCN-NEXT: s_movk_i32 s18, 0xe0 -; GCN-NEXT: v_add_u32_e32 v54, 0xa8, v0 -; GCN-NEXT: v_mov_b32_e32 v5, s62 -; GCN-NEXT: v_add_u32_e32 v55, 0xac, v0 -; GCN-NEXT: v_mov_b32_e32 v6, s63 -; GCN-NEXT: v_add_u32_e32 v56, s16, v0 -; GCN-NEXT: v_mov_b32_e32 v8, s64 -; GCN-NEXT: v_add_u32_e32 v57, 0xb4, v0 -; GCN-NEXT: v_mov_b32_e32 v10, s65 -; GCN-NEXT: v_add_u32_e32 v58, 0xb8, v0 -; GCN-NEXT: v_mov_b32_e32 v12, s66 -; GCN-NEXT: v_add_u32_e32 v59, 0xbc, v0 -; GCN-NEXT: v_mov_b32_e32 v14, s67 -; GCN-NEXT: v_add_u32_e32 v60, 0xc0, v0 -; GCN-NEXT: v_mov_b32_e32 v16, s36 -; GCN-NEXT: buffer_store_dword v4, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v54, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v56, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v57, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v58, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v59, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v60, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v61, 0xc4, v0 -; GCN-NEXT: v_mov_b32_e32 v4, s37 +; GCN-NEXT: v_add_u32_e32 v38, 0x5c, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s59 +; GCN-NEXT: buffer_store_dword v1, v38, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v39, s5, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s60 +; GCN-NEXT: buffer_store_dword v1, v39, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v40, 0x64, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s61 +; GCN-NEXT: buffer_store_dword v1, v40, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v41, 0x68, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s62 +; GCN-NEXT: buffer_store_dword v1, v41, s[0:3], 0 offen +; GCN-NEXT: s_movk_i32 s10, 0x70 +; GCN-NEXT: v_add_u32_e32 v42, 0x6c, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s63 +; GCN-NEXT: buffer_store_dword v1, v42, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v43, s10, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s64 +; GCN-NEXT: buffer_store_dword v1, v43, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v44, 0x74, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s65 +; GCN-NEXT: buffer_store_dword v1, v44, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v45, 0x78, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s66 +; GCN-NEXT: buffer_store_dword v1, v45, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v46, 0x7c, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s67 +; GCN-NEXT: buffer_store_dword v1, v46, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v47, 0x80, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s36 +; GCN-NEXT: buffer_store_dword v1, v47, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v48, 0x84, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s37 +; GCN-NEXT: buffer_store_dword v1, v48, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v49, 0x88, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s38 +; GCN-NEXT: buffer_store_dword v1, v49, s[0:3], 0 offen +; GCN-NEXT: s_movk_i32 s11, 0x90 +; GCN-NEXT: v_add_u32_e32 v50, 0x8c, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s39 +; GCN-NEXT: buffer_store_dword v1, v50, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v51, s11, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s40 +; GCN-NEXT: buffer_store_dword v1, v51, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v52, 0x94, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s41 +; GCN-NEXT: buffer_store_dword v1, v52, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v53, 0x98, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s42 +; GCN-NEXT: buffer_store_dword v1, v53, s[0:3], 0 offen +; GCN-NEXT: s_movk_i32 s28, 0xa0 +; GCN-NEXT: v_add_u32_e32 v54, 0x9c, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s43 +; GCN-NEXT: buffer_store_dword v1, v54, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v55, s28, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s44 +; GCN-NEXT: buffer_store_dword v1, v55, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v56, 0xa4, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s45 +; GCN-NEXT: buffer_store_dword v1, v56, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v57, 0xa8, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s46 +; GCN-NEXT: buffer_store_dword v1, v57, s[0:3], 0 offen +; GCN-NEXT: s_movk_i32 s29, 0xb0 +; GCN-NEXT: v_add_u32_e32 v58, 0xac, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s47 +; GCN-NEXT: buffer_store_dword v1, v58, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v59, s29, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s48 +; GCN-NEXT: buffer_store_dword v1, v59, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v60, 0xb4, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s49 +; GCN-NEXT: buffer_store_dword v1, v60, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v61, 0xb8, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s50 +; GCN-NEXT: buffer_store_dword v1, v61, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v62, 0xbc, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s51 +; GCN-NEXT: buffer_store_dword v1, v62, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v1, s12 +; GCN-NEXT: v_add_u32_e32 v63, 0xc0, v16 +; GCN-NEXT: buffer_store_dword v1, v63, s[0:3], 0 offen +; GCN-NEXT: v_mov_b32_e32 v1, s13 +; GCN-NEXT: v_add_u32_e32 v64, 0xc4, v16 +; GCN-NEXT: buffer_store_dword v1, v64, s[0:3], 0 offen +; GCN-NEXT: v_mov_b32_e32 v1, s14 +; GCN-NEXT: v_add_u32_e32 v65, 0xc8, v16 +; GCN-NEXT: buffer_store_dword v1, v65, s[0:3], 0 offen +; GCN-NEXT: s_movk_i32 s12, 0xd0 +; GCN-NEXT: v_add_u32_e32 v66, 0xcc, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s15 +; GCN-NEXT: buffer_store_dword v1, v66, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v67, s12, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s16 +; GCN-NEXT: buffer_store_dword v1, v67, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v68, 0xd4, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s17 +; GCN-NEXT: buffer_store_dword v1, v68, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v69, 0xd8, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s18 +; GCN-NEXT: buffer_store_dword v1, v69, s[0:3], 0 offen +; GCN-NEXT: s_movk_i32 s13, 0xe0 +; GCN-NEXT: v_add_u32_e32 v70, 0xdc, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s19 +; GCN-NEXT: buffer_store_dword v1, v70, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v71, s13, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s20 +; GCN-NEXT: buffer_store_dword v1, v71, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v72, 0xe4, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s21 +; GCN-NEXT: buffer_store_dword v1, v72, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v73, 0xe8, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s22 +; GCN-NEXT: buffer_store_dword v1, v73, s[0:3], 0 offen +; GCN-NEXT: s_movk_i32 s14, 0xf0 +; GCN-NEXT: v_add_u32_e32 v74, 0xec, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s23 +; GCN-NEXT: buffer_store_dword v1, v74, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v75, s14, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s24 +; GCN-NEXT: buffer_store_dword v1, v75, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v76, 0xf4, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s25 ; GCN-NEXT: s_and_b32 s7, s7, 63 -; GCN-NEXT: s_movk_i32 s19, 0xf0 -; GCN-NEXT: v_add_u32_e32 v62, 0xc8, v0 -; GCN-NEXT: v_mov_b32_e32 v5, s38 -; GCN-NEXT: v_add_u32_e32 v63, 0xcc, v0 -; GCN-NEXT: v_mov_b32_e32 v6, s39 -; GCN-NEXT: v_add_u32_e32 v64, s17, v0 -; GCN-NEXT: v_mov_b32_e32 v8, s40 -; GCN-NEXT: v_add_u32_e32 v65, 0xd4, v0 -; GCN-NEXT: v_mov_b32_e32 v10, s41 -; GCN-NEXT: v_add_u32_e32 v66, 0xd8, v0 -; GCN-NEXT: v_mov_b32_e32 v12, s42 -; GCN-NEXT: v_add_u32_e32 v67, 0xdc, v0 -; GCN-NEXT: v_mov_b32_e32 v14, s43 -; GCN-NEXT: v_add_u32_e32 v68, s18, v0 -; GCN-NEXT: v_mov_b32_e32 v16, s44 -; GCN-NEXT: buffer_store_dword v4, v61, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v62, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v63, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v64, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v65, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v66, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v67, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v68, s[0:3], 0 offen -; GCN-NEXT: v_add_u32_e32 v69, 0xe4, v0 -; GCN-NEXT: v_mov_b32_e32 v4, s45 -; GCN-NEXT: v_add_u32_e32 v70, 0xe8, v0 -; GCN-NEXT: v_mov_b32_e32 v5, s46 -; GCN-NEXT: v_add_u32_e32 v71, 0xec, v0 -; GCN-NEXT: v_mov_b32_e32 v6, s47 -; GCN-NEXT: v_add_u32_e32 v72, s19, v0 -; GCN-NEXT: v_mov_b32_e32 v8, s48 -; GCN-NEXT: v_add_u32_e32 v73, 0xf4, v0 -; GCN-NEXT: v_mov_b32_e32 v10, s49 -; GCN-NEXT: v_add_u32_e32 v74, 0xf8, v0 -; GCN-NEXT: v_mov_b32_e32 v12, s50 -; GCN-NEXT: buffer_store_dword v4, v69, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v70, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v71, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v72, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v73, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v74, s[0:3], 0 offen -; GCN-NEXT: v_mov_b32_e32 v4, s12 +; GCN-NEXT: buffer_store_dword v1, v76, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v77, 0xf8, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s26 +; GCN-NEXT: v_add_u32_e32 v17, 8, v16 +; GCN-NEXT: buffer_store_dword v1, v77, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v78, 0xfc, v16 +; GCN-NEXT: v_mov_b32_e32 v1, s27 ; GCN-NEXT: s_lshl_b32 s7, s7, 2 -; GCN-NEXT: v_add_u32_e32 v75, 0xfc, v0 -; GCN-NEXT: v_mov_b32_e32 v14, s51 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:256 -; GCN-NEXT: buffer_store_dword v14, v75, s[0:3], 0 offen -; GCN-NEXT: v_mov_b32_e32 v4, s6 -; GCN-NEXT: v_add_u32_e32 v0, s7, v0 -; GCN-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v3, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v4, v7, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v5, v9, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v6, v11, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v7, v13, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v8, v15, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v9, v17, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v10, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v11, v21, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v12, v23, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v13, v25, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v14, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v15, v29, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen +; GCN-NEXT: v_add_u32_e32 v18, 12, v16 +; GCN-NEXT: v_add_u32_e32 v19, 16, v16 +; GCN-NEXT: v_add_u32_e32 v20, 20, v16 +; GCN-NEXT: v_add_u32_e32 v21, 24, v16 +; GCN-NEXT: v_add_u32_e32 v22, 28, v16 +; GCN-NEXT: v_add_u32_e32 v23, 32, v16 +; GCN-NEXT: v_add_u32_e32 v24, 36, v16 +; GCN-NEXT: v_add_u32_e32 v25, 40, v16 +; GCN-NEXT: v_add_u32_e32 v26, 44, v16 +; GCN-NEXT: v_add_u32_e32 v27, 48, v16 +; GCN-NEXT: v_add_u32_e32 v28, 52, v16 +; GCN-NEXT: v_add_u32_e32 v29, 56, v16 +; GCN-NEXT: v_add_u32_e32 v30, 60, v16 +; GCN-NEXT: buffer_store_dword v1, v78, s[0:3], 0 offen +; GCN-NEXT: v_mov_b32_e32 v2, s6 +; GCN-NEXT: v_add_u32_e32 v1, s7, v16 +; GCN-NEXT: buffer_store_dword v3, v18, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v4, v19, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v5, v20, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v6, v21, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v7, v22, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v8, v23, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v9, v24, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v10, v25, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v11, v26, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v12, v27, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v13, v28, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v14, v29, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v15, v30, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v2, v17, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v3, v18, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v4, v19, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v5, v20, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v6, v21, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v7, v22, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v8, v23, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v9, v24, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v10, v25, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v11, v26, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v12, v27, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v13, v28, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v14, v29, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v15, v30, s[0:3], 0 offen ; GCN-NEXT: buffer_load_dword v16, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v17, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v18, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v19, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v20, v38, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v21, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v22, v40, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v23, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v24, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v25, v26, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v26, v28, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v27, v30, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v28, v32, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v29, v34, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v30, v36, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v31, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v32, v44, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v33, v45, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v34, v46, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v35, v47, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v36, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v37, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v38, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v39, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v40, v52, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v41, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v42, v54, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v43, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v44, v56, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v45, v57, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v46, v58, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v47, v59, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v48, v60, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v49, v61, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v50, v62, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v51, v63, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v52, v64, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v53, v65, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v54, v66, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v55, v67, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v56, v68, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v57, v69, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v58, v70, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v59, v71, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v60, v72, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v61, v73, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v62, v74, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, v75, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v17, v32, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v18, v33, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v19, v34, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v20, v35, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v21, v36, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v22, v37, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v23, v38, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v24, v39, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v25, v40, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v26, v41, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v27, v42, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v28, v43, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v29, v44, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v30, v45, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v31, v46, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v32, v47, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v33, v48, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v34, v49, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v35, v50, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v36, v51, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v37, v52, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v38, v53, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v39, v54, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v40, v55, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v41, v56, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v42, v57, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v43, v58, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v44, v59, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v45, v60, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v46, v61, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v47, v62, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v48, v63, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v49, v64, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v50, v65, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v51, v66, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v52, v67, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v53, v68, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v54, v69, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v55, v70, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v56, v71, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v57, v72, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v58, v73, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v59, v74, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v60, v75, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v61, v76, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v62, v77, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v63, v78, s[0:3], 0 offen ; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:256 -; GCN-NEXT: s_add_u32 s6, s8, 16 -; GCN-NEXT: s_addc_u32 s7, s9, 0 -; GCN-NEXT: v_mov_b32_e32 v67, s7 -; GCN-NEXT: v_mov_b32_e32 v66, s6 -; GCN-NEXT: s_add_u32 s6, s8, 32 -; GCN-NEXT: s_addc_u32 s7, s9, 0 ; GCN-NEXT: v_mov_b32_e32 v65, s9 -; GCN-NEXT: s_add_u32 s10, s8, 48 +; GCN-NEXT: s_add_u32 s6, s8, 16 ; GCN-NEXT: v_mov_b32_e32 v64, s8 -; GCN-NEXT: s_addc_u32 s11, s9, 0 +; GCN-NEXT: s_addc_u32 s7, s9, 0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: global_store_dwordx4 v[64:65], v[0:3], off -; GCN-NEXT: global_store_dwordx4 v[66:67], v[4:7], off +; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_mov_b32_e32 v0, s6 ; GCN-NEXT: v_mov_b32_e32 v1, s7 -; GCN-NEXT: s_add_u32 s6, s8, 64 -; GCN-NEXT: v_mov_b32_e32 v2, s10 +; GCN-NEXT: s_add_u32 s6, s8, 32 +; GCN-NEXT: global_store_dwordx4 v[0:1], v[4:7], off ; GCN-NEXT: s_addc_u32 s7, s9, 0 -; GCN-NEXT: v_mov_b32_e32 v3, s11 -; GCN-NEXT: s_add_u32 s10, s8, s4 -; GCN-NEXT: s_addc_u32 s11, s9, 0 -; GCN-NEXT: s_add_u32 s4, s8, s5 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: v_mov_b32_e32 v1, s7 +; GCN-NEXT: s_add_u32 s6, s8, 48 ; GCN-NEXT: global_store_dwordx4 v[0:1], v[8:11], off -; GCN-NEXT: global_store_dwordx4 v[2:3], v[12:15], off +; GCN-NEXT: s_addc_u32 s7, s9, 0 ; GCN-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NEXT: s_addc_u32 s5, s9, 0 ; GCN-NEXT: v_mov_b32_e32 v1, s7 -; GCN-NEXT: s_add_u32 s6, s8, s13 -; GCN-NEXT: v_mov_b32_e32 v2, s10 -; GCN-NEXT: v_mov_b32_e32 v3, s11 -; GCN-NEXT: global_store_dwordx4 v[0:1], v[16:19], off -; GCN-NEXT: global_store_dwordx4 v[2:3], v[20:23], off +; GCN-NEXT: s_add_u32 s6, s8, 64 +; GCN-NEXT: global_store_dwordx4 v[0:1], v[12:15], off ; GCN-NEXT: s_addc_u32 s7, s9, 0 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: v_mov_b32_e32 v1, s7 +; GCN-NEXT: s_add_u32 s6, s8, s4 +; GCN-NEXT: s_addc_u32 s7, s9, 0 +; GCN-NEXT: global_store_dwordx4 v[0:1], v[16:19], off +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: s_add_u32 s4, s8, s5 +; GCN-NEXT: v_mov_b32_e32 v1, s7 +; GCN-NEXT: global_store_dwordx4 v[0:1], v[20:23], off +; GCN-NEXT: s_addc_u32 s5, s9, 0 ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: s_add_u32 s4, s8, 0x80 -; GCN-NEXT: v_mov_b32_e32 v2, s6 -; GCN-NEXT: s_addc_u32 s5, s9, 0 -; GCN-NEXT: v_mov_b32_e32 v3, s7 -; GCN-NEXT: s_add_u32 s6, s8, s14 +; GCN-NEXT: s_add_u32 s4, s8, s10 ; GCN-NEXT: global_store_dwordx4 v[0:1], v[24:27], off -; GCN-NEXT: global_store_dwordx4 v[2:3], v[28:31], off -; GCN-NEXT: s_addc_u32 s7, s9, 0 +; GCN-NEXT: s_addc_u32 s5, s9, 0 ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: s_add_u32 s4, s8, s15 -; GCN-NEXT: v_mov_b32_e32 v2, s6 +; GCN-NEXT: s_add_u32 s4, s8, 0x80 +; GCN-NEXT: global_store_dwordx4 v[0:1], v[28:31], off ; GCN-NEXT: s_addc_u32 s5, s9, 0 -; GCN-NEXT: v_mov_b32_e32 v3, s7 -; GCN-NEXT: s_add_u32 s6, s8, s16 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: s_add_u32 s4, s8, s11 ; GCN-NEXT: global_store_dwordx4 v[0:1], v[32:35], off -; GCN-NEXT: global_store_dwordx4 v[2:3], v[36:39], off -; GCN-NEXT: s_addc_u32 s7, s9, 0 +; GCN-NEXT: s_addc_u32 s5, s9, 0 ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: s_add_u32 s4, s8, 0xc0 -; GCN-NEXT: v_mov_b32_e32 v2, s6 -; GCN-NEXT: v_mov_b32_e32 v3, s7 +; GCN-NEXT: s_add_u32 s4, s8, s28 +; GCN-NEXT: global_store_dwordx4 v[0:1], v[36:39], off +; GCN-NEXT: s_addc_u32 s5, s9, 0 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: s_add_u32 s4, s8, s29 ; GCN-NEXT: global_store_dwordx4 v[0:1], v[40:43], off -; GCN-NEXT: global_store_dwordx4 v[2:3], v[44:47], off ; GCN-NEXT: s_addc_u32 s5, s9, 0 ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: s_add_u32 s4, s8, s17 +; GCN-NEXT: s_add_u32 s4, s8, 0xc0 +; GCN-NEXT: global_store_dwordx4 v[0:1], v[44:47], off ; GCN-NEXT: s_addc_u32 s5, s9, 0 -; GCN-NEXT: v_mov_b32_e32 v2, s4 -; GCN-NEXT: v_mov_b32_e32 v3, s5 -; GCN-NEXT: s_add_u32 s4, s8, s18 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: s_add_u32 s4, s8, s12 ; GCN-NEXT: global_store_dwordx4 v[0:1], v[48:51], off -; GCN-NEXT: global_store_dwordx4 v[2:3], v[52:55], off ; GCN-NEXT: s_addc_u32 s5, s9, 0 ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: s_add_u32 s4, s8, s19 +; GCN-NEXT: s_add_u32 s4, s8, s13 +; GCN-NEXT: global_store_dwordx4 v[0:1], v[52:55], off ; GCN-NEXT: s_addc_u32 s5, s9, 0 -; GCN-NEXT: v_mov_b32_e32 v2, s4 -; GCN-NEXT: v_mov_b32_e32 v3, s5 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: s_add_u32 s4, s8, s14 ; GCN-NEXT: global_store_dwordx4 v[0:1], v[56:59], off -; GCN-NEXT: global_store_dwordx4 v[2:3], v[60:63], off +; GCN-NEXT: s_addc_u32 s5, s9, 0 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: global_store_dwordx4 v[0:1], v[60:63], off ; GCN-NEXT: s_endpgm %vec = load <64 x i32>, <64 x i32> addrspace(1)* %ptr %insert = insertelement <64 x i32> %vec, i32 %val, i32 %idx diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll @@ -1954,7 +1954,7 @@ ; GFX9-NEXT: s_lshr_b32 s7, s5, 1 ; GFX9-NEXT: s_cmp_eq_u32 s7, 1 ; GFX9-NEXT: s_mov_b32 s2, 0xffff -; GFX9-NEXT: v_mov_b32_e32 v8, 0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_cselect_b32 s0, s9, s8 ; GFX9-NEXT: s_cmp_eq_u32 s7, 2 @@ -1997,16 +1997,16 @@ ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: s_add_u32 s0, 0, 16 +; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: s_addc_u32 s1, 0, 0 -; GFX9-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NEXT: v_mov_b32_e32 v11, s1 -; GFX9-NEXT: v_mov_b32_e32 v9, 0 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-NEXT: v_mov_b32_e32 v6, s6 -; GFX9-NEXT: v_mov_b32_e32 v7, s7 -; GFX9-NEXT: v_mov_b32_e32 v10, s0 -; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off -; GFX9-NEXT: global_store_dwordx4 v[10:11], v[4:7], off +; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off +; GFX9-NEXT: v_mov_b32_e32 v5, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_s_v16i16_s_s: @@ -2015,7 +2015,7 @@ ; GFX8-NEXT: s_lshr_b32 s7, s5, 1 ; GFX8-NEXT: s_cmp_eq_u32 s7, 1 ; GFX8-NEXT: s_mov_b32 s2, 0xffff -; GFX8-NEXT: v_mov_b32_e32 v8, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_cselect_b32 s0, s9, s8 ; GFX8-NEXT: s_cmp_eq_u32 s7, 2 @@ -2058,16 +2058,16 @@ ; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: s_add_u32 s0, 0, 16 +; GFX8-NEXT: v_mov_b32_e32 v5, 0 ; GFX8-NEXT: s_addc_u32 s1, 0, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: v_mov_b32_e32 v11, s1 -; GFX8-NEXT: v_mov_b32_e32 v9, 0 -; GFX8-NEXT: v_mov_b32_e32 v5, s5 -; GFX8-NEXT: v_mov_b32_e32 v6, s6 -; GFX8-NEXT: v_mov_b32_e32 v7, s7 -; GFX8-NEXT: v_mov_b32_e32 v10, s0 -; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] -; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[4:7] +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_endpgm ; ; GFX7-LABEL: insertelement_s_v16i16_s_s: @@ -2108,24 +2108,25 @@ ; GFX7-NEXT: s_cmp_eq_u32 s7, 4 ; GFX7-NEXT: s_cselect_b32 s4, s16, s12 ; GFX7-NEXT: s_cmp_eq_u32 s7, 5 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: s_cselect_b32 s5, s16, s13 ; GFX7-NEXT: s_cmp_eq_u32 s7, 6 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: s_cselect_b32 s6, s16, s14 ; GFX7-NEXT: s_cmp_eq_u32 s7, 7 -; GFX7-NEXT: s_cselect_b32 s7, s16, s15 -; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_mov_b32 s10, -1 ; GFX7-NEXT: s_mov_b32 s11, 0xf000 -; GFX7-NEXT: v_mov_b32_e32 v5, s5 -; GFX7-NEXT: v_mov_b32_e32 v6, s6 -; GFX7-NEXT: v_mov_b32_e32 v7, s7 +; GFX7-NEXT: s_cselect_b32 s7, s16, s15 ; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 -; GFX7-NEXT: buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:16 +; GFX7-NEXT: s_nop 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:16 ; GFX7-NEXT: s_endpgm %vec = load <16 x i16>, <16 x i16> addrspace(4)* %ptr %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx @@ -2329,23 +2330,23 @@ ; GFX9-NEXT: v_mov_b32_e32 v5, s13 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 5 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc -; GFX9-NEXT: v_mov_b32_e32 v4, s12 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, 4 ; GFX9-NEXT: v_mov_b32_e32 v6, s14 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 6 -; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[0:1] -; GFX9-NEXT: s_add_u32 s0, 0, 16 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc +; GFX9-NEXT: v_mov_b32_e32 v4, s12 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, 4 ; GFX9-NEXT: v_mov_b32_e32 v7, s15 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 7 -; GFX9-NEXT: s_addc_u32 s1, 0, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc ; GFX9-NEXT: v_mov_b32_e32 v8, 0 -; GFX9-NEXT: v_mov_b32_e32 v11, s1 +; GFX9-NEXT: s_add_u32 s0, 0, 16 ; GFX9-NEXT: v_mov_b32_e32 v9, 0 -; GFX9-NEXT: v_mov_b32_e32 v10, s0 ; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off -; GFX9-NEXT: global_store_dwordx4 v[10:11], v[4:7], off +; GFX9-NEXT: s_addc_u32 s1, 0, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx4 v[0:1], v[4:7], off ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_s_v16i16_v_s: @@ -2390,23 +2391,23 @@ ; GFX8-NEXT: v_mov_b32_e32 v5, s13 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s2, 5 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc -; GFX8-NEXT: v_mov_b32_e32 v4, s12 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, 4 ; GFX8-NEXT: v_mov_b32_e32 v6, s14 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s2, 6 -; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[0:1] -; GFX8-NEXT: s_add_u32 s0, 0, 16 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc +; GFX8-NEXT: v_mov_b32_e32 v4, s12 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, 4 ; GFX8-NEXT: v_mov_b32_e32 v7, s15 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s2, 7 -; GFX8-NEXT: s_addc_u32 s1, 0, 0 +; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc ; GFX8-NEXT: v_mov_b32_e32 v8, 0 -; GFX8-NEXT: v_mov_b32_e32 v11, s1 +; GFX8-NEXT: s_add_u32 s0, 0, 16 ; GFX8-NEXT: v_mov_b32_e32 v9, 0 -; GFX8-NEXT: v_mov_b32_e32 v10, s0 ; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] -; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[4:7] +; GFX8-NEXT: s_addc_u32 s1, 0, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GFX8-NEXT: s_endpgm ; ; GFX7-LABEL: insertelement_s_v16i16_v_s: @@ -2509,8 +2510,8 @@ ; GFX9-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX9-NEXT: v_and_or_b32 v9, v1, v0, v2 ; GFX9-NEXT: v_mov_b32_e32 v0, s16 -; GFX9-NEXT: v_mov_b32_e32 v2, s18 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 ; GFX9-NEXT: v_mov_b32_e32 v3, s19 ; GFX9-NEXT: v_mov_b32_e32 v4, s20 ; GFX9-NEXT: v_mov_b32_e32 v5, s21 @@ -2518,8 +2519,6 @@ ; GFX9-NEXT: v_mov_b32_e32 v7, s23 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[0:1] -; GFX9-NEXT: s_add_u32 s0, 0, 16 -; GFX9-NEXT: s_addc_u32 s1, 0, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[12:13] ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[2:3] @@ -2528,11 +2527,13 @@ ; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9] ; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[10:11] ; GFX9-NEXT: v_mov_b32_e32 v8, 0 -; GFX9-NEXT: v_mov_b32_e32 v11, s1 +; GFX9-NEXT: s_add_u32 s0, 0, 16 ; GFX9-NEXT: v_mov_b32_e32 v9, 0 -; GFX9-NEXT: v_mov_b32_e32 v10, s0 ; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off -; GFX9-NEXT: global_store_dwordx4 v[10:11], v[4:7], off +; GFX9-NEXT: s_addc_u32 s1, 0, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx4 v[0:1], v[4:7], off ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_s_v16i16_s_v: @@ -2572,8 +2573,8 @@ ; GFX8-NEXT: v_and_b32_e32 v0, v1, v0 ; GFX8-NEXT: v_or_b32_e32 v9, v0, v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s16 -; GFX8-NEXT: v_mov_b32_e32 v2, s18 ; GFX8-NEXT: v_mov_b32_e32 v1, s17 +; GFX8-NEXT: v_mov_b32_e32 v2, s18 ; GFX8-NEXT: v_mov_b32_e32 v3, s19 ; GFX8-NEXT: v_mov_b32_e32 v4, s20 ; GFX8-NEXT: v_mov_b32_e32 v5, s21 @@ -2581,8 +2582,6 @@ ; GFX8-NEXT: v_mov_b32_e32 v7, s23 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[0:1] -; GFX8-NEXT: s_add_u32 s0, 0, 16 -; GFX8-NEXT: s_addc_u32 s1, 0, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[12:13] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[2:3] @@ -2591,11 +2590,13 @@ ; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9] ; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[10:11] ; GFX8-NEXT: v_mov_b32_e32 v8, 0 -; GFX8-NEXT: v_mov_b32_e32 v11, s1 +; GFX8-NEXT: s_add_u32 s0, 0, 16 ; GFX8-NEXT: v_mov_b32_e32 v9, 0 -; GFX8-NEXT: v_mov_b32_e32 v10, s0 ; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] -; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[4:7] +; GFX8-NEXT: s_addc_u32 s1, 0, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GFX8-NEXT: s_endpgm ; ; GFX7-LABEL: insertelement_s_v16i16_s_v: @@ -2699,8 +2700,8 @@ ; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX9-NEXT: v_and_or_b32 v9, v2, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s12 -; GFX9-NEXT: v_mov_b32_e32 v2, s14 ; GFX9-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-NEXT: v_mov_b32_e32 v2, s14 ; GFX9-NEXT: v_mov_b32_e32 v3, s15 ; GFX9-NEXT: v_mov_b32_e32 v4, s16 ; GFX9-NEXT: v_mov_b32_e32 v5, s17 @@ -2708,8 +2709,6 @@ ; GFX9-NEXT: v_mov_b32_e32 v7, s19 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[0:1] -; GFX9-NEXT: s_add_u32 s0, 0, 16 -; GFX9-NEXT: s_addc_u32 s1, 0, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[12:13] ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[2:3] @@ -2718,11 +2717,13 @@ ; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9] ; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[10:11] ; GFX9-NEXT: v_mov_b32_e32 v8, 0 -; GFX9-NEXT: v_mov_b32_e32 v11, s1 +; GFX9-NEXT: s_add_u32 s0, 0, 16 ; GFX9-NEXT: v_mov_b32_e32 v9, 0 -; GFX9-NEXT: v_mov_b32_e32 v10, s0 ; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off -; GFX9-NEXT: global_store_dwordx4 v[10:11], v[4:7], off +; GFX9-NEXT: s_addc_u32 s1, 0, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx4 v[0:1], v[4:7], off ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_s_v16i16_v_v: @@ -2761,8 +2762,8 @@ ; GFX8-NEXT: v_and_b32_e32 v1, v2, v1 ; GFX8-NEXT: v_or_b32_e32 v9, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s12 -; GFX8-NEXT: v_mov_b32_e32 v2, s14 ; GFX8-NEXT: v_mov_b32_e32 v1, s13 +; GFX8-NEXT: v_mov_b32_e32 v2, s14 ; GFX8-NEXT: v_mov_b32_e32 v3, s15 ; GFX8-NEXT: v_mov_b32_e32 v4, s16 ; GFX8-NEXT: v_mov_b32_e32 v5, s17 @@ -2770,8 +2771,6 @@ ; GFX8-NEXT: v_mov_b32_e32 v7, s19 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[0:1] -; GFX8-NEXT: s_add_u32 s0, 0, 16 -; GFX8-NEXT: s_addc_u32 s1, 0, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[12:13] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[2:3] @@ -2780,11 +2779,13 @@ ; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9] ; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[10:11] ; GFX8-NEXT: v_mov_b32_e32 v8, 0 -; GFX8-NEXT: v_mov_b32_e32 v11, s1 +; GFX8-NEXT: s_add_u32 s0, 0, 16 ; GFX8-NEXT: v_mov_b32_e32 v9, 0 -; GFX8-NEXT: v_mov_b32_e32 v10, s0 ; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] -; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[4:7] +; GFX8-NEXT: s_addc_u32 s1, 0, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GFX8-NEXT: s_endpgm ; ; GFX7-LABEL: insertelement_s_v16i16_v_v: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll @@ -8,39 +8,39 @@ ; GCN-NEXT: v_lshlrev_b32_e32 v64, 8, v0 ; GCN-NEXT: s_movk_i32 s4, 0x80 ; GCN-NEXT: s_mov_b32 s5, 0 -; GCN-NEXT: v_mov_b32_e32 v2, s4 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: v_add_co_u32_e32 v6, vcc, v0, v64 -; GCN-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc -; GCN-NEXT: v_add_co_u32_e32 v0, vcc, 64, v6 -; GCN-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v7, vcc -; GCN-NEXT: v_mov_b32_e32 v3, s5 -; GCN-NEXT: v_add_co_u32_e32 v2, vcc, v6, v2 -; GCN-NEXT: s_movk_i32 s4, 0xc0 -; GCN-NEXT: v_mov_b32_e32 v4, s4 -; GCN-NEXT: v_addc_co_u32_e32 v3, vcc, v7, v3, vcc -; GCN-NEXT: v_mov_b32_e32 v5, s5 -; GCN-NEXT: v_add_co_u32_e32 v16, vcc, v6, v4 -; GCN-NEXT: v_addc_co_u32_e32 v17, vcc, v7, v5, vcc -; GCN-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[8:11], v[2:3], off offset:32 +; GCN-NEXT: v_add_co_u32_e32 v4, vcc, v0, v64 +; GCN-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GCN-NEXT: v_add_co_u32_e32 v0, vcc, 64, v4 +; GCN-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v5, vcc ; GCN-NEXT: global_load_dwordx4 v[32:35], v[0:1], off offset:16 ; GCN-NEXT: global_load_dwordx4 v[36:39], v[0:1], off offset:32 ; GCN-NEXT: global_load_dwordx4 v[40:43], v[0:1], off offset:48 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 +; GCN-NEXT: s_movk_i32 s4, 0xc0 +; GCN-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NEXT: v_addc_co_u32_e32 v1, vcc, v5, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v3, s5 +; GCN-NEXT: v_add_co_u32_e32 v2, vcc, v4, v2 +; GCN-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v3, vcc ; GCN-NEXT: global_load_dwordx4 v[44:47], v64, s[0:1] ; GCN-NEXT: global_load_dwordx4 v[48:51], v64, s[0:1] offset:16 ; GCN-NEXT: global_load_dwordx4 v[52:55], v64, s[0:1] offset:32 ; GCN-NEXT: global_load_dwordx4 v[56:59], v64, s[0:1] offset:48 ; GCN-NEXT: global_load_dwordx4 v[60:63], v64, s[0:1] offset:64 -; GCN-NEXT: global_load_dwordx4 v[12:15], v[2:3], off offset:48 -; GCN-NEXT: global_load_dwordx4 v[20:23], v[16:17], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[24:27], v[16:17], off offset:32 -; GCN-NEXT: global_load_dwordx4 v[28:31], v[16:17], off offset:48 +; GCN-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16 +; GCN-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:32 +; GCN-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:48 +; GCN-NEXT: global_load_dwordx4 v[20:23], v[2:3], off offset:16 +; GCN-NEXT: global_load_dwordx4 v[24:27], v[2:3], off offset:32 +; GCN-NEXT: global_load_dwordx4 v[28:31], v[2:3], off offset:48 ; GCN-NEXT: global_load_dwordx4 v[0:3], v64, s[0:1] offset:128 ; GCN-NEXT: global_load_dwordx4 v[16:19], v64, s[0:1] offset:192 -; GCN-NEXT: s_waitcnt vmcnt(15) +; GCN-NEXT: s_waitcnt vmcnt(7) ; GCN-NEXT: v_mov_b32_e32 v5, 0x3e7 ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: global_store_dwordx4 v64, v[0:3], s[2:3] offset:128 @@ -55,8 +55,8 @@ ; GCN-NEXT: global_store_dwordx4 v64, v[48:51], s[2:3] offset:16 ; GCN-NEXT: global_store_dwordx4 v64, v[52:55], s[2:3] offset:32 ; GCN-NEXT: global_store_dwordx4 v64, v[56:59], s[2:3] offset:48 -; GCN-NEXT: global_store_dwordx4 v64, v[28:31], s[2:3] offset:240 ; GCN-NEXT: global_store_dwordx4 v64, v[60:63], s[2:3] offset:64 +; GCN-NEXT: global_store_dwordx4 v64, v[28:31], s[2:3] offset:240 ; GCN-NEXT: global_store_dwordx4 v64, v[32:35], s[2:3] offset:80 ; GCN-NEXT: global_store_dwordx4 v64, v[36:39], s[2:3] offset:96 ; GCN-NEXT: global_store_dwordx4 v64, v[40:43], s[2:3] offset:112 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll @@ -177,35 +177,35 @@ ; GFX7-LABEL: store_lds_v4i32_align1: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 8, v2 -; GFX7-NEXT: s_mov_b32 m0, -1 -; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v10, 24, v2 ; GFX7-NEXT: ds_write_b8 v0, v1 ; GFX7-NEXT: ds_write_b8 v0, v5 offset:1 ; GFX7-NEXT: ds_write_b8 v0, v6 offset:2 ; GFX7-NEXT: ds_write_b8 v0, v7 offset:3 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v2 ; GFX7-NEXT: ds_write_b8 v0, v2 offset:4 -; GFX7-NEXT: ds_write_b8 v0, v8 offset:5 -; GFX7-NEXT: ds_write_b8 v0, v9 offset:6 -; GFX7-NEXT: ds_write_b8 v0, v10 offset:7 +; GFX7-NEXT: ds_write_b8 v0, v1 offset:5 +; GFX7-NEXT: ds_write_b8 v0, v5 offset:6 +; GFX7-NEXT: ds_write_b8 v0, v6 offset:7 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 8, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 24, v4 ; GFX7-NEXT: ds_write_b8 v0, v3 offset:8 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:9 ; GFX7-NEXT: ds_write_b8 v0, v2 offset:10 ; GFX7-NEXT: ds_write_b8 v0, v5 offset:11 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v4 ; GFX7-NEXT: ds_write_b8 v0, v4 offset:12 -; GFX7-NEXT: ds_write_b8 v0, v6 offset:13 -; GFX7-NEXT: ds_write_b8 v0, v7 offset:14 -; GFX7-NEXT: ds_write_b8 v0, v8 offset:15 +; GFX7-NEXT: ds_write_b8 v0, v1 offset:13 +; GFX7-NEXT: ds_write_b8 v0, v2 offset:14 +; GFX7-NEXT: ds_write_b8 v0, v3 offset:15 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 1 @@ -227,17 +227,17 @@ ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 8, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v9, 24, v2 ; GFX7-NEXT: ds_write_b8 v0, v1 ; GFX7-NEXT: ds_write_b8 v0, v4 offset:1 ; GFX7-NEXT: ds_write_b8 v0, v5 offset:2 ; GFX7-NEXT: ds_write_b8 v0, v6 offset:3 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v2 ; GFX7-NEXT: ds_write_b8 v0, v2 offset:4 -; GFX7-NEXT: ds_write_b8 v0, v7 offset:5 -; GFX7-NEXT: ds_write_b8 v0, v8 offset:6 -; GFX7-NEXT: ds_write_b8 v0, v9 offset:7 +; GFX7-NEXT: ds_write_b8 v0, v1 offset:5 +; GFX7-NEXT: ds_write_b8 v0, v4 offset:6 +; GFX7-NEXT: ds_write_b8 v0, v5 offset:7 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll @@ -43,50 +43,50 @@ ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: s_lshr_b32 s5, s0, 8 +; GFX9-NEXT: ds_write_b8 v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 ; GFX9-NEXT: s_lshr_b32 s6, s0, 16 ; GFX9-NEXT: s_lshr_b32 s7, s0, 24 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: ds_write_b8 v1, v0 offset:1 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: ds_write_b8 v1, v0 offset:2 +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: ds_write_b8 v1, v0 offset:3 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: s_lshr_b32 s0, s1, 8 -; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: ds_write_b8 v1, v0 offset:4 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: s_lshr_b32 s4, s1, 16 ; GFX9-NEXT: s_lshr_b32 s5, s1, 24 -; GFX9-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NEXT: v_mov_b32_e32 v6, s0 -; GFX9-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-NEXT: ds_write_b8 v1, v0 offset:5 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: ds_write_b8 v1, v0 offset:6 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: ds_write_b8 v1, v0 offset:7 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: s_lshr_b32 s0, s2, 8 +; GFX9-NEXT: ds_write_b8 v1, v0 offset:8 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: s_lshr_b32 s1, s2, 16 -; GFX9-NEXT: v_mov_b32_e32 v7, s4 -; GFX9-NEXT: v_mov_b32_e32 v4, s7 -; GFX9-NEXT: v_mov_b32_e32 v8, s5 -; GFX9-NEXT: ds_write_b8 v1, v0 -; GFX9-NEXT: ds_write_b8 v1, v2 offset:1 -; GFX9-NEXT: ds_write_b8 v1, v3 offset:2 -; GFX9-NEXT: ds_write_b8 v1, v4 offset:3 -; GFX9-NEXT: ds_write_b8 v1, v5 offset:4 -; GFX9-NEXT: ds_write_b8 v1, v6 offset:5 -; GFX9-NEXT: ds_write_b8 v1, v7 offset:6 -; GFX9-NEXT: ds_write_b8 v1, v8 offset:7 ; GFX9-NEXT: s_lshr_b32 s4, s2, 24 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: ds_write_b8 v1, v0 offset:9 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: ds_write_b8 v1, v0 offset:10 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: ds_write_b8 v1, v0 offset:11 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 ; GFX9-NEXT: s_lshr_b32 s0, s3, 8 +; GFX9-NEXT: ds_write_b8 v1, v0 offset:12 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: s_lshr_b32 s1, s3, 16 +; GFX9-NEXT: ds_write_b8 v1, v0 offset:13 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: s_lshr_b32 s2, s3, 24 -; GFX9-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NEXT: v_mov_b32_e32 v5, s3 -; GFX9-NEXT: v_mov_b32_e32 v6, s0 -; GFX9-NEXT: v_mov_b32_e32 v7, s1 -; GFX9-NEXT: v_mov_b32_e32 v8, s2 -; GFX9-NEXT: ds_write_b8 v1, v0 offset:8 -; GFX9-NEXT: ds_write_b8 v1, v2 offset:9 -; GFX9-NEXT: ds_write_b8 v1, v3 offset:10 -; GFX9-NEXT: ds_write_b8 v1, v4 offset:11 -; GFX9-NEXT: ds_write_b8 v1, v5 offset:12 -; GFX9-NEXT: ds_write_b8 v1, v6 offset:13 -; GFX9-NEXT: ds_write_b8 v1, v7 offset:14 -; GFX9-NEXT: ds_write_b8 v1, v8 offset:15 +; GFX9-NEXT: ds_write_b8 v1, v0 offset:14 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: ds_write_b8 v1, v0 offset:15 ; GFX9-NEXT: s_endpgm ; ; GFX7-LABEL: store_lds_v4i32_align1: @@ -96,50 +96,50 @@ ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: s_lshr_b32 s5, s0, 8 +; GFX7-NEXT: ds_write_b8 v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: s_lshr_b32 s6, s0, 16 ; GFX7-NEXT: s_lshr_b32 s7, s0, 24 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: ds_write_b8 v1, v0 offset:1 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: ds_write_b8 v1, v0 offset:2 +; GFX7-NEXT: v_mov_b32_e32 v0, s7 +; GFX7-NEXT: ds_write_b8 v1, v0 offset:3 +; GFX7-NEXT: v_mov_b32_e32 v0, s1 ; GFX7-NEXT: s_lshr_b32 s0, s1, 8 -; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: ds_write_b8 v1, v0 offset:4 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: s_lshr_b32 s4, s1, 16 ; GFX7-NEXT: s_lshr_b32 s5, s1, 24 -; GFX7-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-NEXT: v_mov_b32_e32 v6, s0 -; GFX7-NEXT: v_mov_b32_e32 v3, s6 +; GFX7-NEXT: ds_write_b8 v1, v0 offset:5 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: ds_write_b8 v1, v0 offset:6 +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: ds_write_b8 v1, v0 offset:7 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-NEXT: s_lshr_b32 s0, s2, 8 +; GFX7-NEXT: ds_write_b8 v1, v0 offset:8 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: s_lshr_b32 s1, s2, 16 -; GFX7-NEXT: v_mov_b32_e32 v7, s4 -; GFX7-NEXT: v_mov_b32_e32 v4, s7 -; GFX7-NEXT: v_mov_b32_e32 v8, s5 -; GFX7-NEXT: ds_write_b8 v1, v0 -; GFX7-NEXT: ds_write_b8 v1, v2 offset:1 -; GFX7-NEXT: ds_write_b8 v1, v3 offset:2 -; GFX7-NEXT: ds_write_b8 v1, v4 offset:3 -; GFX7-NEXT: ds_write_b8 v1, v5 offset:4 -; GFX7-NEXT: ds_write_b8 v1, v6 offset:5 -; GFX7-NEXT: ds_write_b8 v1, v7 offset:6 -; GFX7-NEXT: ds_write_b8 v1, v8 offset:7 ; GFX7-NEXT: s_lshr_b32 s4, s2, 24 -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: ds_write_b8 v1, v0 offset:9 +; GFX7-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-NEXT: ds_write_b8 v1, v0 offset:10 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: ds_write_b8 v1, v0 offset:11 +; GFX7-NEXT: v_mov_b32_e32 v0, s3 ; GFX7-NEXT: s_lshr_b32 s0, s3, 8 +; GFX7-NEXT: ds_write_b8 v1, v0 offset:12 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: s_lshr_b32 s1, s3, 16 +; GFX7-NEXT: ds_write_b8 v1, v0 offset:13 +; GFX7-NEXT: v_mov_b32_e32 v0, s1 ; GFX7-NEXT: s_lshr_b32 s2, s3, 24 -; GFX7-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-NEXT: v_mov_b32_e32 v5, s3 -; GFX7-NEXT: v_mov_b32_e32 v6, s0 -; GFX7-NEXT: v_mov_b32_e32 v7, s1 -; GFX7-NEXT: v_mov_b32_e32 v8, s2 -; GFX7-NEXT: ds_write_b8 v1, v0 offset:8 -; GFX7-NEXT: ds_write_b8 v1, v2 offset:9 -; GFX7-NEXT: ds_write_b8 v1, v3 offset:10 -; GFX7-NEXT: ds_write_b8 v1, v4 offset:11 -; GFX7-NEXT: ds_write_b8 v1, v5 offset:12 -; GFX7-NEXT: ds_write_b8 v1, v6 offset:13 -; GFX7-NEXT: ds_write_b8 v1, v7 offset:14 -; GFX7-NEXT: ds_write_b8 v1, v8 offset:15 +; GFX7-NEXT: ds_write_b8 v1, v0 offset:14 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: ds_write_b8 v1, v0 offset:15 ; GFX7-NEXT: s_endpgm store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 1 ret void @@ -152,26 +152,26 @@ ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: s_lshr_b32 s5, s0, 16 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: s_lshr_b32 s5, s0, 16 +; GFX9-NEXT: ds_write_b16 v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: ds_write_b16 v1, v0 offset:2 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: s_lshr_b32 s0, s1, 16 -; GFX9-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-NEXT: ds_write_b16 v1, v0 offset:4 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: ds_write_b16 v1, v0 offset:6 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: s_lshr_b32 s0, s2, 16 -; GFX9-NEXT: v_mov_b32_e32 v6, s0 +; GFX9-NEXT: ds_write_b16 v1, v0 offset:8 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: ds_write_b16 v1, v0 offset:10 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 ; GFX9-NEXT: s_lshr_b32 s0, s3, 16 -; GFX9-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_mov_b32_e32 v5, s2 -; GFX9-NEXT: v_mov_b32_e32 v7, s3 -; GFX9-NEXT: v_mov_b32_e32 v8, s0 -; GFX9-NEXT: ds_write_b16 v1, v0 -; GFX9-NEXT: ds_write_b16 v1, v2 offset:2 -; GFX9-NEXT: ds_write_b16 v1, v3 offset:4 -; GFX9-NEXT: ds_write_b16 v1, v4 offset:6 -; GFX9-NEXT: ds_write_b16 v1, v5 offset:8 -; GFX9-NEXT: ds_write_b16 v1, v6 offset:10 -; GFX9-NEXT: ds_write_b16 v1, v7 offset:12 -; GFX9-NEXT: ds_write_b16 v1, v8 offset:14 +; GFX9-NEXT: ds_write_b16 v1, v0 offset:12 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: ds_write_b16 v1, v0 offset:14 ; GFX9-NEXT: s_endpgm ; ; GFX7-LABEL: store_lds_v4i32_align2: @@ -181,26 +181,26 @@ ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 -; GFX7-NEXT: s_lshr_b32 s5, s0, 16 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: s_lshr_b32 s5, s0, 16 +; GFX7-NEXT: ds_write_b16 v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: ds_write_b16 v1, v0 offset:2 +; GFX7-NEXT: v_mov_b32_e32 v0, s1 ; GFX7-NEXT: s_lshr_b32 s0, s1, 16 -; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: ds_write_b16 v1, v0 offset:4 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: ds_write_b16 v1, v0 offset:6 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-NEXT: s_lshr_b32 s0, s2, 16 -; GFX7-NEXT: v_mov_b32_e32 v6, s0 +; GFX7-NEXT: ds_write_b16 v1, v0 offset:8 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: ds_write_b16 v1, v0 offset:10 +; GFX7-NEXT: v_mov_b32_e32 v0, s3 ; GFX7-NEXT: s_lshr_b32 s0, s3, 16 -; GFX7-NEXT: v_mov_b32_e32 v2, s5 -; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v5, s2 -; GFX7-NEXT: v_mov_b32_e32 v7, s3 -; GFX7-NEXT: v_mov_b32_e32 v8, s0 -; GFX7-NEXT: ds_write_b16 v1, v0 -; GFX7-NEXT: ds_write_b16 v1, v2 offset:2 -; GFX7-NEXT: ds_write_b16 v1, v3 offset:4 -; GFX7-NEXT: ds_write_b16 v1, v4 offset:6 -; GFX7-NEXT: ds_write_b16 v1, v5 offset:8 -; GFX7-NEXT: ds_write_b16 v1, v6 offset:10 -; GFX7-NEXT: ds_write_b16 v1, v7 offset:12 -; GFX7-NEXT: ds_write_b16 v1, v8 offset:14 +; GFX7-NEXT: ds_write_b16 v1, v0 offset:12 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: ds_write_b16 v1, v0 offset:14 ; GFX7-NEXT: s_endpgm store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 2 ret void diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll @@ -41,39 +41,39 @@ ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: s_lshr_b32 s3, s0, 8 +; GFX9-NEXT: ds_write_b8 v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 ; GFX9-NEXT: s_lshr_b32 s5, s0, 16 ; GFX9-NEXT: s_lshr_b32 s6, s0, 24 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NEXT: ds_write_b8 v1, v0 offset:1 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: ds_write_b8 v1, v0 offset:2 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: ds_write_b8 v1, v0 offset:3 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: s_lshr_b32 s0, s1, 8 +; GFX9-NEXT: ds_write_b8 v1, v0 offset:4 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: s_lshr_b32 s3, s1, 16 ; GFX9-NEXT: s_lshr_b32 s4, s1, 24 -; GFX9-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NEXT: v_mov_b32_e32 v6, s0 -; GFX9-NEXT: v_mov_b32_e32 v7, s3 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: v_mov_b32_e32 v4, s6 -; GFX9-NEXT: v_mov_b32_e32 v8, s4 -; GFX9-NEXT: ds_write_b8 v1, v0 -; GFX9-NEXT: ds_write_b8 v1, v2 offset:1 -; GFX9-NEXT: ds_write_b8 v1, v3 offset:2 -; GFX9-NEXT: ds_write_b8 v1, v4 offset:3 -; GFX9-NEXT: ds_write_b8 v1, v5 offset:4 -; GFX9-NEXT: ds_write_b8 v1, v6 offset:5 -; GFX9-NEXT: ds_write_b8 v1, v7 offset:6 -; GFX9-NEXT: ds_write_b8 v1, v8 offset:7 +; GFX9-NEXT: ds_write_b8 v1, v0 offset:5 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: ds_write_b8 v1, v0 offset:6 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: ds_write_b8 v1, v0 offset:7 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: s_lshr_b32 s0, s2, 8 +; GFX9-NEXT: ds_write_b8 v1, v0 offset:8 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: s_lshr_b32 s1, s2, 16 +; GFX9-NEXT: ds_write_b8 v1, v0 offset:9 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: s_lshr_b32 s3, s2, 24 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: ds_write_b8 v1, v0 offset:8 -; GFX9-NEXT: ds_write_b8 v1, v2 offset:9 -; GFX9-NEXT: ds_write_b8 v1, v3 offset:10 -; GFX9-NEXT: ds_write_b8 v1, v4 offset:11 +; GFX9-NEXT: ds_write_b8 v1, v0 offset:10 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: ds_write_b8 v1, v0 offset:11 ; GFX9-NEXT: s_endpgm ; ; GFX7-LABEL: store_lds_v3i32_align1: @@ -83,39 +83,39 @@ ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: s_lshr_b32 s3, s0, 8 +; GFX7-NEXT: ds_write_b8 v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s3 ; GFX7-NEXT: s_lshr_b32 s5, s0, 16 ; GFX7-NEXT: s_lshr_b32 s6, s0, 24 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s3 +; GFX7-NEXT: ds_write_b8 v1, v0 offset:1 +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: ds_write_b8 v1, v0 offset:2 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: ds_write_b8 v1, v0 offset:3 +; GFX7-NEXT: v_mov_b32_e32 v0, s1 ; GFX7-NEXT: s_lshr_b32 s0, s1, 8 +; GFX7-NEXT: ds_write_b8 v1, v0 offset:4 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: s_lshr_b32 s3, s1, 16 ; GFX7-NEXT: s_lshr_b32 s4, s1, 24 -; GFX7-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-NEXT: v_mov_b32_e32 v6, s0 -; GFX7-NEXT: v_mov_b32_e32 v7, s3 -; GFX7-NEXT: v_mov_b32_e32 v3, s5 -; GFX7-NEXT: v_mov_b32_e32 v4, s6 -; GFX7-NEXT: v_mov_b32_e32 v8, s4 -; GFX7-NEXT: ds_write_b8 v1, v0 -; GFX7-NEXT: ds_write_b8 v1, v2 offset:1 -; GFX7-NEXT: ds_write_b8 v1, v3 offset:2 -; GFX7-NEXT: ds_write_b8 v1, v4 offset:3 -; GFX7-NEXT: ds_write_b8 v1, v5 offset:4 -; GFX7-NEXT: ds_write_b8 v1, v6 offset:5 -; GFX7-NEXT: ds_write_b8 v1, v7 offset:6 -; GFX7-NEXT: ds_write_b8 v1, v8 offset:7 +; GFX7-NEXT: ds_write_b8 v1, v0 offset:5 +; GFX7-NEXT: v_mov_b32_e32 v0, s3 +; GFX7-NEXT: ds_write_b8 v1, v0 offset:6 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: ds_write_b8 v1, v0 offset:7 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-NEXT: s_lshr_b32 s0, s2, 8 +; GFX7-NEXT: ds_write_b8 v1, v0 offset:8 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: s_lshr_b32 s1, s2, 16 +; GFX7-NEXT: ds_write_b8 v1, v0 offset:9 +; GFX7-NEXT: v_mov_b32_e32 v0, s1 ; GFX7-NEXT: s_lshr_b32 s3, s2, 24 -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v4, s3 -; GFX7-NEXT: ds_write_b8 v1, v0 offset:8 -; GFX7-NEXT: ds_write_b8 v1, v2 offset:9 -; GFX7-NEXT: ds_write_b8 v1, v3 offset:10 -; GFX7-NEXT: ds_write_b8 v1, v4 offset:11 +; GFX7-NEXT: ds_write_b8 v1, v0 offset:10 +; GFX7-NEXT: v_mov_b32_e32 v0, s3 +; GFX7-NEXT: ds_write_b8 v1, v0 offset:11 ; GFX7-NEXT: s_endpgm store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 1 ret void @@ -128,21 +128,21 @@ ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: s_lshr_b32 s3, s0, 16 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: s_lshr_b32 s3, s0, 16 +; GFX9-NEXT: ds_write_b16 v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: ds_write_b16 v1, v0 offset:2 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: s_lshr_b32 s0, s1, 16 -; GFX9-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-NEXT: ds_write_b16 v1, v0 offset:4 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: ds_write_b16 v1, v0 offset:6 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: s_lshr_b32 s0, s2, 16 -; GFX9-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_mov_b32_e32 v5, s2 -; GFX9-NEXT: v_mov_b32_e32 v6, s0 -; GFX9-NEXT: ds_write_b16 v1, v0 -; GFX9-NEXT: ds_write_b16 v1, v2 offset:2 -; GFX9-NEXT: ds_write_b16 v1, v3 offset:4 -; GFX9-NEXT: ds_write_b16 v1, v4 offset:6 -; GFX9-NEXT: ds_write_b16 v1, v5 offset:8 -; GFX9-NEXT: ds_write_b16 v1, v6 offset:10 +; GFX9-NEXT: ds_write_b16 v1, v0 offset:8 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: ds_write_b16 v1, v0 offset:10 ; GFX9-NEXT: s_endpgm ; ; GFX7-LABEL: store_lds_v3i32_align2: @@ -152,21 +152,21 @@ ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s4 -; GFX7-NEXT: s_lshr_b32 s3, s0, 16 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: s_lshr_b32 s3, s0, 16 +; GFX7-NEXT: ds_write_b16 v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s3 +; GFX7-NEXT: ds_write_b16 v1, v0 offset:2 +; GFX7-NEXT: v_mov_b32_e32 v0, s1 ; GFX7-NEXT: s_lshr_b32 s0, s1, 16 -; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: ds_write_b16 v1, v0 offset:4 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: ds_write_b16 v1, v0 offset:6 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-NEXT: s_lshr_b32 s0, s2, 16 -; GFX7-NEXT: v_mov_b32_e32 v2, s3 -; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v5, s2 -; GFX7-NEXT: v_mov_b32_e32 v6, s0 -; GFX7-NEXT: ds_write_b16 v1, v0 -; GFX7-NEXT: ds_write_b16 v1, v2 offset:2 -; GFX7-NEXT: ds_write_b16 v1, v3 offset:4 -; GFX7-NEXT: ds_write_b16 v1, v4 offset:6 -; GFX7-NEXT: ds_write_b16 v1, v5 offset:8 -; GFX7-NEXT: ds_write_b16 v1, v6 offset:10 +; GFX7-NEXT: ds_write_b16 v1, v0 offset:8 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: ds_write_b16 v1, v0 offset:10 ; GFX7-NEXT: s_endpgm store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 2 ret void diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll @@ -3316,13 +3316,14 @@ ; GCN-NEXT: v_and_b32_e32 v2, s3, v3 ; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v5, vcc ; GCN-NEXT: v_and_b32_e32 v3, s3, v4 -; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 ; GCN-NEXT: v_lshlrev_b32_e32 v3, 15, v3 +; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 ; GCN-NEXT: v_or_b32_e32 v2, v2, v3 ; GCN-NEXT: v_or_b32_e32 v0, v2, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0x1fff, v1 ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; GCN-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v0, 0x1fff, v1 +; GCN-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 ; GCN-NEXT: s_endpgm %r = udiv <3 x i15> %x, %y store <3 x i15> %r, <3 x i15> addrspace(1)* %out @@ -3460,9 +3461,10 @@ ; GCN-NEXT: v_lshlrev_b32_e32 v3, 15, v3 ; GCN-NEXT: v_or_b32_e32 v2, v2, v3 ; GCN-NEXT: v_or_b32_e32 v0, v2, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0x1fff, v1 ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; GCN-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v0, 0x1fff, v1 +; GCN-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 ; GCN-NEXT: s_endpgm %r = urem <3 x i15> %x, %y store <3 x i15> %r, <3 x i15> addrspace(1)* %out @@ -3612,9 +3614,10 @@ ; GCN-NEXT: v_lshlrev_b32_e32 v3, 15, v3 ; GCN-NEXT: v_or_b32_e32 v2, v2, v3 ; GCN-NEXT: v_or_b32_e32 v0, v2, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0x1fff, v1 ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; GCN-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v0, 0x1fff, v1 +; GCN-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 ; GCN-NEXT: s_endpgm %r = sdiv <3 x i15> %x, %y store <3 x i15> %r, <3 x i15> addrspace(1)* %out @@ -3780,13 +3783,14 @@ ; GCN-NEXT: v_sub_i32_e32 v3, vcc, s8, v3 ; GCN-NEXT: v_and_b32_e32 v3, s3, v3 ; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v1, v0 -; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 ; GCN-NEXT: v_lshlrev_b32_e32 v3, 15, v3 +; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 ; GCN-NEXT: v_or_b32_e32 v2, v2, v3 ; GCN-NEXT: v_or_b32_e32 v0, v2, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0x1fff, v1 ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; GCN-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v0, 0x1fff, v1 +; GCN-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 ; GCN-NEXT: s_endpgm %r = srem <3 x i15> %x, %y store <3 x i15> %r, <3 x i15> addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll --- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll @@ -744,13 +744,13 @@ ; GCN-LABEL: {{^}}tail_call_byval_align16: ; GCN-NOT: s32 -; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:8 -; GCN: buffer_load_dword v33, off, s[0:3], s32 offset:12 +; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:12 +; GCN: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; GCN: s_getpc_b64 -; GCN: buffer_store_dword v33, off, s[0:3], s32 offset:4 -; GCN: buffer_store_dword v32, off, s[0:3], s32{{$}} +; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:4 +; GCN: buffer_store_dword v33, off, s[0:3], s32{{$}} ; GCN-NOT: s32 ; GCN: s_setpc_b64 define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 { @@ -777,12 +777,12 @@ ; GCN-LABEL: {{^}}stack_12xv3i32: ; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 12 -; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 13 -; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 14 -; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 15 ; GCN: buffer_store_dword [[REG12]], {{.*$}} +; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 13 ; GCN: buffer_store_dword [[REG13]], {{.*}} offset:4 +; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 14 ; GCN: buffer_store_dword [[REG14]], {{.*}} offset:8 +; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 15 ; GCN: buffer_store_dword [[REG15]], {{.*}} offset:12 ; GCN: v_mov_b32_e32 v31, 11 ; GCN: s_getpc @@ -806,12 +806,12 @@ ; GCN-LABEL: {{^}}stack_12xv3f32: ; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 0x41400000 -; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 0x41500000 -; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 0x41600000 -; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 0x41700000 ; GCN: buffer_store_dword [[REG12]], {{.*$}} +; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 0x41500000 ; GCN: buffer_store_dword [[REG13]], {{.*}} offset:4 +; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 0x41600000 ; GCN: buffer_store_dword [[REG14]], {{.*}} offset:8 +; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 0x41700000 ; GCN: buffer_store_dword [[REG15]], {{.*}} offset:12 ; GCN: v_mov_b32_e32 v31, 0x41300000 ; GCN: s_getpc @@ -836,20 +836,20 @@ ; GCN-LABEL: {{^}}stack_8xv5i32: ; GCN: v_mov_b32_e32 [[REG8:v[0-9]+]], 8 -; GCN: v_mov_b32_e32 [[REG9:v[0-9]+]], 9 -; GCN: v_mov_b32_e32 [[REG10:v[0-9]+]], 10 -; GCN: v_mov_b32_e32 [[REG11:v[0-9]+]], 11 -; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 12 -; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 13 -; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 14 -; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 15 ; GCN: buffer_store_dword [[REG8]], {{.*$}} +; GCN: v_mov_b32_e32 [[REG9:v[0-9]+]], 9 ; GCN: buffer_store_dword [[REG9]], {{.*}} offset:4 +; GCN: v_mov_b32_e32 [[REG10:v[0-9]+]], 10 ; GCN: buffer_store_dword [[REG10]], {{.*}} offset:8 +; GCN: v_mov_b32_e32 [[REG11:v[0-9]+]], 11 ; GCN: buffer_store_dword [[REG11]], {{.*}} offset:12 +; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 12 ; GCN: buffer_store_dword [[REG12]], {{.*}} offset:16 +; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 13 ; GCN: buffer_store_dword [[REG13]], {{.*}} offset:20 +; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 14 ; GCN: buffer_store_dword [[REG14]], {{.*}} offset:24 +; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 15 ; GCN: buffer_store_dword [[REG15]], {{.*}} offset:28 ; GCN: v_mov_b32_e32 v31, 7 @@ -870,20 +870,20 @@ ; GCN-LABEL: {{^}}stack_8xv5f32: ; GCN: v_mov_b32_e32 [[REG8:v[0-9]+]], 0x41000000 -; GCN: v_mov_b32_e32 [[REG9:v[0-9]+]], 0x41100000 -; GCN: v_mov_b32_e32 [[REG10:v[0-9]+]], 0x41200000 -; GCN: v_mov_b32_e32 [[REG11:v[0-9]+]], 0x41300000 -; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 0x41400000 -; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 0x41500000 -; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 0x41600000 -; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 0x41700000 ; GCN: buffer_store_dword [[REG8]], {{.*$}} +; GCN: v_mov_b32_e32 [[REG9:v[0-9]+]], 0x41100000 ; GCN: buffer_store_dword [[REG9]], {{.*}} offset:4 +; GCN: v_mov_b32_e32 [[REG10:v[0-9]+]], 0x41200000 ; GCN: buffer_store_dword [[REG10]], {{.*}} offset:8 +; GCN: v_mov_b32_e32 [[REG11:v[0-9]+]], 0x41300000 ; GCN: buffer_store_dword [[REG11]], {{.*}} offset:12 +; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 0x41400000 ; GCN: buffer_store_dword [[REG12]], {{.*}} offset:16 +; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 0x41500000 ; GCN: buffer_store_dword [[REG13]], {{.*}} offset:20 +; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 0x41600000 ; GCN: buffer_store_dword [[REG14]], {{.*}} offset:24 +; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 0x41700000 ; GCN: buffer_store_dword [[REG15]], {{.*}} offset:28 ; GCN: v_mov_b32_e32 v31, 0x40e00000 diff --git a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll --- a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll +++ b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll @@ -31,9 +31,7 @@ %la3 = getelementptr inbounds i32, i32* %lb, i32 6 %ld3 = load i32, i32* %la3 -; DBG: Cluster ld/st SU([[S1:[0-9]+]]) - SU([[S2:[0-9]+]]) -; DBG: Cluster ld/st SU([[S2]]) - SU([[S3:[0-9]+]]) -; DBG: Cluster ld/st SU([[S3]]) - SU([[S4:[0-9]+]]) +; DBG-NOT: Cluster ld/st ; GCN: flat_store_dword v[{{[0-9:]+}}], [[LD1]] ; GCN-NEXT: flat_store_dword v[{{[0-9:]+}}], [[LD2]] offset:8 ; GCN-NEXT: flat_store_dword v[{{[0-9:]+}}], [[LD3]] offset:16 @@ -78,13 +76,11 @@ %la3 = getelementptr inbounds i32, i32* %lb, i32 6 %ld3 = load i32, i32* %la3 -; DBG: Cluster ld/st SU([[S1:[0-9]+]]) - SU([[S2:[0-9]+]]) -; DBG: Cluster ld/st SU([[S2]]) - SU([[S3:[0-9]+]]) -; DBG: Cluster ld/st SU([[S3]]) - SU([[S4:[0-9]+]]) -; GCN: v_add_u32_e32 [[ST2:v[0-9]+]], 1, [[LD2]] +; DBG-NOT: Cluster ld/st ; GCN: flat_store_dword v[{{[0-9:]+}}], [[LD1]] -; GCN-NEXT: flat_store_dword v[{{[0-9:]+}}], [[ST2]] offset:8 +; GCN: v_add_u32_e32 [[ST2:v[0-9]+]], 1, [[LD2]] ; GCN-NEXT: flat_store_dword v[{{[0-9:]+}}], [[LD3]] offset:16 +; GCN-NEXT: flat_store_dword v[{{[0-9:]+}}], [[ST2]] offset:8 ; GCN-NEXT: flat_store_dword v[{{[0-9:]+}}], [[LD4]] offset:24 %sa0 = getelementptr inbounds i32, i32* %sb, i32 0 store i32 %ld0, i32* %sa0 @@ -125,7 +121,6 @@ ; CHECK-LABEL: {{^}}no_cluster_image_load: ; DBG: Num BaseOps: {{[1-9]+}}, Offset: {{[0-9]+}}, OffsetIsScalable: {{[01]}}, Width: 16 ; DBG: Num BaseOps: {{[1-9]+}}, Offset: {{[0-9]+}}, OffsetIsScalable: {{[01]}}, Width: 16 -; DBG: Num BaseOps: {{[1-9]+}}, Offset: {{[0-9]+}}, OffsetIsScalable: {{[01]}}, Width: 16 ; DBG-NOT: {{^}}Cluster ld/st define amdgpu_ps void @no_cluster_image_load(<8 x i32> inreg %src1, <8 x i32> inreg %src2, <8 x i32> inreg %dst, i32 %x, i32 %y) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll --- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll +++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll @@ -156,28 +156,28 @@ ; GFX7-ALIGNED-LABEL: global_store_2xi16_align1: ; GFX7-ALIGNED: ; %bb.0: ; GFX7-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 -; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v4, 1 -; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v5, 0 +; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v2, 1 +; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v3, 0 ; GFX7-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-ALIGNED-NEXT: s_add_u32 s2, s0, 2 ; GFX7-ALIGNED-NEXT: s_addc_u32 s3, s1, 0 -; GFX7-ALIGNED-NEXT: s_add_u32 s4, s0, 1 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-ALIGNED-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-ALIGNED-NEXT: s_add_u32 s4, s0, 1 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-ALIGNED-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-ALIGNED-NEXT: flat_store_byte v[0:1], v2 +; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-ALIGNED-NEXT: s_add_u32 s0, s0, 3 -; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v3, s5 -; GFX7-ALIGNED-NEXT: flat_store_byte v[0:1], v4 -; GFX7-ALIGNED-NEXT: flat_store_byte v[2:3], v5 +; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-ALIGNED-NEXT: flat_store_byte v[0:1], v3 ; GFX7-ALIGNED-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v4, 2 -; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-ALIGNED-NEXT: flat_store_byte v[0:1], v5 -; GFX7-ALIGNED-NEXT: flat_store_byte v[2:3], v4 +; GFX7-ALIGNED-NEXT: flat_store_byte v[0:1], v3 +; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v2, 2 +; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-ALIGNED-NEXT: flat_store_byte v[0:1], v2 ; GFX7-ALIGNED-NEXT: s_endpgm ; ; GFX7-UNALIGNED-LABEL: global_store_2xi16_align1: diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll --- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll +++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll @@ -73,9 +73,9 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, 1 -; GFX9-NEXT: v_mov_b32_e32 v2, 2 ; GFX9-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen -; GFX9-NEXT: buffer_store_short v2, v1, s[0:3], 0 offen offset:2 +; GFX9-NEXT: v_mov_b32_e32 v0, 2 +; GFX9-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen offset:2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep.r = getelementptr i16, i16 addrspace(5)* %r, i64 1 @@ -140,14 +140,14 @@ ; GFX7-ALIGNED: ; %bb.0: ; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v3, 1 -; GFX7-ALIGNED-NEXT: v_add_i32_e32 v2, vcc, 2, v1 -; GFX7-ALIGNED-NEXT: v_add_i32_e32 v4, vcc, 1, v1 -; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v5, 0 ; GFX7-ALIGNED-NEXT: buffer_store_byte v3, v1, s[0:3], 0 offen -; GFX7-ALIGNED-NEXT: buffer_store_byte v5, v4, s[0:3], 0 offen +; GFX7-ALIGNED-NEXT: v_add_i32_e32 v2, vcc, 2, v1 +; GFX7-ALIGNED-NEXT: v_add_i32_e32 v3, vcc, 1, v1 +; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v4, 0 ; GFX7-ALIGNED-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, 2 -; GFX7-ALIGNED-NEXT: buffer_store_byte v5, v1, s[0:3], 0 offen +; GFX7-ALIGNED-NEXT: buffer_store_byte v4, v3, s[0:3], 0 offen +; GFX7-ALIGNED-NEXT: buffer_store_byte v4, v1, s[0:3], 0 offen ; GFX7-ALIGNED-NEXT: buffer_store_byte v0, v2, s[0:3], 0 offen ; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0) ; GFX7-ALIGNED-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll --- a/llvm/test/CodeGen/AMDGPU/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/fshr.ll @@ -1084,23 +1084,23 @@ ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 4, v6 ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 4, v7 -; GFX9-NEXT: v_mul_lo_u32 v7, v7, 24 ; GFX9-NEXT: v_mul_lo_u32 v6, v6, 24 +; GFX9-NEXT: v_mul_lo_u32 v7, v7, 24 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 -; GFX9-NEXT: v_sub_u32_e32 v2, v2, v7 ; GFX9-NEXT: v_sub_u32_e32 v1, v1, v6 -; GFX9-NEXT: v_add_u32_e32 v2, 8, v2 +; GFX9-NEXT: v_sub_u32_e32 v2, v2, v7 ; GFX9-NEXT: v_add_u32_e32 v1, 8, v1 -; GFX9-NEXT: v_alignbit_b32 v2, v4, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v2, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_alignbit_b32 v1, v8, v5, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v2 +; GFX9-NEXT: v_alignbit_b32 v2, v4, v3, v2 ; GFX9-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:3 +; GFX9-NEXT: buffer_store_byte_d16_hi v2, v0, s[0:3], 0 offen offset:5 ; GFX9-NEXT: buffer_store_byte_d16_hi v1, v0, s[0:3], 0 offen offset:2 ; GFX9-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen -; GFX9-NEXT: buffer_store_byte_d16_hi v2, v0, s[0:3], 0 offen offset:5 -; GFX9-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:4 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v2 +; GFX9-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/half.ll b/llvm/test/CodeGen/AMDGPU/half.ll --- a/llvm/test/CodeGen/AMDGPU/half.ll +++ b/llvm/test/CodeGen/AMDGPU/half.ll @@ -312,7 +312,6 @@ ; SI: v_cvt_f32_f16_e32 ; SI: v_cvt_f32_f16_e32 ; SI: v_cvt_f32_f16_e32 -; SI: v_cvt_f32_f16_e32 ; GCN: flat_store_dwordx4 @@ -326,6 +325,7 @@ ; SI: v_cvt_f32_f16_e32 ; SI: v_cvt_f32_f16_e32 ; SI: v_cvt_f32_f16_e32 +; SI: v_cvt_f32_f16_e32 ; VI: v_cvt_f32_f16_e32 ; VI: v_cvt_f32_f16_sdwa diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll @@ -773,12 +773,13 @@ ; VI-NEXT: v_mov_b32_e32 v1, s11 ; VI-NEXT: v_mov_b32_e32 v2, s10 ; VI-NEXT: v_mov_b32_e32 v3, s9 -; VI-NEXT: v_mov_b32_e32 v4, s8 -; VI-NEXT: v_mov_b32_e32 v5, s7 -; VI-NEXT: v_mov_b32_e32 v6, s6 -; VI-NEXT: v_mov_b32_e32 v7, s5 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 -; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 +; VI-NEXT: s_nop 0 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_endpgm %vecins = insertelement <8 x i32> %a, i32 5, i32 %b store <8 x i32> %vecins, <8 x i32> addrspace(1)* %out, align 32 @@ -910,9 +911,9 @@ ; SI-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5] ; SI-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] ; SI-NEXT: v_mov_b32_e32 v0, s5 -; SI-NEXT: v_mov_b32_e32 v1, s4 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: dynamic_insertelement_v3i16: diff --git a/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll b/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll --- a/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll +++ b/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll @@ -45,7 +45,7 @@ ; GCN: s_barrier -; SI: v_sub_i32_e32 [[SUB1:v[0-9]+]], vcc, 12, [[ADDRW]] +; SI-DAG: v_sub_i32_e32 [[SUB1:v[0-9]+]], vcc, 12, [[ADDRW]] ; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[SUB0]] ; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[SUB1]] ; CI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, [[SUB]] offset0:3 offset1:7 diff --git a/llvm/test/CodeGen/AMDGPU/memory_clause.ll b/llvm/test/CodeGen/AMDGPU/memory_clause.ll --- a/llvm/test/CodeGen/AMDGPU/memory_clause.ll +++ b/llvm/test/CodeGen/AMDGPU/memory_clause.ll @@ -70,16 +70,16 @@ ; GCN-NEXT: v_mov_b32_e32 v5, s5 ; GCN-NEXT: v_mov_b32_e32 v6, s6 ; GCN-NEXT: v_mov_b32_e32 v7, s7 -; GCN-NEXT: global_store_dwordx4 v[12:13], v[0:3], off -; GCN-NEXT: global_store_dwordx4 v[12:13], v[4:7], off offset:16 -; GCN-NEXT: v_mov_b32_e32 v0, s12 ; GCN-NEXT: v_mov_b32_e32 v9, s9 ; GCN-NEXT: v_mov_b32_e32 v10, s10 ; GCN-NEXT: v_mov_b32_e32 v11, s11 +; GCN-NEXT: global_store_dwordx4 v[12:13], v[0:3], off +; GCN-NEXT: global_store_dwordx4 v[12:13], v[4:7], off offset:16 +; GCN-NEXT: global_store_dwordx4 v[12:13], v[8:11], off offset:32 +; GCN-NEXT: v_mov_b32_e32 v0, s12 ; GCN-NEXT: v_mov_b32_e32 v1, s13 ; GCN-NEXT: v_mov_b32_e32 v2, s14 ; GCN-NEXT: v_mov_b32_e32 v3, s15 -; GCN-NEXT: global_store_dwordx4 v[12:13], v[8:11], off offset:32 ; GCN-NEXT: global_store_dwordx4 v[12:13], v[0:3], off offset:48 ; GCN-NEXT: s_endpgm bb: diff --git a/llvm/test/CodeGen/AMDGPU/merge-stores.ll b/llvm/test/CodeGen/AMDGPU/merge-stores.ll --- a/llvm/test/CodeGen/AMDGPU/merge-stores.ll +++ b/llvm/test/CodeGen/AMDGPU/merge-stores.ll @@ -529,8 +529,8 @@ ; GCN-LABEL: {{^}}merge_global_store_5_constants_i32: ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 9{{$}} ; GCN-DAG: v_mov_b32_e32 v[[HI4:[0-9]+]], -12{{$}} -; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], 11{{$}} ; GCN: buffer_store_dwordx4 v{{\[}}[[LO]]:[[HI4]]{{\]}} +; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], 11{{$}} ; GCN: buffer_store_dword v[[HI]] define amdgpu_kernel void @merge_global_store_5_constants_i32(i32 addrspace(1)* %out) { store i32 9, i32 addrspace(1)* %out, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll --- a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll +++ b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll @@ -28,14 +28,14 @@ ; GCN-NEXT: s_cbranch_scc1 BB0_3 ; GCN-NEXT: ; %bb.2: ; %bb.1 ; GCN-NEXT: s_add_i32 s6, s32, 0x1000 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: v_mov_b32_e32 v2, s6 ; GCN-NEXT: s_lshl_b32 s7, s10, 2 ; GCN-NEXT: s_mov_b32 s32, s6 -; GCN-NEXT: v_mov_b32_e32 v2, s6 -; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: s_add_i32 s6, s6, s7 -; GCN-NEXT: v_mov_b32_e32 v3, 1 ; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 +; GCN-NEXT: v_mov_b32_e32 v1, 1 +; GCN-NEXT: s_add_i32 s6, s6, s7 +; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 ; GCN-NEXT: v_mov_b32_e32 v1, s6 ; GCN-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen ; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 @@ -98,14 +98,14 @@ ; GCN-NEXT: ; %bb.1: ; %bb.0 ; GCN-NEXT: s_add_i32 s6, s32, 0x1000 ; GCN-NEXT: s_and_b32 s6, s6, 0xfffff000 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: v_mov_b32_e32 v2, s6 ; GCN-NEXT: s_lshl_b32 s7, s7, 2 ; GCN-NEXT: s_mov_b32 s32, s6 -; GCN-NEXT: v_mov_b32_e32 v2, s6 -; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: s_add_i32 s6, s6, s7 -; GCN-NEXT: v_mov_b32_e32 v3, 1 ; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 +; GCN-NEXT: v_mov_b32_e32 v1, 1 +; GCN-NEXT: s_add_i32 s6, s6, s7 +; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 ; GCN-NEXT: v_mov_b32_e32 v1, s6 ; GCN-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen ; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 @@ -166,9 +166,9 @@ ; GCN-NEXT: s_add_i32 s6, s32, 0x1000 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: v_mov_b32_e32 v3, s6 -; GCN-NEXT: v_mov_b32_e32 v6, 1 ; GCN-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v3, s[0:3], 0 offen offset:4 +; GCN-NEXT: v_mov_b32_e32 v2, 1 +; GCN-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen offset:4 ; GCN-NEXT: v_lshl_add_u32 v2, v4, 2, s6 ; GCN-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen ; GCN-NEXT: v_and_b32_e32 v3, 0x3ff, v5 @@ -228,9 +228,9 @@ ; GCN-NEXT: s_and_b32 s6, s6, 0xfffff000 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: v_mov_b32_e32 v5, s6 -; GCN-NEXT: v_mov_b32_e32 v6, 1 ; GCN-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen offset:4 +; GCN-NEXT: v_mov_b32_e32 v2, 1 +; GCN-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen offset:4 ; GCN-NEXT: v_lshl_add_u32 v2, v3, 2, s6 ; GCN-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen ; GCN-NEXT: v_and_b32_e32 v3, 0x3ff, v4 diff --git a/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll b/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll --- a/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll +++ b/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll @@ -249,13 +249,13 @@ ; CI: v_mov_b32 ; CI: v_mov_b32 -; CI: v_add_i32 -; CI: v_add_i32 +; CI-DAG: v_add_i32 +; CI-DAG: v_add_i32 -; CI: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; CI: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:20{{$}} -; CI: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:36{{$}} -; CI-NEXT: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:52{{$}} +; CI-DAG: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; CI-DAG: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:20{{$}} +; CI-DAG: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:36{{$}} +; CI: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:52{{$}} ; GFX9: global_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:12 ; GFX9: global_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:28 diff --git a/llvm/test/CodeGen/AMDGPU/store-local.128.ll b/llvm/test/CodeGen/AMDGPU/store-local.128.ll --- a/llvm/test/CodeGen/AMDGPU/store-local.128.ll +++ b/llvm/test/CodeGen/AMDGPU/store-local.128.ll @@ -55,42 +55,42 @@ ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: s_lshr_b32 s4, s2, 8 -; GFX9-NEXT: s_lshr_b32 s2, s2, 24 -; GFX9-NEXT: v_mov_b32_e32 v4, s2 -; GFX9-NEXT: s_lshr_b32 s2, s3, 8 -; GFX9-NEXT: v_mov_b32_e32 v6, s0 -; GFX9-NEXT: ds_write_b8 v0, v2 offset:8 -; GFX9-NEXT: ds_write_b8_d16_hi v0, v2 offset:10 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: s_lshr_b32 s2, s0, 8 -; GFX9-NEXT: s_lshr_b32 s0, s0, 24 -; GFX9-NEXT: v_mov_b32_e32 v8, s0 -; GFX9-NEXT: s_lshr_b32 s0, s1, 8 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NEXT: ds_write_b8 v0, v6 -; GFX9-NEXT: ds_write_b8_d16_hi v0, v6 offset:2 -; GFX9-NEXT: v_mov_b32_e32 v6, s0 -; GFX9-NEXT: s_lshr_b32 s0, s3, 24 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:12 -; GFX9-NEXT: ds_write_b8 v0, v5 offset:4 -; GFX9-NEXT: ds_write_b8 v0, v2 offset:13 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: s_lshr_b32 s0, s1, 24 -; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:14 -; GFX9-NEXT: ds_write_b8 v0, v2 offset:15 -; GFX9-NEXT: ds_write_b8 v0, v3 offset:9 -; GFX9-NEXT: ds_write_b8 v0, v4 offset:11 -; GFX9-NEXT: ds_write_b8 v0, v6 offset:5 -; GFX9-NEXT: v_mov_b32_e32 v7, s2 +; GFX9-NEXT: ds_write_b8 v0, v2 offset:8 +; GFX9-NEXT: ds_write_b8_d16_hi v0, v2 offset:10 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: ds_write_b8 v0, v1 offset:4 +; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:6 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: ds_write_b8_d16_hi v0, v5 offset:6 +; GFX9-NEXT: s_lshr_b32 s4, s3, 8 +; GFX9-NEXT: ds_write_b8 v0, v1 +; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:2 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: s_lshr_b32 s3, s3, 24 +; GFX9-NEXT: ds_write_b8 v0, v1 offset:13 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_lshr_b32 s3, s2, 8 +; GFX9-NEXT: ds_write_b8 v0, v1 offset:15 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_lshr_b32 s2, s2, 24 +; GFX9-NEXT: ds_write_b8 v0, v1 offset:9 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: s_lshr_b32 s2, s1, 8 +; GFX9-NEXT: ds_write_b8 v0, v1 offset:11 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: s_lshr_b32 s1, s1, 24 +; GFX9-NEXT: ds_write_b8 v0, v1 offset:5 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_lshr_b32 s1, s0, 8 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:7 -; GFX9-NEXT: ds_write_b8 v0, v7 offset:1 -; GFX9-NEXT: ds_write_b8 v0, v8 offset:3 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_lshr_b32 s0, s0, 24 +; GFX9-NEXT: ds_write_b8 v0, v1 offset:1 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: ds_write_b8 v0, v1 offset:3 ; GFX9-NEXT: s_endpgm ; ; GFX7-LABEL: store_lds_v4i32_align1: @@ -100,50 +100,50 @@ ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: s_lshr_b32 s4, s3, 8 -; GFX7-NEXT: v_mov_b32_e32 v5, s4 -; GFX7-NEXT: s_lshr_b32 s4, s3, 16 ; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: s_lshr_b32 s3, s3, 24 -; GFX7-NEXT: ds_write_b8 v0, v5 offset:13 -; GFX7-NEXT: v_mov_b32_e32 v5, s3 -; GFX7-NEXT: s_lshr_b32 s3, s2, 8 -; GFX7-NEXT: v_mov_b32_e32 v6, s4 -; GFX7-NEXT: ds_write_b8 v0, v5 offset:15 -; GFX7-NEXT: ds_write_b8 v0, v6 offset:14 -; GFX7-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: s_lshr_b32 s3, s2, 16 -; GFX7-NEXT: s_lshr_b32 s2, s2, 24 +; GFX7-NEXT: ds_write_b8 v0, v1 offset:12 ; GFX7-NEXT: ds_write_b8 v0, v2 offset:8 -; GFX7-NEXT: ds_write_b8 v0, v5 offset:9 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_write_b8 v0, v1 offset:4 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: s_lshr_b32 s4, s3, 8 +; GFX7-NEXT: ds_write_b8 v0, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_lshr_b32 s4, s3, 24 +; GFX7-NEXT: ds_write_b8 v0, v1 offset:13 +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: s_lshr_b32 s3, s3, 16 +; GFX7-NEXT: ds_write_b8 v0, v1 offset:15 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: s_lshr_b32 s3, s2, 8 +; GFX7-NEXT: ds_write_b8 v0, v1 offset:14 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: s_lshr_b32 s3, s2, 24 +; GFX7-NEXT: ds_write_b8 v0, v1 offset:9 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: s_lshr_b32 s2, s2, 16 +; GFX7-NEXT: ds_write_b8 v0, v1 offset:11 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: s_lshr_b32 s2, s1, 8 -; GFX7-NEXT: v_mov_b32_e32 v6, s3 -; GFX7-NEXT: ds_write_b8 v0, v1 offset:12 -; GFX7-NEXT: ds_write_b8 v0, v2 offset:11 -; GFX7-NEXT: ds_write_b8 v0, v6 offset:10 +; GFX7-NEXT: ds_write_b8 v0, v1 offset:10 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 -; GFX7-NEXT: s_lshr_b32 s2, s1, 16 -; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: s_lshr_b32 s1, s1, 24 +; GFX7-NEXT: s_lshr_b32 s2, s1, 24 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:5 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: s_lshr_b32 s1, s1, 16 +; GFX7-NEXT: ds_write_b8 v0, v1 offset:7 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: s_lshr_b32 s1, s0, 8 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: ds_write_b8 v0, v1 offset:7 -; GFX7-NEXT: ds_write_b8 v0, v2 offset:6 +; GFX7-NEXT: ds_write_b8 v0, v1 offset:6 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_mov_b32_e32 v4, s0 -; GFX7-NEXT: s_lshr_b32 s1, s0, 16 -; GFX7-NEXT: s_lshr_b32 s0, s0, 24 -; GFX7-NEXT: ds_write_b8 v0, v4 +; GFX7-NEXT: s_lshr_b32 s1, s0, 24 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:1 -; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: v_mov_b32_e32 v1, s0 -; GFX7-NEXT: ds_write_b8 v0, v3 offset:4 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_lshr_b32 s0, s0, 16 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:3 -; GFX7-NEXT: ds_write_b8 v0, v2 offset:2 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: ds_write_b8 v0, v1 offset:2 ; GFX7-NEXT: s_endpgm ; ; GFX6-LABEL: store_lds_v4i32_align1: @@ -153,50 +153,50 @@ ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: s_lshr_b32 s4, s3, 8 -; GFX6-NEXT: v_mov_b32_e32 v5, s4 -; GFX6-NEXT: s_lshr_b32 s4, s3, 16 ; GFX6-NEXT: v_mov_b32_e32 v1, s3 -; GFX6-NEXT: s_lshr_b32 s3, s3, 24 -; GFX6-NEXT: ds_write_b8 v0, v5 offset:13 -; GFX6-NEXT: v_mov_b32_e32 v5, s3 -; GFX6-NEXT: s_lshr_b32 s3, s2, 8 -; GFX6-NEXT: v_mov_b32_e32 v6, s4 -; GFX6-NEXT: ds_write_b8 v0, v5 offset:15 -; GFX6-NEXT: ds_write_b8 v0, v6 offset:14 -; GFX6-NEXT: v_mov_b32_e32 v5, s3 ; GFX6-NEXT: v_mov_b32_e32 v2, s2 -; GFX6-NEXT: s_lshr_b32 s3, s2, 16 -; GFX6-NEXT: s_lshr_b32 s2, s2, 24 +; GFX6-NEXT: ds_write_b8 v0, v1 offset:12 ; GFX6-NEXT: ds_write_b8 v0, v2 offset:8 -; GFX6-NEXT: ds_write_b8 v0, v5 offset:9 -; GFX6-NEXT: v_mov_b32_e32 v2, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: ds_write_b8 v0, v1 offset:4 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_lshr_b32 s4, s3, 8 +; GFX6-NEXT: ds_write_b8 v0, v1 +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_lshr_b32 s4, s3, 24 +; GFX6-NEXT: ds_write_b8 v0, v1 offset:13 +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: s_lshr_b32 s3, s3, 16 +; GFX6-NEXT: ds_write_b8 v0, v1 offset:15 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: s_lshr_b32 s3, s2, 8 +; GFX6-NEXT: ds_write_b8 v0, v1 offset:14 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: s_lshr_b32 s3, s2, 24 +; GFX6-NEXT: ds_write_b8 v0, v1 offset:9 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: s_lshr_b32 s2, s2, 16 +; GFX6-NEXT: ds_write_b8 v0, v1 offset:11 +; GFX6-NEXT: v_mov_b32_e32 v1, s2 ; GFX6-NEXT: s_lshr_b32 s2, s1, 8 -; GFX6-NEXT: v_mov_b32_e32 v6, s3 -; GFX6-NEXT: ds_write_b8 v0, v1 offset:12 -; GFX6-NEXT: ds_write_b8 v0, v2 offset:11 -; GFX6-NEXT: ds_write_b8 v0, v6 offset:10 +; GFX6-NEXT: ds_write_b8 v0, v1 offset:10 ; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: s_lshr_b32 s2, s1, 16 -; GFX6-NEXT: v_mov_b32_e32 v3, s1 -; GFX6-NEXT: s_lshr_b32 s1, s1, 24 +; GFX6-NEXT: s_lshr_b32 s2, s1, 24 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:5 +; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: s_lshr_b32 s1, s1, 16 +; GFX6-NEXT: ds_write_b8 v0, v1 offset:7 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: s_lshr_b32 s1, s0, 8 -; GFX6-NEXT: v_mov_b32_e32 v2, s2 -; GFX6-NEXT: ds_write_b8 v0, v1 offset:7 -; GFX6-NEXT: ds_write_b8 v0, v2 offset:6 +; GFX6-NEXT: ds_write_b8 v0, v1 offset:6 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: v_mov_b32_e32 v4, s0 -; GFX6-NEXT: s_lshr_b32 s1, s0, 16 -; GFX6-NEXT: s_lshr_b32 s0, s0, 24 -; GFX6-NEXT: ds_write_b8 v0, v4 +; GFX6-NEXT: s_lshr_b32 s1, s0, 24 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:1 -; GFX6-NEXT: v_mov_b32_e32 v2, s1 -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: ds_write_b8 v0, v3 offset:4 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_lshr_b32 s0, s0, 16 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:3 -; GFX6-NEXT: ds_write_b8 v0, v2 offset:2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_write_b8 v0, v1 offset:2 ; GFX6-NEXT: s_endpgm store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 1 ret void @@ -210,17 +210,17 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_mov_b32_e32 v4, s0 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: ds_write_b16 v0, v1 offset:12 ; GFX9-NEXT: ds_write_b16_d16_hi v0, v1 offset:14 -; GFX9-NEXT: ds_write_b16 v0, v4 -; GFX9-NEXT: ds_write_b16 v0, v3 offset:4 ; GFX9-NEXT: ds_write_b16 v0, v2 offset:8 -; GFX9-NEXT: ds_write_b16 v0, v1 offset:12 ; GFX9-NEXT: ds_write_b16_d16_hi v0, v2 offset:10 -; GFX9-NEXT: ds_write_b16_d16_hi v0, v3 offset:6 -; GFX9-NEXT: ds_write_b16_d16_hi v0, v4 offset:2 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: ds_write_b16 v0, v1 offset:4 +; GFX9-NEXT: ds_write_b16_d16_hi v0, v1 offset:6 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: ds_write_b16 v0, v1 +; GFX9-NEXT: ds_write_b16_d16_hi v0, v1 offset:2 ; GFX9-NEXT: s_endpgm ; ; GFX7-LABEL: store_lds_v4i32_align2: @@ -230,26 +230,26 @@ ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v4, s0 -; GFX7-NEXT: s_lshr_b32 s0, s0, 16 -; GFX7-NEXT: v_mov_b32_e32 v5, s0 -; GFX7-NEXT: s_lshr_b32 s0, s1, 16 -; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: ds_write_b16 v0, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, s0 -; GFX7-NEXT: s_lshr_b32 s0, s2, 16 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: ds_write_b16 v0, v3 offset:4 -; GFX7-NEXT: v_mov_b32_e32 v3, s0 -; GFX7-NEXT: s_lshr_b32 s0, s3, 16 +; GFX7-NEXT: ds_write_b16 v0, v1 offset:12 ; GFX7-NEXT: ds_write_b16 v0, v2 offset:8 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_write_b16 v0, v1 offset:4 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: s_lshr_b32 s3, s3, 16 +; GFX7-NEXT: ds_write_b16 v0, v1 ; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: ds_write_b16 v0, v2 offset:14 -; GFX7-NEXT: ds_write_b16 v0, v1 offset:12 -; GFX7-NEXT: ds_write_b16 v0, v3 offset:10 -; GFX7-NEXT: ds_write_b16 v0, v4 offset:6 -; GFX7-NEXT: ds_write_b16 v0, v5 offset:2 +; GFX7-NEXT: s_lshr_b32 s2, s2, 16 +; GFX7-NEXT: ds_write_b16 v0, v1 offset:14 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: s_lshr_b32 s1, s1, 16 +; GFX7-NEXT: ds_write_b16 v0, v1 offset:10 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_lshr_b32 s0, s0, 16 +; GFX7-NEXT: ds_write_b16 v0, v1 offset:6 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: ds_write_b16 v0, v1 offset:2 ; GFX7-NEXT: s_endpgm ; ; GFX6-LABEL: store_lds_v4i32_align2: @@ -259,26 +259,26 @@ ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v4, s0 -; GFX6-NEXT: s_lshr_b32 s0, s0, 16 -; GFX6-NEXT: v_mov_b32_e32 v5, s0 -; GFX6-NEXT: s_lshr_b32 s0, s1, 16 -; GFX6-NEXT: v_mov_b32_e32 v3, s1 -; GFX6-NEXT: ds_write_b16 v0, v4 -; GFX6-NEXT: v_mov_b32_e32 v4, s0 -; GFX6-NEXT: s_lshr_b32 s0, s2, 16 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: v_mov_b32_e32 v2, s2 -; GFX6-NEXT: ds_write_b16 v0, v3 offset:4 -; GFX6-NEXT: v_mov_b32_e32 v3, s0 -; GFX6-NEXT: s_lshr_b32 s0, s3, 16 +; GFX6-NEXT: ds_write_b16 v0, v1 offset:12 ; GFX6-NEXT: ds_write_b16 v0, v2 offset:8 -; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: ds_write_b16 v0, v1 offset:4 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_lshr_b32 s3, s3, 16 +; GFX6-NEXT: ds_write_b16 v0, v1 ; GFX6-NEXT: v_mov_b32_e32 v1, s3 -; GFX6-NEXT: ds_write_b16 v0, v2 offset:14 -; GFX6-NEXT: ds_write_b16 v0, v1 offset:12 -; GFX6-NEXT: ds_write_b16 v0, v3 offset:10 -; GFX6-NEXT: ds_write_b16 v0, v4 offset:6 -; GFX6-NEXT: ds_write_b16 v0, v5 offset:2 +; GFX6-NEXT: s_lshr_b32 s2, s2, 16 +; GFX6-NEXT: ds_write_b16 v0, v1 offset:14 +; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: s_lshr_b32 s1, s1, 16 +; GFX6-NEXT: ds_write_b16 v0, v1 offset:10 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_lshr_b32 s0, s0, 16 +; GFX6-NEXT: ds_write_b16 v0, v1 offset:6 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_write_b16 v0, v1 offset:2 ; GFX6-NEXT: s_endpgm store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 2 ret void @@ -307,10 +307,10 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s0 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: v_mov_b32_e32 v3, s2 -; GFX7-NEXT: v_mov_b32_e32 v4, s3 ; GFX7-NEXT: ds_write2_b32 v0, v1, v2 offset1:1 -; GFX7-NEXT: ds_write2_b32 v0, v3, v4 offset0:2 offset1:3 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s3 +; GFX7-NEXT: ds_write2_b32 v0, v1, v2 offset0:2 offset1:3 ; GFX7-NEXT: s_endpgm ; ; GFX6-LABEL: store_lds_v4i32_align4: diff --git a/llvm/test/CodeGen/AMDGPU/store-local.96.ll b/llvm/test/CodeGen/AMDGPU/store-local.96.ll --- a/llvm/test/CodeGen/AMDGPU/store-local.96.ll +++ b/llvm/test/CodeGen/AMDGPU/store-local.96.ll @@ -36,10 +36,10 @@ ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: v_mov_b32_e32 v3, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: ds_write_b32 v2, v1 offset:8 ; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: ds_write_b32 v2, v3 offset:8 ; GFX6-NEXT: ds_write_b64 v2, v[0:1] ; GFX6-NEXT: s_endpgm store <3 x i32> %x, <3 x i32> addrspace(3)* %out @@ -53,33 +53,33 @@ ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: s_lshr_b32 s3, s2, 8 -; GFX9-NEXT: v_mov_b32_e32 v4, s0 -; GFX9-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-NEXT: s_lshr_b32 s3, s0, 8 -; GFX9-NEXT: s_lshr_b32 s0, s0, 24 -; GFX9-NEXT: v_mov_b32_e32 v6, s0 -; GFX9-NEXT: s_lshr_b32 s0, s1, 8 ; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: ds_write_b8 v0, v4 -; GFX9-NEXT: ds_write_b8_d16_hi v0, v4 offset:2 -; GFX9-NEXT: v_mov_b32_e32 v4, s0 -; GFX9-NEXT: s_lshr_b32 s0, s2, 24 +; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:8 -; GFX9-NEXT: ds_write_b8 v0, v3 offset:4 -; GFX9-NEXT: ds_write_b8 v0, v2 offset:9 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: s_lshr_b32 s0, s1, 24 ; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:10 -; GFX9-NEXT: ds_write_b8 v0, v2 offset:11 -; GFX9-NEXT: ds_write_b8 v0, v4 offset:5 -; GFX9-NEXT: v_mov_b32_e32 v5, s3 +; GFX9-NEXT: ds_write_b8 v0, v2 offset:4 +; GFX9-NEXT: ds_write_b8_d16_hi v0, v2 offset:6 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: ds_write_b8_d16_hi v0, v3 offset:6 +; GFX9-NEXT: s_lshr_b32 s3, s2, 8 +; GFX9-NEXT: ds_write_b8 v0, v1 +; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_lshr_b32 s2, s2, 24 +; GFX9-NEXT: ds_write_b8 v0, v1 offset:9 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: s_lshr_b32 s2, s1, 8 +; GFX9-NEXT: ds_write_b8 v0, v1 offset:11 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: s_lshr_b32 s1, s1, 24 +; GFX9-NEXT: ds_write_b8 v0, v1 offset:5 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_lshr_b32 s1, s0, 8 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:7 -; GFX9-NEXT: ds_write_b8 v0, v5 offset:1 -; GFX9-NEXT: ds_write_b8 v0, v6 offset:3 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_lshr_b32 s0, s0, 24 +; GFX9-NEXT: ds_write_b8 v0, v1 offset:1 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: ds_write_b8 v0, v1 offset:3 ; GFX9-NEXT: s_endpgm ; ; GFX7-LABEL: store_lds_v3i32_align1: @@ -89,39 +89,39 @@ ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: s_lshr_b32 s3, s2, 8 -; GFX7-NEXT: v_mov_b32_e32 v4, s3 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 -; GFX7-NEXT: s_lshr_b32 s3, s2, 16 -; GFX7-NEXT: s_lshr_b32 s2, s2, 24 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:8 -; GFX7-NEXT: ds_write_b8 v0, v4 offset:9 +; GFX7-NEXT: ds_write_b8 v0, v2 offset:4 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: s_lshr_b32 s3, s2, 8 +; GFX7-NEXT: ds_write_b8 v0, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: s_lshr_b32 s3, s2, 24 +; GFX7-NEXT: ds_write_b8 v0, v1 offset:9 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: s_lshr_b32 s2, s2, 16 +; GFX7-NEXT: ds_write_b8 v0, v1 offset:11 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: s_lshr_b32 s2, s1, 8 -; GFX7-NEXT: v_mov_b32_e32 v5, s3 -; GFX7-NEXT: ds_write_b8 v0, v1 offset:11 -; GFX7-NEXT: ds_write_b8 v0, v5 offset:10 +; GFX7-NEXT: ds_write_b8 v0, v1 offset:10 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 -; GFX7-NEXT: s_lshr_b32 s2, s1, 16 -; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: s_lshr_b32 s1, s1, 24 +; GFX7-NEXT: s_lshr_b32 s2, s1, 24 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:5 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: s_lshr_b32 s1, s1, 16 +; GFX7-NEXT: ds_write_b8 v0, v1 offset:7 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: s_lshr_b32 s1, s0, 8 -; GFX7-NEXT: v_mov_b32_e32 v4, s2 -; GFX7-NEXT: ds_write_b8 v0, v1 offset:7 -; GFX7-NEXT: ds_write_b8 v0, v4 offset:6 +; GFX7-NEXT: ds_write_b8 v0, v1 offset:6 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_mov_b32_e32 v3, s0 -; GFX7-NEXT: s_lshr_b32 s1, s0, 16 -; GFX7-NEXT: s_lshr_b32 s0, s0, 24 -; GFX7-NEXT: ds_write_b8 v0, v3 +; GFX7-NEXT: s_lshr_b32 s1, s0, 24 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:1 -; GFX7-NEXT: v_mov_b32_e32 v4, s1 -; GFX7-NEXT: v_mov_b32_e32 v1, s0 -; GFX7-NEXT: ds_write_b8 v0, v2 offset:4 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_lshr_b32 s0, s0, 16 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:3 -; GFX7-NEXT: ds_write_b8 v0, v4 offset:2 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: ds_write_b8 v0, v1 offset:2 ; GFX7-NEXT: s_endpgm ; ; GFX6-LABEL: store_lds_v3i32_align1: @@ -131,39 +131,39 @@ ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: s_lshr_b32 s3, s2, 8 -; GFX6-NEXT: v_mov_b32_e32 v4, s3 ; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: s_lshr_b32 s3, s2, 16 -; GFX6-NEXT: s_lshr_b32 s2, s2, 24 +; GFX6-NEXT: v_mov_b32_e32 v2, s1 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:8 -; GFX6-NEXT: ds_write_b8 v0, v4 offset:9 +; GFX6-NEXT: ds_write_b8 v0, v2 offset:4 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_lshr_b32 s3, s2, 8 +; GFX6-NEXT: ds_write_b8 v0, v1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: s_lshr_b32 s3, s2, 24 +; GFX6-NEXT: ds_write_b8 v0, v1 offset:9 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: s_lshr_b32 s2, s2, 16 +; GFX6-NEXT: ds_write_b8 v0, v1 offset:11 ; GFX6-NEXT: v_mov_b32_e32 v1, s2 ; GFX6-NEXT: s_lshr_b32 s2, s1, 8 -; GFX6-NEXT: v_mov_b32_e32 v5, s3 -; GFX6-NEXT: ds_write_b8 v0, v1 offset:11 -; GFX6-NEXT: ds_write_b8 v0, v5 offset:10 +; GFX6-NEXT: ds_write_b8 v0, v1 offset:10 ; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: s_lshr_b32 s2, s1, 16 -; GFX6-NEXT: v_mov_b32_e32 v2, s1 -; GFX6-NEXT: s_lshr_b32 s1, s1, 24 +; GFX6-NEXT: s_lshr_b32 s2, s1, 24 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:5 +; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: s_lshr_b32 s1, s1, 16 +; GFX6-NEXT: ds_write_b8 v0, v1 offset:7 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: s_lshr_b32 s1, s0, 8 -; GFX6-NEXT: v_mov_b32_e32 v4, s2 -; GFX6-NEXT: ds_write_b8 v0, v1 offset:7 -; GFX6-NEXT: ds_write_b8 v0, v4 offset:6 +; GFX6-NEXT: ds_write_b8 v0, v1 offset:6 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: v_mov_b32_e32 v3, s0 -; GFX6-NEXT: s_lshr_b32 s1, s0, 16 -; GFX6-NEXT: s_lshr_b32 s0, s0, 24 -; GFX6-NEXT: ds_write_b8 v0, v3 +; GFX6-NEXT: s_lshr_b32 s1, s0, 24 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:1 -; GFX6-NEXT: v_mov_b32_e32 v4, s1 -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: ds_write_b8 v0, v2 offset:4 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_lshr_b32 s0, s0, 16 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:3 -; GFX6-NEXT: ds_write_b8 v0, v4 offset:2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_write_b8 v0, v1 offset:2 ; GFX6-NEXT: s_endpgm store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 1 ret void @@ -178,13 +178,13 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: ds_write_b16 v0, v1 offset:8 ; GFX9-NEXT: ds_write_b16_d16_hi v0, v1 offset:10 -; GFX9-NEXT: ds_write_b16 v0, v3 ; GFX9-NEXT: ds_write_b16 v0, v2 offset:4 -; GFX9-NEXT: ds_write_b16 v0, v1 offset:8 ; GFX9-NEXT: ds_write_b16_d16_hi v0, v2 offset:6 -; GFX9-NEXT: ds_write_b16_d16_hi v0, v3 offset:2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: ds_write_b16 v0, v1 +; GFX9-NEXT: ds_write_b16_d16_hi v0, v1 offset:2 ; GFX9-NEXT: s_endpgm ; ; GFX7-LABEL: store_lds_v3i32_align2: @@ -194,21 +194,21 @@ ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v3, s0 -; GFX7-NEXT: s_lshr_b32 s0, s0, 16 -; GFX7-NEXT: v_mov_b32_e32 v4, s0 -; GFX7-NEXT: s_lshr_b32 s0, s1, 16 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: ds_write_b16 v0, v3 -; GFX7-NEXT: v_mov_b32_e32 v3, s0 -; GFX7-NEXT: s_lshr_b32 s0, s2, 16 +; GFX7-NEXT: ds_write_b16 v0, v1 offset:8 ; GFX7-NEXT: ds_write_b16 v0, v2 offset:4 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: s_lshr_b32 s2, s2, 16 +; GFX7-NEXT: ds_write_b16 v0, v1 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 -; GFX7-NEXT: ds_write_b16 v0, v2 offset:10 -; GFX7-NEXT: ds_write_b16 v0, v1 offset:8 -; GFX7-NEXT: ds_write_b16 v0, v3 offset:6 -; GFX7-NEXT: ds_write_b16 v0, v4 offset:2 +; GFX7-NEXT: s_lshr_b32 s1, s1, 16 +; GFX7-NEXT: ds_write_b16 v0, v1 offset:10 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_lshr_b32 s0, s0, 16 +; GFX7-NEXT: ds_write_b16 v0, v1 offset:6 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: ds_write_b16 v0, v1 offset:2 ; GFX7-NEXT: s_endpgm ; ; GFX6-LABEL: store_lds_v3i32_align2: @@ -218,21 +218,21 @@ ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v3, s0 -; GFX6-NEXT: s_lshr_b32 s0, s0, 16 -; GFX6-NEXT: v_mov_b32_e32 v4, s0 -; GFX6-NEXT: s_lshr_b32 s0, s1, 16 +; GFX6-NEXT: v_mov_b32_e32 v1, s2 ; GFX6-NEXT: v_mov_b32_e32 v2, s1 -; GFX6-NEXT: ds_write_b16 v0, v3 -; GFX6-NEXT: v_mov_b32_e32 v3, s0 -; GFX6-NEXT: s_lshr_b32 s0, s2, 16 +; GFX6-NEXT: ds_write_b16 v0, v1 offset:8 ; GFX6-NEXT: ds_write_b16 v0, v2 offset:4 -; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_lshr_b32 s2, s2, 16 +; GFX6-NEXT: ds_write_b16 v0, v1 ; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: ds_write_b16 v0, v2 offset:10 -; GFX6-NEXT: ds_write_b16 v0, v1 offset:8 -; GFX6-NEXT: ds_write_b16 v0, v3 offset:6 -; GFX6-NEXT: ds_write_b16 v0, v4 offset:2 +; GFX6-NEXT: s_lshr_b32 s1, s1, 16 +; GFX6-NEXT: ds_write_b16 v0, v1 offset:10 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_lshr_b32 s0, s0, 16 +; GFX6-NEXT: ds_write_b16 v0, v1 offset:6 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_write_b16 v0, v1 offset:2 ; GFX6-NEXT: s_endpgm store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 2 ret void @@ -260,9 +260,9 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s0 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: v_mov_b32_e32 v3, s2 ; GFX7-NEXT: ds_write2_b32 v0, v1, v2 offset1:1 -; GFX7-NEXT: ds_write_b32 v0, v3 offset:8 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: ds_write_b32 v0, v1 offset:8 ; GFX7-NEXT: s_endpgm ; ; GFX6-LABEL: store_lds_v3i32_align4: @@ -302,10 +302,10 @@ ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: v_mov_b32_e32 v3, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: ds_write_b32 v2, v1 offset:8 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: ds_write_b32 v2, v3 offset:8 ; GFX7-NEXT: ds_write_b64 v2, v[0:1] ; GFX7-NEXT: s_endpgm ; @@ -316,10 +316,10 @@ ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: v_mov_b32_e32 v3, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: ds_write_b32 v2, v1 offset:8 ; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: ds_write_b32 v2, v3 offset:8 ; GFX6-NEXT: ds_write_b64 v2, v[0:1] ; GFX6-NEXT: s_endpgm store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 8 @@ -359,10 +359,10 @@ ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: v_mov_b32_e32 v3, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: ds_write_b32 v2, v1 offset:8 ; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: ds_write_b32 v2, v3 offset:8 ; GFX6-NEXT: ds_write_b64 v2, v[0:1] ; GFX6-NEXT: s_endpgm store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 16 diff --git a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll --- a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll +++ b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll @@ -6,14 +6,14 @@ define void @local_store_i56(i56 addrspace(3)* %ptr, i56 %arg) #0 { ; CIVI-LABEL: local_store_i56: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CIVI-NEXT: s_mov_b32 m0, -1 -; CIVI-NEXT: ds_write_b32 v0, v1 -; CIVI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; CIVI-NEXT: ds_write_b16 v0, v2 offset:4 -; CIVI-NEXT: ds_write_b8 v0, v1 offset:6 -; CIVI-NEXT: s_waitcnt lgkmcnt(0) -; CIVI-NEXT: s_setpc_b64 s[30:31] +; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIVI-NEXT: s_mov_b32 m0, -1 +; CIVI-NEXT: ds_write_b16 v0, v2 offset:4 +; CIVI-NEXT: ds_write_b32 v0, v1 +; CIVI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; CIVI-NEXT: ds_write_b8 v0, v1 offset:6 +; CIVI-NEXT: s_waitcnt lgkmcnt(0) +; CIVI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: local_store_i56: ; GFX9: ; %bb.0: @@ -30,70 +30,70 @@ define amdgpu_kernel void @local_store_i55(i55 addrspace(3)* %ptr, i55 %arg) #0 { ; HAWAII-LABEL: local_store_i55: ; HAWAII: ; %bb.0: -; HAWAII-NEXT: s_or_b32 s0, s4, 14 -; HAWAII-NEXT: v_mov_b32_e32 v0, s0 -; HAWAII-NEXT: v_mov_b32_e32 v1, s5 -; HAWAII-NEXT: flat_load_ubyte v0, v[0:1] -; HAWAII-NEXT: s_load_dword s0, s[4:5], 0x0 -; HAWAII-NEXT: s_load_dword s1, s[4:5], 0x2 -; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x3 -; HAWAII-NEXT: s_mov_b32 m0, -1 -; HAWAII-NEXT: s_waitcnt lgkmcnt(0) -; HAWAII-NEXT: v_mov_b32_e32 v1, s0 -; HAWAII-NEXT: v_mov_b32_e32 v2, s1 -; HAWAII-NEXT: v_mov_b32_e32 v3, s2 -; HAWAII-NEXT: s_waitcnt vmcnt(0) -; HAWAII-NEXT: v_and_b32_e32 v0, 0x7f, v0 -; HAWAII-NEXT: ds_write_b8 v1, v0 offset:6 -; HAWAII-NEXT: ds_write_b16 v1, v3 offset:4 -; HAWAII-NEXT: ds_write_b32 v1, v2 -; HAWAII-NEXT: s_endpgm +; HAWAII-NEXT: s_or_b32 s0, s4, 14 +; HAWAII-NEXT: v_mov_b32_e32 v0, s0 +; HAWAII-NEXT: v_mov_b32_e32 v1, s5 +; HAWAII-NEXT: flat_load_ubyte v0, v[0:1] +; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x3 +; HAWAII-NEXT: s_load_dword s0, s[4:5], 0x0 +; HAWAII-NEXT: s_load_dword s1, s[4:5], 0x2 +; HAWAII-NEXT: s_mov_b32 m0, -1 +; HAWAII-NEXT: s_waitcnt lgkmcnt(0) +; HAWAII-NEXT: v_mov_b32_e32 v1, s0 +; HAWAII-NEXT: v_mov_b32_e32 v3, s2 +; HAWAII-NEXT: v_mov_b32_e32 v2, s1 +; HAWAII-NEXT: ds_write_b16 v1, v3 offset:4 +; HAWAII-NEXT: s_waitcnt vmcnt(0) +; HAWAII-NEXT: v_and_b32_e32 v0, 0x7f, v0 +; HAWAII-NEXT: ds_write_b8 v1, v0 offset:6 +; HAWAII-NEXT: ds_write_b32 v1, v2 +; HAWAII-NEXT: s_endpgm ; ; FIJI-LABEL: local_store_i55: ; FIJI: ; %bb.0: -; FIJI-NEXT: s_or_b32 s0, s4, 14 -; FIJI-NEXT: v_mov_b32_e32 v0, s0 -; FIJI-NEXT: v_mov_b32_e32 v1, s5 -; FIJI-NEXT: flat_load_ubyte v0, v[0:1] -; FIJI-NEXT: s_load_dword s0, s[4:5], 0x0 -; FIJI-NEXT: s_load_dword s1, s[4:5], 0x8 -; FIJI-NEXT: s_load_dword s2, s[4:5], 0xc -; FIJI-NEXT: s_mov_b32 m0, -1 -; FIJI-NEXT: s_waitcnt lgkmcnt(0) -; FIJI-NEXT: v_mov_b32_e32 v1, s0 -; FIJI-NEXT: v_mov_b32_e32 v3, s1 -; FIJI-NEXT: s_and_b32 s3, s2, 0xffff -; FIJI-NEXT: v_mov_b32_e32 v2, s2 -; FIJI-NEXT: s_waitcnt vmcnt(0) -; FIJI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; FIJI-NEXT: v_or_b32_e32 v0, s3, v0 -; FIJI-NEXT: v_bfe_u32 v0, v0, 16, 7 -; FIJI-NEXT: ds_write_b8 v1, v0 offset:6 -; FIJI-NEXT: ds_write_b16 v1, v2 offset:4 -; FIJI-NEXT: ds_write_b32 v1, v3 -; FIJI-NEXT: s_endpgm +; FIJI-NEXT: s_or_b32 s0, s4, 14 +; FIJI-NEXT: v_mov_b32_e32 v0, s0 +; FIJI-NEXT: v_mov_b32_e32 v1, s5 +; FIJI-NEXT: flat_load_ubyte v0, v[0:1] +; FIJI-NEXT: s_load_dword s0, s[4:5], 0x0 +; FIJI-NEXT: s_load_dword s1, s[4:5], 0x8 +; FIJI-NEXT: s_load_dword s2, s[4:5], 0xc +; FIJI-NEXT: s_mov_b32 m0, -1 +; FIJI-NEXT: s_waitcnt lgkmcnt(0) +; FIJI-NEXT: v_mov_b32_e32 v1, s0 +; FIJI-NEXT: v_mov_b32_e32 v3, s1 +; FIJI-NEXT: s_and_b32 s3, s2, 0xffff +; FIJI-NEXT: v_mov_b32_e32 v2, s2 +; FIJI-NEXT: ds_write_b16 v1, v2 offset:4 +; FIJI-NEXT: s_waitcnt vmcnt(0) +; FIJI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; FIJI-NEXT: v_or_b32_e32 v0, s3, v0 +; FIJI-NEXT: v_bfe_u32 v0, v0, 16, 7 +; FIJI-NEXT: ds_write_b8 v1, v0 offset:6 +; FIJI-NEXT: ds_write_b32 v1, v3 +; FIJI-NEXT: s_endpgm ; ; GFX9-LABEL: local_store_i55: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: global_load_ubyte_d16_hi v2, v[0:1], off offset:14 -; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX9-NEXT: s_load_dword s2, s[4:5], 0xc -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: s_and_b32 s3, s2, 0xffff -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_e32 v2, s3, v2 -; GFX9-NEXT: v_and_b32_e32 v2, 0x7fffff, v2 -; GFX9-NEXT: ds_write_b8_d16_hi v0, v2 offset:6 -; GFX9-NEXT: ds_write_b16 v0, v1 offset:4 -; GFX9-NEXT: ds_write_b32 v0, v3 -; GFX9-NEXT: s_endpgm +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_load_ubyte_d16_hi v2, v[0:1], off offset:14 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s1, s[4:5], 0x8 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: s_and_b32 s3, s2, 0xffff +; GFX9-NEXT: ds_write_b16 v0, v1 offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_e32 v1, s3, v2 +; GFX9-NEXT: v_and_b32_e32 v1, 0x7fffff, v1 +; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:6 +; GFX9-NEXT: ds_write_b32 v0, v3 +; GFX9-NEXT: s_endpgm store i55 %arg, i55 addrspace(3)* %ptr, align 8 ret void } @@ -101,31 +101,31 @@ define amdgpu_kernel void @local_store_i48(i48 addrspace(3)* %ptr, i48 %arg) #0 { ; HAWAII-LABEL: local_store_i48: ; HAWAII: ; %bb.0: -; HAWAII-NEXT: s_load_dword s0, s[4:5], 0x0 -; HAWAII-NEXT: s_load_dword s1, s[4:5], 0x2 -; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x3 -; HAWAII-NEXT: s_mov_b32 m0, -1 -; HAWAII-NEXT: s_waitcnt lgkmcnt(0) -; HAWAII-NEXT: v_mov_b32_e32 v0, s0 -; HAWAII-NEXT: v_mov_b32_e32 v2, s1 -; HAWAII-NEXT: v_mov_b32_e32 v1, s2 -; HAWAII-NEXT: ds_write_b16 v0, v1 offset:4 -; HAWAII-NEXT: ds_write_b32 v0, v2 -; HAWAII-NEXT: s_endpgm +; HAWAII-NEXT: s_load_dword s0, s[4:5], 0x0 +; HAWAII-NEXT: s_load_dword s1, s[4:5], 0x2 +; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x3 +; HAWAII-NEXT: s_mov_b32 m0, -1 +; HAWAII-NEXT: s_waitcnt lgkmcnt(0) +; HAWAII-NEXT: v_mov_b32_e32 v0, s0 +; HAWAII-NEXT: v_mov_b32_e32 v1, s2 +; HAWAII-NEXT: ds_write_b16 v0, v1 offset:4 +; HAWAII-NEXT: v_mov_b32_e32 v1, s1 +; HAWAII-NEXT: ds_write_b32 v0, v1 +; HAWAII-NEXT: s_endpgm ; ; FIJI-LABEL: local_store_i48: ; FIJI: ; %bb.0: -; FIJI-NEXT: s_load_dword s0, s[4:5], 0x0 -; FIJI-NEXT: s_load_dword s1, s[4:5], 0x8 -; FIJI-NEXT: s_load_dword s2, s[4:5], 0xc -; FIJI-NEXT: s_mov_b32 m0, -1 -; FIJI-NEXT: s_waitcnt lgkmcnt(0) -; FIJI-NEXT: v_mov_b32_e32 v0, s0 -; FIJI-NEXT: v_mov_b32_e32 v2, s1 -; FIJI-NEXT: v_mov_b32_e32 v1, s2 -; FIJI-NEXT: ds_write_b16 v0, v1 offset:4 -; FIJI-NEXT: ds_write_b32 v0, v2 -; FIJI-NEXT: s_endpgm +; FIJI-NEXT: s_load_dword s0, s[4:5], 0x0 +; FIJI-NEXT: s_load_dword s1, s[4:5], 0x8 +; FIJI-NEXT: s_load_dword s2, s[4:5], 0xc +; FIJI-NEXT: s_mov_b32 m0, -1 +; FIJI-NEXT: s_waitcnt lgkmcnt(0) +; FIJI-NEXT: v_mov_b32_e32 v0, s0 +; FIJI-NEXT: v_mov_b32_e32 v1, s2 +; FIJI-NEXT: ds_write_b16 v0, v1 offset:4 +; FIJI-NEXT: v_mov_b32_e32 v1, s1 +; FIJI-NEXT: ds_write_b32 v0, v1 +; FIJI-NEXT: s_endpgm ; ; GFX9-LABEL: local_store_i48: ; GFX9: ; %bb.0: @@ -146,35 +146,35 @@ define amdgpu_kernel void @local_store_i65(i65 addrspace(3)* %ptr, i65 %arg) #0 { ; HAWAII-LABEL: local_store_i65: ; HAWAII: ; %bb.0: -; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x0 -; HAWAII-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 -; HAWAII-NEXT: s_load_dword s3, s[4:5], 0x4 -; HAWAII-NEXT: s_mov_b32 m0, -1 -; HAWAII-NEXT: s_waitcnt lgkmcnt(0) -; HAWAII-NEXT: v_mov_b32_e32 v2, s2 -; HAWAII-NEXT: v_mov_b32_e32 v0, s0 -; HAWAII-NEXT: s_and_b32 s3, s3, 1 -; HAWAII-NEXT: v_mov_b32_e32 v3, s3 -; HAWAII-NEXT: v_mov_b32_e32 v1, s1 -; HAWAII-NEXT: ds_write_b8 v2, v3 offset:8 -; HAWAII-NEXT: ds_write_b64 v2, v[0:1] -; HAWAII-NEXT: s_endpgm +; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x0 +; HAWAII-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; HAWAII-NEXT: s_load_dword s3, s[4:5], 0x4 +; HAWAII-NEXT: s_mov_b32 m0, -1 +; HAWAII-NEXT: s_waitcnt lgkmcnt(0) +; HAWAII-NEXT: v_mov_b32_e32 v2, s2 +; HAWAII-NEXT: s_and_b32 s3, s3, 1 +; HAWAII-NEXT: v_mov_b32_e32 v0, s3 +; HAWAII-NEXT: ds_write_b8 v2, v0 offset:8 +; HAWAII-NEXT: v_mov_b32_e32 v0, s0 +; HAWAII-NEXT: v_mov_b32_e32 v1, s1 +; HAWAII-NEXT: ds_write_b64 v2, v[0:1] +; HAWAII-NEXT: s_endpgm ; ; FIJI-LABEL: local_store_i65: ; FIJI: ; %bb.0: -; FIJI-NEXT: s_load_dword s2, s[4:5], 0x0 -; FIJI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; FIJI-NEXT: s_load_dword s3, s[4:5], 0x10 -; FIJI-NEXT: s_mov_b32 m0, -1 -; FIJI-NEXT: s_waitcnt lgkmcnt(0) -; FIJI-NEXT: v_mov_b32_e32 v2, s2 -; FIJI-NEXT: v_mov_b32_e32 v0, s0 -; FIJI-NEXT: s_and_b32 s3, s3, 1 -; FIJI-NEXT: v_mov_b32_e32 v3, s3 -; FIJI-NEXT: v_mov_b32_e32 v1, s1 -; FIJI-NEXT: ds_write_b8 v2, v3 offset:8 -; FIJI-NEXT: ds_write_b64 v2, v[0:1] -; FIJI-NEXT: s_endpgm +; FIJI-NEXT: s_load_dword s2, s[4:5], 0x0 +; FIJI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; FIJI-NEXT: s_load_dword s3, s[4:5], 0x10 +; FIJI-NEXT: s_mov_b32 m0, -1 +; FIJI-NEXT: s_waitcnt lgkmcnt(0) +; FIJI-NEXT: v_mov_b32_e32 v2, s2 +; FIJI-NEXT: s_and_b32 s3, s3, 1 +; FIJI-NEXT: v_mov_b32_e32 v0, s3 +; FIJI-NEXT: ds_write_b8 v2, v0 offset:8 +; FIJI-NEXT: v_mov_b32_e32 v0, s0 +; FIJI-NEXT: v_mov_b32_e32 v1, s1 +; FIJI-NEXT: ds_write_b64 v2, v[0:1] +; FIJI-NEXT: s_endpgm ; ; GFX9-LABEL: local_store_i65: ; GFX9: ; %bb.0: @@ -218,22 +218,22 @@ define void @local_store_i17(i17 addrspace(3)* %ptr, i17 %arg) #0 { ; CIVI-LABEL: local_store_i17: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CIVI-NEXT: s_mov_b32 m0, -1 -; CIVI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; CIVI-NEXT: ds_write_b16 v0, v1 -; CIVI-NEXT: ds_write_b8 v0, v2 offset:2 -; CIVI-NEXT: s_waitcnt lgkmcnt(0) -; CIVI-NEXT: s_setpc_b64 s[30:31] +; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIVI-NEXT: s_mov_b32 m0, -1 +; CIVI-NEXT: ds_write_b16 v0, v1 +; CIVI-NEXT: v_bfe_u32 v1, v1, 16, 1 +; CIVI-NEXT: ds_write_b8 v0, v1 offset:2 +; CIVI-NEXT: s_waitcnt lgkmcnt(0) +; CIVI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: local_store_i17: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v2, 0x1ffff, v1 -; GFX9-NEXT: ds_write_b16 v0, v1 -; GFX9-NEXT: ds_write_b8_d16_hi v0, v2 offset:2 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_write_b16 v0, v1 +; GFX9-NEXT: v_and_b32_e32 v1, 0x1ffff, v1 +; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] store i17 %arg, i17 addrspace(3)* %ptr, align 8 ret void } diff --git a/llvm/test/CodeGen/AMDGPU/token-factor-inline-limit-test.ll b/llvm/test/CodeGen/AMDGPU/token-factor-inline-limit-test.ll --- a/llvm/test/CodeGen/AMDGPU/token-factor-inline-limit-test.ll +++ b/llvm/test/CodeGen/AMDGPU/token-factor-inline-limit-test.ll @@ -5,37 +5,37 @@ ; GCN-LABEL: {{^}}token_factor_inline_limit_test: ; GCN-TFILD: v_mov_b32_e32 [[REG8:v[0-9]+]], 8 -; GCN-TFILD: v_mov_b32_e32 [[REG9:v[0-9]+]], 9 -; GCN-TFILD: v_mov_b32_e32 [[REG10:v[0-9]+]], 10 -; GCN-TFILD: v_mov_b32_e32 [[REG11:v[0-9]+]], 11 -; GCN-TFILD: v_mov_b32_e32 [[REG12:v[0-9]+]], 12 -; GCN-TFILD: v_mov_b32_e32 [[REG13:v[0-9]+]], 13 -; GCN-TFILD: v_mov_b32_e32 [[REG14:v[0-9]+]], 14 -; GCN-TFILD: v_mov_b32_e32 [[REG15:v[0-9]+]], 15 ; GCN-TFILD: buffer_store_dword [[REG8]], {{.*$}} +; GCN-TFILD: v_mov_b32_e32 [[REG9:v[0-9]+]], 9 ; GCN-TFILD: buffer_store_dword [[REG9]], {{.*}} offset:4 +; GCN-TFILD: v_mov_b32_e32 [[REG10:v[0-9]+]], 10 ; GCN-TFILD: buffer_store_dword [[REG10]], {{.*}} offset:8 +; GCN-TFILD: v_mov_b32_e32 [[REG11:v[0-9]+]], 11 ; GCN-TFILD: buffer_store_dword [[REG11]], {{.*}} offset:12 +; GCN-TFILD: v_mov_b32_e32 [[REG12:v[0-9]+]], 12 ; GCN-TFILD: buffer_store_dword [[REG12]], {{.*}} offset:16 +; GCN-TFILD: v_mov_b32_e32 [[REG13:v[0-9]+]], 13 ; GCN-TFILD: buffer_store_dword [[REG13]], {{.*}} offset:20 +; GCN-TFILD: v_mov_b32_e32 [[REG14:v[0-9]+]], 14 ; GCN-TFILD: buffer_store_dword [[REG14]], {{.*}} offset:24 +; GCN-TFILD: v_mov_b32_e32 [[REG15:v[0-9]+]], 15 ; GCN-TFILD: buffer_store_dword [[REG15]], {{.*}} offset:28 ; GCN-TFIL7: v_mov_b32_e32 [[REG15:v[0-9]+]], 15 -; GCN-TFIL7: v_mov_b32_e32 [[REG14:v[0-9]+]], 14 -; GCN-TFIL7: v_mov_b32_e32 [[REG13:v[0-9]+]], 13 -; GCN-TFIL7: v_mov_b32_e32 [[REG12:v[0-9]+]], 12 -; GCN-TFIL7: v_mov_b32_e32 [[REG11:v[0-9]+]], 11 -; GCN-TFIL7: v_mov_b32_e32 [[REG10:v[0-9]+]], 10 -; GCN-TFIL7: v_mov_b32_e32 [[REG9:v[0-9]+]], 9 -; GCN-TFIL7: v_mov_b32_e32 [[REG8:v[0-9]+]], 8 ; GCN-TFIL7: buffer_store_dword [[REG15]], {{.*}} offset:28 +; GCN-TFIL7: v_mov_b32_e32 [[REG14:v[0-9]+]], 14 ; GCN-TFIL7: buffer_store_dword [[REG14]], {{.*}} offset:24 +; GCN-TFIL7: v_mov_b32_e32 [[REG13:v[0-9]+]], 13 ; GCN-TFIL7: buffer_store_dword [[REG13]], {{.*}} offset:20 +; GCN-TFIL7: v_mov_b32_e32 [[REG12:v[0-9]+]], 12 ; GCN-TFIL7: buffer_store_dword [[REG12]], {{.*}} offset:16 +; GCN-TFIL7: v_mov_b32_e32 [[REG11:v[0-9]+]], 11 ; GCN-TFIL7: buffer_store_dword [[REG11]], {{.*}} offset:12 +; GCN-TFIL7: v_mov_b32_e32 [[REG10:v[0-9]+]], 10 ; GCN-TFIL7: buffer_store_dword [[REG10]], {{.*}} offset:8 +; GCN-TFIL7: v_mov_b32_e32 [[REG9:v[0-9]+]], 9 ; GCN-TFIL7: buffer_store_dword [[REG9]], {{.*}} offset:4 +; GCN-TFIL7: v_mov_b32_e32 [[REG8:v[0-9]+]], 8 ; GCN-TFIL7: buffer_store_dword [[REG8]], {{.*$}} ; GCN: v_mov_b32_e32 v31, 7 diff --git a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll --- a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll +++ b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll @@ -135,12 +135,13 @@ ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_add_i32 s7, s7, 34 ; SI-NEXT: s_or_b32 s7, s7, 4 -; SI-NEXT: s_bfe_u32 s8, s7, 0x10010 ; SI-NEXT: v_mov_b32_e32 v0, s7 -; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: v_mov_b32_e32 v1, s8 +; SI-NEXT: s_bfe_u32 s8, s7, 0x10010 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 -; SI-NEXT: buffer_store_byte v1, off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: widen_i17_constant_load: @@ -157,9 +158,9 @@ ; VI-NEXT: s_or_b32 s0, s0, 4 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: s_bfe_u32 s0, s0, 0x10010 -; VI-NEXT: v_mov_b32_e32 v5, s0 ; VI-NEXT: flat_store_short v[0:1], v4 -; VI-NEXT: flat_store_byte v[2:3], v5 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: flat_store_byte v[2:3], v0 ; VI-NEXT: s_endpgm %load = load i17, i17 addrspace(4)* %arg, align 4 %add = add i17 %load, 34