diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -467,57 +467,19 @@ if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2)) return false; - const MachineOperand *FirstDst = nullptr; - const MachineOperand *SecondDst = nullptr; - - if ((isMUBUF(FirstLdSt) && isMUBUF(SecondLdSt)) || - (isMTBUF(FirstLdSt) && isMTBUF(SecondLdSt)) || - (isFLAT(FirstLdSt) && isFLAT(SecondLdSt))) { - const unsigned MaxGlobalLoadCluster = 7; - if (NumLoads > MaxGlobalLoadCluster) - return false; - - FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdata); - if (!FirstDst) - FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst); - SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdata); - if (!SecondDst) - SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst); - } else if (isSMRD(FirstLdSt) && isSMRD(SecondLdSt)) { - FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::sdst); - SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::sdst); - } else if (isDS(FirstLdSt) && isDS(SecondLdSt)) { - FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst); - SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst); - } - - if (!FirstDst || !SecondDst) - return false; - - // Try to limit clustering based on the total number of bytes loaded - // rather than the number of instructions. This is done to help reduce - // register pressure. The method used is somewhat inexact, though, - // because it assumes that all loads in the cluster will load the - // same number of bytes as FirstLdSt. - - // The unit of this value is bytes. - // FIXME: This needs finer tuning. - unsigned LoadClusterThreshold = 16; + assert((NumLoads > 0) && (NumBytes > 0) && (NumBytes >= NumLoads) && + "Invalid NumLoads/NumBytes values"); - const MachineRegisterInfo &MRI = - FirstLdSt.getParent()->getParent()->getRegInfo(); - - const Register Reg = FirstDst->getReg(); - - const TargetRegisterClass *DstRC = Register::isVirtualRegister(Reg) - ? MRI.getRegClass(Reg) - : RI.getPhysRegClass(Reg); + unsigned MaxNumLoads; + if (NumBytes <= 4 * NumLoads) { + // Loads are dword or smaller (on average). + MaxNumLoads = 5; + } else { + // Loads are bigger than a dword (on average). + MaxNumLoads = 4; + } - // FIXME: NumLoads should not be subtracted 1. This is to match behavior - // of clusterNeighboringMemOps which was previosly passing cluster length - // less 1. LoadClusterThreshold should be tuned instead. - return ((NumLoads - 1) * (RI.getRegSizeInBits(*DstRC) / 8)) <= - LoadClusterThreshold; + return NumLoads <= MaxNumLoads; } // FIXME: This behaves strangely. If, for example, you have 32 load + stores, diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll @@ -235,17 +235,17 @@ ; ; GFX8-LABEL: test_div_fmas_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x4c -; GFX8-NEXT: s_load_dword s3, s[0:1], 0x70 -; GFX8-NEXT: s_load_dword s4, s[0:1], 0x94 -; GFX8-NEXT: s_load_dword s5, s[0:1], 0xb8 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: s_and_b32 s2, 1, s5 -; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 +; GFX8-NEXT: s_load_dword s2, s[0:1], 0xb8 +; GFX8-NEXT: s_load_dword s3, s[0:1], 0x4c +; GFX8-NEXT: s_load_dword s4, s[0:1], 0x70 +; GFX8-NEXT: s_load_dword s5, s[0:1], 0x94 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_and_b32 s2, 1, s2 +; GFX8-NEXT: v_mov_b32_e32 v0, s3 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: v_mov_b32_e32 v2, s5 +; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 ; GFX8-NEXT: s_nop 3 ; GFX8-NEXT: v_div_fmas_f32 v2, v0, v1, v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -527,43 +527,43 @@ define amdgpu_kernel void @test_div_fmas_f64(double addrspace(1)* %out, double %a, double %b, double %c, i1 %d) { ; GFX7-LABEL: test_div_fmas_f64: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dword s8, s[0:1], 0x11 -; GFX7-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: v_mov_b32_e32 v4, s6 -; GFX7-NEXT: s_and_b32 s2, 1, s8 -; GFX7-NEXT: v_mov_b32_e32 v3, s5 -; GFX7-NEXT: v_mov_b32_e32 v5, s7 -; GFX7-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 -; GFX7-NEXT: s_nop 3 -; GFX7-NEXT: v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5] -; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] -; GFX7-NEXT: s_endpgm +; GFX7-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9 +; GFX7-NEXT: s_load_dword s0, s[0:1], 0x11 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-NEXT: v_mov_b32_e32 v4, s10 +; GFX7-NEXT: s_and_b32 s0, 1, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v3, s9 +; GFX7-NEXT: v_mov_b32_e32 v5, s11 +; GFX7-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 +; GFX7-NEXT: s_nop 3 +; GFX7-NEXT: v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5] +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: test_div_fmas_f64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s8, s[0:1], 0x44 -; GFX8-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mov_b32_e32 v4, s6 -; GFX8-NEXT: s_and_b32 s2, 1, s8 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 -; GFX8-NEXT: s_nop 3 -; GFX8-NEXT: v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] -; GFX8-NEXT: s_endpgm +; GFX8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 +; GFX8-NEXT: s_load_dword s0, s[0:1], 0x44 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v2, s8 +; GFX8-NEXT: v_mov_b32_e32 v4, s10 +; GFX8-NEXT: s_and_b32 s0, 1, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v3, s9 +; GFX8-NEXT: v_mov_b32_e32 v5, s11 +; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 +; GFX8-NEXT: s_nop 3 +; GFX8-NEXT: v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm ; ; GFX10_W32-LABEL: test_div_fmas_f64: ; GFX10_W32: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll b/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll --- a/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll @@ -3,7 +3,7 @@ ; GCN-LABEL: {{^}}amdhsa_trap_num_sgprs ; TRAP-HANDLER-ENABLE: NumSgprs: 61 -; TRAP-HANDLER-DISABLE: NumSgprs: 79 +; TRAP-HANDLER-DISABLE: NumSgprs: 77 define amdgpu_kernel void @amdhsa_trap_num_sgprs( i32 addrspace(1)* %out0, i32 %in0, i32 addrspace(1)* %out1, i32 %in1, diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr.ll b/llvm/test/CodeGen/AMDGPU/global-saddr.ll --- a/llvm/test/CodeGen/AMDGPU/global-saddr.ll +++ b/llvm/test/CodeGen/AMDGPU/global-saddr.ll @@ -46,8 +46,8 @@ ; Test various offset boundaries. ; GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:4088{{$}} -; GFX9: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:2040{{$}} ; GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:4088{{$}} +; GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:2056{{$}} %gep11 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 511 %load11 = load i64, i64 addrspace(1)* %gep11 %gep12 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 1023 diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll @@ -690,8 +690,8 @@ ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s11 -; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_cmp_eq_u32_e64 vcc, s6, 3 +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_cndmask_b32_e32 v3, v0, v4, vcc ; VI-NEXT: v_mov_b32_e32 v0, s10 ; VI-NEXT: v_cmp_eq_u32_e64 vcc, s6, 2 diff --git a/llvm/test/CodeGen/AMDGPU/kernel-args.ll b/llvm/test/CodeGen/AMDGPU/kernel-args.ll --- a/llvm/test/CodeGen/AMDGPU/kernel-args.ll +++ b/llvm/test/CodeGen/AMDGPU/kernel-args.ll @@ -855,10 +855,10 @@ ; multiple. ; FUNC-LABEL: {{^}}packed_struct_argument_alignment: ; HSA-GFX9: kernarg_segment_byte_size = 28 -; HSA-GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:17 -; HSA-GFX9: global_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:13 ; HSA-GFX9: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 ; HSA-GFX9: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x4 +; HSA-GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:17 +; HSA-GFX9: global_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:13 define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, i8, <{i32, i64}> %arg1) { %val0 = extractvalue <{i32, i64}> %arg0, 0 %val1 = extractvalue <{i32, i64}> %arg0, 1 diff --git a/llvm/test/CodeGen/AMDGPU/memory_clause.ll b/llvm/test/CodeGen/AMDGPU/memory_clause.ll --- a/llvm/test/CodeGen/AMDGPU/memory_clause.ll +++ b/llvm/test/CodeGen/AMDGPU/memory_clause.ll @@ -51,38 +51,38 @@ define amdgpu_kernel void @scalar_clause(<4 x i32> addrspace(1)* noalias nocapture readonly %arg, <4 x i32> addrspace(1)* noalias nocapture %arg1) { ; GCN-LABEL: scalar_clause: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24 -; GCN-NEXT: s_load_dwordx2 s[18:19], s[0:1], 0x2c -; GCN-NEXT: s_nop 0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_load_dwordx4 s[0:3], s[16:17], 0x0 -; GCN-NEXT: s_load_dwordx4 s[4:7], s[16:17], 0x10 -; GCN-NEXT: s_load_dwordx4 s[8:11], s[16:17], 0x20 -; GCN-NEXT: s_load_dwordx4 s[12:15], s[16:17], 0x30 -; GCN-NEXT: v_mov_b32_e32 v12, s18 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v4, s4 -; GCN-NEXT: v_mov_b32_e32 v8, s8 -; GCN-NEXT: v_mov_b32_e32 v13, s19 -; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: v_mov_b32_e32 v2, s2 -; GCN-NEXT: v_mov_b32_e32 v3, s3 -; GCN-NEXT: v_mov_b32_e32 v5, s5 -; GCN-NEXT: v_mov_b32_e32 v6, s6 -; GCN-NEXT: v_mov_b32_e32 v7, s7 -; GCN-NEXT: global_store_dwordx4 v[12:13], v[0:3], off -; GCN-NEXT: global_store_dwordx4 v[12:13], v[4:7], off offset:16 -; GCN-NEXT: v_mov_b32_e32 v0, s12 -; GCN-NEXT: v_mov_b32_e32 v9, s9 -; GCN-NEXT: v_mov_b32_e32 v10, s10 -; GCN-NEXT: v_mov_b32_e32 v11, s11 -; GCN-NEXT: v_mov_b32_e32 v1, s13 -; GCN-NEXT: v_mov_b32_e32 v2, s14 -; GCN-NEXT: v_mov_b32_e32 v3, s15 -; GCN-NEXT: global_store_dwordx4 v[12:13], v[8:11], off offset:32 -; GCN-NEXT: global_store_dwordx4 v[12:13], v[0:3], off offset:48 -; GCN-NEXT: s_endpgm +; GCN-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[18:19], s[0:1], 0x2c +; GCN-NEXT: s_nop 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_load_dwordx4 s[0:3], s[16:17], 0x0 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[16:17], 0x10 +; GCN-NEXT: s_load_dwordx4 s[8:11], s[16:17], 0x20 +; GCN-NEXT: s_load_dwordx4 s[12:15], s[16:17], 0x30 +; GCN-NEXT: v_mov_b32_e32 v16, s18 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v4, s4 +; GCN-NEXT: v_mov_b32_e32 v8, s8 +; GCN-NEXT: v_mov_b32_e32 v12, s12 +; GCN-NEXT: v_mov_b32_e32 v17, s19 +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: v_mov_b32_e32 v3, s3 +; GCN-NEXT: v_mov_b32_e32 v5, s5 +; GCN-NEXT: v_mov_b32_e32 v6, s6 +; GCN-NEXT: v_mov_b32_e32 v7, s7 +; GCN-NEXT: v_mov_b32_e32 v9, s9 +; GCN-NEXT: v_mov_b32_e32 v10, s10 +; GCN-NEXT: v_mov_b32_e32 v11, s11 +; GCN-NEXT: v_mov_b32_e32 v13, s13 +; GCN-NEXT: v_mov_b32_e32 v14, s14 +; GCN-NEXT: v_mov_b32_e32 v15, s15 +; GCN-NEXT: global_store_dwordx4 v[16:17], v[0:3], off +; GCN-NEXT: global_store_dwordx4 v[16:17], v[4:7], off offset:16 +; GCN-NEXT: global_store_dwordx4 v[16:17], v[8:11], off offset:32 +; GCN-NEXT: global_store_dwordx4 v[16:17], v[12:15], off offset:48 +; GCN-NEXT: s_endpgm bb: %tmp = load <4 x i32>, <4 x i32> addrspace(1)* %arg, align 16 %tmp2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i64 1 diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll --- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll @@ -17,9 +17,9 @@ ; ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 -; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 @@ -86,6 +86,7 @@ ; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] ; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] ; +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 @@ -95,10 +96,8 @@ ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 -; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off ; -; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 @@ -300,9 +299,9 @@ ; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] ; ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 -; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 @@ -456,10 +455,10 @@ ; ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 -; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 ; diff --git a/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll b/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll --- a/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll +++ b/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll @@ -203,10 +203,10 @@ ; GCN-LABEL: {{^}}smrd_valu_ci_offset_x16: ; SI: s_mov_b32 {{s[0-9]+}}, 0x13480 +; SI: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], {{s[0-9]+}} addr64 ; SI: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:16 ; SI: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:32 ; SI: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:48 -; SI: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], {{s[0-9]+}} addr64 ; CI-NOHSA-DAG: s_mov_b32 [[OFFSET0:s[0-9]+]], 0x13480{{$}} ; CI-NOHSA-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET0]] addr64{{$}} ; CI-NOHSA-DAG: s_mov_b32 [[OFFSET1:s[0-9]+]], 0x13490{{$}} diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll b/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll --- a/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll @@ -11,32 +11,32 @@ define amdgpu_kernel void @sgpr_if_else_salu_br(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) { ; SI-LABEL: sgpr_if_else_salu_br: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xb -; SI-NEXT: s_load_dword s0, s[0:1], 0xf -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s8, 0 -; SI-NEXT: s_cbranch_scc0 BB0_2 -; SI-NEXT: ; %bb.1: ; %else -; SI-NEXT: s_add_i32 s0, s11, s0 -; SI-NEXT: s_mov_b64 s[2:3], 0 -; SI-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; SI-NEXT: s_cbranch_vccz BB0_3 -; SI-NEXT: s_branch BB0_4 -; SI-NEXT: BB0_2: -; SI-NEXT: s_mov_b64 s[2:3], -1 -; SI-NEXT: ; implicit-def: $sgpr0 -; SI-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; SI-NEXT: s_cbranch_vccnz BB0_4 -; SI-NEXT: BB0_3: ; %if -; SI-NEXT: s_sub_i32 s0, s9, s10 -; SI-NEXT: BB0_4: ; %endif -; SI-NEXT: s_add_i32 s0, s0, s8 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; SI-NEXT: s_endpgm +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; SI-NEXT: s_load_dword s2, s[0:1], 0xf +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s4, 0 +; SI-NEXT: s_cbranch_scc0 BB0_2 +; SI-NEXT:; %bb.1: ; %else +; SI-NEXT: s_add_i32 s2, s7, s2 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: s_andn2_b64 vcc, exec, s[8:9] +; SI-NEXT: s_cbranch_vccz BB0_3 +; SI-NEXT: s_branch BB0_4 +; SI-NEXT:BB0_2: +; SI-NEXT: s_mov_b64 s[8:9], -1 +; SI-NEXT: ; implicit-def: $sgpr2 +; SI-NEXT: s_andn2_b64 vcc, exec, s[8:9] +; SI-NEXT: s_cbranch_vccnz BB0_4 +; SI-NEXT:BB0_3: ; %if +; SI-NEXT: s_sub_i32 s2, s5, s6 +; SI-NEXT:BB0_4: ; %endif +; SI-NEXT: s_add_i32 s4, s2, s4 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm entry: %0 = icmp eq i32 %a, 0 br i1 %0, label %if, label %else diff --git a/llvm/test/CodeGen/AMDGPU/shift-i128.ll b/llvm/test/CodeGen/AMDGPU/shift-i128.ll --- a/llvm/test/CodeGen/AMDGPU/shift-i128.ll +++ b/llvm/test/CodeGen/AMDGPU/shift-i128.ll @@ -446,68 +446,68 @@ define amdgpu_kernel void @s_shl_v2i128ss(<2 x i128> %lhs, <2 x i128> %rhs) { ; GCN-LABEL: s_shl_v2i128ss: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x8 -; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 -; GCN-NEXT: v_mov_b32_e32 v10, 16 -; GCN-NEXT: v_mov_b32_e32 v8, 0 -; GCN-NEXT: v_mov_b32_e32 v11, 0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_sub_i32 s6, 64, s16 -; GCN-NEXT: v_cmp_lt_u64_e64 s[0:1], s[16:17], 64 -; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[18:19], 0 -; GCN-NEXT: s_sub_i32 s4, s16, 64 -; GCN-NEXT: s_lshr_b64 s[6:7], s[8:9], s6 -; GCN-NEXT: s_lshl_b64 s[24:25], s[10:11], s16 -; GCN-NEXT: s_and_b64 vcc, s[2:3], s[0:1] -; GCN-NEXT: s_or_b64 s[0:1], s[16:17], s[18:19] -; GCN-NEXT: s_lshl_b64 s[4:5], s[8:9], s4 -; GCN-NEXT: s_or_b64 s[6:7], s[24:25], s[6:7] -; GCN-NEXT: v_mov_b32_e32 v0, s5 -; GCN-NEXT: v_mov_b32_e32 v1, s7 -; GCN-NEXT: v_cmp_eq_u64_e64 s[0:1], s[0:1], 0 -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v1, s11 -; GCN-NEXT: v_cndmask_b32_e64 v3, v0, v1, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: v_mov_b32_e32 v1, s6 -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v1, s10 -; GCN-NEXT: s_sub_i32 s6, 64, s20 -; GCN-NEXT: v_cndmask_b32_e64 v2, v0, v1, s[0:1] -; GCN-NEXT: v_cmp_lt_u64_e64 s[0:1], s[20:21], 64 -; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[22:23], 0 -; GCN-NEXT: s_sub_i32 s4, s20, 64 -; GCN-NEXT: s_lshr_b64 s[6:7], s[12:13], s6 -; GCN-NEXT: s_lshl_b64 s[10:11], s[14:15], s20 -; GCN-NEXT: s_lshl_b64 s[4:5], s[12:13], s4 -; GCN-NEXT: s_or_b64 s[6:7], s[10:11], s[6:7] -; GCN-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1] -; GCN-NEXT: s_or_b64 s[2:3], s[20:21], s[22:23] -; GCN-NEXT: v_mov_b32_e32 v0, s5 -; GCN-NEXT: v_mov_b32_e32 v1, s7 -; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[2:3], 0 -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v1, s15 -; GCN-NEXT: v_cndmask_b32_e64 v7, v0, v1, s[2:3] -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: v_mov_b32_e32 v1, s6 -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v1, s14 -; GCN-NEXT: v_cndmask_b32_e64 v6, v0, v1, s[2:3] -; GCN-NEXT: s_lshl_b64 s[2:3], s[8:9], s16 -; GCN-NEXT: v_mov_b32_e32 v0, s3 -; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: s_lshl_b64 s[2:3], s[12:13], s20 -; GCN-NEXT: v_mov_b32_e32 v4, s3 -; GCN-NEXT: v_cndmask_b32_e64 v5, 0, v4, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v4, s2 -; GCN-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v9, 0 -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-NEXT: flat_store_dwordx4 v[10:11], v[4:7] -; GCN-NEXT: flat_store_dwordx4 v[8:9], v[0:3] -; GCN-NEXT: s_endpgm +; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 +; GCN-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x8 +; GCN-NEXT: v_mov_b32_e32 v10, 16 +; GCN-NEXT: v_mov_b32_e32 v8, 0 +; GCN-NEXT: v_mov_b32_e32 v11, 0 +; GCN-NEXT: v_mov_b32_e32 v9, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_sub_i32 s6, 64, s16 +; GCN-NEXT: v_cmp_lt_u64_e64 s[0:1], s[16:17], 64 +; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[18:19], 0 +; GCN-NEXT: s_sub_i32 s4, s16, 64 +; GCN-NEXT: s_lshr_b64 s[6:7], s[8:9], s6 +; GCN-NEXT: s_lshl_b64 s[24:25], s[10:11], s16 +; GCN-NEXT: s_and_b64 vcc, s[2:3], s[0:1] +; GCN-NEXT: s_or_b64 s[0:1], s[16:17], s[18:19] +; GCN-NEXT: s_lshl_b64 s[4:5], s[8:9], s4 +; GCN-NEXT: s_or_b64 s[6:7], s[24:25], s[6:7] +; GCN-NEXT: v_mov_b32_e32 v0, s5 +; GCN-NEXT: v_mov_b32_e32 v1, s7 +; GCN-NEXT: v_cmp_eq_u64_e64 s[0:1], s[0:1], 0 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v1, s11 +; GCN-NEXT: v_cndmask_b32_e64 v3, v0, v1, s[0:1] +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s6 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v1, s10 +; GCN-NEXT: s_sub_i32 s6, 64, s20 +; GCN-NEXT: v_cndmask_b32_e64 v2, v0, v1, s[0:1] +; GCN-NEXT: v_cmp_lt_u64_e64 s[0:1], s[20:21], 64 +; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[22:23], 0 +; GCN-NEXT: s_sub_i32 s4, s20, 64 +; GCN-NEXT: s_lshr_b64 s[6:7], s[12:13], s6 +; GCN-NEXT: s_lshl_b64 s[10:11], s[14:15], s20 +; GCN-NEXT: s_lshl_b64 s[4:5], s[12:13], s4 +; GCN-NEXT: s_or_b64 s[6:7], s[10:11], s[6:7] +; GCN-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1] +; GCN-NEXT: s_or_b64 s[2:3], s[20:21], s[22:23] +; GCN-NEXT: v_mov_b32_e32 v0, s5 +; GCN-NEXT: v_mov_b32_e32 v1, s7 +; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[2:3], 0 +; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[0:1] +; GCN-NEXT: v_mov_b32_e32 v1, s15 +; GCN-NEXT: v_cndmask_b32_e64 v7, v0, v1, s[2:3] +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s6 +; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[0:1] +; GCN-NEXT: v_mov_b32_e32 v1, s14 +; GCN-NEXT: v_cndmask_b32_e64 v6, v0, v1, s[2:3] +; GCN-NEXT: s_lshl_b64 s[2:3], s[8:9], s16 +; GCN-NEXT: v_mov_b32_e32 v0, s3 +; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: s_lshl_b64 s[2:3], s[12:13], s20 +; GCN-NEXT: v_mov_b32_e32 v4, s3 +; GCN-NEXT: v_cndmask_b32_e64 v5, 0, v4, s[0:1] +; GCN-NEXT: v_mov_b32_e32 v4, s2 +; GCN-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[0:1] +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN-NEXT: flat_store_dwordx4 v[10:11], v[4:7] +; GCN-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; GCN-NEXT: s_endpgm %shift = shl <2 x i128> %lhs, %rhs store <2 x i128> %shift, <2 x i128> addrspace(1)* null ret void @@ -516,68 +516,68 @@ define amdgpu_kernel void @s_lshr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) { ; GCN-LABEL: s_lshr_v2i128_ss: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x8 -; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 -; GCN-NEXT: v_mov_b32_e32 v10, 16 -; GCN-NEXT: v_mov_b32_e32 v8, 0 -; GCN-NEXT: v_mov_b32_e32 v11, 0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_sub_i32 s6, 64, s16 -; GCN-NEXT: v_cmp_lt_u64_e64 s[0:1], s[16:17], 64 -; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[18:19], 0 -; GCN-NEXT: s_sub_i32 s4, s16, 64 -; GCN-NEXT: s_lshl_b64 s[6:7], s[10:11], s6 -; GCN-NEXT: s_lshr_b64 s[24:25], s[8:9], s16 -; GCN-NEXT: s_or_b64 s[6:7], s[24:25], s[6:7] -; GCN-NEXT: s_and_b64 vcc, s[2:3], s[0:1] -; GCN-NEXT: s_or_b64 s[0:1], s[16:17], s[18:19] -; GCN-NEXT: s_lshr_b64 s[4:5], s[10:11], s4 -; GCN-NEXT: v_mov_b32_e32 v0, s5 -; GCN-NEXT: v_mov_b32_e32 v1, s7 -; GCN-NEXT: v_cmp_eq_u64_e64 s[0:1], s[0:1], 0 -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v1, s9 -; GCN-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: v_mov_b32_e32 v2, s6 -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GCN-NEXT: v_mov_b32_e32 v2, s8 -; GCN-NEXT: s_sub_i32 s6, 64, s20 -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GCN-NEXT: v_cmp_lt_u64_e64 s[0:1], s[20:21], 64 -; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[22:23], 0 -; GCN-NEXT: s_sub_i32 s4, s20, 64 -; GCN-NEXT: s_lshl_b64 s[6:7], s[14:15], s6 -; GCN-NEXT: s_lshr_b64 s[8:9], s[12:13], s20 -; GCN-NEXT: s_lshr_b64 s[4:5], s[14:15], s4 -; GCN-NEXT: s_or_b64 s[6:7], s[8:9], s[6:7] -; GCN-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1] -; GCN-NEXT: s_or_b64 s[2:3], s[20:21], s[22:23] -; GCN-NEXT: v_mov_b32_e32 v2, s5 -; GCN-NEXT: v_mov_b32_e32 v3, s7 -; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[2:3], 0 -; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v3, s13 -; GCN-NEXT: v_cndmask_b32_e64 v5, v2, v3, s[2:3] -; GCN-NEXT: v_mov_b32_e32 v2, s4 -; GCN-NEXT: v_mov_b32_e32 v3, s6 -; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v3, s12 -; GCN-NEXT: v_cndmask_b32_e64 v4, v2, v3, s[2:3] -; GCN-NEXT: s_lshr_b64 s[2:3], s[10:11], s16 -; GCN-NEXT: v_mov_b32_e32 v2, s3 -; GCN-NEXT: v_cndmask_b32_e32 v3, 0, v2, vcc -; GCN-NEXT: v_mov_b32_e32 v2, s2 -; GCN-NEXT: s_lshr_b64 s[2:3], s[14:15], s20 -; GCN-NEXT: v_mov_b32_e32 v6, s3 -; GCN-NEXT: v_cndmask_b32_e64 v7, 0, v6, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v6, s2 -; GCN-NEXT: v_cndmask_b32_e64 v6, 0, v6, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v9, 0 -; GCN-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; GCN-NEXT: flat_store_dwordx4 v[10:11], v[4:7] -; GCN-NEXT: flat_store_dwordx4 v[8:9], v[0:3] -; GCN-NEXT: s_endpgm +; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 +; GCN-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x8 +; GCN-NEXT: v_mov_b32_e32 v10, 16 +; GCN-NEXT: v_mov_b32_e32 v8, 0 +; GCN-NEXT: v_mov_b32_e32 v11, 0 +; GCN-NEXT: v_mov_b32_e32 v9, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_sub_i32 s6, 64, s16 +; GCN-NEXT: v_cmp_lt_u64_e64 s[0:1], s[16:17], 64 +; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[18:19], 0 +; GCN-NEXT: s_sub_i32 s4, s16, 64 +; GCN-NEXT: s_lshl_b64 s[6:7], s[10:11], s6 +; GCN-NEXT: s_lshr_b64 s[24:25], s[8:9], s16 +; GCN-NEXT: s_or_b64 s[6:7], s[24:25], s[6:7] +; GCN-NEXT: s_and_b64 vcc, s[2:3], s[0:1] +; GCN-NEXT: s_or_b64 s[0:1], s[16:17], s[18:19] +; GCN-NEXT: s_lshr_b64 s[4:5], s[10:11], s4 +; GCN-NEXT: v_mov_b32_e32 v0, s5 +; GCN-NEXT: v_mov_b32_e32 v1, s7 +; GCN-NEXT: v_cmp_eq_u64_e64 s[0:1], s[0:1], 0 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v1, s9 +; GCN-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[0:1] +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v2, s6 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GCN-NEXT: v_mov_b32_e32 v2, s8 +; GCN-NEXT: s_sub_i32 s6, 64, s20 +; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] +; GCN-NEXT: v_cmp_lt_u64_e64 s[0:1], s[20:21], 64 +; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[22:23], 0 +; GCN-NEXT: s_sub_i32 s4, s20, 64 +; GCN-NEXT: s_lshl_b64 s[6:7], s[14:15], s6 +; GCN-NEXT: s_lshr_b64 s[8:9], s[12:13], s20 +; GCN-NEXT: s_lshr_b64 s[4:5], s[14:15], s4 +; GCN-NEXT: s_or_b64 s[6:7], s[8:9], s[6:7] +; GCN-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1] +; GCN-NEXT: s_or_b64 s[2:3], s[20:21], s[22:23] +; GCN-NEXT: v_mov_b32_e32 v2, s5 +; GCN-NEXT: v_mov_b32_e32 v3, s7 +; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[2:3], 0 +; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] +; GCN-NEXT: v_mov_b32_e32 v3, s13 +; GCN-NEXT: v_cndmask_b32_e64 v5, v2, v3, s[2:3] +; GCN-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NEXT: v_mov_b32_e32 v3, s6 +; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] +; GCN-NEXT: v_mov_b32_e32 v3, s12 +; GCN-NEXT: v_cndmask_b32_e64 v4, v2, v3, s[2:3] +; GCN-NEXT: s_lshr_b64 s[2:3], s[10:11], s16 +; GCN-NEXT: v_mov_b32_e32 v2, s3 +; GCN-NEXT: v_cndmask_b32_e32 v3, 0, v2, vcc +; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: s_lshr_b64 s[2:3], s[14:15], s20 +; GCN-NEXT: v_mov_b32_e32 v6, s3 +; GCN-NEXT: v_cndmask_b32_e64 v7, 0, v6, s[0:1] +; GCN-NEXT: v_mov_b32_e32 v6, s2 +; GCN-NEXT: v_cndmask_b32_e64 v6, 0, v6, s[0:1] +; GCN-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GCN-NEXT: flat_store_dwordx4 v[10:11], v[4:7] +; GCN-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; GCN-NEXT: s_endpgm %shift = lshr <2 x i128> %lhs, %rhs store <2 x i128> %shift, <2 x i128> addrspace(1)* null ret void @@ -586,72 +586,72 @@ define amdgpu_kernel void @s_ashr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) { ; GCN-LABEL: s_ashr_v2i128_ss: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x8 -; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 -; GCN-NEXT: v_mov_b32_e32 v8, 0 -; GCN-NEXT: v_mov_b32_e32 v9, 0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_sub_i32 s6, 64, s16 -; GCN-NEXT: v_cmp_lt_u64_e64 s[0:1], s[16:17], 64 -; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[18:19], 0 -; GCN-NEXT: s_sub_i32 s4, s16, 64 -; GCN-NEXT: s_lshl_b64 s[6:7], s[10:11], s6 -; GCN-NEXT: s_lshr_b64 s[24:25], s[8:9], s16 -; GCN-NEXT: s_or_b64 s[6:7], s[24:25], s[6:7] -; GCN-NEXT: s_and_b64 vcc, s[2:3], s[0:1] -; GCN-NEXT: s_or_b64 s[0:1], s[16:17], s[18:19] -; GCN-NEXT: s_ashr_i64 s[4:5], s[10:11], s4 -; GCN-NEXT: v_mov_b32_e32 v0, s5 -; GCN-NEXT: v_mov_b32_e32 v1, s7 -; GCN-NEXT: v_cmp_eq_u64_e64 s[0:1], s[0:1], 0 -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v1, s9 -; GCN-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: v_mov_b32_e32 v2, s6 -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GCN-NEXT: v_mov_b32_e32 v2, s8 -; GCN-NEXT: s_sub_i32 s6, 64, s20 -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GCN-NEXT: v_cmp_lt_u64_e64 s[0:1], s[20:21], 64 -; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[22:23], 0 -; GCN-NEXT: s_sub_i32 s4, s20, 64 -; GCN-NEXT: s_lshl_b64 s[6:7], s[14:15], s6 -; GCN-NEXT: s_lshr_b64 s[8:9], s[12:13], s20 -; GCN-NEXT: s_ashr_i64 s[4:5], s[14:15], s4 -; GCN-NEXT: s_or_b64 s[6:7], s[8:9], s[6:7] -; GCN-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1] -; GCN-NEXT: s_or_b64 s[2:3], s[20:21], s[22:23] -; GCN-NEXT: v_mov_b32_e32 v2, s5 -; GCN-NEXT: v_mov_b32_e32 v3, s7 -; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[2:3], 0 -; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v3, s13 -; GCN-NEXT: v_cndmask_b32_e64 v5, v2, v3, s[2:3] -; GCN-NEXT: v_mov_b32_e32 v2, s4 -; GCN-NEXT: v_mov_b32_e32 v3, s6 -; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v3, s12 -; GCN-NEXT: v_cndmask_b32_e64 v4, v2, v3, s[2:3] -; GCN-NEXT: s_ashr_i64 s[2:3], s[10:11], s16 -; GCN-NEXT: s_ashr_i32 s4, s11, 31 -; GCN-NEXT: v_mov_b32_e32 v2, s4 -; GCN-NEXT: v_mov_b32_e32 v3, s3 -; GCN-NEXT: v_mov_b32_e32 v6, s2 -; GCN-NEXT: s_ashr_i64 s[2:3], s[14:15], s20 -; GCN-NEXT: s_ashr_i32 s4, s15, 31 -; GCN-NEXT: v_cndmask_b32_e32 v3, v2, v3, vcc -; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc -; GCN-NEXT: v_mov_b32_e32 v6, s4 -; GCN-NEXT: v_mov_b32_e32 v7, s3 -; GCN-NEXT: v_mov_b32_e32 v10, s2 -; GCN-NEXT: v_cndmask_b32_e64 v7, v6, v7, s[0:1] -; GCN-NEXT: v_cndmask_b32_e64 v6, v6, v10, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v10, 16 -; GCN-NEXT: v_mov_b32_e32 v11, 0 -; GCN-NEXT: flat_store_dwordx4 v[10:11], v[4:7] -; GCN-NEXT: flat_store_dwordx4 v[8:9], v[0:3] -; GCN-NEXT: s_endpgm +; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 +; GCN-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x8 +; GCN-NEXT: v_mov_b32_e32 v8, 0 +; GCN-NEXT: v_mov_b32_e32 v9, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_sub_i32 s6, 64, s16 +; GCN-NEXT: v_cmp_lt_u64_e64 s[0:1], s[16:17], 64 +; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[18:19], 0 +; GCN-NEXT: s_sub_i32 s4, s16, 64 +; GCN-NEXT: s_lshl_b64 s[6:7], s[10:11], s6 +; GCN-NEXT: s_lshr_b64 s[24:25], s[8:9], s16 +; GCN-NEXT: s_or_b64 s[6:7], s[24:25], s[6:7] +; GCN-NEXT: s_and_b64 vcc, s[2:3], s[0:1] +; GCN-NEXT: s_or_b64 s[0:1], s[16:17], s[18:19] +; GCN-NEXT: s_ashr_i64 s[4:5], s[10:11], s4 +; GCN-NEXT: v_mov_b32_e32 v0, s5 +; GCN-NEXT: v_mov_b32_e32 v1, s7 +; GCN-NEXT: v_cmp_eq_u64_e64 s[0:1], s[0:1], 0 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v1, s9 +; GCN-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[0:1] +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v2, s6 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GCN-NEXT: v_mov_b32_e32 v2, s8 +; GCN-NEXT: s_sub_i32 s6, 64, s20 +; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] +; GCN-NEXT: v_cmp_lt_u64_e64 s[0:1], s[20:21], 64 +; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[22:23], 0 +; GCN-NEXT: s_sub_i32 s4, s20, 64 +; GCN-NEXT: s_lshl_b64 s[6:7], s[14:15], s6 +; GCN-NEXT: s_lshr_b64 s[8:9], s[12:13], s20 +; GCN-NEXT: s_ashr_i64 s[4:5], s[14:15], s4 +; GCN-NEXT: s_or_b64 s[6:7], s[8:9], s[6:7] +; GCN-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1] +; GCN-NEXT: s_or_b64 s[2:3], s[20:21], s[22:23] +; GCN-NEXT: v_mov_b32_e32 v2, s5 +; GCN-NEXT: v_mov_b32_e32 v3, s7 +; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[2:3], 0 +; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] +; GCN-NEXT: v_mov_b32_e32 v3, s13 +; GCN-NEXT: v_cndmask_b32_e64 v5, v2, v3, s[2:3] +; GCN-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NEXT: v_mov_b32_e32 v3, s6 +; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] +; GCN-NEXT: v_mov_b32_e32 v3, s12 +; GCN-NEXT: v_cndmask_b32_e64 v4, v2, v3, s[2:3] +; GCN-NEXT: s_ashr_i64 s[2:3], s[10:11], s16 +; GCN-NEXT: s_ashr_i32 s4, s11, 31 +; GCN-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NEXT: v_mov_b32_e32 v3, s3 +; GCN-NEXT: v_mov_b32_e32 v6, s2 +; GCN-NEXT: s_ashr_i64 s[2:3], s[14:15], s20 +; GCN-NEXT: s_ashr_i32 s4, s15, 31 +; GCN-NEXT: v_cndmask_b32_e32 v3, v2, v3, vcc +; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc +; GCN-NEXT: v_mov_b32_e32 v6, s4 +; GCN-NEXT: v_mov_b32_e32 v7, s3 +; GCN-NEXT: v_mov_b32_e32 v10, s2 +; GCN-NEXT: v_cndmask_b32_e64 v7, v6, v7, s[0:1] +; GCN-NEXT: v_cndmask_b32_e64 v6, v6, v10, s[0:1] +; GCN-NEXT: v_mov_b32_e32 v10, 16 +; GCN-NEXT: v_mov_b32_e32 v11, 0 +; GCN-NEXT: flat_store_dwordx4 v[10:11], v[4:7] +; GCN-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; GCN-NEXT: s_endpgm %shift = ashr <2 x i128> %lhs, %rhs store <2 x i128> %shift, <2 x i128> addrspace(1)* null ret void diff --git a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll --- a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll +++ b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll @@ -6,14 +6,14 @@ define void @local_store_i56(i56 addrspace(3)* %ptr, i56 %arg) #0 { ; CIVI-LABEL: local_store_i56: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CIVI-NEXT: s_mov_b32 m0, -1 -; CIVI-NEXT: ds_write_b16 v0, v2 offset:4 -; CIVI-NEXT: ds_write_b32 v0, v1 -; CIVI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; CIVI-NEXT: ds_write_b8 v0, v1 offset:6 -; CIVI-NEXT: s_waitcnt lgkmcnt(0) -; CIVI-NEXT: s_setpc_b64 s[30:31] +; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIVI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; CIVI-NEXT: s_mov_b32 m0, -1 +; CIVI-NEXT: ds_write_b8 v0, v3 offset:6 +; CIVI-NEXT: ds_write_b16 v0, v2 offset:4 +; CIVI-NEXT: ds_write_b32 v0, v1 +; CIVI-NEXT: s_waitcnt lgkmcnt(0) +; CIVI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: local_store_i56: ; GFX9: ; %bb.0: @@ -30,72 +30,72 @@ define amdgpu_kernel void @local_store_i55(i55 addrspace(3)* %ptr, i55 %arg) #0 { ; HAWAII-LABEL: local_store_i55: ; HAWAII: ; %bb.0: -; HAWAII-NEXT: s_add_u32 s0, s4, 14 -; HAWAII-NEXT: s_addc_u32 s1, s5, 0 -; HAWAII-NEXT: v_mov_b32_e32 v0, s0 -; HAWAII-NEXT: v_mov_b32_e32 v1, s1 -; HAWAII-NEXT: flat_load_ubyte v0, v[0:1] -; HAWAII-NEXT: s_load_dword s0, s[4:5], 0x0 -; HAWAII-NEXT: s_load_dword s1, s[4:5], 0x2 -; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x3 -; HAWAII-NEXT: s_mov_b32 m0, -1 -; HAWAII-NEXT: s_waitcnt lgkmcnt(0) -; HAWAII-NEXT: v_mov_b32_e32 v1, s0 -; HAWAII-NEXT: v_mov_b32_e32 v3, s1 -; HAWAII-NEXT: v_mov_b32_e32 v2, s2 -; HAWAII-NEXT: ds_write_b16 v1, v2 offset:4 -; HAWAII-NEXT: s_waitcnt vmcnt(0) -; HAWAII-NEXT: v_and_b32_e32 v0, 0x7f, v0 -; HAWAII-NEXT: ds_write_b8 v1, v0 offset:6 -; HAWAII-NEXT: ds_write_b32 v1, v3 -; HAWAII-NEXT: s_endpgm +; HAWAII-NEXT: s_add_u32 s0, s4, 14 +; HAWAII-NEXT: s_addc_u32 s1, s5, 0 +; HAWAII-NEXT: v_mov_b32_e32 v0, s0 +; HAWAII-NEXT: v_mov_b32_e32 v1, s1 +; HAWAII-NEXT: flat_load_ubyte v0, v[0:1] +; HAWAII-NEXT: s_load_dword s0, s[4:5], 0x0 +; HAWAII-NEXT: s_load_dword s1, s[4:5], 0x2 +; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x3 +; HAWAII-NEXT: s_mov_b32 m0, -1 +; HAWAII-NEXT: s_waitcnt lgkmcnt(0) +; HAWAII-NEXT: v_mov_b32_e32 v1, s0 +; HAWAII-NEXT: v_mov_b32_e32 v2, s1 +; HAWAII-NEXT: v_mov_b32_e32 v3, s2 +; HAWAII-NEXT: s_waitcnt vmcnt(0) +; HAWAII-NEXT: v_and_b32_e32 v0, 0x7f, v0 +; HAWAII-NEXT: ds_write_b8 v1, v0 offset:6 +; HAWAII-NEXT: ds_write_b16 v1, v3 offset:4 +; HAWAII-NEXT: ds_write_b32 v1, v2 +; HAWAII-NEXT: s_endpgm ; ; FIJI-LABEL: local_store_i55: ; FIJI: ; %bb.0: -; FIJI-NEXT: s_load_dword s0, s[4:5], 0x0 -; FIJI-NEXT: s_load_dword s2, s[4:5], 0x8 -; FIJI-NEXT: s_load_dword s1, s[4:5], 0xc -; FIJI-NEXT: s_mov_b32 m0, -1 -; FIJI-NEXT: s_waitcnt lgkmcnt(0) -; FIJI-NEXT: v_mov_b32_e32 v2, s0 -; FIJI-NEXT: s_and_b32 s3, s1, 0xffff -; FIJI-NEXT: s_add_u32 s0, s4, 14 -; FIJI-NEXT: v_mov_b32_e32 v3, s1 -; FIJI-NEXT: s_addc_u32 s1, s5, 0 -; FIJI-NEXT: v_mov_b32_e32 v0, s0 -; FIJI-NEXT: v_mov_b32_e32 v1, s1 -; FIJI-NEXT: flat_load_ubyte v0, v[0:1] -; FIJI-NEXT: ds_write_b16 v2, v3 offset:4 -; FIJI-NEXT: v_mov_b32_e32 v3, s2 -; FIJI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1) -; FIJI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; FIJI-NEXT: v_or_b32_e32 v0, s3, v0 -; FIJI-NEXT: v_bfe_u32 v0, v0, 16, 7 -; FIJI-NEXT: ds_write_b8 v2, v0 offset:6 -; FIJI-NEXT: ds_write_b32 v2, v3 -; FIJI-NEXT: s_endpgm +; FIJI-NEXT: s_load_dword s2, s[4:5], 0x0 +; FIJI-NEXT: s_load_dword s3, s[4:5], 0x8 +; FIJI-NEXT: s_load_dword s6, s[4:5], 0xc +; FIJI-NEXT: s_mov_b32 m0, -1 +; FIJI-NEXT: s_waitcnt lgkmcnt(0) +; FIJI-NEXT: v_mov_b32_e32 v3, s3 +; FIJI-NEXT: s_and_b32 s7, s6, 0xffff +; FIJI-NEXT: s_add_u32 s0, s4, 14 +; FIJI-NEXT: s_addc_u32 s1, s5, 0 +; FIJI-NEXT: v_mov_b32_e32 v0, s0 +; FIJI-NEXT: v_mov_b32_e32 v1, s1 +; FIJI-NEXT: flat_load_ubyte v0, v[0:1] +; FIJI-NEXT: v_mov_b32_e32 v1, s2 +; FIJI-NEXT: v_mov_b32_e32 v2, s6 +; FIJI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; FIJI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; FIJI-NEXT: v_or_b32_e32 v0, s7, v0 +; FIJI-NEXT: v_bfe_u32 v0, v0, 16, 7 +; FIJI-NEXT: ds_write_b8 v1, v0 offset:6 +; FIJI-NEXT: ds_write_b16 v1, v2 offset:4 +; FIJI-NEXT: ds_write_b32 v1, v3 +; FIJI-NEXT: s_endpgm ; ; GFX9-LABEL: local_store_i55: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: global_load_ubyte_d16_hi v2, v[0:1], off offset:14 -; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX9-NEXT: s_load_dword s2, s[4:5], 0xc -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: s_and_b32 s3, s2, 0xffff -; GFX9-NEXT: ds_write_b16 v0, v1 offset:4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_e32 v1, s3, v2 -; GFX9-NEXT: v_and_b32_e32 v1, 0x7fffff, v1 -; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:6 -; GFX9-NEXT: ds_write_b32 v0, v3 -; GFX9-NEXT: s_endpgm +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_load_ubyte_d16_hi v2, v[0:1], off offset:14 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s1, s[4:5], 0x8 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: s_and_b32 s3, s2, 0xffff +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_e32 v2, s3, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 0x7fffff, v2 +; GFX9-NEXT: ds_write_b8_d16_hi v0, v2 offset:6 +; GFX9-NEXT: ds_write_b16 v0, v1 offset:4 +; GFX9-NEXT: ds_write_b32 v0, v3 +; GFX9-NEXT: s_endpgm store i55 %arg, i55 addrspace(3)* %ptr, align 8 ret void } @@ -103,31 +103,31 @@ define amdgpu_kernel void @local_store_i48(i48 addrspace(3)* %ptr, i48 %arg) #0 { ; HAWAII-LABEL: local_store_i48: ; HAWAII: ; %bb.0: -; HAWAII-NEXT: s_load_dword s0, s[4:5], 0x0 -; HAWAII-NEXT: s_load_dword s1, s[4:5], 0x2 -; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x3 -; HAWAII-NEXT: s_mov_b32 m0, -1 -; HAWAII-NEXT: s_waitcnt lgkmcnt(0) -; HAWAII-NEXT: v_mov_b32_e32 v0, s0 -; HAWAII-NEXT: v_mov_b32_e32 v1, s2 -; HAWAII-NEXT: ds_write_b16 v0, v1 offset:4 -; HAWAII-NEXT: v_mov_b32_e32 v1, s1 -; HAWAII-NEXT: ds_write_b32 v0, v1 -; HAWAII-NEXT: s_endpgm +; HAWAII-NEXT: s_load_dword s0, s[4:5], 0x0 +; HAWAII-NEXT: s_load_dword s1, s[4:5], 0x2 +; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x3 +; HAWAII-NEXT: s_mov_b32 m0, -1 +; HAWAII-NEXT: s_waitcnt lgkmcnt(0) +; HAWAII-NEXT: v_mov_b32_e32 v0, s0 +; HAWAII-NEXT: v_mov_b32_e32 v2, s1 +; HAWAII-NEXT: v_mov_b32_e32 v1, s2 +; HAWAII-NEXT: ds_write_b16 v0, v1 offset:4 +; HAWAII-NEXT: ds_write_b32 v0, v2 +; HAWAII-NEXT: s_endpgm ; ; FIJI-LABEL: local_store_i48: ; FIJI: ; %bb.0: -; FIJI-NEXT: s_load_dword s0, s[4:5], 0x0 -; FIJI-NEXT: s_load_dword s1, s[4:5], 0x8 -; FIJI-NEXT: s_load_dword s2, s[4:5], 0xc -; FIJI-NEXT: s_mov_b32 m0, -1 -; FIJI-NEXT: s_waitcnt lgkmcnt(0) -; FIJI-NEXT: v_mov_b32_e32 v0, s0 -; FIJI-NEXT: v_mov_b32_e32 v1, s2 -; FIJI-NEXT: ds_write_b16 v0, v1 offset:4 -; FIJI-NEXT: v_mov_b32_e32 v1, s1 -; FIJI-NEXT: ds_write_b32 v0, v1 -; FIJI-NEXT: s_endpgm +; FIJI-NEXT: s_load_dword s0, s[4:5], 0x0 +; FIJI-NEXT: s_load_dword s1, s[4:5], 0x8 +; FIJI-NEXT: s_load_dword s2, s[4:5], 0xc +; FIJI-NEXT: s_mov_b32 m0, -1 +; FIJI-NEXT: s_waitcnt lgkmcnt(0) +; FIJI-NEXT: v_mov_b32_e32 v0, s0 +; FIJI-NEXT: v_mov_b32_e32 v2, s1 +; FIJI-NEXT: v_mov_b32_e32 v1, s2 +; FIJI-NEXT: ds_write_b16 v0, v1 offset:4 +; FIJI-NEXT: ds_write_b32 v0, v2 +; FIJI-NEXT: s_endpgm ; ; GFX9-LABEL: local_store_i48: ; GFX9: ; %bb.0: @@ -148,35 +148,35 @@ define amdgpu_kernel void @local_store_i65(i65 addrspace(3)* %ptr, i65 %arg) #0 { ; HAWAII-LABEL: local_store_i65: ; HAWAII: ; %bb.0: -; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x0 -; HAWAII-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 -; HAWAII-NEXT: s_load_dword s3, s[4:5], 0x4 -; HAWAII-NEXT: s_mov_b32 m0, -1 -; HAWAII-NEXT: s_waitcnt lgkmcnt(0) -; HAWAII-NEXT: v_mov_b32_e32 v2, s2 -; HAWAII-NEXT: s_and_b32 s3, s3, 1 -; HAWAII-NEXT: v_mov_b32_e32 v0, s3 -; HAWAII-NEXT: ds_write_b8 v2, v0 offset:8 -; HAWAII-NEXT: v_mov_b32_e32 v0, s0 -; HAWAII-NEXT: v_mov_b32_e32 v1, s1 -; HAWAII-NEXT: ds_write_b64 v2, v[0:1] -; HAWAII-NEXT: s_endpgm +; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x0 +; HAWAII-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; HAWAII-NEXT: s_load_dword s3, s[4:5], 0x4 +; HAWAII-NEXT: s_mov_b32 m0, -1 +; HAWAII-NEXT: s_waitcnt lgkmcnt(0) +; HAWAII-NEXT: v_mov_b32_e32 v2, s2 +; HAWAII-NEXT: v_mov_b32_e32 v0, s0 +; HAWAII-NEXT: s_and_b32 s3, s3, 1 +; HAWAII-NEXT: v_mov_b32_e32 v3, s3 +; HAWAII-NEXT: v_mov_b32_e32 v1, s1 +; HAWAII-NEXT: ds_write_b8 v2, v3 offset:8 +; HAWAII-NEXT: ds_write_b64 v2, v[0:1] +; HAWAII-NEXT: s_endpgm ; ; FIJI-LABEL: local_store_i65: ; FIJI: ; %bb.0: -; FIJI-NEXT: s_load_dword s2, s[4:5], 0x0 -; FIJI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; FIJI-NEXT: s_load_dword s3, s[4:5], 0x10 -; FIJI-NEXT: s_mov_b32 m0, -1 -; FIJI-NEXT: s_waitcnt lgkmcnt(0) -; FIJI-NEXT: v_mov_b32_e32 v2, s2 -; FIJI-NEXT: s_and_b32 s3, s3, 1 -; FIJI-NEXT: v_mov_b32_e32 v0, s3 -; FIJI-NEXT: ds_write_b8 v2, v0 offset:8 -; FIJI-NEXT: v_mov_b32_e32 v0, s0 -; FIJI-NEXT: v_mov_b32_e32 v1, s1 -; FIJI-NEXT: ds_write_b64 v2, v[0:1] -; FIJI-NEXT: s_endpgm +; FIJI-NEXT: s_load_dword s2, s[4:5], 0x0 +; FIJI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; FIJI-NEXT: s_load_dword s3, s[4:5], 0x10 +; FIJI-NEXT: s_mov_b32 m0, -1 +; FIJI-NEXT: s_waitcnt lgkmcnt(0) +; FIJI-NEXT: v_mov_b32_e32 v2, s2 +; FIJI-NEXT: v_mov_b32_e32 v0, s0 +; FIJI-NEXT: s_and_b32 s3, s3, 1 +; FIJI-NEXT: v_mov_b32_e32 v3, s3 +; FIJI-NEXT: v_mov_b32_e32 v1, s1 +; FIJI-NEXT: ds_write_b8 v2, v3 offset:8 +; FIJI-NEXT: ds_write_b64 v2, v[0:1] +; FIJI-NEXT: s_endpgm ; ; GFX9-LABEL: local_store_i65: ; GFX9: ; %bb.0: @@ -220,22 +220,22 @@ define void @local_store_i17(i17 addrspace(3)* %ptr, i17 %arg) #0 { ; CIVI-LABEL: local_store_i17: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CIVI-NEXT: s_mov_b32 m0, -1 -; CIVI-NEXT: ds_write_b16 v0, v1 -; CIVI-NEXT: v_bfe_u32 v1, v1, 16, 1 -; CIVI-NEXT: ds_write_b8 v0, v1 offset:2 -; CIVI-NEXT: s_waitcnt lgkmcnt(0) -; CIVI-NEXT: s_setpc_b64 s[30:31] +; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIVI-NEXT: s_mov_b32 m0, -1 +; CIVI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; CIVI-NEXT: ds_write_b16 v0, v1 +; CIVI-NEXT: ds_write_b8 v0, v2 offset:2 +; CIVI-NEXT: s_waitcnt lgkmcnt(0) +; CIVI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: local_store_i17: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ds_write_b16 v0, v1 -; GFX9-NEXT: v_and_b32_e32 v1, 0x1ffff, v1 -; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:2 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v2, 0x1ffff, v1 +; GFX9-NEXT: ds_write_b16 v0, v1 +; GFX9-NEXT: ds_write_b8_d16_hi v0, v2 offset:2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] store i17 %arg, i17 addrspace(3)* %ptr, align 8 ret void } diff --git a/llvm/test/CodeGen/AMDGPU/trunc-store-i64.ll b/llvm/test/CodeGen/AMDGPU/trunc-store-i64.ll --- a/llvm/test/CodeGen/AMDGPU/trunc-store-i64.ll +++ b/llvm/test/CodeGen/AMDGPU/trunc-store-i64.ll @@ -38,10 +38,10 @@ } ; GCN-LABEL: {{^}}trunc_store_v16i64_v16i32: -; GCN: global_store_dwordx4 v{{\[[0-9]:[0-9]+\]}}, v{{\[[0-9]:[0-9]+\]}}, off offset:48 -; GCN: global_store_dwordx4 v{{\[[0-9]:[0-9]+\]}}, v{{\[[0-9]:[0-9]+\]}}, off offset:32 -; GCN: global_store_dwordx4 v{{\[[0-9]:[0-9]+\]}}, v{{\[[0-9]:[0-9]+\]}}, off offset:16 -; GCN: global_store_dwordx4 v{{\[[0-9]:[0-9]+\]}}, v{{\[[0-9]:[0-9]+\]}}, off +; GCN: global_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:48 +; GCN: global_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:32 +; GCN: global_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:16 +; GCN: global_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off define amdgpu_kernel void @trunc_store_v16i64_v16i32(< 16 x i32> addrspace(1)* %out, <16 x i64> %in) { entry: %trunc = trunc <16 x i64> %in to <16 x i32>