Index: llvm/lib/Target/AMDGPU/AMDGPUInstructions.td =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUInstructions.td +++ llvm/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -485,27 +485,14 @@ defm atomic_load_fadd : ret_noret_binary_atomic_op; defm AMDGPUatomic_cmp_swap : ret_noret_binary_atomic_op; - -def load_align8_local : PatFrag <(ops node:$ptr), (load_local node:$ptr)> { - let IsLoad = 1; - let IsNonExtLoad = 1; - let MinAlignment = 8; -} - -def load_align16_local : PatFrag <(ops node:$ptr), (load_local node:$ptr)> { +def load_align4_local : PatFrag <(ops node:$ptr), (load_local node:$ptr)> { let IsLoad = 1; let IsNonExtLoad = 1; - let MinAlignment = 16; -} - -def store_align8_local: PatFrag<(ops node:$val, node:$ptr), - (store_local node:$val, node:$ptr)>, Aligned<8> { - let IsStore = 1; - let IsTruncStore = 0; + let MinAlignment = 4; } -def store_align16_local: PatFrag<(ops node:$val, node:$ptr), - (store_local node:$val, node:$ptr)>, Aligned<16> { +def store_align4_local: PatFrag<(ops node:$val, node:$ptr), + (store_local node:$val, node:$ptr)>, Aligned<4> { let IsStore = 1; let IsTruncStore = 0; } Index: llvm/lib/Target/AMDGPU/DSInstructions.td =================================================================== --- llvm/lib/Target/AMDGPU/DSInstructions.td +++ llvm/lib/Target/AMDGPU/DSInstructions.td @@ -677,10 +677,16 @@ let AddedComplexity = 100 in { foreach vt = VReg_64.RegTypes in { -defm : DSReadPat_mc ; +defm : DSReadPat_mc ; } -defm : DSReadPat_mc ; +foreach vt = VReg_96.RegTypes in { +defm : DSReadPat_mc ; +} + +foreach vt = VReg_128.RegTypes in { +defm : DSReadPat_mc ; +} } // End AddedComplexity = 100 @@ -782,10 +788,16 @@ let AddedComplexity = 100 in { foreach vt = VReg_64.RegTypes in { -defm : DSWritePat_mc ; +defm : DSWritePat_mc ; } -defm : DSWritePat_mc ; +foreach vt = VReg_96.RegTypes in { +defm : DSWritePat_mc ; +} + +foreach vt = VReg_128.RegTypes in { +defm : DSWritePat_mc ; +} } // End 
AddedComplexity = 100 class DSAtomicRetPat : GCNPat < Index: llvm/lib/Target/AMDGPU/SIInstrInfo.td =================================================================== --- llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -449,17 +449,11 @@ def zextloadi16_local_m0 : PatFrag<(ops node:$ptr), (zextloadi16_glue node:$ptr)>; } -def load_align8_local_m0 : PatFrag<(ops node:$ptr), +def load_align4_local_m0 : PatFrag<(ops node:$ptr), (load_local_m0 node:$ptr)> { let IsLoad = 1; let IsNonExtLoad = 1; - let MinAlignment = 8; -} -def load_align16_local_m0 : PatFrag<(ops node:$ptr), - (load_local_m0 node:$ptr)> { - let IsLoad = 1; - let IsNonExtLoad = 1; - let MinAlignment = 16; + let MinAlignment = 4; } } // End IsLoad = 1 @@ -535,20 +529,12 @@ } } -def store_align16_local_m0 : PatFrag < - (ops node:$value, node:$ptr), - (store_local_m0 node:$value, node:$ptr)> { - let IsStore = 1; - let IsTruncStore = 0; - let MinAlignment = 16; -} - -def store_align8_local_m0 : PatFrag < +def store_align4_local_m0 : PatFrag < (ops node:$value, node:$ptr), (store_local_m0 node:$value, node:$ptr)> { let IsStore = 1; let IsTruncStore = 0; - let MinAlignment = 8; + let MinAlignment = 4; } let AddressSpaces = StoreAddress_local.AddrSpaces in { Index: llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-local-128.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-local-128.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-local-128.mir @@ -61,10 +61,10 @@ ; GFX7: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>) ; GFX7-DS128-LABEL: name: load_local_v4s32_align_4 ; GFX7-DS128: liveins: $vgpr0 - ; GFX7-DS128: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0 + ; GFX7-DS128: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX7-DS128: $m0 = S_MOV_B32 -1 - ; GFX7-DS128: [[LOAD:%[0-9]+]]:vreg_128(<4 x s32>) = G_LOAD [[COPY]](p3) :: (load 16, align 4, addrspace 3) - ; GFX7-DS128: 
$vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>) + ; GFX7-DS128: [[DS_READ_B128_:%[0-9]+]]:vreg_128 = DS_READ_B128 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 16, align 4, addrspace 3) + ; GFX7-DS128: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[DS_READ_B128_]] ; GFX9-LABEL: name: load_local_v4s32_align_4 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0 @@ -99,10 +99,10 @@ ; GFX7: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x s64>) ; GFX7-DS128-LABEL: name: load_local_v2s64 ; GFX7-DS128: liveins: $vgpr0 - ; GFX7-DS128: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0 + ; GFX7-DS128: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX7-DS128: $m0 = S_MOV_B32 -1 - ; GFX7-DS128: [[LOAD:%[0-9]+]]:vreg_128(<2 x s64>) = G_LOAD [[COPY]](p3) :: (load 16, align 4, addrspace 3) - ; GFX7-DS128: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x s64>) + ; GFX7-DS128: [[DS_READ_B128_:%[0-9]+]]:vreg_128 = DS_READ_B128 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 16, align 4, addrspace 3) + ; GFX7-DS128: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[DS_READ_B128_]] ; GFX9-LABEL: name: load_local_v2s64 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0 @@ -175,10 +175,10 @@ ; GFX7: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](s128) ; GFX7-DS128-LABEL: name: load_local_s128 ; GFX7-DS128: liveins: $vgpr0 - ; GFX7-DS128: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0 + ; GFX7-DS128: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX7-DS128: $m0 = S_MOV_B32 -1 - ; GFX7-DS128: [[LOAD:%[0-9]+]]:vreg_128(s128) = G_LOAD [[COPY]](p3) :: (load 16, align 4, addrspace 3) - ; GFX7-DS128: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](s128) + ; GFX7-DS128: [[DS_READ_B128_:%[0-9]+]]:vreg_128 = DS_READ_B128 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 16, align 4, addrspace 3) + ; GFX7-DS128: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[DS_READ_B128_]] ; GFX9-LABEL: name: load_local_s128 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0 Index: 
llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-local.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-local.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-local.mir @@ -159,19 +159,19 @@ ; GFX7: liveins: $vgpr0 ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX7: $m0 = S_MOV_B32 -1 - ; GFX7: [[DS_READ2_B32_:%[0-9]+]]:vreg_64 = DS_READ2_B32 [[COPY]], 0, 1, 0, implicit $m0, implicit $exec :: (load 8, align 4, addrspace 3) - ; GFX7: $vgpr0_vgpr1 = COPY [[DS_READ2_B32_]] + ; GFX7: [[DS_READ_B64_:%[0-9]+]]:vreg_64 = DS_READ_B64 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 8, align 4, addrspace 3) + ; GFX7: $vgpr0_vgpr1 = COPY [[DS_READ_B64_]] ; GFX9-LABEL: name: load_local_v2s32_align4 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX9: [[DS_READ2_B32_gfx9_:%[0-9]+]]:vreg_64 = DS_READ2_B32_gfx9 [[COPY]], 0, 1, 0, implicit $exec :: (load 8, align 4, addrspace 3) - ; GFX9: $vgpr0_vgpr1 = COPY [[DS_READ2_B32_gfx9_]] + ; GFX9: [[DS_READ_B64_gfx9_:%[0-9]+]]:vreg_64 = DS_READ_B64_gfx9 [[COPY]], 0, 0, implicit $exec :: (load 8, align 4, addrspace 3) + ; GFX9: $vgpr0_vgpr1 = COPY [[DS_READ_B64_gfx9_]] ; GFX6-LABEL: name: load_local_v2s32_align4 ; GFX6: liveins: $vgpr0 - ; GFX6: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0 + ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: $m0 = S_MOV_B32 -1 - ; GFX6: [[LOAD:%[0-9]+]]:vreg_64(<2 x s32>) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3) - ; GFX6: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>) + ; GFX6: [[DS_READ_B64_:%[0-9]+]]:vreg_64 = DS_READ_B64 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 8, align 4, addrspace 3) + ; GFX6: $vgpr0_vgpr1 = COPY [[DS_READ_B64_]] %0:vgpr(p3) = COPY $vgpr0 %1:vgpr(<2 x s32>) = G_LOAD %0 :: (load 8, align 4, addrspace 3) $vgpr0_vgpr1 = COPY %1 @@ -227,19 +227,19 @@ ; GFX7: liveins: $vgpr0 ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX7: $m0 = 
S_MOV_B32 -1 - ; GFX7: [[DS_READ2_B32_:%[0-9]+]]:vreg_64 = DS_READ2_B32 [[COPY]], 0, 1, 0, implicit $m0, implicit $exec :: (load 8, align 4, addrspace 3) - ; GFX7: $vgpr0_vgpr1 = COPY [[DS_READ2_B32_]] + ; GFX7: [[DS_READ_B64_:%[0-9]+]]:vreg_64 = DS_READ_B64 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 8, align 4, addrspace 3) + ; GFX7: $vgpr0_vgpr1 = COPY [[DS_READ_B64_]] ; GFX9-LABEL: name: load_local_s64_align4 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX9: [[DS_READ2_B32_gfx9_:%[0-9]+]]:vreg_64 = DS_READ2_B32_gfx9 [[COPY]], 0, 1, 0, implicit $exec :: (load 8, align 4, addrspace 3) - ; GFX9: $vgpr0_vgpr1 = COPY [[DS_READ2_B32_gfx9_]] + ; GFX9: [[DS_READ_B64_gfx9_:%[0-9]+]]:vreg_64 = DS_READ_B64_gfx9 [[COPY]], 0, 0, implicit $exec :: (load 8, align 4, addrspace 3) + ; GFX9: $vgpr0_vgpr1 = COPY [[DS_READ_B64_gfx9_]] ; GFX6-LABEL: name: load_local_s64_align4 ; GFX6: liveins: $vgpr0 - ; GFX6: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0 + ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: $m0 = S_MOV_B32 -1 - ; GFX6: [[LOAD:%[0-9]+]]:vreg_64(s64) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3) - ; GFX6: $vgpr0_vgpr1 = COPY [[LOAD]](s64) + ; GFX6: [[DS_READ_B64_:%[0-9]+]]:vreg_64 = DS_READ_B64 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 8, align 4, addrspace 3) + ; GFX6: $vgpr0_vgpr1 = COPY [[DS_READ_B64_]] %0:vgpr(p3) = COPY $vgpr0 %1:vgpr(s64) = G_LOAD %0 :: (load 8, align 4, addrspace 3) $vgpr0_vgpr1 = COPY %1 @@ -363,19 +363,19 @@ ; GFX7: liveins: $vgpr0 ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX7: $m0 = S_MOV_B32 -1 - ; GFX7: [[DS_READ2_B32_:%[0-9]+]]:vreg_64 = DS_READ2_B32 [[COPY]], 0, 1, 0, implicit $m0, implicit $exec :: (load 8, align 4, addrspace 3) - ; GFX7: $vgpr0_vgpr1 = COPY [[DS_READ2_B32_]] + ; GFX7: [[DS_READ_B64_:%[0-9]+]]:vreg_64 = DS_READ_B64 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 8, align 4, addrspace 3) + ; GFX7: $vgpr0_vgpr1 = COPY [[DS_READ_B64_]] ; 
GFX9-LABEL: name: load_local_p1_align4 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX9: [[DS_READ2_B32_gfx9_:%[0-9]+]]:vreg_64 = DS_READ2_B32_gfx9 [[COPY]], 0, 1, 0, implicit $exec :: (load 8, align 4, addrspace 3) - ; GFX9: $vgpr0_vgpr1 = COPY [[DS_READ2_B32_gfx9_]] + ; GFX9: [[DS_READ_B64_gfx9_:%[0-9]+]]:vreg_64 = DS_READ_B64_gfx9 [[COPY]], 0, 0, implicit $exec :: (load 8, align 4, addrspace 3) + ; GFX9: $vgpr0_vgpr1 = COPY [[DS_READ_B64_gfx9_]] ; GFX6-LABEL: name: load_local_p1_align4 ; GFX6: liveins: $vgpr0 - ; GFX6: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0 + ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: $m0 = S_MOV_B32 -1 - ; GFX6: [[LOAD:%[0-9]+]]:vreg_64(p1) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3) - ; GFX6: $vgpr0_vgpr1 = COPY [[LOAD]](p1) + ; GFX6: [[DS_READ_B64_:%[0-9]+]]:vreg_64 = DS_READ_B64 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 8, align 4, addrspace 3) + ; GFX6: $vgpr0_vgpr1 = COPY [[DS_READ_B64_]] %0:vgpr(p3) = COPY $vgpr0 %1:vgpr(p1) = G_LOAD %0 :: (load 8, align 4, addrspace 3) $vgpr0_vgpr1 = COPY %1 @@ -724,21 +724,21 @@ ; GFX7: liveins: $vgpr0_vgpr1 ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX7: $m0 = S_MOV_B32 -1 - ; GFX7: [[DS_READ2_B32_:%[0-9]+]]:vreg_64 = DS_READ2_B32 [[COPY]], 254, 255, 0, implicit $m0, implicit $exec :: (load 8, align 4, addrspace 3) - ; GFX7: $vgpr0_vgpr1 = COPY [[DS_READ2_B32_]] + ; GFX7: [[DS_READ_B64_:%[0-9]+]]:vreg_64 = DS_READ_B64 [[COPY]], 1016, 0, implicit $m0, implicit $exec :: (load 8, align 4, addrspace 3) + ; GFX7: $vgpr0_vgpr1 = COPY [[DS_READ_B64_]] ; GFX9-LABEL: name: load_local_s64_align4_from_1_gep_1016 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX9: [[DS_READ2_B32_gfx9_:%[0-9]+]]:vreg_64 = DS_READ2_B32_gfx9 [[COPY]], 254, 255, 0, implicit $exec :: (load 8, align 4, addrspace 3) - ; GFX9: $vgpr0_vgpr1 = COPY [[DS_READ2_B32_gfx9_]] + ; GFX9: [[DS_READ_B64_gfx9_:%[0-9]+]]:vreg_64 = 
DS_READ_B64_gfx9 [[COPY]], 1016, 0, implicit $exec :: (load 8, align 4, addrspace 3) + ; GFX9: $vgpr0_vgpr1 = COPY [[DS_READ_B64_gfx9_]] ; GFX6-LABEL: name: load_local_s64_align4_from_1_gep_1016 ; GFX6: liveins: $vgpr0_vgpr1 - ; GFX6: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0 - ; GFX6: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1016 - ; GFX6: [[PTR_ADD:%[0-9]+]]:vgpr(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1016, implicit $exec + ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX6: $m0 = S_MOV_B32 -1 - ; GFX6: [[LOAD:%[0-9]+]]:vreg_64(s64) = G_LOAD [[PTR_ADD]](p3) :: (load 8, align 4, addrspace 3) - ; GFX6: $vgpr0_vgpr1 = COPY [[LOAD]](s64) + ; GFX6: [[DS_READ_B64_:%[0-9]+]]:vreg_64 = DS_READ_B64 %2, 0, 0, implicit $m0, implicit $exec :: (load 8, align 4, addrspace 3) + ; GFX6: $vgpr0_vgpr1 = COPY [[DS_READ_B64_]] %0:vgpr(p3) = COPY $vgpr0 %1:vgpr(s32) = G_CONSTANT i32 1016 %2:vgpr(p3) = G_PTR_ADD %0, %1 @@ -761,26 +761,22 @@ ; GFX7-LABEL: name: load_local_s64_align4_from_1_gep_1020 ; GFX7: liveins: $vgpr0_vgpr1 ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX7: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1020, implicit $exec - ; GFX7: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX7: $m0 = S_MOV_B32 -1 - ; GFX7: [[DS_READ2_B32_:%[0-9]+]]:vreg_64 = DS_READ2_B32 %2, 0, 1, 0, implicit $m0, implicit $exec :: (load 8, align 4, addrspace 3) - ; GFX7: $vgpr0_vgpr1 = COPY [[DS_READ2_B32_]] + ; GFX7: [[DS_READ_B64_:%[0-9]+]]:vreg_64 = DS_READ_B64 [[COPY]], 1020, 0, implicit $m0, implicit $exec :: (load 8, align 4, addrspace 3) + ; GFX7: $vgpr0_vgpr1 = COPY [[DS_READ_B64_]] ; GFX9-LABEL: name: load_local_s64_align4_from_1_gep_1020 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX9: 
[[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1020, implicit $exec - ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX9: [[DS_READ2_B32_gfx9_:%[0-9]+]]:vreg_64 = DS_READ2_B32_gfx9 [[V_ADD_U32_e64_]], 0, 1, 0, implicit $exec :: (load 8, align 4, addrspace 3) - ; GFX9: $vgpr0_vgpr1 = COPY [[DS_READ2_B32_gfx9_]] + ; GFX9: [[DS_READ_B64_gfx9_:%[0-9]+]]:vreg_64 = DS_READ_B64_gfx9 [[COPY]], 1020, 0, implicit $exec :: (load 8, align 4, addrspace 3) + ; GFX9: $vgpr0_vgpr1 = COPY [[DS_READ_B64_gfx9_]] ; GFX6-LABEL: name: load_local_s64_align4_from_1_gep_1020 ; GFX6: liveins: $vgpr0_vgpr1 - ; GFX6: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0 - ; GFX6: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1020 - ; GFX6: [[PTR_ADD:%[0-9]+]]:vgpr(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1020, implicit $exec + ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX6: $m0 = S_MOV_B32 -1 - ; GFX6: [[LOAD:%[0-9]+]]:vreg_64(s64) = G_LOAD [[PTR_ADD]](p3) :: (load 8, align 4, addrspace 3) - ; GFX6: $vgpr0_vgpr1 = COPY [[LOAD]](s64) + ; GFX6: [[DS_READ_B64_:%[0-9]+]]:vreg_64 = DS_READ_B64 %2, 0, 0, implicit $m0, implicit $exec :: (load 8, align 4, addrspace 3) + ; GFX6: $vgpr0_vgpr1 = COPY [[DS_READ_B64_]] %0:vgpr(p3) = COPY $vgpr0 %1:vgpr(s32) = G_CONSTANT i32 1020 %2:vgpr(p3) = G_PTR_ADD %0, %1 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-local.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-local.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-local.mir @@ -274,22 +274,18 @@ ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX7: $m0 = S_MOV_B32 -1 - ; GFX7: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; 
GFX7: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7: DS_WRITE2_B32 [[COPY1]], [[COPY3]], [[COPY2]], 0, 1, 0, implicit $m0, implicit $exec :: (store 8, align 4, addrspace 3) + ; GFX7: DS_WRITE_B64 [[COPY1]], [[COPY]], 0, 0, implicit $m0, implicit $exec :: (store 8, align 4, addrspace 3) ; GFX9-LABEL: name: store_local_s64_align4 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX9: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX9: DS_WRITE2_B32_gfx9 [[COPY1]], [[COPY3]], [[COPY2]], 0, 1, 0, implicit $exec :: (store 8, align 4, addrspace 3) + ; GFX9: DS_WRITE_B64_gfx9 [[COPY1]], [[COPY]], 0, 0, implicit $exec :: (store 8, align 4, addrspace 3) ; GFX6-LABEL: name: store_local_s64_align4 ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2 - ; GFX6: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1 - ; GFX6: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY $vgpr2 + ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX6: $m0 = S_MOV_B32 -1 - ; GFX6: G_STORE [[COPY]](s64), [[COPY1]](p3) :: (store 8, align 4, addrspace 3) + ; GFX6: DS_WRITE_B64 [[COPY1]], [[COPY]], 0, 0, implicit $m0, implicit $exec :: (store 8, align 4, addrspace 3) %0:vgpr(s64) = COPY $vgpr0_vgpr1 %1:vgpr(p3) = COPY $vgpr2 G_STORE %0, %1 :: (store 8, align 4, addrspace 3) @@ -315,22 +311,18 @@ ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX7: $m0 = S_MOV_B32 -1 - ; GFX7: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7: DS_WRITE2_B32 [[COPY1]], [[COPY3]], [[COPY2]], 0, 1, 0, implicit $m0, implicit $exec :: (store 8, align 4, addrspace 3) + ; GFX7: DS_WRITE_B64 [[COPY1]], [[COPY]], 0, 0, implicit $m0, implicit $exec :: (store 8, align 4, addrspace 3) ; GFX9-LABEL: name: store_local_p1_align4 ; 
GFX9: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX9: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX9: DS_WRITE2_B32_gfx9 [[COPY1]], [[COPY3]], [[COPY2]], 0, 1, 0, implicit $exec :: (store 8, align 4, addrspace 3) + ; GFX9: DS_WRITE_B64_gfx9 [[COPY1]], [[COPY]], 0, 0, implicit $exec :: (store 8, align 4, addrspace 3) ; GFX6-LABEL: name: store_local_p1_align4 ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2 - ; GFX6: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX6: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY $vgpr2 + ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX6: $m0 = S_MOV_B32 -1 - ; GFX6: G_STORE [[COPY]](p1), [[COPY1]](p3) :: (store 8, align 4, addrspace 3) + ; GFX6: DS_WRITE_B64 [[COPY1]], [[COPY]], 0, 0, implicit $m0, implicit $exec :: (store 8, align 4, addrspace 3) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(p3) = COPY $vgpr2 G_STORE %0, %1 :: (store 8, align 4, addrspace 3) @@ -356,22 +348,18 @@ ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX7: $m0 = S_MOV_B32 -1 - ; GFX7: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7: DS_WRITE2_B32 [[COPY1]], [[COPY3]], [[COPY2]], 0, 1, 0, implicit $m0, implicit $exec :: (store 8, align 4, addrspace 3) + ; GFX7: DS_WRITE_B64 [[COPY1]], [[COPY]], 0, 0, implicit $m0, implicit $exec :: (store 8, align 4, addrspace 3) ; GFX9-LABEL: name: store_local_v2s32_align4 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX9: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX9: DS_WRITE2_B32_gfx9 [[COPY1]], [[COPY3]], [[COPY2]], 0, 1, 0, implicit $exec :: 
(store 8, align 4, addrspace 3) + ; GFX9: DS_WRITE_B64_gfx9 [[COPY1]], [[COPY]], 0, 0, implicit $exec :: (store 8, align 4, addrspace 3) ; GFX6-LABEL: name: store_local_v2s32_align4 ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2 - ; GFX6: [[COPY:%[0-9]+]]:vgpr(<2 x s32>) = COPY $vgpr0_vgpr1 - ; GFX6: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY $vgpr2 + ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX6: $m0 = S_MOV_B32 -1 - ; GFX6: G_STORE [[COPY]](<2 x s32>), [[COPY1]](p3) :: (store 8, align 4, addrspace 3) + ; GFX6: DS_WRITE_B64 [[COPY1]], [[COPY]], 0, 0, implicit $m0, implicit $exec :: (store 8, align 4, addrspace 3) %0:vgpr(<2 x s32>) = COPY $vgpr0_vgpr1 %1:vgpr(p3) = COPY $vgpr2 G_STORE %0, %1 :: (store 8, align 4, addrspace 3) @@ -397,22 +385,18 @@ ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX7: $m0 = S_MOV_B32 -1 - ; GFX7: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7: DS_WRITE2_B32 [[COPY1]], [[COPY3]], [[COPY2]], 0, 1, 0, implicit $m0, implicit $exec :: (store 8, align 4, addrspace 3) + ; GFX7: DS_WRITE_B64 [[COPY1]], [[COPY]], 0, 0, implicit $m0, implicit $exec :: (store 8, align 4, addrspace 3) ; GFX9-LABEL: name: store_local_v4s16_align4 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX9: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX9: DS_WRITE2_B32_gfx9 [[COPY1]], [[COPY3]], [[COPY2]], 0, 1, 0, implicit $exec :: (store 8, align 4, addrspace 3) + ; GFX9: DS_WRITE_B64_gfx9 [[COPY1]], [[COPY]], 0, 0, implicit $exec :: (store 8, align 4, addrspace 3) ; GFX6-LABEL: name: store_local_v4s16_align4 ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2 - ; GFX6: [[COPY:%[0-9]+]]:vgpr(<4 x s16>) = COPY $vgpr0_vgpr1 - ; GFX6: [[COPY1:%[0-9]+]]:vgpr(p3) = 
COPY $vgpr2 + ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX6: $m0 = S_MOV_B32 -1 - ; GFX6: G_STORE [[COPY]](<4 x s16>), [[COPY1]](p3) :: (store 8, align 4, addrspace 3) + ; GFX6: DS_WRITE_B64 [[COPY1]], [[COPY]], 0, 0, implicit $m0, implicit $exec :: (store 8, align 4, addrspace 3) %0:vgpr(<4 x s16>) = COPY $vgpr0_vgpr1 %1:vgpr(p3) = COPY $vgpr2 G_STORE %0, %1 :: (store 8, align 4, addrspace 3) @@ -586,24 +570,20 @@ ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX7: $m0 = S_MOV_B32 -1 - ; GFX7: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7: DS_WRITE2_B32 [[COPY1]], [[COPY3]], [[COPY2]], 254, 255, 0, implicit $m0, implicit $exec :: (store 8, align 4, addrspace 3) + ; GFX7: DS_WRITE_B64 [[COPY1]], [[COPY]], 1016, 0, implicit $m0, implicit $exec :: (store 8, align 4, addrspace 3) ; GFX9-LABEL: name: store_local_s64_align4_from_1_gep_1016 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX9: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX9: DS_WRITE2_B32_gfx9 [[COPY1]], [[COPY3]], [[COPY2]], 254, 255, 0, implicit $exec :: (store 8, align 4, addrspace 3) + ; GFX9: DS_WRITE_B64_gfx9 [[COPY1]], [[COPY]], 1016, 0, implicit $exec :: (store 8, align 4, addrspace 3) ; GFX6-LABEL: name: store_local_s64_align4_from_1_gep_1016 ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2 - ; GFX6: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1 - ; GFX6: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY $vgpr2 - ; GFX6: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1016 - ; GFX6: [[PTR_ADD:%[0-9]+]]:vgpr(p3) = G_PTR_ADD [[COPY1]], [[C]](s32) + ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX6: 
[[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1016, implicit $exec + ; GFX6: %3:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY1]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX6: $m0 = S_MOV_B32 -1 - ; GFX6: G_STORE [[COPY]](s64), [[PTR_ADD]](p3) :: (store 8, align 4, addrspace 3) + ; GFX6: DS_WRITE_B64 %3, [[COPY]], 0, 0, implicit $m0, implicit $exec :: (store 8, align 4, addrspace 3) %0:vgpr(s64) = COPY $vgpr0_vgpr1 %1:vgpr(p3) = COPY $vgpr2 %2:vgpr(s32) = G_CONSTANT i32 1016 @@ -630,29 +610,21 @@ ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX7: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1020, implicit $exec - ; GFX7: %3:vgpr_32, dead %6:sreg_64_xexec = V_ADD_I32_e64 [[COPY1]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX7: $m0 = S_MOV_B32 -1 - ; GFX7: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX7: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX7: DS_WRITE2_B32 %3, [[COPY3]], [[COPY2]], 0, 1, 0, implicit $m0, implicit $exec :: (store 8, align 4, addrspace 3) + ; GFX7: DS_WRITE_B64 [[COPY1]], [[COPY]], 1020, 0, implicit $m0, implicit $exec :: (store 8, align 4, addrspace 3) ; GFX9-LABEL: name: store_local_s64_align4_from_1_gep_1020 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1020, implicit $exec - ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY1]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 - ; GFX9: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; GFX9: DS_WRITE2_B32_gfx9 [[V_ADD_U32_e64_]], [[COPY3]], [[COPY2]], 0, 1, 0, implicit $exec :: (store 8, align 4, addrspace 3) + ; GFX9: DS_WRITE_B64_gfx9 [[COPY1]], [[COPY]], 1020, 0, implicit $exec :: (store 8, align 4, addrspace 3) ; GFX6-LABEL: name: 
store_local_s64_align4_from_1_gep_1020 ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2 - ; GFX6: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1 - ; GFX6: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY $vgpr2 - ; GFX6: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1020 - ; GFX6: [[PTR_ADD:%[0-9]+]]:vgpr(p3) = G_PTR_ADD [[COPY1]], [[C]](s32) + ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1020, implicit $exec + ; GFX6: %3:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY1]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX6: $m0 = S_MOV_B32 -1 - ; GFX6: G_STORE [[COPY]](s64), [[PTR_ADD]](p3) :: (store 8, align 4, addrspace 3) + ; GFX6: DS_WRITE_B64 %3, [[COPY]], 0, 0, implicit $m0, implicit $exec :: (store 8, align 4, addrspace 3) %0:vgpr(s64) = COPY $vgpr0_vgpr1 %1:vgpr(p3) = COPY $vgpr2 %2:vgpr(s32) = G_CONSTANT i32 1020 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll @@ -0,0 +1,469 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=+unaligned-buffer-access < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-UNALIGNED %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=-unaligned-buffer-access < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-NOUNALIGNED %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -mattr=+unaligned-buffer-access < %s | FileCheck -check-prefixes=GCN,GFX7,GFX7-UNALIGNED %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -mattr=-unaligned-buffer-access < %s | FileCheck -check-prefixes=GCN,GFX7,GFX7-NOUNALIGNED %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -mattr=+unaligned-buffer-access < %s | FileCheck 
-check-prefixes=GCN,GFX6,GFX6-UNALIGNED %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -mattr=-unaligned-buffer-access < %s | FileCheck -check-prefixes=GCN,GFX6,GFX6-NOUNALIGNED %s + +define <4 x i32> @v_load_lds_v3i32(<4 x i32> addrspace(3)* %ptr) { +; GFX9-LABEL: v_load_lds_v3i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_read_b128 v[0:3], v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: v_load_lds_v3i32: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: ds_read_b128 v[0:3], v0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: v_load_lds_v3i32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: ds_read_b64 v[0:1], v0 +; GFX6-NEXT: ds_read_b64 v[2:3], v2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr + ret <4 x i32> %load +} + +define <4 x i32> @v_load_lds_v3i32_align1(<4 x i32> addrspace(3)* %ptr) { +; GFX9-LABEL: v_load_lds_v3i32_align1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_read_u8 v1, v0 +; GFX9-NEXT: ds_read_u8 v2, v0 offset:1 +; GFX9-NEXT: ds_read_u8 v4, v0 offset:2 +; GFX9-NEXT: ds_read_u8 v5, v0 offset:3 +; GFX9-NEXT: ds_read_u8 v6, v0 offset:4 +; GFX9-NEXT: s_mov_b32 s5, 8 +; GFX9-NEXT: s_movk_i32 s4, 0xff +; GFX9-NEXT: s_waitcnt lgkmcnt(3) +; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_or_b32 v1, v1, s4, v2 +; GFX9-NEXT: s_waitcnt lgkmcnt(2) +; GFX9-NEXT: v_and_b32_e32 v2, s4, v4 +; GFX9-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-NEXT: v_and_b32_e32 v4, s4, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 
v4, 24, v4 +; GFX9-NEXT: v_or3_b32 v4, v1, v2, v4 +; GFX9-NEXT: ds_read_u8 v1, v0 offset:5 +; GFX9-NEXT: ds_read_u8 v2, v0 offset:6 +; GFX9-NEXT: ds_read_u8 v5, v0 offset:7 +; GFX9-NEXT: ds_read_u8 v7, v0 offset:8 +; GFX9-NEXT: ds_read_u8 v8, v0 offset:9 +; GFX9-NEXT: v_mov_b32_e32 v3, 0xff +; GFX9-NEXT: s_waitcnt lgkmcnt(4) +; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt lgkmcnt(3) +; GFX9-NEXT: v_and_b32_e32 v2, v2, v3 +; GFX9-NEXT: s_waitcnt lgkmcnt(2) +; GFX9-NEXT: v_and_b32_e32 v5, v5, v3 +; GFX9-NEXT: v_and_or_b32 v1, v6, s4, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX9-NEXT: v_or3_b32 v1, v1, v2, v5 +; GFX9-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_or_b32 v2, v7, v3, v2 +; GFX9-NEXT: ds_read_u8 v6, v0 offset:10 +; GFX9-NEXT: ds_read_u8 v7, v0 offset:11 +; GFX9-NEXT: ds_read_u8 v8, v0 offset:12 +; GFX9-NEXT: ds_read_u8 v9, v0 offset:13 +; GFX9-NEXT: ds_read_u8 v10, v0 offset:14 +; GFX9-NEXT: ds_read_u8 v0, v0 offset:15 +; GFX9-NEXT: s_waitcnt lgkmcnt(5) +; GFX9-NEXT: v_and_b32_e32 v6, v6, v3 +; GFX9-NEXT: s_waitcnt lgkmcnt(4) +; GFX9-NEXT: v_and_b32_e32 v7, v7, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX9-NEXT: s_waitcnt lgkmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v5, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v0, v0, v3 +; GFX9-NEXT: v_or3_b32 v2, v2, v6, v7 +; GFX9-NEXT: v_and_b32_e32 v6, v10, v3 +; GFX9-NEXT: v_and_or_b32 v5, v8, v3, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; GFX9-NEXT: v_or3_b32 v3, v5, v6, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: s_setpc_b64 s[30:31] 
+; +; GFX7-LABEL: v_load_lds_v3i32_align1: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: ds_read_u8 v1, v0 +; GFX7-NEXT: ds_read_u8 v2, v0 offset:1 +; GFX7-NEXT: ds_read_u8 v4, v0 offset:2 +; GFX7-NEXT: ds_read_u8 v5, v0 offset:3 +; GFX7-NEXT: ds_read_u8 v6, v0 offset:4 +; GFX7-NEXT: s_movk_i32 s4, 0xff +; GFX7-NEXT: s_waitcnt lgkmcnt(3) +; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX7-NEXT: s_waitcnt lgkmcnt(2) +; GFX7-NEXT: v_and_b32_e32 v2, s4, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX7-NEXT: s_waitcnt lgkmcnt(1) +; GFX7-NEXT: v_and_b32_e32 v2, s4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX7-NEXT: v_or_b32_e32 v4, v1, v2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v1, s4, v6 +; GFX7-NEXT: v_mov_b32_e32 v3, 0xff +; GFX7-NEXT: ds_read_u8 v2, v0 offset:5 +; GFX7-NEXT: ds_read_u8 v5, v0 offset:6 +; GFX7-NEXT: ds_read_u8 v6, v0 offset:7 +; GFX7-NEXT: ds_read_u8 v7, v0 offset:8 +; GFX7-NEXT: ds_read_u8 v8, v0 offset:9 +; GFX7-NEXT: s_waitcnt lgkmcnt(4) +; GFX7-NEXT: v_and_b32_e32 v2, v2, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX7-NEXT: s_waitcnt lgkmcnt(3) +; GFX7-NEXT: v_and_b32_e32 v2, v5, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX7-NEXT: s_waitcnt lgkmcnt(2) +; GFX7-NEXT: v_and_b32_e32 v2, v6, v3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v5, v8, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX7-NEXT: v_and_b32_e32 v2, v7, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v5 +; GFX7-NEXT: ds_read_u8 v5, v0 offset:10 +; GFX7-NEXT: ds_read_u8 v6, v0 offset:11 +; GFX7-NEXT: ds_read_u8 v7, v0 offset:12 +; GFX7-NEXT: 
ds_read_u8 v8, v0 offset:13 +; GFX7-NEXT: ds_read_u8 v9, v0 offset:14 +; GFX7-NEXT: s_waitcnt lgkmcnt(4) +; GFX7-NEXT: v_and_b32_e32 v5, v5, v3 +; GFX7-NEXT: ds_read_u8 v0, v0 offset:15 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v5 +; GFX7-NEXT: s_waitcnt lgkmcnt(4) +; GFX7-NEXT: v_and_b32_e32 v5, v6, v3 +; GFX7-NEXT: s_waitcnt lgkmcnt(2) +; GFX7-NEXT: v_and_b32_e32 v6, v8, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v5 +; GFX7-NEXT: v_and_b32_e32 v5, v7, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX7-NEXT: s_waitcnt lgkmcnt(1) +; GFX7-NEXT: v_and_b32_e32 v6, v9, v3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v0, v0, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; GFX7-NEXT: v_or_b32_e32 v3, v5, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, v4 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: v_load_lds_v3i32_align1: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_add_i32_e32 v2, vcc, 2, v0 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, 3, v0 +; GFX6-NEXT: v_add_i32_e32 v6, vcc, 12, v0 +; GFX6-NEXT: v_add_i32_e32 v7, vcc, 13, v0 +; GFX6-NEXT: v_add_i32_e32 v8, vcc, 14, v0 +; GFX6-NEXT: ds_read_u8 v2, v2 +; GFX6-NEXT: s_movk_i32 s4, 0xff +; GFX6-NEXT: ds_read_u8 v3, v3 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, 8, v0 +; GFX6-NEXT: ds_read_u8 v9, v0 +; GFX6-NEXT: ds_read_u8 v10, v4 +; GFX6-NEXT: ds_read_u8 v6, v6 +; GFX6-NEXT: ds_read_u8 v7, v7 +; GFX6-NEXT: ds_read_u8 v8, v8 +; GFX6-NEXT: s_waitcnt lgkmcnt(4) +; GFX6-NEXT: v_and_b32_e32 v4, s4, v9 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0 +; GFX6-NEXT: v_add_i32_e32 v9, vcc, 5, v0 +; GFX6-NEXT: v_add_i32_e32 v11, vcc, 6, v0 +; GFX6-NEXT: v_add_i32_e32 v12, vcc, 7, v0 +; GFX6-NEXT: ds_read_u8 v9, v9 +; GFX6-NEXT: ds_read_u8 v11, v11 +; GFX6-NEXT: ds_read_u8 
v1, v1 +; GFX6-NEXT: ds_read_u8 v12, v12 +; GFX6-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_mov_b32_e32 v5, 0xff +; GFX6-NEXT: s_waitcnt lgkmcnt(1) +; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX6-NEXT: v_or_b32_e32 v1, v4, v1 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX6-NEXT: v_and_b32_e32 v2, s4, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX6-NEXT: v_or_b32_e32 v4, v1, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, 10, v0 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, 11, v0 +; GFX6-NEXT: ds_read_u8 v2, v2 +; GFX6-NEXT: ds_read_u8 v1, v1 +; GFX6-NEXT: ds_read_u8 v3, v3 +; GFX6-NEXT: v_and_b32_e32 v9, v9, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX6-NEXT: v_and_b32_e32 v10, v10, v5 +; GFX6-NEXT: s_waitcnt lgkmcnt(1) +; GFX6-NEXT: v_and_b32_e32 v1, v1, v5 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v9 +; GFX6-NEXT: v_and_b32_e32 v9, v11, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v9 +; GFX6-NEXT: v_and_b32_e32 v9, v12, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v9 +; GFX6-NEXT: v_add_i32_e32 v9, vcc, 9, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, 15, v0 +; GFX6-NEXT: ds_read_u8 v9, v9 +; GFX6-NEXT: ds_read_u8 v0, v0 +; GFX6-NEXT: v_and_b32_e32 v2, v2, v5 +; GFX6-NEXT: s_waitcnt lgkmcnt(2) +; GFX6-NEXT: v_and_b32_e32 v3, v3, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: s_waitcnt lgkmcnt(1) +; GFX6-NEXT: v_and_b32_e32 v9, v9, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX6-NEXT: v_or_b32_e32 v9, v10, v9 +; GFX6-NEXT: v_or_b32_e32 v2, v9, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX6-NEXT: v_and_b32_e32 v3, v6, v5 +; GFX6-NEXT: v_and_b32_e32 v6, v7, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; GFX6-NEXT: v_or_b32_e32 v3, v3, v6 +; GFX6-NEXT: v_and_b32_e32 v6, v8, v5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX6-NEXT: v_and_b32_e32 v0, v0, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v3, v3, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; GFX6-NEXT: v_or_b32_e32 v3, v3, v0 +; GFX6-NEXT: v_mov_b32_e32 v0, v4 +; GFX6-NEXT: s_setpc_b64 s[30:31] + %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 1 + ret <4 x i32> %load +} + +define <4 x i32> @v_load_lds_v3i32_align2(<4 x i32> addrspace(3)* %ptr) { +; GFX9-LABEL: v_load_lds_v3i32_align2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_read_u16 v1, v0 +; GFX9-NEXT: ds_read_u16 v2, v0 offset:2 +; GFX9-NEXT: ds_read_u16 v3, v0 offset:4 +; GFX9-NEXT: ds_read_u16 v5, v0 offset:6 +; GFX9-NEXT: ds_read_u16 v6, v0 offset:8 +; GFX9-NEXT: s_mov_b32 s4, 0xffff +; GFX9-NEXT: s_waitcnt lgkmcnt(3) +; GFX9-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_and_or_b32 v4, v1, s4, v2 +; GFX9-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-NEXT: v_and_b32_e32 v1, s4, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_or_b32 v1, v3, s4, v1 +; GFX9-NEXT: ds_read_u16 v2, v0 offset:10 +; GFX9-NEXT: ds_read_u16 v3, v0 offset:12 +; GFX9-NEXT: ds_read_u16 v0, v0 offset:14 +; GFX9-NEXT: s_waitcnt lgkmcnt(2) +; GFX9-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_and_or_b32 v3, v3, s4, v0 +; GFX9-NEXT: v_and_or_b32 v2, v6, s4, v2 +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: v_load_lds_v3i32_align2: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: ds_read_u16 v1, v0 +; GFX7-NEXT: ds_read_u16 v2, v0 offset:2 +; GFX7-NEXT: ds_read_u16 v3, v0 offset:4 +; GFX7-NEXT: ds_read_u16 v5, v0 offset:6 +; GFX7-NEXT: ds_read_u16 v6, v0 offset:8 +; GFX7-NEXT: s_mov_b32 s4, 0xffff 
+; GFX7-NEXT: s_waitcnt lgkmcnt(3) +; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v4, v1, v2 +; GFX7-NEXT: s_waitcnt lgkmcnt(1) +; GFX7-NEXT: v_and_b32_e32 v2, s4, v5 +; GFX7-NEXT: v_and_b32_e32 v1, s4, v3 +; GFX7-NEXT: ds_read_u16 v3, v0 offset:10 +; GFX7-NEXT: ds_read_u16 v5, v0 offset:12 +; GFX7-NEXT: ds_read_u16 v0, v0 offset:14 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX7-NEXT: s_waitcnt lgkmcnt(2) +; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX7-NEXT: v_and_b32_e32 v2, s4, v6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX7-NEXT: v_and_b32_e32 v3, s4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, v4 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: v_load_lds_v3i32_align2: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_add_i32_e32 v1, vcc, 2, v0 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, 10, v0 +; GFX6-NEXT: v_add_i32_e32 v5, vcc, 12, v0 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: v_add_i32_e32 v7, vcc, 14, v0 +; GFX6-NEXT: v_add_i32_e32 v8, vcc, 4, v0 +; GFX6-NEXT: ds_read_u16 v6, v0 +; GFX6-NEXT: ds_read_u16 v2, v2 +; GFX6-NEXT: ds_read_u16 v5, v5 +; GFX6-NEXT: ds_read_u16 v4, v4 +; GFX6-NEXT: ds_read_u16 v7, v7 +; GFX6-NEXT: ds_read_u16 v8, v8 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, 6, v0 +; GFX6-NEXT: ds_read_u16 v1, v1 +; GFX6-NEXT: ds_read_u16 v9, v0 +; GFX6-NEXT: s_mov_b32 s4, 0xffff +; GFX6-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX6-NEXT: s_waitcnt lgkmcnt(4) +; GFX6-NEXT: v_and_b32_e32 v4, v4, v3 +; GFX6-NEXT: s_waitcnt lgkmcnt(1) +; GFX6-NEXT: v_and_b32_e32 v0, s4, v1 +; GFX6-NEXT: v_and_b32_e32 v6, s4, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; 
GFX6-NEXT: v_and_b32_e32 v2, v2, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX6-NEXT: v_or_b32_e32 v0, v6, v0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_and_b32_e32 v6, s4, v9 +; GFX6-NEXT: v_or_b32_e32 v2, v2, v4 +; GFX6-NEXT: v_and_b32_e32 v4, v5, v3 +; GFX6-NEXT: v_and_b32_e32 v3, v7, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v1, s4, v8 +; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v6 +; GFX6-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX6-NEXT: s_setpc_b64 s[30:31] + %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 2 + ret <4 x i32> %load +} + +define <4 x i32> @v_load_lds_v3i32_align4(<4 x i32> addrspace(3)* %ptr) { +; GFX9-LABEL: v_load_lds_v3i32_align4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_read_b128 v[0:3], v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: v_load_lds_v3i32_align4: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: ds_read_b128 v[0:3], v0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: v_load_lds_v3i32_align4: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: ds_read_b64 v[0:1], v0 +; GFX6-NEXT: ds_read_b64 v[2:3], v2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 4 + ret <4 x i32> %load +} + +define <4 x i32> @v_load_lds_v3i32_align8(<4 x i32> addrspace(3)* %ptr) { +; GFX9-LABEL: v_load_lds_v3i32_align8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_read_b128 v[0:3], v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: v_load_lds_v3i32_align8: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: ds_read_b128 v[0:3], v0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: v_load_lds_v3i32_align8: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: ds_read_b64 v[0:1], v0 +; GFX6-NEXT: ds_read_b64 v[2:3], v2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 8 + ret <4 x i32> %load +} + +define <4 x i32> @v_load_lds_v3i32_align16(<4 x i32> addrspace(3)* %ptr) { +; GFX9-LABEL: v_load_lds_v3i32_align16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_read_b128 v[0:3], v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: v_load_lds_v3i32_align16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: ds_read_b128 v[0:3], v0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: v_load_lds_v3i32_align16: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: ds_read_b64 v[0:1], v0 +; GFX6-NEXT: ds_read_b64 v[2:3], v2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 16 + ret <4 x i32> %load +} Index: llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll @@ -0,0 +1,398 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=+unaligned-buffer-access < %s | FileCheck 
-check-prefixes=GCN,GFX9,GFX9-UNALIGNED %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=-unaligned-buffer-access < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-NOUNALIGNED %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -mattr=+unaligned-buffer-access < %s | FileCheck -check-prefixes=GCN,GFX7,GFX7-UNALIGNED %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -mattr=-unaligned-buffer-access < %s | FileCheck -check-prefixes=GCN,GFX7,GFX7-NOUNALIGNED %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -mattr=+unaligned-buffer-access < %s | FileCheck -check-prefixes=GCN,GFX6,GFX6-UNALIGNED %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -mattr=-unaligned-buffer-access < %s | FileCheck -check-prefixes=GCN,GFX6,GFX6-NOUNALIGNED %s + +define <3 x i32> @load_lds_v3i32(<3 x i32> addrspace(3)* %ptr) { +; GFX9-LABEL: load_lds_v3i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_read_b96 v[0:2], v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: load_lds_v3i32: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: ds_read_b96 v[0:2], v0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: load_lds_v3i32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: ds_read_b64 v[0:1], v0 +; GFX6-NEXT: ds_read_b32 v2, v2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr + ret <3 x i32> %load +} + +define <3 x i32> @load_lds_v3i32_align1(<3 x i32> addrspace(3)* %ptr) { +; GFX9-LABEL: load_lds_v3i32_align1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: 
ds_read_u8 v0, v0 +; GFX9-NEXT: ds_read_u8 v1, v2 offset:1 +; GFX9-NEXT: ds_read_u8 v4, v2 offset:2 +; GFX9-NEXT: ds_read_u8 v5, v2 offset:3 +; GFX9-NEXT: ds_read_u8 v6, v2 offset:4 +; GFX9-NEXT: s_mov_b32 s5, 8 +; GFX9-NEXT: s_movk_i32 s4, 0xff +; GFX9-NEXT: s_waitcnt lgkmcnt(3) +; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v1 +; GFX9-NEXT: s_waitcnt lgkmcnt(2) +; GFX9-NEXT: v_and_b32_e32 v1, s4, v4 +; GFX9-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-NEXT: v_and_b32_e32 v4, s4, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX9-NEXT: v_or3_b32 v0, v0, v1, v4 +; GFX9-NEXT: ds_read_u8 v1, v2 offset:5 +; GFX9-NEXT: ds_read_u8 v4, v2 offset:6 +; GFX9-NEXT: ds_read_u8 v5, v2 offset:7 +; GFX9-NEXT: ds_read_u8 v7, v2 offset:8 +; GFX9-NEXT: ds_read_u8 v8, v2 offset:9 +; GFX9-NEXT: v_mov_b32_e32 v3, 0xff +; GFX9-NEXT: s_waitcnt lgkmcnt(4) +; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt lgkmcnt(3) +; GFX9-NEXT: v_and_b32_e32 v4, v4, v3 +; GFX9-NEXT: s_waitcnt lgkmcnt(2) +; GFX9-NEXT: v_and_b32_e32 v5, v5, v3 +; GFX9-NEXT: v_and_or_b32 v1, v6, s4, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX9-NEXT: v_or3_b32 v1, v1, v4, v5 +; GFX9-NEXT: ds_read_u8 v4, v2 offset:10 +; GFX9-NEXT: ds_read_u8 v2, v2 offset:11 +; GFX9-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-NEXT: s_waitcnt lgkmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_or_b32 v5, v7, v3, v5 +; GFX9-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-NEXT: v_and_b32_e32 v4, v4, v3 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v2, v2, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX9-NEXT: v_or3_b32 v2, v5, v4, v2 +; 
GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: load_lds_v3i32_align1: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: ds_read_u8 v0, v0 +; GFX7-NEXT: ds_read_u8 v1, v2 offset:1 +; GFX7-NEXT: ds_read_u8 v4, v2 offset:2 +; GFX7-NEXT: ds_read_u8 v5, v2 offset:3 +; GFX7-NEXT: ds_read_u8 v6, v2 offset:4 +; GFX7-NEXT: s_movk_i32 s4, 0xff +; GFX7-NEXT: s_waitcnt lgkmcnt(3) +; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(2) +; GFX7-NEXT: v_and_b32_e32 v1, s4, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(1) +; GFX7-NEXT: v_and_b32_e32 v1, s4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v1, s4, v6 +; GFX7-NEXT: v_mov_b32_e32 v3, 0xff +; GFX7-NEXT: ds_read_u8 v4, v2 offset:5 +; GFX7-NEXT: ds_read_u8 v5, v2 offset:6 +; GFX7-NEXT: ds_read_u8 v6, v2 offset:7 +; GFX7-NEXT: ds_read_u8 v7, v2 offset:8 +; GFX7-NEXT: ds_read_u8 v8, v2 offset:9 +; GFX7-NEXT: s_waitcnt lgkmcnt(4) +; GFX7-NEXT: v_and_b32_e32 v4, v4, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX7-NEXT: s_waitcnt lgkmcnt(3) +; GFX7-NEXT: v_and_b32_e32 v4, v5, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX7-NEXT: s_waitcnt lgkmcnt(2) +; GFX7-NEXT: v_and_b32_e32 v4, v6, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX7-NEXT: ds_read_u8 v4, v2 offset:10 +; GFX7-NEXT: ds_read_u8 v2, v2 offset:11 +; GFX7-NEXT: s_waitcnt lgkmcnt(2) +; GFX7-NEXT: v_and_b32_e32 v6, v8, v3 +; GFX7-NEXT: v_and_b32_e32 v5, v7, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; GFX7-NEXT: s_waitcnt lgkmcnt(1) 
+; GFX7-NEXT: v_and_b32_e32 v4, v4, v3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v2, v2, v3 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: load_lds_v3i32_align1: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, 2, v0 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, 3, v0 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: v_add_i32_e32 v5, vcc, 8, v0 +; GFX6-NEXT: v_add_i32_e32 v6, vcc, 9, v0 +; GFX6-NEXT: v_add_i32_e32 v7, vcc, 10, v0 +; GFX6-NEXT: v_add_i32_e32 v8, vcc, 11, v0 +; GFX6-NEXT: v_add_i32_e32 v10, vcc, 4, v0 +; GFX6-NEXT: v_add_i32_e32 v11, vcc, 5, v0 +; GFX6-NEXT: v_add_i32_e32 v12, vcc, 6, v0 +; GFX6-NEXT: ds_read_u8 v2, v2 +; GFX6-NEXT: ds_read_u8 v3, v3 +; GFX6-NEXT: ds_read_u8 v9, v0 +; GFX6-NEXT: ds_read_u8 v5, v5 +; GFX6-NEXT: ds_read_u8 v6, v6 +; GFX6-NEXT: ds_read_u8 v7, v7 +; GFX6-NEXT: ds_read_u8 v8, v8 +; GFX6-NEXT: ds_read_u8 v10, v10 +; GFX6-NEXT: ds_read_u8 v11, v11 +; GFX6-NEXT: ds_read_u8 v12, v12 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, 7, v0 +; GFX6-NEXT: ds_read_u8 v1, v1 +; GFX6-NEXT: ds_read_u8 v13, v0 +; GFX6-NEXT: s_movk_i32 s4, 0xff +; GFX6-NEXT: s_waitcnt lgkmcnt(9) +; GFX6-NEXT: v_and_b32_e32 v9, s4, v9 +; GFX6-NEXT: v_mov_b32_e32 v4, 0xff +; GFX6-NEXT: s_waitcnt lgkmcnt(1) +; GFX6-NEXT: v_and_b32_e32 v0, s4, v1 +; GFX6-NEXT: v_and_b32_e32 v1, s4, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v9, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: v_and_b32_e32 v1, s4, v3 +; GFX6-NEXT: v_and_b32_e32 v2, v11, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: v_and_b32_e32 v1, v10, v4 +; GFX6-NEXT: 
v_lshlrev_b32_e32 v2, 8, v2 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX6-NEXT: v_and_b32_e32 v2, v12, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_and_b32_e32 v2, v13, v4 +; GFX6-NEXT: v_and_b32_e32 v3, v6, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX6-NEXT: v_and_b32_e32 v2, v5, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX6-NEXT: v_and_b32_e32 v3, v7, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX6-NEXT: v_and_b32_e32 v3, v8, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX6-NEXT: s_setpc_b64 s[30:31] + %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 1 + ret <3 x i32> %load +} + +define <3 x i32> @load_lds_v3i32_align2(<3 x i32> addrspace(3)* %ptr) { +; GFX9-LABEL: load_lds_v3i32_align2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: ds_read_u16 v0, v0 +; GFX9-NEXT: ds_read_u16 v1, v2 offset:2 +; GFX9-NEXT: ds_read_u16 v3, v2 offset:4 +; GFX9-NEXT: ds_read_u16 v4, v2 offset:6 +; GFX9-NEXT: ds_read_u16 v5, v2 offset:8 +; GFX9-NEXT: ds_read_u16 v2, v2 offset:10 +; GFX9-NEXT: s_mov_b32 s4, 0xffff +; GFX9-NEXT: s_waitcnt lgkmcnt(4) +; GFX9-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v1 +; GFX9-NEXT: s_waitcnt lgkmcnt(2) +; GFX9-NEXT: v_and_b32_e32 v1, s4, v4 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_and_or_b32 v1, v3, s4, v1 +; GFX9-NEXT: v_and_or_b32 v2, v5, s4, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: load_lds_v3i32_align2: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: ds_read_u16 v0, v0 +; GFX7-NEXT: ds_read_u16 v1, v2 offset:2 +; GFX7-NEXT: ds_read_u16 v3, v2 offset:4 +; GFX7-NEXT: ds_read_u16 v4, v2 offset:6 +; GFX7-NEXT: ds_read_u16 v5, v2 offset:8 +; GFX7-NEXT: ds_read_u16 v2, v2 offset:10 +; GFX7-NEXT: s_mov_b32 s4, 0xffff +; GFX7-NEXT: s_waitcnt lgkmcnt(4) +; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(3) +; GFX7-NEXT: v_and_b32_e32 v1, s4, v3 +; GFX7-NEXT: s_waitcnt lgkmcnt(2) +; GFX7-NEXT: v_and_b32_e32 v3, s4, v4 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX7-NEXT: v_and_b32_e32 v3, s4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: load_lds_v3i32_align2: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_add_i32_e32 v1, vcc, 2, v0 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: ds_read_u16 v1, v1 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, 6, v0 +; GFX6-NEXT: v_add_i32_e32 v5, vcc, 8, v0 +; GFX6-NEXT: ds_read_u16 v4, v4 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, 4, v0 +; GFX6-NEXT: ds_read_u16 v6, v0 +; GFX6-NEXT: ds_read_u16 v5, v5 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, 10, v0 +; GFX6-NEXT: ds_read_u16 v3, v3 +; GFX6-NEXT: ds_read_u16 v7, v0 +; GFX6-NEXT: s_mov_b32 s4, 0xffff +; GFX6-NEXT: s_waitcnt lgkmcnt(5) +; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX6-NEXT: s_waitcnt lgkmcnt(3) +; GFX6-NEXT: v_and_b32_e32 v0, s4, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: s_waitcnt lgkmcnt(1) +; GFX6-NEXT: v_and_b32_e32 v1, s4, v3 +; GFX6-NEXT: v_and_b32_e32 v3, s4, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_mov_b32_e32 
v2, 0xffff +; GFX6-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX6-NEXT: v_and_b32_e32 v3, v5, v2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_and_b32_e32 v2, v7, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX6-NEXT: s_setpc_b64 s[30:31] + %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 2 + ret <3 x i32> %load +} + +define <3 x i32> @load_lds_v3i32_align4(<3 x i32> addrspace(3)* %ptr) { +; GFX9-LABEL: load_lds_v3i32_align4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_read_b96 v[0:2], v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: load_lds_v3i32_align4: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: ds_read_b96 v[0:2], v0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: load_lds_v3i32_align4: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: ds_read_b64 v[0:1], v0 +; GFX6-NEXT: ds_read_b32 v2, v2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 4 + ret <3 x i32> %load +} + +define <3 x i32> @load_lds_v3i32_align8(<3 x i32> addrspace(3)* %ptr) { +; GFX9-LABEL: load_lds_v3i32_align8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_read_b96 v[0:2], v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: load_lds_v3i32_align8: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: ds_read_b96 v[0:2], v0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: load_lds_v3i32_align8: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: 
v_add_i32_e32 v2, vcc, 8, v0 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: ds_read_b64 v[0:1], v0 +; GFX6-NEXT: ds_read_b32 v2, v2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 8 + ret <3 x i32> %load +} + +define <3 x i32> @load_lds_v3i32_align16(<3 x i32> addrspace(3)* %ptr) { +; GFX9-LABEL: load_lds_v3i32_align16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_read_b96 v[0:2], v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: load_lds_v3i32_align16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: ds_read_b96 v[0:2], v0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: load_lds_v3i32_align16: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: ds_read_b64 v[0:1], v0 +; GFX6-NEXT: ds_read_b32 v2, v2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] + %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 16 + ret <3 x i32> %load +} Index: llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll @@ -0,0 +1,583 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=+unaligned-buffer-access < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-UNALIGNED %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=-unaligned-buffer-access < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-NOUNALIGNED %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -mattr=+unaligned-buffer-access < %s | FileCheck 
-check-prefixes=GCN,GFX7,GFX7-UNALIGNED %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -mattr=-unaligned-buffer-access < %s | FileCheck -check-prefixes=GCN,GFX7,GFX7-NOUNALIGNED %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -mattr=+unaligned-buffer-access < %s | FileCheck -check-prefixes=GCN,GFX6,GFX6-UNALIGNED %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -mattr=-unaligned-buffer-access < %s | FileCheck -check-prefixes=GCN,GFX6,GFX6-NOUNALIGNED %s + +define amdgpu_kernel void @store_lds_v3i32(<4 x i32> addrspace(3)* %out, <4 x i32> %x) { +; GFX9-LABEL: store_lds_v3i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: ds_write_b128 v4, v[0:3] +; GFX9-NEXT: s_endpgm +; +; GFX7-LABEL: store_lds_v3i32: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: ds_write_b128 v4, v[0:3] +; GFX7-NEXT: s_endpgm +; +; GFX6-LABEL: store_lds_v3i32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: ds_write_b64 v2, v[0:1] +; GFX6-NEXT: s_add_u32 s0, s4, 8 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_write_b64 v2, 
v[0:1] +; GFX6-NEXT: s_endpgm + store <4 x i32> %x, <4 x i32> addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @store_lds_v4i32_align1(<4 x i32> addrspace(3)* %out, <4 x i32> %x) { +; GFX9-LABEL: store_lds_v4i32_align1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: s_lshr_b32 s5, s0, 8 +; GFX9-NEXT: s_lshr_b32 s6, s0, 16 +; GFX9-NEXT: s_lshr_b32 s7, s0, 24 +; GFX9-NEXT: s_add_u32 s0, s4, 1 +; GFX9-NEXT: ds_write_b8 v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_add_u32 s0, s4, 2 +; GFX9-NEXT: ds_write_b8 v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_add_u32 s0, s4, 3 +; GFX9-NEXT: ds_write_b8 v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_add_u32 s0, s4, 4 +; GFX9-NEXT: ds_write_b8 v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: s_lshr_b32 s5, s1, 8 +; GFX9-NEXT: s_add_u32 s0, s4, 5 +; GFX9-NEXT: ds_write_b8 v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_lshr_b32 s6, s1, 16 +; GFX9-NEXT: s_add_u32 s0, s4, 6 +; GFX9-NEXT: ds_write_b8 v1, v0 +; GFX9-NEXT: s_lshr_b32 s7, s1, 24 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_add_u32 s0, s4, 7 +; GFX9-NEXT: ds_write_b8 v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: s_add_u32 s0, s4, 8 +; GFX9-NEXT: ds_write_b8 v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: s_lshr_b32 s1, s2, 8 +; GFX9-NEXT: s_add_u32 s0, s4, 9 +; GFX9-NEXT: ds_write_b8 v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_lshr_b32 s5, s2, 16 +; GFX9-NEXT: s_add_u32 s0, s4, 
10 +; GFX9-NEXT: ds_write_b8 v1, v0 +; GFX9-NEXT: s_lshr_b32 s6, s2, 24 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_add_u32 s0, s4, 11 +; GFX9-NEXT: ds_write_b8 v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: s_add_u32 s0, s4, 12 +; GFX9-NEXT: ds_write_b8 v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: s_lshr_b32 s1, s3, 8 +; GFX9-NEXT: s_add_u32 s0, s4, 13 +; GFX9-NEXT: ds_write_b8 v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: s_lshr_b32 s2, s3, 16 +; GFX9-NEXT: s_add_u32 s0, s4, 14 +; GFX9-NEXT: ds_write_b8 v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: s_lshr_b32 s5, s3, 24 +; GFX9-NEXT: s_add_u32 s0, s4, 15 +; GFX9-NEXT: ds_write_b8 v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: ds_write_b8 v1, v0 +; GFX9-NEXT: s_endpgm +; +; GFX7-LABEL: store_lds_v4i32_align1: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: s_lshr_b32 s5, s0, 8 +; GFX7-NEXT: s_lshr_b32 s6, s0, 16 +; GFX7-NEXT: s_lshr_b32 s7, s0, 24 +; GFX7-NEXT: s_add_u32 s0, s4, 1 +; GFX7-NEXT: ds_write_b8 v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: s_add_u32 s0, s4, 2 +; GFX7-NEXT: ds_write_b8 v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: s_add_u32 s0, s4, 3 +; GFX7-NEXT: ds_write_b8 v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s7 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: s_add_u32 s0, s4, 4 +; GFX7-NEXT: ds_write_b8 v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-NEXT: s_lshr_b32 s5, s1, 8 +; GFX7-NEXT: s_add_u32 
s0, s4, 5 +; GFX7-NEXT: ds_write_b8 v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: s_lshr_b32 s6, s1, 16 +; GFX7-NEXT: s_add_u32 s0, s4, 6 +; GFX7-NEXT: ds_write_b8 v1, v0 +; GFX7-NEXT: s_lshr_b32 s7, s1, 24 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: s_add_u32 s0, s4, 7 +; GFX7-NEXT: ds_write_b8 v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: v_mov_b32_e32 v0, s7 +; GFX7-NEXT: s_add_u32 s0, s4, 8 +; GFX7-NEXT: ds_write_b8 v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: s_lshr_b32 s1, s2, 8 +; GFX7-NEXT: s_add_u32 s0, s4, 9 +; GFX7-NEXT: ds_write_b8 v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: s_lshr_b32 s5, s2, 16 +; GFX7-NEXT: s_add_u32 s0, s4, 10 +; GFX7-NEXT: ds_write_b8 v1, v0 +; GFX7-NEXT: s_lshr_b32 s6, s2, 24 +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: s_add_u32 s0, s4, 11 +; GFX7-NEXT: ds_write_b8 v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_add_u32 s0, s4, 12 +; GFX7-NEXT: ds_write_b8 v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: v_mov_b32_e32 v0, s3 +; GFX7-NEXT: s_lshr_b32 s1, s3, 8 +; GFX7-NEXT: s_add_u32 s0, s4, 13 +; GFX7-NEXT: ds_write_b8 v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-NEXT: s_lshr_b32 s2, s3, 16 +; GFX7-NEXT: s_add_u32 s0, s4, 14 +; GFX7-NEXT: ds_write_b8 v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: s_lshr_b32 s5, s3, 24 +; GFX7-NEXT: s_add_u32 s0, s4, 15 +; GFX7-NEXT: ds_write_b8 v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: ds_write_b8 v1, v0 +; GFX7-NEXT: s_endpgm +; +; GFX6-LABEL: store_lds_v4i32_align1: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd +; GFX6-NEXT:
s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: s_lshr_b32 s5, s0, 8 +; GFX6-NEXT: s_lshr_b32 s6, s0, 16 +; GFX6-NEXT: s_lshr_b32 s7, s0, 24 +; GFX6-NEXT: s_add_u32 s0, s4, 1 +; GFX6-NEXT: ds_write_b8 v1, v0 +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_add_u32 s0, s4, 2 +; GFX6-NEXT: ds_write_b8 v1, v0 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_add_u32 s0, s4, 3 +; GFX6-NEXT: ds_write_b8 v1, v0 +; GFX6-NEXT: v_mov_b32_e32 v0, s7 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_add_u32 s0, s4, 4 +; GFX6-NEXT: ds_write_b8 v1, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: s_lshr_b32 s5, s1, 8 +; GFX6-NEXT: s_add_u32 s0, s4, 5 +; GFX6-NEXT: ds_write_b8 v1, v0 +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_lshr_b32 s6, s1, 16 +; GFX6-NEXT: s_add_u32 s0, s4, 6 +; GFX6-NEXT: ds_write_b8 v1, v0 +; GFX6-NEXT: s_lshr_b32 s7, s1, 24 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_add_u32 s0, s4, 7 +; GFX6-NEXT: ds_write_b8 v1, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s7 +; GFX6-NEXT: s_add_u32 s0, s4, 8 +; GFX6-NEXT: ds_write_b8 v1, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_lshr_b32 s1, s2, 8 +; GFX6-NEXT: s_add_u32 s0, s4, 9 +; GFX6-NEXT: ds_write_b8 v1, v0 +; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_lshr_b32 s5, s2, 16 +; GFX6-NEXT: s_add_u32 s0, s4, 10 +; GFX6-NEXT: ds_write_b8 v1, v0 +; GFX6-NEXT: s_lshr_b32 s6, s2, 24 +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_add_u32 s0, s4, 11 +; GFX6-NEXT: ds_write_b8 v1, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_add_u32 s0, s4, 12 +; 
GFX6-NEXT: ds_write_b8 v1, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s3 +; GFX6-NEXT: s_lshr_b32 s1, s3, 8 +; GFX6-NEXT: s_add_u32 s0, s4, 13 +; GFX6-NEXT: ds_write_b8 v1, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: s_lshr_b32 s2, s3, 16 +; GFX6-NEXT: s_add_u32 s0, s4, 14 +; GFX6-NEXT: ds_write_b8 v1, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_lshr_b32 s5, s3, 24 +; GFX6-NEXT: s_add_u32 s0, s4, 15 +; GFX6-NEXT: ds_write_b8 v1, v0 +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_write_b8 v1, v0 +; GFX6-NEXT: s_endpgm + store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 1 + ret void +} + +define amdgpu_kernel void @store_lds_v4i32_align2(<4 x i32> addrspace(3)* %out, <4 x i32> %x) { +; GFX9-LABEL: store_lds_v4i32_align2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: s_lshr_b32 s5, s0, 16 +; GFX9-NEXT: s_add_u32 s0, s4, 2 +; GFX9-NEXT: ds_write_b16 v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_add_u32 s0, s4, 4 +; GFX9-NEXT: ds_write_b16 v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: s_lshr_b32 s5, s1, 16 +; GFX9-NEXT: s_add_u32 s0, s4, 6 +; GFX9-NEXT: ds_write_b16 v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: s_add_u32 s0, s4, 8 +; GFX9-NEXT: ds_write_b16 v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: s_lshr_b32 s1, s2, 16 +; GFX9-NEXT: s_add_u32 s0, s4, 10 +; GFX9-NEXT: ds_write_b16 v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_add_u32 s0, s4, 12 +; GFX9-NEXT: ds_write_b16 v1, v0 +; GFX9-NEXT:
v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: s_lshr_b32 s1, s3, 16 +; GFX9-NEXT: s_add_u32 s0, s4, 14 +; GFX9-NEXT: ds_write_b16 v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: ds_write_b16 v1, v0 +; GFX9-NEXT: s_endpgm +; +; GFX7-LABEL: store_lds_v4i32_align2: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: s_lshr_b32 s5, s0, 16 +; GFX7-NEXT: s_add_u32 s0, s4, 2 +; GFX7-NEXT: ds_write_b16 v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: s_add_u32 s0, s4, 4 +; GFX7-NEXT: ds_write_b16 v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-NEXT: s_lshr_b32 s5, s1, 16 +; GFX7-NEXT: s_add_u32 s0, s4, 6 +; GFX7-NEXT: ds_write_b16 v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_add_u32 s0, s4, 8 +; GFX7-NEXT: ds_write_b16 v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: s_lshr_b32 s1, s2, 16 +; GFX7-NEXT: s_add_u32 s0, s4, 10 +; GFX7-NEXT: ds_write_b16 v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: s_add_u32 s0, s4, 12 +; GFX7-NEXT: ds_write_b16 v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: v_mov_b32_e32 v0, s3 +; GFX7-NEXT: s_lshr_b32 s1, s3, 16 +; GFX7-NEXT: s_add_u32 s0, s4, 14 +; GFX7-NEXT: ds_write_b16 v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: ds_write_b16 v1, v0 +; GFX7-NEXT: s_endpgm +; +; GFX6-LABEL: store_lds_v4i32_align2: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s4
+; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: s_lshr_b32 s5, s0, 16 +; GFX6-NEXT: s_add_u32 s0, s4, 2 +; GFX6-NEXT: ds_write_b16 v1, v0 +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_add_u32 s0, s4, 4 +; GFX6-NEXT: ds_write_b16 v1, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: s_lshr_b32 s5, s1, 16 +; GFX6-NEXT: s_add_u32 s0, s4, 6 +; GFX6-NEXT: ds_write_b16 v1, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_add_u32 s0, s4, 8 +; GFX6-NEXT: ds_write_b16 v1, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_lshr_b32 s1, s2, 16 +; GFX6-NEXT: s_add_u32 s0, s4, 10 +; GFX6-NEXT: ds_write_b16 v1, v0 +; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_add_u32 s0, s4, 12 +; GFX6-NEXT: ds_write_b16 v1, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s3 +; GFX6-NEXT: s_lshr_b32 s1, s3, 16 +; GFX6-NEXT: s_add_u32 s0, s4, 14 +; GFX6-NEXT: ds_write_b16 v1, v0 +; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_write_b16 v1, v0 +; GFX6-NEXT: s_endpgm + store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 2 + ret void +} + +define amdgpu_kernel void @store_lds_v4i32_align4(<4 x i32> addrspace(3)* %out, <4 x i32> %x) { +; GFX9-LABEL: store_lds_v4i32_align4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: ds_write_b128 v4, v[0:3] +; GFX9-NEXT: s_endpgm +; +; GFX7-LABEL: store_lds_v4i32_align4: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT:
s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: ds_write_b128 v4, v[0:3] +; GFX7-NEXT: s_endpgm +; +; GFX6-LABEL: store_lds_v4i32_align4: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: ds_write_b64 v2, v[0:1] +; GFX6-NEXT: s_add_u32 s0, s4, 8 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_write_b64 v2, v[0:1] +; GFX6-NEXT: s_endpgm + store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @store_lds_v4i32_align8(<4 x i32> addrspace(3)* %out, <4 x i32> %x) { +; GFX9-LABEL: store_lds_v4i32_align8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: ds_write_b128 v4, v[0:3] +; GFX9-NEXT: s_endpgm +; +; GFX7-LABEL: store_lds_v4i32_align8: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: ds_write_b128 v4, v[0:3] +; GFX7-NEXT: s_endpgm +; +; GFX6-LABEL: store_lds_v4i32_align8: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9 +; GFX6-NEXT:
s_load_dwordx4 s[0:3], s[0:1], 0xd +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: ds_write_b64 v2, v[0:1] +; GFX6-NEXT: s_add_u32 s0, s4, 8 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_write_b64 v2, v[0:1] +; GFX6-NEXT: s_endpgm + store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 8 + ret void +} + +define amdgpu_kernel void @store_lds_v4i32_align16(<4 x i32> addrspace(3)* %out, <4 x i32> %x) { +; GFX9-LABEL: store_lds_v4i32_align16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: ds_write_b128 v4, v[0:3] +; GFX9-NEXT: s_endpgm +; +; GFX7-LABEL: store_lds_v4i32_align16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: ds_write_b128 v4, v[0:3] +; GFX7-NEXT: s_endpgm +; +; GFX6-LABEL: store_lds_v4i32_align16: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: ds_write_b64 v2, v[0:1] +; GFX6-NEXT: s_add_u32 s0, s4, 8 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: v_mov_b32_e32 v2, s0
+; GFX6-NEXT: ds_write_b64 v2, v[0:1] +; GFX6-NEXT: s_endpgm + store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 16 + ret void +} Index: llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll @@ -0,0 +1,487 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=+unaligned-buffer-access < %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=-unaligned-buffer-access < %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -mattr=+unaligned-buffer-access < %s | FileCheck -check-prefix=GFX7 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -mattr=-unaligned-buffer-access < %s | FileCheck -check-prefix=GFX7 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -mattr=+unaligned-buffer-access < %s | FileCheck -check-prefix=GFX6 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -mattr=-unaligned-buffer-access < %s | FileCheck -check-prefix=GFX6 %s + +define amdgpu_kernel void @store_lds_v3i32(<3 x i32> addrspace(3)* %out, <3 x i32> %x) { +; GFX9-LABEL: store_lds_v3i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: ds_write_b96 v3, v[0:2] +; GFX9-NEXT: s_endpgm +; +; GFX7-LABEL: store_lds_v3i32: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd
+; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v3, s4 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: ds_write_b96 v3, v[0:2] +; GFX7-NEXT: s_endpgm +; +; GFX6-LABEL: store_lds_v3i32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_add_u32 s0, s4, 8 +; GFX6-NEXT: ds_write_b64 v2, v[0:1] +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_write_b32 v1, v0 +; GFX6-NEXT: s_endpgm + store <3 x i32> %x, <3 x i32> addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @store_lds_v3i32_align1(<3 x i32> addrspace(3)* %out, <3 x i32> %x) { +; GFX9-LABEL: store_lds_v3i32_align1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: s_lshr_b32 s3, s0, 8 +; GFX9-NEXT: s_lshr_b32 s5, s0, 16 +; GFX9-NEXT: s_lshr_b32 s6, s0, 24 +; GFX9-NEXT: s_add_u32 s0, s4, 1 +; GFX9-NEXT: ds_write_b8 v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_add_u32 s0, s4, 2 +; GFX9-NEXT: ds_write_b8 v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_add_u32 s0, s4, 3 +; GFX9-NEXT: ds_write_b8 v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_add_u32 s0, s4, 4 +; GFX9-NEXT: ds_write_b8 v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: s_lshr_b32 s3, s1, 8 +; GFX9-NEXT: s_add_u32 s0, s4, 5 +; GFX9-NEXT: ds_write_b8 v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; 
GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_lshr_b32 s5, s1, 16 +; GFX9-NEXT: s_add_u32 s0, s4, 6 +; GFX9-NEXT: ds_write_b8 v1, v0 +; GFX9-NEXT: s_lshr_b32 s6, s1, 24 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_add_u32 s0, s4, 7 +; GFX9-NEXT: ds_write_b8 v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: s_add_u32 s0, s4, 8 +; GFX9-NEXT: ds_write_b8 v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: s_lshr_b32 s1, s2, 8 +; GFX9-NEXT: s_add_u32 s0, s4, 9 +; GFX9-NEXT: ds_write_b8 v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: s_lshr_b32 s3, s2, 16 +; GFX9-NEXT: s_add_u32 s0, s4, 10 +; GFX9-NEXT: ds_write_b8 v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: s_lshr_b32 s5, s2, 24 +; GFX9-NEXT: s_add_u32 s0, s4, 11 +; GFX9-NEXT: ds_write_b8 v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: ds_write_b8 v1, v0 +; GFX9-NEXT: s_endpgm +; +; GFX7-LABEL: store_lds_v3i32_align1: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: s_lshr_b32 s3, s0, 8 +; GFX7-NEXT: s_lshr_b32 s5, s0, 16 +; GFX7-NEXT: s_lshr_b32 s6, s0, 24 +; GFX7-NEXT: s_add_u32 s0, s4, 1 +; GFX7-NEXT: ds_write_b8 v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s3 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: s_add_u32 s0, s4, 2 +; GFX7-NEXT: ds_write_b8 v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: s_add_u32 s0, s4, 3 +; GFX7-NEXT: ds_write_b8 v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: s_add_u32 s0, s4, 4 +; GFX7-NEXT: ds_write_b8 v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 
+; GFX7-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-NEXT: s_lshr_b32 s3, s1, 8 +; GFX7-NEXT: s_add_u32 s0, s4, 5 +; GFX7-NEXT: ds_write_b8 v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s3 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: s_lshr_b32 s5, s1, 16 +; GFX7-NEXT: s_add_u32 s0, s4, 6 +; GFX7-NEXT: ds_write_b8 v1, v0 +; GFX7-NEXT: s_lshr_b32 s6, s1, 24 +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: s_add_u32 s0, s4, 7 +; GFX7-NEXT: ds_write_b8 v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_add_u32 s0, s4, 8 +; GFX7-NEXT: ds_write_b8 v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: s_lshr_b32 s1, s2, 8 +; GFX7-NEXT: s_add_u32 s0, s4, 9 +; GFX7-NEXT: ds_write_b8 v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-NEXT: s_lshr_b32 s3, s2, 16 +; GFX7-NEXT: s_add_u32 s0, s4, 10 +; GFX7-NEXT: ds_write_b8 v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: v_mov_b32_e32 v0, s3 +; GFX7-NEXT: s_lshr_b32 s5, s2, 24 +; GFX7-NEXT: s_add_u32 s0, s4, 11 +; GFX7-NEXT: ds_write_b8 v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: ds_write_b8 v1, v0 +; GFX7-NEXT: s_endpgm +; +; GFX6-LABEL: store_lds_v3i32_align1: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: s_lshr_b32 s3, s0, 8 +; GFX6-NEXT: s_lshr_b32 s5, s0, 16 +; GFX6-NEXT: s_lshr_b32 s6, s0, 24 +; GFX6-NEXT: s_add_u32 s0, s4, 1 +; GFX6-NEXT: ds_write_b8 v1, v0 +; GFX6-NEXT: v_mov_b32_e32 v0, s3 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_add_u32 s0, s4, 2 +; GFX6-NEXT: ds_write_b8 v1, v0 +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_add_u32 s0, s4, 3 +; GFX6-NEXT: ds_write_b8 v1, 
v0 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_add_u32 s0, s4, 4 +; GFX6-NEXT: ds_write_b8 v1, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: s_lshr_b32 s3, s1, 8 +; GFX6-NEXT: s_add_u32 s0, s4, 5 +; GFX6-NEXT: ds_write_b8 v1, v0 +; GFX6-NEXT: v_mov_b32_e32 v0, s3 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_lshr_b32 s5, s1, 16 +; GFX6-NEXT: s_add_u32 s0, s4, 6 +; GFX6-NEXT: ds_write_b8 v1, v0 +; GFX6-NEXT: s_lshr_b32 s6, s1, 24 +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_add_u32 s0, s4, 7 +; GFX6-NEXT: ds_write_b8 v1, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_add_u32 s0, s4, 8 +; GFX6-NEXT: ds_write_b8 v1, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_lshr_b32 s1, s2, 8 +; GFX6-NEXT: s_add_u32 s0, s4, 9 +; GFX6-NEXT: ds_write_b8 v1, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: s_lshr_b32 s3, s2, 16 +; GFX6-NEXT: s_add_u32 s0, s4, 10 +; GFX6-NEXT: ds_write_b8 v1, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s3 +; GFX6-NEXT: s_lshr_b32 s5, s2, 24 +; GFX6-NEXT: s_add_u32 s0, s4, 11 +; GFX6-NEXT: ds_write_b8 v1, v0 +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_write_b8 v1, v0 +; GFX6-NEXT: s_endpgm + store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 1 + ret void +} + +define amdgpu_kernel void @store_lds_v3i32_align2(<3 x i32> addrspace(3)* %out, <3 x i32> %x) { +; GFX9-LABEL: store_lds_v3i32_align2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: s_lshr_b32 s3, s0, 16 +; GFX9-NEXT: s_add_u32 s0, s4, 2 +; GFX9-NEXT: ds_write_b16 v1, v0 +; GFX9-NEXT: 
v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_add_u32 s0, s4, 4 +; GFX9-NEXT: ds_write_b16 v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: s_lshr_b32 s3, s1, 16 +; GFX9-NEXT: s_add_u32 s0, s4, 6 +; GFX9-NEXT: ds_write_b16 v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: s_add_u32 s0, s4, 8 +; GFX9-NEXT: ds_write_b16 v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: s_lshr_b32 s1, s2, 16 +; GFX9-NEXT: s_add_u32 s0, s4, 10 +; GFX9-NEXT: ds_write_b16 v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: ds_write_b16 v1, v0 +; GFX9-NEXT: s_endpgm +; +; GFX7-LABEL: store_lds_v3i32_align2: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: s_lshr_b32 s3, s0, 16 +; GFX7-NEXT: s_add_u32 s0, s4, 2 +; GFX7-NEXT: ds_write_b16 v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s3 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: s_add_u32 s0, s4, 4 +; GFX7-NEXT: ds_write_b16 v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-NEXT: s_lshr_b32 s3, s1, 16 +; GFX7-NEXT: s_add_u32 s0, s4, 6 +; GFX7-NEXT: ds_write_b16 v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: v_mov_b32_e32 v0, s3 +; GFX7-NEXT: s_add_u32 s0, s4, 8 +; GFX7-NEXT: ds_write_b16 v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: s_lshr_b32 s1, s2, 16 +; GFX7-NEXT: s_add_u32 s0, s4, 10 +; GFX7-NEXT: ds_write_b16 v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: ds_write_b16 v1, v0 +; GFX7-NEXT: s_endpgm +; +; GFX6-LABEL: store_lds_v3i32_align2: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9 +; GFX6-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0xd +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: s_lshr_b32 s3, s0, 16 +; GFX6-NEXT: s_add_u32 s0, s4, 2 +; GFX6-NEXT: ds_write_b16 v1, v0 +; GFX6-NEXT: v_mov_b32_e32 v0, s3 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_add_u32 s0, s4, 4 +; GFX6-NEXT: ds_write_b16 v1, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: s_lshr_b32 s3, s1, 16 +; GFX6-NEXT: s_add_u32 s0, s4, 6 +; GFX6-NEXT: ds_write_b16 v1, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s3 +; GFX6-NEXT: s_add_u32 s0, s4, 8 +; GFX6-NEXT: ds_write_b16 v1, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_lshr_b32 s1, s2, 16 +; GFX6-NEXT: s_add_u32 s0, s4, 10 +; GFX6-NEXT: ds_write_b16 v1, v0 +; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_write_b16 v1, v0 +; GFX6-NEXT: s_endpgm + store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 2 + ret void +} + +define amdgpu_kernel void @store_lds_v3i32_align4(<3 x i32> addrspace(3)* %out, <3 x i32> %x) { +; GFX9-LABEL: store_lds_v3i32_align4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: ds_write_b96 v3, v[0:2] +; GFX9-NEXT: s_endpgm +; +; GFX7-LABEL: store_lds_v3i32_align4: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v3, s4 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: ds_write_b96 v3, v[0:2] +; GFX7-NEXT: 
s_endpgm +; +; GFX6-LABEL: store_lds_v3i32_align4: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_add_u32 s0, s4, 8 +; GFX6-NEXT: ds_write_b64 v2, v[0:1] +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_write_b32 v1, v0 +; GFX6-NEXT: s_endpgm + store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @store_lds_v3i32_align8(<3 x i32> addrspace(3)* %out, <3 x i32> %x) { +; GFX9-LABEL: store_lds_v3i32_align8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: ds_write_b96 v3, v[0:2] +; GFX9-NEXT: s_endpgm +; +; GFX7-LABEL: store_lds_v3i32_align8: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v3, s4 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: ds_write_b96 v3, v[0:2] +; GFX7-NEXT: s_endpgm +; +; GFX6-LABEL: store_lds_v3i32_align8: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_add_u32 s0, s4, 8 +; GFX6-NEXT: ds_write_b64 v2, v[0:1] +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: 
ds_write_b32 v1, v0 +; GFX6-NEXT: s_endpgm + store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 8 + ret void +} + +define amdgpu_kernel void @store_lds_v3i32_align16(<3 x i32> addrspace(3)* %out, <3 x i32> %x) { +; GFX9-LABEL: store_lds_v3i32_align16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: ds_write_b96 v3, v[0:2] +; GFX9-NEXT: s_endpgm +; +; GFX7-LABEL: store_lds_v3i32_align16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v3, s4 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: ds_write_b96 v3, v[0:2] +; GFX7-NEXT: s_endpgm +; +; GFX6-LABEL: store_lds_v3i32_align16: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_add_u32 s0, s4, 8 +; GFX6-NEXT: ds_write_b64 v2, v[0:1] +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_write_b32 v1, v0 +; GFX6-NEXT: s_endpgm + store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 16 + ret void +}