Index: llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -272,7 +272,7 @@
   case 128:
     break;
   case 96:
-    if (!ST.hasDwordx3LoadStores())
+    if (!ST.hasDwordx3LoadStores() || AS == AMDGPUAS::LOCAL_ADDRESS)
       return false;
     break;
   case 256:
@@ -865,7 +865,7 @@
       // TODO: May be able to widen depending on alignment etc.
       unsigned NumRegs = (MemSize + 31) / 32;
       if (NumRegs == 3) {
-        if (!ST.hasDwordx3LoadStores())
+        if (!ST.hasDwordx3LoadStores() || AS == AMDGPUAS::LOCAL_ADDRESS)
           return true;
       } else {
         // If the alignment allows, these should have been widened.
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-local.mir
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-local.mir
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-local.mir
@@ -1728,16 +1728,34 @@
     ; CI: $vgpr0_vgpr1_vgpr2 = COPY [[INSERT1]](s96)
    ; CI-DS128-LABEL: name: test_load_local_s96_align8
    ; CI-DS128: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
-    ; CI-DS128: [[LOAD:%[0-9]+]]:_(s96) = G_LOAD [[COPY]](p3) :: (load 12, align 8, addrspace 3)
-    ; CI-DS128: $vgpr0_vgpr1_vgpr2 = COPY [[LOAD]](s96)
+    ; CI-DS128: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p3) :: (load 8, addrspace 3)
+    ; CI-DS128: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; CI-DS128: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; CI-DS128: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 4 + 8, align 8, addrspace 3)
+    ; CI-DS128: [[DEF:%[0-9]+]]:_(s96) = G_IMPLICIT_DEF
+    ; CI-DS128: [[INSERT:%[0-9]+]]:_(s96) = G_INSERT [[DEF]], [[LOAD]](s64), 0
+    ; CI-DS128: [[INSERT1:%[0-9]+]]:_(s96) = G_INSERT [[INSERT]], [[LOAD1]](s32), 64
+    ; CI-DS128: $vgpr0_vgpr1_vgpr2 = COPY [[INSERT1]](s96)
    ; VI-LABEL: name: test_load_local_s96_align8
    ; VI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
-    ; VI: [[LOAD:%[0-9]+]]:_(s96) = G_LOAD [[COPY]](p3) :: (load 12, align 8, addrspace 3)
-    ; VI: $vgpr0_vgpr1_vgpr2 = COPY [[LOAD]](s96)
+    ; VI: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p3) :: (load 8, addrspace 3)
+    ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; VI: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 4 + 8, align 8, addrspace 3)
+    ; VI: [[DEF:%[0-9]+]]:_(s96) = G_IMPLICIT_DEF
+    ; VI: [[INSERT:%[0-9]+]]:_(s96) = G_INSERT [[DEF]], [[LOAD]](s64), 0
+    ; VI: [[INSERT1:%[0-9]+]]:_(s96) = G_INSERT [[INSERT]], [[LOAD1]](s32), 64
+    ; VI: $vgpr0_vgpr1_vgpr2 = COPY [[INSERT1]](s96)
    ; GFX9-LABEL: name: test_load_local_s96_align8
    ; GFX9: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
-    ; GFX9: [[LOAD:%[0-9]+]]:_(s96) = G_LOAD [[COPY]](p3) :: (load 12, align 8, addrspace 3)
-    ; GFX9: $vgpr0_vgpr1_vgpr2 = COPY [[LOAD]](s96)
+    ; GFX9: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p3) :: (load 8, addrspace 3)
+    ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX9: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; GFX9: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 4 + 8, align 8, addrspace 3)
+    ; GFX9: [[DEF:%[0-9]+]]:_(s96) = G_IMPLICIT_DEF
+    ; GFX9: [[INSERT:%[0-9]+]]:_(s96) = G_INSERT [[DEF]], [[LOAD]](s64), 0
+    ; GFX9: [[INSERT1:%[0-9]+]]:_(s96) = G_INSERT [[INSERT]], [[LOAD1]](s32), 64
+    ; GFX9: $vgpr0_vgpr1_vgpr2 = COPY [[INSERT1]](s96)
    %0:_(p3) = COPY $vgpr0
    %1:_(s96) = G_LOAD %0 :: (load 12, align 8, addrspace 3)
    $vgpr0_vgpr1_vgpr2 = COPY %1
@@ -1771,16 +1789,34 @@
    ; CI: $vgpr0_vgpr1_vgpr2 = COPY [[INSERT1]](s96)
    ; CI-DS128-LABEL: name: test_load_local_s96_align4
    ; CI-DS128: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
-    ; CI-DS128: [[LOAD:%[0-9]+]]:_(s96) = G_LOAD [[COPY]](p3) :: (load 12, align 4, addrspace 3)
-    ; CI-DS128: $vgpr0_vgpr1_vgpr2 = COPY [[LOAD]](s96)
+    ; CI-DS128: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3)
+    ; CI-DS128: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; CI-DS128: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; CI-DS128: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 4 + 8, addrspace 3)
+    ; CI-DS128: [[DEF:%[0-9]+]]:_(s96) = G_IMPLICIT_DEF
+    ; CI-DS128: [[INSERT:%[0-9]+]]:_(s96) = G_INSERT [[DEF]], [[LOAD]](s64), 0
+    ; CI-DS128: [[INSERT1:%[0-9]+]]:_(s96) = G_INSERT [[INSERT]], [[LOAD1]](s32), 64
+    ; CI-DS128: $vgpr0_vgpr1_vgpr2 = COPY [[INSERT1]](s96)
    ; VI-LABEL: name: test_load_local_s96_align4
    ; VI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
-    ; VI: [[LOAD:%[0-9]+]]:_(s96) = G_LOAD [[COPY]](p3) :: (load 12, align 4, addrspace 3)
-    ; VI: $vgpr0_vgpr1_vgpr2 = COPY [[LOAD]](s96)
+    ; VI: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3)
+    ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; VI: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 4 + 8, addrspace 3)
+    ; VI: [[DEF:%[0-9]+]]:_(s96) = G_IMPLICIT_DEF
+    ; VI: [[INSERT:%[0-9]+]]:_(s96) = G_INSERT [[DEF]], [[LOAD]](s64), 0
+    ; VI: [[INSERT1:%[0-9]+]]:_(s96) = G_INSERT [[INSERT]], [[LOAD1]](s32), 64
+    ; VI: $vgpr0_vgpr1_vgpr2 = COPY [[INSERT1]](s96)
    ; GFX9-LABEL: name: test_load_local_s96_align4
    ; GFX9: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
-    ; GFX9: [[LOAD:%[0-9]+]]:_(s96) = G_LOAD [[COPY]](p3) :: (load 12, align 4, addrspace 3)
-    ; GFX9: $vgpr0_vgpr1_vgpr2 = COPY [[LOAD]](s96)
+    ; GFX9: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3)
+    ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX9: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; GFX9: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 4 + 8, addrspace 3)
+    ; GFX9: [[DEF:%[0-9]+]]:_(s96) = G_IMPLICIT_DEF
+    ; GFX9: [[INSERT:%[0-9]+]]:_(s96) = G_INSERT [[DEF]], [[LOAD]](s64), 0
+    ; GFX9: [[INSERT1:%[0-9]+]]:_(s96) = G_INSERT [[INSERT]], [[LOAD1]](s32), 64
+    ; GFX9: $vgpr0_vgpr1_vgpr2 = COPY [[INSERT1]](s96)
    %0:_(p3) = COPY $vgpr0
    %1:_(s96) = G_LOAD %0 :: (load 12, align 4, addrspace 3)
    $vgpr0_vgpr1_vgpr2 = COPY %1
@@ -8121,16 +8157,34 @@
    ; CI: $vgpr0_vgpr1_vgpr2 = COPY [[INSERT1]](<3 x s32>)
    ; CI-DS128-LABEL: name: test_load_local_v3s32_align4
    ; CI-DS128: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
-    ; CI-DS128: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p3) :: (load 12, align 4, addrspace 3)
-    ; CI-DS128: $vgpr0_vgpr1_vgpr2 = COPY [[LOAD]](<3 x s32>)
+    ; CI-DS128: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3)
+    ; CI-DS128: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; CI-DS128: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; CI-DS128: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 4 + 8, addrspace 3)
+    ; CI-DS128: [[DEF:%[0-9]+]]:_(<3 x s32>) = G_IMPLICIT_DEF
+    ; CI-DS128: [[INSERT:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[DEF]], [[LOAD]](<2 x s32>), 0
+    ; CI-DS128: [[INSERT1:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[INSERT]], [[LOAD1]](s32), 64
+    ; CI-DS128: $vgpr0_vgpr1_vgpr2 = COPY [[INSERT1]](<3 x s32>)
    ; VI-LABEL: name: test_load_local_v3s32_align4
    ; VI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
-    ; VI: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p3) :: (load 12, align 4, addrspace 3)
-    ; VI: $vgpr0_vgpr1_vgpr2 = COPY [[LOAD]](<3 x s32>)
+    ; VI: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3)
+    ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; VI: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 4 + 8, addrspace 3)
+    ; VI: [[DEF:%[0-9]+]]:_(<3 x s32>) = G_IMPLICIT_DEF
+    ; VI: [[INSERT:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[DEF]], [[LOAD]](<2 x s32>), 0
+    ; VI: [[INSERT1:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[INSERT]], [[LOAD1]](s32), 64
+    ; VI: $vgpr0_vgpr1_vgpr2 = COPY [[INSERT1]](<3 x s32>)
    ; GFX9-LABEL: name: test_load_local_v3s32_align4
    ; GFX9: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
-    ; GFX9: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p3) :: (load 12, align 4, addrspace 3)
-    ; GFX9: $vgpr0_vgpr1_vgpr2 = COPY [[LOAD]](<3 x s32>)
+    ; GFX9: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3)
+    ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX9: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; GFX9: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 4 + 8, addrspace 3)
+    ; GFX9: [[DEF:%[0-9]+]]:_(<3 x s32>) = G_IMPLICIT_DEF
+    ; GFX9: [[INSERT:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[DEF]], [[LOAD]](<2 x s32>), 0
+    ; GFX9: [[INSERT1:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[INSERT]], [[LOAD1]](s32), 64
+    ; GFX9: $vgpr0_vgpr1_vgpr2 = COPY [[INSERT1]](<3 x s32>)
    %0:_(p3) = COPY $vgpr0
    %1:_(<3 x s32>) = G_LOAD %0 :: (load 12, align 4, addrspace 3)
    $vgpr0_vgpr1_vgpr2 = COPY %1
@@ -11749,34 +11803,67 @@
    ; CI: $vgpr3_vgpr4_vgpr5 = COPY [[COPY3]](s96)
    ; CI-DS128-LABEL: name: test_extload_local_v2s96_from_24_align4
    ; CI-DS128: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
-    ; CI-DS128: [[LOAD:%[0-9]+]]:_(s96) = G_LOAD [[COPY]](p3) :: (load 12, align 4, addrspace 3)
-    ; CI-DS128: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+    ; CI-DS128: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3)
+    ; CI-DS128: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
    ; CI-DS128: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; CI-DS128: [[LOAD1:%[0-9]+]]:_(s96) = G_LOAD [[PTR_ADD]](p3) :: (load 12 + 12, align 4, addrspace 3)
-    ; CI-DS128: [[COPY1:%[0-9]+]]:_(s96) = COPY [[LOAD]](s96)
-    ; CI-DS128: [[COPY2:%[0-9]+]]:_(s96) = COPY [[LOAD1]](s96)
-    ; CI-DS128: $vgpr0_vgpr1_vgpr2 = COPY [[COPY1]](s96)
-    ; CI-DS128: $vgpr3_vgpr4_vgpr5 = COPY [[COPY2]](s96)
+    ; CI-DS128: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 4 + 8, addrspace 3)
+    ; CI-DS128: [[DEF:%[0-9]+]]:_(s96) = G_IMPLICIT_DEF
+    ; CI-DS128: [[COPY1:%[0-9]+]]:_(s96) = COPY [[DEF]](s96)
+    ; CI-DS128: [[INSERT:%[0-9]+]]:_(s96) = G_INSERT [[COPY1]], [[LOAD]](s64), 0
+    ; CI-DS128: [[INSERT1:%[0-9]+]]:_(s96) = G_INSERT [[INSERT]], [[LOAD1]](s32), 64
+    ; CI-DS128: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+    ; CI-DS128: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C1]](s32)
+    ; CI-DS128: [[LOAD2:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD1]](p3) :: (load 8 + 12, align 4, addrspace 3)
+    ; CI-DS128: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
+    ; CI-DS128: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load 4 + 20, addrspace 3)
+    ; CI-DS128: [[INSERT2:%[0-9]+]]:_(s96) = G_INSERT [[DEF]], [[LOAD2]](s64), 0
+    ; CI-DS128: [[INSERT3:%[0-9]+]]:_(s96) = G_INSERT [[INSERT2]], [[LOAD3]](s32), 64
+    ; CI-DS128: [[COPY2:%[0-9]+]]:_(s96) = COPY [[INSERT1]](s96)
+    ; CI-DS128: [[COPY3:%[0-9]+]]:_(s96) = COPY [[INSERT3]](s96)
+    ; CI-DS128: $vgpr0_vgpr1_vgpr2 = COPY [[COPY2]](s96)
+    ; CI-DS128: $vgpr3_vgpr4_vgpr5 = COPY [[COPY3]](s96)
    ; VI-LABEL: name: test_extload_local_v2s96_from_24_align4
    ; VI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
-    ; VI: [[LOAD:%[0-9]+]]:_(s96) = G_LOAD [[COPY]](p3) :: (load 12, align 4, addrspace 3)
-    ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+    ; VI: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3)
+    ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
    ; VI: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; VI: [[LOAD1:%[0-9]+]]:_(s96) = G_LOAD [[PTR_ADD]](p3) :: (load 12 + 12, align 4, addrspace 3)
-    ; VI: [[COPY1:%[0-9]+]]:_(s96) = COPY [[LOAD]](s96)
-    ; VI: [[COPY2:%[0-9]+]]:_(s96) = COPY [[LOAD1]](s96)
-    ; VI: $vgpr0_vgpr1_vgpr2 = COPY [[COPY1]](s96)
-    ; VI: $vgpr3_vgpr4_vgpr5 = COPY [[COPY2]](s96)
+    ; VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 4 + 8, addrspace 3)
+    ; VI: [[DEF:%[0-9]+]]:_(s96) = G_IMPLICIT_DEF
+    ; VI: [[COPY1:%[0-9]+]]:_(s96) = COPY [[DEF]](s96)
+    ; VI: [[INSERT:%[0-9]+]]:_(s96) = G_INSERT [[COPY1]], [[LOAD]](s64), 0
+    ; VI: [[INSERT1:%[0-9]+]]:_(s96) = G_INSERT [[INSERT]], [[LOAD1]](s32), 64
+    ; VI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+    ; VI: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C1]](s32)
+    ; VI: [[LOAD2:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD1]](p3) :: (load 8 + 12, align 4, addrspace 3)
+    ; VI: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
+    ; VI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load 4 + 20, addrspace 3)
+    ; VI: [[INSERT2:%[0-9]+]]:_(s96) = G_INSERT [[DEF]], [[LOAD2]](s64), 0
+    ; VI: [[INSERT3:%[0-9]+]]:_(s96) = G_INSERT [[INSERT2]], [[LOAD3]](s32), 64
+    ; VI: [[COPY2:%[0-9]+]]:_(s96) = COPY [[INSERT1]](s96)
+    ; VI: [[COPY3:%[0-9]+]]:_(s96) = COPY [[INSERT3]](s96)
+    ; VI: $vgpr0_vgpr1_vgpr2 = COPY [[COPY2]](s96)
+    ; VI: $vgpr3_vgpr4_vgpr5 = COPY [[COPY3]](s96)
    ; GFX9-LABEL: name: test_extload_local_v2s96_from_24_align4
    ; GFX9: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
-    ; GFX9: [[LOAD:%[0-9]+]]:_(s96) = G_LOAD [[COPY]](p3) :: (load 12, align 4, addrspace 3)
-    ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+    ; GFX9: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3)
+    ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
    ; GFX9: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX9: [[LOAD1:%[0-9]+]]:_(s96) = G_LOAD [[PTR_ADD]](p3) :: (load 12 + 12, align 4, addrspace 3)
-    ; GFX9: [[COPY1:%[0-9]+]]:_(s96) = COPY [[LOAD]](s96)
-    ; GFX9: [[COPY2:%[0-9]+]]:_(s96) = COPY [[LOAD1]](s96)
-    ; GFX9: $vgpr0_vgpr1_vgpr2 = COPY [[COPY1]](s96)
-    ; GFX9: $vgpr3_vgpr4_vgpr5 = COPY [[COPY2]](s96)
+    ; GFX9: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 4 + 8, addrspace 3)
+    ; GFX9: [[DEF:%[0-9]+]]:_(s96) = G_IMPLICIT_DEF
+    ; GFX9: [[COPY1:%[0-9]+]]:_(s96) = COPY [[DEF]](s96)
+    ; GFX9: [[INSERT:%[0-9]+]]:_(s96) = G_INSERT [[COPY1]], [[LOAD]](s64), 0
+    ; GFX9: [[INSERT1:%[0-9]+]]:_(s96) = G_INSERT [[INSERT]], [[LOAD1]](s32), 64
+    ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+    ; GFX9: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C1]](s32)
+    ; GFX9: [[LOAD2:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD1]](p3) :: (load 8 + 12, align 4, addrspace 3)
+    ; GFX9: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
+    ; GFX9: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load 4 + 20, addrspace 3)
+    ; GFX9: [[INSERT2:%[0-9]+]]:_(s96) = G_INSERT [[DEF]], [[LOAD2]](s64), 0
+    ; GFX9: [[INSERT3:%[0-9]+]]:_(s96) = G_INSERT [[INSERT2]], [[LOAD3]](s32), 64
+    ; GFX9: [[COPY2:%[0-9]+]]:_(s96) = COPY [[INSERT1]](s96)
+    ; GFX9: [[COPY3:%[0-9]+]]:_(s96) = COPY [[INSERT3]](s96)
+    ; GFX9: $vgpr0_vgpr1_vgpr2 = COPY [[COPY2]](s96)
+    ; GFX9: $vgpr3_vgpr4_vgpr5 = COPY [[COPY3]](s96)
    %0:_(p3) = COPY $vgpr0
    %1:_(<2 x s96>) = G_LOAD %0 :: (load 24, align 4, addrspace 3)
    %2:_(s96) = G_EXTRACT %1, 0
@@ -11835,34 +11922,67 @@
    ; CI: $vgpr3_vgpr4_vgpr5 = COPY [[COPY3]](s96)
    ; CI-DS128-LABEL: name: test_extload_local_v2s96_from_24_align16
    ; CI-DS128: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
-    ; CI-DS128: [[LOAD:%[0-9]+]]:_(s96) = G_LOAD [[COPY]](p3) :: (load 12, align 16, addrspace 3)
-    ; CI-DS128: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+    ; CI-DS128: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p3) :: (load 8, align 16, addrspace 3)
+    ; CI-DS128: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
    ; CI-DS128: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; CI-DS128: [[LOAD1:%[0-9]+]]:_(s96) = G_LOAD [[PTR_ADD]](p3) :: (load 12 + 12, align 4, addrspace 3)
-    ; CI-DS128: [[COPY1:%[0-9]+]]:_(s96) = COPY [[LOAD]](s96)
-    ; CI-DS128: [[COPY2:%[0-9]+]]:_(s96) = COPY [[LOAD1]](s96)
-    ; CI-DS128: $vgpr0_vgpr1_vgpr2 = COPY [[COPY1]](s96)
-    ; CI-DS128: $vgpr3_vgpr4_vgpr5 = COPY [[COPY2]](s96)
+    ; CI-DS128: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 4 + 8, align 8, addrspace 3)
+    ; CI-DS128: [[DEF:%[0-9]+]]:_(s96) = G_IMPLICIT_DEF
+    ; CI-DS128: [[COPY1:%[0-9]+]]:_(s96) = COPY [[DEF]](s96)
+    ; CI-DS128: [[INSERT:%[0-9]+]]:_(s96) = G_INSERT [[COPY1]], [[LOAD]](s64), 0
+    ; CI-DS128: [[INSERT1:%[0-9]+]]:_(s96) = G_INSERT [[INSERT]], [[LOAD1]](s32), 64
+    ; CI-DS128: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+    ; CI-DS128: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C1]](s32)
+    ; CI-DS128: [[LOAD2:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD1]](p3) :: (load 8 + 12, align 4, addrspace 3)
+    ; CI-DS128: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
+    ; CI-DS128: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load 4 + 20, addrspace 3)
+    ; CI-DS128: [[INSERT2:%[0-9]+]]:_(s96) = G_INSERT [[DEF]], [[LOAD2]](s64), 0
+    ; CI-DS128: [[INSERT3:%[0-9]+]]:_(s96) = G_INSERT [[INSERT2]], [[LOAD3]](s32), 64
+    ; CI-DS128: [[COPY2:%[0-9]+]]:_(s96) = COPY [[INSERT1]](s96)
+    ; CI-DS128: [[COPY3:%[0-9]+]]:_(s96) = COPY [[INSERT3]](s96)
+    ; CI-DS128: $vgpr0_vgpr1_vgpr2 = COPY [[COPY2]](s96)
+    ; CI-DS128: $vgpr3_vgpr4_vgpr5 = COPY [[COPY3]](s96)
    ; VI-LABEL: name: test_extload_local_v2s96_from_24_align16
    ; VI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
-    ; VI: [[LOAD:%[0-9]+]]:_(s96) = G_LOAD [[COPY]](p3) :: (load 12, align 16, addrspace 3)
-    ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+    ; VI: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p3) :: (load 8, align 16, addrspace 3)
+    ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
    ; VI: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; VI: [[LOAD1:%[0-9]+]]:_(s96) = G_LOAD [[PTR_ADD]](p3) :: (load 12 + 12, align 4, addrspace 3)
-    ; VI: [[COPY1:%[0-9]+]]:_(s96) = COPY [[LOAD]](s96)
-    ; VI: [[COPY2:%[0-9]+]]:_(s96) = COPY [[LOAD1]](s96)
-    ; VI: $vgpr0_vgpr1_vgpr2 = COPY [[COPY1]](s96)
-    ; VI: $vgpr3_vgpr4_vgpr5 = COPY [[COPY2]](s96)
+    ; VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 4 + 8, align 8, addrspace 3)
+    ; VI: [[DEF:%[0-9]+]]:_(s96) = G_IMPLICIT_DEF
+    ; VI: [[COPY1:%[0-9]+]]:_(s96) = COPY [[DEF]](s96)
+    ; VI: [[INSERT:%[0-9]+]]:_(s96) = G_INSERT [[COPY1]], [[LOAD]](s64), 0
+    ; VI: [[INSERT1:%[0-9]+]]:_(s96) = G_INSERT [[INSERT]], [[LOAD1]](s32), 64
+    ; VI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+    ; VI: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C1]](s32)
+    ; VI: [[LOAD2:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD1]](p3) :: (load 8 + 12, align 4, addrspace 3)
+    ; VI: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
+    ; VI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load 4 + 20, addrspace 3)
+    ; VI: [[INSERT2:%[0-9]+]]:_(s96) = G_INSERT [[DEF]], [[LOAD2]](s64), 0
+    ; VI: [[INSERT3:%[0-9]+]]:_(s96) = G_INSERT [[INSERT2]], [[LOAD3]](s32), 64
+    ; VI: [[COPY2:%[0-9]+]]:_(s96) = COPY [[INSERT1]](s96)
+    ; VI: [[COPY3:%[0-9]+]]:_(s96) = COPY [[INSERT3]](s96)
+    ; VI: $vgpr0_vgpr1_vgpr2 = COPY [[COPY2]](s96)
+    ; VI: $vgpr3_vgpr4_vgpr5 = COPY [[COPY3]](s96)
    ; GFX9-LABEL: name: test_extload_local_v2s96_from_24_align16
    ; GFX9: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
-    ; GFX9: [[LOAD:%[0-9]+]]:_(s96) = G_LOAD [[COPY]](p3) :: (load 12, align 16, addrspace 3)
-    ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+    ; GFX9: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p3) :: (load 8, align 16, addrspace 3)
+    ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
    ; GFX9: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX9: [[LOAD1:%[0-9]+]]:_(s96) = G_LOAD [[PTR_ADD]](p3) :: (load 12 + 12, align 4, addrspace 3)
-    ; GFX9: [[COPY1:%[0-9]+]]:_(s96) = COPY [[LOAD]](s96)
-    ; GFX9: [[COPY2:%[0-9]+]]:_(s96) = COPY [[LOAD1]](s96)
-    ; GFX9: $vgpr0_vgpr1_vgpr2 = COPY [[COPY1]](s96)
-    ; GFX9: $vgpr3_vgpr4_vgpr5 = COPY [[COPY2]](s96)
+    ; GFX9: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 4 + 8, align 8, addrspace 3)
+    ; GFX9: [[DEF:%[0-9]+]]:_(s96) = G_IMPLICIT_DEF
+    ; GFX9: [[COPY1:%[0-9]+]]:_(s96) = COPY [[DEF]](s96)
+    ; GFX9: [[INSERT:%[0-9]+]]:_(s96) = G_INSERT [[COPY1]], [[LOAD]](s64), 0
+    ; GFX9: [[INSERT1:%[0-9]+]]:_(s96) = G_INSERT [[INSERT]], [[LOAD1]](s32), 64
+    ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+    ; GFX9: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C1]](s32)
+    ; GFX9: [[LOAD2:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD1]](p3) :: (load 8 + 12, align 4, addrspace 3)
+    ; GFX9: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
+    ; GFX9: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load 4 + 20, addrspace 3)
+    ; GFX9: [[INSERT2:%[0-9]+]]:_(s96) = G_INSERT [[DEF]], [[LOAD2]](s64), 0
+    ; GFX9: [[INSERT3:%[0-9]+]]:_(s96) = G_INSERT [[INSERT2]], [[LOAD3]](s32), 64
+    ; GFX9: [[COPY2:%[0-9]+]]:_(s96) = COPY [[INSERT1]](s96)
+    ; GFX9: [[COPY3:%[0-9]+]]:_(s96) = COPY [[INSERT3]](s96)
+    ; GFX9: $vgpr0_vgpr1_vgpr2 = COPY [[COPY2]](s96)
+    ; GFX9: $vgpr3_vgpr4_vgpr5 = COPY [[COPY3]](s96)
    %0:_(p3) = COPY $vgpr0
    %1:_(<2 x s96>) = G_LOAD %0 :: (load 24, align 16, addrspace 3)
    %2:_(s96) = G_EXTRACT %1, 0
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll
@@ -0,0 +1,276 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=+unaligned-buffer-access < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-UNALIGNED %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=-unaligned-buffer-access < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-NOUNALIGNED %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -mattr=+unaligned-buffer-access < %s | FileCheck -check-prefixes=GCN,GFX7,GFX7-UNALIGNED %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -mattr=-unaligned-buffer-access < %s | FileCheck -check-prefixes=GCN,GFX7,GFX7-NOUNALIGNED %s
+
+; FIXME:
+; XUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s
+
+define <3 x i32> @v_load_lds_v3i32(<3 x i32> addrspace(3)* %ptr) {
+; GFX9-LABEL: v_load_lds_v3i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: ds_read_b64 v[0:1], v0
+; GFX9-NEXT: ds_read_b32 v2, v2 offset:8
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_load_lds_v3i32:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v2, v0
+; GFX7-NEXT: s_mov_b32 m0, -1
+; GFX7-NEXT: ds_read_b64 v[0:1], v0
+; GFX7-NEXT: ds_read_b32 v2, v2 offset:8
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+  %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr
+  ret <3 x i32> %load
+}
+
+define <3 x i32> @v_load_lds_v3i32_align1(<3 x i32> addrspace(3)* %ptr) {
+; GFX9-LABEL: v_load_lds_v3i32_align1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: ds_read_u8 v0, v0
+; GFX9-NEXT: ds_read_u8 v1, v2 offset:1
+; GFX9-NEXT: ds_read_u8 v4, v2 offset:2
+; GFX9-NEXT: ds_read_u8 v5, v2 offset:3
+; GFX9-NEXT: ds_read_u8 v6, v2 offset:4
+; GFX9-NEXT: s_mov_b32 s5, 8
+; GFX9-NEXT: s_movk_i32 s4, 0xff
+; GFX9-NEXT: s_waitcnt lgkmcnt(3)
+; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v1
+; GFX9-NEXT: s_waitcnt lgkmcnt(2)
+; GFX9-NEXT: v_and_b32_e32 v1, s4, v4
+; GFX9-NEXT: s_waitcnt lgkmcnt(1)
+; GFX9-NEXT: v_and_b32_e32 v4, s4, v5
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v4, 24, v4
+; GFX9-NEXT: v_or3_b32 v0, v0, v1, v4
+; GFX9-NEXT: ds_read_u8 v1, v2 offset:5
+; GFX9-NEXT: ds_read_u8 v4, v2 offset:6
+; GFX9-NEXT: ds_read_u8 v5, v2 offset:7
+; GFX9-NEXT: ds_read_u8 v7, v2 offset:8
+; GFX9-NEXT: ds_read_u8 v8, v2 offset:9
+; GFX9-NEXT: v_mov_b32_e32 v3, 0xff
+; GFX9-NEXT: s_waitcnt lgkmcnt(4)
+; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: s_waitcnt lgkmcnt(3)
+; GFX9-NEXT: v_and_b32_e32 v4, v4, v3
+; GFX9-NEXT: s_waitcnt lgkmcnt(2)
+; GFX9-NEXT: v_and_b32_e32 v5, v5, v3
+; GFX9-NEXT: v_and_or_b32 v1, v6, s4, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v5
+; GFX9-NEXT: v_or3_b32 v1, v1, v4, v5
+; GFX9-NEXT: ds_read_u8 v4, v2 offset:10
+; GFX9-NEXT: ds_read_u8 v2, v2 offset:11
+; GFX9-NEXT: v_mov_b32_e32 v5, 8
+; GFX9-NEXT: s_waitcnt lgkmcnt(2)
+; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_and_or_b32 v5, v7, v3, v5
+; GFX9-NEXT: s_waitcnt lgkmcnt(1)
+; GFX9-NEXT: v_and_b32_e32 v4, v4, v3
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v2, v2, v3
+; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 24, v2
+; GFX9-NEXT: v_or3_b32 v2, v5, v4, v2
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_load_lds_v3i32_align1:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_mov_b32 m0, -1
+; GFX7-NEXT: v_mov_b32_e32 v2, v0
+; GFX7-NEXT: ds_read_u8 v0, v0
+; GFX7-NEXT: ds_read_u8 v1, v2 offset:1
+; GFX7-NEXT: ds_read_u8 v4, v2 offset:2
+; GFX7-NEXT: ds_read_u8 v5, v2 offset:3
+; GFX7-NEXT: ds_read_u8 v6, v2 offset:4
+; GFX7-NEXT: s_movk_i32 s4, 0xff
+; GFX7-NEXT: s_waitcnt lgkmcnt(3)
+; GFX7-NEXT: v_and_b32_e32 v1, s4, v1
+; GFX7-NEXT: v_and_b32_e32 v0, s4, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1
+; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT: s_waitcnt lgkmcnt(2)
+; GFX7-NEXT: v_and_b32_e32 v1, s4, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT: s_waitcnt lgkmcnt(1)
+; GFX7-NEXT: v_and_b32_e32 v1, s4, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v1
+; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v1, s4, v6
+; GFX7-NEXT: v_mov_b32_e32 v3, 0xff
+; GFX7-NEXT: ds_read_u8 v4, v2 offset:5
+; GFX7-NEXT: ds_read_u8 v5, v2 offset:6
+; GFX7-NEXT: ds_read_u8 v6, v2 offset:7
+; GFX7-NEXT: ds_read_u8 v7, v2 offset:8
+; GFX7-NEXT: ds_read_u8 v8, v2 offset:9
+; GFX7-NEXT: s_waitcnt lgkmcnt(4)
+; GFX7-NEXT: v_and_b32_e32 v4, v4, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v4
+; GFX7-NEXT: v_or_b32_e32 v1, v1, v4
+; GFX7-NEXT: s_waitcnt lgkmcnt(3)
+; GFX7-NEXT: v_and_b32_e32 v4, v5, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_or_b32_e32 v1, v1, v4
+; GFX7-NEXT: s_waitcnt lgkmcnt(2)
+; GFX7-NEXT: v_and_b32_e32 v4, v6, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4
+; GFX7-NEXT: v_or_b32_e32 v1, v1, v4
+; GFX7-NEXT: ds_read_u8 v4, v2 offset:10
+; GFX7-NEXT: ds_read_u8 v2, v2 offset:11
+; GFX7-NEXT: s_waitcnt lgkmcnt(2)
+; GFX7-NEXT: v_and_b32_e32 v6, v8, v3
+; GFX7-NEXT: v_and_b32_e32 v5, v7, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6
+; GFX7-NEXT: s_waitcnt lgkmcnt(1)
+; GFX7-NEXT: v_and_b32_e32 v4, v4, v3
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v2, v2, v3
+; GFX7-NEXT: v_or_b32_e32 v5, v5, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_or_b32_e32 v4, v5, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2
+; GFX7-NEXT: v_or_b32_e32 v2, v4, v2
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+  %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 1
+  ret <3 x i32> %load
+}
+
+define <3 x i32> @v_load_lds_v3i32_align2(<3 x i32> addrspace(3)* %ptr) {
+; GFX9-LABEL: v_load_lds_v3i32_align2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: ds_read_u16 v0, v0
+; GFX9-NEXT: ds_read_u16 v1, v2 offset:2
+; GFX9-NEXT: ds_read_u16 v3, v2 offset:4
+; GFX9-NEXT: ds_read_u16 v4, v2 offset:6
+; GFX9-NEXT: ds_read_u16 v5, v2 offset:8
+; GFX9-NEXT: ds_read_u16 v2, v2 offset:10
+; GFX9-NEXT: s_mov_b32 s4, 0xffff
+; GFX9-NEXT: s_waitcnt lgkmcnt(4)
+; GFX9-NEXT: v_and_b32_e32 v1, s4, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v1
+; GFX9-NEXT: s_waitcnt lgkmcnt(2)
+; GFX9-NEXT: v_and_b32_e32 v1, s4, v4
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v2, s4, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_and_or_b32 v1, v3, s4, v1
+; GFX9-NEXT: v_and_or_b32 v2, v5, s4, v2
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_load_lds_v3i32_align2:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v2, v0
+; GFX7-NEXT: s_mov_b32 m0, -1
+; GFX7-NEXT: ds_read_u16 v0, v0
+; GFX7-NEXT: ds_read_u16 v1, v2 offset:2
+; GFX7-NEXT: ds_read_u16 v3, v2 offset:4
+; GFX7-NEXT: ds_read_u16 v4, v2 offset:6
+; GFX7-NEXT: ds_read_u16 v5, v2 offset:8
+; GFX7-NEXT: ds_read_u16 v2, v2 offset:10
+; GFX7-NEXT: s_mov_b32 s4, 0xffff
+; GFX7-NEXT: s_waitcnt lgkmcnt(4)
+; GFX7-NEXT: v_and_b32_e32 v1, s4, v1
+; GFX7-NEXT: v_and_b32_e32 v0, s4, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT: s_waitcnt lgkmcnt(3)
+; GFX7-NEXT: v_and_b32_e32 v1, s4, v3
+; GFX7-NEXT: s_waitcnt lgkmcnt(2)
+; GFX7-NEXT: v_and_b32_e32 v3, s4, v4
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v2, s4, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_or_b32_e32 v1, v1, v3
+; GFX7-NEXT: v_and_b32_e32 v3, s4, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+  %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 2
+  ret <3 x i32> %load
+}
+
+define <3 x i32> @v_load_lds_v3i32_align4(<3 x i32> addrspace(3)* %ptr) {
+; GFX9-LABEL: v_load_lds_v3i32_align4:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
+; GFX9-NEXT: ds_read_b32 v2, v2 offset:8
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_load_lds_v3i32_align4:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v2, v0
+; GFX7-NEXT: s_mov_b32 m0, -1
+; GFX7-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
+; GFX7-NEXT: ds_read_b32 v2, v2 offset:8
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+  %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 4
+  ret <3 x i32> %load
+}
+
+define <3 x i32> @v_load_lds_v3i32_align8(<3 x i32> addrspace(3)* %ptr) {
+; GFX9-LABEL: v_load_lds_v3i32_align8:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: ds_read_b64 v[0:1], v0
+; GFX9-NEXT: ds_read_b32 v2, v2 offset:8
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_load_lds_v3i32_align8:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v2, v0
+; GFX7-NEXT: s_mov_b32 m0, -1
+; GFX7-NEXT: ds_read_b64 v[0:1], v0
+; GFX7-NEXT: ds_read_b32 v2, v2 offset:8
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+  %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 8
+  ret <3 x i32> %load
+}
+
+define <3 x i32> @v_load_lds_v3i32_align16(<3 x i32> addrspace(3)* %ptr) {
+; GFX9-LABEL: v_load_lds_v3i32_align16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: ds_read_b64 v[0:1], v0
+; GFX9-NEXT: ds_read_b32 v2, v2 offset:8
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_load_lds_v3i32_align16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v2, v0
+; GFX7-NEXT: s_mov_b32 m0, -1
+; GFX7-NEXT: ds_read_b64 v[0:1], v0
+; GFX7-NEXT: ds_read_b32 v2, v2 offset:8
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+  %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 16
+  ret <3 x i32> %load
+}
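
Reviewer note (not part of the patch): both AMDGPULegalizerInfo.cpp hunks add the same guard, so the legality rule they encode can be read as a single predicate. A minimal C++ sketch, assuming the subtarget ST and address space AS that are in scope in the hunks above; keep96BitAccessWhole is a hypothetical name used only for illustration:

  // A 96-bit (3 x dword) access is kept as a single load/store only when
  // the subtarget has dwordx3 buffer/flat instructions and the access is
  // not to the local (LDS) address space. Otherwise the legalizer splits
  // it into a 64-bit plus a 32-bit access, as the updated MIR checks and
  // the ds_read_b64 + ds_read_b32 pairs in load-local.96.ll show.
  static bool keep96BitAccessWhole(const GCNSubtarget &ST, unsigned AS) {
    return ST.hasDwordx3LoadStores() && AS != AMDGPUAS::LOCAL_ADDRESS;
  }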