Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1433,7 +1433,8 @@
       AddrSpace == AMDGPUAS::REGION_ADDRESS) {
     // Check if alignment requirements for ds_read/write instructions are
     // disabled.
-    if (Subtarget->hasUnalignedDSAccessEnabled()) {
+    if (Subtarget->hasUnalignedDSAccessEnabled() &&
+        !Subtarget->hasLDSMisalignedBug()) {
       if (IsFast)
         *IsFast = true;
       return true;
@@ -1451,10 +1452,7 @@
     }
     if (Size == 96) {
       // ds_read/write_b96 require 16-byte alignment on gfx8 and older.
-      bool Aligned = Alignment >= Align((Subtarget->hasUnalignedDSAccess() &&
-                                         !Subtarget->hasLDSMisalignedBug())
-                                            ? 4
-                                            : 16);
+      bool Aligned = Alignment >= Align(16);
       if (IsFast)
         *IsFast = Aligned;
 
@@ -1464,10 +1462,7 @@
       // ds_read/write_b128 require 16-byte alignment on gfx8 and older, but we
       // can do a 8 byte aligned, 16 byte access in a single operation using
       // ds_read2/write2_b64.
-      bool Aligned = Alignment >= Align((Subtarget->hasUnalignedDSAccess() &&
-                                         !Subtarget->hasLDSMisalignedBug())
-                                            ? 4
-                                            : 8);
+      bool Aligned = Alignment >= Align(8);
       if (IsFast)
         *IsFast = Aligned;
 
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/lds-misaligned-bug.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/lds-misaligned-bug.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/lds-misaligned-bug.ll
@@ -21,12 +21,10 @@
 }
 
 ; GCN-LABEL: test_local_misaligned_v4:
-; VECT-DAG: ds_read_b128
-; VECT-DAG: ds_write_b128
-; SPLIT-DAG: ds_read2_b32
-; SPLIT-DAG: ds_read2_b32
-; SPLIT-DAG: ds_write2_b32
-; SPLIT-DAG: ds_write2_b32
+; GCN-DAG: ds_read2_b32
+; GCN-DAG: ds_read2_b32
+; GCN-DAG: ds_write2_b32
+; GCN-DAG: ds_write2_b32
 define amdgpu_kernel void @test_local_misaligned_v4(i32 addrspace(3)* %arg) {
 bb:
   %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -46,12 +44,10 @@
 }
 
 ; GCN-LABEL: test_local_misaligned_v3:
-; VECT-DAG: ds_read_b96
-; VECT-DAG: ds_write_b96
-; SPLIT-DAG: ds_read2_b32
-; SPLIT-DAG: ds_read_b32
-; SPLIT-DAG: ds_write2_b32
-; SPLIT-DAG: ds_write_b32
+; GCN-DAG: ds_read2_b32
+; GCN-DAG: ds_read_b32
+; GCN-DAG: ds_write2_b32
+; GCN-DAG: ds_write_b32
 define amdgpu_kernel void @test_local_misaligned_v3(i32 addrspace(3)* %arg) {
 bb:
   %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-local.mir
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-local.mir
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-local.mir
@@ -1894,8 +1894,14 @@
     ; VI: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96)
     ; GFX9-LABEL: name: test_load_local_s96_align8
    ; GFX9: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
-    ; GFX9: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p3) :: (load 12, align 8, addrspace 3)
-    ; GFX9: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>)
+    ; GFX9: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p3) :: (load 8, addrspace 3)
+    ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX9: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; GFX9: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 4 + 8, align 8, addrspace 3)
+    ; GFX9: [[DEF:%[0-9]+]]:_(<3 x s32>) = G_IMPLICIT_DEF
+    ; GFX9: [[INSERT:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[DEF]], [[LOAD]](<2 x s32>), 0
+    ; GFX9: [[INSERT1:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[INSERT]], [[LOAD1]](s32), 64
+    ; GFX9: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[INSERT1]](<3 x s32>)
     ; GFX9: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96)
     %0:_(p3) = COPY $vgpr0
     %1:_(s96) = G_LOAD %0 :: (load 12, align 8, addrspace 3)
@@ -1954,8 +1960,14 @@
     ; VI: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96)
     ; GFX9-LABEL: name: test_load_local_s96_align4
     ; GFX9: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
-    ; GFX9: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p3) :: (load 12, align 4, addrspace 3)
-    ; GFX9: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>)
+    ; GFX9: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3)
+    ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX9: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; GFX9: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 4 + 8, addrspace 3)
+    ; GFX9: [[DEF:%[0-9]+]]:_(<3 x s32>) = G_IMPLICIT_DEF
+    ; GFX9: [[INSERT:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[DEF]], [[LOAD]](<2 x s32>), 0
+    ; GFX9: [[INSERT1:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[INSERT]], [[LOAD1]](s32), 64
+    ; GFX9: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[INSERT1]](<3 x s32>)
     ; GFX9: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96)
     %0:_(p3) = COPY $vgpr0
     %1:_(s96) = G_LOAD %0 :: (load 12, align 4, addrspace 3)
@@ -3217,8 +3229,18 @@
     ; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128)
     ; GFX9-LABEL: name: test_load_local_s128_align4
     ; GFX9: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
-    ; GFX9: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p3) :: (load 16, align 4, addrspace 3)
-    ; GFX9: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[LOAD]](<4 x s32>)
+    ; GFX9: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 4, addrspace 3)
+    ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; GFX9: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; GFX9: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 4 + 4, addrspace 3)
+    ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX9: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C1]](s32)
+    ; GFX9: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load 4 + 8, addrspace 3)
+    ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+    ; GFX9: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; GFX9: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load 4 + 12, addrspace 3)
+    ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32)
+    ; GFX9: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>)
     ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128)
     %0:_(p3) = COPY $vgpr0
     %1:_(s128) = G_LOAD %0 :: (load 16, align 4, addrspace 3)
@@ -8321,8 +8343,14 @@
     ; VI: $vgpr0_vgpr1_vgpr2 = COPY [[INSERT1]](<3 x s32>)
     ; GFX9-LABEL: name: test_load_local_v3s32_align4
     ; GFX9: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
-    ; GFX9: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p3) :: (load 12, align 4, addrspace 3)
-    ; GFX9: $vgpr0_vgpr1_vgpr2 = COPY [[LOAD]](<3 x s32>)
+    ; GFX9: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3)
+    ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX9: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; GFX9: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 4 + 8, addrspace 3)
+    ; GFX9: [[DEF:%[0-9]+]]:_(<3 x s32>) = G_IMPLICIT_DEF
+    ; GFX9: [[INSERT:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[DEF]], [[LOAD]](<2 x s32>), 0
+    ; GFX9: [[INSERT1:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[INSERT]], [[LOAD1]](s32), 64
+    ; GFX9: $vgpr0_vgpr1_vgpr2 = COPY [[INSERT1]](<3 x s32>)
     %0:_(p3) = COPY $vgpr0
     %1:_(<3 x s32>) = G_LOAD %0 :: (load 12, align 4, addrspace 3)
     $vgpr0_vgpr1_vgpr2 = COPY %1
@@ -8458,8 +8486,18 @@
     ; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>)
     ; GFX9-LABEL: name: test_load_local_v4s32_align4
     ; GFX9: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
-    ; GFX9: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p3) :: (load 16, align 4, addrspace 3)
-    ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>)
+    ; GFX9: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 4, addrspace 3)
+    ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; GFX9: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; GFX9: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 4 + 4, addrspace 3)
+    ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX9: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C1]](s32)
+    ; GFX9: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load 4 + 8, addrspace 3)
+    ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+    ; GFX9: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; GFX9: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load 4 + 12, addrspace 3)
+    ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32)
+    ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>)
     %0:_(p3) = COPY $vgpr0
     %1:_(<4 x s32>) = G_LOAD %0 :: (load 16, align 4, addrspace 3)
     $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1
@@ -9373,8 +9411,12 @@
     ; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>)
     ; GFX9-LABEL: name: test_load_local_v2s64_align4
     ; GFX9: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
-    ; GFX9: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p3) :: (load 16, align 4, addrspace 3)
-    ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x s64>)
+    ; GFX9: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3)
+    ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX9: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; GFX9: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD]](p3) :: (load 8 + 8, align 4, addrspace 3)
+    ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[LOAD]](s64), [[LOAD1]](s64)
+    ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>)
     %0:_(p3) = COPY $vgpr0
     %1:_(<2 x s64>) = G_LOAD %0 :: (load 16, align 4, addrspace 3)
     $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1
@@ -10193,8 +10235,18 @@
     ; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>)
     ; GFX9-LABEL: name: test_load_local_v2p1_align4
     ; GFX9: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
-    ; GFX9: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p3) :: (load 16, align 4, addrspace 3)
-    ; GFX9: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[LOAD]](<4 x s32>)
+    ; GFX9: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 4, addrspace 3)
+    ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; GFX9: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; GFX9: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 4 + 4, addrspace 3)
+    ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX9: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C1]](s32)
+    ; GFX9: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load 4 + 8, addrspace 3)
+    ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+    ; GFX9: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; GFX9: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load 4 + 12, addrspace 3)
+    ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32)
+    ; GFX9: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>)
     ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>)
     %0:_(p3) = COPY $vgpr0
     %1:_(<2 x p1>) = G_LOAD %0 :: (load 16, align 4, addrspace 3)
@@ -11994,16 +12046,27 @@
     ; VI: $vgpr3_vgpr4_vgpr5 = COPY [[COPY3]](s96)
     ; GFX9-LABEL: name: test_extload_local_v2s96_from_24_align4
     ; GFX9: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
-    ; GFX9: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p3) :: (load 12, align 4, addrspace 3)
-    ; GFX9: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>)
-    ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+    ; GFX9: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3)
+    ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
     ; GFX9: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX9: [[LOAD1:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[PTR_ADD]](p3) :: (load 12 + 12, align 4, addrspace 3)
-    ; GFX9: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD1]](<3 x s32>)
-    ; GFX9: [[COPY1:%[0-9]+]]:_(s96) = COPY [[BITCAST]](s96)
-    ; GFX9: [[COPY2:%[0-9]+]]:_(s96) = COPY [[BITCAST1]](s96)
-    ; GFX9: $vgpr0_vgpr1_vgpr2 = COPY [[COPY1]](s96)
-    ; GFX9: $vgpr3_vgpr4_vgpr5 = COPY [[COPY2]](s96)
+    ; GFX9: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 4 + 8, addrspace 3)
+    ; GFX9: [[DEF:%[0-9]+]]:_(<3 x s32>) = G_IMPLICIT_DEF
+    ; GFX9: [[COPY1:%[0-9]+]]:_(<3 x s32>) = COPY [[DEF]](<3 x s32>)
+    ; GFX9: [[INSERT:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[COPY1]], [[LOAD]](<2 x s32>), 0
+    ; GFX9: [[INSERT1:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[INSERT]], [[LOAD1]](s32), 64
+    ; GFX9: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[INSERT1]](<3 x s32>)
+    ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+    ; GFX9: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C1]](s32)
+    ; GFX9: [[LOAD2:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[PTR_ADD1]](p3) :: (load 8 + 12, align 4, addrspace 3)
+    ; GFX9: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
+    ; GFX9: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load 4 + 20, addrspace 3)
+    ; GFX9: [[INSERT2:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[DEF]], [[LOAD2]](<2 x s32>), 0
+    ; GFX9: [[INSERT3:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[INSERT2]], [[LOAD3]](s32), 64
+    ; GFX9: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[INSERT3]](<3 x s32>)
+    ; GFX9: [[COPY2:%[0-9]+]]:_(s96) = COPY [[BITCAST]](s96)
+    ; GFX9: [[COPY3:%[0-9]+]]:_(s96) = COPY [[BITCAST1]](s96)
+    ; GFX9: $vgpr0_vgpr1_vgpr2 = COPY [[COPY2]](s96)
+    ; GFX9: $vgpr3_vgpr4_vgpr5 = COPY [[COPY3]](s96)
     %0:_(p3) = COPY $vgpr0
     %1:_(<2 x s96>) = G_LOAD %0 :: (load 24, align 4, addrspace 3)
     %2:_(s96) = G_EXTRACT %1, 0
@@ -12106,8 +12169,14 @@
     ; GFX9: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>)
     ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
     ; GFX9: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX9: [[LOAD1:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[PTR_ADD]](p3) :: (load 12 + 12, align 4, addrspace 3)
-    ; GFX9: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD1]](<3 x s32>)
+    ; GFX9: [[LOAD1:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[PTR_ADD]](p3) :: (load 8 + 12, align 4, addrspace 3)
+    ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX9: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD]], [[C1]](s32)
+    ; GFX9: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load 4 + 20, addrspace 3)
+    ; GFX9: [[DEF:%[0-9]+]]:_(<3 x s32>) = G_IMPLICIT_DEF
+    ; GFX9: [[INSERT:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[DEF]], [[LOAD1]](<2 x s32>), 0
+    ; GFX9: [[INSERT1:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[INSERT]], [[LOAD2]](s32), 64
+    ; GFX9: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[INSERT1]](<3 x s32>)
     ; GFX9: [[COPY1:%[0-9]+]]:_(s96) = COPY [[BITCAST]](s96)
     ; GFX9: [[COPY2:%[0-9]+]]:_(s96) = COPY [[BITCAST1]](s96)
     ; GFX9: $vgpr0_vgpr1_vgpr2 = COPY [[COPY1]](s96)
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll
@@ -244,7 +244,9 @@
 ; GFX9-LABEL: load_lds_v4i32_align4:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ds_read_b128 v[0:3], v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
+; GFX9-NEXT:    ds_read2_b32 v[2:3], v2 offset0:2 offset1:3
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll
@@ -202,7 +202,9 @@
 ; GFX9-LABEL: load_lds_v3i32_align4:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ds_read_b96 v[0:2], v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
+; GFX9-NEXT:    ds_read_b32 v2, v2 offset:8
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -223,7 +225,9 @@
 ; GFX9-LABEL: load_lds_v3i32_align8:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ds_read_b96 v[0:2], v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    ds_read_b64 v[0:1], v0
+; GFX9-NEXT:    ds_read_b32 v2, v2 offset:8
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll
@@ -212,12 +212,13 @@
 ; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x24
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x34
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v4, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    v_mov_b32_e32 v2, s2
-; GFX9-NEXT:    v_mov_b32_e32 v3, s3
-; GFX9-NEXT:    ds_write_b128 v4, v[0:3]
+; GFX9-NEXT:    v_mov_b32_e32 v2, s1
+; GFX9-NEXT:    ds_write2_b32 v1, v0, v2 offset1:1
+; GFX9-NEXT:    v_mov_b32_e32 v3, s2
+; GFX9-NEXT:    v_mov_b32_e32 v0, s3
+; GFX9-NEXT:    ds_write2_b32 v1, v3, v0 offset0:2 offset1:3
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: store_lds_v4i32_align4:
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll
@@ -178,11 +178,12 @@
 ; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x24
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x34
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v3, s4
+; GFX9-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    v_mov_b32_e32 v2, s2
-; GFX9-NEXT:    ds_write_b96 v3, v[0:2]
+; GFX9-NEXT:    v_mov_b32_e32 v3, s2
+; GFX9-NEXT:    ds_write2_b32 v2, v0, v1 offset1:1
+; GFX9-NEXT:    ds_write_b32 v2, v3 offset:8
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: store_lds_v3i32_align4:
@@ -208,11 +209,12 @@
 ; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x24
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x34
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v3, s4
+; GFX9-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    v_mov_b32_e32 v2, s2
-; GFX9-NEXT:    ds_write_b96 v3, v[0:2]
+; GFX9-NEXT:    v_mov_b32_e32 v3, s2
+; GFX9-NEXT:    ds_write_b64 v2, v[0:1]
+; GFX9-NEXT:    ds_write_b32 v2, v3 offset:8
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: store_lds_v3i32_align8:
Index: llvm/test/CodeGen/AMDGPU/ds_read2.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/ds_read2.ll
+++ llvm/test/CodeGen/AMDGPU/ds_read2.ll
@@ -480,7 +480,11 @@
 ; GCN-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], bar@abs32@lo{{$}}
 ; CI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]] offset1:1
 ; CI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]] offset0:2 offset1:3
-; GFX9: ds_read_b128 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]]
+
+; GFX9-ALIGNED-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]] offset1:1
+; GFX9-ALIGNED-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]] offset0:2 offset1:3
+
+; GFX9-UNALIGNED: ds_read_b128 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]]
 define amdgpu_kernel void @load_misaligned64_constant_offsets(i64 addrspace(1)* %out) {
   %val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4
   %val1 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4
Index: llvm/test/CodeGen/AMDGPU/ds_write2.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/ds_write2.ll
+++ llvm/test/CodeGen/AMDGPU/ds_write2.ll
@@ -441,7 +441,11 @@
 ; GCN-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], bar@abs32@lo{{$}}
 ; CI-DAG: ds_write2_b32 [[PTR]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
 ; CI-DAG: ds_write2_b32 [[PTR]], v{{[0-9]+}}, v{{[0-9]+}} offset0:2 offset1:3
-; GFX9-DAG: ds_write_b128 [[PTR]], {{v\[[0-9]+:[0-9]+\]}}
+
+; GFX9-ALIGNED-DAG: ds_write2_b32 [[PTR]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
+; GFX9-ALIGNED-DAG: ds_write2_b32 [[PTR]], v{{[0-9]+}}, v{{[0-9]+}} offset0:2 offset1:3
+
+; GFX9-UNALIGNED: ds_write_b128 [[PTR]], {{v\[[0-9]+:[0-9]+\]}}
 
 ; GCN: s_endpgm
 define amdgpu_kernel void @store_misaligned64_constant_offsets() {
@@ -514,7 +518,11 @@
 
 ; CI: ds_write2_b32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} offset1:1{{$}}
 ; CI: ds_write2_b32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} offset0:2 offset1:3{{$}}
-; GFX9: ds_write_b128 {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}
+
+; GFX9-ALIGNED-DAG: ds_write2_b32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} offset1:1{{$}}
+; GFX9-ALIGNED-DAG: ds_write2_b32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} offset0:2 offset1:3{{$}}
+
+; GFX9-UNALIGNED: ds_write_b128 {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @simple_write2_v4f32_superreg_align4(<4 x float> addrspace(3)* %out, <4 x float> addrspace(1)* %in) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %in.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in
Index: llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll
+++ llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll
@@ -21,12 +21,10 @@
 }
 
 ; GCN-LABEL: test_local_misaligned_v4:
-; VECT-DAG: ds_read_b128
-; VECT-DAG: ds_write_b128
-; SPLIT-DAG: ds_read2_b32
-; SPLIT-DAG: ds_read2_b32
-; SPLIT-DAG: ds_write2_b32
-; SPLIT-DAG: ds_write2_b32
+; GCN-DAG: ds_read2_b32
+; GCN-DAG: ds_read2_b32
+; GCN-DAG: ds_write2_b32
+; GCN-DAG: ds_write2_b32
 define amdgpu_kernel void @test_local_misaligned_v4(i32 addrspace(3)* %arg) {
 bb:
   %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -46,12 +44,10 @@
 }
 
 ; GCN-LABEL: test_local_misaligned_v3:
-; VECT-DAG: ds_read_b96
-; VECT-DAG: ds_write_b96
-; SPLIT-DAG: ds_read2_b32
-; SPLIT-DAG: ds_read_b32
-; SPLIT-DAG: ds_write2_b32
-; SPLIT-DAG: ds_write_b32
+; GCN-DAG: ds_read2_b32
+; GCN-DAG: ds_read_b32
+; GCN-DAG: ds_write2_b32
+; GCN-DAG: ds_write_b32
 define amdgpu_kernel void @test_local_misaligned_v3(i32 addrspace(3)* %arg) {
 bb:
   %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
Index: llvm/test/CodeGen/AMDGPU/load-local.128.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/load-local.128.ll
+++ llvm/test/CodeGen/AMDGPU/load-local.128.ll
@@ -288,7 +288,9 @@
 ; GFX9-LABEL: load_lds_v4i32_align4:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ds_read_b128 v[0:3], v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
+; GFX9-NEXT:    ds_read2_b32 v[2:3], v2 offset0:2 offset1:3
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
Index: llvm/test/CodeGen/AMDGPU/load-local.96.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/load-local.96.ll
+++ llvm/test/CodeGen/AMDGPU/load-local.96.ll
@@ -239,7 +239,9 @@
 ; GFX9-LABEL: load_lds_v3i32_align4:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ds_read_b96 v[0:2], v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
+; GFX9-NEXT:    ds_read_b32 v2, v2 offset:8
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -272,7 +274,9 @@
 ; GFX9-LABEL: load_lds_v3i32_align8:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ds_read_b96 v[0:2], v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
+; GFX9-NEXT:    ds_read_b32 v2, v2 offset:8
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
Index: llvm/test/CodeGen/AMDGPU/store-local.128.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/store-local.128.ll
+++ llvm/test/CodeGen/AMDGPU/store-local.128.ll
@@ -290,12 +290,13 @@
 ; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x24
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x34
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v4, s4
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    v_mov_b32_e32 v2, s2
-; GFX9-NEXT:    v_mov_b32_e32 v3, s3
-; GFX9-NEXT:    ds_write_b128 v4, v[0:3]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-NEXT:    v_mov_b32_e32 v2, s1
+; GFX9-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
+; GFX9-NEXT:    v_mov_b32_e32 v3, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    ds_write2_b32 v0, v3, v1 offset0:2 offset1:3
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: store_lds_v4i32_align4:
Index: llvm/test/CodeGen/AMDGPU/store-local.96.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/store-local.96.ll
+++ llvm/test/CodeGen/AMDGPU/store-local.96.ll
@@ -244,11 +244,12 @@
 ; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x24
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x34
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v3, s4
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    v_mov_b32_e32 v2, s2
-; GFX9-NEXT:    ds_write_b96 v3, v[0:2]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-NEXT:    v_mov_b32_e32 v2, s1
+; GFX9-NEXT:    v_mov_b32_e32 v3, s2
+; GFX9-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
+; GFX9-NEXT:    ds_write_b32 v0, v3 offset:8
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: store_lds_v3i32_align4:
@@ -288,11 +289,12 @@
 ; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x24
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x34
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v3, s4
+; GFX9-NEXT:    v_mov_b32_e32 v2, s4
+; GFX9-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    v_mov_b32_e32 v2, s2
-; GFX9-NEXT:    ds_write_b96 v3, v[0:2]
+; GFX9-NEXT:    ds_write_b32 v2, v3 offset:8
+; GFX9-NEXT:    ds_write_b64 v2, v[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: store_lds_v3i32_align8:
Index: llvm/test/CodeGen/AMDGPU/store-local.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/store-local.ll
+++ llvm/test/CodeGen/AMDGPU/store-local.ll
@@ -179,9 +179,8 @@
 ; CM: LDS_WRITE
 ; CM: LDS_WRITE
 
-; SICIVI: ds_write2_b32
-; SICIVI: ds_write2_b32
-; GFX9: ds_write_b128
+; GCN: ds_write2_b32
+; GCN: ds_write2_b32
 define amdgpu_kernel void @store_local_v4i32_align4(<4 x i32> addrspace(3)* %out, <4 x i32> %in) {
 entry:
   store <4 x i32> %in, <4 x i32> addrspace(3)* %out, align 4