Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1531,6 +1531,14 @@
     // requirements.
     //
     if (Size == 64) {
+      // SI has a hardware bug in the LDS / GDS bounds checking: if the base
+      // address is negative, then the instruction is incorrectly treated as
+      // out-of-bounds even if base + offsets is in bounds. Split vectorized
+      // loads here to avoid emitting ds_read2_b32. We may re-combine the
+      // load later in the SILoadStoreOptimizer.
+      if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))
+        return false;
+
       // 8 byte accessing via ds_read/write_b64 require 8-byte alignment, but we
       // can do a 4 byte aligned, 8 byte access in a single operation using
       // ds_read2/write2_b32 with adjacent offsets.
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-local.mir
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-local.mir
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-local.mir
@@ -406,8 +406,7 @@
     ; GFX6: liveins: $vgpr0
     ; GFX6-NEXT: {{  $}}
     ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0
-    ; GFX6-NEXT: $m0 = S_MOV_B32 -1
-    ; GFX6-NEXT: [[LOAD:%[0-9]+]]:vreg_64(p1) = G_LOAD [[COPY]](p3) :: (load (p1), align 4, addrspace 3)
+    ; GFX6-NEXT: [[LOAD:%[0-9]+]]:vgpr(p1) = G_LOAD [[COPY]](p3) :: (load (p1), align 4, addrspace 3)
     ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p1)
     %0:vgpr(p3) = COPY $vgpr0
     %1:vgpr(p1) = G_LOAD %0 :: (load (p1), align 4, addrspace 3)
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-local.mir
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-local.mir
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-local.mir
@@ -350,7 +350,6 @@
     ; GFX6-NEXT: {{  $}}
     ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
     ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY $vgpr2
-    ; GFX6-NEXT: $m0 = S_MOV_B32 -1
     ; GFX6-NEXT: G_STORE [[COPY]](p1), [[COPY1]](p3) :: (store (p1), align 4, addrspace 3)
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(p3) = COPY $vgpr2
@@ -438,7 +437,6 @@
     ; GFX6-NEXT: {{  $}}
     ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(<4 x s16>) = COPY $vgpr0_vgpr1
     ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY $vgpr2
-    ; GFX6-NEXT: $m0 = S_MOV_B32 -1
     ; GFX6-NEXT: G_STORE [[COPY]](<4 x s16>), [[COPY1]](p3) :: (store (<4 x s16>), align 4, addrspace 3)
     %0:vgpr(<4 x s16>) = COPY $vgpr0_vgpr1
     %1:vgpr(p3) = COPY $vgpr2
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-local.mir
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-local.mir
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-local.mir
@@ -4145,8 +4145,17 @@

     ; SI-LABEL: name: test_load_local_p1_align4
     ; SI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
-    ; SI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[COPY]](p3) :: (load (p1), align 4, addrspace 3)
-    ; SI-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p1)
+    ; SI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load (s32), addrspace 3)
+    ; SI-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[LOAD]](s32)
+    ; SI-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; SI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; SI-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load (s32) from unknown-address + 4, addrspace 3)
+    ; SI-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[LOAD1]](s32)
+    ; SI-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
+    ; SI-NEXT: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C1]](s32)
+    ; SI-NEXT: [[OR:%[0-9]+]]:_(s64) = G_OR [[SHL]], [[ZEXT]]
+    ; SI-NEXT: [[INTTOPTR:%[0-9]+]]:_(p1) = G_INTTOPTR [[OR]](s64)
+    ; SI-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p1)
     ; CI-LABEL: name: test_load_local_p1_align4
     ; CI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
     ; CI-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[COPY]](p3) :: (load (p1), align 4, addrspace 3)
@@ -8531,8 +8540,30 @@

     ; SI-LABEL: name: test_load_local_v4s16_align4
     ; SI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
-    ; SI-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p3) :: (load (<4 x s16>), align 4, addrspace 3)
-    ; SI-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>)
+    ; SI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load (s16), align 4, addrspace 3)
+    ; SI-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; SI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; SI-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load (s16) from unknown-address + 2, addrspace 3)
+    ; SI-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; SI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C1]](s32)
+    ; SI-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p3) :: (load (s16) from unknown-address + 4, align 4, addrspace 3)
+    ; SI-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 6
+    ; SI-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; SI-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load (s16) from unknown-address + 6, addrspace 3)
+    ; SI-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+    ; SI-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[LOAD]], [[C3]]
+    ; SI-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[LOAD1]], [[C3]]
+    ; SI-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; SI-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32)
+    ; SI-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
+    ; SI-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
+    ; SI-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[LOAD2]], [[C3]]
+    ; SI-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[LOAD3]], [[C3]]
+    ; SI-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32)
+    ; SI-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]]
+    ; SI-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32)
+    ; SI-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>)
+    ; SI-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
     ; CI-LABEL: name: test_load_local_v4s16_align4
     ; CI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
     ; CI-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p3) :: (load (<4 x s16>), align 4, addrspace 3)
Index: llvm/test/CodeGen/AMDGPU/store-local.128.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/store-local.128.ll
+++ llvm/test/CodeGen/AMDGPU/store-local.128.ll
@@ -402,12 +402,12 @@
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s4
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
-; GFX6-NEXT:    ds_write2_b32 v0, v2, v1 offset1:1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
-; GFX6-NEXT:    v_mov_b32_e32 v2, s2
-; GFX6-NEXT:    ds_write2_b32 v0, v2, v1 offset0:2 offset1:3
+; GFX6-NEXT:    v_mov_b32_e32 v1, s0
+; GFX6-NEXT:    v_mov_b32_e32 v2, s1
+; GFX6-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
+; GFX6-NEXT:    v_mov_b32_e32 v1, s2
+; GFX6-NEXT:    v_mov_b32_e32 v2, s3
+; GFX6-NEXT:    ds_write2_b32 v0, v1, v2 offset0:2 offset1:3
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: store_lds_v4i32_align4:
Index: llvm/test/CodeGen/AMDGPU/store-local.96.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/store-local.96.ll
+++ llvm/test/CodeGen/AMDGPU/store-local.96.ll
@@ -341,9 +341,9 @@
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s4
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s0
-; GFX6-NEXT:    ds_write2_b32 v0, v2, v1 offset1:1
+; GFX6-NEXT:    v_mov_b32_e32 v1, s0
+; GFX6-NEXT:    v_mov_b32_e32 v2, s1
+; GFX6-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX6-NEXT:    ds_write_b32 v0, v1 offset:8
 ; GFX6-NEXT:    s_endpgm
Index: llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/missing-alignment.ll
===================================================================
--- llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/missing-alignment.ll
+++ llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/missing-alignment.ll
@@ -1,4 +1,4 @@
-; RUN: opt -mtriple=amdgcn-- -load-store-vectorizer -S -o - %s | FileCheck %s
+; RUN: opt -mtriple=amdgcn-- -mcpu=tonga -load-store-vectorizer -S -o - %s | FileCheck %s

 target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"

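Note (aside, not part of the patch): the new bail-out fires only when Subtarget->hasUsableDSOffset() is false (SI/GFX6) and a 64-bit LDS access is aligned below 8 bytes, i.e. exactly the cases that would otherwise lower to ds_read2_b32/ds_write2_b32 and could trip the negative-base bounds-checking bug. Below is a minimal standalone C++ sketch of that predicate; allowsUnalignedDS64 and its parameters are hypothetical stand-ins for the real Subtarget query and llvm::Align, not LLVM APIs.

#include <cassert>
#include <cstdint>

// Simplified model of the check added in SIISelLowering.cpp above:
// on SI, refuse 64-bit LDS accesses aligned below 8 bytes, forcing two
// independent b32 accesses that SILoadStoreOptimizer may later recombine.
static bool allowsUnalignedDS64(bool hasUsableDSOffset,
                                uint64_t alignInBytes) {
  if (!hasUsableDSOffset && alignInBytes < 8)
    return false; // SI: would need ds_read2_b32/ds_write2_b32 -> split
  return true;    // 8-byte aligned or CI+: a single wide DS access is fine
}

int main() {
  assert(!allowsUnalignedDS64(/*hasUsableDSOffset=*/false, 4)); // SI: split
  assert(allowsUnalignedDS64(/*hasUsableDSOffset=*/false, 8));  // SI: b64 ok
  assert(allowsUnalignedDS64(/*hasUsableDSOffset=*/true, 4));   // CI+: read2 ok
  return 0;
}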