Index: lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -258,10 +258,11 @@ return 512; } + // 64-bit DS opcodes cause incorrect rendering in Hitman on SI. if (AddrSpace == AMDGPUAS::FLAT_ADDRESS || AddrSpace == AMDGPUAS::LOCAL_ADDRESS || AddrSpace == AMDGPUAS::REGION_ADDRESS) - return 128; + return ST->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS ? 32 : 128; if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) return 8 * ST->getMaxPrivateElementSize(); Index: test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores.ll =================================================================== --- test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores.ll +++ test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores.ll @@ -1,4 +1,7 @@ -; RUN: opt -mtriple=amdgcn-amd-amdhsa -load-store-vectorizer -S -o - %s | FileCheck %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=tahiti -load-store-vectorizer -S -o - %s | FileCheck %s --check-prefixes=CHECK,SI +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=bonaire -load-store-vectorizer -S -o - %s | FileCheck %s --check-prefixes=CHECK,CIPLUS +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -load-store-vectorizer -S -o - %s | FileCheck %s --check-prefixes=CHECK,CIPLUS +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -load-store-vectorizer -S -o - %s | FileCheck %s --check-prefixes=CHECK,CIPLUS ; Copy of test/CodeGen/AMDGPU/merge-stores.ll with some additions target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" @@ -483,7 +486,9 @@ } ; CHECK-LABEL: @merge_local_store_2_constants_i32 -; CHECK: store <2 x i32> , <2 x i32> addrspace(3)* %{{[0-9]+}}, align 4 +; SI: store i32 +; SI: store i32 +; CIPLUS: store <2 x i32> , <2 x i32> addrspace(3)* %{{[0-9]+}}, align 4 define amdgpu_kernel void @merge_local_store_2_constants_i32(i32 addrspace(3)* %out) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1 @@ -504,7 +509,11 @@ } ; CHECK-LABEL: @merge_local_store_4_constants_i32 -; CHECK: store <4 x i32> , <4 x i32> addrspace(3)* +; SI: store i32 +; SI: store i32 +; SI: store i32 +; SI: store i32 +; CIPLUS: store <4 x i32> , <4 x i32> addrspace(3)* define amdgpu_kernel void @merge_local_store_4_constants_i32(i32 addrspace(3)* %out) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1 %out.gep.2 = getelementptr i32, i32 addrspace(3)* %out, i32 2 Index: test/Transforms/LoadStoreVectorizer/AMDGPU/missing-alignment.ll =================================================================== --- test/Transforms/LoadStoreVectorizer/AMDGPU/missing-alignment.ll +++ test/Transforms/LoadStoreVectorizer/AMDGPU/missing-alignment.ll @@ -1,4 +1,7 @@ -; RUN: opt -mtriple=amdgcn-- -load-store-vectorizer -S -o - %s | FileCheck %s +; RUN: opt -mtriple=amdgcn-- -mcpu=tahiti -load-store-vectorizer -S -o - %s | FileCheck %s --check-prefixes=CHECK,SI +; RUN: opt -mtriple=amdgcn-- -mcpu=bonaire -load-store-vectorizer -S -o - %s | FileCheck %s --check-prefixes=CHECK,CIPLUS +; RUN: opt -mtriple=amdgcn-- -mcpu=fiji -load-store-vectorizer -S -o - %s | FileCheck %s --check-prefixes=CHECK,CIPLUS +; RUN: opt -mtriple=amdgcn-- -mcpu=gfx900 -load-store-vectorizer -S -o - %s | FileCheck %s --check-prefixes=CHECK,CIPLUS target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" @@ -8,7 +11,9 @@ ; increase to an align 8 load. ; CHECK-LABEL: @load_keep_base_alignment_missing_align( -; CHECK: load <2 x float>, <2 x float> addrspace(3)* %{{[0-9]+}}, align 4 +; SI: load float +; SI: load float +; CIPLUS: load <2 x float>, <2 x float> addrspace(3)* %{{[0-9]+}}, align 4 define amdgpu_kernel void @load_keep_base_alignment_missing_align(float addrspace(1)* %out) { %ptr0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 11 %val0 = load float, float addrspace(3)* %ptr0 @@ -22,7 +27,9 @@ ; CHECK-LABEL: @store_keep_base_alignment_missing_align( -; CHECK: store <2 x float> zeroinitializer, <2 x float> addrspace(3)* %{{[0-9]+}}, align 4 +; SI: store float +; SI: store float +; CIPLUS: store <2 x float> zeroinitializer, <2 x float> addrspace(3)* %{{[0-9]+}}, align 4 define amdgpu_kernel void @store_keep_base_alignment_missing_align() { %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 1 %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 2 Index: test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll =================================================================== --- test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll +++ test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll @@ -1,4 +1,7 @@ -; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -load-store-vectorizer -S -o - %s | FileCheck %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=tahiti -basicaa -load-store-vectorizer -S -o - %s | FileCheck %s --check-prefixes=CHECK,SI +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=bonaire -basicaa -load-store-vectorizer -S -o - %s | FileCheck %s --check-prefixes=CHECK,CIPLUS +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -basicaa -load-store-vectorizer -S -o - %s | FileCheck %s --check-prefixes=CHECK,CIPLUS +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -basicaa -load-store-vectorizer -S -o - %s | FileCheck %s --check-prefixes=CHECK,CIPLUS target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" @@ -7,7 +10,9 @@ @0 = internal addrspace(3) global [16384 x i32] undef ; CHECK-LABEL: @no_crash( -; CHECK: store <2 x i32> zeroinitializer +; SI: store i32 0 +; SI: store i32 0 +; CIPLUS: store <2 x i32> zeroinitializer ; CHECK: store i32 0 ; CHECK: store i32 0 @@ -29,9 +34,15 @@ ; longest chain vectorized ; CHECK-LABEL: @interleave_get_longest -; CHECK: load <4 x i32> +; SI: load i32 +; SI: load i32 +; SI: store i32 +; SI: store i32 +; SI: load i32 +; SI: load i32 +; CIPLUS: load <4 x i32> ; CHECK: load i32 -; CHECK: store <2 x i32> zeroinitializer +; CIPLUS: store <2 x i32> zeroinitializer ; CHECK: load i32 ; CHECK: load i32 ; CHECK: load i32 Index: test/Transforms/LoadStoreVectorizer/AMDGPU/pointer-elements.ll =================================================================== --- test/Transforms/LoadStoreVectorizer/AMDGPU/pointer-elements.ll +++ test/Transforms/LoadStoreVectorizer/AMDGPU/pointer-elements.ll @@ -1,4 +1,7 @@ -; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -load-store-vectorizer -S -o - %s | FileCheck %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=tahiti -basicaa -load-store-vectorizer -S -o - %s | FileCheck %s --check-prefixes=CHECK,SI +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=bonaire -basicaa -load-store-vectorizer -S -o - %s | FileCheck %s --check-prefixes=CHECK,CIPLUS +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -basicaa -load-store-vectorizer -S -o - %s | FileCheck %s --check-prefixes=CHECK,CIPLUS +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -basicaa -load-store-vectorizer -S -o - %s | FileCheck %s --check-prefixes=CHECK,CIPLUS target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" @@ -24,10 +27,14 @@ } ; CHECK-LABEL: @merge_v2p3i8( -; CHECK: load <2 x i32> -; CHECK: inttoptr i32 %{{[^ ]+}} to i8 addrspace(3)* -; CHECK: inttoptr i32 %{{[^ ]+}} to i8 addrspace(3)* -; CHECK: store <2 x i32> zeroinitializer +; SI: load i8 +; SI: load i8 +; SI: store i8 +; SI: store i8 +; CIPLUS: load <2 x i32> +; CIPLUS: inttoptr i32 %{{[^ ]+}} to i8 addrspace(3)* +; CIPLUS: inttoptr i32 %{{[^ ]+}} to i8 addrspace(3)* +; CIPLUS: store <2 x i32> zeroinitializer define amdgpu_kernel void @merge_v2p3i8(i8 addrspace(3)* addrspace(3)* nocapture %a, i8 addrspace(3)* addrspace(3)* nocapture readonly %b) #0 { entry: %a.1 = getelementptr inbounds i8 addrspace(3)*, i8 addrspace(3)* addrspace(3)* %a, i64 1 @@ -104,9 +111,11 @@ } ; CHECK-LABEL: @merge_load_i32_ptr32( -; CHECK: load <2 x i32> -; CHECK: [[ELT1:%[^ ]+]] = extractelement <2 x i32> %{{[^ ]+}}, i32 1 -; CHECK: inttoptr i32 [[ELT1]] to i8 addrspace(3)* +; SI: load i32 +; SI: load i8 +; CIPLUS: load <2 x i32> +; CIPLUS: [[ELT1:%[^ ]+]] = extractelement <2 x i32> %{{[^ ]+}}, i32 1 +; CIPLUS: inttoptr i32 [[ELT1]] to i8 addrspace(3)* define amdgpu_kernel void @merge_load_i32_ptr32(i32 addrspace(3)* nocapture %a) #0 { entry: %a.1 = getelementptr inbounds i32, i32 addrspace(3)* %a, i32 1 @@ -119,9 +128,11 @@ } ; CHECK-LABEL: @merge_load_ptr32_i32( -; CHECK: load <2 x i32> -; CHECK: [[ELT0:%[^ ]+]] = extractelement <2 x i32> %{{[^ ]+}}, i32 0 -; CHECK: inttoptr i32 [[ELT0]] to i8 addrspace(3)* +; SI: load i8 +; SI: load i32 +; CIPLUS: load <2 x i32> +; CIPLUS: [[ELT0:%[^ ]+]] = extractelement <2 x i32> %{{[^ ]+}}, i32 0 +; CIPLUS: inttoptr i32 [[ELT0]] to i8 addrspace(3)* define amdgpu_kernel void @merge_load_ptr32_i32(i32 addrspace(3)* nocapture %a) #0 { entry: %a.cast = bitcast i32 addrspace(3)* %a to i8 addrspace(3)* addrspace(3)* @@ -134,9 +145,11 @@ } ; CHECK-LABEL: @merge_store_ptr32_i32( -; CHECK: [[ELT0:%[^ ]+]] = ptrtoint i8 addrspace(3)* %ptr0 to i32 -; CHECK: insertelement <2 x i32> undef, i32 [[ELT0]], i32 0 -; CHECK: store <2 x i32> +; SI: store i8 +; SI: store i32 +; CIPLUS: [[ELT0:%[^ ]+]] = ptrtoint i8 addrspace(3)* %ptr0 to i32 +; CIPLUS: insertelement <2 x i32> undef, i32 [[ELT0]], i32 0 +; CIPLUS: store <2 x i32> define amdgpu_kernel void @merge_store_ptr32_i32(i32 addrspace(3)* nocapture %a, i8 addrspace(3)* %ptr0, i32 %val1) #0 { entry: %a.cast = bitcast i32 addrspace(3)* %a to i8 addrspace(3)* addrspace(3)* @@ -149,9 +162,11 @@ } ; CHECK-LABEL: @merge_store_i32_ptr32( -; CHECK: [[ELT1:%[^ ]+]] = ptrtoint i8 addrspace(3)* %ptr1 to i32 -; CHECK: insertelement <2 x i32> %{{[^ ]+}}, i32 [[ELT1]], i32 1 -; CHECK: store <2 x i32> +; SI: store i32 +; SI: store i8 +; CIPLUS: [[ELT1:%[^ ]+]] = ptrtoint i8 addrspace(3)* %ptr1 to i32 +; CIPLUS: insertelement <2 x i32> %{{[^ ]+}}, i32 [[ELT1]], i32 1 +; CIPLUS: store <2 x i32> define amdgpu_kernel void @merge_store_i32_ptr32(i8 addrspace(3)* addrspace(3)* nocapture %a, i32 %val0, i8 addrspace(3)* %ptr1) #0 { entry: %a.1 = getelementptr inbounds i8 addrspace(3)*, i8 addrspace(3)* addrspace(3)* %a, i32 1