Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.h
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -822,6 +822,11 @@
     return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS;
   }
 
+  // \returns true if the subtarget supports DWORDX3 load/store instructions.
+  bool hasDwordx3LoadStores() const {
+    return CIInsts;
+  }
+
   bool hasSMovFedHazard() const {
     return getGeneration() >= AMDGPUSubtarget::GFX9;
   }
Index: llvm/trunk/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ llvm/trunk/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -160,7 +160,7 @@
   bool OptimizeAgain;
 
   static bool offsetsCanBeCombined(CombineInfo &CI);
-  static bool widthsFit(const CombineInfo &CI);
+  static bool widthsFit(const GCNSubtarget &STM, const CombineInfo &CI);
   static unsigned getNewOpcode(const CombineInfo &CI);
   static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI);
   const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI);
@@ -367,11 +367,12 @@
   return false;
 }
 
-bool SILoadStoreOptimizer::widthsFit(const CombineInfo &CI) {
+bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
+                                     const CombineInfo &CI) {
   const unsigned Width = (CI.Width0 + CI.Width1);
   switch (CI.InstClass) {
   default:
-    return Width <= 4;
+    return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
   case S_BUFFER_LOAD_IMM:
     switch (Width) {
     default:
@@ -645,7 +646,7 @@
       // We also need to go through the list of instructions that we plan to
       // move and make sure they are all safe to move down past the merged
       // instruction.
-      if (widthsFit(CI) && offsetsCanBeCombined(CI))
+      if (widthsFit(*STM, CI) && offsetsCanBeCombined(CI))
        if (canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))
          return true;
     }
Index: llvm/trunk/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
@@ -37,9 +37,10 @@
 ; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]]
 ; GCN-NOT: v_cvt_f32_ubyte3_e32
 ; GCN-DAG: v_cvt_f32_ubyte2_e32 v[[HIRESULT:[0-9]+]], [[VAL]]
-; GCN-DAG: v_cvt_f32_ubyte1_e32 v{{[0-9]+}}, [[VAL]]
+; GCN-DAG: v_cvt_f32_ubyte1_e32 v[[MDRESULT:[0-9]+]], [[VAL]]
 ; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[VAL]]
-; GCN: buffer_store_dwordx3 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
+; SI: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[MDRESULT]]{{\]}},
+; VI: buffer_store_dwordx3 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
 define amdgpu_kernel void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8> addrspace(1)* noalias %in) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %in, i32 %tid
Index: llvm/trunk/test/CodeGen/AMDGPU/early-if-convert-cost.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/early-if-convert-cost.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/early-if-convert-cost.ll
@@ -60,7 +60,8 @@
 ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, vcc
 ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, vcc
 
-; GCN-DAG: buffer_store_dwordx3
+; GCN-DAG: buffer_store_dword v
+; GCN-DAG: buffer_store_dwordx2
 define amdgpu_kernel void @test_vccnz_ifcvt_triangle96(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %in, float %cnd) #0 {
 entry:
   %v = load <3 x i32>, <3 x i32> addrspace(1)* %in
Index: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll
@@ -195,7 +195,7 @@
 
 ;CHECK-LABEL: {{^}}buffer_load_x3_offen_merged:
 ;CHECK-NEXT: %bb.
-;CHECK-NEXT: buffer_load_dwordx3 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4
+;VI-NEXT: buffer_load_dwordx3 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4
 ;CHECK: s_waitcnt
 define amdgpu_ps void @buffer_load_x3_offen_merged(<4 x i32> inreg %rsrc, i32 %a) {
 main_body:
@@ -245,7 +245,7 @@
 
 ;CHECK-LABEL: {{^}}buffer_load_x3_offset_merged:
 ;CHECK-NEXT: %bb.
-;CHECK-NEXT: buffer_load_dwordx3 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4
+;VI-NEXT: buffer_load_dwordx3 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4
 ;CHECK: s_waitcnt
 define amdgpu_ps void @buffer_load_x3_offset_merged(<4 x i32> inreg %rsrc) {
 main_body:
Index: llvm/trunk/test/CodeGen/AMDGPU/merge-stores.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/merge-stores.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/merge-stores.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=GCN-AA %s
 
 ; This test is mostly to test DAG store merging, so disable the vectorizer.
 ; Run with devices with different unaligned load restrictions.
@@ -65,8 +65,8 @@
 }
 
 ; GCN-LABEL: {{^}}merge_global_store_2_constants_i32:
-; SI-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x1c8
-; SI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7b
+; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x1c8
+; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7b
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
 define amdgpu_kernel void @merge_global_store_2_constants_i32(i32 addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
@@ -87,8 +87,8 @@
 }
 
 ; GCN-LABEL: {{^}}merge_global_store_2_constants_f32_i32:
-; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], 4.0
-; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], 0x7b
+; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], 4.0
+; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], 0x7b
 ; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
 define amdgpu_kernel void @merge_global_store_2_constants_f32_i32(float addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
@@ -164,9 +164,10 @@
 }
 
 ; GCN-LABEL: {{^}}merge_global_store_3_constants_i32:
-; SI-DAG: buffer_store_dwordx3
-; SI-NOT: buffer_store_dwordx2
-; SI-NOT: buffer_store_dword
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dword v
+; CI-DAG: buffer_store_dwordx3
+; GCN-NOT: buffer_store_dword
 ; GCN: s_endpgm
 define amdgpu_kernel void @merge_global_store_3_constants_i32(i32 addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
@@ -274,9 +275,13 @@
 }
 
 ; GCN-LABEL: {{^}}merge_global_store_3_adjacent_loads_i32:
-; SI-DAG: buffer_load_dwordx3
+; SI-DAG: buffer_load_dwordx2
+; SI-DAG: buffer_load_dword v
+; CI-DAG: buffer_load_dwordx3
 ; GCN: s_waitcnt
-; SI-DAG: buffer_store_dwordx3 v
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dword v
+; CI-DAG: buffer_store_dwordx3
 ; GCN: s_endpgm
 define amdgpu_kernel void @merge_global_store_3_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
@@ -561,7 +566,9 @@
 
 ; GCN-LABEL: {{^}}merge_global_store_7_constants_i32:
 ; GCN: buffer_store_dwordx4
-; GCN: buffer_store_dwordx3
+; SI: buffer_store_dwordx2
+; SI: buffer_store_dword v
+; CI: buffer_store_dwordx3
 define amdgpu_kernel void @merge_global_store_7_constants_i32(i32 addrspace(1)* %out) {
   store i32 34, i32 addrspace(1)* %out, align 4
   %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
@@ -608,11 +615,15 @@
 
 ; GCN-LABEL: {{^}}copy_v3i32_align4:
 ; GCN-NOT: SCRATCH_RSRC_DWORD
-; GCN-DAG: buffer_load_dwordx3 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+; SI-DAG: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
+; SI-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+; CI-DAG: buffer_load_dwordx3 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
 ; GCN-NOT: offen
 ; GCN: s_waitcnt vmcnt
 ; GCN-NOT: offen
-; GCN-DAG: buffer_store_dwordx3 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+; SI-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+; SI-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
+; CI-DAG: buffer_store_dwordx3 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
 ; GCN: ScratchSize: 0{{$}}
 
 define amdgpu_kernel void @copy_v3i32_align4(<3 x i32> addrspace(1)* noalias %out, <3 x i32> addrspace(1)* noalias %in) #0 {
@@ -639,11 +650,15 @@
 
 ; GCN-LABEL: {{^}}copy_v3f32_align4:
 ; GCN-NOT: SCRATCH_RSRC_DWORD
-; GCN-DAG: buffer_load_dwordx3 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+; SI-DAG: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
+; SI-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+; CI-DAG: buffer_load_dwordx3 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
 ; GCN-NOT: offen
 ; GCN: s_waitcnt vmcnt
 ; GCN-NOT: offen
-; GCN-DAG: buffer_store_dwordx3 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+; SI-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+; SI-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
+; CI-DAG: buffer_store_dwordx3 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
 ; GCN: ScratchSize: 0{{$}}
 define amdgpu_kernel void @copy_v3f32_align4(<3 x float> addrspace(1)* noalias %out, <3 x float> addrspace(1)* noalias %in) #0 {
   %vec = load <3 x float>, <3 x float> addrspace(1)* %in, align 4
Index: llvm/trunk/test/CodeGen/AMDGPU/store-global.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/store-global.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/store-global.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SIVI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SIVI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SIVI -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SIVI -check-prefix=VI -check-prefix=FUNC %s
 ; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mtriple=r600-- -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mtriple=r600-- -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=CM -check-prefix=FUNC %s
@@ -273,7 +273,10 @@
 }
 
 ; FUNC-LABEL: {{^}}store_v3i32:
-; SIVI-DAG: buffer_store_dwordx3
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dword v
+
+; VI-DAG: buffer_store_dwordx3
 
 ; GFX9-DAG: global_store_dwordx2
 ; GFX9-DAG: global_store_dword v
Index: llvm/trunk/test/CodeGen/AMDGPU/store-v3i64.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/store-v3i64.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/store-v3i64.ll
@@ -89,7 +89,9 @@
 }
 
 ; GCN-LABEL: {{^}}global_truncstore_v3i64_to_v3i32:
-; GCN-DAG: buffer_store_dwordx3
+; SI-DAG: buffer_store_dwordx2
+; SI-DAG: buffer_store_dword v
+; VI-DAG: buffer_store_dwordx3
 define amdgpu_kernel void @global_truncstore_v3i64_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i64> %x) {
   %trunc = trunc <3 x i64> %x to <3 x i32>
   store <3 x i32> %trunc, <3 x i32> addrspace(1)* %out
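
Note (editorial, not part of the patch): the behavioral core of the change is the extra term in widthsFit's default case. A merged access may still be at most 4 dwords wide, but a 3-dword result is now only formed when the subtarget reports hasDwordx3LoadStores(), which is gated on CIInsts, so SI never produces DWORDX3 merges while CI and later still do. A minimal standalone sketch of that predicate follows; WidthsFitDefault and the plain bool parameter are illustrative stand-ins for the real SILoadStoreOptimizer::widthsFit and GCNSubtarget::hasDwordx3LoadStores().

```cpp
#include <cassert>

// Sketch only: models the default case of the patched widthsFit check.
// HasDwordx3LoadStores is false on SI, true on CI and later subtargets.
static bool WidthsFitDefault(bool HasDwordx3LoadStores, unsigned Width0,
                             unsigned Width1) {
  const unsigned Width = Width0 + Width1;
  // Allow at most a dwordx4 merge, and allow a 3-dword merge only when the
  // target actually has DWORDX3 load/store encodings.
  return (Width <= 4) && (HasDwordx3LoadStores || (Width != 3));
}

int main() {
  assert(!WidthsFitDefault(false, 1, 2)); // SI: 1+2 dwords must not become x3.
  assert(WidthsFitDefault(true, 1, 2));   // CI/VI: 1+2 dwords may become x3.
  assert(WidthsFitDefault(false, 2, 2));  // Any target: 2+2 merges to dwordx4.
  return 0;
}
```

This is also why the SI-prefixed FileCheck lines above now expect a buffer_store_dwordx2 plus a buffer_store_dword where the CI/VI prefixes still expect a single buffer_store_dwordx3.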