Index: lib/Target/AMDGPU/SIISelLowering.h
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.h
+++ lib/Target/AMDGPU/SIISelLowering.h
@@ -57,6 +57,7 @@
   SDValue performSetCCCombine(SDNode *N, DAGCombinerInfo &DCI) const;

   bool isLegalFlatAddressingMode(const AddrMode &AM) const;
+  bool isLegalMUBUFAddressingMode(const AddrMode &AM) const;

 public:
   SITargetLowering(TargetMachine &tm, const AMDGPUSubtarget &STI);
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -260,6 +260,41 @@
   return AM.BaseOffs == 0 && (AM.Scale == 0 || AM.Scale == 1);
 }

+bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
+  // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
+  // additionally can do r + r + i with addr64. 32-bit has more addressing
+  // mode options. Depending on the resource constant, it can also do
+  // (i64 r0) + (i32 r1) * (i14 i).
+  //
+  // Private arrays end up using a scratch buffer most of the time, so also
+  // assume those use MUBUF instructions. Scratch loads / stores are currently
+  // implemented as mubuf instructions with offen bit set, so slightly
+  // different than the normal addr64.
+  if (!isUInt<12>(AM.BaseOffs))
+    return false;
+
+  // FIXME: Since we can split immediate into soffset and immediate offset,
+  // would it make sense to allow any immediate?
+
+  switch (AM.Scale) {
+  case 0: // r + i or just i, depending on HasBaseReg.
+    return true;
+  case 1:
+    return true; // We have r + r or r + i.
+  case 2:
+    if (AM.HasBaseReg) {
+      // Reject 2 * r + r.
+      return false;
+    }
+
+    // Allow 2 * r as r + r
+    // Or 2 * r + i is allowed as r + r + i.
+    return true;
+  default: // Don't allow n * r
+    return false;
+  }
+}
+
 bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
                                              const AddrMode &AM, Type *Ty,
                                              unsigned AS) const {
@@ -268,7 +303,7 @@
     return false;

   switch (AS) {
-  case AMDGPUAS::GLOBAL_ADDRESS:
+  case AMDGPUAS::GLOBAL_ADDRESS: {
     if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
       // Assume the we will use FLAT for all global memory accesses
       // on VI.
@@ -281,51 +316,50 @@
       // because it has never been validated.
       return isLegalFlatAddressingMode(AM);
     }
-    // fall-through
-  case AMDGPUAS::PRIVATE_ADDRESS:
-  case AMDGPUAS::CONSTANT_ADDRESS: // XXX - Should we assume SMRD instructions?
-  case AMDGPUAS::UNKNOWN_ADDRESS_SPACE: {
-    // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
-    // additionally can do r + r + i with addr64. 32-bit has more addressing
-    // mode options. Depending on the resource constant, it can also do
-    // (i64 r0) + (i32 r1) * (i14 i).
-    //
-    // SMRD instructions have an 8-bit, dword offset.
-    //
-    // Assume nonunifom access, since the address space isn't enough to know
-    // what instruction we will use, and since we don't know if this is a load
-    // or store and scalar stores are only available on VI.
-    //
-    // We also know if we are doing an extload, we can't do a scalar load.
-    //
-    // Private arrays end up using a scratch buffer most of the time, so also
-    // assume those use MUBUF instructions. Scratch loads / stores are currently
-    // implemented as mubuf instructions with offen bit set, so slightly
-    // different than the normal addr64.
-    if (!isUInt<12>(AM.BaseOffs))
-      return false;
-    // FIXME: Since we can split immediate into soffset and immediate offset,
-    // would it make sense to allow any immediate?
+    return isLegalMUBUFAddressingMode(AM);
+  }
+  case AMDGPUAS::CONSTANT_ADDRESS: {
+    // If the offset isn't a multiple of 4, it probably isn't going to be
+    // correctly aligned.
+    if (AM.BaseOffs % 4 != 0)
+      return isLegalMUBUFAddressingMode(AM);
+
+    // There are no SMRD extloads, so if we have to do a small type access we
+    // will use a MUBUF load.
+    // FIXME?: We also need to do this if unaligned, but we don't know the
+    // alignment here.
+    if (DL.getTypeStoreSize(Ty) < 4)
+      return isLegalMUBUFAddressingMode(AM);
+
+    // SMRD instructions have an 8-bit, dword offset on SI.
+    // On CI+, this can also be a 32-bit literal constant offset.
+    if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
+      if (!isUInt<8>(AM.BaseOffs / 4))
+        return false;
+    } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
+      if (!isUInt<32>(AM.BaseOffs / 4))
+        return false;
+    } else if (Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+      // On VI, the offset is a 20-bit byte offset.
+      if (!isUInt<20>(AM.BaseOffs))
+        return false;
+    } else
+      llvm_unreachable("unhandled generation");

-    switch (AM.Scale) {
-    case 0: // r + i or just i, depending on HasBaseReg.
+    if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
       return true;
-    case 1:
-      return true; // We have r + r or r + i.
-    case 2:
-      if (AM.HasBaseReg) {
-        // Reject 2 * r + r.
-        return false;
-      }
-      // Allow 2 * r as r + r
-      // Or 2 * r + i is allowed as r + r + i.
+    if (AM.Scale == 1 && AM.HasBaseReg)
       return true;
-    default: // Don't allow n * r
-      return false;
-    }
+
+    return false;
   }
+
+  case AMDGPUAS::PRIVATE_ADDRESS:
+  case AMDGPUAS::UNKNOWN_ADDRESS_SPACE:
+    return isLegalMUBUFAddressingMode(AM);
+
   case AMDGPUAS::LOCAL_ADDRESS:
   case AMDGPUAS::REGION_ADDRESS: {
     // Basic, single offset DS instructions allow a 16-bit unsigned immediate
Index: test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll
@@ -0,0 +1,32 @@
+; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=bonaire < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-CI %s
+; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=tonga < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-VI %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+
+; OPT-LABEL: @test_no_sink_flat_small_offset_i32(
+; OPT: getelementptr i32, i32 addrspace(4)* %in
+; OPT: br i1
+; OPT-NOT: ptrtoint
+
+; GCN-LABEL: {{^}}test_no_sink_flat_small_offset_i32:
+; GCN: flat_load_dword
+; GCN: {{^}}BB0_2:
+define void @test_no_sink_flat_small_offset_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %in, i32 %cond) {
+entry:
+  %out.gep = getelementptr i32, i32 addrspace(4)* %out, i64 999999
+  %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 7
+  %tmp0 = icmp eq i32 %cond, 0
+  br i1 %tmp0, label %endif, label %if
+
+if:
+  %tmp1 = load i32, i32 addrspace(4)* %in.gep
+  br label %endif
+
+endif:
+  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
+  store i32 %x, i32 addrspace(4)* %out.gep
+  br label %done
+
+done:
+  ret void
+}
Index: test/CodeGen/AMDGPU/cgp-addressing-modes.ll
===================================================================
--- test/CodeGen/AMDGPU/cgp-addressing-modes.ll
+++ test/CodeGen/AMDGPU/cgp-addressing-modes.ll
@@ -1,5 +1,7 @@
+; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=tahiti < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-SI %s
 ; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=bonaire < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-CI %s
 ; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=tonga < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-VI %s
+; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
 ; RUN: llc -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s

@@ -115,35 +117,6 @@
   ret void
 }

-; OPT-LABEL: @test_no_sink_flat_small_offset_i32(
-; OPT: getelementptr i32, i32 addrspace(4)* %in
-; OPT: br i1
-; OPT-NOT: ptrtoint
-
-; GCN-LABEL: {{^}}test_no_sink_flat_small_offset_i32:
-; GCN: flat_load_dword
-; GCN: {{^}}BB4_2:
-
-define void @test_no_sink_flat_small_offset_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %in, i32 %cond) {
-entry:
-  %out.gep = getelementptr i32, i32 addrspace(4)* %out, i64 999999
-  %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 7
-  %tmp0 = icmp eq i32 %cond, 0
-  br i1 %tmp0, label %endif, label %if
-
-if:
-  %tmp1 = load i32, i32 addrspace(4)* %in.gep
-  br label %endif
-
-endif:
-  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
-  store i32 %x, i32 addrspace(4)* %out.gep
-  br label %done
-
-done:
-  ret void
-}
-
 ; OPT-LABEL: @test_sink_scratch_small_offset_i32(
 ; OPT-NOT: getelementptr [512 x i32]
 ; OPT: br i1
@@ -153,7 +126,7 @@
 ; GCN: s_and_saveexec_b64
 ; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:4092{{$}}
 ; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:4092{{$}}
-; GCN: {{^}}BB5_2:
+; GCN: {{^}}BB4_2:
 define void @test_sink_scratch_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %cond, i32 %arg) {
 entry:
   %alloca = alloca [512 x i32], align 4
@@ -189,7 +162,7 @@
 ; GCN: s_and_saveexec_b64
 ; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
 ; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
-; GCN: {{^}}BB6_2:
+; GCN: {{^}}BB5_2:
 define void @test_no_sink_scratch_large_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %cond, i32 %arg) {
 entry:
   %alloca = alloca [512 x i32], align 4
@@ -222,7 +195,7 @@
 ; GCN: s_and_saveexec_b64
 ; CI: buffer_load_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; VI: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
-; GCN: {{^}}BB7_2:
+; GCN: {{^}}BB6_2:
 define void @test_sink_global_vreg_sreg_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %offset, i32 %cond) {
 entry:
   %offset.ext = zext i32 %offset to i64
@@ -246,3 +219,219 @@

 attributes #0 = { nounwind readnone }
 attributes #1 = { nounwind }
+
+
+
+; OPT-LABEL: @test_sink_constant_small_offset_i32
+; OPT-NOT: getelementptr i32, i32 addrspace(2)*
+; OPT: br i1
+
+; GCN-LABEL: {{^}}test_sink_constant_small_offset_i32:
+; GCN: s_and_saveexec_b64
+; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x7{{$}}
+; GCN: s_or_b64 exec, exec
+define void @test_sink_constant_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in, i32 %cond) {
+entry:
+  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
+  %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 7
+  %tmp0 = icmp eq i32 %cond, 0
+  br i1 %tmp0, label %endif, label %if
+
+if:
+  %tmp1 = load i32, i32 addrspace(2)* %in.gep
+  br label %endif
+
+endif:
+  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
+  store i32 %x, i32 addrspace(1)* %out.gep
+  br label %done
+
+done:
+  ret void
+}
+
+; OPT-LABEL: @test_sink_constant_max_8_bit_offset_i32
+; OPT-NOT: getelementptr i32, i32 addrspace(2)*
+; OPT: br i1
+
+; GCN-LABEL: {{^}}test_sink_constant_max_8_bit_offset_i32:
+; GCN: s_and_saveexec_b64
+; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xff{{$}}
+; GCN: s_or_b64 exec, exec
+define void @test_sink_constant_max_8_bit_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in, i32 %cond) {
+entry:
+  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
+  %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 255
+  %tmp0 = icmp eq i32 %cond, 0
+  br i1 %tmp0, label %endif, label %if
+
+if:
+  %tmp1 = load i32, i32 addrspace(2)* %in.gep
+  br label %endif
+
+endif:
+  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
+  store i32 %x, i32 addrspace(1)* %out.gep
+  br label %done
+
+done:
+  ret void
+}
+
+; OPT-LABEL: @test_sink_constant_max_8_bit_offset_p1_i32
+; OPT-SI: getelementptr i32, i32 addrspace(2)*
+; OPT-CI-NOT: getelementptr i32, i32 addrspace(2)*
+; OPT-VI-NOT: getelementptr i32, i32 addrspace(2)*
+; OPT: br i1
+
+; GCN-LABEL: {{^}}test_sink_constant_max_8_bit_offset_p1_i32:
+; GCN: s_and_saveexec_b64
+; SI: s_movk_i32 [[OFFSET:s[0-9]+]], 0x400
+
+; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[OFFSET]]{{$}}
+; GCN: s_or_b64 exec, exec
+define void @test_sink_constant_max_8_bit_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in, i32 %cond) {
+entry:
+  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
+  %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 256
+  %tmp0 = icmp eq i32 %cond, 0
+  br i1 %tmp0, label %endif, label %if
+
+if:
+  %tmp1 = load i32, i32 addrspace(2)* %in.gep
+  br label %endif
+
+endif:
+  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
+  store i32 %x, i32 addrspace(1)* %out.gep
+  br label %done
+
+done:
+  ret void
+}
+
+; OPT-LABEL: @test_sink_constant_max_32_bit_offset_i32
+; OPT-SI: getelementptr i32, i32 addrspace(2)*
+; OPT-CI-NOT: getelementptr i32, i32 addrspace(2)*
+; OPT: br i1
+
+; GCN-LABEL: {{^}}test_sink_constant_max_32_bit_offset_i32:
+; GCN: s_and_saveexec_b64
+; GCN-DAG: s_mov_b32 s{{[0-9]+}}, 3{{$}}
+; GCN-DAG: s_mov_b32 s{{[0-9]+}}, -4{{$}}
+; GCN: s_add_u32
+; GCN: s_addc_u32
+; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
+; GCN: s_or_b64 exec, exec
+define void @test_sink_constant_max_32_bit_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in, i32 %cond) {
+entry:
+  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
+  %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 4294967295
+  %tmp0 = icmp eq i32 %cond, 0
+  br i1 %tmp0, label %endif, label %if
+
+if:
+  %tmp1 = load i32, i32 addrspace(2)* %in.gep
+  br label %endif
+
+endif:
+  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
+  store i32 %x, i32 addrspace(1)* %out.gep
+  br label %done
+
+done:
+  ret void
+}
+
+; OPT-LABEL: @test_sink_constant_max_32_bit_offset_p1_i32
+; OPT: getelementptr i32, i32 addrspace(2)*
+; OPT: br i1
+
+; GCN-LABEL: {{^}}test_sink_constant_max_32_bit_offset_p1_i32:
+; GCN: s_and_saveexec_b64
+; GCN: s_add_u32
+; GCN: s_addc_u32
+; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
+; GCN: s_or_b64 exec, exec
+define void @test_sink_constant_max_32_bit_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in, i32 %cond) {
+entry:
+  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
+  %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 4294967296
+  %tmp0 = icmp eq i32 %cond, 0
+  br i1 %tmp0, label %endif, label %if
+
+if:
+  %tmp1 = load i32, i32 addrspace(2)* %in.gep
+  br label %endif
+
+endif:
+  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
+  store i32 %x, i32 addrspace(1)* %out.gep
+  br label %done
+
+done:
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_sink_constant_max_20_bit_byte_offset_i32:
+; GCN: s_and_saveexec_b64
+; SI: s_mov_b32 [[OFFSET:s[0-9]+]], 0xffffc{{$}}
+; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[OFFSET]]{{$}}
+
+; FIXME: CI depends on other patch
+; CI: s_mov_b32 [[OFFSET:s[0-9]+]], 0xffffc{{$}}
+; CI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[OFFSET]]{{$}}
+
+; VI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xffffc{{$}}
+
+; GCN: s_or_b64 exec, exec
+define void @test_sink_constant_max_20_bit_byte_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in, i32 %cond) {
+entry:
+  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
+  %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 262143
+  %tmp0 = icmp eq i32 %cond, 0
+  br i1 %tmp0, label %endif, label %if
+
+if:
+  %tmp1 = load i32, i32 addrspace(2)* %in.gep
+  br label %endif
+
+endif:
+  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
+  store i32 %x, i32 addrspace(1)* %out.gep
+  br label %done
+
+done:
+  ret void
+}
+
+; OPT-LABEL: @test_sink_constant_max_20_bit_byte_offset_p1_i32
+; OPT-SI: getelementptr i32, i32 addrspace(2)*
+; OPT-CI-NOT: getelementptr i32, i32 addrspace(2)*
+; OPT-VI-NOT: getelementptr i32, i32 addrspace(2)*
+; OPT: br i1
+
+; GCN-LABEL: {{^}}test_sink_constant_max_20_bit_byte_offset_p1_i32:
+; GCN: s_and_saveexec_b64
+; GCN: s_mov_b32 [[OFFSET:s[0-9]+]], 0x100000{{$}}
+; GCN: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[OFFSET]]{{$}}
+; GCN: s_or_b64 exec, exec
+define void @test_sink_constant_max_20_bit_byte_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in, i32 %cond) {
+entry:
+  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
+  %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 262144
+  %tmp0 = icmp eq i32 %cond, 0
+  br i1 %tmp0, label %endif, label %if
+
+if:
+  %tmp1 = load i32, i32 addrspace(2)* %in.gep
+  br label %endif
+
+endif:
+  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
+  store i32 %x, i32 addrspace(1)* %out.gep
+  br label %done
+
+done:
+  ret void
+}
Index: test/CodeGen/AMDGPU/salu-to-valu.ll
===================================================================
--- test/CodeGen/AMDGPU/salu-to-valu.ll
+++ test/CodeGen/AMDGPU/salu-to-valu.ll
@@ -75,10 +75,10 @@
   ret void
 }

-; Test moving ann SMRD with an immediate offset to the VALU
+; Test moving an SMRD with an immediate offset to the VALU
 ; CHECK-LABEL: {{^}}smrd_valu2:
-; CHECK: buffer_load_dword
+; CHECK: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
 define void @smrd_valu2(i32 addrspace(1)* %out, [8 x i32] addrspace(2)* %in) {
 entry:
   %0 = call i32 @llvm.r600.read.tidig.x() nounwind readnone
@@ -89,6 +89,34 @@
   ret void
 }

+; CHECK-LABEL: {{^}}smrd_valu2_max_smrd_offset:
+; CHECK: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:1020{{$}}
+define void @smrd_valu2_max_smrd_offset(i32 addrspace(1)* %out, [1024 x i32] addrspace(2)* %in) {
+entry:
+  %0 = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %1 = add i32 %0, 4
+  %2 = getelementptr [1024 x i32], [1024 x i32] addrspace(2)* %in, i32 %0, i32 255
+  %3 = load i32, i32 addrspace(2)* %2
+  store i32 %3, i32 addrspace(1)* %out
+  ret void
+}
+
+; Offset is too big to fit in SMRD 8-bit offset, but small enough to
+; fit in MUBUF offset.
+; FIXME: We should be using the offset but we don't
+
+; CHECK-LABEL: {{^}}smrd_valu2_mubuf_offset:
+; CHECK: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+define void @smrd_valu2_mubuf_offset(i32 addrspace(1)* %out, [1024 x i32] addrspace(2)* %in) {
+entry:
+  %0 = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %1 = add i32 %0, 4
+  %2 = getelementptr [1024 x i32], [1024 x i32] addrspace(2)* %in, i32 %0, i32 256
+  %3 = load i32, i32 addrspace(2)* %2
+  store i32 %3, i32 addrspace(1)* %out
+  ret void
+}
+
 ; CHECK-LABEL: {{^}}s_load_imm_v8i32:
 ; CHECK: buffer_load_dwordx4
 ; CHECK: buffer_load_dwordx4
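
For reference, a minimal IR sketch of the SMRD case the new CONSTANT_ADDRESS handling accepts on SI (an illustrative example with a hypothetical function name, not taken from the patch's tests): the base offset must be dword-aligned and, divided by 4, fit in 8 bits, so a byte offset of 1020 (dword index 255) is the largest that is still reported legal and can fold into s_load_dword on SI.

; Illustrative sketch only: offset 255 dwords = 1020 bytes, which satisfies
; isUInt<8>(AM.BaseOffs / 4) in the SOUTHERN_ISLANDS branch above.
define void @sketch_constant_smrd_offset(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
  %gep = getelementptr i32, i32 addrspace(2)* %in, i64 255
  %val = load i32, i32 addrspace(2)* %gep
  store i32 %val, i32 addrspace(1)* %out
  ret void
}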