Index: lib/Target/AMDGPU/MIMGInstructions.td =================================================================== --- lib/Target/AMDGPU/MIMGInstructions.td +++ lib/Target/AMDGPU/MIMGInstructions.td @@ -368,9 +368,9 @@ // 1. Handle half data type like v4f16, and add D16 bit support; // 2. Handle v4i32 rsrc type (Register Class for the instruction to be SReg_128). // 3. Add A16 support when we pass address of half type. -multiclass AMDGCNSamplePattern { +multiclass AMDGCNSamplePattern { def : Pat< - (v4f32 (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, i32:$dmask, i1:$unorm, i1:$glc, + (dt (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, i32:$dmask, i1:$unorm, i1:$glc, i1:$slc, i1:$lwe, i1:$da)), (opcode $addr, $rsrc, $sampler, (as_i32imm $dmask), (as_i1imm $unorm), (as_i1imm $glc), (as_i1imm $slc), @@ -378,12 +378,19 @@ >; } +multiclass AMDGCNSampleDataPatterns { + defm : AMDGCNSamplePattern(opcode # _V1), dt, f32>; + defm : AMDGCNSamplePattern(opcode # _V2), dt, v2f32>; + defm : AMDGCNSamplePattern(opcode # _V4), dt, v4f32>; + defm : AMDGCNSamplePattern(opcode # _V8), dt, v8f32>; + defm : AMDGCNSamplePattern(opcode # _V16), dt, v16f32>; +} + +// TODO: support v3f32. multiclass AMDGCNSamplePatterns { - defm : AMDGCNSamplePattern(opcode # _V4_V1), f32>; - defm : AMDGCNSamplePattern(opcode # _V4_V2), v2f32>; - defm : AMDGCNSamplePattern(opcode # _V4_V4), v4f32>; - defm : AMDGCNSamplePattern(opcode # _V4_V8), v8f32>; - defm : AMDGCNSamplePattern(opcode # _V4_V16), v16f32>; + defm : AMDGCNSampleDataPatterns(opcode # _V1), f32>; + defm : AMDGCNSampleDataPatterns(opcode # _V2), v2f32>; + defm : AMDGCNSampleDataPatterns(opcode # _V4), v4f32>; } // Image only @@ -401,9 +408,9 @@ def : ImagePattern(opcode # _V4_V4), v4i32>; } -multiclass ImageLoadPattern { +multiclass ImageLoadPattern { def : Pat < - (v4f32 (name vt:$addr, v8i32:$rsrc, i32:$dmask, i1:$glc, i1:$slc, i1:$lwe, + (dt (name vt:$addr, v8i32:$rsrc, i32:$dmask, i1:$glc, i1:$slc, i1:$lwe, i1:$da)), (opcode $addr, $rsrc, (as_i32imm $dmask), 1, (as_i1imm $glc), (as_i1imm $slc), @@ -411,15 +418,22 @@ >; } +multiclass ImageLoadDataPatterns { + defm : ImageLoadPattern(opcode # _V1), dt, i32>; + defm : ImageLoadPattern(opcode # _V2), dt, v2i32>; + defm : ImageLoadPattern(opcode # _V4), dt, v4i32>; +} + +// TODO: support v3f32. multiclass ImageLoadPatterns { - defm : ImageLoadPattern(opcode # _V4_V1), i32>; - defm : ImageLoadPattern(opcode # _V4_V2), v2i32>; - defm : ImageLoadPattern(opcode # _V4_V4), v4i32>; + defm : ImageLoadDataPatterns(opcode # _V1), f32>; + defm : ImageLoadDataPatterns(opcode # _V2), v2f32>; + defm : ImageLoadDataPatterns(opcode # _V4), v4f32>; } -multiclass ImageStorePattern { +multiclass ImageStorePattern { def : Pat < - (name v4f32:$data, vt:$addr, v8i32:$rsrc, i32:$dmask, i1:$glc, i1:$slc, + (name dt:$data, vt:$addr, v8i32:$rsrc, i32:$dmask, i1:$glc, i1:$slc, i1:$lwe, i1:$da), (opcode $data, $addr, $rsrc, (as_i32imm $dmask), 1, (as_i1imm $glc), (as_i1imm $slc), @@ -427,10 +441,17 @@ >; } +multiclass ImageStoreDataPatterns { + defm : ImageStorePattern(opcode # _V1), dt, i32>; + defm : ImageStorePattern(opcode # _V2), dt, v2i32>; + defm : ImageStorePattern(opcode # _V4), dt, v4i32>; +} + +// TODO: support v3f32. multiclass ImageStorePatterns { - defm : ImageStorePattern(opcode # _V4_V1), i32>; - defm : ImageStorePattern(opcode # _V4_V2), v2i32>; - defm : ImageStorePattern(opcode # _V4_V4), v4i32>; + defm : ImageStoreDataPatterns(opcode # _V1), f32>; + defm : ImageStoreDataPatterns(opcode # _V2), v2f32>; + defm : ImageStoreDataPatterns(opcode # _V4), v4f32>; } class ImageAtomicPattern : Pat < @@ -558,7 +579,7 @@ // Image load defm : ImageLoadPatterns; defm : ImageLoadPatterns; -defm : ImageLoadPattern; +defm : ImageLoadPatterns; // Image store defm : ImageStorePatterns; @@ -613,49 +634,35 @@ defm : AMDGCNSamplePatterns; // Gather opcodes -// Only the variants which make sense are defined. -defm : AMDGCNSamplePattern; -defm : AMDGCNSamplePattern; -defm : AMDGCNSamplePattern; -defm : AMDGCNSamplePattern; -defm : AMDGCNSamplePattern; -defm : AMDGCNSamplePattern; -defm : AMDGCNSamplePattern; -defm : AMDGCNSamplePattern; -defm : AMDGCNSamplePattern; - -defm : AMDGCNSamplePattern; -defm : AMDGCNSamplePattern; -defm : AMDGCNSamplePattern; -defm : AMDGCNSamplePattern; -defm : AMDGCNSamplePattern; -defm : AMDGCNSamplePattern; -defm : AMDGCNSamplePattern; -defm : AMDGCNSamplePattern; -defm : AMDGCNSamplePattern; - -defm : AMDGCNSamplePattern; -defm : AMDGCNSamplePattern; -defm : AMDGCNSamplePattern; -defm : AMDGCNSamplePattern; -defm : AMDGCNSamplePattern; -defm : AMDGCNSamplePattern; -defm : AMDGCNSamplePattern; -defm : AMDGCNSamplePattern; -defm : AMDGCNSamplePattern; - -defm : AMDGCNSamplePattern; -defm : AMDGCNSamplePattern; -defm : AMDGCNSamplePattern; -defm : AMDGCNSamplePattern; -defm : AMDGCNSamplePattern; -defm : AMDGCNSamplePattern; -defm : AMDGCNSamplePattern; -defm : AMDGCNSamplePattern; - -defm : AMDGCNSamplePattern; -defm : AMDGCNSamplePattern; -defm : AMDGCNSamplePattern; +defm : AMDGCNSamplePatterns; +defm : AMDGCNSamplePatterns; +defm : AMDGCNSamplePatterns; +defm : AMDGCNSamplePatterns; +defm : AMDGCNSamplePatterns; +defm : AMDGCNSamplePatterns; + +defm : AMDGCNSamplePatterns; +defm : AMDGCNSamplePatterns; +defm : AMDGCNSamplePatterns; +defm : AMDGCNSamplePatterns; +defm : AMDGCNSamplePatterns; +defm : AMDGCNSamplePatterns; + +defm : AMDGCNSamplePatterns; +defm : AMDGCNSamplePatterns; +defm : AMDGCNSamplePatterns; +defm : AMDGCNSamplePatterns; +defm : AMDGCNSamplePatterns; +defm : AMDGCNSamplePatterns; + +defm : AMDGCNSamplePatterns; +defm : AMDGCNSamplePatterns; +defm : AMDGCNSamplePatterns; +defm : AMDGCNSamplePatterns; +defm : AMDGCNSamplePatterns; +defm : AMDGCNSamplePatterns; + +defm : AMDGCNSamplePatterns; // Image atomics defm : ImageAtomicPatterns; Index: lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/SIISelLowering.cpp +++ lib/Target/AMDGPU/SIISelLowering.cpp @@ -3883,13 +3883,16 @@ if (TII->isMIMG(MI)) { unsigned VReg = MI.getOperand(0).getReg(); + const TargetRegisterClass *RC = MRI.getRegClass(VReg); + // TODO: Need mapping tables to handle other cases (register classes). + if (RC != &AMDGPU::VReg_128RegClass) + return; + unsigned DmaskIdx = MI.getNumOperands() == 12 ? 3 : 4; unsigned Writemask = MI.getOperand(DmaskIdx).getImm(); unsigned BitsSet = 0; for (unsigned i = 0; i < 4; ++i) BitsSet += Writemask & (1 << i) ? 1 : 0; - - const TargetRegisterClass *RC; switch (BitsSet) { default: return; case 1: RC = &AMDGPU::VGPR_32RegClass; break; Index: test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.ll @@ -320,6 +320,23 @@ ret void } +; GCN-LABEL: {{^}}gather4_f32: +; GCN: image_gather4 {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da +define void @gather4_f32(float addrspace(1)* %out) { +main_body: + %r = call float @llvm.amdgcn.image.gather4.f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) + store float %r, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}gather4_v2f32: +; GCN: image_gather4 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x3 da +define void @gather4_v2f32(<2 x float> addrspace(1)* %out) { +main_body: + %r = call <2 x float> @llvm.amdgcn.image.gather4.v2f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 3, i1 0, i1 0, i1 0, i1 0, i1 1) + store <2 x float> %r, <2 x float> addrspace(1)* %out + ret void +} declare <4 x float> @llvm.amdgcn.image.gather4.v4f32.v2f32.v8i32(<2 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #0 declare <4 x float> @llvm.amdgcn.image.gather4.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #0 @@ -360,5 +377,7 @@ declare <4 x float> @llvm.amdgcn.image.gather4.c.lz.o.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #0 declare <4 x float> @llvm.amdgcn.image.gather4.c.lz.o.v4f32.v8f32.v8i32(<8 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #0 +declare float @llvm.amdgcn.image.gather4.f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #0 +declare <2 x float> @llvm.amdgcn.image.gather4.v2f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #0 attributes #0 = { nounwind readnone } Index: test/CodeGen/AMDGPU/llvm.amdgcn.image.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.image.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.image.ll @@ -48,6 +48,25 @@ ret float %elt } +;CHECK-LABEL: {{^}}image_load_f32_v2i32: +;CHECK: image_load {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 unorm +;CHECK: s_waitcnt vmcnt(0) +define amdgpu_ps float @image_load_f32_v2i32(<8 x i32> inreg %rsrc, <2 x i32> %c) { +main_body: + %tex = call float @llvm.amdgcn.image.load.f32.v2i32.v8i32(<2 x i32> %c, <8 x i32> %rsrc, i32 1, i1 0, i1 0, i1 0, i1 0) + ret float %tex +} + +;CHECK-LABEL: {{^}}image_load_v2f32_v4i32: +;CHECK: image_load {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x3 unorm +;CHECK: s_waitcnt vmcnt(0) +define amdgpu_ps <2 x float> @image_load_v2f32_v4i32(<8 x i32> inreg %rsrc, <4 x i32> %c) { +main_body: + %tex = call <2 x float> @llvm.amdgcn.image.load.v2f32.v4i32.v8i32(<4 x i32> %c, <8 x i32> %rsrc, i32 3, i1 0, i1 0, i1 0, i1 0) + ret <2 x float> %tex +} + + ;CHECK-LABEL: {{^}}image_store_v4i32: ;CHECK: image_store v[0:3], v[4:7], s[0:7] dmask:0xf unorm define amdgpu_ps void @image_store_v4i32(<8 x i32> inreg %rsrc, <4 x float> %data, <4 x i32> %coords) { @@ -72,6 +91,22 @@ ret void } +;CHECK-LABEL: {{^}}image_store_f32_i32: +;CHECK: image_store {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 unorm +define amdgpu_ps void @image_store_f32_i32(<8 x i32> inreg %rsrc, float %data, i32 %coords) { +main_body: + call void @llvm.amdgcn.image.store.f32.i32.v8i32(float %data, i32 %coords, <8 x i32> %rsrc, i32 1, i1 0, i1 0, i1 0, i1 0) + ret void +} + +;CHECK-LABEL: {{^}}image_store_v2f32_v4i32: +;CHECK: image_store {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x3 unorm +define amdgpu_ps void @image_store_v2f32_v4i32(<8 x i32> inreg %rsrc, <2 x float> %data, <4 x i32> %coords) { +main_body: + call void @llvm.amdgcn.image.store.v2f32.v4i32.v8i32(<2 x float> %data, <4 x i32> %coords, <8 x i32> %rsrc, i32 3, i1 0, i1 0, i1 0, i1 0) + ret void +} + ;CHECK-LABEL: {{^}}image_store_mip: ;CHECK: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf unorm define amdgpu_ps void @image_store_mip(<8 x i32> inreg %rsrc, <4 x float> %data, <4 x i32> %coords) { @@ -93,7 +128,6 @@ ret void } - ; Ideally, the register allocator would avoid the wait here ; ;CHECK-LABEL: {{^}}image_store_wait: @@ -110,6 +144,13 @@ ret void } + +declare float @llvm.amdgcn.image.load.f32.v2i32.v8i32(<2 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1 +declare <2 x float> @llvm.amdgcn.image.load.v2f32.v4i32.v8i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1 +declare void @llvm.amdgcn.image.store.f32.i32.v8i32(float, i32, <8 x i32>, i32, i1, i1, i1, i1) #0 +declare void @llvm.amdgcn.image.store.v2f32.v4i32.v8i32(<2 x float>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #0 + + declare void @llvm.amdgcn.image.store.v4f32.i32.v8i32(<4 x float>, i32, <8 x i32>, i32, i1, i1, i1, i1) #0 declare void @llvm.amdgcn.image.store.v4f32.v2i32.v8i32(<4 x float>, <2 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #0 declare void @llvm.amdgcn.image.store.v4f32.v4i32.v8i32(<4 x float>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #0 Index: test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.ll @@ -181,6 +181,23 @@ ret void } +; GCN-LABEL: {{^}}sample_f32: +; GCN: image_sample {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 +define void @sample_f32(float addrspace(1)* %out) { +main_body: + %r = call float @llvm.amdgcn.image.sample.f32.v2f32.v8i32(<2 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 0) + store float %r, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}sample_v2f32: +; GCN: image_sample {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x3 +define void @sample_v2f32(<2 x float> addrspace(1)* %out) { +main_body: + %r = call <2 x float> @llvm.amdgcn.image.sample.v2f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 3, i1 0, i1 0, i1 0, i1 0, i1 0) + store <2 x float> %r, <2 x float> addrspace(1)* %out + ret void +} declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #0 declare <4 x float> @llvm.amdgcn.image.sample.cl.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #0 @@ -204,5 +221,7 @@ declare <4 x float> @llvm.amdgcn.image.sample.c.cd.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #0 declare <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #0 +declare float @llvm.amdgcn.image.sample.f32.v2f32.v8i32(<2 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #0 +declare <2 x float> @llvm.amdgcn.image.sample.v2f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #0 attributes #0 = { nounwind readnone }