Index: include/llvm/IR/IntrinsicsAMDGPU.td =================================================================== --- include/llvm/IR/IntrinsicsAMDGPU.td +++ include/llvm/IR/IntrinsicsAMDGPU.td @@ -229,29 +229,30 @@ >; class AMDGPUImageLoad : Intrinsic < - [llvm_v4f32_ty], // vdata(VGPR) + [llvm_anyfloat_ty], // vdata(VGPR) [llvm_anyint_ty, // vaddr(VGPR) - llvm_v8i32_ty, // rsrc(SGPR) + llvm_anyint_ty, // rsrc(SGPR) llvm_i32_ty, // dmask(imm) - llvm_i1_ty, // r128(imm) - llvm_i1_ty, // da(imm) llvm_i1_ty, // glc(imm) - llvm_i1_ty], // slc(imm) + llvm_i1_ty, // slc(imm) + llvm_i1_ty, // lwe(imm) + llvm_i1_ty], // da(imm) [IntrReadMem]>; def int_amdgcn_image_load : AMDGPUImageLoad; def int_amdgcn_image_load_mip : AMDGPUImageLoad; +def int_amdgcn_image_getresinfo : AMDGPUImageLoad; class AMDGPUImageStore : Intrinsic < [], - [llvm_v4f32_ty, // vdata(VGPR) + [llvm_anyfloat_ty, // vdata(VGPR) llvm_anyint_ty, // vaddr(VGPR) - llvm_v8i32_ty, // rsrc(SGPR) + llvm_anyint_ty, // rsrc(SGPR) llvm_i32_ty, // dmask(imm) - llvm_i1_ty, // r128(imm) - llvm_i1_ty, // da(imm) llvm_i1_ty, // glc(imm) - llvm_i1_ty], // slc(imm) + llvm_i1_ty, // slc(imm) + llvm_i1_ty, // lwe(imm) + llvm_i1_ty], // da(imm) []>; def int_amdgcn_image_store : AMDGPUImageStore; Index: lib/Target/AMDGPU/SIInstructions.td =================================================================== --- lib/Target/AMDGPU/SIInstructions.td +++ lib/Target/AMDGPU/SIInstructions.td @@ -2446,32 +2446,36 @@ def : ImagePattern(opcode # _V4_V4), v4i32>; } -class ImageLoadPattern : Pat < - (name vt:$addr, v8i32:$rsrc, imm:$dmask, imm:$r128, imm:$da, imm:$glc, - imm:$slc), - (opcode $addr, $rsrc, +multiclass ImageLoadPattern { + def : Pat < + (v4f32 (name vt:$addr, v8i32:$rsrc, i32:$dmask, i1:$glc, i1:$slc, i1:$lwe, + i1:$da)), + (opcode $addr, $rsrc, (as_i32imm $dmask), 1, (as_i1imm $glc), (as_i1imm $slc), - (as_i1imm $r128), 0, 0, (as_i1imm $da)) ->; + 0, 0, (as_i1imm $lwe), (as_i1imm $da)) + >; +} multiclass ImageLoadPatterns { - 
def : ImageLoadPattern(opcode # _V4_V1), i32>; - def : ImageLoadPattern(opcode # _V4_V2), v2i32>; - def : ImageLoadPattern(opcode # _V4_V4), v4i32>; + defm : ImageLoadPattern(opcode # _V4_V1), i32>; + defm : ImageLoadPattern(opcode # _V4_V2), v2i32>; + defm : ImageLoadPattern(opcode # _V4_V4), v4i32>; } -class ImageStorePattern : Pat < - (name v4f32:$data, vt:$addr, v8i32:$rsrc, i32:$dmask, imm:$r128, imm:$da, - imm:$glc, imm:$slc), - (opcode $data, $addr, $rsrc, +multiclass ImageStorePattern { + def : Pat < + (name v4f32:$data, vt:$addr, v8i32:$rsrc, i32:$dmask, i1:$glc, i1:$slc, + i1:$lwe, i1:$da), + (opcode $data, $addr, $rsrc, (as_i32imm $dmask), 1, (as_i1imm $glc), (as_i1imm $slc), - (as_i1imm $r128), 0, 0, (as_i1imm $da)) ->; + 0, 0, (as_i1imm $lwe), (as_i1imm $da)) + >; +} multiclass ImageStorePatterns { - def : ImageStorePattern(opcode # _V4_V1), i32>; - def : ImageStorePattern(opcode # _V4_V2), v2i32>; - def : ImageStorePattern(opcode # _V4_V4), v4i32>; + defm : ImageStorePattern(opcode # _V4_V1), i32>; + defm : ImageStorePattern(opcode # _V4_V2), v2i32>; + defm : ImageStorePattern(opcode # _V4_V4), v4i32>; } class ImageAtomicPattern : Pat < @@ -2592,6 +2596,7 @@ defm : ImagePatterns; defm : ImageLoadPatterns; defm : ImageLoadPatterns; +defm : ImageLoadPattern; defm : ImageStorePatterns; defm : ImageStorePatterns; defm : ImageAtomicPatterns; Index: test/CodeGen/AMDGPU/llvm.amdgcn.image.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.image.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.image.ll @@ -6,7 +6,7 @@ ;CHECK: s_waitcnt vmcnt(0) define amdgpu_ps <4 x float> @image_load_v4i32(<8 x i32> inreg %rsrc, <4 x i32> %c) { main_body: - %tex = call <4 x float> @llvm.amdgcn.image.load.v4i32(<4 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0) + %tex = call <4 x float> @llvm.amdgcn.image.load.v4f32.v4i32.v8i32(<4 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0) ret <4 x float> %tex } @@ 
-15,7 +15,7 @@ ;CHECK: s_waitcnt vmcnt(0) define amdgpu_ps <4 x float> @image_load_v2i32(<8 x i32> inreg %rsrc, <2 x i32> %c) { main_body: - %tex = call <4 x float> @llvm.amdgcn.image.load.v2i32(<2 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0) + %tex = call <4 x float> @llvm.amdgcn.image.load.v4f32.v2i32.v8i32(<2 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0) ret <4 x float> %tex } @@ -24,7 +24,7 @@ ;CHECK: s_waitcnt vmcnt(0) define amdgpu_ps <4 x float> @image_load_i32(<8 x i32> inreg %rsrc, i32 %c) { main_body: - %tex = call <4 x float> @llvm.amdgcn.image.load.i32(i32 %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0) + %tex = call <4 x float> @llvm.amdgcn.image.load.v4f32.i32.v8i32(i32 %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0) ret <4 x float> %tex } @@ -33,7 +33,7 @@ ;CHECK: s_waitcnt vmcnt(0) define amdgpu_ps <4 x float> @image_load_mip(<8 x i32> inreg %rsrc, <4 x i32> %c) { main_body: - %tex = call <4 x float> @llvm.amdgcn.image.load.mip.v4i32(<4 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0) + %tex = call <4 x float> @llvm.amdgcn.image.load.mip.v4f32.v4i32.v8i32(<4 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0) ret <4 x float> %tex } @@ -42,7 +42,7 @@ ;CHECK: s_waitcnt vmcnt(0) define amdgpu_ps float @image_load_1(<8 x i32> inreg %rsrc, <4 x i32> %c) { main_body: - %tex = call <4 x float> @llvm.amdgcn.image.load.v4i32(<4 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0) + %tex = call <4 x float> @llvm.amdgcn.image.load.v4f32.v4i32.v8i32(<4 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0) %elt = extractelement <4 x float> %tex, i32 0 ; Only first component used, test that dmask etc. 
is changed accordingly ret float %elt @@ -52,7 +52,7 @@ ;CHECK: image_store v[0:3], v[4:7], s[0:7] dmask:0xf unorm define amdgpu_ps void @image_store_v4i32(<8 x i32> inreg %rsrc, <4 x float> %data, <4 x i32> %coords) { main_body: - call void @llvm.amdgcn.image.store.v4i32(<4 x float> %data, <4 x i32> %coords, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0) + call void @llvm.amdgcn.image.store.v4f32.v4i32.v8i32(<4 x float> %data, <4 x i32> %coords, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0) ret void } @@ -60,7 +60,7 @@ ;CHECK: image_store v[0:3], v[4:5], s[0:7] dmask:0xf unorm define amdgpu_ps void @image_store_v2i32(<8 x i32> inreg %rsrc, <4 x float> %data, <2 x i32> %coords) { main_body: - call void @llvm.amdgcn.image.store.v2i32(<4 x float> %data, <2 x i32> %coords, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0) + call void @llvm.amdgcn.image.store.v4f32.v2i32.v8i32(<4 x float> %data, <2 x i32> %coords, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0) ret void } @@ -68,7 +68,7 @@ ;CHECK: image_store v[0:3], v4, s[0:7] dmask:0xf unorm define amdgpu_ps void @image_store_i32(<8 x i32> inreg %rsrc, <4 x float> %data, i32 %coords) { main_body: - call void @llvm.amdgcn.image.store.i32(<4 x float> %data, i32 %coords, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0) + call void @llvm.amdgcn.image.store.v4f32.i32.v8i32(<4 x float> %data, i32 %coords, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0) ret void } @@ -76,10 +76,24 @@ ;CHECK: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf unorm define amdgpu_ps void @image_store_mip(<8 x i32> inreg %rsrc, <4 x float> %data, <4 x i32> %coords) { main_body: - call void @llvm.amdgcn.image.store.mip.v4i32(<4 x float> %data, <4 x i32> %coords, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0) + call void @llvm.amdgcn.image.store.mip.v4f32.v4i32.v8i32(<4 x float> %data, <4 x i32> %coords, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0) ret void } +;CHECK-LABEL: {{^}}getresinfo: +;CHECK: image_get_resinfo 
{{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf +define amdgpu_ps void @getresinfo() { +main_body: + %r = call <4 x float> @llvm.amdgcn.image.getresinfo.v4f32.i32.v8i32(i32 undef, <8 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0) + %r0 = extractelement <4 x float> %r, i32 0 + %r1 = extractelement <4 x float> %r, i32 1 + %r2 = extractelement <4 x float> %r, i32 2 + %r3 = extractelement <4 x float> %r, i32 3 + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3) + ret void +} + + ; Ideally, the register allocator would avoid the wait here ; ;CHECK-LABEL: {{^}}image_store_wait: @@ -90,21 +104,25 @@ ;CHECK: image_store v[0:3], v4, s[16:23] dmask:0xf unorm define amdgpu_ps void @image_store_wait(<8 x i32> inreg, <8 x i32> inreg, <8 x i32> inreg, <4 x float>, i32) { main_body: - call void @llvm.amdgcn.image.store.i32(<4 x float> %3, i32 %4, <8 x i32> %0, i32 15, i1 0, i1 0, i1 0, i1 0) - %data = call <4 x float> @llvm.amdgcn.image.load.i32(i32 %4, <8 x i32> %1, i32 15, i1 0, i1 0, i1 0, i1 0) - call void @llvm.amdgcn.image.store.i32(<4 x float> %data, i32 %4, <8 x i32> %2, i32 15, i1 0, i1 0, i1 0, i1 0) + call void @llvm.amdgcn.image.store.v4f32.i32.v8i32(<4 x float> %3, i32 %4, <8 x i32> %0, i32 15, i1 0, i1 0, i1 0, i1 0) + %data = call <4 x float> @llvm.amdgcn.image.load.v4f32.i32.v8i32(i32 %4, <8 x i32> %1, i32 15, i1 0, i1 0, i1 0, i1 0) + call void @llvm.amdgcn.image.store.v4f32.i32.v8i32(<4 x float> %data, i32 %4, <8 x i32> %2, i32 15, i1 0, i1 0, i1 0, i1 0) ret void } -declare void @llvm.amdgcn.image.store.i32(<4 x float>, i32, <8 x i32>, i32, i1, i1, i1, i1) #0 -declare void @llvm.amdgcn.image.store.v2i32(<4 x float>, <2 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #0 -declare void @llvm.amdgcn.image.store.v4i32(<4 x float>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #0 -declare void @llvm.amdgcn.image.store.mip.v4i32(<4 x float>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #0 +declare void 
@llvm.amdgcn.image.store.v4f32.i32.v8i32(<4 x float>, i32, <8 x i32>, i32, i1, i1, i1, i1) #0 +declare void @llvm.amdgcn.image.store.v4f32.v2i32.v8i32(<4 x float>, <2 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #0 +declare void @llvm.amdgcn.image.store.v4f32.v4i32.v8i32(<4 x float>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #0 +declare void @llvm.amdgcn.image.store.mip.v4f32.v4i32.v8i32(<4 x float>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #0 + +declare <4 x float> @llvm.amdgcn.image.load.v4f32.i32.v8i32(i32, <8 x i32>, i32, i1, i1, i1, i1) #1 +declare <4 x float> @llvm.amdgcn.image.load.v4f32.v2i32.v8i32(<2 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1 +declare <4 x float> @llvm.amdgcn.image.load.v4f32.v4i32.v8i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1 +declare <4 x float> @llvm.amdgcn.image.load.mip.v4f32.v4i32.v8i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1 + +declare <4 x float> @llvm.amdgcn.image.getresinfo.v4f32.i32.v8i32(i32, <8 x i32>, i32, i1, i1, i1, i1) #0 -declare <4 x float> @llvm.amdgcn.image.load.i32(i32, <8 x i32>, i32, i1, i1, i1, i1) #1 -declare <4 x float> @llvm.amdgcn.image.load.v2i32(<2 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1 -declare <4 x float> @llvm.amdgcn.image.load.v4i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1 -declare <4 x float> @llvm.amdgcn.image.load.mip.v4i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1 +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) attributes #0 = { nounwind } attributes #1 = { nounwind readonly } Index: test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll @@ -7,9 +7,9 @@ ; CHECK-NEXT: image_store ; CHECK-NEXT: s_endpgm define amdgpu_ps void @test1(<8 x i32> inreg %rsrc, <4 x float> %d0, <4 x float> %d1, i32 %c0, i32 %c1) { - call void @llvm.amdgcn.image.store.i32(<4 x float> %d0, i32 %c0, <8 x i32> %rsrc, i32 
15, i1 0, i1 0, i1 1, i1 0) + call void @llvm.amdgcn.image.store.v4f32.i32.v8i32(<4 x float> %d0, i32 %c0, <8 x i32> %rsrc, i32 15, i1 1, i1 0, i1 0, i1 0) call void @llvm.amdgcn.s.waitcnt(i32 3840) ; 0xf00 - call void @llvm.amdgcn.image.store.i32(<4 x float> %d1, i32 %c1, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 1, i1 0) + call void @llvm.amdgcn.image.store.v4f32.i32.v8i32(<4 x float> %d1, i32 %c1, <8 x i32> %rsrc, i32 15, i1 1, i1 0, i1 0, i1 0) ret void } @@ -22,17 +22,17 @@ ; CHECK: s_waitcnt ; CHECK-NEXT: image_store define amdgpu_ps void @test2(<8 x i32> inreg %rsrc, i32 %c) { - %t = call <4 x float> @llvm.amdgcn.image.load.i32(i32 %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0) + %t = call <4 x float> @llvm.amdgcn.image.load.v4f32.i32.v8i32(i32 %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0) call void @llvm.amdgcn.s.waitcnt(i32 3840) ; 0xf00 %c.1 = mul i32 %c, 2 - call void @llvm.amdgcn.image.store.i32(<4 x float> %t, i32 %c.1, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0) + call void @llvm.amdgcn.image.store.v4f32.i32.v8i32(<4 x float> %t, i32 %c.1, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0) ret void } declare void @llvm.amdgcn.s.waitcnt(i32) #0 -declare <4 x float> @llvm.amdgcn.image.load.i32(i32, <8 x i32>, i32, i1, i1, i1, i1) #1 -declare void @llvm.amdgcn.image.store.i32(<4 x float>, i32, <8 x i32>, i32, i1, i1, i1, i1) #0 +declare <4 x float> @llvm.amdgcn.image.load.v4f32.i32.v8i32(i32, <8 x i32>, i32, i1, i1, i1, i1) #1 +declare void @llvm.amdgcn.image.store.v4f32.i32.v8i32(<4 x float>, i32, <8 x i32>, i32, i1, i1, i1, i1) #0 attributes #0 = { nounwind } attributes #1 = { nounwind readonly } Index: test/CodeGen/AMDGPU/wqm.ll =================================================================== --- test/CodeGen/AMDGPU/wqm.ll +++ test/CodeGen/AMDGPU/wqm.ll @@ -7,8 +7,8 @@ ;CHECK-NOT: s_wqm define amdgpu_ps <4 x float> @test1(<8 x i32> inreg %rsrc, <4 x i32> %c) { main_body: - %tex = call <4 x float> @llvm.amdgcn.image.load.v4i32(<4 x 
i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0) - call void @llvm.amdgcn.image.store.v4i32(<4 x float> %tex, <4 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0) + %tex = call <4 x float> @llvm.amdgcn.image.load.v4f32.v4i32.v8i32(<4 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0) + call void @llvm.amdgcn.image.store.v4f32.v4i32.v8i32(<4 x float> %tex, <4 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0) ret <4 x float> %tex } @@ -355,7 +355,7 @@ ; CHECK: ; return define amdgpu_ps <4 x float> @test_loop_vcc(<4 x float> %in) nounwind { entry: - call void @llvm.amdgcn.image.store.v4i32(<4 x float> %in, <4 x i32> undef, <8 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0) + call void @llvm.amdgcn.image.store.v4f32.v4i32.v8i32(<4 x float> %in, <4 x i32> undef, <8 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0) br label %loop loop: @@ -415,11 +415,11 @@ } -declare void @llvm.amdgcn.image.store.v4i32(<4 x float>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1 +declare void @llvm.amdgcn.image.store.v4f32.v4i32.v8i32(<4 x float>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1 declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) #1 declare void @llvm.amdgcn.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #1 -declare <4 x float> @llvm.amdgcn.image.load.v4i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #2 +declare <4 x float> @llvm.amdgcn.image.load.v4f32.v4i32.v8i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #2 declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #2 declare <4 x float> @llvm.SI.image.sample.i32(i32, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #3