Index: llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td =================================================================== --- llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td +++ llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td @@ -143,6 +143,35 @@ [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem] >; +class AMDGPUImageLoad : Intrinsic < + [llvm_v4f32_ty], // vdata(VGPR) + [llvm_anyint_ty, // vaddr(VGPR) + llvm_v8i32_ty, // rsrc(SGPR) + llvm_i32_ty, // dmask(imm) + llvm_i1_ty, // r128(imm) + llvm_i1_ty, // da(imm) + llvm_i1_ty, // glc(imm) + llvm_i1_ty], // slc(imm) + [IntrReadMem]>; + +def int_amdgcn_image_load : AMDGPUImageLoad; +def int_amdgcn_image_load_mip : AMDGPUImageLoad; + +class AMDGPUImageStore : Intrinsic < + [], + [llvm_v4f32_ty, // vdata(VGPR) + llvm_anyint_ty, // vaddr(VGPR) + llvm_v8i32_ty, // rsrc(SGPR) + llvm_i32_ty, // dmask(imm) + llvm_i1_ty, // r128(imm) + llvm_i1_ty, // da(imm) + llvm_i1_ty, // glc(imm) + llvm_i1_ty], // slc(imm) + []>; + +def int_amdgcn_image_store : AMDGPUImageStore; +def int_amdgcn_image_store_mip : AMDGPUImageStore; + def int_amdgcn_read_workdim : AMDGPUReadPreloadRegisterIntrinsic < "__builtin_amdgcn_read_workdim">; Index: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp +++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp @@ -2766,12 +2766,13 @@ SelectionDAG &DAG) const { const SIInstrInfo *TII = static_cast(Subtarget->getInstrInfo()); + unsigned Opcode = Node->getMachineOpcode(); - if (TII->isMIMG(Node->getMachineOpcode())) + if (TII->isMIMG(Opcode) && !TII->get(Opcode).mayStore()) adjustWritemask(Node, DAG); - if (Node->getMachineOpcode() == AMDGPU::INSERT_SUBREG || - Node->getMachineOpcode() == AMDGPU::REG_SEQUENCE) { + if (Opcode == AMDGPU::INSERT_SUBREG || + Opcode == AMDGPU::REG_SEQUENCE) { legalizeTargetIndependentNode(Node, DAG); return Node; } Index: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.td =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.td +++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.td @@ -2905,12 +2905,12 @@ class MIMG_NoSampler_Helper op, string asm, RegisterClass dst_rc, - RegisterClass src_rc, + RegisterClass addr_rc, string dns=""> : MIMG_Helper < op, (outs dst_rc:$vdata), (ins i32imm:$dmask, i1imm:$unorm, i1imm:$glc, i1imm:$da, i1imm:$r128, - i1imm:$tfe, i1imm:$lwe, i1imm:$slc, src_rc:$vaddr, + i1imm:$tfe, i1imm:$lwe, i1imm:$slc, addr_rc:$vaddr, SReg_256:$srsrc), asm#" $vdata, $dmask, $unorm, $glc, $da, $r128," #" $tfe, $lwe, $slc, $vaddr, $srsrc", @@ -2937,6 +2937,41 @@ defm _V4 : MIMG_NoSampler_Src_Helper ; } +class MIMG_Store_Helper op, string asm, + RegisterClass data_rc, + RegisterClass addr_rc> : MIMG_Helper < + op, + (outs), + (ins data_rc:$vdata, i32imm:$dmask, i1imm:$unorm, i1imm:$glc, i1imm:$da, i1imm:$r128, + i1imm:$tfe, i1imm:$lwe, i1imm:$slc, addr_rc:$vaddr, + SReg_256:$srsrc), + asm#" $vdata, $dmask, $unorm, $glc, $da, $r128," + #" $tfe, $lwe, $slc, $vaddr, $srsrc"> { + let ssamp = 0; + let mayLoad = 1; // TableGen requires this for matching with the intrinsics + let mayStore = 1; + let hasSideEffects = 1; + let hasPostISelHook = 0; +} + +multiclass MIMG_Store_Addr_Helper op, string asm, + RegisterClass data_rc, + int channels> { + def _V1 : MIMG_Store_Helper , + MIMG_Mask; + def _V2 : MIMG_Store_Helper , + MIMG_Mask; + def _V4 : MIMG_Store_Helper , + MIMG_Mask; +} + +multiclass MIMG_Store op, string asm> { + defm _V1 : MIMG_Store_Addr_Helper ; + defm _V2 : MIMG_Store_Addr_Helper ; + defm _V3 : MIMG_Store_Addr_Helper ; + defm _V4 : MIMG_Store_Addr_Helper ; +} + class MIMG_Sampler_Helper op, string asm, RegisterClass dst_rc, RegisterClass src_rc, Index: llvm/trunk/lib/Target/AMDGPU/SIInstructions.td =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SIInstructions.td +++ llvm/trunk/lib/Target/AMDGPU/SIInstructions.td @@ -1063,8 +1063,8 @@ //def IMAGE_LOAD_PCK_SGN : MIMG_NoPattern_ <"image_load_pck_sgn", 0x00000003>; //def IMAGE_LOAD_MIP_PCK : MIMG_NoPattern_ <"image_load_mip_pck", 0x00000004>; //def IMAGE_LOAD_MIP_PCK_SGN : MIMG_NoPattern_ <"image_load_mip_pck_sgn", 0x00000005>; -//def IMAGE_STORE : MIMG_NoPattern_ <"image_store", 0x00000008>; -//def IMAGE_STORE_MIP : MIMG_NoPattern_ <"image_store_mip", 0x00000009>; +defm IMAGE_STORE : MIMG_Store <0x00000008, "image_store">; +defm IMAGE_STORE_MIP : MIMG_Store <0x00000009, "image_store_mip">; //def IMAGE_STORE_PCK : MIMG_NoPattern_ <"image_store_pck", 0x0000000a>; //def IMAGE_STORE_MIP_PCK : MIMG_NoPattern_ <"image_store_mip_pck", 0x0000000b>; defm IMAGE_GET_RESINFO : MIMG_NoSampler <0x0000000e, "image_get_resinfo">; @@ -2230,8 +2230,8 @@ // Image only class ImagePattern : Pat < - (name vt:$addr, v8i32:$rsrc, i32:$dmask, i32:$unorm, - i32:$r128, i32:$da, i32:$glc, i32:$slc, i32:$tfe, i32:$lwe), + (name vt:$addr, v8i32:$rsrc, imm:$dmask, imm:$unorm, + imm:$r128, imm:$da, imm:$glc, imm:$slc, imm:$tfe, imm:$lwe), (opcode (as_i32imm $dmask), (as_i1imm $unorm), (as_i1imm $glc), (as_i1imm $da), (as_i1imm $r128), (as_i1imm $tfe), (as_i1imm $lwe), (as_i1imm $slc), $addr, $rsrc) @@ -2243,6 +2243,32 @@ def : ImagePattern(opcode # _V4_V4), v4i32>; } +class ImageLoadPattern : Pat < + (name vt:$addr, v8i32:$rsrc, imm:$dmask, imm:$r128, imm:$da, imm:$glc, + imm:$slc), + (opcode (as_i32imm $dmask), 1, (as_i1imm $glc), (as_i1imm $da), + (as_i1imm $r128), 0, 0, (as_i1imm $slc), $addr, $rsrc) +>; + +multiclass ImageLoadPatterns { + def : ImageLoadPattern(opcode # _V4_V1), i32>; + def : ImageLoadPattern(opcode # _V4_V2), v2i32>; + def : ImageLoadPattern(opcode # _V4_V4), v4i32>; +} + +class ImageStorePattern : Pat < + (name v4f32:$data, vt:$addr, v8i32:$rsrc, i32:$dmask, imm:$r128, imm:$da, + imm:$glc, imm:$slc), + (opcode $data, (as_i32imm $dmask), 1, (as_i1imm $glc), (as_i1imm $da), + (as_i1imm $r128), 0, 0, (as_i1imm $slc), $addr, $rsrc) +>; + +multiclass ImageStorePatterns { + def : ImageStorePattern(opcode # _V4_V1), i32>; + def : ImageStorePattern(opcode # _V4_V2), v2i32>; + def : ImageStorePattern(opcode # _V4_V4), v4i32>; +} + // Basic sample defm : SampleRawPatterns; defm : SampleRawPatterns; @@ -2339,6 +2365,10 @@ def : ImagePattern; defm : ImagePatterns; defm : ImagePatterns; +defm : ImageLoadPatterns; +defm : ImageLoadPatterns; +defm : ImageStorePatterns; +defm : ImageStorePatterns; /* SIsample for simple 1D texture lookup */ def : Pat < @@ -2420,27 +2450,6 @@ IMAGE_SAMPLE_D_V4_V16, IMAGE_SAMPLE_C_D_V4_V16, v16i32>; -/* int_SI_imageload for texture fetches consuming varying address parameters */ -class ImageLoadPattern : Pat < - (name addr_type:$addr, v8i32:$rsrc, imm), - (opcode 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc) ->; - -class ImageLoadArrayPattern : Pat < - (name addr_type:$addr, v8i32:$rsrc, TEX_ARRAY), - (opcode 0xf, 0, 0, 1, 0, 0, 0, 0, $addr, $rsrc) ->; - -class ImageLoadMSAAPattern : Pat < - (name addr_type:$addr, v8i32:$rsrc, TEX_MSAA), - (opcode 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc) ->; - -class ImageLoadArrayMSAAPattern : Pat < - (name addr_type:$addr, v8i32:$rsrc, TEX_ARRAY_MSAA), - (opcode 0xf, 0, 0, 1, 0, 0, 0, 0, $addr, $rsrc) ->; - /********** ============================================ **********/ /********** Extraction, Insertion, Building and Casting **********/ /********** ============================================ **********/ Index: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.image.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.image.ll +++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.image.ll @@ -0,0 +1,111 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s + +;CHECK-LABEL: {{^}}image_load_v4i32: +;CHECK: image_load v[0:3], 15, -1, 0, 0, 0, 0, 0, 0, v[0:3], s[0:7] +;CHECK: s_waitcnt vmcnt(0) +define <4 x float> @image_load_v4i32(<8 x i32> inreg %rsrc, <4 x i32> %c) #0 { +main_body: + %tex = call <4 x float> @llvm.amdgcn.image.load.v4i32(<4 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0) + ret <4 x float> %tex +} + +;CHECK-LABEL: {{^}}image_load_v2i32: +;CHECK: image_load v[0:3], 15, -1, 0, 0, 0, 0, 0, 0, v[0:1], s[0:7] +;CHECK: s_waitcnt vmcnt(0) +define <4 x float> @image_load_v2i32(<8 x i32> inreg %rsrc, <2 x i32> %c) #0 { +main_body: + %tex = call <4 x float> @llvm.amdgcn.image.load.v2i32(<2 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0) + ret <4 x float> %tex +} + +;CHECK-LABEL: {{^}}image_load_i32: +;CHECK: image_load v[0:3], 15, -1, 0, 0, 0, 0, 0, 0, v0, s[0:7] +;CHECK: s_waitcnt vmcnt(0) +define <4 x float> @image_load_i32(<8 x i32> inreg %rsrc, i32 %c) #0 { +main_body: + %tex = call <4 x float> @llvm.amdgcn.image.load.i32(i32 %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0) + ret <4 x float> %tex +} + +;CHECK-LABEL: {{^}}image_load_mip: +;CHECK: image_load_mip v[0:3], 15, -1, 0, 0, 0, 0, 0, 0, v[0:3], s[0:7] +;CHECK: s_waitcnt vmcnt(0) +define <4 x float> @image_load_mip(<8 x i32> inreg %rsrc, <4 x i32> %c) #0 { +main_body: + %tex = call <4 x float> @llvm.amdgcn.image.load.mip.v4i32(<4 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0) + ret <4 x float> %tex +} + +;CHECK-LABEL: {{^}}image_load_1: +;CHECK: image_load v0, 1, -1, 0, 0, 0, 0, 0, 0, v[0:3], s[0:7] +;CHECK: s_waitcnt vmcnt(0) +define float @image_load_1(<8 x i32> inreg %rsrc, <4 x i32> %c) #0 { +main_body: + %tex = call <4 x float> @llvm.amdgcn.image.load.v4i32(<4 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0) + %elt = extractelement <4 x float> %tex, i32 0 +; Only first component used, test that dmask etc. is changed accordingly + ret float %elt +} + +;CHECK-LABEL: {{^}}image_store_v4i32: +;CHECK: image_store v[0:3], 15, -1, 0, 0, 0, 0, 0, 0, v[4:7], s[0:7] +define void @image_store_v4i32(<8 x i32> inreg %rsrc, <4 x float> %data, <4 x i32> %coords) #0 { +main_body: + call void @llvm.amdgcn.image.store.v4i32(<4 x float> %data, <4 x i32> %coords, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0) + ret void +} + +;CHECK-LABEL: {{^}}image_store_v2i32: +;CHECK: image_store v[0:3], 15, -1, 0, 0, 0, 0, 0, 0, v[4:5], s[0:7] +define void @image_store_v2i32(<8 x i32> inreg %rsrc, <4 x float> %data, <2 x i32> %coords) #0 { +main_body: + call void @llvm.amdgcn.image.store.v2i32(<4 x float> %data, <2 x i32> %coords, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0) + ret void +} + +;CHECK-LABEL: {{^}}image_store_i32: +;CHECK: image_store v[0:3], 15, -1, 0, 0, 0, 0, 0, 0, v4, s[0:7] +define void @image_store_i32(<8 x i32> inreg %rsrc, <4 x float> %data, i32 %coords) #0 { +main_body: + call void @llvm.amdgcn.image.store.i32(<4 x float> %data, i32 %coords, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0) + ret void +} + +;CHECK-LABEL: {{^}}image_store_mip: +;CHECK: image_store_mip v[0:3], 15, -1, 0, 0, 0, 0, 0, 0, v[4:7], s[0:7] +define void @image_store_mip(<8 x i32> inreg %rsrc, <4 x float> %data, <4 x i32> %coords) #0 { +main_body: + call void @llvm.amdgcn.image.store.mip.v4i32(<4 x float> %data, <4 x i32> %coords, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0) + ret void +} + +; Ideally, the register allocator would avoid the wait here +; +;CHECK-LABEL: {{^}}image_store_wait: +;CHECK: image_store v[0:3], 15, -1, 0, 0, 0, 0, 0, 0, v4, s[0:7] +;CHECK: s_waitcnt vmcnt(0) expcnt(0) +;CHECK: image_load v[0:3], 15, -1, 0, 0, 0, 0, 0, 0, v4, s[8:15] +;CHECK: s_waitcnt vmcnt(0) +;CHECK: image_store v[0:3], 15, -1, 0, 0, 0, 0, 0, 0, v4, s[16:23] +define void @image_store_wait(<8 x i32> inreg, <8 x i32> inreg, <8 x i32> inreg, <4 x float>, i32) #0 { +main_body: + call void @llvm.amdgcn.image.store.i32(<4 x float> %3, i32 %4, <8 x i32> %0, i32 15, i1 0, i1 0, i1 0, i1 0) + %data = call <4 x float> @llvm.amdgcn.image.load.i32(i32 %4, <8 x i32> %1, i32 15, i1 0, i1 0, i1 0, i1 0) + call void @llvm.amdgcn.image.store.i32(<4 x float> %data, i32 %4, <8 x i32> %2, i32 15, i1 0, i1 0, i1 0, i1 0) + ret void +} + +declare void @llvm.amdgcn.image.store.i32(<4 x float>, i32, <8 x i32>, i32, i1, i1, i1, i1) #1 +declare void @llvm.amdgcn.image.store.v2i32(<4 x float>, <2 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1 +declare void @llvm.amdgcn.image.store.v4i32(<4 x float>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1 +declare void @llvm.amdgcn.image.store.mip.v4i32(<4 x float>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1 + +declare <4 x float> @llvm.amdgcn.image.load.i32(i32, <8 x i32>, i32, i1, i1, i1, i1) #2 +declare <4 x float> @llvm.amdgcn.image.load.v2i32(<2 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #2 +declare <4 x float> @llvm.amdgcn.image.load.v4i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #2 +declare <4 x float> @llvm.amdgcn.image.load.mip.v4i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #2 + +attributes #0 = { "ShaderType"="0" } +attributes #1 = { nounwind } +attributes #2 = { nounwind readonly }