Index: lib/Target/AMDGPU/MIMGInstructions.td
===================================================================
--- lib/Target/AMDGPU/MIMGInstructions.td
+++ lib/Target/AMDGPU/MIMGInstructions.td
@@ -179,8 +179,8 @@
     defm _V3 : MIMG_NoSampler_Src_Helper <op, asm, VReg_96, 0>;
     let VDataDwords = 4 in
     defm _V4 : MIMG_NoSampler_Src_Helper <op, asm, VReg_128, 0>;
-    let VDataDwords = 8 in
-    defm _V8 : MIMG_NoSampler_Src_Helper <op, asm, VReg_256, 0>;
+    let VDataDwords = 5 in
+    defm _V5 : MIMG_NoSampler_Src_Helper <op, asm, VReg_160, 0>;
   }
 }
 
@@ -413,8 +413,8 @@
     defm _V3 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_96>;
     let VDataDwords = 4 in
     defm _V4 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_128>;
-    let VDataDwords = 8 in
-    defm _V8 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_256>;
+    let VDataDwords = 5 in
+    defm _V5 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_160>;
   }
 }
 
@@ -434,8 +434,8 @@
     defm _V2 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_64>; /* for packed D16 only */
     let VDataDwords = 4 in
     defm _V4 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_128, 1>;
-    let VDataDwords = 8 in
-    defm _V8 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_256>;
+    let VDataDwords = 5 in
+    defm _V5 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_160>;
   }
 }
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4728,14 +4728,14 @@
   EVT CastVT = NumElts > 1 ? EVT::getVectorVT(Context, AdjEltVT, NumElts)
                            : AdjEltVT;
 
-  // Special case for v8f16. Rather than add support for this, use v4i32 to
+  // Special case for v6f16. Rather than add support for this, use v3i32 to
   // extract the data elements
-  bool V8F16Special = false;
-  if (CastVT == MVT::v8f16) {
-    CastVT = MVT::v4i32;
+  bool V6F16Special = false;
+  if (NumElts == 6) {
+    CastVT = EVT::getVectorVT(Context, MVT::i32, NumElts / 2);
     DMaskPop >>= 1;
     ReqRetNumElts >>= 1;
-    V8F16Special = true;
+    V6F16Special = true;
     AdjVT = MVT::v2i32;
   }
 
@@ -4765,7 +4765,7 @@
     PreTFCRes = BVElts[0];
   }
 
-  if (V8F16Special)
+  if (V6F16Special)
     PreTFCRes = DAG.getNode(ISD::BITCAST, DL, MVT::v4f16, PreTFCRes);
 
   if (!IsTexFail) {
@@ -5004,9 +5004,6 @@
     return Undef;
   }
 
-  // Have to use a power of 2 number of dwords
-  NumVDataDwords = 1 << Log2_32_Ceil(NumVDataDwords);
-
   EVT NewVT = NumVDataDwords > 1 ?
                   EVT::getVectorVT(*DAG.getContext(), MVT::f32, NumVDataDwords)
                 : MVT::f32;
Index: test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll
@@ -22,7 +22,7 @@
 ; NOPRT-NOT: v_mov_b32_e32 v1
 ; NOPRT-NOT: v_mov_b32_e32 v2
 ; NOPRT-NOT: v_mov_b32_e32 v3
-; GCN: image_load v[0:7], v{{[0-9]+}}, s[0:7] dmask:0xf unorm tfe{{$}}
+; GCN: image_load v[0:4], v{{[0-9]+}}, s[0:7] dmask:0xf unorm tfe{{$}}
 ; SIVI: buffer_store_dword v4, off, s[8:11], 0
 ; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4
 define amdgpu_ps <4 x float> @load_1d_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s) {
@@ -45,7 +45,7 @@
 ; NOPRT-NOT: v_mov_b32_e32 v1
 ; NOPRT-NOT: v_mov_b32_e32 v2
 ; NOPRT-NOT: v_mov_b32_e32 v3
-; GCN: image_load v[0:7], v{{[0-9]+}}, s[0:7] dmask:0xf unorm lwe{{$}}
+; GCN: image_load v[0:4], v{{[0-9]+}}, s[0:7] dmask:0xf unorm lwe{{$}}
 ; SIVI: buffer_store_dword v4, off, s[8:11], 0
 ; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4
 define amdgpu_ps <4 x float> @load_1d_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s) {
@@ -76,7 +76,7 @@
 ; NOPRT-NOT: v_mov_b32_e32 v1
 ; NOPRT-NOT: v_mov_b32_e32 v2
 ; NOPRT-NOT: v_mov_b32_e32 v3
-; GCN: image_load v[0:7], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm tfe{{$}}
+; GCN: image_load v[0:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm tfe{{$}}
 ; SIVI: buffer_store_dword v4, off, s[8:11], 0
 ; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4
 define amdgpu_ps <4 x float> @load_2d_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t) {
@@ -107,7 +107,7 @@
 ; NOPRT-NOT: v_mov_b32_e32 v1
 ; NOPRT-NOT: v_mov_b32_e32 v2
 ; NOPRT-NOT: v_mov_b32_e32 v3
-; GCN: image_load v[0:7], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm tfe lwe{{$}}
+; GCN: image_load v[0:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm tfe lwe{{$}}
 ; SIVI: buffer_store_dword v4, off, s[8:11], 0
 ; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4
 define amdgpu_ps <4 x float> @load_3d_tfe_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %r) {
@@ -138,7 +138,7 @@
 ; NOPRT-NOT: v_mov_b32_e32 v1
 ; NOPRT-NOT: v_mov_b32_e32 v2
 ; NOPRT-NOT: v_mov_b32_e32 v3
-; GCN: image_load v[0:7], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm lwe da{{$}}
+; GCN: image_load v[0:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm lwe da{{$}}
 ; SIVI: buffer_store_dword v4, off, s[8:11], 0
 ; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4
 define amdgpu_ps <4 x float> @load_cube_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %slice) {
@@ -169,7 +169,7 @@
 ; NOPRT-NOT: v_mov_b32_e32 v1
 ; NOPRT-NOT: v_mov_b32_e32 v2
 ; NOPRT-NOT: v_mov_b32_e32 v3
-; GCN: image_load v[0:7], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm tfe da{{$}}
+; GCN: image_load v[0:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm tfe da{{$}}
 ; SIVI: buffer_store_dword v4, off, s[8:11], 0
 ; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4
 define amdgpu_ps <4 x float> @load_1darray_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %slice) {
@@ -200,7 +200,7 @@
 ; NOPRT-NOT: v_mov_b32_e32 v1
 ; NOPRT-NOT: v_mov_b32_e32 v2
 ; NOPRT-NOT: v_mov_b32_e32 v3
-; GCN: image_load v[0:7], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm lwe da{{$}}
+; GCN: image_load v[0:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm lwe da{{$}}
 ; SIVI: buffer_store_dword v4, off, s[8:11], 0
 ; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4
 define amdgpu_ps <4 x float> @load_2darray_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %slice) {
@@ -231,7 +231,7 @@
 ; NOPRT-NOT: v_mov_b32_e32 v1
 ; NOPRT-NOT: v_mov_b32_e32 v2
 ; NOPRT-NOT: v_mov_b32_e32 v3
-; GCN: image_load v[0:7], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm tfe lwe{{$}}
+; GCN: image_load v[0:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm tfe lwe{{$}}
 ; SIVI: buffer_store_dword v4, off, s[8:11], 0
 ; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4
 define amdgpu_ps <4 x float> @load_2dmsaa_both(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %fragid) {
@@ -262,7 +262,7 @@
 ; NOPRT-NOT: v_mov_b32_e32 v1
 ; NOPRT-NOT: v_mov_b32_e32 v2
 ; NOPRT-NOT: v_mov_b32_e32 v3
-; GCN: image_load v[0:7], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm tfe da{{$}}
+; GCN: image_load v[0:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm tfe da{{$}}
 ; SIVI: buffer_store_dword v4, off, s[8:11], 0
 ; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4
 define amdgpu_ps <4 x float> @load_2darraymsaa_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %slice, i32 %fragid) {
@@ -293,7 +293,7 @@
 ; NOPRT-NOT: v_mov_b32_e32 v1
 ; NOPRT-NOT: v_mov_b32_e32 v2
 ; NOPRT-NOT: v_mov_b32_e32 v3
-; GCN: image_load_mip v[0:7], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm lwe{{$}}
+; GCN: image_load_mip v[0:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm lwe{{$}}
 ; SIVI: buffer_store_dword v4, off, s[8:11], 0
 ; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4
 define amdgpu_ps <4 x float> @load_mip_1d_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %mip) {
@@ -324,7 +324,7 @@
 ; NOPRT-NOT: v_mov_b32_e32 v1
 ; NOPRT-NOT: v_mov_b32_e32 v2
 ; NOPRT-NOT: v_mov_b32_e32 v3
-; GCN: image_load_mip v[0:7], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm tfe{{$}}
+; GCN: image_load_mip v[0:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm tfe{{$}}
 ; SIVI: buffer_store_dword v4, off, s[8:11], 0
 ; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4
 define amdgpu_ps <4 x float> @load_mip_2d_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %mip) {
@@ -451,7 +451,7 @@
 ; NOPRT: v_mov_b32_e32 v2, 0
 ; NOPRT-NOT: v_mov_b32_e32 v0
 ; NOPRT-NOT: v_mov_b32_e32 v1
-; GCN: image_load v[0:3], v{{[0-9]+}}, s[0:7] dmask:0x6 unorm tfe{{$}}
+; GCN: image_load v[0:2], v{{[0-9]+}}, s[0:7] dmask:0x6 unorm tfe{{$}}
 ; SIVI: buffer_store_dword v2, off, s[8:11], 0
 ; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v2
 define amdgpu_ps <4 x float> @load_1d_tfe_V4_dmask2(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s) {
Index: test/CodeGen/AMDGPU/llvm.amdgcn.image.load.a16.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.image.load.a16.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.image.load.a16.ll
@@ -19,7 +19,7 @@
 }
 
 ; GCN-LABEL: {{^}}load.v3f32.1d:
-; GCN: image_load v[0:3], v0, s[0:7] dmask:0x7 unorm a16
+; GCN: image_load v[0:2], v0, s[0:7] dmask:0x7 unorm a16
 define amdgpu_ps <4 x float> @load.v3f32.1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
 main_body:
   %x = extractelement <2 x i16> %coords, i32 0
@@ -57,7 +57,7 @@
 }
 
 ; GCN-LABEL: {{^}}load.v3f32.2d:
-; GCN: image_load v[0:3], v0, s[0:7] dmask:0x7 unorm a16
+; GCN: image_load v[0:2], v0, s[0:7] dmask:0x7 unorm a16
 define amdgpu_ps <4 x float> @load.v3f32.2d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
 main_body:
   %x = extractelement <2 x i16> %coords, i32 0
@@ -99,7 +99,7 @@
 }
 
 ; GCN-LABEL: {{^}}load.v3f32.3d:
-; GCN: image_load v[0:3], v[0:1], s[0:7] dmask:0x7 unorm a16
+; GCN: image_load v[0:2], v[0:1], s[0:7] dmask:0x7 unorm a16
 define amdgpu_ps <4 x float> @load.v3f32.3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
 main_body:
   %x = extractelement <2 x i16> %coords_lo, i32 0
Index: test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll
@@ -15,7 +15,7 @@
 ; GCN: v_mov_b32_e32 v2, v0
 ; GCN: v_mov_b32_e32 v3, v0
 ; GCN: v_mov_b32_e32 v4, v0
-; GCN: image_sample v[0:7], v5, s[0:7], s[8:11] dmask:0xf tfe{{$}}
+; GCN: image_sample v[0:4], v5, s[0:7], s[8:11] dmask:0xf tfe{{$}}
 define amdgpu_ps <4 x float> @sample_1d_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 addrspace(1)* inreg %out, float %s) {
 main_body:
   %v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0)
@@ -155,7 +155,7 @@
 ; GCN: v_mov_b32_e32 v2, v0
 ; GCN: v_mov_b32_e32 v3, v0
 ; GCN: v_mov_b32_e32 v4, v0
-; GCN: image_sample v[0:7], v5, s[0:7], s[8:11] dmask:0xf lwe{{$}}
+; GCN: image_sample v[0:4], v5, s[0:7], s[8:11] dmask:0xf lwe{{$}}
 define amdgpu_ps <4 x float> @sample_1d_lwe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 addrspace(1)* inreg %out, float %s) {
 main_body:
   %v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 2, i32 0)
@@ -537,7 +537,7 @@
 }
 
 ; GCN-LABEL: {{^}}sample_c_d_o_2darray_V2_tfe:
-; GCN: image_sample_c_d_o v[9:12], v[0:15], s[0:7], s[8:11] dmask:0x6 tfe da{{$}}
+; GCN: image_sample_c_d_o v[9:11], v[0:15], s[0:7], s[8:11] dmask:0x6 tfe da{{$}}
 define amdgpu_ps <4 x float> @sample_c_d_o_2darray_V2_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %slice) {
 main_body:
   %v = call {<2 x float>, i32} @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32i32.f32.f32(i32 6, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0)