Index: lib/Target/AMDGPU/AMDGPUISelLowering.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -87,6 +87,7 @@
   SDValue performShlCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performSraCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performSrlCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+  SDValue performTruncateCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performMulhsCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performMulhuCombine(SDNode *N, DAGCombinerInfo &DCI) const;
Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -574,6 +574,7 @@
   setTargetDAGCombine(ISD::SHL);
   setTargetDAGCombine(ISD::SRA);
   setTargetDAGCombine(ISD::SRL);
+  setTargetDAGCombine(ISD::TRUNCATE);
   setTargetDAGCombine(ISD::MUL);
   setTargetDAGCombine(ISD::MULHU);
   setTargetDAGCombine(ISD::MULHS);
@@ -3119,6 +3120,32 @@
   return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildPair);
 }
 
+SDValue AMDGPUTargetLowering::performTruncateCombine(
+  SDNode *N, DAGCombinerInfo &DCI) const {
+  SDLoc SL(N);
+  SelectionDAG &DAG = DCI.DAG;
+  EVT VT = N->getValueType(0);
+  SDValue Src = N->getOperand(0);
+
+  // vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x)
+  if (Src.getOpcode() == ISD::BITCAST) {
+    EVT SrcVT = Src.getValueType();
+    SDValue Vec = Src.getOperand(0);
+    if (Vec.getOpcode() == ISD::BUILD_VECTOR) {
+      SDValue Elt0 = Vec.getOperand(0);
+      EVT EltVT = Elt0.getValueType();
+      if (VT.getSizeInBits() <= EltVT.getSizeInBits()) {
+        if (EltVT.isFloatingPoint())
+          return DAG.getNode(ISD::BITCAST, SL, VT, Elt0);
+
+        return DAG.getNode(ISD::TRUNCATE, SL, VT, Elt0);
+      }
+    }
+  }
+
+  return SDValue();
+}
+
 // We need to specifically handle i64 mul here to avoid unnecessary conversion
 // instructions. If we only match on the legalized i64 mul expansion,
 // SimplifyDemandedBits will be unable to remove them because there will be
@@ -3758,6 +3785,8 @@
 
     return performSraCombine(N, DCI);
   }
+  case ISD::TRUNCATE:
+    return performTruncateCombine(N, DCI);
   case ISD::MUL:
     return performMulCombine(N, DCI);
   case ISD::MULHS:
Index: test/CodeGen/AMDGPU/function-returns.ll
===================================================================
--- test/CodeGen/AMDGPU/function-returns.ll
+++ test/CodeGen/AMDGPU/function-returns.ll
@@ -282,7 +282,7 @@
 }
 
 ; GCN-LABEL: {{^}}v3i16_func_void:
-; GFX9: buffer_load_dwordx2 v[0:1], off
+; GFX9: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off
 ; GFX9: s_waitcnt vmcnt(0)
 ; GFX9: v_lshrrev_b32
 ; GFX9: s_setpc_b64
@@ -304,9 +304,8 @@
 ; GCN-LABEL: {{^}}v5i16_func_void:
 ; GFX9: buffer_load_dwordx2 v[0:1]
 ; GFX9: buffer_load_ushort v4
+; GFX9: v_lshrrev_b32_e32 v5, 16, v0
 ; GFX9: v_lshrrev_b32_e32 v3, 16, v1
-; GFX9: v_mov_b32_e32 v2, v1
-; GFX9: v_lshrrev_b32_e32 v1, 16, v0
 ; GCN: s_setpc_b64
 define <5 x i16> @v5i16_func_void() #0 {
   %ptr = load volatile <5 x i16> addrspace(1)*, <5 x i16> addrspace(1)* addrspace(4)* undef
Index: test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.d16.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.d16.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.d16.ll
@@ -8,11 +8,11 @@
 ; UNPACKED: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HI]]
 
 ; PACKED: image_gather4 v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf d16
-; PACKED: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
+; GFX81: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
 
 ; GFX81: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]]
 
-; GFX9: global_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]], off
+; GFX9: global_store_short_d16_hi v[{{[0-9]+:[0-9]+}}], v[[HI]], off
 define amdgpu_kernel void @image_gather4_v4f16(<4 x float> %coords, <8 x i32> inreg %rsrc, <4 x i32> inreg %sample, half addrspace(1)* %out) {
 main_body:
   %tex = call <4 x half> @llvm.amdgcn.image.gather4.v4f16.v4f32.v8i32(<4 x float> %coords, <8 x i32> %rsrc, <4 x i32> %sample, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
@@ -26,11 +26,11 @@
 ; UNPACKED: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HI]]
 
 ; PACKED: image_gather4_cl v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf d16
-; PACKED: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
+; GFX81: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
 
 ; GFX81: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]]
 
-; GFX9: global_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]], off
+; GFX9: global_store_short_d16_hi v[{{[0-9]+:[0-9]+}}], v[[HI]], off
 define amdgpu_kernel void @image_gather4_cl_v4f16(<4 x float> %coords, <8 x i32> inreg %rsrc, <4 x i32> inreg %sample, half addrspace(1)* %out) {
 main_body:
   %tex = call <4 x half> @llvm.amdgcn.image.gather4.cl.v4f16.v4f32.v8i32(<4 x float> %coords, <8 x i32> %rsrc, <4 x i32> %sample, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
@@ -44,11 +44,11 @@
 ; UNPACKED: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HI]]
 
 ; PACKED: image_gather4_c v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf d16
-; PACKED: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
+; GFX81: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
 
 ; GFX81: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]]
 
-; GFX9: global_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]], off
+; GFX9: global_store_short_d16_hi v[{{[0-9]+:[0-9]+}}], v[[HI]], off
 define amdgpu_kernel void @image_gather4_c_v4f16(<4 x float> %coords, <8 x i32> inreg %rsrc, <4 x i32> inreg %sample, half addrspace(1)* %out) {
 main_body:
   %tex = call <4 x half> @llvm.amdgcn.image.gather4.c.v4f16.v4f32.v8i32(<4 x float> %coords, <8 x i32> %rsrc, <4 x i32> %sample, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
@@ -62,11 +62,11 @@
 ; UNPACKED: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HI]]
 
 ; PACKED: image_gather4_o v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf d16
-; PACKED: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
+; GFX81: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
 
 ; GFX81: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]]
 
-; GFX9: global_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]], off
+; GFX9: global_store_short_d16_hi v[{{[0-9]+:[0-9]+}}], v[[HI]], off
 define amdgpu_kernel void @image_gather4_o_v4f16(<4 x float> %coords, <8 x i32> inreg %rsrc, <4 x i32> inreg %sample, half addrspace(1)* %out) {
 main_body:
   %tex = call <4 x half> @llvm.amdgcn.image.gather4.o.v4f16.v4f32.v8i32(<4 x float> %coords, <8 x i32> %rsrc, <4 x i32> %sample, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
@@ -80,11 +80,11 @@
 ; UNPACKED: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HI]]
 
 ; PACKED: image_gather4_c_o v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf d16
-; PACKED: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
+; GFX81: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
 
 ; GFX81: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]]
 
-; GFX9: global_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]], off
+; GFX9: global_store_short_d16_hi v[{{[0-9]+:[0-9]+}}], v[[HI]], off
 define amdgpu_kernel void @image_gather4_c_o_v4f16(<4 x float> %coords, <8 x i32> inreg %rsrc, <4 x i32> inreg %sample, half addrspace(1)* %out) {
 main_body:
   %tex = call <4 x half> @llvm.amdgcn.image.gather4.c.o.v4f16.v4f32.v8i32(<4 x float> %coords, <8 x i32> %rsrc, <4 x i32> %sample, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
Index: test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.ll
@@ -41,11 +41,11 @@
 ; UNPACKED: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HI]]
 
 ; PACKED: image_sample v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf d16
-; PACKED: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
+; GFX81: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
 
 ; GFX81: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]]
 
-; GFX9: global_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]], off
+; GFX9: global_store_short_d16_hi v[{{[0-9]+:[0-9]+}}], v[[HI]], off
 define amdgpu_kernel void @image_sample_v4f16(<4 x float> %coords, <8 x i32> inreg %rsrc, <4 x i32> inreg %sample, half addrspace(1)* %out) {
 main_body:
   %tex = call <4 x half> @llvm.amdgcn.image.sample.v4f16.v4f32.v8i32(<4 x float> %coords, <8 x i32> %rsrc, <4 x i32> %sample, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
@@ -59,11 +59,11 @@
 ; UNPACKED: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HI]]
 
 ; PACKED: image_sample_cl v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf d16
-; PACKED: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
+; GFX81: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
 
 ; GFX81: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]]
 
-; GFX9: global_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]], off
+; GFX9: global_store_short_d16_hi v[{{[0-9]+:[0-9]+}}], v[[HI]], off
 define amdgpu_kernel void @image_sample_cl_v4f16(<4 x float> %coords, <8 x i32> inreg %rsrc, <4 x i32> inreg %sample, half addrspace(1)* %out) {
 main_body:
   %tex = call <4 x half> @llvm.amdgcn.image.sample.cl.v4f16.v4f32.v8i32(<4 x float> %coords, <8 x i32> %rsrc, <4 x i32> %sample, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
@@ -77,11 +77,11 @@
 ; UNPACKED: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HI]]
 
 ; PACKED: image_sample_c v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf d16
-; PACKED: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
+; GFX81: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
 
 ; GFX81: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]]
 
-; GFX9: global_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]], off
+; GFX9: global_store_short_d16_hi v[{{[0-9]+:[0-9]+}}], v[[HI]], off
 define amdgpu_kernel void @image_sample_c_v4f16(<4 x float> %coords, <8 x i32> inreg %rsrc, <4 x i32> inreg %sample, half addrspace(1)* %out) {
 main_body:
   %tex = call <4 x half> @llvm.amdgcn.image.sample.c.v4f16.v4f32.v8i32(<4 x float> %coords, <8 x i32> %rsrc, <4 x i32> %sample, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
@@ -95,10 +95,11 @@
 ; UNPACKED: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HI]]
 
 ; PACKED: image_sample_o v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf d16
-; PACKED: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
+; GFX81: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
 
 ; GFX81: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]]
-; GFX9: global_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]], off
+
+; GFX9: global_store_short_d16_hi v[{{[0-9]+:[0-9]+}}], v[[HI]], off
 define amdgpu_kernel void @image_sample_o_v4f16(<4 x float> %coords, <8 x i32> inreg %rsrc, <4 x i32> inreg %sample, half addrspace(1)* %out) {
 main_body:
   %tex = call <4 x half> @llvm.amdgcn.image.sample.o.v4f16.v4f32.v8i32(<4 x float> %coords, <8 x i32> %rsrc, <4 x i32> %sample, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
@@ -112,10 +113,11 @@
 ; UNPACKED: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HI]]
 
 ; PACKED: image_sample_c_o v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf d16
-; PACKED: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
+; GFX81: v_lshrrev_b32_e32 v[[HALF:[0-9]+]], 16, v[[HI]]
 
 ; GFX81: flat_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]]
-; GFX9: global_store_short v[{{[0-9]+:[0-9]+}}], v[[HALF]], off
+
+; GFX9: global_store_short_d16_hi v[{{[0-9]+:[0-9]+}}], v[[HI]], off
 define amdgpu_kernel void @image_sample_c_o_v4f16(<4 x float> %coords, <8 x i32> inreg %rsrc, <4 x i32> inreg %sample, half addrspace(1)* %out) {
 main_body:
   %tex = call <4 x half> @llvm.amdgcn.image.sample.c.o.v4f16.v4f32.v8i32(<4 x float> %coords, <8 x i32> %rsrc, <4 x i32> %sample, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
Index: test/CodeGen/AMDGPU/store-weird-sizes.ll
===================================================================
--- test/CodeGen/AMDGPU/store-weird-sizes.ll
+++ test/CodeGen/AMDGPU/store-weird-sizes.ll
@@ -3,18 +3,28 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
 ; GCN-LABEL: {{^}}local_store_i56:
-; GCN-DAG: ds_write_b8 v0, v{{[0-9]+}} offset:6
-; GCN-DAG: ds_write_b16 v0, v{{[0-9]+}} offset:4
-; GCN-DAG: ds_write_b32 v0, v{{[0-9]+$}}
+; CIVI-DAG: ds_write_b8 v0, v{{[0-9]+}} offset:6
+; CIVI-DAG: ds_write_b16 v0, v{{[0-9]+}} offset:4
+; CIVI-DAG: ds_write_b32 v0, v{{[0-9]+$}}
+
+; GFX9-DAG: ds_write_b8_d16_hi v0, v{{[0-9]+}} offset:6
+; GFX9-DAG: ds_write_b16 v0, v{{[0-9]+}} offset:4
+; GFX9-DAG: ds_write_b32 v0, v{{[0-9]+$}}
+
+
 define void @local_store_i56(i56 addrspace(3)* %ptr, i56 %arg) #0 {
   store i56 %arg, i56 addrspace(3)* %ptr, align 8
   ret void
 }
 
 ; GCN-LABEL: {{^}}local_store_i55:
-; GCN-DAG: ds_write_b8 v0, v{{[0-9]+}} offset:6
-; GCN-DAG: ds_write_b16 v0, v{{[0-9]+}} offset:4
-; GCN-DAG: ds_write_b32 v0, v{{[0-9]+$}}
+; CIVI-DAG: ds_write_b8 v0, v{{[0-9]+}} offset:6
+; CIVI-DAG: ds_write_b16 v0, v{{[0-9]+}} offset:4
+; CIVI-DAG: ds_write_b32 v0, v{{[0-9]+$}}
+
+; GFX9-DAG: ds_write_b8_d16_hi v0, v{{[0-9]+}} offset:6
+; GFX9-DAG: ds_write_b16 v0, v{{[0-9]+}} offset:4
+; GFX9-DAG: ds_write_b32 v0, v{{[0-9]+$}}
 define amdgpu_kernel void @local_store_i55(i55 addrspace(3)* %ptr, i55 %arg) #0 {
   store i55 %arg, i55 addrspace(3)* %ptr, align 8
   ret void
 }
Index: test/CodeGen/AMDGPU/trunc-combine.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/trunc-combine.ll
@@ -0,0 +1,55 @@
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs< %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
+
+; Make sure high constant 0 isn't pointlessly materialized
+; GCN-LABEL: {{^}}trunc_bitcast_i64_lshr_32_i16:
+; GCN: s_waitcnt
+; GCN-NEXT: v_mov_b32_e32 v0, v1
+; GCN-NEXT: s_setpc_b64
+define i16 @trunc_bitcast_i64_lshr_32_i16(i64 %bar) {
+  %srl = lshr i64 %bar, 32
+  %trunc = trunc i64 %srl to i16
+  ret i16 %trunc
+}
+
+; GCN-LABEL: {{^}}trunc_bitcast_i64_lshr_32_i32:
+; GCN: s_waitcnt
+; GCN-NEXT: v_mov_b32_e32 v0, v1
+; GCN-NEXT: s_setpc_b64
+define i32 @trunc_bitcast_i64_lshr_32_i32(i64 %bar) {
+  %srl = lshr i64 %bar, 32
+  %trunc = trunc i64 %srl to i32
+  ret i32 %trunc
+}
+
+; GCN-LABEL: {{^}}trunc_bitcast_v2i32_to_i16:
+; GCN: _load_dword
+; GCN-NOT: _load_dword
+; GCN-NOT: v_mov_b32
+; GCN: v_add_u32_e32 v0, vcc, 4, v0
+define i16 @trunc_bitcast_v2i32_to_i16(<2 x i32> %bar) {
+  %load0 = load i32, i32 addrspace(1)* undef
+  %load1 = load i32, i32 addrspace(1)* null
+  %insert.0 = insertelement <2 x i32> undef, i32 %load0, i32 0
+  %insert.1 = insertelement <2 x i32> %insert.0, i32 99, i32 1
+  %bc = bitcast <2 x i32> %insert.1 to i64
+  %trunc = trunc i64 %bc to i16
+  %add = add i16 %trunc, 4
+  ret i16 %add
+}
+
+; Make sure there's no crash if the source vector type is FP
+; GCN-LABEL: {{^}}trunc_bitcast_v2f32_to_i16:
+; GCN: _load_dword
+; GCN-NOT: _load_dword
+; GCN-NOT: v_mov_b32
+; GCN: v_add_u32_e32 v0, vcc, 4, v0
+define i16 @trunc_bitcast_v2f32_to_i16(<2 x float> %bar) {
+  %load0 = load float, float addrspace(1)* undef
+  %load1 = load float, float addrspace(1)* null
+  %insert.0 = insertelement <2 x float> undef, float %load0, i32 0
+  %insert.1 = insertelement <2 x float> %insert.0, float 4.0, i32 1
+  %bc = bitcast <2 x float> %insert.1 to i64
+  %trunc = trunc i64 %bc to i16
+  %add = add i16 %trunc, 4
+  ret i16 %add
+}
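
Note (not part of the patch itself): the new combine rewrites vt1 (truncate (bitcast (build_vector vt0:x, ...))) into a truncate or bitcast of the first vector element whenever the result type fits in that element, so the low element no longer has to be reassembled into the wide integer before being narrowed. A minimal IR sketch of the kind of input it targets, mirroring @trunc_bitcast_v2i32_to_i16 from the new test (the function name here is only illustrative):

; Illustrative sketch only, assuming the same fiji llc invocation as
; trunc-combine.ll. With the combine, the truncate should feed directly from
; %x, so the high element constant (99) should not need to be materialized.
define i16 @trunc_of_build_vector_sketch(i32 %x) {
  %v0 = insertelement <2 x i32> undef, i32 %x, i32 0
  %v1 = insertelement <2 x i32> %v0, i32 99, i32 1
  %bc = bitcast <2 x i32> %v1 to i64
  %t = trunc i64 %bc to i16
  %r = add i16 %t, 4
  ret i16 %r
}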