Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -7179,6 +7179,30 @@
     }
   }
 
+  // Fold truncate of a bitcast of a vector to an extract of the low vector
+  // element.
+  //
+  // e.g. trunc (i64 (bitcast v2i32:x)) -> extract_vector_elt v2i32:x, 0
+  if (N0.getOpcode() == ISD::BITCAST && !VT.isVector()) {
+    SDValue VecSrc = N0.getOperand(0);
+    EVT SrcVT = VecSrc.getValueType();
+    // Don't create an illegal EXTRACT_VECTOR_ELT after operation
+    // legalization.
+    if (SrcVT.isVector() && SrcVT.getScalarType() == VT &&
+        (!LegalOperations ||
+         TLI.isOperationLegal(ISD::EXTRACT_VECTOR_ELT, SrcVT))) {
+      SDLoc SL(N);
+
+      // The truncate keeps the low bits of the wide integer, which are in
+      // element 0 on little-endian targets but in the last element on
+      // big-endian targets.
+      unsigned Idx = DAG.getDataLayout().isLittleEndian()
+                         ? 0 : SrcVT.getVectorNumElements() - 1;
+      EVT IdxVT = TLI.getVectorIdxTy(DAG.getDataLayout());
+      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, VT,
+                         VecSrc, DAG.getConstant(Idx, SL, IdxVT));
+    }
+  }
+
   // Simplify the operands using demanded-bits information.
   if (!VT.isVector() &&
       SimplifyDemandedBits(SDValue(N, 0)))
Index: test/CodeGen/AMDGPU/half.ll
===================================================================
--- test/CodeGen/AMDGPU/half.ll
+++ test/CodeGen/AMDGPU/half.ll
@@ -396,12 +396,11 @@
 
 ; GCN-LABEL: {{^}}global_extload_v3f16_to_v3f64:
 ; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]]
 
-; GCN: v_lshrrev_b32_e32 {{v[0-9]+}}, 16, {{v[0-9]+}}
-
 ; GCN: v_cvt_f32_f16_e32
 ; GCN: v_cvt_f32_f16_e32
+; GCN-DAG: v_lshrrev_b32_e32 {{v[0-9]+}}, 16, {{v[0-9]+}}
 ; GCN: v_cvt_f32_f16_e32
-; GCN-NOT: v_cvt_f32_f16_e32
+; GCN-NOT: v_cvt_f32_f16
 ; GCN: v_cvt_f64_f32_e32
 ; GCN: v_cvt_f64_f32_e32
Index: test/CodeGen/AMDGPU/trunc-bitcast-vector.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/trunc-bitcast-vector.ll
@@ -0,0 +1,93 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
+
+; CHECK-LABEL: {{^}}trunc_i64_bitcast_v2i32:
+; CHECK: buffer_load_dword v
+; CHECK: buffer_store_dword v
+define void @trunc_i64_bitcast_v2i32(i32 addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+  %ld = load <2 x i32>, <2 x i32> addrspace(1)* %in
+  %bc = bitcast <2 x i32> %ld to i64
+  %trunc = trunc i64 %bc to i32
+  store i32 %trunc, i32 addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}trunc_i96_bitcast_v3i32:
+; CHECK: buffer_load_dword v
+; CHECK: buffer_store_dword v
+define void @trunc_i96_bitcast_v3i32(i32 addrspace(1)* %out, <3 x i32> addrspace(1)* %in) {
+  %ld = load <3 x i32>, <3 x i32> addrspace(1)* %in
+  %bc = bitcast <3 x i32> %ld to i96
+  %trunc = trunc i96 %bc to i32
+  store i32 %trunc, i32 addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}trunc_i128_bitcast_v4i32:
+; CHECK: buffer_load_dword v
+; CHECK: buffer_store_dword v
+define void @trunc_i128_bitcast_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+  %ld = load <4 x i32>, <4 x i32> addrspace(1)* %in
+  %bc = bitcast <4 x i32> %ld to i128
+  %trunc = trunc i128 %bc to i32
+  store i32 %trunc, i32 addrspace(1)* %out
+  ret void
+}
+
+; Don't want load width reduced in this case.
+; CHECK-LABEL: {{^}}trunc_i16_bitcast_v2i16:
+; CHECK: buffer_load_dword [[VAL:v[0-9]+]]
+; CHECK: buffer_store_short [[VAL]]
+define void @trunc_i16_bitcast_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
+  %ld = load <2 x i16>, <2 x i16> addrspace(1)* %in
+  %bc = bitcast <2 x i16> %ld to i32
+  %trunc = trunc i32 %bc to i16
+  store i16 %trunc, i16 addrspace(1)* %out
+  ret void
+}
+
+; FIXME: Don't want load width reduced here.
+; CHECK-LABEL: {{^}}trunc_i16_bitcast_v4i16:
+; CHECK: buffer_load_ushort [[VAL:v[0-9]+]]
+; CHECK: buffer_store_short [[VAL]]
+define void @trunc_i16_bitcast_v4i16(i16 addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
+  %ld = load <4 x i16>, <4 x i16> addrspace(1)* %in
+  %bc = bitcast <4 x i16> %ld to i64
+  %trunc = trunc i64 %bc to i16
+  store i16 %trunc, i16 addrspace(1)* %out
+  ret void
+}
+
+; FIXME: Don't want load width reduced in this case.
+; CHECK-LABEL: {{^}}trunc_i8_bitcast_v2i8:
+; CHECK: buffer_load_ubyte [[VAL:v[0-9]+]]
+; CHECK: buffer_store_byte [[VAL]]
+define void @trunc_i8_bitcast_v2i8(i8 addrspace(1)* %out, <2 x i8> addrspace(1)* %in) {
+  %ld = load <2 x i8>, <2 x i8> addrspace(1)* %in
+  %bc = bitcast <2 x i8> %ld to i16
+  %trunc = trunc i16 %bc to i8
+  store i8 %trunc, i8 addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}trunc_i32_bitcast_v4i8:
+; CHECK: buffer_load_dword [[VAL:v[0-9]+]]
+; CHECK: buffer_store_byte [[VAL]]
+define void @trunc_i32_bitcast_v4i8(i8 addrspace(1)* %out, <4 x i8> addrspace(1)* %in) {
+  %ld = load <4 x i8>, <4 x i8> addrspace(1)* %in
+  %bc = bitcast <4 x i8> %ld to i32
+  %trunc = trunc i32 %bc to i8
+  store i8 %trunc, i8 addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}trunc_i24_bitcast_v3i8:
+; CHECK: buffer_load_dword [[VAL:v[0-9]+]]
+; CHECK: buffer_store_byte [[VAL]]
+define void @trunc_i24_bitcast_v3i8(i8 addrspace(1)* %out, <3 x i8> addrspace(1)* %in) {
+  %ld = load <3 x i8>, <3 x i8> addrspace(1)* %in
+  %bc = bitcast <3 x i8> %ld to i24
+  %trunc = trunc i24 %bc to i8
+  store i8 %trunc, i8 addrspace(1)* %out
+  ret void
+}