Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -2452,5 +2452,34 @@
     break;
   case ISD::BITCAST: {
     EVT DestVT = N->getValueType(0);
+
+    // Push casts through vector builds. This helps avoid emitting a large
+    // number of copies when materializing floating point vector constants.
+    //
+    // vNt1 bitcast (vNt0 (build_vector t0:x, t0:y)) =>
+    // vNt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y))
+    if (DestVT.isVector()) {
+      SDValue Src = N->getOperand(0);
+      if (Src.getOpcode() == ISD::BUILD_VECTOR) {
+        EVT SrcVT = Src.getValueType();
+        unsigned NElts = DestVT.getVectorNumElements();
+
+        // Only handle the case where source and destination have the same
+        // element count, so each element can be bitcast independently.
+        if (SrcVT.getVectorNumElements() == NElts) {
+          EVT DestEltVT = DestVT.getVectorElementType();
+
+          SmallVector<SDValue, 8> CastedElts;
+          SDLoc SL(N);
+          for (unsigned I = 0; I != NElts; ++I) {
+            SDValue Elt = Src.getOperand(I);
+            CastedElts.push_back(DAG.getNode(ISD::BITCAST, SL, DestEltVT, Elt));
+          }
+
+          return DAG.getBuildVector(DestVT, SL, CastedElts);
+        }
+      }
+    }
+
     if (DestVT.getSizeInBits() != 64 && !DestVT.isVector())
       break;
Index: test/CodeGen/AMDGPU/bitcast-vector-extract.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/bitcast-vector-extract.ll
@@ -0,0 +1,69 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; The bitcast should be pushed through the build_vector so the vectors can
+; be broken down and the shared components can be CSEd
+
+; GCN-LABEL: {{^}}store_bitcast_constant_v8i32_to_v8f32:
+; GCN: buffer_store_dwordx4
+; GCN: buffer_store_dwordx4
+; GCN-NOT: v_mov_b32
+; GCN: buffer_store_dwordx4
+; GCN-NOT: v_mov_b32
+; GCN: buffer_store_dwordx4
+define void @store_bitcast_constant_v8i32_to_v8f32(<8 x float> addrspace(1)* %out, <8 x i32> %vec) {
+  %vec0.bc = bitcast <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8> to <8 x float>
+  store volatile <8 x float> %vec0.bc, <8 x float> addrspace(1)* %out
+
+  %vec1.bc = bitcast <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 9> to <8 x float>
+  store volatile <8 x float> %vec1.bc, <8 x float> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}store_bitcast_constant_v4i64_to_v8f32:
+; GCN: buffer_store_dwordx4
+; GCN: buffer_store_dwordx4
+; GCN-NOT: v_mov_b32
+; GCN: buffer_store_dwordx4
+; GCN-NOT: v_mov_b32
+; GCN: buffer_store_dwordx4
+define void @store_bitcast_constant_v4i64_to_v8f32(<8 x float> addrspace(1)* %out, <4 x i64> %vec) {
+  %vec0.bc = bitcast <4 x i64> <i64 7, i64 7, i64 7, i64 8> to <8 x float>
+  store volatile <8 x float> %vec0.bc, <8 x float> addrspace(1)* %out
+
+  %vec1.bc = bitcast <4 x i64> <i64 7, i64 7, i64 7, i64 9> to <8 x float>
+  store volatile <8 x float> %vec1.bc, <8 x float> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}store_bitcast_constant_v4i64_to_v4f64:
+; GCN: buffer_store_dwordx4
+; GCN: buffer_store_dwordx4
+; GCN-NOT: v_mov_b32
+; GCN: buffer_store_dwordx4
+; GCN-NOT: v_mov_b32
+; GCN: buffer_store_dwordx4
+define void @store_bitcast_constant_v4i64_to_v4f64(<4 x double> addrspace(1)* %out, <4 x i64> %vec) {
+  %vec0.bc = bitcast <4 x i64> <i64 7, i64 7, i64 7, i64 8> to <4 x double>
+  store volatile <4 x double> %vec0.bc, <4 x double> addrspace(1)* %out
+
+  %vec1.bc = bitcast <4 x i64> <i64 7, i64 7, i64 7, i64 9> to <4 x double>
+  store volatile <4 x double> %vec1.bc, <4 x double> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}store_bitcast_constant_v8i32_to_v16i16:
+; GCN: buffer_store_dwordx4
+; GCN: buffer_store_dwordx4
+; GCN-NOT: v_mov_b32
+; GCN: buffer_store_dwordx4
+; GCN-NOT: v_mov_b32
+; GCN: buffer_store_dwordx4
+define void @store_bitcast_constant_v8i32_to_v16i16(<8 x float> addrspace(1)* %out, <16 x i16> %vec) {
+  %vec0.bc = bitcast <16 x i16> <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 8> to <8 x float>
+  store volatile <8 x float> %vec0.bc, <8 x float> addrspace(1)* %out
+
+  %vec1.bc = bitcast <16 x i16> <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 9> to <8 x float>
+  store volatile <8 x float> %vec1.bc, <8 x float> addrspace(1)* %out
+  ret void
+}