Index: lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -4063,6 +4063,24 @@
     }
   }
 
+  // Handle a case like bitcast v12i8 -> v3i32. Normally that would get
+  // widened to v16i8 -> v4i32, but for a target where v3i32 is legal but
+  // v12i8 is not, we end up here. Handling the case here with
+  // EXTRACT_SUBVECTOR avoids having to copy via memory.
+  if (VT.isVector()) {
+    EVT EltVT = VT.getVectorElementType();
+    unsigned EltSize = EltVT.getSizeInBits();
+    if (InWidenSize % EltSize == 0) {
+      unsigned NewNumElts = InWidenSize / EltSize;
+      EVT NewVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NewNumElts);
+      if (TLI.isTypeLegal(NewVT)) {
+        SDValue BitOp = DAG.getNode(ISD::BITCAST, dl, NewVT, InOp);
+        return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, BitOp,
+            DAG.getConstant(0, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
+      }
+    }
+  }
+
   return CreateStackStoreLoad(InOp, VT);
 }
 
Index: test/CodeGen/AMDGPU/extract_subvector_vec4_vec3.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/extract_subvector_vec4_vec3.ll
@@ -0,0 +1,38 @@
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 <%s -stop-after=amdgpu-isel | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: body:
+; GCN-NOT: BUFFER_STORE{{.*}}store{{.*}}into{{.*}}stack
+; GCN: S_ENDPGM
+
+define amdgpu_hs void @main([0 x i8] addrspace(6)* inreg noalias dereferenceable(18446744073709551615) %arg) {
+main_body:
+  %tmp25 = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> undef, i32 undef, i32 0, i32 0)
+  %tmp26 = shufflevector <3 x float> %tmp25, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
+  %tmp27 = bitcast <4 x float> %tmp26 to <16 x i8>
+  %tmp28 = shufflevector <16 x i8> %tmp27, <16 x i8> undef, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+  %tmp29 = bitcast <12 x i8> %tmp28 to <3 x i32>
+  %tmp30 = extractelement <3 x i32> %tmp29, i32 0
+  %tmp31 = extractelement <3 x i32> %tmp29, i32 1
+  %tmp32 = extractelement <3 x i32> %tmp29, i32 2
+  %tmp33 = call i32 @llvm.bitreverse.i32(i32 %tmp30) #5
+  %tmp34 = call i32 @llvm.bitreverse.i32(i32 %tmp31) #5
+  %tmp35 = call i32 @llvm.bitreverse.i32(i32 %tmp32) #5
+  %tmp36 = insertelement <2 x i32> undef, i32 %tmp33, i32 0
+  %tmp37 = insertelement <2 x i32> %tmp36, i32 %tmp34, i32 1
+  %tmp38 = getelementptr [0 x i8], [0 x i8] addrspace(6)* %arg, i32 0, i32 16
+  %tmp39 = bitcast i8 addrspace(6)* %tmp38 to <4 x i32> addrspace(6)*
+  %tmp40 = load <4 x i32>, <4 x i32> addrspace(6)* %tmp39, align 16
+  %tmp41 = bitcast <2 x i32> %tmp37 to <2 x float>
+  call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %tmp41, <4 x i32> undef, i32 undef, i32 0, i32 0) #3
+  %tmp43 = bitcast i32 %tmp35 to float
+  call void @llvm.amdgcn.raw.buffer.store.f32(float %tmp43, <4 x i32> undef, i32 undef, i32 0, i32 0) #3
+  ret void
+}
+
+declare i32 @llvm.bitreverse.i32(i32)
+declare void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float>, <4 x i32>, i32, i32, i32 immarg)
+declare void @llvm.amdgcn.raw.buffer.store.f32(float, <4 x i32>, i32, i32, i32 immarg)
+declare <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32>, i32, i32, i32 immarg)
+
+; NOTE(review): attribute groups reconstructed — calls referenced #3/#5 but the
+; definitions were lost in the garbled paste; confirm against the original commit.
+attributes #3 = { nounwind writeonly }
+attributes #5 = { nounwind readnone }