Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -6914,8 +6914,11 @@
 SDValue SITargetLowering::performExtractVectorEltCombine(
   SDNode *N, DAGCombinerInfo &DCI) const {
   SDValue Vec = N->getOperand(0);
   SelectionDAG &DAG = DCI.DAG;
+
+  EVT VecVT = Vec.getValueType();
+  EVT EltVT = VecVT.getVectorElementType();
+
   if ((Vec.getOpcode() == ISD::FNEG || Vec.getOpcode() == ISD::FABS) &&
       allUsesHaveSourceMods(N)) {
     SDLoc SL(N);
@@ -6956,6 +6959,44 @@
       Vec.getOperand(1), Idx));
     }
   }
+
+  if (!DCI.isBeforeLegalize())
+    return SDValue();
+
+  unsigned VecSize = VecVT.getSizeInBits();
+  unsigned EltSize = EltVT.getSizeInBits();
+
+  // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
+  // elements. This exposes more load reduction opportunities by replacing
+  // multiple small extract_vector_elements with a single 32-bit extract.
+  auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
+  if (EltSize <= 16 &&
+      EltVT.isByteSized() &&
+      VecSize > 32 &&
+      VecSize % 32 == 0 &&
+      Idx) {
+    EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT);
+
+    unsigned BitIndex = Idx->getZExtValue() * EltSize;
+    unsigned EltIdx = BitIndex / 32;
+    unsigned LeftoverBitIdx = BitIndex % 32;
+    SDLoc SL(N);
+
+    SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec);
+    DCI.AddToWorklist(Cast.getNode());
+
+    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast,
+                              DAG.getConstant(EltIdx, SL, MVT::i32));
+    DCI.AddToWorklist(Elt.getNode());
+    SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt,
+                              DAG.getConstant(LeftoverBitIdx, SL, MVT::i32));
+    DCI.AddToWorklist(Srl.getNode());
+
+    SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL,
+                                EltVT.changeTypeToInteger(), Srl);
+    DCI.AddToWorklist(Trunc.getNode());
+    return DAG.getNode(ISD::BITCAST, SL, EltVT, Trunc);
+  }
+
   return SDValue();
 }
Index: test/CodeGen/AMDGPU/extract_vector_elt-f16.ll
===================================================================
--- test/CodeGen/AMDGPU/extract_vector_elt-f16.ll
+++ test/CodeGen/AMDGPU/extract_vector_elt-f16.ll
@@ -141,6 +141,36 @@
   ret void
 }
 
+; GCN-LABEL: {{^}}reduce_load_vector_v8f16_extract_01:
+; GCN: s_load_dwordx2 [[PTR:s\[[0-9]+:[0-9]+\]]],
+; GCN-NOT: {{s|buffer|flat|global}}_load_
+; GCN: s_load_dword s{{[0-9]+}}, [[PTR]], 0x0
+; GCN-NOT: {{s|buffer|flat|global}}_load_
+; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
+define amdgpu_kernel void @reduce_load_vector_v8f16_extract_01(<16 x half> addrspace(4)* %ptr) #0 {
+  %load = load <16 x half>, <16 x half> addrspace(4)* %ptr
+  %elt0 = extractelement <16 x half> %load, i32 0
+  %elt1 = extractelement <16 x half> %load, i32 1
+  store volatile half %elt0, half addrspace(1)* undef, align 2
+  store volatile half %elt1, half addrspace(1)* undef, align 2
+  ret void
+}
+
+; GCN-LABEL: {{^}}reduce_load_vector_v8f16_extract_23:
+; GCN: s_load_dwordx2 [[PTR:s\[[0-9]+:[0-9]+\]]],
+; GCN-NOT: {{s|buffer|flat|global}}_load_
+; GCN: s_load_dword s{{[0-9]+}}, [[PTR]], {{0x1|0x4}}
+; GCN-NOT: {{s|buffer|flat|global}}_load_
+; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
+define amdgpu_kernel void @reduce_load_vector_v8f16_extract_23(<16 x half> addrspace(4)* %ptr) #0 {
+  %load = load <16 x half>, <16 x half> addrspace(4)* %ptr
+  %elt2 = extractelement <16 x half> %load, i32 2
+  %elt3 = extractelement <16 x half> %load, i32 3
+  store volatile half %elt2, half addrspace(1)* undef, align 2
+  store volatile half %elt3, half addrspace(1)* undef, align 2
+  ret void
+}
+
 declare i32 @llvm.amdgcn.workitem.id.x() #1
 
 attributes #0 = { nounwind }
Index: test/CodeGen/AMDGPU/extract_vector_elt-i16.ll
===================================================================
--- test/CodeGen/AMDGPU/extract_vector_elt-i16.ll
+++ test/CodeGen/AMDGPU/extract_vector_elt-i16.ll
@@ -142,6 +142,36 @@
   ret void
 }
 
+; GCN-LABEL: {{^}}reduce_load_vector_v8i16_extract_01:
+; GCN: s_load_dwordx2 [[PTR:s\[[0-9]+:[0-9]+\]]],
+; GCN-NOT: {{s|buffer|flat|global}}_load_
+; GCN: s_load_dword s{{[0-9]+}}, [[PTR]], 0x0
+; GCN-NOT: {{s|buffer|flat|global}}_load_
+; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
+define amdgpu_kernel void @reduce_load_vector_v8i16_extract_01(<16 x i16> addrspace(4)* %ptr) #0 {
+  %load = load <16 x i16>, <16 x i16> addrspace(4)* %ptr
+  %elt0 = extractelement <16 x i16> %load, i32 0
+  %elt1 = extractelement <16 x i16> %load, i32 1
+  store volatile i16 %elt0, i16 addrspace(1)* undef, align 2
+  store volatile i16 %elt1, i16 addrspace(1)* undef, align 2
+  ret void
+}
+
+; GCN-LABEL: {{^}}reduce_load_vector_v8i16_extract_23:
+; GCN: s_load_dwordx2 [[PTR:s\[[0-9]+:[0-9]+\]]],
+; GCN-NOT: {{s|buffer|flat|global}}_load_
+; GCN: s_load_dword s{{[0-9]+}}, [[PTR]], {{0x1|0x4}}
+; GCN-NOT: {{s|buffer|flat|global}}_load_
+; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
+define amdgpu_kernel void @reduce_load_vector_v8i16_extract_23(<16 x i16> addrspace(4)* %ptr) #0 {
+  %load = load <16 x i16>, <16 x i16> addrspace(4)* %ptr
+  %elt2 = extractelement <16 x i16> %load, i32 2
+  %elt3 = extractelement <16 x i16> %load, i32 3
+  store volatile i16 %elt2, i16 addrspace(1)* undef, align 2
+  store volatile i16 %elt3, i16 addrspace(1)* undef, align 2
+  ret void
+}
+
 declare i32 @llvm.amdgcn.workitem.id.x() #1
 
 attributes #0 = { nounwind }
Index: test/CodeGen/AMDGPU/extract_vector_elt-i8.ll
===================================================================
--- test/CodeGen/AMDGPU/extract_vector_elt-i8.ll
+++ test/CodeGen/AMDGPU/extract_vector_elt-i8.ll
@@ -199,4 +199,78 @@
   ret void
 }
 
+; GCN-LABEL: {{^}}reduce_load_vector_v8i8_extract_0123:
+; GCN-NOT: {{s|buffer|flat|global}}_load_
+; GCN: s_load_dword s
+; GCN-NOT: {{s|buffer|flat|global}}_load_
+; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8
+; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
+; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 24
+define amdgpu_kernel void @reduce_load_vector_v8i8_extract_0123() #0 {
+  %load = load <8 x i8>, <8 x i8> addrspace(4)* null
+  %elt0 = extractelement <8 x i8> %load, i32 0
+  %elt1 = extractelement <8 x i8> %load, i32 1
+  %elt2 = extractelement <8 x i8> %load, i32 2
+  %elt3 = extractelement <8 x i8> %load, i32 3
+  store volatile i8 %elt0, i8 addrspace(1)* undef, align 1
+  store volatile i8 %elt1, i8 addrspace(1)* undef, align 1
+  store volatile i8 %elt2, i8 addrspace(1)* undef, align 1
+  store volatile i8 %elt3, i8 addrspace(1)* undef, align 1
+  ret void
+}
+
+; GCN-LABEL: {{^}}reduce_load_vector_v8i8_extract_0145:
+; GCN-NOT: {{s|buffer|flat|global}}_load_
+; GCN: s_load_dwordx2
+; GCN-NOT: {{s|buffer|flat|global}}_load_
+; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8
+; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8
+define amdgpu_kernel void @reduce_load_vector_v8i8_extract_0145() #0 {
+  %load = load <8 x i8>, <8 x i8> addrspace(4)* null
+  %elt0 = extractelement <8 x i8> %load, i32 0
+  %elt1 = extractelement <8 x i8> %load, i32 1
+  %elt4 = extractelement <8 x i8> %load, i32 4
+  %elt5 = extractelement <8 x i8> %load, i32 5
+  store volatile i8 %elt0, i8 addrspace(1)* undef, align 1
+  store volatile i8 %elt1, i8 addrspace(1)* undef, align 1
+  store volatile i8 %elt4, i8 addrspace(1)* undef, align 1
+  store volatile i8 %elt5, i8 addrspace(1)* undef, align 1
+  ret void
+}
+
+; GCN-LABEL: {{^}}reduce_load_vector_v8i8_extract_45:
+; GCN-NOT: {{s|buffer|flat|global}}_load_
+; GCN: s_mov_b64 [[PTR:s\[[0-9]+:[0-9]+\]]], 4{{$}}
+; GCN: s_load_dword s{{[0-9]+}}, [[PTR]], 0x0{{$}}
+; GCN-NOT: {{s|buffer|flat|global}}_load_
+; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8
+define amdgpu_kernel void @reduce_load_vector_v8i8_extract_45() #0 {
+  %load = load <8 x i8>, <8 x i8> addrspace(4)* null
+  %elt4 = extractelement <8 x i8> %load, i32 4
+  %elt5 = extractelement <8 x i8> %load, i32 5
+  store volatile i8 %elt4, i8 addrspace(1)* undef, align 1
+  store volatile i8 %elt5, i8 addrspace(1)* undef, align 1
+  ret void
+}
+
+; FIXME: ought to be able to eliminate high half of load
+; GCN-LABEL: {{^}}reduce_load_vector_v16i8_extract_0145:
+; GCN-NOT: {{s|buffer|flat|global}}_load_
+; GCN: s_load_dwordx4
+; GCN-NOT: {{s|buffer|flat|global}}_load_
+; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8
+; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8
+define amdgpu_kernel void @reduce_load_vector_v16i8_extract_0145() #0 {
+  %load = load <16 x i8>, <16 x i8> addrspace(4)* null
+  %elt0 = extractelement <16 x i8> %load, i32 0
+  %elt1 = extractelement <16 x i8> %load, i32 1
+  %elt4 = extractelement <16 x i8> %load, i32 4
+  %elt5 = extractelement <16 x i8> %load, i32 5
+  store volatile i8 %elt0, i8 addrspace(1)* undef, align 1
+  store volatile i8 %elt1, i8 addrspace(1)* undef, align 1
+  store volatile i8 %elt4, i8 addrspace(1)* undef, align 1
+  store volatile i8 %elt5, i8 addrspace(1)* undef, align 1
+  ret void
+}
+
 attributes #0 = { nounwind }
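
Note (illustration only, not part of the patch): at the IR level, the new combine is roughly equivalent to the rewrite sketched below for a constant-index extract of element 3 from a <16 x i16> vector. The function @extract_elt3_sketch is made up for this sketch; the real transform runs on SelectionDAG nodes before legalization, using getEquivalentMemType to pick the same-sized i32 vector type.

; BitIndex = 3 * 16 = 48, so EltIdx = 48 / 32 = 1 and LeftoverBitIdx = 48 % 32 = 16.
define i16 @extract_elt3_sketch(<16 x i16> %vec) {
  %cast = bitcast <16 x i16> %vec to <8 x i32>    ; same total size, 32-bit elements
  %dword = extractelement <8 x i32> %cast, i32 1  ; the 32-bit element holding bits 48..63
  %shifted = lshr i32 %dword, 16                  ; move the wanted 16 bits down to bit 0
  %elt = trunc i32 %shifted to i16                ; truncate back to the original element width
  ret i16 %elt
}

The s_lshr_b32 checks in the tests above correspond to the lshr step, and the odd/high elements are the ones that need a nonzero LeftoverBitIdx shift.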