Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1640,11 +1640,31 @@
     //
     // Fall-through
   case AMDGPUAS::GLOBAL_ADDRESS:
-  case AMDGPUAS::PRIVATE_ADDRESS:
+  case AMDGPUAS::FLAT_ADDRESS:
     if (NumElements >= 8)
       return SplitVectorLoad(Op, DAG);
     // v4 loads are supported for private and global memory.
     return SDValue();
+  case AMDGPUAS::PRIVATE_ADDRESS: {
+    // Depending on the setting of the private_element_size field in the
+    // resource descriptor, we can only make private accesses up to a certain
+    // size.
+    switch (Subtarget->getMaxPrivateElementSize()) {
+    case 4:
+      return ScalarizeVectorLoad(Op, DAG);
+    case 8:
+      if (NumElements > 2)
+        return SplitVectorLoad(Op, DAG);
+      return SDValue();
+    case 16:
+      // Same as global/flat
+      if (NumElements >= 8)
+        return SplitVectorLoad(Op, DAG);
+      return SDValue();
+    default:
+      llvm_unreachable("unsupported private_element_size");
+    }
+  }
   case AMDGPUAS::LOCAL_ADDRESS:
     // If properly aligned, if we split we might be able to use ds_read_b64.
     return SplitVectorLoad(Op, DAG);
@@ -1854,21 +1874,35 @@

   assert(Store->getValue().getValueType().getScalarType() == MVT::i32);

-  unsigned NElts = VT.getVectorNumElements();
-  unsigned AS = Store->getAddressSpace();
-  if (AS == AMDGPUAS::LOCAL_ADDRESS) {
+  unsigned NumElements = VT.getVectorNumElements();
+  switch (Store->getAddressSpace()) {
+  case AMDGPUAS::GLOBAL_ADDRESS:
+  case AMDGPUAS::FLAT_ADDRESS:
+    if (NumElements >= 8)
+      return SplitVectorStore(Op, DAG);
+    return SDValue();
+  case AMDGPUAS::PRIVATE_ADDRESS: {
+    switch (Subtarget->getMaxPrivateElementSize()) {
+    case 4:
+      return ScalarizeVectorStore(Op, DAG);
+    case 8:
+      if (NumElements > 2)
+        return SplitVectorStore(Op, DAG);
+      return SDValue();
+    case 16:
+      if (NumElements >= 8)
+        return SplitVectorStore(Op, DAG);
+      return SDValue();
+    default:
+      llvm_unreachable("unsupported private_element_size");
+    }
+  }
+  case AMDGPUAS::LOCAL_ADDRESS:
     // If properly aligned, if we split we might be able to use ds_write_b64.
     return SplitVectorStore(Op, DAG);
+  default:
+    llvm_unreachable("unhandled address space");
   }
-
-  if (AS == AMDGPUAS::PRIVATE_ADDRESS && NElts > 4)
-    return ScalarizeVectorStore(Op, DAG);
-
-  // These stores are legal. private, global and flat.
-  if (NElts >= 8)
-    return SplitVectorStore(Op, DAG);
-
-  return SDValue();
 }

 SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
Index: test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll
===================================================================
--- test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll
+++ test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll
@@ -27,7 +27,7 @@

 ; GCN: buffer_store_dword {{v[0-9]+}}, s[12:15], s16 offset:{{[0-9]+}} ; 4-byte Folded Spill

-; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[12:15], s16 offen offset:{{[0-9]+}}
+; GCN: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, s[12:15], s16 offen offset:{{[0-9]+}}
 ; GCN: buffer_load_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, s[12:15], s16 offen offset:{{[0-9]+}}

 ; GCN: NumVgprs: 256
Index: test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
===================================================================
--- test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
+++ test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
@@ -18,7 +18,8 @@
 ; VI-NEXT: s_mov_b32 s15, 0x980000

 ; s12 is offset user SGPR
-; GCN: buffer_store_dword {{v[0-9]+}}, s[12:15], s11 offset:{{[0-9]+}} ; 4-byte Folded Spill
+; GCN: buffer_store_dword {{v[0-9]+}}, s[12:15], s11 offset:{{[0-9]+}} ; 16-byte Folded Spill
+; GCN: buffer_load_dword v{{[0-9]+}}, s[12:15], s11 offset:{{[0-9]+}} ; 16-byte Folded Reload

 ; GCN: NumVgprs: 256
 ; GCN: ScratchSize: 1024