Index: lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/SIISelLowering.cpp +++ lib/Target/AMDGPU/SIISelLowering.cpp @@ -1649,11 +1649,31 @@ // // Fall-through case AMDGPUAS::GLOBAL_ADDRESS: - case AMDGPUAS::PRIVATE_ADDRESS: - if (NumElements >= 8) + case AMDGPUAS::FLAT_ADDRESS: + if (NumElements > 4) return SplitVectorLoad(Op, DAG); // v4 loads are supported for private and global memory. return SDValue(); + case AMDGPUAS::PRIVATE_ADDRESS: { + // Depending on the setting of the private_element_size field in the + // resource descriptor, we can only make private accesses up to a certain + // size. + switch (Subtarget->getMaxPrivateElementSize()) { + case 4: + return ScalarizeVectorLoad(Op, DAG); + case 8: + if (NumElements > 2) + return SplitVectorLoad(Op, DAG); + return SDValue(); + case 16: + // Same as global/flat + if (NumElements > 4) + return SplitVectorLoad(Op, DAG); + return SDValue(); + default: + llvm_unreachable("unsupported private_element_size"); + } + } case AMDGPUAS::LOCAL_ADDRESS: // If properly aligned, if we split we might be able to use ds_read_b64. return SplitVectorLoad(Op, DAG); @@ -1863,21 +1883,35 @@ assert(Store->getValue().getValueType().getScalarType() == MVT::i32); - unsigned NElts = VT.getVectorNumElements(); - unsigned AS = Store->getAddressSpace(); - if (AS == AMDGPUAS::LOCAL_ADDRESS) { + unsigned NumElements = VT.getVectorNumElements(); + switch (Store->getAddressSpace()) { + case AMDGPUAS::GLOBAL_ADDRESS: + case AMDGPUAS::FLAT_ADDRESS: + if (NumElements > 4) + return SplitVectorStore(Op, DAG); + return SDValue(); + case AMDGPUAS::PRIVATE_ADDRESS: { + switch (Subtarget->getMaxPrivateElementSize()) { + case 4: + return ScalarizeVectorStore(Op, DAG); + case 8: + if (NumElements > 2) + return SplitVectorStore(Op, DAG); + return SDValue(); + case 16: + if (NumElements > 4) + return SplitVectorStore(Op, DAG); + return SDValue(); + default: + llvm_unreachable("unsupported private_element_size"); + } + } + case AMDGPUAS::LOCAL_ADDRESS: // If properly aligned, if we split we might be able to use ds_write_b64. return SplitVectorStore(Op, DAG); + default: + llvm_unreachable("unhandled address space"); } - - if (AS == AMDGPUAS::PRIVATE_ADDRESS && NElts > 4) - return ScalarizeVectorStore(Op, DAG); - - // These stores are legal. private, global and flat. - if (NElts >= 8) - return SplitVectorStore(Op, DAG); - - return SDValue(); } SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const { Index: test/CodeGen/AMDGPU/private-element-size.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/private-element-size.ll @@ -0,0 +1,128 @@ +; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mattr=-promote-alloca,+max-private-element-size-16 -verify-machineinstrs < %s | FileCheck -check-prefix=ELT16 -check-prefix=HSA -check-prefix=HSA-ELT16 -check-prefix=ALL %s +; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mattr=-promote-alloca,+max-private-element-size-8 -verify-machineinstrs < %s | FileCheck -check-prefix=ELT8 -check-prefix=HSA -check-prefix=HSA-ELT8 -check-prefix=ALL %s +; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mattr=-promote-alloca,+max-private-element-size-4 -verify-machineinstrs < %s | FileCheck -check-prefix=ELT4 -check-prefix=HSA -check-prefix=HSA-ELT4 -check-prefix=ALL %s + + +; ALL-LABEL: {{^}}private_elt_size_v4i32: + +; HSA-ELT16: private_element_size = 3 +; HSA-ELT8: private_element_size = 2 +; HSA-ELT4: private_element_size = 1 + + +; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s7 offen{{$}} +; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s7 offen offset:16 +; HSA-ELT16-DAG: buffer_load_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s7 offen{{$}} + +; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s7 offen{{$}} +; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s7 offen offset:8 +; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s7 offen offset:16 +; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s7 offen offset:24 + +; HSA-ELT8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s7 offen +; HSA-ELT8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s7 offen + + +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s7 offen{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s7 offen offset:4{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s7 offen offset:8{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s7 offen offset:12{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s7 offen offset:16{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s7 offen offset:20{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s7 offen offset:24{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s7 offen offset:28{{$}} + +; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s7 offen{{$}} +; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s7 offen{{$}} +; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s7 offen{{$}} +; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s7 offen{{$}} +define void @private_elt_size_v4i32(<4 x i32> addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 { +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %idxprom = sext i32 %tid to i64 + %gep.index = getelementptr inbounds i32, i32 addrspace(1)* %index.array, i64 %idxprom + %index.load = load i32, i32 addrspace(1)* %gep.index + %index = and i32 %index.load, 2 + %alloca = alloca [2 x <4 x i32>], align 16 + %gep0 = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* %alloca, i32 0, i32 0 + %gep1 = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* %alloca, i32 0, i32 1 + store <4 x i32> zeroinitializer, <4 x i32>* %gep0 + store <4 x i32> , <4 x i32>* %gep1 + %idxprom2 = sext i32 %index to i64 + %gep2 = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* %alloca, i64 0, i64 %idxprom2 + %load = load <4 x i32>, <4 x i32>* %gep2 + store <4 x i32> %load, <4 x i32> addrspace(1)* %out + ret void +} + +; ALL-LABEL: {{^}}private_elt_size_v8i32: +; HSA-ELT16: private_element_size = 3 +; HSA-ELT8: private_element_size = 2 +; HSA-ELT4: private_element_size = 1 + +; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s7 offen{{$}} +; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s7 offen offset:16 +; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s7 offen offset:32 +; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s7 offen offset:48 + +; HSA-ELT16-DAG: buffer_load_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s7 offen{{$}} +; HSA-ELT16-DAG: buffer_load_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s7 offen{{$}} + + +; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s7 offen{{$}} +; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s7 offen offset:8 +; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s7 offen offset:16 +; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s7 offen offset:24 +; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s7 offen offset:32 +; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s7 offen offset:40 +; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s7 offen offset:48 +; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s7 offen offset:56 + +; HSA-ELT8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s7 offen +; HSA-ELT8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s7 offen + + +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s7 offen{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s7 offen offset:4{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s7 offen offset:8{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s7 offen offset:12{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s7 offen offset:16{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s7 offen offset:20{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s7 offen offset:24{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s7 offen offset:28{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s7 offen offset:32{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s7 offen offset:36{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s7 offen offset:40{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s7 offen offset:44{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s7 offen offset:48{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s7 offen offset:52{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s7 offen offset:56{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s7 offen offset:60{{$}} + +; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s7 offen{{$}} +; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s7 offen{{$}} +; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s7 offen{{$}} +; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s7 offen{{$}} +define void @private_elt_size_v8i32(<8 x i32> addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 { +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %idxprom = sext i32 %tid to i64 + %gep.index = getelementptr inbounds i32, i32 addrspace(1)* %index.array, i64 %idxprom + %index.load = load i32, i32 addrspace(1)* %gep.index + %index = and i32 %index.load, 2 + %alloca = alloca [2 x <8 x i32>], align 16 + %gep0 = getelementptr inbounds [2 x <8 x i32>], [2 x <8 x i32>]* %alloca, i32 0, i32 0 + %gep1 = getelementptr inbounds [2 x <8 x i32>], [2 x <8 x i32>]* %alloca, i32 0, i32 1 + store <8 x i32> zeroinitializer, <8 x i32>* %gep0 + store <8 x i32> , <8 x i32>* %gep1 + %idxprom2 = sext i32 %index to i64 + %gep2 = getelementptr inbounds [2 x <8 x i32>], [2 x <8 x i32>]* %alloca, i64 0, i64 %idxprom2 + %load = load <8 x i32>, <8 x i32>* %gep2 + store <8 x i32> %load, <8 x i32> addrspace(1)* %out + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } Index: test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll =================================================================== --- test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll +++ test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll @@ -29,7 +29,7 @@ ; GCN: buffer_store_dword {{v[0-9]+}}, s[12:15], s16 offset:{{[0-9]+}} ; 4-byte Folded Spill -; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[12:15], s16 offen offset:{{[0-9]+}} +; GCN: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, s[12:15], s16 offen offset:{{[0-9]+}} ; GCN: buffer_load_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, s[12:15], s16 offen offset:{{[0-9]+}} ; GCN: NumVgprs: 256 Index: test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll =================================================================== --- test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll +++ test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll @@ -18,7 +18,8 @@ ; VI-NEXT: s_mov_b32 s15, 0x980000 ; s12 is offset user SGPR -; GCN: buffer_store_dword {{v[0-9]+}}, s[12:15], s11 offset:{{[0-9]+}} ; 4-byte Folded Spill +; GCN: buffer_store_dword {{v[0-9]+}}, s[12:15], s11 offset:{{[0-9]+}} ; 16-byte Folded Spill +; GCN: buffer_load_dword v{{[0-9]+}}, s[12:15], s11 offset:{{[0-9]+}} ; 16-byte Folded Reload ; GCN: NumVgprs: 256 ; GCN: ScratchSize: 1024