Index: lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1459,29 +1459,50 @@
     }
   }
 
+  const AMDGPUTargetLowering *TLI
+    = static_cast<const AMDGPUTargetLowering *>(getTargetLowering());
+
   for (LoadSDNode *LD : LoadsToReplace) {
     SDLoc SL(LD);
 
     SDValue NewLoad = CurDAG->getLoad(MVT::v2i32, SL, LD->getChain(),
                                       LD->getBasePtr(), LD->getMemOperand());
-    SDValue BitCast = CurDAG->getNode(ISD::BITCAST, SL,
-                                      MVT::i64, NewLoad);
-    CurDAG->ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLoad.getValue(1));
+
+    SDValue ReplaceLoad;
+    SDValue ReplaceChain;
+    if (LD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS &&
+        Subtarget->getMaxPrivateElementSize() < 8) {
+      // XXX - For some reason the merge_values fails to select when emitted
+      // here.
+      SDValue Scalarized = TLI->ScalarizeVectorLoad(NewLoad, *CurDAG);
+      assert(Scalarized.getOpcode() == ISD::MERGE_VALUES);
+      ReplaceLoad = Scalarized.getOperand(0);
+      ReplaceChain = Scalarized.getOperand(1);
+    } else {
+      ReplaceLoad = NewLoad.getValue(0);
+      ReplaceChain = NewLoad.getValue(1);
+    }
+
+    SDValue BitCast = CurDAG->getNode(ISD::BITCAST, SL, MVT::i64, ReplaceLoad);
+    CurDAG->ReplaceAllUsesOfValueWith(SDValue(LD, 1), ReplaceChain);
     CurDAG->ReplaceAllUsesOfValueWith(SDValue(LD, 0), BitCast);
     Modified = true;
   }
 
   for (StoreSDNode *ST : StoresToReplace) {
-    SDValue NewValue = CurDAG->getNode(ISD::BITCAST, SDLoc(ST),
-                                       MVT::v2i32, ST->getValue());
-    const SDValue StoreOps[] = {
-      ST->getChain(),
-      NewValue,
-      ST->getBasePtr(),
-      ST->getOffset()
-    };
+    SDLoc SL(ST);
+    SDValue NewValue = CurDAG->getNode(ISD::BITCAST, SL, MVT::v2i32,
+                                       ST->getValue());
+
+    SDValue NewStore = CurDAG->getStore(ST->getChain(), SL, NewValue,
+                                        ST->getBasePtr(), ST->getMemOperand());
+
+    if (ST->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS &&
+        Subtarget->getMaxPrivateElementSize() < 8) {
+      NewStore = TLI->ScalarizeVectorStore(NewStore, *CurDAG);
+    }
 
-    CurDAG->UpdateNodeOperands(ST, StoreOps);
+    CurDAG->ReplaceAllUsesOfValueWith(SDValue(ST, 0), NewStore);
     Modified = true;
   }
Index: lib/Target/AMDGPU/AMDGPUISelLowering.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -89,6 +89,7 @@
   SDValue getLoHalf64(SDValue Op, SelectionDAG &DAG) const;
   SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const;
 
+public:
   /// \brief Split a vector load into a scalar load of each component.
   SDValue ScalarizeVectorLoad(SDValue Op, SelectionDAG &DAG) const;
 
@@ -101,6 +102,7 @@
   /// \brief Split a vector store into 2 stores of half the vector.
   SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const;
 
+protected:
   SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSDIVREM(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerUDIVREM(SDValue Op, SelectionDAG &DAG) const;
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -80,7 +80,6 @@
   setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
   setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
 
-  // We need to custom lower vector stores from local memory
   setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
   setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
 
@@ -1602,6 +1601,9 @@
   ISD::LoadExtType ExtType = Load->getExtensionType();
   EVT MemVT = Load->getMemoryVT();
 
+  if (MemVT == MVT::i32)
+    return SDValue();
+
   if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
     assert(MemVT == MVT::i1 && "Only i1 non-extloads expected");
     // FIXME: Copied from PPC
@@ -1622,13 +1624,9 @@
     return DAG.getMergeValues(Ops, DL);
   }
 
-  if (!MemVT.isVector())
-    return SDValue();
-
   assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
          "Custom lowering for non-i32 vectors hasn't been implemented.");
 
   unsigned NumElements = MemVT.getVectorNumElements();
-  assert(NumElements != 2 && "v2 loads are supported for all address spaces.");
 
   switch (Load->getAddressSpace()) {
   case AMDGPUAS::CONSTANT_ADDRESS:
@@ -1866,6 +1864,9 @@
   StoreSDNode *Store = cast<StoreSDNode>(Op);
   EVT VT = Store->getMemoryVT();
 
+  if (VT == MVT::i32)
+    return SDValue();
+
   if (VT == MVT::i1) {
     return DAG.getTruncStore(Store->getChain(), DL,
                              DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
Index: test/CodeGen/AMDGPU/private-element-size.ll
===================================================================
--- test/CodeGen/AMDGPU/private-element-size.ll
+++ test/CodeGen/AMDGPU/private-element-size.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mattr=-promote-alloca,+max-private-element-size-16 -verify-machineinstrs < %s | FileCheck -check-prefix=ELT16 -check-prefix=HSA -check-prefix=HSA-ELT16 -check-prefix=ALL %s
-; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mattr=-promote-alloca,+max-private-element-size-8 -verify-machineinstrs < %s | FileCheck -check-prefix=ELT8 -check-prefix=HSA -check-prefix=HSA-ELT8 -check-prefix=ALL %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mattr=-promote-alloca,+max-private-element-size-16 -verify-machineinstrs < %s | FileCheck -check-prefix=ELT16 -check-prefix=HSA -check-prefix=HSA-ELT16 -check-prefix=ALL -check-prefix=HSA-ELTGE8 %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mattr=-promote-alloca,+max-private-element-size-8 -verify-machineinstrs < %s | FileCheck -check-prefix=ELT8 -check-prefix=HSA -check-prefix=HSA-ELT8 -check-prefix=ALL -check-prefix=HSA-ELTGE8 %s
 ; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mattr=-promote-alloca,+max-private-element-size-4 -verify-machineinstrs < %s | FileCheck -check-prefix=ELT4 -check-prefix=HSA -check-prefix=HSA-ELT4 -check-prefix=ALL %s
@@ -48,8 +48,7 @@
   %gep1 = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* %alloca, i32 0, i32 1
   store <4 x i32> zeroinitializer, <4 x i32>* %gep0
   store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32>* %gep1
-  %idxprom2 = sext i32 %index to i64
-  %gep2 = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* %alloca, i64 0, i64 %idxprom2
+  %gep2 = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* %alloca, i32 0, i32 %index
   %load = load <4 x i32>, <4 x i32>* %gep2
   store <4 x i32> %load, <4 x i32> addrspace(1)* %out
   ret void
@@ -115,13 +114,135 @@
   %gep1 = getelementptr inbounds [2 x <8 x i32>], [2 x <8 x i32>]* %alloca, i32 0, i32 1
   store <8 x i32> zeroinitializer, <8 x i32>* %gep0
   store <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>, <8 x i32>* %gep1
-  %idxprom2 = sext i32 %index to i64
-  %gep2 = getelementptr inbounds [2 x <8 x i32>], [2 x <8 x i32>]* %alloca, i64 0, i64 %idxprom2
+  %gep2 = getelementptr inbounds [2 x <8 x i32>], [2 x <8 x i32>]* %alloca, i32 0, i32 %index
   %load = load <8 x i32>, <8 x i32>* %gep2
   store <8 x i32> %load, <8 x i32> addrspace(1)* %out
   ret void
 }
+
+; ALL-LABEL: {{^}}private_elt_size_i64:
+; HSA-ELT16: private_element_size = 3
+; HSA-ELT8: private_element_size = 2
+; HSA-ELT4: private_element_size = 1
+
+; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s7 offen{{$}}
+; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s7 offen offset:8
+
+; HSA-ELTGE8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s7 offen
+
+
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s7 offen{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s7 offen offset:4{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s7 offen offset:8{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s7 offen offset:12{{$}}
+
+; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s7 offen{{$}}
+; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s7 offen{{$}}
+define void @private_elt_size_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 {
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %idxprom = sext i32 %tid to i64
+  %gep.index = getelementptr inbounds i32, i32 addrspace(1)* %index.array, i64 %idxprom
+  %index.load = load i32, i32 addrspace(1)* %gep.index
+  %index = and i32 %index.load, 2
+  %alloca = alloca [2 x i64], align 16
+  %gep0 = getelementptr inbounds [2 x i64], [2 x i64]* %alloca, i32 0, i32 0
+  %gep1 = getelementptr inbounds [2 x i64], [2 x i64]* %alloca, i32 0, i32 1
+  store i64 0, i64* %gep0
+  store i64 34359738602, i64* %gep1
+  %gep2 = getelementptr inbounds [2 x i64], [2 x i64]* %alloca, i32 0, i32 %index
+  %load = load i64, i64* %gep2
+  store i64 %load, i64 addrspace(1)* %out
+  ret void
+}
+
+
+; ALL-LABEL: {{^}}private_elt_size_f64:
+; HSA-ELT16: private_element_size = 3
+; HSA-ELT8: private_element_size = 2
+; HSA-ELT4: private_element_size = 1
+
+; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s7 offen{{$}}
+; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s7 offen offset:8
+
+; HSA-ELTGE8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s7 offen
+
+
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s7 offen{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s7 offen offset:4{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s7 offen offset:8{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s7 offen offset:12{{$}}
+
+; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s7 offen{{$}}
+; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s7 offen{{$}}
+define void @private_elt_size_f64(double addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 {
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %idxprom = sext i32 %tid to i64
+  %gep.index = getelementptr inbounds i32, i32 addrspace(1)* %index.array, i64 %idxprom
+  %index.load = load i32, i32 addrspace(1)* %gep.index
+  %index = and i32 %index.load, 2
+  %alloca = alloca [2 x double], align 16
+  %gep0 = getelementptr inbounds [2 x double], [2 x double]* %alloca, i32 0, i32 0
+  %gep1 = getelementptr inbounds [2 x double], [2 x double]* %alloca, i32 0, i32 1
+  store double 0.0, double* %gep0
+  store double 4.0, double* %gep1
+  %gep2 = getelementptr inbounds [2 x double], [2 x double]* %alloca, i32 0, i32 %index
+  %load = load double, double* %gep2
+  store double %load, double addrspace(1)* %out
+  ret void
+}
+
+; ALL-LABEL: {{^}}private_elt_size_v2i64:
+; HSA-ELT16: private_element_size = 3
+; HSA-ELT8: private_element_size = 2
+; HSA-ELT4: private_element_size = 1
+
+; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s7 offen{{$}}
+; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s7 offen offset:16
+; HSA-ELT16-DAG: buffer_load_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s7 offen{{$}}
+
+; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s7 offen{{$}}
+; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s7 offen offset:8
+; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s7 offen offset:16
+; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s7 offen offset:24
+
+; HSA-ELT8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s7 offen
+; HSA-ELT8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s7 offen
+
+
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s7 offen{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s7 offen offset:4{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s7 offen offset:8{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s7 offen offset:12{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s7 offen offset:16{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s7 offen offset:20{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s7 offen offset:24{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s7 offen offset:28{{$}}
+
+; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s7 offen{{$}}
+; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s7 offen{{$}}
+; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s7 offen{{$}}
+; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s7 offen{{$}}
+define void @private_elt_size_v2i64(<2 x i64> addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 {
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %idxprom = sext i32 %tid to i64
+  %gep.index = getelementptr inbounds i32, i32 addrspace(1)* %index.array, i64 %idxprom
+  %index.load = load i32, i32 addrspace(1)* %gep.index
+  %index = and i32 %index.load, 2
+  %alloca = alloca [2 x <2 x i64>], align 16
+  %gep0 = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* %alloca, i32 0, i32 0
+  %gep1 = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* %alloca, i32 0, i32 1
+  store <2 x i64> zeroinitializer, <2 x i64>* %gep0
+  store <2 x i64> <i64 1, i64 2>, <2 x i64>* %gep1
+  %gep2 = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* %alloca, i32 0, i32 %index
+  %load = load <2 x i64>, <2 x i64>* %gep2
+  store <2 x i64> %load, <2 x i64> addrspace(1)* %out
+  ret void
+}
+
 declare i32 @llvm.amdgcn.workitem.id.x() #1
 
 attributes #0 = { nounwind }