Index: lib/Target/AMDGPU/AMDGPUISelLowering.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -28,7 +28,6 @@
 protected:
   const AMDGPUSubtarget *Subtarget;
 
-private:
   SDValue LowerConstantInitializer(const Constant* Init, const GlobalValue *GV,
                                    const SDValue &InitPtr, SDValue Chain,
Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -671,12 +671,6 @@
     // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
     // nothing here and let the illegal result integer be handled normally.
     return;
-  case ISD::STORE: {
-    SDValue Lowered = LowerSTORE(SDValue(N, 0), DAG);
-    if (Lowered.getNode())
-      Results.push_back(Lowered);
-    return;
-  }
   default:
     return;
   }
@@ -1147,6 +1141,8 @@
   return DAG.getMergeValues(Ops, SL);
 }
 
+// FIXME: This isn't doing anything for SI. This should be used in a target
+// combine during type legalization.
 SDValue AMDGPUTargetLowering::MergeVectorStore(const SDValue &Op,
                                                SelectionDAG &DAG) const {
   StoreSDNode *Store = cast<StoreSDNode>(Op);
@@ -1291,66 +1287,6 @@
   return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
 }
 
-SDValue AMDGPUTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
-  SDLoc DL(Op);
-  SDValue Result = AMDGPUTargetLowering::MergeVectorStore(Op, DAG);
-  if (Result.getNode()) {
-    return Result;
-  }
-
-  StoreSDNode *Store = cast<StoreSDNode>(Op);
-  SDValue Chain = Store->getChain();
-  if ((Store->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
-       Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) &&
-      Store->getValue().getValueType().isVector()) {
-    return SplitVectorStore(Op, DAG);
-  }
-
-  EVT MemVT = Store->getMemoryVT();
-  if (Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS &&
-      MemVT.bitsLT(MVT::i32)) {
-    unsigned Mask = 0;
-    if (Store->getMemoryVT() == MVT::i8) {
-      Mask = 0xff;
-    } else if (Store->getMemoryVT() == MVT::i16) {
-      Mask = 0xffff;
-    }
-    SDValue BasePtr = Store->getBasePtr();
-    SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, BasePtr,
-                              DAG.getConstant(2, DL, MVT::i32));
-    SDValue Dst = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32,
-                              Chain, Ptr,
-                              DAG.getTargetConstant(0, DL, MVT::i32));
-
-    SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, BasePtr,
-                                  DAG.getConstant(0x3, DL, MVT::i32));
-
-    SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
-                                   DAG.getConstant(3, DL, MVT::i32));
-
-    SDValue SExtValue = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32,
-                                    Store->getValue());
-
-    SDValue MaskedValue = DAG.getZeroExtendInReg(SExtValue, DL, MemVT);
-
-    SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, MVT::i32,
-                                       MaskedValue, ShiftAmt);
-
-    SDValue DstMask = DAG.getNode(ISD::SHL, DL, MVT::i32,
-                                  DAG.getConstant(Mask, DL, MVT::i32),
-                                  ShiftAmt);
-    DstMask = DAG.getNode(ISD::XOR, DL, MVT::i32, DstMask,
-                          DAG.getConstant(0xffffffff, DL, MVT::i32));
-    Dst = DAG.getNode(ISD::AND, DL, MVT::i32, Dst, DstMask);
-
-    SDValue Value = DAG.getNode(ISD::OR, DL, MVT::i32, Dst, ShiftedValue);
-    return DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
-                       Chain, Value, Ptr,
-                       DAG.getTargetConstant(0, DL, MVT::i32));
-  }
-  return SDValue();
-}
-
 // This is a shortcut for integer division because we have fast i32<->f32
 // conversions, and fast f32 reciprocal instructions. The fractional part of a
 // float is enough to accurately represent up to a 24-bit integer.
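
The FIXME above refers to MergeVectorStore, which tries to pack a small vector truncating store (e.g. v4i8) into a single 32-bit store. As a rough scalar model of that packing, under the assumption of little-endian lane order; the helper below is illustrative only, not part of this patch or of the LLVM API:

    #include <cstdint>

    // Sketch of the packing MergeVectorStore performs when all elements of a
    // sub-dword vector store fit in one dword: mask each element to its memory
    // width, then OR it into its lane of a single i32 value that is stored once.
    uint32_t packElts(const uint32_t *Elts, unsigned NumElts, unsigned EltBits) {
      uint32_t EltMask = (EltBits == 32) ? ~0u : ((1u << EltBits) - 1);
      uint32_t Packed = 0;
      for (unsigned I = 0; I != NumElts; ++I)
        Packed |= (Elts[I] & EltMask) << (I * EltBits); // lane I of the dword
      return Packed;
    }
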
Index: lib/Target/AMDGPU/R600ISelLowering.h
===================================================================
--- lib/Target/AMDGPU/R600ISelLowering.h
+++ lib/Target/AMDGPU/R600ISelLowering.h
@@ -59,6 +59,8 @@
   SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
+
+  SDValue lowerPrivateTruncStore(StoreSDNode *Store, SelectionDAG &DAG) const;
   SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const;
Index: lib/Target/AMDGPU/R600ISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/R600ISelLowering.cpp
+++ lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -1264,19 +1264,73 @@
   }
 }
 
+SDValue R600TargetLowering::lowerPrivateTruncStore(StoreSDNode *Store,
+                                                   SelectionDAG &DAG) const {
+  SDLoc DL(Store);
+
+  unsigned Mask = 0;
+  if (Store->getMemoryVT() == MVT::i8) {
+    Mask = 0xff;
+  } else if (Store->getMemoryVT() == MVT::i16) {
+    Mask = 0xffff;
+  }
+
+  SDValue Chain = Store->getChain();
+  SDValue BasePtr = Store->getBasePtr();
+  EVT MemVT = Store->getMemoryVT();
+
+  SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, BasePtr,
+                            DAG.getConstant(2, DL, MVT::i32));
+  SDValue Dst = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32,
+                            Chain, Ptr,
+                            DAG.getTargetConstant(0, DL, MVT::i32));
+
+  SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, BasePtr,
+                                DAG.getConstant(0x3, DL, MVT::i32));
+
+  SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
+                                 DAG.getConstant(3, DL, MVT::i32));
+
+  SDValue SExtValue = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32,
+                                  Store->getValue());
+
+  SDValue MaskedValue = DAG.getZeroExtendInReg(SExtValue, DL, MemVT);
+
+  SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, MVT::i32,
+                                     MaskedValue, ShiftAmt);
+
+  SDValue DstMask = DAG.getNode(ISD::SHL, DL, MVT::i32,
+                                DAG.getConstant(Mask, DL, MVT::i32),
+                                ShiftAmt);
+  DstMask = DAG.getNode(ISD::XOR, DL, MVT::i32, DstMask,
+                        DAG.getConstant(0xffffffff, DL, MVT::i32));
+  Dst = DAG.getNode(ISD::AND, DL, MVT::i32, Dst, DstMask);
+
+  SDValue Value = DAG.getNode(ISD::OR, DL, MVT::i32, Dst, ShiftedValue);
+  return DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
+                     Chain, Value, Ptr,
+                     DAG.getTargetConstant(0, DL, MVT::i32));
+}
+
 SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
-  SDLoc DL(Op);
+  if (SDValue Result = AMDGPUTargetLowering::MergeVectorStore(Op, DAG))
+    return Result;
+
   StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
-  SDValue Chain = Op.getOperand(0);
-  SDValue Value = Op.getOperand(1);
-  SDValue Ptr = Op.getOperand(2);
+  unsigned AS = StoreNode->getAddressSpace();
+  SDValue Value = StoreNode->getValue();
+  EVT ValueVT = Value.getValueType();
 
-  SDValue Result = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
-  if (Result.getNode()) {
-    return Result;
+  if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS) &&
+      ValueVT.isVector()) {
+    return SplitVectorStore(Op, DAG);
   }
 
-  if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS) {
+  SDLoc DL(Op);
+  SDValue Chain = StoreNode->getChain();
+  SDValue Ptr = StoreNode->getBasePtr();
+
+  if (AS == AMDGPUAS::GLOBAL_ADDRESS) {
     if (StoreNode->isTruncatingStore()) {
       EVT VT = Value.getValueType();
       assert(VT.bitsLE(MVT::i32));
@@ -1311,7 +1365,7 @@
                                      Op->getVTList(), Args, MemVT,
                                      StoreNode->getMemOperand());
   } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR &&
-             Value.getValueType().bitsGE(MVT::i32)) {
+             ValueVT.bitsGE(MVT::i32)) {
     // Convert pointer from byte address to dword address.
     Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
                       DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
@@ -1326,18 +1380,14 @@
     }
   }
 
-  EVT ValueVT = Value.getValueType();
-
-  if (StoreNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
+  if (AS != AMDGPUAS::PRIVATE_ADDRESS)
     return SDValue();
-  }
 
-  SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
-  if (Ret.getNode()) {
-    return Ret;
-  }
-
-  // Lowering for indirect addressing
+  EVT MemVT = StoreNode->getMemoryVT();
+  if (MemVT.bitsLT(MVT::i32))
+    return lowerPrivateTruncStore(StoreNode, DAG);
 
+  // Lowering for indirect addressing
   const MachineFunction &MF = DAG.getMachineFunction();
   const AMDGPUFrameLowering *TFL =
       static_cast<const AMDGPUFrameLowering *>(Subtarget->getFrameLowering());
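
The REGISTER_LOAD / REGISTER_STORE sequence that lowerPrivateTruncStore builds is a read-modify-write of the dword containing the sub-dword value. In plain scalar terms (the C framing and helper name are illustrative only, and the model takes the already-loaded dword rather than the REGISTER_LOAD of BasePtr >> 2 itself):

    #include <cstdint>

    // Scalar model of lowerPrivateTruncStore: clear the destination lane of the
    // containing dword, then OR in the value shifted into place. The result is
    // what REGISTER_STORE writes back.
    uint32_t rmwSubDwordStore(uint32_t Dword, uint32_t ByteAddr, uint32_t Value,
                              unsigned MemBits /* 8 or 16 */) {
      uint32_t Mask = (MemBits == 8) ? 0xff : 0xffff;
      uint32_t ShiftAmt = (ByteAddr & 0x3) * 8;  // ByteIdx << 3
      uint32_t Masked = Value & Mask;            // the zero-extend-in-reg step
      uint32_t DstMask = ~(Mask << ShiftAmt);    // the XOR-with-0xffffffff step
      return (Dword & DstMask) | (Masked << ShiftAmt);
    }
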
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1850,24 +1850,27 @@
   StoreSDNode *Store = cast<StoreSDNode>(Op);
   EVT VT = Store->getMemoryVT();
 
-  // These stores are legal.
-  if (Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
-    if (VT.isVector() && VT.getVectorNumElements() > 4)
-      return ScalarizeVectorStore(Op, DAG);
-    return SDValue();
+  if (VT == MVT::i1) {
+    return DAG.getTruncStore(Store->getChain(), DL,
+                             DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
+                             Store->getBasePtr(), MVT::i1, Store->getMemOperand());
   }
 
-  SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
-  if (Ret.getNode())
-    return Ret;
+  assert(Store->getValue().getValueType().getScalarType() == MVT::i32);
 
-  if (VT.isVector() && VT.getVectorNumElements() >= 8)
-    return SplitVectorStore(Op, DAG);
+  unsigned NElts = VT.getVectorNumElements();
+  unsigned AS = Store->getAddressSpace();
+  if (AS == AMDGPUAS::LOCAL_ADDRESS) {
+    // If properly aligned, splitting may let us use ds_write_b64.
+    return SplitVectorStore(Op, DAG);
+  }
 
-  if (VT == MVT::i1)
-    return DAG.getTruncStore(Store->getChain(), DL,
-                             DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
-                             Store->getBasePtr(), MVT::i1, Store->getMemOperand());
+  if (AS == AMDGPUAS::PRIVATE_ADDRESS && NElts > 4)
+    return ScalarizeVectorStore(Op, DAG);
+
+  // These stores are legal: private, global and flat.
+  if (NElts >= 8)
+    return SplitVectorStore(Op, DAG);
 
   return SDValue();
 }
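
With the base-class LowerSTORE gone, the rewritten SITargetLowering::LowerSTORE makes every decision locally. A compact model of the new dispatch order, for reference; the enums and helper below are illustrative stand-ins for AMDGPUAS and the lowering actions, not LLVM API:

    // Mirrors the order of checks in the rewritten SITargetLowering::LowerSTORE.
    enum AddrSpace { Global, Local, Private, Flat };
    enum class Action { ExpandI1, Split, Scalarize, Legal };

    Action classifyStore(bool IsI1, AddrSpace AS, unsigned NumElts) {
      if (IsI1)
        return Action::ExpandI1;    // re-emitted as an i32 truncating store
      if (AS == Local)
        return Action::Split;       // splitting may allow ds_write_b64
      if (AS == Private && NumElts > 4)
        return Action::Scalarize;
      if (NumElts >= 8)
        return Action::Split;       // remaining wide vectors get split
      return Action::Legal;         // private, global and flat stores
    }
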