Index: lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -559,6 +559,10 @@
   if ((C = dyn_cast<ConstantSDNode>(Addr))) {
     Base = CurDAG->getRegister(AMDGPU::INDIRECT_BASE_ADDR, MVT::i32);
     Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
+  } else if ((Addr.getOpcode() == AMDGPUISD::DWORDADDR) &&
+             (C = dyn_cast<ConstantSDNode>(Addr.getOperand(0)))) {
+    Base = CurDAG->getRegister(AMDGPU::INDIRECT_BASE_ADDR, MVT::i32);
+    Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
   } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
              (C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) {
     Base = Addr.getOperand(0);
Index: lib/Target/AMDGPU/R600ISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/R600ISelLowering.cpp
+++ lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -1093,43 +1093,68 @@
     Mask = 0xff;
   } else if (Store->getMemoryVT() == MVT::i16) {
     Mask = 0xffff;
+  } else {
+    llvm_unreachable("Unsupported private trunc store");
   }
 
   SDValue Chain = Store->getChain();
   SDValue BasePtr = Store->getBasePtr();
+  SDValue Offset = Store->getOffset();
   EVT MemVT = Store->getMemoryVT();
 
-  SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, BasePtr,
-                            DAG.getConstant(2, DL, MVT::i32));
-  SDValue Dst = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32,
-                            Chain, Ptr,
-                            DAG.getTargetConstant(0, DL, MVT::i32));
+  SDValue LoadPtr = BasePtr;
+  if (!Offset.isUndef()) {
+    LoadPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr, Offset);
+  }
+
+  // Get dword location
+  // TODO: this should be eliminated by the future SHR ptr, 2
+  SDValue Ptr = DAG.getNode(ISD::AND, DL, MVT::i32, LoadPtr,
+                            DAG.getConstant(0xfffffffc, DL, MVT::i32));
+
+  // Load dword
+  // TODO: can we be smarter about machine pointer info?
+  SDValue Dst = DAG.getLoad(MVT::i32, DL, Chain, Ptr, MachinePointerInfo());
+
+  Chain = Dst.getValue(1);
 
-  SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, BasePtr,
+  // Get offset in dword
+  SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, LoadPtr,
                                 DAG.getConstant(0x3, DL, MVT::i32));
+
+  // Convert byte offset to bit shift
   SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
                                  DAG.getConstant(3, DL, MVT::i32));
+
+  // TODO: is this necessary? truncating store should be < value,
+  // so this should be eliminated by the next Zero extend
   SDValue SExtValue = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32,
                                   Store->getValue());
+
+  // Mask the value to the right type
   SDValue MaskedValue = DAG.getZeroExtendInReg(SExtValue, DL, MemVT);
+
+  // Shift the value in place
   SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, MVT::i32,
                                      MaskedValue, ShiftAmt);
+
+  // Shift the mask in place
   SDValue DstMask = DAG.getNode(ISD::SHL, DL, MVT::i32,
                                 DAG.getConstant(Mask, DL, MVT::i32),
                                 ShiftAmt);
-  DstMask = DAG.getNode(ISD::XOR, DL, MVT::i32, DstMask,
-                        DAG.getConstant(0xffffffff, DL, MVT::i32));
+
+  // Invert the mask. NOTE: if we had native ROL instructions we could
+  // use inverted mask
+  DstMask = DAG.getNOT(DL, DstMask, MVT::i32);
+
+  // Cleanup the target bits
   Dst = DAG.getNode(ISD::AND, DL, MVT::i32, Dst, DstMask);
+
+  // Add the new bits
   SDValue Value = DAG.getNode(ISD::OR, DL, MVT::i32, Dst, ShiftedValue);
-  return DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
-                     Chain, Value, Ptr,
-                     DAG.getTargetConstant(0, DL, MVT::i32));
+
+  // Store dword
+  // TODO: Can we be smarter about MachinePointerInfo?
+  return DAG.getStore(Chain, DL, Value, Ptr, MachinePointerInfo());
 }
 
 SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
@@ -1217,43 +1242,12 @@
   if (MemVT.bitsLT(MVT::i32))
     return lowerPrivateTruncStore(StoreNode, DAG);
 
-  // Lowering for indirect addressing
-  const MachineFunction &MF = DAG.getMachineFunction();
-  const R600FrameLowering *TFL = getSubtarget()->getFrameLowering();
-  unsigned StackWidth = TFL->getStackWidth(MF);
-
-  Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
-
-  if (ValueVT.isVector()) {
-    unsigned NumElemVT = ValueVT.getVectorNumElements();
-    EVT ElemVT = ValueVT.getVectorElementType();
-    SmallVector<SDValue, 4> Stores(NumElemVT);
-
-    assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
-                                      "vector width in load");
-
-    for (unsigned i = 0; i < NumElemVT; ++i) {
-      unsigned Channel, PtrIncr;
-      getStackAddress(StackWidth, i, Channel, PtrIncr);
-      Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
-                        DAG.getConstant(PtrIncr, DL, MVT::i32));
-      SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT,
-                                 Value, DAG.getConstant(i, DL, MVT::i32));
-
-      Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
-                              Chain, Elem, Ptr,
-                              DAG.getTargetConstant(Channel, DL, MVT::i32));
-    }
-    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
-  } else {
-    if (ValueVT == MVT::i8) {
-      Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value);
-    }
-    Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Value, Ptr,
-                      DAG.getTargetConstant(0, DL, MVT::i32)); // Channel
+  if (Ptr.getOpcode() != AMDGPUISD::DWORDADDR) {
+    Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(2, DL, MVT::i32));
+    Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, MVT::i32, Ptr);
+    return DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
   }
-
-  return Chain;
+  return SDValue();
 }
 
 // return (512 + (kc_bank << 12)
@@ -1304,50 +1298,48 @@
   ISD::LoadExtType ExtType = Load->getExtensionType();
   EVT MemVT = Load->getMemoryVT();
 
-  // getBasePtr();
+  SDValue BasePtr = Load->getBasePtr();
+  SDValue Chain = Load->getChain();
+  SDValue Offset = Load->getOffset();
 
-  // Get Register holding the target.
-  SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Load->getBasePtr(),
-                            DAG.getConstant(2, DL, MVT::i32));
-  // Load the Register.
-  SDValue Ret = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(),
-                            Load->getChain(),
-                            Ptr,
-                            DAG.getTargetConstant(0, DL, MVT::i32),
-                            Op.getOperand(2));
+  SDValue LoadPtr = BasePtr;
+  if (!Offset.isUndef()) {
+    LoadPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr, Offset);
+  }
+
+  // Get dword location
+  // NOTE: this should be eliminated by the future SHR ptr, 2
+  SDValue Ptr = DAG.getNode(ISD::AND, DL, MVT::i32, LoadPtr,
+                            DAG.getConstant(0xfffffffc, DL, MVT::i32));
+
+  // Load dword
+  // TODO: can we be smarter about machine pointer info?
+  SDValue Read = DAG.getLoad(MVT::i32, DL, Chain, Ptr, MachinePointerInfo());
 
   // Get offset within the register.
   SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32,
-                                Load->getBasePtr(),
-                                DAG.getConstant(0x3, DL, MVT::i32));
+                                LoadPtr, DAG.getConstant(0x3, DL, MVT::i32));
 
   // Bit offset of target byte (byteIdx * 8).
   SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
                                  DAG.getConstant(3, DL, MVT::i32));
 
   // Shift to the right.
-  Ret = DAG.getNode(ISD::SRL, DL, MVT::i32, Ret, ShiftAmt);
+  SDValue Ret = DAG.getNode(ISD::SRL, DL, MVT::i32, Read, ShiftAmt);
 
   // Eliminate the upper bits by setting them to ...
   EVT MemEltVT = MemVT.getScalarType();
 
-  // ... ones.
-  if (ExtType == ISD::SEXTLOAD) {
+  if (ExtType == ISD::SEXTLOAD) { // ... ones.
     SDValue MemEltVTNode = DAG.getValueType(MemEltVT);
-
-    SDValue Ops[] = {
-      DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Ret, MemEltVTNode),
-      Load->getChain()
-    };
-
-    return DAG.getMergeValues(Ops, DL);
+    Ret = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Ret, MemEltVTNode);
+  } else { // ... or zeros.
+    Ret = DAG.getZeroExtendInReg(Ret, DL, MemEltVT);
   }
 
-  // ... or zeros.
   SDValue Ops[] = {
-    DAG.getZeroExtendInReg(Ret, DL, MemEltVT),
-    Load->getChain()
+    Ret,
+    Read.getValue(1) // This should be our output chain
   };
 
   return DAG.getMergeValues(Ops, DL);
@@ -1376,16 +1368,12 @@
   SDValue Chain = LoadNode->getChain();
   SDValue Ptr = LoadNode->getBasePtr();
 
-  if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) {
-    SDValue MergedValues[2] = {
-      scalarizeVectorLoad(LoadNode, DAG),
-      Chain
-    };
-    return DAG.getMergeValues(MergedValues, DL);
+  if ((LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
+      LoadNode->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) &&
+      VT.isVector()) {
+      return scalarizeVectorLoad(LoadNode, DAG);
   }
 
-  SDValue LoweredLoad;
-
   // For most operations returning SDValue() will result in the node being
   // expanded by the DAG Legalizer. This is not the case for ISD::LOAD, so we
   // need to manually expand loads that may be legal in some address spaces and
@@ -1410,47 +1398,14 @@
     return SDValue();
   }
 
-  // Lowering for indirect addressing
-  const MachineFunction &MF = DAG.getMachineFunction();
-  const R600FrameLowering *TFL = getSubtarget()->getFrameLowering();
-  unsigned StackWidth = TFL->getStackWidth(MF);
-
-  Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
-
-  if (VT.isVector()) {
-    unsigned NumElemVT = VT.getVectorNumElements();
-    EVT ElemVT = VT.getVectorElementType();
-    SDValue Loads[4];
-
-    assert(NumElemVT <= 4);
-    assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
-                                      "vector width in load");
-
-    for (unsigned i = 0; i < NumElemVT; ++i) {
-      unsigned Channel, PtrIncr;
-      getStackAddress(StackWidth, i, Channel, PtrIncr);
-      Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
-                        DAG.getConstant(PtrIncr, DL, MVT::i32));
-      Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT,
-                             Chain, Ptr,
-                             DAG.getTargetConstant(Channel, DL, MVT::i32),
-                             Op.getOperand(2));
-    }
-    EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, NumElemVT);
-    LoweredLoad = DAG.getBuildVector(TargetVT, DL, makeArrayRef(Loads, NumElemVT));
-  } else {
-    LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT,
-                              Chain, Ptr,
-                              DAG.getTargetConstant(0, DL, MVT::i32), // Channel
-                              Op.getOperand(2));
+  // DWORDADDR ISD marks already shifted address
+  if (Ptr.getOpcode() != AMDGPUISD::DWORDADDR) {
+    assert(VT == MVT::i32 || (LoadNode->dumpr(),0));
+    Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(2, DL, MVT::i32));
+    Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, MVT::i32, Ptr);
+    return DAG.getLoad(MVT::i32, DL, Chain, Ptr, LoadNode->getMemOperand());
   }
-
-  SDValue Ops[2] = {
-    LoweredLoad,
-    Chain
-  };
-
-  return DAG.getMergeValues(Ops, DL);
+  return SDValue();
 }
 
 SDValue R600TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
Index: lib/Target/AMDGPU/R600Instructions.td
===================================================================
--- lib/Target/AMDGPU/R600Instructions.td
+++ lib/Target/AMDGPU/R600Instructions.td
@@ -1291,6 +1291,17 @@
 
 defm R600_ : RegisterLoadStore <R600_Reg32, FRAMEri, ADDRIndirect>;
 
+// Hardcode channel to 0
+// NOTE: LSHR is not available here. LSHR is per family instruction
+def : Pat <
+  (i32 (load_private ADDRIndirect:$addr) ),
+  (R600_RegisterLoad FRAMEri:$addr, (i32 0))
+>;
+def : Pat <
+  (store_private i32:$val, ADDRIndirect:$addr),
+  (R600_RegisterStore i32:$val, FRAMEri:$addr, (i32 0))
+>;
+
 //===----------------------------------------------------------------------===//
 // Pseudo instructions
Index: test/CodeGen/AMDGPU/load-constant-i16.ll
===================================================================
--- test/CodeGen/AMDGPU/load-constant-i16.ll
+++ test/CodeGen/AMDGPU/load-constant-i16.ll
@@ -137,8 +137,8 @@
 ; v2i16 is naturally 4 byte aligned
 ; EG: VTX_READ_32 [[DST:T[0-9]\.[XYZW]]], [[DST]], 0, #1
-; TODO: This should use DST, but for some there are redundant MOVs
-; EG: BFE_UINT {{[* ]*}}T{{[0-9].[XYZW]}}, {{PV.[XYZW]}}, literal
+; EG: BFE_UINT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], literal
+; EG: 16
 ; EG: 16
 define void @constant_zextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) #0 {
   %load = load <2 x i16>, <2 x i16> addrspace(2)* %in
@@ -184,19 +184,19 @@
   ret void
 }
 
-; FUNC-LABEL: {{^}}constant_constant_sextload_v3i16_to_v3i32:
+; FUNC-LABEL: {{^}}constant_sextload_v3i16_to_v3i32:
 ; GCN: s_load_dwordx2
 
 ; v3i16 is naturally 8 byte aligned
 ; EG-DAG: VTX_READ_32 [[DST_HI:T[0-9]\.[XYZW]]], [[DST_HI]], 0, #1
 ; EG-DAG: VTX_READ_16 [[DST_LO:T[0-9]\.[XYZW]]], [[DST_LO]], 4, #1
 ; TODO: These should use DST, but for some there are redundant MOVs
-; EG-DAG: ASHR {{[* ]*}}T{{[0-9].[XYZW]}}, {{PV.[XYZW]}}, literal
-; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{T[0-9].[XYZW]}}, 0.0, literal
+; EG-DAG: ASHR {{[* ]*}}T{{[0-9].[XYZW]}}, {{T[0-9].[XYZW]}}, literal
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{T[0-9].[XYZW]}}, 0.0, literal
+; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_LO]], 0.0, literal
 ; EG-DAG: 16
 ; EG-DAG: 16
-define void @constant_constant_sextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(2)* %in) {
+define void @constant_sextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(2)* %in) {
 entry:
   %ld = load <3 x i16>, <3 x i16> addrspace(2)* %in
   %ext = sext <3 x i16> %ld to <3 x i32>
Index: test/CodeGen/AMDGPU/load-global-i16.ll
===================================================================
--- test/CodeGen/AMDGPU/load-global-i16.ll
+++ test/CodeGen/AMDGPU/load-global-i16.ll
@@ -146,8 +146,7 @@
 ; GCN-HSA: flat_load_dword
 
 ; EG: VTX_READ_32 [[DST:T[0-9]\.[XYZW]]], [[DST]], 0, #1
-; TODO: This should use DST, but for some there are redundant MOVs
-; EG: BFE_UINT {{[* ]*}}T{{[0-9].[XYZW]}}, {{PV.[XYZW]}}, literal
+; EG: BFE_UINT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], literal
 ; EG: 16
 define void @global_zextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
   %load = load <2 x i16>, <2 x i16> addrspace(1)* %in
@@ -175,7 +174,7 @@
   ret void
 }
 
-; FUNC-LABEL: {{^}}global_global_zextload_v3i16_to_v3i32:
+; FUNC-LABEL: {{^}}global_zextload_v3i16_to_v3i32:
 ; GCN-NOHSA: buffer_load_dwordx2
 ; GCN-HSA: flat_load_dwordx2
 
@@ -184,7 +183,7 @@
 ; TODO: This should use DST, but for some there are redundant MOVs
 ; EG: LSHR {{[* ]*}}{{T[0-9].[XYZW]}}, {{T[0-9].[XYZW]}}, literal
 ; EG: 16
-define void @global_global_zextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(1)* %in) {
+define void @global_zextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(1)* %in) {
 entry:
   %ld = load <3 x i16>, <3 x i16> addrspace(1)* %in
   %ext = zext <3 x i16> %ld to <3 x i32>
@@ -199,9 +198,9 @@
 ; EG-DAG: VTX_READ_32 [[DST_HI:T[0-9]\.[XYZW]]], [[DST_HI]], 0, #1
 ; EG-DAG: VTX_READ_16 [[DST_LO:T[0-9]\.[XYZW]]], [[DST_LO]], 4, #1
 ; TODO: These should use DST, but for some there are redundant MOVs
-; EG-DAG: ASHR {{[* ]*}}T{{[0-9].[XYZW]}}, {{PV.[XYZW]}}, literal
-; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{T[0-9].[XYZW]}}, 0.0, literal
+; EG-DAG: ASHR {{[* ]*}}T{{[0-9].[XYZW]}}, {{T[0-9].[XYZW]}}, literal
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{T[0-9].[XYZW]}}, 0.0, literal
+; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_LO]], 0.0, literal
 ; EG-DAG: 16
 ; EG-DAG: 16
 define void @global_global_sextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(1)* %in) {
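
The new lowerPrivateTruncStore path above emits a dword read-modify-write for sub-dword private stores (mask the pointer to a dword boundary, shift a mask and the value into the target byte lanes, clear, merge, store back). The following is a minimal standalone C++ sketch of that bit arithmetic on plain integers, not part of the patch; the names (storeSubDword, mem) are hypothetical and a little-endian, dword-organized buffer is assumed.

```cpp
#include <cassert>
#include <cstdint>
#include <cstdio>

// Store the low `bits` bits of `value` at byte address `addr` inside a
// little-endian, dword-organized memory, using only 32-bit loads/stores.
static void storeSubDword(uint32_t *mem, uint32_t addr, uint32_t value,
                          unsigned bits) {
  assert(bits == 8 || bits == 16);
  uint32_t mask = (bits == 8) ? 0xffu : 0xffffu;

  uint32_t dwordIdx = (addr & ~3u) / 4; // AND ptr, 0xfffffffc -> dword slot
  uint32_t byteIdx  = addr & 3u;        // AND ptr, 3
  uint32_t shift    = byteIdx * 8;      // SHL byteIdx, 3

  uint32_t dst = mem[dwordIdx];         // load dword
  dst &= ~(mask << shift);              // clear the target bits (inverted mask)
  dst |= (value & mask) << shift;       // merge the shifted, masked value
  mem[dwordIdx] = dst;                  // store dword back
}

int main() {
  uint32_t mem[2] = {0x11223344u, 0x55667788u};
  storeSubDword(mem, 5, 0xAB, 8);       // byte store at byte offset 5
  std::printf("%08x %08x\n", mem[0], mem[1]); // expect: 11223344 5566ab88
  return 0;
}
```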