Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h =================================================================== --- llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -330,6 +330,7 @@ INTERP_P2, PC_ADD_REL_OFFSET, KILL, + DUMMY_CHAIN, FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE, STORE_MSKOR, LOAD_CONSTANT, Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -3278,6 +3278,7 @@ NODE_NAME_CASE(CONST_DATA_PTR) NODE_NAME_CASE(PC_ADD_REL_OFFSET) NODE_NAME_CASE(KILL) + NODE_NAME_CASE(DUMMY_CHAIN) case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break; NODE_NAME_CASE(SENDMSG) NODE_NAME_CASE(SENDMSGHALT) Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUInstrInfo.td =================================================================== --- llvm/trunk/lib/Target/AMDGPU/AMDGPUInstrInfo.td +++ llvm/trunk/lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -54,6 +54,9 @@ // This argument to this node is a dword address. def AMDGPUdwordaddr : SDNode<"AMDGPUISD::DWORDADDR", SDTIntUnaryOp>; +// Force dependencies for vector trunc stores +def R600dummy_chain : SDNode<"AMDGPUISD::DUMMY_CHAIN", SDTNone, [SDNPHasChain]>; + def AMDGPUcos : SDNode<"AMDGPUISD::COS_HW", SDTFPUnaryOp>; def AMDGPUsin : SDNode<"AMDGPUISD::SIN_HW", SDTFPUnaryOp>; Index: llvm/trunk/lib/Target/AMDGPU/R600ISelLowering.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/R600ISelLowering.cpp +++ llvm/trunk/lib/Target/AMDGPU/R600ISelLowering.cpp @@ -1120,7 +1120,10 @@ llvm_unreachable("Unsupported private trunc store"); } - SDValue Chain = Store->getChain(); + SDValue OldChain = Store->getChain(); + bool VectorTrunc = (OldChain.getOpcode() == AMDGPUISD::DUMMY_CHAIN); + // Skip dummy + SDValue Chain = VectorTrunc ? 
OldChain->getOperand(0) : OldChain; SDValue BasePtr = Store->getBasePtr(); SDValue Offset = Store->getOffset(); EVT MemVT = Store->getMemoryVT(); @@ -1176,7 +1179,15 @@ // Store dword // TODO: Can we be smarter about MachinePointerInfo? - return DAG.getStore(Chain, DL, Value, Ptr, MachinePointerInfo()); + SDValue NewStore = DAG.getStore(Chain, DL, Value, Ptr, MachinePointerInfo()); + + // If we are part of expanded vector, make our neighbors depend on this store + if (VectorTrunc) { + // Make all other vector elements depend on this store + Chain = DAG.getNode(AMDGPUISD::DUMMY_CHAIN, DL, MVT::Other, NewStore); + DAG.ReplaceAllUsesOfValueWith(OldChain, Chain); + } + return NewStore; } SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { @@ -1196,6 +1207,17 @@ // Neither LOCAL nor PRIVATE can do vectors at the moment if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS) && VT.isVector()) { + if ((AS == AMDGPUAS::PRIVATE_ADDRESS) && StoreNode->isTruncatingStore()) { + // Add an extra level of chain to isolate this vector + SDValue NewChain = DAG.getNode(AMDGPUISD::DUMMY_CHAIN, DL, MVT::Other, Chain); + // TODO: can the chain be replaced without creating a new store? 
+ SDValue NewStore = DAG.getTruncStore( + NewChain, DL, Value, Ptr, StoreNode->getPointerInfo(), + MemVT, StoreNode->getAlignment(), + StoreNode->getMemOperand()->getFlags(), StoreNode->getAAInfo()); + StoreNode = cast<StoreSDNode>(NewStore); + } + return scalarizeVectorStore(StoreNode, DAG); } @@ -1230,7 +1252,7 @@ // Put the mask in correct place SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, BitShift); - // Put the mask in correct place + // Put the value bits in correct place SDValue TruncValue = DAG.getNode(ISD::AND, DL, VT, Value, MaskConstant); SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, BitShift); Index: llvm/trunk/lib/Target/AMDGPU/R600Instructions.td =================================================================== --- llvm/trunk/lib/Target/AMDGPU/R600Instructions.td +++ llvm/trunk/lib/Target/AMDGPU/R600Instructions.td @@ -727,6 +727,20 @@ def MOV : R600_1OP <0x19, "MOV", []>; + +// This is a hack to get rid of DUMMY_CHAIN nodes. +// Most DUMMY_CHAINs should be eliminated during legalization, but undef +// values can let some of them slip through to selection. 
+let isPseudo = 1, isCodeGenOnly = 1 in { +def DUMMY_CHAIN : AMDGPUInst < + (outs), + (ins), + "DUMMY_CHAIN", + [(R600dummy_chain)] +>; +} // end let isPseudo = 1, isCodeGenOnly = 1 + + let isPseudo = 1, isCodeGenOnly = 1, usesCustomInserter = 1 in { class MOV_IMM : AMDGPUInst < Index: llvm/trunk/test/CodeGen/AMDGPU/load-local-i8.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/load-local-i8.ll +++ llvm/trunk/test/CodeGen/AMDGPU/load-local-i8.ll @@ -708,10 +708,11 @@ ; FUNC-LABEL: {{^}}local_sextload_v4i8_to_v4i16: ; EG: LDS_READ_RET +; TODO: these do LSHR + BFE_INT, instead of just BFE_INT/ASHR +; EG-DAG: BFE_INT ; EG-DAG: BFE_INT ; EG-DAG: BFE_INT ; EG-DAG: BFE_INT -; EG-DAG: ASHR ; EG: LDS_WRITE ; EG: LDS_WRITE define void @local_sextload_v4i8_to_v4i16(<4 x i16> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 { @@ -740,14 +741,15 @@ ; EG: LDS_READ_RET ; EG: LDS_READ_RET +; TODO: these do LSHR + BFE_INT, instead of just BFE_INT/ASHR +; EG-DAG: BFE_INT +; EG-DAG: BFE_INT ; EG-DAG: BFE_INT ; EG-DAG: BFE_INT ; EG-DAG: BFE_INT ; EG-DAG: BFE_INT ; EG-DAG: BFE_INT ; EG-DAG: BFE_INT -; EG-DAG: ASHR -; EG-DAG: ASHR ; EG: LDS_WRITE ; EG: LDS_WRITE ; EG: LDS_WRITE @@ -786,6 +788,11 @@ ; EG: LDS_READ_RET ; EG: LDS_READ_RET ; EG: LDS_READ_RET +; TODO: these do LSHR + BFE_INT, instead of just BFE_INT/ASHR +; EG-DAG: BFE_INT +; EG-DAG: BFE_INT +; EG-DAG: BFE_INT +; EG-DAG: BFE_INT ; EG-DAG: BFE_INT ; EG-DAG: BFE_INT ; EG-DAG: BFE_INT @@ -798,10 +805,6 @@ ; EG-DAG: BFE_INT ; EG-DAG: BFE_INT ; EG-DAG: BFE_INT -; EG-DAG: ASHR -; EG-DAG: ASHR -; EG-DAG: ASHR -; EG-DAG: ASHR ; EG: LDS_WRITE ; EG: LDS_WRITE ; EG: LDS_WRITE @@ -860,6 +863,11 @@ ; EG: LDS_READ_RET ; EG: LDS_READ_RET ; EG: LDS_READ_RET +; TODO: these do LSHR + BFE_INT, instead of just BFE_INT/ASHR +; EG-DAG: BFE_INT +; EG-DAG: BFE_INT +; EG-DAG: BFE_INT +; EG-DAG: BFE_INT ; EG-DAG: BFE_INT ; EG-DAG: BFE_INT ; EG-DAG: BFE_INT @@ -884,14 +892,6 @@ ; EG-DAG: 
BFE_INT ; EG-DAG: BFE_INT ; EG-DAG: BFE_INT -; EG-DAG: ASHR -; EG-DAG: ASHR -; EG-DAG: ASHR -; EG-DAG: ASHR -; EG-DAG: ASHR -; EG-DAG: ASHR -; EG-DAG: ASHR -; EG-DAG: ASHR ; EG: LDS_WRITE ; EG: LDS_WRITE ; EG: LDS_WRITE