Index: lib/Target/R600/AMDGPUISelLowering.h
===================================================================
--- lib/Target/R600/AMDGPUISelLowering.h
+++ lib/Target/R600/AMDGPUISelLowering.h
@@ -64,6 +64,8 @@
                                SelectionDAG &DAG) const;
   SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
 
+  SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+
 protected:
   static EVT getEquivalentMemType(LLVMContext &Context, EVT VT);
   static EVT getEquivalentLoadRegType(LLVMContext &Context, EVT VT);
Index: lib/Target/R600/AMDGPUISelLowering.cpp
===================================================================
--- lib/Target/R600/AMDGPUISelLowering.cpp
+++ lib/Target/R600/AMDGPUISelLowering.cpp
@@ -358,6 +358,7 @@
 
   setTargetDAGCombine(ISD::MUL);
   setTargetDAGCombine(ISD::SELECT_CC);
+  setTargetDAGCombine(ISD::STORE);
 
   setSchedulingPreference(Sched::RegPressure);
   setJumpIsExpensive(true);
@@ -1852,8 +1853,58 @@
   return DAG.getConstant(Src0 >> Offset, MVT::i32);
 }
 
+static bool usesAllNormalStores(SDNode *LoadVal) {
+  for (SDNode::use_iterator I = LoadVal->use_begin(); !I.atEnd(); ++I) {
+    if (!ISD::isNormalStore(*I))
+      return false;
+  }
+
+  return true;
+}
+
+// If we have a copy of an illegal type, replace it with a load / store of an
+// equivalently sized legal type. This avoids intermediate bit pack / unpack
+// instructions emitted when handling extloads and truncstores. Ideally we
+// could recognize the pack / unpack pattern to eliminate it.
+SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
+                                                  DAGCombinerInfo &DCI) const {
+  if (!DCI.isBeforeLegalize())
+    return SDValue();
+
+  StoreSDNode *SN = cast<StoreSDNode>(N);
+  SDValue Value = SN->getValue();
+  EVT VT = Value.getValueType();
+
+  if (isTypeLegal(VT) || SN->isVolatile() || !ISD::isNormalLoad(Value.getNode()))
+    return SDValue();
+
+  LoadSDNode *LoadVal = cast<LoadSDNode>(Value);
+  if (LoadVal->isVolatile() || !usesAllNormalStores(LoadVal))
+    return SDValue();
+
+  EVT MemVT = LoadVal->getMemoryVT();
+
+  SDLoc SL(N);
+  SelectionDAG &DAG = DCI.DAG;
+  EVT LoadVT = getEquivalentMemType(*DAG.getContext(), MemVT);
+
+  SDValue NewLoad = DAG.getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD,
+                                LoadVT, SL,
+                                LoadVal->getChain(),
+                                LoadVal->getBasePtr(),
+                                LoadVal->getOffset(),
+                                LoadVT,
+                                LoadVal->getMemOperand());
+
+  SDValue CastLoad = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad.getValue(0));
+  DCI.CombineTo(LoadVal, CastLoad, NewLoad.getValue(1), false);
+
+  return DAG.getStore(SN->getChain(), SL, NewLoad,
+                      SN->getBasePtr(), SN->getMemOperand());
+}
+
 SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
-                                            DAGCombinerInfo &DCI) const {
+                                                DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
   SDLoc DL(N);
@@ -1977,6 +2028,9 @@
     break;
   }
+
+  case ISD::STORE:
+    return performStoreCombine(N, DCI);
   }
   return SDValue();
 }
Index: test/CodeGen/R600/copy-illegal-type.ll
===================================================================
--- /dev/null
+++ test/CodeGen/R600/copy-illegal-type.ll
@@ -0,0 +1,166 @@
+; RUN: llc -march=r600 -mcpu=tahiti < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+; FUNC-LABEL: @test_copy_v4i8
+; SI: BUFFER_LOAD_DWORD [[REG:v[0-9]+]]
+; SI: BUFFER_STORE_DWORD [[REG]]
+; SI: S_ENDPGM
+define void @test_copy_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
+  %val = load <4 x i8> addrspace(1)* %in, align 4
+  store <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @test_copy_v4i8_x2
+; SI: BUFFER_LOAD_DWORD [[REG:v[0-9]+]]
+; SI: BUFFER_STORE_DWORD [[REG]]
+; SI: BUFFER_STORE_DWORD [[REG]]
+; SI: S_ENDPGM
+define void @test_copy_v4i8_x2(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind {
+  %val = load <4 x i8> addrspace(1)* %in, align 4
+  store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
+  store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4
+  ret void
+}
+
+; FUNC-LABEL: @test_copy_v4i8_x3
+; SI: BUFFER_LOAD_DWORD [[REG:v[0-9]+]]
+; SI: BUFFER_STORE_DWORD [[REG]]
+; SI: BUFFER_STORE_DWORD [[REG]]
+; SI: BUFFER_STORE_DWORD [[REG]]
+; SI: S_ENDPGM
+define void @test_copy_v4i8_x3(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind {
+  %val = load <4 x i8> addrspace(1)* %in, align 4
+  store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
+  store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4
+  store <4 x i8> %val, <4 x i8> addrspace(1)* %out2, align 4
+  ret void
+}
+
+; FUNC-LABEL: @test_copy_v4i8_x4
+; SI: BUFFER_LOAD_DWORD [[REG:v[0-9]+]]
+; SI: BUFFER_STORE_DWORD [[REG]]
+; SI: BUFFER_STORE_DWORD [[REG]]
+; SI: BUFFER_STORE_DWORD [[REG]]
+; SI: BUFFER_STORE_DWORD [[REG]]
+; SI: S_ENDPGM
+define void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %out3, <4 x i8> addrspace(1)* %in) nounwind {
+  %val = load <4 x i8> addrspace(1)* %in, align 4
+  store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
+  store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4
+  store <4 x i8> %val, <4 x i8> addrspace(1)* %out2, align 4
+  store <4 x i8> %val, <4 x i8> addrspace(1)* %out3, align 4
+  ret void
+}
+
+; FUNC-LABEL: @test_copy_v4i8_extra_use
+; SI: BUFFER_LOAD_UBYTE
+; SI: BUFFER_LOAD_UBYTE
+; SI: BUFFER_LOAD_UBYTE
+; SI: BUFFER_LOAD_UBYTE
+; SI-DAG: V_ADD
+; SI-DAG: V_ADD
+; SI-DAG: V_ADD
+; SI-DAG: V_ADD
+; SI-DAG: BUFFER_STORE_BYTE
+; SI-DAG: BUFFER_STORE_BYTE
+; SI-DAG: BUFFER_STORE_BYTE
+; SI-DAG: BUFFER_STORE_BYTE
+; SI-DAG: BUFFER_STORE_BYTE
+; SI-DAG: BUFFER_STORE_BYTE
+; SI-DAG: BUFFER_STORE_BYTE
+; SI-DAG: BUFFER_STORE_BYTE
+
+; Once scalarization of v4i8 loads is fixed, these should apply instead:
+; XSI: BUFFER_LOAD_DWORD
+; XSI: V_BFE
+; XSI: V_ADD
+; XSI: V_ADD
+; XSI: V_ADD
+; XSI: BUFFER_STORE_DWORD
+; XSI: BUFFER_STORE_DWORD
+
+; SI: S_ENDPGM
+define void @test_copy_v4i8_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind {
+  %val = load <4 x i8> addrspace(1)* %in, align 4
+  %add = add <4 x i8> %val, <i8 9, i8 9, i8 9, i8 9>
+  store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
+  store <4 x i8> %add, <4 x i8> addrspace(1)* %out1, align 4
+  ret void
+}
+
+; FUNC-LABEL: @test_copy_v4i8_x2_extra_use
+; SI: BUFFER_LOAD_UBYTE
+; SI: BUFFER_LOAD_UBYTE
+; SI: BUFFER_LOAD_UBYTE
+; SI: BUFFER_LOAD_UBYTE
+; SI-DAG: V_ADD
+; SI-DAG: V_ADD
+; SI-DAG: V_ADD
+; SI-DAG: V_ADD
+; SI-DAG: BUFFER_STORE_BYTE
+; SI-DAG: BUFFER_STORE_BYTE
+; SI-DAG: BUFFER_STORE_BYTE
+; SI-DAG: BUFFER_STORE_BYTE
+; SI-DAG: BUFFER_STORE_BYTE
+; SI-DAG: BUFFER_STORE_BYTE
+; SI-DAG: BUFFER_STORE_BYTE
+; SI-DAG: BUFFER_STORE_BYTE
+; SI-DAG: BUFFER_STORE_BYTE
+; SI-DAG: BUFFER_STORE_BYTE
+; SI-DAG: BUFFER_STORE_BYTE
+; SI-DAG: BUFFER_STORE_BYTE
+
+; XSI: BUFFER_LOAD_DWORD
+; XSI: BFE
+; XSI: BUFFER_STORE_DWORD
+; XSI: V_ADD
+; XSI: BUFFER_STORE_DWORD
+; XSI-NEXT: BUFFER_STORE_DWORD
+
+; SI: S_ENDPGM
+define void @test_copy_v4i8_x2_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind {
+  %val = load <4 x i8> addrspace(1)* %in, align 4
+  %add = add <4 x i8> %val, <i8 9, i8 9, i8 9, i8 9>
+  store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
+  store <4 x i8> %add, <4 x i8> addrspace(1)* %out1, align 4
+  store <4 x i8> %val, <4 x i8> addrspace(1)* %out2, align 4
+  ret void
+}
+
+; FUNC-LABEL: @test_copy_v3i8
+; SI-NOT: BFE
+; SI-NOT: BFI
+; SI: S_ENDPGM
+define void @test_copy_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind {
+  %val = load <3 x i8> addrspace(1)* %in, align 4
+  store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @test_copy_v4i8_volatile_load
+; SI: BUFFER_LOAD_UBYTE
+; SI: BUFFER_LOAD_UBYTE
+; SI: BUFFER_LOAD_UBYTE
+; SI: BUFFER_LOAD_UBYTE
+; SI: S_ENDPGM
+define void @test_copy_v4i8_volatile_load(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
+  %val = load volatile <4 x i8> addrspace(1)* %in, align 4
+  store <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @test_copy_v4i8_volatile_store
+; SI: BUFFER_LOAD_UBYTE
+; SI: BUFFER_LOAD_UBYTE
+; SI: BUFFER_LOAD_UBYTE
+; SI: BUFFER_LOAD_UBYTE
+; SI: BUFFER_STORE_BYTE
+; SI: BUFFER_STORE_BYTE
+; SI: BUFFER_STORE_BYTE
+; SI: BUFFER_STORE_BYTE
+; SI: S_ENDPGM
+define void @test_copy_v4i8_volatile_store(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
+  %val = load <4 x i8> addrspace(1)* %in, align 4
+  store volatile <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4
+  ret void
+}
Index: test/CodeGen/R600/indirect-private-64.ll
===================================================================
--- test/CodeGen/R600/indirect-private-64.ll
+++ test/CodeGen/R600/indirect-private-64.ll
@@ -17,10 +17,14 @@
 }
 
 ; SI-LABEL: @private_access_v2f64_alloca:
-; SI: DS_WRITE_B64
-; SI: DS_WRITE_B64
-; SI: DS_READ_B64
-; SI: DS_READ_B64
+; SI: DS_WRITE_B32
+; SI: DS_WRITE_B32
+; SI: DS_WRITE_B32
+; SI: DS_WRITE_B32
+; SI: DS_READ_B32
+; SI: DS_READ_B32
+; SI: DS_READ_B32
+; SI: DS_READ_B32
 define void @private_access_v2f64_alloca(<2 x double> addrspace(1)* noalias %out, <2 x double> addrspace(1)* noalias %in, i32 %b) nounwind {
   %val = load <2 x double> addrspace(1)* %in, align 16
   %array = alloca <2 x double>, i32 16, align 16
@@ -47,10 +51,14 @@
 }
 
 ; SI-LABEL: @private_access_v2i64_alloca:
-; SI: DS_WRITE_B64
-; SI: DS_WRITE_B64
-; SI: DS_READ_B64
-; SI: DS_READ_B64
+; SI: DS_WRITE_B32
+; SI: DS_WRITE_B32
+; SI: DS_WRITE_B32
+; SI: DS_WRITE_B32
+; SI: DS_READ_B32
+; SI: DS_READ_B32
+; SI: DS_READ_B32
+; SI: DS_READ_B32
 define void @private_access_v2i64_alloca(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in, i32 %b) nounwind {
   %val = load <2 x i64> addrspace(1)* %in, align 16
   %array = alloca <2 x i64>, i32 16, align 16
Index: test/CodeGen/R600/load.ll
===================================================================
--- test/CodeGen/R600/load.ll
+++ test/CodeGen/R600/load.ll
@@ -254,8 +254,8 @@
 
 ; load a v2f32 value from the global address space
 ; FUNC-LABEL: @load_v2f32
+; R600-CHECK: MEM_RAT
 ; R600-CHECK: VTX_READ_64
-
 ; SI-CHECK: BUFFER_LOAD_DWORDX2
 define void @load_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) {
 entry:
@@ -265,9 +265,7 @@
 }
 
 ; FUNC-LABEL: @load_i64
-; R600-CHECK: MEM_RAT
-; R600-CHECK: MEM_RAT
-
+; R600-CHECK: VTX_READ_64
 ; SI-CHECK: BUFFER_LOAD_DWORDX2
 define void @load_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
 entry:
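
For context (not part of the patch): an IR-level sketch of what the new combine effectively does for the simple copy case. The function names and the use of i32 as the stand-in legal type are illustrative assumptions only; the actual type comes from getEquivalentMemType(), and the rewrite happens on SelectionDAG nodes rather than on the IR.

; A copy of the illegal type <4 x i8>. Without the combine this is lowered
; through per-byte extloads / truncstores plus pack / unpack code.
define void @copy_v4i8_sketch(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
  %val = load <4 x i8> addrspace(1)* %in, align 4
  store <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4
  ret void
}

; Roughly the shape after the combine: the same copy done through an
; equivalently sized legal type, so a single BUFFER_LOAD_DWORD /
; BUFFER_STORE_DWORD pair is emitted (see copy-illegal-type.ll above).
define void @copy_v4i8_as_i32_sketch(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
  %in.i32 = bitcast <4 x i8> addrspace(1)* %in to i32 addrspace(1)*
  %out.i32 = bitcast <4 x i8> addrspace(1)* %out to i32 addrspace(1)*
  %val = load i32 addrspace(1)* %in.i32, align 4
  store i32 %val, i32 addrspace(1)* %out.i32, align 4
  ret void
}

Additional stores of the same loaded value are rewired to the BITCAST of the new load via DCI.CombineTo, which is what keeps the _x2/_x3/_x4 tests down to a single BUFFER_LOAD_DWORD.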