Index: lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -46,6 +46,7 @@
   bool runOnMachineFunction(MachineFunction &MF) override;
   SDNode *Select(SDNode *N) override;
   const char *getPassName() const override;
+  void PreprocessISelDAG() override;
   void PostprocessISelDAG() override;
 
 private:
@@ -465,62 +466,11 @@
     return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL,
                                   N->getValueType(0), Ops);
   }
-
-  case ISD::LOAD: {
-    LoadSDNode *LD = cast<LoadSDNode>(N);
-    SDLoc SL(N);
-    EVT VT = N->getValueType(0);
-
-    if (VT != MVT::i64 || LD->getExtensionType() != ISD::NON_EXTLOAD) {
-      N = glueCopyToM0(N);
-      break;
-    }
-
-    // To simplify the TableGen patters, we replace all i64 loads with
-    // v2i32 loads. Alternatively, we could promote i64 loads to v2i32
-    // during DAG legalization, however, so places (ExpandUnalignedLoad)
-    // in the DAG legalizer assume that if i64 is legal, so doing this
-    // promotion early can cause problems.
-
-    SDValue NewLoad = CurDAG->getLoad(MVT::v2i32, SDLoc(N), LD->getChain(),
-                                      LD->getBasePtr(), LD->getMemOperand());
-    SDValue BitCast = CurDAG->getNode(ISD::BITCAST, SL,
-                                      MVT::i64, NewLoad);
-    CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 1), NewLoad.getValue(1));
-    CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), BitCast);
-    SDNode *Load = glueCopyToM0(NewLoad.getNode());
-    SelectCode(Load);
-    N = BitCast.getNode();
-    break;
-  }
-
+  case ISD::LOAD:
   case ISD::STORE: {
-    // Handle i64 stores here for the same reason mentioned above for loads.
-    StoreSDNode *ST = cast<StoreSDNode>(N);
-    SDValue Value = ST->getValue();
-    if (Value.getValueType() == MVT::i64 && !ST->isTruncatingStore()) {
-
-      SDValue NewValue = CurDAG->getNode(ISD::BITCAST, SDLoc(N),
-                                         MVT::v2i32, Value);
-      SDValue NewStore = CurDAG->getStore(ST->getChain(), SDLoc(N), NewValue,
-                                          ST->getBasePtr(), ST->getMemOperand());
-
-      CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewStore);
-
-      if (NewValue.getOpcode() == ISD::BITCAST) {
-        Select(NewStore.getNode());
-        return SelectCode(NewValue.getNode());
-      }
-
-      // getNode() may fold the bitcast if its input was another bitcast. If that
-      // happens we should only select the new store.
-      N = NewStore.getNode();
-    }
-
     N = glueCopyToM0(N);
     break;
   }
-
   case AMDGPUISD::REGISTER_LOAD: {
     if (Subtarget->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
       break;
@@ -1547,6 +1497,65 @@
   return SelectVOP3Mods(In, Src, SrcMods);
 }
 
+void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
+  bool Modified = false;
+
+  // XXX - Other targets seem to be able to do this without a worklist.
+  SmallVector<LoadSDNode *, 8> LoadsToReplace;
+  SmallVector<StoreSDNode *, 8> StoresToReplace;
+
+  for (SDNode &Node : CurDAG->allnodes()) {
+    if (LoadSDNode *LD = dyn_cast<LoadSDNode>(&Node)) {
+      EVT VT = LD->getValueType(0);
+      if (VT != MVT::i64 || LD->getExtensionType() != ISD::NON_EXTLOAD)
+        continue;
+
+      // To simplify the TableGen patterns, we replace all i64 loads with v2i32
+      // loads. Alternatively, we could promote i64 loads to v2i32 during DAG
+      // legalization, however, some places (ExpandUnalignedLoad) in the DAG
+      // legalizer assume that if i64 is legal, so doing this promotion early
+      // can cause problems.
+      LoadsToReplace.push_back(LD);
+    } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(&Node)) {
+      // Handle i64 stores here for the same reason mentioned above for loads.
+      SDValue Value = ST->getValue();
+      if (Value.getValueType() != MVT::i64 || ST->isTruncatingStore())
+        continue;
+      StoresToReplace.push_back(ST);
+    }
+  }
+
+  for (LoadSDNode *LD : LoadsToReplace) {
+    SDLoc SL(LD);
+
+    SDValue NewLoad = CurDAG->getLoad(MVT::v2i32, SL, LD->getChain(),
+                                      LD->getBasePtr(), LD->getMemOperand());
+    SDValue BitCast = CurDAG->getNode(ISD::BITCAST, SL,
+                                      MVT::i64, NewLoad);
+    CurDAG->ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLoad.getValue(1));
+    CurDAG->ReplaceAllUsesOfValueWith(SDValue(LD, 0), BitCast);
+    Modified = true;
+  }
+
+  for (StoreSDNode *ST : StoresToReplace) {
+    SDValue NewValue = CurDAG->getNode(ISD::BITCAST, SDLoc(ST),
+                                       MVT::v2i32, ST->getValue());
+    const SDValue StoreOps[] = {
+      ST->getChain(),
+      NewValue,
+      ST->getBasePtr(),
+      ST->getOffset()
+    };
+
+    CurDAG->UpdateNodeOperands(ST, StoreOps);
+    Modified = true;
+  }
+
+  // XXX - Is this necessary?
+  if (Modified)
+    CurDAG->RemoveDeadNodes();
+}
+
 void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
   const AMDGPUTargetLowering& Lowering =
     *static_cast<const AMDGPUTargetLowering*>(getTargetLowering());
Index: test/CodeGen/AMDGPU/extract-vector-elt-i64.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/extract-vector-elt-i64.ll
@@ -0,0 +1,19 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; The replacement of i64 stores with v2i32 stores resulted in
+; breaking other users of the bitcast if they already existed
+
+; GCN-LABEL: {{^}}extract_vector_elt_select_error:
+; GCN: buffer_store_dword
+; GCN: buffer_store_dword
+; GCN: buffer_store_dwordx2
+define void @extract_vector_elt_select_error(i32 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %val) nounwind {
+  %vec = bitcast i64 %val to <2 x i32>
+  %elt0 = extractelement <2 x i32> %vec, i32 0
+  %elt1 = extractelement <2 x i32> %vec, i32 1
+
+  store volatile i32 %elt0, i32 addrspace(1)* %out
+  store volatile i32 %elt1, i32 addrspace(1)* %out
+  store volatile i64 %val, i64 addrspace(1)* %in
+  ret void
+}