Index: lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -2074,15 +2074,19 @@
   bool IsModified = false;
   do {
     IsModified = false;
+
     // Go over all selected nodes and try to fold them a bit more
-    for (SDNode &Node : CurDAG->allnodes()) {
-      MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(&Node);
+    SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_begin();
+    while (Position != CurDAG->allnodes_end()) {
+      SDNode *Node = &*Position++;
+      MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(Node);
       if (!MachineNode)
         continue;

      SDNode *ResNode = Lowering.PostISelFolding(MachineNode, *CurDAG);
-      if (ResNode != &Node) {
-        ReplaceUses(&Node, ResNode);
+      if (ResNode != Node) {
+        if (ResNode)
+          ReplaceUses(Node, ResNode);
         IsModified = true;
       }
     }
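Note on the loop change: PostISelFolding may now delete the node it was handed and return null once it has already rewired all users, so a range-based for over CurDAG->allnodes() would be left holding a dangling iterator. A minimal standalone sketch of the same pattern, using std::list in place of the DAG's node list (Node, foldOne, and foldAll are made-up names, not LLVM API):

#include <list>

struct Node { int Data; };

// Stand-in for PostISelFolding: may erase the node it is given. Returning
// true here corresponds to the real function returning null after it has
// already updated all users itself.
static bool foldOne(std::list<Node> &AllNodes, std::list<Node>::iterator It) {
  if (It->Data % 2 == 0) {   // pretend even nodes fold away entirely
    AllNodes.erase(It);      // analogous to the DAG deleting a dead node
    return true;             // users already updated, nothing to replace
  }
  return false;              // unchanged
}

static void foldAll(std::list<Node> &AllNodes) {
  bool IsModified;
  do {
    IsModified = false;
    auto Position = AllNodes.begin();
    while (Position != AllNodes.end()) {
      auto Current = Position++; // advance first; foldOne may erase Current
      if (foldOne(AllNodes, Current))
        IsModified = true;
    }
  } while (IsModified);
}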
Index: lib/Target/AMDGPU/AMDGPUInstrInfo.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUInstrInfo.h
+++ lib/Target/AMDGPU/AMDGPUInstrInfo.h
@@ -50,9 +50,9 @@
   /// not exist. If Opcode is not a pseudo instruction, this is identity.
   int pseudoToMCOpcode(int Opcode) const;

-  /// \brief Given a MIMG \p Opcode that writes all 4 channels, return the
-  /// equivalent opcode that writes \p Channels Channels.
-  int getMaskedMIMGOp(uint16_t Opcode, unsigned Channels) const;
+  /// \brief Given a MIMG opcode \p Opc that writes any number of channels,
+  /// return the equivalent opcode that writes \p NewChannels channels.
+  int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels) const;
 };

 } // End llvm namespace
Index: lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
+++ lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
@@ -56,15 +56,59 @@
   return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
 }

-int AMDGPUInstrInfo::getMaskedMIMGOp(uint16_t Opcode, unsigned Channels) const {
-  switch (Channels) {
-  default: return Opcode;
-  case 1: return AMDGPU::getMaskedMIMGOp(Opcode, AMDGPU::Channels_1);
-  case 2: return AMDGPU::getMaskedMIMGOp(Opcode, AMDGPU::Channels_2);
-  case 3: return AMDGPU::getMaskedMIMGOp(Opcode, AMDGPU::Channels_3);
+static AMDGPU::Channels indexToChannel(unsigned Channel) {
+  switch (Channel) {
+  case 1:
+    return AMDGPU::Channels_1;
+  case 2:
+    return AMDGPU::Channels_2;
+  case 3:
+    return AMDGPU::Channels_3;
+  case 4:
+    return AMDGPU::Channels_4;
+  default:
+    llvm_unreachable("invalid MIMG channel");
   }
 }

+// FIXME: Need to handle d16 images correctly.
+static unsigned rcToChannels(unsigned RCID) {
+  switch (RCID) {
+  case AMDGPU::VGPR_32RegClassID:
+    return 1;
+  case AMDGPU::VReg_64RegClassID:
+    return 2;
+  case AMDGPU::VReg_96RegClassID:
+    return 3;
+  case AMDGPU::VReg_128RegClassID:
+    return 4;
+  default:
+    llvm_unreachable("invalid MIMG register class");
+  }
+}
+
+int AMDGPUInstrInfo::getMaskedMIMGOp(unsigned Opc,
+                                     unsigned NewChannels) const {
+  AMDGPU::Channels Channel = indexToChannel(NewChannels);
+  unsigned OrigChannels = rcToChannels(get(Opc).OpInfo[0].RegClass);
+  if (NewChannels == OrigChannels)
+    return Opc;
+
+  switch (OrigChannels) {
+  case 1:
+    return AMDGPU::getMaskedMIMGOp1(Opc, Channel);
+  case 2:
+    return AMDGPU::getMaskedMIMGOp2(Opc, Channel);
+  case 3:
+    return AMDGPU::getMaskedMIMGOp3(Opc, Channel);
+  case 4:
+    return AMDGPU::getMaskedMIMGOp4(Opc, Channel);
+  default:
+    llvm_unreachable("invalid MIMG channel");
+  }
+}
+
+
 // This must be kept in sync with the SIEncodingFamily class in SIInstrInfo.td
 enum SIEncodingFamily {
   SI = 0,
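Note: the NewChannels argument callers pass is the population count of the new dmask (adjustWritemask below passes countPopulation(NewDmask)), while the channel count already encoded in an opcode is recovered from the width of its destination register class, which is what rcToChannels expresses. A standalone sketch of that arithmetic (the function names here are illustrative, not part of the patch):

#include <cassert>

// A MIMG dmask has one bit per image component; the instruction variant
// needed is the one writing popcount(dmask) channels. E.g. dmask 0b0101
// still reads back two components, so the 2-channel variant is used.
static unsigned channelsForDmask(unsigned Dmask) {
  assert(Dmask != 0 && Dmask < 16 && "invalid MIMG dmask");
  return __builtin_popcount(Dmask); // countPopulation() in LLVM
}

// Conversely, the channel count implied by a destination register width:
// 32/64/96/128-bit classes hold 1/2/3/4 32-bit channels, mirroring
// rcToChannels above.
static unsigned channelsForRegWidth(unsigned Bits) {
  assert(Bits % 32 == 0 && Bits >= 32 && Bits <= 128 &&
         "invalid MIMG register class");
  return Bits / 32;
}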
Index: lib/Target/AMDGPU/SIISelLowering.h
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.h
+++ lib/Target/AMDGPU/SIISelLowering.h
@@ -82,7 +82,7 @@
   SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerTRAP(SDValue Op, SelectionDAG &DAG) const;

-  void adjustWritemask(MachineSDNode *&N, SelectionDAG &DAG) const;
+  SDNode *adjustWritemask(MachineSDNode *&N, SelectionDAG &DAG) const;

   SDValue performUCharToFloatCombine(SDNode *N, DAGCombinerInfo &DCI) const;
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -6578,9 +6578,9 @@
 }

 /// \brief Adjust the writemask of MIMG instructions
-void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
-                                       SelectionDAG &DAG) const {
-  SDNode *Users[4] = { };
+SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
+                                          SelectionDAG &DAG) const {
+  SDNode *Users[4] = { nullptr };
   unsigned Lane = 0;
   unsigned DmaskIdx = (Node->getNumOperands() - Node->getNumValues() == 9) ? 2 : 3;
   unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
@@ -6597,7 +6597,7 @@
     // Abort if we can't understand the usage
     if (!I->isMachineOpcode() ||
         I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
-      return;
+      return Node;

     // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
     // Note that subregs are packed, i.e. Lane==0 is the first bit set
@@ -6615,7 +6615,7 @@
     // Abort if we have more than one user per component
     if (Users[Lane])
-      return;
+      return Node;

     Users[Lane] = *I;
     NewDmask |= 1 << Comp;
   }
@@ -6623,25 +6623,41 @@
   // Abort if there's no change
   if (NewDmask == OldDmask)
-    return;
+    return Node;
+
+  unsigned BitsSet = countPopulation(NewDmask);
+
+  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
+  int NewOpcode = TII->getMaskedMIMGOp(Node->getMachineOpcode(), BitsSet);
+  assert(NewOpcode != -1 &&
+         NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
+         "failed to find equivalent MIMG op");

   // Adjust the writemask in the node
-  std::vector<SDValue> Ops;
+  SmallVector<SDValue, 16> Ops;
   Ops.insert(Ops.end(), Node->op_begin(), Node->op_begin() + DmaskIdx);
   Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
   Ops.insert(Ops.end(), Node->op_begin() + DmaskIdx + 1, Node->op_end());

-  Node = (MachineSDNode*)DAG.UpdateNodeOperands(Node, Ops);
-
-  // If we only got one lane, replace it with a copy
-  // (if NewDmask has only one bit set...)
-  if (NewDmask && (NewDmask & (NewDmask-1)) == 0) {
-    SDValue RC = DAG.getTargetConstant(AMDGPU::VGPR_32RegClassID, SDLoc(),
-                                       MVT::i32);
-    SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
-                                      SDLoc(), Users[Lane]->getValueType(0),
-                                      SDValue(Node, 0), RC);
+
+  MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
+
+  auto NewVTList =
+    DAG.getVTList(BitsSet == 1 ?
+                  SVT : MVT::getVectorVT(SVT, BitsSet == 3 ? 4 : BitsSet),
+                  MVT::Other);
+
+  MachineSDNode *NewNode = DAG.getMachineNode(NewOpcode, SDLoc(Node),
+                                              NewVTList, Ops);
+  // Update chain.
+  DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
+
+  if (BitsSet == 1) {
+    assert(Node->hasNUsesOfValue(1, 0));
+    SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY,
+                                      SDLoc(Node), Users[Lane]->getValueType(0),
+                                      SDValue(NewNode, 0));
     DAG.ReplaceAllUsesWith(Users[Lane], Copy);
-    return;
+    return nullptr;
   }

   // Update the users of the node with the new indices
@@ -6651,7 +6667,7 @@
       continue;

     SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
-    DAG.UpdateNodeOperands(User, User->getOperand(0), Op);
+    DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);

     switch (Idx) {
     default: break;
@@ -6660,6 +6676,9 @@
     case AMDGPU::sub2: Idx = AMDGPU::sub3; break;
     }
   }
+
+  DAG.RemoveDeadNode(Node);
+  return nullptr;
 }

 static bool isFrameIndexOp(SDValue Op) {
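Note: the Lane/Comp bookkeeping above relies on MIMG results being packed, so packed lane L of the old result holds the component at the L-th set bit of the old dmask; the surviving users are then renumbered sub0, sub1, ... against the shrunken result. The value-type choice also rounds three surviving channels up to a 4-element vector, presumably because no 3-wide simple value type is available here, even though the 3-channel instruction still writes a 96-bit register class. A standalone model of the lane-to-component mapping (laneToComponent is a made-up helper; the real code walks the dmask inline):

#include <cassert>

// Returns which image component packed result lane 'Lane' holds, i.e. the
// position of the Lane-th set bit of Dmask. With Dmask = 0b1011, lanes
// 0, 1, 2 hold components 0, 1, 3 respectively.
static unsigned laneToComponent(unsigned Dmask, unsigned Lane) {
  for (unsigned Comp = 0; Comp < 4; ++Comp) {
    if (!((Dmask >> Comp) & 1))
      continue;
    if (Lane == 0)
      return Comp;
    --Lane;
  }
  assert(false && "lane not covered by dmask");
  return ~0u;
}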
@@ -6717,14 +6736,16 @@
 }

 /// \brief Fold the instructions after selecting them.
+/// Returns null if users were already updated.
 SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
                                           SelectionDAG &DAG) const {
   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
   unsigned Opcode = Node->getMachineOpcode();

   if (TII->isMIMG(Opcode) && !TII->get(Opcode).mayStore() &&
-      !TII->isGather4(Opcode))
-    adjustWritemask(Node, DAG);
+      !TII->isGather4(Opcode)) {
+    return adjustWritemask(Node, DAG);
+  }

   if (Opcode == AMDGPU::INSERT_SUBREG ||
       Opcode == AMDGPU::REG_SEQUENCE) {
@@ -6802,31 +6823,6 @@
     return;
   }

-  if (TII->isMIMG(MI)) {
-    unsigned VReg = MI.getOperand(0).getReg();
-    const TargetRegisterClass *RC = MRI.getRegClass(VReg);
-    // TODO: Need mapping tables to handle other cases (register classes).
-    if (RC != &AMDGPU::VReg_128RegClass)
-      return;
-
-    unsigned DmaskIdx = MI.getNumOperands() == 12 ? 3 : 4;
-    unsigned Writemask = MI.getOperand(DmaskIdx).getImm();
-    unsigned BitsSet = 0;
-    for (unsigned i = 0; i < 4; ++i)
-      BitsSet += Writemask & (1 << i) ? 1 : 0;
-    switch (BitsSet) {
-    default: return;
-    case 1: RC = &AMDGPU::VGPR_32RegClass; break;
-    case 2: RC = &AMDGPU::VReg_64RegClass; break;
-    case 3: RC = &AMDGPU::VReg_96RegClass; break;
-    }
-
-    unsigned NewOpcode = TII->getMaskedMIMGOp(MI.getOpcode(), BitsSet);
-    MI.setDesc(TII->get(NewOpcode));
-    MRI.setRegClass(VReg, RC);
-    return;
-  }
-
   // Replace unused atomics with the no return version.
   int NoRetAtomicOp = AMDGPU::getAtomicNoRetOp(MI.getOpcode());
   if (NoRetAtomicOp != -1) {
Index: lib/Target/AMDGPU/SIInstrInfo.td
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.td
+++ lib/Target/AMDGPU/SIInstrInfo.td
@@ -1823,7 +1823,31 @@
   let ValueCols = [["Default"]];
 }

-def getMaskedMIMGOp : InstrMapping {
+def getMaskedMIMGOp1 : InstrMapping {
+  let FilterClass = "MIMG_Mask";
+  let RowFields = ["Op"];
+  let ColFields = ["Channels"];
+  let KeyCol = ["1"];
+  let ValueCols = [["2"], ["3"], ["4"]];
+}
+
+def getMaskedMIMGOp2 : InstrMapping {
+  let FilterClass = "MIMG_Mask";
+  let RowFields = ["Op"];
+  let ColFields = ["Channels"];
+  let KeyCol = ["2"];
+  let ValueCols = [["1"], ["3"], ["4"]];
+}
+
+def getMaskedMIMGOp3 : InstrMapping {
+  let FilterClass = "MIMG_Mask";
+  let RowFields = ["Op"];
+  let ColFields = ["Channels"];
+  let KeyCol = ["3"];
+  let ValueCols = [["1"], ["2"], ["4"]];
+}
+
+def getMaskedMIMGOp4 : InstrMapping {
   let FilterClass = "MIMG_Mask";
   let RowFields = ["Op"];
   let ColFields = ["Channels"];
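Note: splitting the old getMaskedMIMGOp mapping into four is forced by how InstrMapping works, as I read these definitions: each generated table answers queries keyed on one current channel count (KeyCol), so there has to be one table per possible origin. Conceptually the four tables form a 4x4 opcode matrix per instruction family; a standalone sketch of that picture (opcode values are invented, with -1 for a missing variant, matching the int return value asserted on in adjustWritemask):

#include <cassert>

// Variant[o - 1][n - 1]: the opcode writing n channels, reached from the
// o-channel opcode of one MIMG instruction family. Values are made up; the
// diagonal is never consulted because of the NewChannels == OrigChannels
// early-out, and -1 models a combination with no equivalent instruction.
static const int Variant[4][4] = {
  { -1, 101, 102, 103},
  {100,  -1, 102, 103},
  {100, 101,  -1, 103},
  {100, 101, 102,  -1},
};

static int getMaskedOp(int Opc, unsigned OrigChannels, unsigned NewChannels) {
  assert(OrigChannels >= 1 && OrigChannels <= 4 &&
         NewChannels >= 1 && NewChannels <= 4 && "invalid MIMG channel");
  if (NewChannels == OrigChannels)
    return Opc;
  return Variant[OrigChannels - 1][NewChannels - 1];
}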
Index: test/CodeGen/AMDGPU/adjust-writemask-invalid-copy.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/adjust-writemask-invalid-copy.ll
@@ -0,0 +1,51 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}adjust_writemask_crash_0:
+; GCN: image_get_lod v0, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0x2
+; GCN-NOT: v1
+; GCN-NOT: v0
+; GCN: buffer_store_dword v0
+define amdgpu_ps void @adjust_writemask_crash_0() #0 {
+main_body:
+  %tmp = call <2 x float> @llvm.amdgcn.image.getlod.v2f32.v2f32.v8i32(<2 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 3, i1 false, i1 false, i1 false, i1 false, i1 false)
+  %tmp1 = bitcast <2 x float> %tmp to <2 x i32>
+  %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+  %tmp3 = bitcast <4 x i32> %tmp2 to <4 x float>
+  %tmp4 = extractelement <4 x float> %tmp3, i32 0
+  store volatile float %tmp4, float addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}adjust_writemask_crash_1:
+; GCN: image_get_lod v0, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0x1
+; GCN-NOT: v1
+; GCN-NOT: v0
+; GCN: buffer_store_dword v0
+define amdgpu_ps void @adjust_writemask_crash_1() #0 {
+main_body:
+  %tmp = call <2 x float> @llvm.amdgcn.image.getlod.v2f32.v2f32.v8i32(<2 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 3, i1 false, i1 false, i1 false, i1 false, i1 false)
+  %tmp1 = bitcast <2 x float> %tmp to <2 x i32>
+  %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <4 x i32> <i32 undef, i32 0, i32 undef, i32 undef>
+  %tmp3 = bitcast <4 x i32> %tmp2 to <4 x float>
+  %tmp4 = extractelement <4 x float> %tmp3, i32 1
+  store volatile float %tmp4, float addrspace(1)* undef
+  ret void
+}
+
+define amdgpu_ps void @adjust_writemask_crash_0_v4() #0 {
+main_body:
+  %tmp = call <4 x float> @llvm.amdgcn.image.getlod.v4f32.v2f32.v8i32(<2 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 5, i1 false, i1 false, i1 false, i1 false, i1 false)
+  %tmp1 = bitcast <4 x float> %tmp to <4 x i32>
+  %tmp2 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+  %tmp3 = bitcast <4 x i32> %tmp2 to <4 x float>
+  %tmp4 = extractelement <4 x float> %tmp3, i32 0
+  store volatile float %tmp4, float addrspace(1)* undef
+  ret void
+}
+
+
+declare <2 x float> @llvm.amdgcn.image.getlod.v2f32.v2f32.v8i32(<2 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #1
+declare <4 x float> @llvm.amdgcn.image.getlod.v4f32.v2f32.v8i32(<2 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readonly }
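Note: the dmask values the CHECK lines expect can be cross-checked by hand. In adjust_writemask_crash_0 the intrinsic is selected with dmask 0x3 and only packed lane 1 survives the shuffle; lane 1 is the second set bit of 0x3, i.e. component 1, giving the expected dmask:0x2 (crash_1 keeps only lane 0, giving dmask:0x1). A tiny self-contained check of that arithmetic:

#include <cassert>

int main() {
  // Component held by packed lane 1 of a result written with dmask 0x3.
  unsigned OldDmask = 0x3, Lane = 1, Seen = 0, Comp = 0;
  for (; Comp < 4; ++Comp)
    if (((OldDmask >> Comp) & 1) && Seen++ == Lane)
      break;
  assert(Comp == 1 && (1u << Comp) == 0x2 && "expected dmask:0x2");
  return 0;
}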