Index: lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -2074,15 +2074,19 @@
   bool IsModified = false;
   do {
     IsModified = false;
+
     // Go over all selected nodes and try to fold them a bit more
-    for (SDNode &Node : CurDAG->allnodes()) {
-      MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(&Node);
+    SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_begin();
+    while (Position != CurDAG->allnodes_end()) {
+      SDNode *Node = &*Position++;
+      MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(Node);
       if (!MachineNode)
         continue;

      SDNode *ResNode = Lowering.PostISelFolding(MachineNode, *CurDAG);
-      if (ResNode != &Node) {
-        ReplaceUses(&Node, ResNode);
+      if (ResNode != Node) {
+        if (ResNode)
+          ReplaceUses(Node, ResNode);
         IsModified = true;
       }
     }
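Note on the loop change: PostISelFolding may now delete the node it was handed and return null once it has already rewired all users, so a range-based for over CurDAG->allnodes() would be left holding a dangling iterator. A minimal standalone sketch of the same pattern, using std::list in place of the DAG's node list (Node, foldOne, and foldAll are made-up names, not LLVM API):

#include <list>

struct Node { int Data; };

// Stand-in for PostISelFolding: may erase the node it is given. Returning
// true here corresponds to the real function returning null after it has
// already updated all users itself.
static bool foldOne(std::list<Node> &AllNodes, std::list<Node>::iterator It) {
  if (It->Data % 2 == 0) {   // pretend even nodes fold away entirely
    AllNodes.erase(It);      // analogous to the DAG deleting a dead node
    return true;             // users already updated, nothing to replace
  }
  return false;              // unchanged
}

static void foldAll(std::list<Node> &AllNodes) {
  bool IsModified;
  do {
    IsModified = false;
    auto Position = AllNodes.begin();
    while (Position != AllNodes.end()) {
      auto Current = Position++; // advance first; foldOne may erase Current
      if (foldOne(AllNodes, Current))
        IsModified = true;
    }
  } while (IsModified);
}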
Index: lib/Target/AMDGPU/AMDGPUInstrInfo.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUInstrInfo.h
+++ lib/Target/AMDGPU/AMDGPUInstrInfo.h
@@ -50,9 +50,9 @@
   /// not exist. If Opcode is not a pseudo instruction, this is identity.
   int pseudoToMCOpcode(int Opcode) const;

-  /// \brief Given a MIMG \p Opcode that writes all 4 channels, return the
-  /// equivalent opcode that writes \p Channels Channels.
-  int getMaskedMIMGOp(uint16_t Opcode, unsigned Channels) const;
+  /// \brief Given a MIMG opcode \p Opc that writes any number of channels,
+  /// return the equivalent opcode that writes \p NewChannels channels.
+  int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels) const;
 };

 } // End llvm namespace
Index: lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
+++ lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
@@ -56,15 +56,59 @@
   return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
 }

-int AMDGPUInstrInfo::getMaskedMIMGOp(uint16_t Opcode, unsigned Channels) const {
-  switch (Channels) {
-  default: return Opcode;
-  case 1: return AMDGPU::getMaskedMIMGOp(Opcode, AMDGPU::Channels_1);
-  case 2: return AMDGPU::getMaskedMIMGOp(Opcode, AMDGPU::Channels_2);
-  case 3: return AMDGPU::getMaskedMIMGOp(Opcode, AMDGPU::Channels_3);
+static AMDGPU::Channels indexToChannel(unsigned Channel) {
+  switch (Channel) {
+  case 1:
+    return AMDGPU::Channels_1;
+  case 2:
+    return AMDGPU::Channels_2;
+  case 3:
+    return AMDGPU::Channels_3;
+  case 4:
+    return AMDGPU::Channels_4;
+  default:
+    llvm_unreachable("invalid MIMG channel");
   }
 }

+// FIXME: Need to handle d16 images correctly.
+static unsigned rcToChannels(unsigned RCID) {
+  switch (RCID) {
+  case AMDGPU::VGPR_32RegClassID:
+    return 1;
+  case AMDGPU::VReg_64RegClassID:
+    return 2;
+  case AMDGPU::VReg_96RegClassID:
+    return 3;
+  case AMDGPU::VReg_128RegClassID:
+    return 4;
+  default:
+    llvm_unreachable("invalid MIMG register class");
+  }
+}
+
+int AMDGPUInstrInfo::getMaskedMIMGOp(unsigned Opc,
+                                     unsigned NewChannels) const {
+  AMDGPU::Channels Channel = indexToChannel(NewChannels);
+  unsigned OrigChannels = rcToChannels(get(Opc).OpInfo[0].RegClass);
+  if (NewChannels == OrigChannels)
+    return Opc;
+
+  switch (OrigChannels) {
+  case 1:
+    return AMDGPU::getMaskedMIMGOp1(Opc, Channel);
+  case 2:
+    return AMDGPU::getMaskedMIMGOp2(Opc, Channel);
+  case 3:
+    return AMDGPU::getMaskedMIMGOp3(Opc, Channel);
+  case 4:
+    return AMDGPU::getMaskedMIMGOp4(Opc, Channel);
+  default:
+    llvm_unreachable("invalid MIMG channel");
+  }
+}
+
+
 // This must be kept in sync with the SIEncodingFamily class in SIInstrInfo.td
 enum SIEncodingFamily {
   SI = 0,
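Note: the NewChannels argument callers pass is the population count of the new dmask (adjustWritemask below passes countPopulation(NewDmask)), while the channel count already encoded in an opcode is recovered from the width of its destination register class, which is what rcToChannels expresses. A standalone sketch of that arithmetic (the function names here are illustrative, not part of the patch):

#include <cassert>

// A MIMG dmask has one bit per image component; the instruction variant
// needed is the one writing popcount(dmask) channels. E.g. dmask 0b0101
// still reads back two components, so the 2-channel variant is used.
static unsigned channelsForDmask(unsigned Dmask) {
  assert(Dmask != 0 && Dmask < 16 && "invalid MIMG dmask");
  return __builtin_popcount(Dmask); // countPopulation() in LLVM
}

// Conversely, the channel count implied by a destination register width:
// 32/64/96/128-bit classes hold 1/2/3/4 32-bit channels, mirroring
// rcToChannels above.
static unsigned channelsForRegWidth(unsigned Bits) {
  assert(Bits % 32 == 0 && Bits >= 32 && Bits <= 128 &&
         "invalid MIMG register class");
  return Bits / 32;
}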
Index: lib/Target/AMDGPU/SIISelLowering.h
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.h
+++ lib/Target/AMDGPU/SIISelLowering.h
@@ -82,7 +82,7 @@
   SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerTRAP(SDValue Op, SelectionDAG &DAG) const;

-  void adjustWritemask(MachineSDNode *&N, SelectionDAG &DAG) const;
+  SDNode *adjustWritemask(MachineSDNode *&N, SelectionDAG &DAG) const;

   SDValue performUCharToFloatCombine(SDNode *N, DAGCombinerInfo &DCI) const;
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -6578,9 +6578,9 @@
 }

 /// \brief Adjust the writemask of MIMG instructions
-void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
-                                       SelectionDAG &DAG) const {
-  SDNode *Users[4] = { };
+SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
+                                          SelectionDAG &DAG) const {
+  SDNode *Users[4] = { nullptr };
   unsigned Lane = 0;
   unsigned DmaskIdx = (Node->getNumOperands() - Node->getNumValues() == 9) ? 2 : 3;
   unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
@@ -6597,7 +6597,7 @@
     // Abort if we can't understand the usage
     if (!I->isMachineOpcode() ||
         I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
-      return;
+      return Node;

     // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
     // Note that subregs are packed, i.e. Lane==0 is the first bit set
@@ -6615,7 +6615,7 @@
     // Abort if we have more than one user per component
     if (Users[Lane])
-      return;
+      return Node;

     Users[Lane] = *I;
     NewDmask |= 1 << Comp;
   }
@@ -6623,25 +6623,41 @@
   // Abort if there's no change
   if (NewDmask == OldDmask)
-    return;
+    return Node;
+
+  unsigned BitsSet = countPopulation(NewDmask);
+
+  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
+  int NewOpcode = TII->getMaskedMIMGOp(Node->getMachineOpcode(), BitsSet);
+  assert(NewOpcode != -1 &&
+         NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
+         "failed to find equivalent MIMG op");

   // Adjust the writemask in the node
-  std::vector<SDValue> Ops;
+  SmallVector<SDValue, 16> Ops;
   Ops.insert(Ops.end(), Node->op_begin(), Node->op_begin() + DmaskIdx);
   Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
   Ops.insert(Ops.end(), Node->op_begin() + DmaskIdx + 1, Node->op_end());

-  Node = (MachineSDNode*)DAG.UpdateNodeOperands(Node, Ops);
-
-  // If we only got one lane, replace it with a copy
-  // (if NewDmask has only one bit set...)
-  if (NewDmask && (NewDmask & (NewDmask-1)) == 0) {
-    SDValue RC = DAG.getTargetConstant(AMDGPU::VGPR_32RegClassID, SDLoc(),
-                                       MVT::i32);
-    SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
-                                      SDLoc(), Users[Lane]->getValueType(0),
-                                      SDValue(Node, 0), RC);
+
+  MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
+
+  auto NewVTList =
+    DAG.getVTList(BitsSet == 1 ?
+                  SVT : MVT::getVectorVT(SVT, BitsSet == 3 ? 4 : BitsSet),
+                  MVT::Other);
+
+  MachineSDNode *NewNode = DAG.getMachineNode(NewOpcode, SDLoc(Node),
+                                              NewVTList, Ops);
+  // Update chain.
+  DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
+
+  if (BitsSet == 1) {
+    assert(Node->hasNUsesOfValue(1, 0));
+    SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY,
+                                      SDLoc(Node), Users[Lane]->getValueType(0),
+                                      SDValue(NewNode, 0));
     DAG.ReplaceAllUsesWith(Users[Lane], Copy);
-    return;
+    return nullptr;
   }

   // Update the users of the node with the new indices
@@ -6651,7 +6667,7 @@
       continue;

     SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
-    DAG.UpdateNodeOperands(User, User->getOperand(0), Op);
+    DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);

     switch (Idx) {
     default: break;
@@ -6660,6 +6676,9 @@
     case AMDGPU::sub2: Idx = AMDGPU::sub3; break;
     }
   }
+
+  DAG.RemoveDeadNode(Node);
+  return nullptr;
 }

 static bool isFrameIndexOp(SDValue Op) {
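Note: the Lane/Comp bookkeeping above relies on MIMG results being packed, so packed lane L of the old result holds the component at the L-th set bit of the old dmask; the surviving users are then renumbered sub0, sub1, ... against the shrunken result. The value-type choice also rounds three surviving channels up to a 4-element vector, presumably because no 3-wide simple value type is available here, even though the 3-channel instruction still writes a 96-bit register class. A standalone model of the lane-to-component mapping (laneToComponent is a made-up helper; the real code walks the dmask inline):

#include <cassert>

// Returns which image component packed result lane 'Lane' holds, i.e. the
// position of the Lane-th set bit of Dmask. With Dmask = 0b1011, lanes
// 0, 1, 2 hold components 0, 1, 3 respectively.
static unsigned laneToComponent(unsigned Dmask, unsigned Lane) {
  for (unsigned Comp = 0; Comp < 4; ++Comp) {
    if (!((Dmask >> Comp) & 1))
      continue;
    if (Lane == 0)
      return Comp;
    --Lane;
  }
  assert(false && "lane not covered by dmask");
  return ~0u;
}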
@@ -6717,14 +6736,16 @@
 }

 /// \brief Fold the instructions after selecting them.
+/// Returns null if users were already updated.
 SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
                                           SelectionDAG &DAG) const {
   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
   unsigned Opcode = Node->getMachineOpcode();

   if (TII->isMIMG(Opcode) && !TII->get(Opcode).mayStore() &&
-      !TII->isGather4(Opcode))
-    adjustWritemask(Node, DAG);
+      !TII->isGather4(Opcode)) {
+    return adjustWritemask(Node, DAG);
+  }

   if (Opcode == AMDGPU::INSERT_SUBREG ||
       Opcode == AMDGPU::REG_SEQUENCE) {
@@ -6802,31 +6823,6 @@
     return;
   }

-  if (TII->isMIMG(MI)) {
-    unsigned VReg = MI.getOperand(0).getReg();
-    const TargetRegisterClass *RC = MRI.getRegClass(VReg);
-    // TODO: Need mapping tables to handle other cases (register classes).
-    if (RC != &AMDGPU::VReg_128RegClass)
-      return;
-
-    unsigned DmaskIdx = MI.getNumOperands() == 12 ? 3 : 4;
-    unsigned Writemask = MI.getOperand(DmaskIdx).getImm();
-    unsigned BitsSet = 0;
-    for (unsigned i = 0; i < 4; ++i)
-      BitsSet += Writemask & (1 << i) ? 1 : 0;
-    switch (BitsSet) {
-    default: return;
-    case 1: RC = &AMDGPU::VGPR_32RegClass; break;
-    case 2: RC = &AMDGPU::VReg_64RegClass; break;
-    case 3: RC = &AMDGPU::VReg_96RegClass; break;
-    }
-
-    unsigned NewOpcode = TII->getMaskedMIMGOp(MI.getOpcode(), BitsSet);
-    MI.setDesc(TII->get(NewOpcode));
-    MRI.setRegClass(VReg, RC);
-    return;
-  }
-
   // Replace unused atomics with the no return version.
   int NoRetAtomicOp = AMDGPU::getAtomicNoRetOp(MI.getOpcode());
   if (NoRetAtomicOp != -1) {
Index: lib/Target/AMDGPU/SIInstrInfo.td
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.td
+++ lib/Target/AMDGPU/SIInstrInfo.td
@@ -1823,7 +1823,31 @@
   let ValueCols = [["Default"]];
 }

-def getMaskedMIMGOp : InstrMapping {
+def getMaskedMIMGOp1 : InstrMapping {
+  let FilterClass = "MIMG_Mask";
+  let RowFields = ["Op"];
+  let ColFields = ["Channels"];
+  let KeyCol = ["1"];
+  let ValueCols = [["2"], ["3"], ["4"]];
+}
+
+def getMaskedMIMGOp2 : InstrMapping {
+  let FilterClass = "MIMG_Mask";
+  let RowFields = ["Op"];
+  let ColFields = ["Channels"];
+  let KeyCol = ["2"];
+  let ValueCols = [["1"], ["3"], ["4"]];
+}
+
+def getMaskedMIMGOp3 : InstrMapping {
+  let FilterClass = "MIMG_Mask";
+  let RowFields = ["Op"];
+  let ColFields = ["Channels"];
+  let KeyCol = ["3"];
+  let ValueCols = [["1"], ["2"], ["4"]];
+}
+
+def getMaskedMIMGOp4 : InstrMapping {
   let FilterClass = "MIMG_Mask";
   let RowFields = ["Op"];
   let ColFields = ["Channels"];
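Note: splitting the old getMaskedMIMGOp mapping into four is forced by how InstrMapping works, as I read these definitions: each generated table answers queries keyed on one current channel count (KeyCol), so there has to be one table per possible origin. Conceptually the four tables form a 4x4 opcode matrix per instruction family; a standalone sketch of that picture (opcode values are invented, with -1 for a missing variant, matching the int return value asserted on in adjustWritemask):

#include <cassert>

// Variant[o - 1][n - 1]: the opcode writing n channels, reached from the
// o-channel opcode of one MIMG instruction family. Values are made up; the
// diagonal is never consulted because of the NewChannels == OrigChannels
// early-out, and -1 models a combination with no equivalent instruction.
static const int Variant[4][4] = {
  { -1, 101, 102, 103},
  {100,  -1, 102, 103},
  {100, 101,  -1, 103},
  {100, 101, 102,  -1},
};

static int getMaskedOp(int Opc, unsigned OrigChannels, unsigned NewChannels) {
  assert(OrigChannels >= 1 && OrigChannels <= 4 &&
         NewChannels >= 1 && NewChannels <= 4 && "invalid MIMG channel");
  if (NewChannels == OrigChannels)
    return Opc;
  return Variant[OrigChannels - 1][NewChannels - 1];
}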
Index: test/CodeGen/AMDGPU/adjust-writemask-invalid-copy.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/adjust-writemask-invalid-copy.ll
@@ -0,0 +1,51 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}adjust_writemask_crash_0:
+; GCN: image_get_lod v0, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0x2
+; GCN-NOT: v1
+; GCN-NOT: v0
+; GCN: buffer_store_dword v0
+define amdgpu_ps void @adjust_writemask_crash_0() #0 {
+main_body:
+  %tmp = call <2 x float> @llvm.amdgcn.image.getlod.v2f32.v2f32.v8i32(<2 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 3, i1 false, i1 false, i1 false, i1 false, i1 false)
+  %tmp1 = bitcast <2 x float> %tmp to <2 x i32>
+  %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+  %tmp3 = bitcast <4 x i32> %tmp2 to <4 x float>
+  %tmp4 = extractelement <4 x float> %tmp3, i32 0
+  store volatile float %tmp4, float addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}adjust_writemask_crash_1:
+; GCN: image_get_lod v0, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0x1
+; GCN-NOT: v1
+; GCN-NOT: v0
+; GCN: buffer_store_dword v0
+define amdgpu_ps void @adjust_writemask_crash_1() #0 {
+main_body:
+  %tmp = call <2 x float> @llvm.amdgcn.image.getlod.v2f32.v2f32.v8i32(<2 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 3, i1 false, i1 false, i1 false, i1 false, i1 false)
+  %tmp1 = bitcast <2 x float> %tmp to <2 x i32>
+  %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <4 x i32> <i32 undef, i32 0, i32 undef, i32 undef>
+  %tmp3 = bitcast <4 x i32> %tmp2 to <4 x float>
+  %tmp4 = extractelement <4 x float> %tmp3, i32 1
+  store volatile float %tmp4, float addrspace(1)* undef
+  ret void
+}
+
+define amdgpu_ps void @adjust_writemask_crash_0_v4() #0 {
+main_body:
+  %tmp = call <4 x float> @llvm.amdgcn.image.getlod.v4f32.v2f32.v8i32(<2 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 5, i1 false, i1 false, i1 false, i1 false, i1 false)
+  %tmp1 = bitcast <4 x float> %tmp to <4 x i32>
+  %tmp2 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+  %tmp3 = bitcast <4 x i32> %tmp2 to <4 x float>
+  %tmp4 = extractelement <4 x float> %tmp3, i32 0
+  store volatile float %tmp4, float addrspace(1)* undef
+  ret void
+}
+
+
+declare <2 x float> @llvm.amdgcn.image.getlod.v2f32.v2f32.v8i32(<2 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #1
+declare <4 x float> @llvm.amdgcn.image.getlod.v4f32.v2f32.v8i32(<2 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readonly }
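Note: the dmask values the CHECK lines expect can be cross-checked by hand. In adjust_writemask_crash_0 the intrinsic is selected with dmask 0x3 and only packed lane 1 survives the shuffle; lane 1 is the second set bit of 0x3, i.e. component 1, giving the expected dmask:0x2 (crash_1 keeps only lane 0, giving dmask:0x1). A tiny self-contained check of that arithmetic:

#include <cassert>

int main() {
  // Component held by packed lane 1 of a result written with dmask 0x3.
  unsigned OldDmask = 0x3, Lane = 1, Seen = 0, Comp = 0;
  for (; Comp < 4; ++Comp)
    if (((OldDmask >> Comp) & 1) && Seen++ == Lane)
      break;
  assert(Comp == 1 && (1u << Comp) == 0x2 && "expected dmask:0x2");
  return 0;
}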