Index: llvm/lib/Target/AMDGPU/AMDGPUGISel.td
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -212,6 +212,9 @@
 def gi_as_i16timm : GICustomOperandRenderer<"renderTruncTImm">,
   GISDNodeXFormEquiv<as_i16timm>;
 
+def gi_as_i1timm : GICustomOperandRenderer<"renderTruncTImm1">,
+  GISDNodeXFormEquiv<as_i1timm>;
+
 def gi_NegateImm : GICustomOperandRenderer<"renderNegateImm">,
   GISDNodeXFormEquiv<NegateImm>;
 
Index: llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -175,6 +175,8 @@
 
   void renderTruncTImm(MachineInstrBuilder &MIB, const MachineInstr &MI,
                        int OpIdx) const;
+  void renderTruncTImm1(MachineInstrBuilder &MIB, const MachineInstr &MI,
+                        int OpIdx) const;
 
   void renderNegateImm(MachineInstrBuilder &MIB, const MachineInstr &MI,
                        int OpIdx) const;
Index: llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -2266,6 +2266,12 @@
   MIB.addImm(MI.getOperand(OpIdx).getImm());
 }
 
+void AMDGPUInstructionSelector::renderTruncTImm1(MachineInstrBuilder &MIB,
+                                                 const MachineInstr &MI,
+                                                 int OpIdx) const {
+  MIB.addImm(MI.getOperand(OpIdx).getImm());
+}
+
 bool AMDGPUInstructionSelector::isInlineImmediate16(int64_t Imm) const {
   return AMDGPU::isInlinableLiteral16(Imm, STI.hasInv2PiInlineImm());
 }
Index: llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -2132,6 +2132,15 @@
       constrainOpWithReadfirstlane(MI, MRI, 3); // Index
       return;
     }
+    case Intrinsic::amdgcn_permlane16:
+    case Intrinsic::amdgcn_permlanex16: {
+      // Doing a waterfall loop over these wouldn't make any sense.
+      substituteSimpleCopyRegs(OpdMapper, 2);
+      substituteSimpleCopyRegs(OpdMapper, 3);
+      constrainOpWithReadfirstlane(MI, MRI, 4);
+      constrainOpWithReadfirstlane(MI, MRI, 5);
+      return;
+    }
     default:
       break;
     }
@@ -3054,6 +3063,16 @@
       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
       break;
     }
+    case Intrinsic::amdgcn_permlane16:
+    case Intrinsic::amdgcn_permlanex16: {
+      unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
+      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
+      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
+      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
+      OpdsMapping[4] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
+      OpdsMapping[5] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
+      break;
+    }
     case Intrinsic::amdgcn_mfma_f32_4x4x1f32:
     case Intrinsic::amdgcn_mfma_f32_4x4x4f16:
     case Intrinsic::amdgcn_mfma_i32_4x4x4i8:
Index: llvm/lib/Target/AMDGPU/SIInstrInfo.td
===================================================================
--- llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -665,7 +665,7 @@
 defm atomic_load_fmin : SIAtomicM0Glue2 <"LOAD_FMIN", 1, SDTAtomic2_f32, 0>;
 defm atomic_load_fmax : SIAtomicM0Glue2 <"LOAD_FMAX", 1, SDTAtomic2_f32, 0>;
 
-def as_i1imm : SDNodeXForm<imm, [{
+def as_i1timm : SDNodeXForm<timm, [{
   return CurDAG->getTargetConstant(N->getZExtValue(), SDLoc(N), MVT::i1);
 }]>;
 
Index: llvm/lib/Target/AMDGPU/SIInstructions.td
===================================================================
--- llvm/lib/Target/AMDGPU/SIInstructions.td
+++ llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1901,7 +1901,7 @@
                            timm:$bound_ctrl)),
   (V_MOV_B64_DPP_PSEUDO $src, $src, (as_i32imm $dpp_ctrl),
                         (as_i32imm $row_mask), (as_i32imm $bank_mask),
-                        (as_i1imm $bound_ctrl))
+                        (as_i1timm $bound_ctrl))
 >;
 
 def : GCNPat <
@@ -1909,7 +1909,7 @@
                               timm:$bank_mask, timm:$bound_ctrl)),
   (V_MOV_B64_DPP_PSEUDO $old, $src, (as_i32imm $dpp_ctrl),
                         (as_i32imm $row_mask), (as_i32imm $bank_mask),
-                        (as_i1imm $bound_ctrl))
+                        (as_i1timm $bound_ctrl))
 >;
 
 //===----------------------------------------------------------------------===//
Index: llvm/lib/Target/AMDGPU/SMInstructions.td
===================================================================
--- llvm/lib/Target/AMDGPU/SMInstructions.td
+++ llvm/lib/Target/AMDGPU/SMInstructions.td
@@ -769,22 +769,22 @@
   // 1. Offset as an immediate
   def : GCNPat <
     (SIsbuffer_load v4i32:$sbase, (SMRDBufferImm i32:$offset), i1:$glc, i1:$dlc),
-    (vt (!cast<SM_Pseudo>(Instr#"_IMM") $sbase, $offset, (as_i1imm $glc),
-                                        (as_i1imm $dlc)))
+    (vt (!cast<SM_Pseudo>(Instr#"_IMM") $sbase, $offset, (as_i1timm $glc),
+                                        (as_i1timm $dlc)))
   >;
 
   // 2. 32-bit IMM offset on CI
   def : GCNPat <
     (vt (SIsbuffer_load v4i32:$sbase, (SMRDBufferImm32 i32:$offset), i1:$glc, i1:$dlc)),
-    (!cast<InstSI>(Instr#"_IMM_ci") $sbase, $offset, (as_i1imm $glc), (as_i1imm $dlc))> {
+    (!cast<InstSI>(Instr#"_IMM_ci") $sbase, $offset, (as_i1timm $glc), (as_i1timm $dlc))> {
     let OtherPredicates = [isGFX7Only];
   }
 
   // 3. Offset loaded in an 32bit SGPR
   def : GCNPat <
     (SIsbuffer_load v4i32:$sbase, i32:$offset, i1:$glc, i1:$dlc),
-    (vt (!cast<SM_Pseudo>(Instr#"_SGPR") $sbase, $offset, (as_i1imm $glc),
-                                         (as_i1imm $dlc)))
+    (vt (!cast<SM_Pseudo>(Instr#"_SGPR") $sbase, $offset, (as_i1timm $glc),
+                                         (as_i1timm $dlc)))
   >;
 }
 
Index: llvm/lib/Target/AMDGPU/VOP1Instructions.td
===================================================================
--- llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -839,7 +839,7 @@
                       timm:$bound_ctrl)),
   (V_MOV_B32_dpp $src, $src, (as_i32imm $dpp_ctrl),
                        (as_i32imm $row_mask), (as_i32imm $bank_mask),
-                       (as_i1imm $bound_ctrl))
+                       (as_i1timm $bound_ctrl))
 >;
 
 def : GCNPat <
@@ -847,7 +847,7 @@
                       timm:$bank_mask, timm:$bound_ctrl)),
   (V_MOV_B32_dpp $old, $src, (as_i32imm $dpp_ctrl),
                        (as_i32imm $row_mask), (as_i32imm $bank_mask),
-                       (as_i1imm $bound_ctrl))
+                       (as_i1timm $bound_ctrl))
 >;
 
 } // End OtherPredicates = [isGFX8Plus]
Index: llvm/lib/Target/AMDGPU/VOP3Instructions.td
===================================================================
--- llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -643,8 +643,8 @@
   Instruction inst> : GCNPat<
   (permlane i32:$vdst_in, i32:$src0, i32:$src1, i32:$src2,
             timm:$fi, timm:$bc),
-  (inst (as_i1imm $fi), $src0, (as_i1imm $bc),
-        $src1, 0, $src2, $vdst_in)
+  (inst (as_i1timm $fi), VGPR_32:$src0, (as_i1timm $bc),
+        SCSrc_b32:$src1, 0, SCSrc_b32:$src2, VGPR_32:$vdst_in)
 >;
 
 // Permlane intrinsic that has either fetch invalid or bound control
@@ -656,13 +656,19 @@
                     $src1, node:$src2, node:$fi, node:$bc)> {
   let PredicateCode = [{ return N->getConstantOperandVal(5) != 0 ||
                                 N->getConstantOperandVal(6) != 0; }];
+  let GISelPredicateCode = [{
+    return MI.getOperand(6).getImm() != 0 ||
+           MI.getOperand(7).getImm() != 0;
+  }];
 }
 
 // Drop the input value if it won't be read.
 class PermlaneDiscardVDstIn<SDPatternOperator permlane,
                             Instruction inst> : GCNPat<
-  (permlane srcvalue, i32:$src0, i32:$src1, i32:$src2, timm:$fi, timm:$bc),
-  (inst (as_i1imm $fi), $src0, (as_i1imm $bc), $src1, 0, $src2,
+  (permlane srcvalue, i32:$src0, i32:$src1, i32:$src2,
+            timm:$fi, timm:$bc),
+  (inst (as_i1timm $fi), VGPR_32:$src0, (as_i1timm $bc),
+        SCSrc_b32:$src1, 0, SCSrc_b32:$src2,
         (IMPLICIT_DEF))
 >;
 
@@ -685,7 +691,6 @@
   def : PermlaneDiscardVDstIn<
     BoundControlOrFetchInvalidPermlane<int_amdgcn_permlanex16>,
     V_PERMLANEX16_B32>;
-
 } // End SubtargetPredicate = isGFX10Plus
 
 //===----------------------------------------------------------------------===//
Index: llvm/lib/Target/AMDGPU/VOP3PInstructions.td
===================================================================
--- llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -287,7 +287,7 @@
     (dot_op (dot_inst.Pfl.Src0VT (VOP3PMods0 dot_inst.Pfl.Src0VT:$src0, i32:$src0_modifiers)),
             (dot_inst.Pfl.Src1VT (VOP3PMods dot_inst.Pfl.Src1VT:$src1, i32:$src1_modifiers)),
             (dot_inst.Pfl.Src2VT (VOP3PMods dot_inst.Pfl.Src2VT:$src2, i32:$src2_modifiers)), i1:$clamp),
-    (dot_inst $src0_modifiers, $src0, $src1_modifiers, $src1, $src2_modifiers, $src2, (as_i1imm $clamp))>;
+    (dot_inst $src0_modifiers, $src0, $src1_modifiers, $src1, $src2_modifiers, $src2, (as_i1timm $clamp))>;
 }
 
 defm : DotPats<AMDGPUfdot2, V_DOT2_F32_F16>;
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.permlane.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.permlane.ll
@@ -0,0 +1 @@
+; RUN: llc -global-isel -amdgpu-load-store-vectorizer=0 -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %S/../llvm.amdgcn.permlane.ll | FileCheck -check-prefixes=GCN,GFX10 %S/../llvm.amdgcn.permlane.ll
Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll
+++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s
+; RUN: llc -amdgpu-load-store-vectorizer=0 -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s
 
 declare i32 @llvm.amdgcn.permlane16(i32, i32, i32, i32, i1, i1) #1
 declare i32 @llvm.amdgcn.permlanex16(i32, i32, i32, i32, i1, i1) #1