diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -3292,6 +3292,17 @@ return false; } + // Lets target to control the following reassociation of operands: (op (op x, + // c1), y) -> (op (op x, y), c1) where N0 is (op x, c1) and N1 is y. By + // default consider profitable any case where N0 has single use. This + // behavior reflects the condition replaced by this target hook call in the + // DAGCombiner. Any particular target can implement its own heuristic to + // restrict common combiner. + virtual bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, + SDValue N1) const { + return N0.hasOneUse(); + } + virtual bool isSDNodeAlwaysUniform(const SDNode * N) const { return false; } diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -1070,7 +1070,7 @@ return DAG.getNode(Opc, DL, VT, N00, OpNode); return SDValue(); } - if (N0.hasOneUse()) { + if (TLI.isReassocProfitable(DAG, N0, N1)) { // Reassociate: (op (op x, c1), y) -> (op (op x, y), c1) // iff (op x, c1) has one use if (SDValue OpNode = DAG.getNode(Opc, SDLoc(N0), VT, N00, N1)) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -449,6 +449,11 @@ bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, LegacyDivergenceAnalysis *DA) const override; + bool hasMemSDNodeUser(SDNode *N) const; + + bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, + SDValue N1) const override; + bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth = 5) const; bool isCanonicalized(Register Reg, MachineFunction &MF, diff --git 
a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -9639,6 +9639,9 @@ SDValue SITargetLowering::performXorCombine(SDNode *N, DAGCombinerInfo &DCI) const { + if (SDValue RV = reassociateScalarOps(N, DCI.DAG)) + return RV; + EVT VT = N->getValueType(0); if (VT != MVT::i64) return SDValue(); @@ -10551,6 +10554,9 @@ if (VT != MVT::i32 && VT != MVT::i64) return SDValue(); + if (DAG.isBaseWithConstantOffset(SDValue(N, 0))) + return SDValue(); + unsigned Opc = N->getOpcode(); SDValue Op0 = N->getOperand(0); SDValue Op1 = N->getOperand(1); @@ -10572,12 +10578,6 @@ if (Op1->isDivergent()) std::swap(Op1, Op2); - // If either operand is constant this will conflict with - // DAGCombiner::ReassociateOps(). - if (DAG.isConstantIntBuildVectorOrConstantInt(Op0) || - DAG.isConstantIntBuildVectorOrConstantInt(Op1)) - return SDValue(); - SDLoc SL(N); SDValue Add1 = DAG.getNode(Opc, SL, VT, Op0, Op1); return DAG.getNode(Opc, SL, VT, Add1, Op2); @@ -12578,3 +12578,27 @@ Cost.first += (Size + 255) / 256; return Cost; } + +bool SITargetLowering::hasMemSDNodeUser(SDNode *N) const { + SDNode::use_iterator I = N->use_begin(), E = N->use_end(); + for (; I != E; ++I) { + if (MemSDNode *M = dyn_cast<MemSDNode>(*I)) { + if (getBasePtrIndex(M) == I.getOperandNo()) + return true; + } + } + return false; +} + +bool SITargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0, + SDValue N1) const { + if (!N0.hasOneUse()) + return false; + // Take care of the opportunity to keep N0 uniform + if (N0->isDivergent() || !N1->isDivergent()) + return true; + // Check if we have a good chance to form the memory access pattern with the + // base and offset + return (DAG.isBaseWithConstantOffset(N0) && + hasMemSDNodeUser(*N0->use_begin())); +} diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td ---
a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -550,11 +550,11 @@ >; def S_XNOR_B32 : SOP2_32 <"s_xnor_b32", - [(set i32:$sdst, (not (xor_oneuse i32:$src0, i32:$src1)))] + [(set i32:$sdst, (UniformUnaryFrag<not> (xor_oneuse i32:$src0, i32:$src1)))] >; def S_XNOR_B64 : SOP2_64 <"s_xnor_b64", - [(set i64:$sdst, (not (xor_oneuse i64:$src0, i64:$src1)))] + [(set i64:$sdst, (UniformUnaryFrag<not> (xor_oneuse i64:$src0, i64:$src1)))] >; def S_NAND_B32 : SOP2_32 <"s_nand_b32", diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -637,9 +637,9 @@ ) >; -def : divergent_i64_BinOp <and, V_AND_B32_e64>; -def : divergent_i64_BinOp <or, V_OR_B32_e64>; -def : divergent_i64_BinOp <xor, V_XOR_B32_e64>; +def : divergent_i64_BinOp <and_oneuse, V_AND_B32_e64>; +def : divergent_i64_BinOp <or_oneuse, V_OR_B32_e64>; +def : divergent_i64_BinOp <xor_oneuse, V_XOR_B32_e64>; let SubtargetPredicate = Has16BitInsts in { @@ -688,6 +688,36 @@ let isReMaterializable = 1 in defm V_XNOR_B32 : VOP2Inst <"v_xnor_b32", VOP_I32_I32_I32, xnor>; +def : GCNPat< + (i32 (DivergentUnaryFrag<not> (xor_oneuse i32:$src0, i32:$src1))), + (i32 (V_XNOR_B32_e64 $src0, $src1)) +>; + +def : GCNPat< + (i32 (DivergentBinFrag<xor> (not i32:$src0), i32:$src1)), + (i32 (V_XNOR_B32_e64 $src0, $src1)) +>; + +def : GCNPat< + (i64 (DivergentUnaryFrag<not> (xor_oneuse i64:$src0, i64:$src1))), + (REG_SEQUENCE VReg_64, (i32 (V_XNOR_B32_e64 + (i32 (EXTRACT_SUBREG $src0, sub0)), + (i32 (EXTRACT_SUBREG $src1, sub0)))), sub0, + (i32 (V_XNOR_B32_e64 + (i32 (EXTRACT_SUBREG $src0, sub1)), + (i32 (EXTRACT_SUBREG $src1, sub1)))), sub1) +>; + +def : GCNPat< + (i64 (DivergentBinFrag<xor> (not i64:$src0), i64:$src1)), + (REG_SEQUENCE VReg_64, (i32 (V_XNOR_B32_e64 + (i32 (EXTRACT_SUBREG $src0, sub0)), + (i32 (EXTRACT_SUBREG $src1, sub0)))), sub0, + (i32 (V_XNOR_B32_e64 + (i32 (EXTRACT_SUBREG $src0, sub1)), + (i32 (EXTRACT_SUBREG $src1, sub1)))), sub1) +>; + let Constraints = "$vdst = $src2", DisableEncoding =
"$src2", isConvertibleToThreeAddress = 1, diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-xnor.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-xnor.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-xnor.ll @@ -0,0 +1,44 @@ +; RUN: llc -march=amdgcn -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=gfx906 -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GCN_DL %s + +; GCN-LABEL: name: uniform_xnor_i64 +; GCN: S_XNOR_B64 +define amdgpu_kernel void @uniform_xnor_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { + %xor = xor i64 %a, %b + %res = xor i64 %xor, -1 + store i64 %res, i64 addrspace(1)* %out + ret void +} +; GCN-LABEL: name: divergent_xnor_i64 +; GCN: V_XOR_B32_e64 +; GCN: V_XOR_B32_e64 +; GCN: V_NOT_B32_e32 +; GCN: V_NOT_B32_e32 +; GCN_DL: V_XNOR_B32_e64 +; GCN_DL: V_XNOR_B32_e64 +define i64 @divergent_xnor_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { + %xor = xor i64 %a, %b + %res = xor i64 %xor, -1 + ret i64 %res +} + +; GCN-LABEL: name: uniform_xnor_i32 +; GCN: S_XNOR_B32 +define amdgpu_kernel void @uniform_xnor_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { + %xor = xor i32 %a, %b + %res = xor i32 %xor, -1 + store i32 %res, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: name: divergent_xnor_i32 +; GCN: V_XOR_B32_e64 +; GCN: V_NOT_B32_e32 +; GCN_DL: V_XNOR_B32_e64 +define i32 @divergent_xnor_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { + %xor = xor i32 %a, %b + %res = xor i32 %xor, -1 + ret i32 %res +} + +declare i32 @llvm.amdgcn.workitem.id.x() #0 diff --git a/llvm/test/CodeGen/AMDGPU/fused-bitlogic.ll b/llvm/test/CodeGen/AMDGPU/fused-bitlogic.ll --- a/llvm/test/CodeGen/AMDGPU/fused-bitlogic.ll +++ b/llvm/test/CodeGen/AMDGPU/fused-bitlogic.ll @@ -163,8 +163,8 @@ ; GCN-NEXT: v_xor_b32_e32 v1, v3, v1 ; GCN-NEXT: v_xor_b32_e32 v0, v2, v0 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_xnor_b32_e32 v0, v0, v4 ; GCN-NEXT: v_xnor_b32_e32 v1, v1, v5 +; GCN-NEXT: 
v_xnor_b32_e32 v0, v0, v4 ; GCN-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] ; GCN-NEXT: s_endpgm bb: diff --git a/llvm/test/CodeGen/AMDGPU/perm.new.s b/llvm/test/CodeGen/AMDGPU/perm.new.s new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/perm.new.s @@ -0,0 +1,752 @@ + .text + .section .AMDGPU.config + .long 47176 + .long 11469504 + .long 47180 + .long 132 + .long 47200 + .long 0 + .long 4 + .long 0 + .long 8 + .long 0 + .text + .globl lsh8_or_and ; -- Begin function lsh8_or_and + .p2align 8 + .type lsh8_or_and,@function +lsh8_or_and: ; @lsh8_or_and +; %bb.0: ; %bb + s_load_dwordx2 s[2:3], s[0:1], 0x24 + s_load_dword s0, s[0:1], 0x2c + v_lshlrev_b32_e32 v0, 2, v0 + v_mov_b32_e32 v3, 0x6050400 + s_waitcnt lgkmcnt(0) + v_mov_b32_e32 v1, s3 + v_add_u32_e32 v0, vcc, s2, v0 + v_addc_u32_e32 v1, vcc, 0, v1, vcc + flat_load_dword v2, v[0:1] + s_waitcnt vmcnt(0) + v_perm_b32 v2, v2, s0, v3 + flat_store_dword v[0:1], v2 + s_endpgm +.Lfunc_end0: + .size lsh8_or_and, .Lfunc_end0-lsh8_or_and + ; -- End function + .section .AMDGPU.csdata +; Kernel info: +; codeLenInByte = 76 +; NumSgprs: 96 +; NumVgprs: 4 +; ScratchSize: 0 +; MemoryBound: 0 +; FloatMode: 240 +; IeeeMode: 1 +; LDSByteSize: 0 bytes/workgroup (compile time only) +; SGPRBlocks: 11 +; VGPRBlocks: 0 +; NumSGPRsForWavesPerEU: 96 +; NumVGPRsForWavesPerEU: 4 +; Occupancy: 8 +; WaveLimiterHint : 1 +; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0 +; COMPUTE_PGM_RSRC2:USER_SGPR: 2 +; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0 +; COMPUTE_PGM_RSRC2:TGID_X_EN: 1 +; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 +; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 +; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 + .section .AMDGPU.config + .long 47176 + .long 11469504 + .long 47180 + .long 132 + .long 47200 + .long 0 + .long 4 + .long 0 + .long 8 + .long 0 + .text + .globl lsr24_or_and ; -- Begin function lsr24_or_and + .p2align 8 + .type lsr24_or_and,@function +lsr24_or_and: ; @lsr24_or_and +; %bb.0: ; %bb + s_load_dwordx2 s[2:3], s[0:1], 0x24 + s_load_dword s0, s[0:1], 0x2c 
+ v_lshlrev_b32_e32 v0, 2, v0 + v_mov_b32_e32 v3, 0x7060503 + s_waitcnt lgkmcnt(0) + v_mov_b32_e32 v1, s3 + v_add_u32_e32 v0, vcc, s2, v0 + v_addc_u32_e32 v1, vcc, 0, v1, vcc + flat_load_dword v2, v[0:1] + s_waitcnt vmcnt(0) + v_perm_b32 v2, s0, v2, v3 + flat_store_dword v[0:1], v2 + s_endpgm +.Lfunc_end1: + .size lsr24_or_and, .Lfunc_end1-lsr24_or_and + ; -- End function + .section .AMDGPU.csdata +; Kernel info: +; codeLenInByte = 76 +; NumSgprs: 96 +; NumVgprs: 4 +; ScratchSize: 0 +; MemoryBound: 0 +; FloatMode: 240 +; IeeeMode: 1 +; LDSByteSize: 0 bytes/workgroup (compile time only) +; SGPRBlocks: 11 +; VGPRBlocks: 0 +; NumSGPRsForWavesPerEU: 96 +; NumVGPRsForWavesPerEU: 4 +; Occupancy: 8 +; WaveLimiterHint : 1 +; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0 +; COMPUTE_PGM_RSRC2:USER_SGPR: 2 +; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0 +; COMPUTE_PGM_RSRC2:TGID_X_EN: 1 +; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 +; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 +; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 + .section .AMDGPU.config + .long 47176 + .long 11469504 + .long 47180 + .long 132 + .long 47200 + .long 0 + .long 4 + .long 0 + .long 8 + .long 0 + .text + .globl and_or_lsr24 ; -- Begin function and_or_lsr24 + .p2align 8 + .type and_or_lsr24,@function +and_or_lsr24: ; @and_or_lsr24 +; %bb.0: ; %bb + s_load_dwordx2 s[2:3], s[0:1], 0x24 + s_load_dword s0, s[0:1], 0x2c + v_lshlrev_b32_e32 v0, 2, v0 + v_mov_b32_e32 v3, 0x7060503 + s_waitcnt lgkmcnt(0) + v_mov_b32_e32 v1, s3 + v_add_u32_e32 v0, vcc, s2, v0 + v_addc_u32_e32 v1, vcc, 0, v1, vcc + flat_load_dword v2, v[0:1] + s_waitcnt vmcnt(0) + v_perm_b32 v2, v2, s0, v3 + v_xor_b32_e32 v2, 0x80000000, v2 + flat_store_dword v[0:1], v2 + s_endpgm +.Lfunc_end2: + .size and_or_lsr24, .Lfunc_end2-and_or_lsr24 + ; -- End function + .section .AMDGPU.csdata +; Kernel info: +; codeLenInByte = 84 +; NumSgprs: 96 +; NumVgprs: 4 +; ScratchSize: 0 +; MemoryBound: 0 +; FloatMode: 240 +; IeeeMode: 1 +; LDSByteSize: 0 bytes/workgroup (compile time only) +; SGPRBlocks: 11 +; 
VGPRBlocks: 0 +; NumSGPRsForWavesPerEU: 96 +; NumVGPRsForWavesPerEU: 4 +; Occupancy: 8 +; WaveLimiterHint : 1 +; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0 +; COMPUTE_PGM_RSRC2:USER_SGPR: 2 +; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0 +; COMPUTE_PGM_RSRC2:TGID_X_EN: 1 +; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 +; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 +; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 + .section .AMDGPU.config + .long 47176 + .long 11469504 + .long 47180 + .long 132 + .long 47200 + .long 0 + .long 4 + .long 0 + .long 8 + .long 0 + .text + .globl and_or_and ; -- Begin function and_or_and + .p2align 8 + .type and_or_and,@function +and_or_and: ; @and_or_and +; %bb.0: ; %bb + s_load_dwordx2 s[2:3], s[0:1], 0x24 + s_load_dword s0, s[0:1], 0x2c + v_lshlrev_b32_e32 v0, 2, v0 + v_mov_b32_e32 v3, 0x7020500 + s_waitcnt lgkmcnt(0) + v_mov_b32_e32 v1, s3 + v_add_u32_e32 v0, vcc, s2, v0 + v_addc_u32_e32 v1, vcc, 0, v1, vcc + flat_load_dword v2, v[0:1] + s_waitcnt vmcnt(0) + v_perm_b32 v2, v2, s0, v3 + flat_store_dword v[0:1], v2 + s_endpgm +.Lfunc_end3: + .size and_or_and, .Lfunc_end3-and_or_and + ; -- End function + .section .AMDGPU.csdata +; Kernel info: +; codeLenInByte = 76 +; NumSgprs: 96 +; NumVgprs: 4 +; ScratchSize: 0 +; MemoryBound: 0 +; FloatMode: 240 +; IeeeMode: 1 +; LDSByteSize: 0 bytes/workgroup (compile time only) +; SGPRBlocks: 11 +; VGPRBlocks: 0 +; NumSGPRsForWavesPerEU: 96 +; NumVGPRsForWavesPerEU: 4 +; Occupancy: 8 +; WaveLimiterHint : 1 +; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0 +; COMPUTE_PGM_RSRC2:USER_SGPR: 2 +; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0 +; COMPUTE_PGM_RSRC2:TGID_X_EN: 1 +; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 +; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 +; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 + .section .AMDGPU.config + .long 47176 + .long 11469504 + .long 47180 + .long 132 + .long 47200 + .long 0 + .long 4 + .long 0 + .long 8 + .long 0 + .text + .globl lsh8_or_lsr24 ; -- Begin function lsh8_or_lsr24 + .p2align 8 + .type lsh8_or_lsr24,@function +lsh8_or_lsr24: ; @lsh8_or_lsr24 +; %bb.0: ; %bb + 
s_load_dwordx2 s[2:3], s[0:1], 0x24 + s_load_dword s0, s[0:1], 0x2c + v_lshlrev_b32_e32 v0, 2, v0 + s_waitcnt lgkmcnt(0) + v_mov_b32_e32 v1, s3 + v_add_u32_e32 v0, vcc, s2, v0 + v_addc_u32_e32 v1, vcc, 0, v1, vcc + flat_load_dword v2, v[0:1] + s_waitcnt vmcnt(0) + v_alignbit_b32 v2, v2, s0, 24 + flat_store_dword v[0:1], v2 + s_endpgm +.Lfunc_end4: + .size lsh8_or_lsr24, .Lfunc_end4-lsh8_or_lsr24 + ; -- End function + .section .AMDGPU.csdata +; Kernel info: +; codeLenInByte = 68 +; NumSgprs: 96 +; NumVgprs: 3 +; ScratchSize: 0 +; MemoryBound: 0 +; FloatMode: 240 +; IeeeMode: 1 +; LDSByteSize: 0 bytes/workgroup (compile time only) +; SGPRBlocks: 11 +; VGPRBlocks: 0 +; NumSGPRsForWavesPerEU: 96 +; NumVGPRsForWavesPerEU: 3 +; Occupancy: 8 +; WaveLimiterHint : 1 +; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0 +; COMPUTE_PGM_RSRC2:USER_SGPR: 2 +; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0 +; COMPUTE_PGM_RSRC2:TGID_X_EN: 1 +; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 +; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 +; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 + .section .AMDGPU.config + .long 47176 + .long 11469504 + .long 47180 + .long 132 + .long 47200 + .long 0 + .long 4 + .long 0 + .long 8 + .long 0 + .text + .globl lsh16_or_lsr24 ; -- Begin function lsh16_or_lsr24 + .p2align 8 + .type lsh16_or_lsr24,@function +lsh16_or_lsr24: ; @lsh16_or_lsr24 +; %bb.0: ; %bb + s_load_dwordx2 s[2:3], s[0:1], 0x24 + s_load_dword s0, s[0:1], 0x2c + v_lshlrev_b32_e32 v0, 2, v0 + v_mov_b32_e32 v3, 0x5040c03 + s_waitcnt lgkmcnt(0) + v_mov_b32_e32 v1, s3 + v_add_u32_e32 v0, vcc, s2, v0 + v_addc_u32_e32 v1, vcc, 0, v1, vcc + flat_load_dword v2, v[0:1] + s_waitcnt vmcnt(0) + v_perm_b32 v2, v2, s0, v3 + flat_store_dword v[0:1], v2 + s_endpgm +.Lfunc_end5: + .size lsh16_or_lsr24, .Lfunc_end5-lsh16_or_lsr24 + ; -- End function + .section .AMDGPU.csdata +; Kernel info: +; codeLenInByte = 76 +; NumSgprs: 96 +; NumVgprs: 4 +; ScratchSize: 0 +; MemoryBound: 0 +; FloatMode: 240 +; IeeeMode: 1 +; LDSByteSize: 0 bytes/workgroup (compile time only) +; 
SGPRBlocks: 11 +; VGPRBlocks: 0 +; NumSGPRsForWavesPerEU: 96 +; NumVGPRsForWavesPerEU: 4 +; Occupancy: 8 +; WaveLimiterHint : 1 +; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0 +; COMPUTE_PGM_RSRC2:USER_SGPR: 2 +; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0 +; COMPUTE_PGM_RSRC2:TGID_X_EN: 1 +; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 +; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 +; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 + .section .AMDGPU.config + .long 47176 + .long 11469504 + .long 47180 + .long 132 + .long 47200 + .long 0 + .long 4 + .long 0 + .long 8 + .long 0 + .text + .globl and_xor_and ; -- Begin function and_xor_and + .p2align 8 + .type and_xor_and,@function +and_xor_and: ; @and_xor_and +; %bb.0: ; %bb + s_load_dwordx2 s[2:3], s[0:1], 0x24 + s_load_dword s0, s[0:1], 0x2c + v_lshlrev_b32_e32 v0, 2, v0 + v_mov_b32_e32 v3, 0x7020104 + s_waitcnt lgkmcnt(0) + v_mov_b32_e32 v1, s3 + v_add_u32_e32 v0, vcc, s2, v0 + v_addc_u32_e32 v1, vcc, 0, v1, vcc + flat_load_dword v2, v[0:1] + s_waitcnt vmcnt(0) + v_perm_b32 v2, v2, s0, v3 + flat_store_dword v[0:1], v2 + s_endpgm +.Lfunc_end6: + .size and_xor_and, .Lfunc_end6-and_xor_and + ; -- End function + .section .AMDGPU.csdata +; Kernel info: +; codeLenInByte = 76 +; NumSgprs: 96 +; NumVgprs: 4 +; ScratchSize: 0 +; MemoryBound: 0 +; FloatMode: 240 +; IeeeMode: 1 +; LDSByteSize: 0 bytes/workgroup (compile time only) +; SGPRBlocks: 11 +; VGPRBlocks: 0 +; NumSGPRsForWavesPerEU: 96 +; NumVGPRsForWavesPerEU: 4 +; Occupancy: 8 +; WaveLimiterHint : 1 +; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0 +; COMPUTE_PGM_RSRC2:USER_SGPR: 2 +; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0 +; COMPUTE_PGM_RSRC2:TGID_X_EN: 1 +; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 +; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 +; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 + .section .AMDGPU.config + .long 47176 + .long 11469504 + .long 47180 + .long 132 + .long 47200 + .long 0 + .long 4 + .long 0 + .long 8 + .long 0 + .text + .globl and_or_or_and ; -- Begin function and_or_or_and + .p2align 8 + .type and_or_or_and,@function +and_or_or_and: ; @and_or_or_and 
+; %bb.0: ; %bb + s_load_dwordx2 s[2:3], s[0:1], 0x24 + s_load_dword s0, s[0:1], 0x2c + v_lshlrev_b32_e32 v0, 2, v0 + s_waitcnt lgkmcnt(0) + v_mov_b32_e32 v1, s3 + v_add_u32_e32 v0, vcc, s2, v0 + v_addc_u32_e32 v1, vcc, 0, v1, vcc + flat_load_dword v2, v[0:1] + s_and_b32 s0, s0, 0xff00 + s_or_b32 s0, s0, 0xffff0000 + s_waitcnt vmcnt(0) + v_and_b32_e32 v2, 0xff00ff, v2 + v_or_b32_e32 v2, s0, v2 + flat_store_dword v[0:1], v2 + s_endpgm +.Lfunc_end7: + .size and_or_or_and, .Lfunc_end7-and_or_or_and + ; -- End function + .section .AMDGPU.csdata +; Kernel info: +; codeLenInByte = 88 +; NumSgprs: 96 +; NumVgprs: 3 +; ScratchSize: 0 +; MemoryBound: 0 +; FloatMode: 240 +; IeeeMode: 1 +; LDSByteSize: 0 bytes/workgroup (compile time only) +; SGPRBlocks: 11 +; VGPRBlocks: 0 +; NumSGPRsForWavesPerEU: 96 +; NumVGPRsForWavesPerEU: 3 +; Occupancy: 8 +; WaveLimiterHint : 1 +; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0 +; COMPUTE_PGM_RSRC2:USER_SGPR: 2 +; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0 +; COMPUTE_PGM_RSRC2:TGID_X_EN: 1 +; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 +; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 +; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 + .section .AMDGPU.config + .long 47176 + .long 11469504 + .long 47180 + .long 132 + .long 47200 + .long 0 + .long 4 + .long 0 + .long 8 + .long 0 + .text + .globl and_or_and_shl ; -- Begin function and_or_and_shl + .p2align 8 + .type and_or_and_shl,@function +and_or_and_shl: ; @and_or_and_shl +; %bb.0: ; %bb + s_load_dwordx2 s[2:3], s[0:1], 0x24 + s_load_dword s0, s[0:1], 0x2c + v_lshlrev_b32_e32 v0, 2, v0 + v_mov_b32_e32 v3, 0x50c0c00 + s_waitcnt lgkmcnt(0) + v_mov_b32_e32 v1, s3 + v_add_u32_e32 v0, vcc, s2, v0 + v_addc_u32_e32 v1, vcc, 0, v1, vcc + flat_load_dword v2, v[0:1] + s_waitcnt vmcnt(0) + v_perm_b32 v2, v2, s0, v3 + flat_store_dword v[0:1], v2 + s_endpgm +.Lfunc_end8: + .size and_or_and_shl, .Lfunc_end8-and_or_and_shl + ; -- End function + .section .AMDGPU.csdata +; Kernel info: +; codeLenInByte = 76 +; NumSgprs: 96 +; NumVgprs: 4 +; ScratchSize: 0 +; 
MemoryBound: 0 +; FloatMode: 240 +; IeeeMode: 1 +; LDSByteSize: 0 bytes/workgroup (compile time only) +; SGPRBlocks: 11 +; VGPRBlocks: 0 +; NumSGPRsForWavesPerEU: 96 +; NumVGPRsForWavesPerEU: 4 +; Occupancy: 8 +; WaveLimiterHint : 1 +; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0 +; COMPUTE_PGM_RSRC2:USER_SGPR: 2 +; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0 +; COMPUTE_PGM_RSRC2:TGID_X_EN: 1 +; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 +; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 +; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 + .section .AMDGPU.config + .long 47176 + .long 11469504 + .long 47180 + .long 132 + .long 47200 + .long 0 + .long 4 + .long 0 + .long 8 + .long 0 + .text + .globl or_and_or ; -- Begin function or_and_or + .p2align 8 + .type or_and_or,@function +or_and_or: ; @or_and_or +; %bb.0: ; %bb + s_load_dwordx2 s[2:3], s[0:1], 0x24 + s_load_dword s0, s[0:1], 0x2c + v_lshlrev_b32_e32 v0, 2, v0 + v_mov_b32_e32 v3, 0x7020104 + s_waitcnt lgkmcnt(0) + v_mov_b32_e32 v1, s3 + v_add_u32_e32 v0, vcc, s2, v0 + v_addc_u32_e32 v1, vcc, 0, v1, vcc + flat_load_dword v2, v[0:1] + s_waitcnt vmcnt(0) + v_perm_b32 v2, v2, s0, v3 + flat_store_dword v[0:1], v2 + s_endpgm +.Lfunc_end9: + .size or_and_or, .Lfunc_end9-or_and_or + ; -- End function + .section .AMDGPU.csdata +; Kernel info: +; codeLenInByte = 76 +; NumSgprs: 96 +; NumVgprs: 4 +; ScratchSize: 0 +; MemoryBound: 0 +; FloatMode: 240 +; IeeeMode: 1 +; LDSByteSize: 0 bytes/workgroup (compile time only) +; SGPRBlocks: 11 +; VGPRBlocks: 0 +; NumSGPRsForWavesPerEU: 96 +; NumVGPRsForWavesPerEU: 4 +; Occupancy: 8 +; WaveLimiterHint : 1 +; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0 +; COMPUTE_PGM_RSRC2:USER_SGPR: 2 +; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0 +; COMPUTE_PGM_RSRC2:TGID_X_EN: 1 +; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 +; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 +; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 + .section .AMDGPU.config + .long 47176 + .long 11469505 + .long 47180 + .long 132 + .long 47200 + .long 0 + .long 4 + .long 0 + .long 8 + .long 0 + .text + .globl known_ffff0500 ; -- Begin function 
known_ffff0500 + .p2align 8 + .type known_ffff0500,@function +known_ffff0500: ; @known_ffff0500 +; %bb.0: ; %bb + s_load_dwordx2 s[2:3], s[0:1], 0x24 + s_load_dword s0, s[0:1], 0x2c + v_lshlrev_b32_e32 v0, 2, v0 + v_mov_b32_e32 v5, 0xffff8004 + s_waitcnt lgkmcnt(0) + v_mov_b32_e32 v1, s3 + v_add_u32_e32 v0, vcc, s2, v0 + v_addc_u32_e32 v1, vcc, 0, v1, vcc + flat_load_dword v4, v[0:1] + s_bitset1_b32 s0, 15 + s_and_b32 s0, s0, 0xff00 + s_or_b32 s0, s0, 0xffff0000 + v_mov_b32_e32 v2, s2 + v_mov_b32_e32 v3, s3 + s_waitcnt vmcnt(0) + v_or_b32_e32 v4, 4, v4 + v_and_b32_e32 v4, 0xff00ff, v4 + v_or_b32_e32 v4, s0, v4 + flat_store_dword v[0:1], v4 + flat_store_dword v[2:3], v5 + s_endpgm +.Lfunc_end10: + .size known_ffff0500, .Lfunc_end10-known_ffff0500 + ; -- End function + .section .AMDGPU.csdata +; Kernel info: +; codeLenInByte = 120 +; NumSgprs: 96 +; NumVgprs: 6 +; ScratchSize: 0 +; MemoryBound: 0 +; FloatMode: 240 +; IeeeMode: 1 +; LDSByteSize: 0 bytes/workgroup (compile time only) +; SGPRBlocks: 11 +; VGPRBlocks: 1 +; NumSGPRsForWavesPerEU: 96 +; NumVGPRsForWavesPerEU: 6 +; Occupancy: 8 +; WaveLimiterHint : 1 +; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0 +; COMPUTE_PGM_RSRC2:USER_SGPR: 2 +; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0 +; COMPUTE_PGM_RSRC2:TGID_X_EN: 1 +; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 +; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 +; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 + .section .AMDGPU.config + .long 47176 + .long 11469505 + .long 47180 + .long 132 + .long 47200 + .long 0 + .long 4 + .long 0 + .long 8 + .long 0 + .text + .globl known_050c0c00 ; -- Begin function known_050c0c00 + .p2align 8 + .type known_050c0c00,@function +known_050c0c00: ; @known_050c0c00 +; %bb.0: ; %bb + s_load_dwordx2 s[2:3], s[0:1], 0x24 + s_load_dword s0, s[0:1], 0x2c + v_lshlrev_b32_e32 v0, 2, v0 + v_mov_b32_e32 v5, 0x50c0c00 + v_mov_b32_e32 v6, 4 + s_waitcnt lgkmcnt(0) + v_mov_b32_e32 v1, s3 + v_add_u32_e32 v0, vcc, s2, v0 + v_addc_u32_e32 v1, vcc, 0, v1, vcc + flat_load_dword v4, v[0:1] + s_or_b32 s0, s0, 
4 + v_mov_b32_e32 v2, s2 + v_mov_b32_e32 v3, s3 + s_waitcnt vmcnt(0) + v_perm_b32 v4, v4, s0, v5 + flat_store_dword v[0:1], v4 + flat_store_dword v[2:3], v6 + s_endpgm +.Lfunc_end11: + .size known_050c0c00, .Lfunc_end11-known_050c0c00 + ; -- End function + .section .AMDGPU.csdata +; Kernel info: +; codeLenInByte = 100 +; NumSgprs: 96 +; NumVgprs: 7 +; ScratchSize: 0 +; MemoryBound: 0 +; FloatMode: 240 +; IeeeMode: 1 +; LDSByteSize: 0 bytes/workgroup (compile time only) +; SGPRBlocks: 11 +; VGPRBlocks: 1 +; NumSGPRsForWavesPerEU: 96 +; NumVGPRsForWavesPerEU: 7 +; Occupancy: 8 +; WaveLimiterHint : 1 +; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0 +; COMPUTE_PGM_RSRC2:USER_SGPR: 2 +; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0 +; COMPUTE_PGM_RSRC2:TGID_X_EN: 1 +; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 +; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 +; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 + .section .AMDGPU.config + .long 47176 + .long 11469505 + .long 47180 + .long 132 + .long 47200 + .long 0 + .long 4 + .long 0 + .long 8 + .long 0 + .text + .globl known_ffff8004 ; -- Begin function known_ffff8004 + .p2align 8 + .type known_ffff8004,@function +known_ffff8004: ; @known_ffff8004 +; %bb.0: ; %bb + s_load_dwordx2 s[2:3], s[0:1], 0x24 + s_load_dword s0, s[0:1], 0x2c + v_lshlrev_b32_e32 v0, 2, v0 + v_mov_b32_e32 v5, 0xffff0500 + v_mov_b32_e32 v6, 0xffff8004 + s_waitcnt lgkmcnt(0) + v_mov_b32_e32 v1, s3 + v_add_u32_e32 v0, vcc, s2, v0 + v_addc_u32_e32 v1, vcc, 0, v1, vcc + flat_load_dword v4, v[0:1] + s_or_b32 s0, s0, 4 + v_mov_b32_e32 v2, s2 + v_mov_b32_e32 v3, s3 + s_waitcnt vmcnt(0) + v_or_b32_e32 v4, 0x8000, v4 + v_perm_b32 v4, v4, s0, v5 + flat_store_dword v[0:1], v4 + flat_store_dword v[2:3], v6 + s_endpgm +.Lfunc_end12: + .size known_ffff8004, .Lfunc_end12-known_ffff8004 + ; -- End function + .section .AMDGPU.csdata +; Kernel info: +; codeLenInByte = 112 +; NumSgprs: 96 +; NumVgprs: 7 +; ScratchSize: 0 +; MemoryBound: 0 +; FloatMode: 240 +; IeeeMode: 1 +; LDSByteSize: 0 bytes/workgroup (compile time only) +; 
SGPRBlocks: 11 +; VGPRBlocks: 1 +; NumSGPRsForWavesPerEU: 96 +; NumVGPRsForWavesPerEU: 7 +; Occupancy: 8 +; WaveLimiterHint : 1 +; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0 +; COMPUTE_PGM_RSRC2:USER_SGPR: 2 +; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0 +; COMPUTE_PGM_RSRC2:TGID_X_EN: 1 +; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 +; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 +; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 + .section ".note.GNU-stack" + .amd_amdgpu_isa "amdgcn-unknown-linux-gnu-gfx802" diff --git a/llvm/test/CodeGen/AMDGPU/perm.old.s b/llvm/test/CodeGen/AMDGPU/perm.old.s new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/perm.old.s @@ -0,0 +1,748 @@ + .text + .section .AMDGPU.config + .long 47176 + .long 11469504 + .long 47180 + .long 132 + .long 47200 + .long 0 + .long 4 + .long 0 + .long 8 + .long 0 + .text + .globl lsh8_or_and ; -- Begin function lsh8_or_and + .p2align 8 + .type lsh8_or_and,@function +lsh8_or_and: ; @lsh8_or_and +; %bb.0: ; %bb + s_load_dwordx2 s[2:3], s[0:1], 0x24 + s_load_dword s0, s[0:1], 0x2c + v_lshlrev_b32_e32 v0, 2, v0 + v_mov_b32_e32 v3, 0x6050400 + s_waitcnt lgkmcnt(0) + v_mov_b32_e32 v1, s3 + v_add_u32_e32 v0, vcc, s2, v0 + v_addc_u32_e32 v1, vcc, 0, v1, vcc + flat_load_dword v2, v[0:1] + s_waitcnt vmcnt(0) + v_perm_b32 v2, v2, s0, v3 + flat_store_dword v[0:1], v2 + s_endpgm +.Lfunc_end0: + .size lsh8_or_and, .Lfunc_end0-lsh8_or_and + ; -- End function + .section .AMDGPU.csdata +; Kernel info: +; codeLenInByte = 76 +; NumSgprs: 96 +; NumVgprs: 4 +; ScratchSize: 0 +; MemoryBound: 0 +; FloatMode: 240 +; IeeeMode: 1 +; LDSByteSize: 0 bytes/workgroup (compile time only) +; SGPRBlocks: 11 +; VGPRBlocks: 0 +; NumSGPRsForWavesPerEU: 96 +; NumVGPRsForWavesPerEU: 4 +; Occupancy: 8 +; WaveLimiterHint : 1 +; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0 +; COMPUTE_PGM_RSRC2:USER_SGPR: 2 +; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0 +; COMPUTE_PGM_RSRC2:TGID_X_EN: 1 +; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 +; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 +; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 + .section 
.AMDGPU.config + .long 47176 + .long 11469504 + .long 47180 + .long 132 + .long 47200 + .long 0 + .long 4 + .long 0 + .long 8 + .long 0 + .text + .globl lsr24_or_and ; -- Begin function lsr24_or_and + .p2align 8 + .type lsr24_or_and,@function +lsr24_or_and: ; @lsr24_or_and +; %bb.0: ; %bb + s_load_dwordx2 s[2:3], s[0:1], 0x24 + s_load_dword s0, s[0:1], 0x2c + v_lshlrev_b32_e32 v0, 2, v0 + v_mov_b32_e32 v3, 0x7060503 + s_waitcnt lgkmcnt(0) + v_mov_b32_e32 v1, s3 + v_add_u32_e32 v0, vcc, s2, v0 + v_addc_u32_e32 v1, vcc, 0, v1, vcc + flat_load_dword v2, v[0:1] + s_waitcnt vmcnt(0) + v_perm_b32 v2, s0, v2, v3 + flat_store_dword v[0:1], v2 + s_endpgm +.Lfunc_end1: + .size lsr24_or_and, .Lfunc_end1-lsr24_or_and + ; -- End function + .section .AMDGPU.csdata +; Kernel info: +; codeLenInByte = 76 +; NumSgprs: 96 +; NumVgprs: 4 +; ScratchSize: 0 +; MemoryBound: 0 +; FloatMode: 240 +; IeeeMode: 1 +; LDSByteSize: 0 bytes/workgroup (compile time only) +; SGPRBlocks: 11 +; VGPRBlocks: 0 +; NumSGPRsForWavesPerEU: 96 +; NumVGPRsForWavesPerEU: 4 +; Occupancy: 8 +; WaveLimiterHint : 1 +; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0 +; COMPUTE_PGM_RSRC2:USER_SGPR: 2 +; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0 +; COMPUTE_PGM_RSRC2:TGID_X_EN: 1 +; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 +; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 +; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 + .section .AMDGPU.config + .long 47176 + .long 11469504 + .long 47180 + .long 132 + .long 47200 + .long 0 + .long 4 + .long 0 + .long 8 + .long 0 + .text + .globl and_or_lsr24 ; -- Begin function and_or_lsr24 + .p2align 8 + .type and_or_lsr24,@function +and_or_lsr24: ; @and_or_lsr24 +; %bb.0: ; %bb + s_load_dwordx2 s[2:3], s[0:1], 0x24 + s_load_dword s0, s[0:1], 0x2c + v_lshlrev_b32_e32 v0, 2, v0 + v_mov_b32_e32 v3, 0x7060503 + s_waitcnt lgkmcnt(0) + v_mov_b32_e32 v1, s3 + v_add_u32_e32 v0, vcc, s2, v0 + v_addc_u32_e32 v1, vcc, 0, v1, vcc + flat_load_dword v2, v[0:1] + s_waitcnt vmcnt(0) + v_perm_b32 v2, v2, s0, v3 + v_xor_b32_e32 v2, 0x80000000, v2 + 
flat_store_dword v[0:1], v2 + s_endpgm +.Lfunc_end2: + .size and_or_lsr24, .Lfunc_end2-and_or_lsr24 + ; -- End function + .section .AMDGPU.csdata +; Kernel info: +; codeLenInByte = 84 +; NumSgprs: 96 +; NumVgprs: 4 +; ScratchSize: 0 +; MemoryBound: 0 +; FloatMode: 240 +; IeeeMode: 1 +; LDSByteSize: 0 bytes/workgroup (compile time only) +; SGPRBlocks: 11 +; VGPRBlocks: 0 +; NumSGPRsForWavesPerEU: 96 +; NumVGPRsForWavesPerEU: 4 +; Occupancy: 8 +; WaveLimiterHint : 1 +; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0 +; COMPUTE_PGM_RSRC2:USER_SGPR: 2 +; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0 +; COMPUTE_PGM_RSRC2:TGID_X_EN: 1 +; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 +; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 +; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 + .section .AMDGPU.config + .long 47176 + .long 11469504 + .long 47180 + .long 132 + .long 47200 + .long 0 + .long 4 + .long 0 + .long 8 + .long 0 + .text + .globl and_or_and ; -- Begin function and_or_and + .p2align 8 + .type and_or_and,@function +and_or_and: ; @and_or_and +; %bb.0: ; %bb + s_load_dwordx2 s[2:3], s[0:1], 0x24 + s_load_dword s0, s[0:1], 0x2c + v_lshlrev_b32_e32 v0, 2, v0 + v_mov_b32_e32 v3, 0x7020500 + s_waitcnt lgkmcnt(0) + v_mov_b32_e32 v1, s3 + v_add_u32_e32 v0, vcc, s2, v0 + v_addc_u32_e32 v1, vcc, 0, v1, vcc + flat_load_dword v2, v[0:1] + s_waitcnt vmcnt(0) + v_perm_b32 v2, v2, s0, v3 + flat_store_dword v[0:1], v2 + s_endpgm +.Lfunc_end3: + .size and_or_and, .Lfunc_end3-and_or_and + ; -- End function + .section .AMDGPU.csdata +; Kernel info: +; codeLenInByte = 76 +; NumSgprs: 96 +; NumVgprs: 4 +; ScratchSize: 0 +; MemoryBound: 0 +; FloatMode: 240 +; IeeeMode: 1 +; LDSByteSize: 0 bytes/workgroup (compile time only) +; SGPRBlocks: 11 +; VGPRBlocks: 0 +; NumSGPRsForWavesPerEU: 96 +; NumVGPRsForWavesPerEU: 4 +; Occupancy: 8 +; WaveLimiterHint : 1 +; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0 +; COMPUTE_PGM_RSRC2:USER_SGPR: 2 +; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0 +; COMPUTE_PGM_RSRC2:TGID_X_EN: 1 +; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 +; 
COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 +; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 + .section .AMDGPU.config + .long 47176 + .long 11469504 + .long 47180 + .long 132 + .long 47200 + .long 0 + .long 4 + .long 0 + .long 8 + .long 0 + .text + .globl lsh8_or_lsr24 ; -- Begin function lsh8_or_lsr24 + .p2align 8 + .type lsh8_or_lsr24,@function +lsh8_or_lsr24: ; @lsh8_or_lsr24 +; %bb.0: ; %bb + s_load_dwordx2 s[2:3], s[0:1], 0x24 + s_load_dword s0, s[0:1], 0x2c + v_lshlrev_b32_e32 v0, 2, v0 + s_waitcnt lgkmcnt(0) + v_mov_b32_e32 v1, s3 + v_add_u32_e32 v0, vcc, s2, v0 + v_addc_u32_e32 v1, vcc, 0, v1, vcc + flat_load_dword v2, v[0:1] + s_waitcnt vmcnt(0) + v_alignbit_b32 v2, v2, s0, 24 + flat_store_dword v[0:1], v2 + s_endpgm +.Lfunc_end4: + .size lsh8_or_lsr24, .Lfunc_end4-lsh8_or_lsr24 + ; -- End function + .section .AMDGPU.csdata +; Kernel info: +; codeLenInByte = 68 +; NumSgprs: 96 +; NumVgprs: 3 +; ScratchSize: 0 +; MemoryBound: 0 +; FloatMode: 240 +; IeeeMode: 1 +; LDSByteSize: 0 bytes/workgroup (compile time only) +; SGPRBlocks: 11 +; VGPRBlocks: 0 +; NumSGPRsForWavesPerEU: 96 +; NumVGPRsForWavesPerEU: 3 +; Occupancy: 8 +; WaveLimiterHint : 1 +; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0 +; COMPUTE_PGM_RSRC2:USER_SGPR: 2 +; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0 +; COMPUTE_PGM_RSRC2:TGID_X_EN: 1 +; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 +; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 +; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 + .section .AMDGPU.config + .long 47176 + .long 11469504 + .long 47180 + .long 132 + .long 47200 + .long 0 + .long 4 + .long 0 + .long 8 + .long 0 + .text + .globl lsh16_or_lsr24 ; -- Begin function lsh16_or_lsr24 + .p2align 8 + .type lsh16_or_lsr24,@function +lsh16_or_lsr24: ; @lsh16_or_lsr24 +; %bb.0: ; %bb + s_load_dwordx2 s[2:3], s[0:1], 0x24 + s_load_dword s0, s[0:1], 0x2c + v_lshlrev_b32_e32 v0, 2, v0 + v_mov_b32_e32 v3, 0x5040c03 + s_waitcnt lgkmcnt(0) + v_mov_b32_e32 v1, s3 + v_add_u32_e32 v0, vcc, s2, v0 + v_addc_u32_e32 v1, vcc, 0, v1, vcc + flat_load_dword v2, v[0:1] + s_waitcnt vmcnt(0) 
+ v_perm_b32 v2, v2, s0, v3 + flat_store_dword v[0:1], v2 + s_endpgm +.Lfunc_end5: + .size lsh16_or_lsr24, .Lfunc_end5-lsh16_or_lsr24 + ; -- End function + .section .AMDGPU.csdata +; Kernel info: +; codeLenInByte = 76 +; NumSgprs: 96 +; NumVgprs: 4 +; ScratchSize: 0 +; MemoryBound: 0 +; FloatMode: 240 +; IeeeMode: 1 +; LDSByteSize: 0 bytes/workgroup (compile time only) +; SGPRBlocks: 11 +; VGPRBlocks: 0 +; NumSGPRsForWavesPerEU: 96 +; NumVGPRsForWavesPerEU: 4 +; Occupancy: 8 +; WaveLimiterHint : 1 +; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0 +; COMPUTE_PGM_RSRC2:USER_SGPR: 2 +; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0 +; COMPUTE_PGM_RSRC2:TGID_X_EN: 1 +; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 +; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 +; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 + .section .AMDGPU.config + .long 47176 + .long 11469504 + .long 47180 + .long 132 + .long 47200 + .long 0 + .long 4 + .long 0 + .long 8 + .long 0 + .text + .globl and_xor_and ; -- Begin function and_xor_and + .p2align 8 + .type and_xor_and,@function +and_xor_and: ; @and_xor_and +; %bb.0: ; %bb + s_load_dwordx2 s[2:3], s[0:1], 0x24 + s_load_dword s0, s[0:1], 0x2c + v_lshlrev_b32_e32 v0, 2, v0 + v_mov_b32_e32 v3, 0x7020104 + s_waitcnt lgkmcnt(0) + v_mov_b32_e32 v1, s3 + v_add_u32_e32 v0, vcc, s2, v0 + v_addc_u32_e32 v1, vcc, 0, v1, vcc + flat_load_dword v2, v[0:1] + s_waitcnt vmcnt(0) + v_perm_b32 v2, v2, s0, v3 + flat_store_dword v[0:1], v2 + s_endpgm +.Lfunc_end6: + .size and_xor_and, .Lfunc_end6-and_xor_and + ; -- End function + .section .AMDGPU.csdata +; Kernel info: +; codeLenInByte = 76 +; NumSgprs: 96 +; NumVgprs: 4 +; ScratchSize: 0 +; MemoryBound: 0 +; FloatMode: 240 +; IeeeMode: 1 +; LDSByteSize: 0 bytes/workgroup (compile time only) +; SGPRBlocks: 11 +; VGPRBlocks: 0 +; NumSGPRsForWavesPerEU: 96 +; NumVGPRsForWavesPerEU: 4 +; Occupancy: 8 +; WaveLimiterHint : 1 +; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0 +; COMPUTE_PGM_RSRC2:USER_SGPR: 2 +; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0 +; COMPUTE_PGM_RSRC2:TGID_X_EN: 1 +; 
COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 +; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 +; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 + .section .AMDGPU.config + .long 47176 + .long 11469504 + .long 47180 + .long 132 + .long 47200 + .long 0 + .long 4 + .long 0 + .long 8 + .long 0 + .text + .globl and_or_or_and ; -- Begin function and_or_or_and + .p2align 8 + .type and_or_or_and,@function +and_or_or_and: ; @and_or_or_and +; %bb.0: ; %bb + s_load_dwordx2 s[2:3], s[0:1], 0x24 + s_load_dword s0, s[0:1], 0x2c + v_lshlrev_b32_e32 v0, 2, v0 + v_mov_b32_e32 v3, 0xffff0500 + s_waitcnt lgkmcnt(0) + v_mov_b32_e32 v1, s3 + v_add_u32_e32 v0, vcc, s2, v0 + v_addc_u32_e32 v1, vcc, 0, v1, vcc + flat_load_dword v2, v[0:1] + s_waitcnt vmcnt(0) + v_perm_b32 v2, s0, v2, v3 + flat_store_dword v[0:1], v2 + s_endpgm +.Lfunc_end7: + .size and_or_or_and, .Lfunc_end7-and_or_or_and + ; -- End function + .section .AMDGPU.csdata +; Kernel info: +; codeLenInByte = 76 +; NumSgprs: 96 +; NumVgprs: 4 +; ScratchSize: 0 +; MemoryBound: 0 +; FloatMode: 240 +; IeeeMode: 1 +; LDSByteSize: 0 bytes/workgroup (compile time only) +; SGPRBlocks: 11 +; VGPRBlocks: 0 +; NumSGPRsForWavesPerEU: 96 +; NumVGPRsForWavesPerEU: 4 +; Occupancy: 8 +; WaveLimiterHint : 1 +; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0 +; COMPUTE_PGM_RSRC2:USER_SGPR: 2 +; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0 +; COMPUTE_PGM_RSRC2:TGID_X_EN: 1 +; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 +; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 +; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 + .section .AMDGPU.config + .long 47176 + .long 11469504 + .long 47180 + .long 132 + .long 47200 + .long 0 + .long 4 + .long 0 + .long 8 + .long 0 + .text + .globl and_or_and_shl ; -- Begin function and_or_and_shl + .p2align 8 + .type and_or_and_shl,@function +and_or_and_shl: ; @and_or_and_shl +; %bb.0: ; %bb + s_load_dwordx2 s[2:3], s[0:1], 0x24 + s_load_dword s0, s[0:1], 0x2c + v_lshlrev_b32_e32 v0, 2, v0 + v_mov_b32_e32 v3, 0x50c0c00 + s_waitcnt lgkmcnt(0) + v_mov_b32_e32 v1, s3 + v_add_u32_e32 v0, vcc, s2, v0 + v_addc_u32_e32 v1, vcc, 
0, v1, vcc + flat_load_dword v2, v[0:1] + s_waitcnt vmcnt(0) + v_perm_b32 v2, v2, s0, v3 + flat_store_dword v[0:1], v2 + s_endpgm +.Lfunc_end8: + .size and_or_and_shl, .Lfunc_end8-and_or_and_shl + ; -- End function + .section .AMDGPU.csdata +; Kernel info: +; codeLenInByte = 76 +; NumSgprs: 96 +; NumVgprs: 4 +; ScratchSize: 0 +; MemoryBound: 0 +; FloatMode: 240 +; IeeeMode: 1 +; LDSByteSize: 0 bytes/workgroup (compile time only) +; SGPRBlocks: 11 +; VGPRBlocks: 0 +; NumSGPRsForWavesPerEU: 96 +; NumVGPRsForWavesPerEU: 4 +; Occupancy: 8 +; WaveLimiterHint : 1 +; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0 +; COMPUTE_PGM_RSRC2:USER_SGPR: 2 +; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0 +; COMPUTE_PGM_RSRC2:TGID_X_EN: 1 +; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 +; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 +; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 + .section .AMDGPU.config + .long 47176 + .long 11469504 + .long 47180 + .long 132 + .long 47200 + .long 0 + .long 4 + .long 0 + .long 8 + .long 0 + .text + .globl or_and_or ; -- Begin function or_and_or + .p2align 8 + .type or_and_or,@function +or_and_or: ; @or_and_or +; %bb.0: ; %bb + s_load_dwordx2 s[2:3], s[0:1], 0x24 + s_load_dword s0, s[0:1], 0x2c + v_lshlrev_b32_e32 v0, 2, v0 + v_mov_b32_e32 v3, 0x7020104 + s_waitcnt lgkmcnt(0) + v_mov_b32_e32 v1, s3 + v_add_u32_e32 v0, vcc, s2, v0 + v_addc_u32_e32 v1, vcc, 0, v1, vcc + flat_load_dword v2, v[0:1] + s_waitcnt vmcnt(0) + v_perm_b32 v2, v2, s0, v3 + flat_store_dword v[0:1], v2 + s_endpgm +.Lfunc_end9: + .size or_and_or, .Lfunc_end9-or_and_or + ; -- End function + .section .AMDGPU.csdata +; Kernel info: +; codeLenInByte = 76 +; NumSgprs: 96 +; NumVgprs: 4 +; ScratchSize: 0 +; MemoryBound: 0 +; FloatMode: 240 +; IeeeMode: 1 +; LDSByteSize: 0 bytes/workgroup (compile time only) +; SGPRBlocks: 11 +; VGPRBlocks: 0 +; NumSGPRsForWavesPerEU: 96 +; NumVGPRsForWavesPerEU: 4 +; Occupancy: 8 +; WaveLimiterHint : 1 +; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0 +; COMPUTE_PGM_RSRC2:USER_SGPR: 2 +; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0 +; 
COMPUTE_PGM_RSRC2:TGID_X_EN: 1 +; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 +; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 +; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 + .section .AMDGPU.config + .long 47176 + .long 11469505 + .long 47180 + .long 132 + .long 47200 + .long 0 + .long 4 + .long 0 + .long 8 + .long 0 + .text + .globl known_ffff0500 ; -- Begin function known_ffff0500 + .p2align 8 + .type known_ffff0500,@function +known_ffff0500: ; @known_ffff0500 +; %bb.0: ; %bb + s_load_dwordx2 s[2:3], s[0:1], 0x24 + s_load_dword s0, s[0:1], 0x2c + v_lshlrev_b32_e32 v0, 2, v0 + v_mov_b32_e32 v5, 0xffff0500 + v_mov_b32_e32 v6, 0xffff8004 + s_waitcnt lgkmcnt(0) + v_mov_b32_e32 v1, s3 + v_add_u32_e32 v0, vcc, s2, v0 + v_addc_u32_e32 v1, vcc, 0, v1, vcc + flat_load_dword v4, v[0:1] + s_bitset1_b32 s0, 15 + v_mov_b32_e32 v2, s2 + v_mov_b32_e32 v3, s3 + s_waitcnt vmcnt(0) + v_or_b32_e32 v4, 4, v4 + v_perm_b32 v4, s0, v4, v5 + flat_store_dword v[0:1], v4 + flat_store_dword v[2:3], v6 + s_endpgm +.Lfunc_end10: + .size known_ffff0500, .Lfunc_end10-known_ffff0500 + ; -- End function + .section .AMDGPU.csdata +; Kernel info: +; codeLenInByte = 108 +; NumSgprs: 96 +; NumVgprs: 7 +; ScratchSize: 0 +; MemoryBound: 0 +; FloatMode: 240 +; IeeeMode: 1 +; LDSByteSize: 0 bytes/workgroup (compile time only) +; SGPRBlocks: 11 +; VGPRBlocks: 1 +; NumSGPRsForWavesPerEU: 96 +; NumVGPRsForWavesPerEU: 7 +; Occupancy: 8 +; WaveLimiterHint : 1 +; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0 +; COMPUTE_PGM_RSRC2:USER_SGPR: 2 +; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0 +; COMPUTE_PGM_RSRC2:TGID_X_EN: 1 +; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 +; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 +; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 + .section .AMDGPU.config + .long 47176 + .long 11469505 + .long 47180 + .long 132 + .long 47200 + .long 0 + .long 4 + .long 0 + .long 8 + .long 0 + .text + .globl known_050c0c00 ; -- Begin function known_050c0c00 + .p2align 8 + .type known_050c0c00,@function +known_050c0c00: ; @known_050c0c00 +; %bb.0: ; %bb + s_load_dwordx2 s[2:3], s[0:1], 
0x24 + s_load_dword s0, s[0:1], 0x2c + v_lshlrev_b32_e32 v0, 2, v0 + v_mov_b32_e32 v5, 0x50c0c00 + v_mov_b32_e32 v6, 4 + s_waitcnt lgkmcnt(0) + v_mov_b32_e32 v1, s3 + v_add_u32_e32 v0, vcc, s2, v0 + v_addc_u32_e32 v1, vcc, 0, v1, vcc + flat_load_dword v4, v[0:1] + s_or_b32 s0, s0, 4 + v_mov_b32_e32 v2, s2 + v_mov_b32_e32 v3, s3 + s_waitcnt vmcnt(0) + v_perm_b32 v4, v4, s0, v5 + flat_store_dword v[0:1], v4 + flat_store_dword v[2:3], v6 + s_endpgm +.Lfunc_end11: + .size known_050c0c00, .Lfunc_end11-known_050c0c00 + ; -- End function + .section .AMDGPU.csdata +; Kernel info: +; codeLenInByte = 100 +; NumSgprs: 96 +; NumVgprs: 7 +; ScratchSize: 0 +; MemoryBound: 0 +; FloatMode: 240 +; IeeeMode: 1 +; LDSByteSize: 0 bytes/workgroup (compile time only) +; SGPRBlocks: 11 +; VGPRBlocks: 1 +; NumSGPRsForWavesPerEU: 96 +; NumVGPRsForWavesPerEU: 7 +; Occupancy: 8 +; WaveLimiterHint : 1 +; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0 +; COMPUTE_PGM_RSRC2:USER_SGPR: 2 +; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0 +; COMPUTE_PGM_RSRC2:TGID_X_EN: 1 +; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 +; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 +; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 + .section .AMDGPU.config + .long 47176 + .long 11469505 + .long 47180 + .long 132 + .long 47200 + .long 0 + .long 4 + .long 0 + .long 8 + .long 0 + .text + .globl known_ffff8004 ; -- Begin function known_ffff8004 + .p2align 8 + .type known_ffff8004,@function +known_ffff8004: ; @known_ffff8004 +; %bb.0: ; %bb + s_load_dwordx2 s[2:3], s[0:1], 0x24 + s_load_dword s0, s[0:1], 0x2c + v_lshlrev_b32_e32 v0, 2, v0 + v_mov_b32_e32 v5, 0xffff0500 + v_mov_b32_e32 v6, 0xffff8004 + s_waitcnt lgkmcnt(0) + v_mov_b32_e32 v1, s3 + v_add_u32_e32 v0, vcc, s2, v0 + v_addc_u32_e32 v1, vcc, 0, v1, vcc + flat_load_dword v4, v[0:1] + s_or_b32 s0, s0, 4 + v_mov_b32_e32 v2, s2 + v_mov_b32_e32 v3, s3 + s_waitcnt vmcnt(0) + v_or_b32_e32 v4, 0x8000, v4 + v_perm_b32 v4, v4, s0, v5 + flat_store_dword v[0:1], v4 + flat_store_dword v[2:3], v6 + s_endpgm +.Lfunc_end12: + .size 
known_ffff8004, .Lfunc_end12-known_ffff8004 + ; -- End function + .section .AMDGPU.csdata +; Kernel info: +; codeLenInByte = 112 +; NumSgprs: 96 +; NumVgprs: 7 +; ScratchSize: 0 +; MemoryBound: 0 +; FloatMode: 240 +; IeeeMode: 1 +; LDSByteSize: 0 bytes/workgroup (compile time only) +; SGPRBlocks: 11 +; VGPRBlocks: 1 +; NumSGPRsForWavesPerEU: 96 +; NumVGPRsForWavesPerEU: 7 +; Occupancy: 8 +; WaveLimiterHint : 1 +; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0 +; COMPUTE_PGM_RSRC2:USER_SGPR: 2 +; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0 +; COMPUTE_PGM_RSRC2:TGID_X_EN: 1 +; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 +; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 +; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 + .section ".note.GNU-stack" + .amd_amdgpu_isa "amdgcn-unknown-linux-gnu-gfx802" diff --git a/llvm/test/CodeGen/AMDGPU/permute.ll b/llvm/test/CodeGen/AMDGPU/permute.ll --- a/llvm/test/CodeGen/AMDGPU/permute.ll +++ b/llvm/test/CodeGen/AMDGPU/permute.ll @@ -106,8 +106,10 @@ } ; GCN-LABEL: {{^}}and_or_or_and: -; GCN: v_mov_b32_e32 [[MASK:v[0-9]+]], 0xffff0500 -; GCN: v_perm_b32 v{{[0-9]+}}, {{[vs][0-9]+}}, {{[vs][0-9]+}}, [[MASK]] +; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0xff00 +; GCN: s_or_b32 [[SREG:s[0-9]+]], s{{[0-9]+}}, 0xffff0000 +; GCN: v_and_b32_e32 [[VREG:v[0-9]+]], 0xff00ff, v{{[0-9]+}} +; GCN: v_or_b32_e32 v{{[0-9]+}}, [[SREG]], [[VREG]] define amdgpu_kernel void @and_or_or_and(i32 addrspace(1)* nocapture %arg, i32 %arg1) { bb: %id = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -153,9 +155,12 @@ } ; GCN-LABEL: {{^}}known_ffff0500: -; GCN-DAG: v_mov_b32_e32 [[MASK:v[0-9]+]], 0xffff0500 -; GCN-DAG: v_mov_b32_e32 [[RES:v[0-9]+]], 0xffff8004 -; GCN: v_perm_b32 v{{[0-9]+}}, {{[vs][0-9]+}}, {{[vs][0-9]+}}, [[MASK]] +; GCN: v_mov_b32_e32 [[RES:v[0-9]+]], 0xffff8004 +; GCN: s_and_b32 [[SREG:s[0-9]+]], [[SREG]], 0xff00 +; GCN: s_or_b32 [[SREG]], [[SREG]], 0xffff0000 +; GCN: v_and_b32_e32 [[VREG:v[0-9]+]], 0xff00ff, [[VREG]] +; GCN: v_or_b32_e32 [[VREG]], [[SREG]], [[VREG]] +; GCN: store_dword v[{{[0-9:]+}}], 
[[VREG]]{{$}} ; GCN: store_dword v[{{[0-9:]+}}], [[RES]]{{$}} define amdgpu_kernel void @known_ffff0500(i32 addrspace(1)* nocapture %arg, i32 %arg1) { bb: diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll @@ -472,10 +472,10 @@ ; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v0 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v7 -; GFX9-O0-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX9-O0-NEXT: v_or_b32_e64 v0, v0, v3 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v6 -; GFX9-O0-NEXT: v_or_b32_e32 v6, v1, v2 +; GFX9-O0-NEXT: v_or_b32_e64 v6, v1, v2 ; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v0 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v6 diff --git a/llvm/test/CodeGen/AMDGPU/xnor.ll b/llvm/test/CodeGen/AMDGPU/xnor.ll --- a/llvm/test/CodeGen/AMDGPU/xnor.ll +++ b/llvm/test/CodeGen/AMDGPU/xnor.ll @@ -61,8 +61,8 @@ ; GCN-LABEL: {{^}}vector_xnor_i32_one_use ; GCN-NOT: s_xnor_b32 -; GCN: v_not_b32 ; GCN: v_xor_b32 +; GCN: v_not_b32 ; GCN-DL: v_xnor_b32 define i32 @vector_xnor_i32_one_use(i32 %a, i32 %b) { entry: @@ -73,10 +73,10 @@ ; GCN-LABEL: {{^}}vector_xnor_i64_one_use ; GCN-NOT: s_xnor_b64 -; GCN: v_not_b32 -; GCN: v_not_b32 ; GCN: v_xor_b32 ; GCN: v_xor_b32 +; GCN: v_not_b32 +; GCN: v_not_b32 ; GCN-DL: v_xnor_b32 ; GCN-DL: v_xnor_b32 define i64 @vector_xnor_i64_one_use(i64 %a, i64 %b) { @@ -150,8 +150,8 @@ ; GCN-LABEL: {{^}}vector_xor_na_b_i32_one_use ; GCN-NOT: s_xnor_b32 -; GCN: v_not_b32 ; GCN: v_xor_b32 +; GCN: v_not_b32 ; GCN-DL: v_xnor_b32 define i32 @vector_xor_na_b_i32_one_use(i32 %a, i32 %b) { entry: @@ -162,8 +162,8 @@ ; GCN-LABEL: {{^}}vector_xor_a_nb_i32_one_use ; GCN-NOT: s_xnor_b32 -; GCN: v_not_b32 ; GCN: v_xor_b32 +; GCN: v_not_b32 ; GCN-DL: v_xnor_b32 define i32 
@vector_xor_a_nb_i32_one_use(i32 %a, i32 %b) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/xor3.ll b/llvm/test/CodeGen/AMDGPU/xor3.ll --- a/llvm/test/CodeGen/AMDGPU/xor3.ll +++ b/llvm/test/CodeGen/AMDGPU/xor3.ll @@ -26,13 +26,13 @@ define amdgpu_ps float @xor3_vgpr_b(i32 inreg %a, i32 %b, i32 inreg %c) { ; GFX9-LABEL: xor3_vgpr_b: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_xor_b32_e32 v0, s2, v0 -; GFX9-NEXT: v_xor_b32_e32 v0, s3, v0 +; GFX9-NEXT: s_xor_b32 s0, s3, s2 +; GFX9-NEXT: v_xor_b32_e32 v0, s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: xor3_vgpr_b: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_xor3_b32 v0, s2, v0, s3 +; GFX10-NEXT: v_xor3_b32 v0, s3, s2, v0 ; GFX10-NEXT: ; return to shader part epilog %x = xor i32 %a, %b %result = xor i32 %x, %c