Index: llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -2065,8 +2065,33 @@ if (!isVCC(CCReg, *MRI)) { unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32; - MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC) - .addReg(CCReg); + + bool Ret = false; + MachineInstr *CopySCC = nullptr; + MachineInstr *CCDef = MRI->getVRegDef(CCReg); + + // Try to localize def of SCC and avoid generating extra SCC -> SGPR -> SCC + // copies. + if (CCDef && CCDef->getOpcode() == TargetOpcode::G_ICMP && + MRI->hasOneNonDBGUse(CCReg)) { + Register SrcReg = CCDef->getOperand(2).getReg(); + unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI); + + auto Pred = (CmpInst::Predicate)CCDef->getOperand(1).getPredicate(); + int Opcode = getS_CMPOpcode(Pred, Size); + if (Opcode == -1) + return false; + MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode)) + .add(CCDef->getOperand(2)) + .add(CCDef->getOperand(3)); + + Ret |= constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI) && + RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI); + + } else { + CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC) + .addReg(CCReg); + } // The generic constrainSelectedInstRegOperands doesn't work for the scc register // bank, because it does not cover the register class that we used to represent @@ -2077,9 +2102,9 @@ .add(I.getOperand(2)) .add(I.getOperand(3)); - bool Ret = false; Ret |= constrainSelectedInstRegOperands(*Select, TII, TRI, RBI); - Ret |= constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI); + if (CopySCC) + Ret |= constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI); I.eraseFromParent(); return Ret; } Index: llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll @@ -1835,49 +1835,45 @@ ; GCN-LABEL: s_ashr_i65: ; GCN: ; %bb.0: ; GCN-NEXT: s_bfe_i64 s[4:5], s[2:3], 0x10000 -; GCN-NEXT: s_sub_i32 s10, s3, 64 -; GCN-NEXT: s_sub_i32 s8, 64, s3 +; GCN-NEXT: s_sub_i32 s2, s3, 64 +; GCN-NEXT: s_sub_i32 s10, 64, s3 ; GCN-NEXT: s_cmp_lt_u32 s3, 64 -; GCN-NEXT: s_cselect_b32 s11, 1, 0 -; GCN-NEXT: s_cmp_eq_u32 s3, 0 ; GCN-NEXT: s_cselect_b32 s12, 1, 0 +; GCN-NEXT: s_lshr_b64 s[8:9], s[0:1], s3 +; GCN-NEXT: s_lshl_b64 s[10:11], s[4:5], s10 ; GCN-NEXT: s_ashr_i64 s[6:7], s[4:5], s3 -; GCN-NEXT: s_lshr_b64 s[2:3], s[0:1], s3 -; GCN-NEXT: s_lshl_b64 s[8:9], s[4:5], s8 -; GCN-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] -; GCN-NEXT: s_ashr_i32 s8, s5, 31 -; GCN-NEXT: s_ashr_i64 s[4:5], s[4:5], s10 -; GCN-NEXT: s_cmp_lg_u32 s11, 0 -; GCN-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] +; GCN-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] +; GCN-NEXT: s_ashr_i32 s10, s5, 31 +; GCN-NEXT: s_ashr_i64 s[4:5], s[4:5], s2 +; GCN-NEXT: s_cmp_lg_u32 s12, 0 +; GCN-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] +; GCN-NEXT: s_cmp_eq_u32 s3, 0 +; GCN-NEXT: s_mov_b32 s11, s10 +; GCN-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5] ; GCN-NEXT: s_cmp_lg_u32 s12, 0 -; GCN-NEXT: s_mov_b32 s9, s8 -; GCN-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3] -; GCN-NEXT: s_cmp_lg_u32 s11, 0 -; GCN-NEXT: s_cselect_b64 s[2:3], s[6:7], s[8:9] +; GCN-NEXT: s_cselect_b64 s[2:3], s[6:7], s[10:11] ; GCN-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_ashr_i65: ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: s_bfe_i64 s[4:5], s[2:3], 0x10000 ; GFX10PLUS-NEXT: s_sub_i32 s12, s3, 64 -; GFX10PLUS-NEXT: s_sub_i32 s8, 64, s3 +; GFX10PLUS-NEXT: s_sub_i32 s2, 64, s3 ; GFX10PLUS-NEXT: s_cmp_lt_u32 s3, 64 ; GFX10PLUS-NEXT: s_cselect_b32 s13, 1, 0 -; GFX10PLUS-NEXT: s_cmp_eq_u32 s3, 0 -; GFX10PLUS-NEXT: s_cselect_b32 s14, 1, 0 -; GFX10PLUS-NEXT: s_ashr_i64 s[6:7], s[4:5], s3 -; GFX10PLUS-NEXT: s_lshr_b64 s[2:3], s[0:1], s3 -; GFX10PLUS-NEXT: s_lshl_b64 s[8:9], s[4:5], s8 -; GFX10PLUS-NEXT: s_ashr_i32 s10, s5, 31 -; GFX10PLUS-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] +; GFX10PLUS-NEXT: s_lshr_b64 s[6:7], s[0:1], s3 +; GFX10PLUS-NEXT: s_lshl_b64 s[8:9], s[4:5], s2 +; GFX10PLUS-NEXT: s_ashr_i64 s[10:11], s[4:5], s3 +; GFX10PLUS-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GFX10PLUS-NEXT: s_ashr_i32 s2, s5, 31 ; GFX10PLUS-NEXT: s_ashr_i64 s[4:5], s[4:5], s12 ; GFX10PLUS-NEXT: s_cmp_lg_u32 s13, 0 -; GFX10PLUS-NEXT: s_mov_b32 s11, s10 -; GFX10PLUS-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] -; GFX10PLUS-NEXT: s_cmp_lg_u32 s14, 0 -; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3] +; GFX10PLUS-NEXT: s_cselect_b64 s[4:5], s[6:7], s[4:5] +; GFX10PLUS-NEXT: s_cmp_eq_u32 s3, 0 +; GFX10PLUS-NEXT: s_mov_b32 s3, s2 +; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5] ; GFX10PLUS-NEXT: s_cmp_lg_u32 s13, 0 -; GFX10PLUS-NEXT: s_cselect_b64 s[2:3], s[6:7], s[10:11] +; GFX10PLUS-NEXT: s_cselect_b64 s[2:3], s[10:11], s[2:3] ; GFX10PLUS-NEXT: ; return to shader part epilog %result = ashr i65 %value, %amount ret i65 %result Index: llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll @@ -5830,40 +5830,36 @@ ; GFX6-NEXT: s_sub_i32 s9, s12, 64 ; GFX6-NEXT: s_sub_i32 s10, 64, s12 ; GFX6-NEXT: s_cmp_lt_u32 s12, 64 -; GFX6-NEXT: s_cselect_b32 s18, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s12, 0 -; GFX6-NEXT: s_cselect_b32 s19, 1, 0 -; GFX6-NEXT: s_lshl_b64 s[14:15], s[0:1], s12 +; GFX6-NEXT: s_cselect_b32 s13, 1, 0 ; GFX6-NEXT: s_lshr_b64 s[16:17], s[0:1], s10 -; GFX6-NEXT: s_lshl_b64 s[12:13], s[2:3], s12 -; GFX6-NEXT: s_or_b64 s[12:13], s[16:17], s[12:13] +; GFX6-NEXT: s_lshl_b64 s[18:19], s[2:3], s12 +; GFX6-NEXT: s_lshl_b64 s[14:15], s[0:1], s12 +; GFX6-NEXT: s_or_b64 s[16:17], s[16:17], s[18:19] ; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s9 -; GFX6-NEXT: s_cmp_lg_u32 s18, 0 +; GFX6-NEXT: s_cmp_lg_u32 s13, 0 ; GFX6-NEXT: s_cselect_b64 s[14:15], s[14:15], 0 -; GFX6-NEXT: s_cselect_b64 s[0:1], s[12:13], s[0:1] -; GFX6-NEXT: s_cmp_lg_u32 s19, 0 +; GFX6-NEXT: s_cselect_b64 s[0:1], s[16:17], s[0:1] +; GFX6-NEXT: s_cmp_eq_u32 s12, 0 ; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] ; GFX6-NEXT: s_lshr_b64 s[0:1], s[4:5], 1 ; GFX6-NEXT: s_lshl_b32 s5, s6, 31 ; GFX6-NEXT: s_mov_b32 s4, s11 ; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] ; GFX6-NEXT: s_lshr_b64 s[4:5], s[6:7], 1 -; GFX6-NEXT: s_sub_i32 s12, s8, 64 -; GFX6-NEXT: s_sub_i32 s10, 64, s8 +; GFX6-NEXT: s_sub_i32 s9, s8, 64 +; GFX6-NEXT: s_sub_i32 s12, 64, s8 ; GFX6-NEXT: s_cmp_lt_u32 s8, 64 -; GFX6-NEXT: s_cselect_b32 s13, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s8, 0 ; GFX6-NEXT: s_cselect_b32 s16, 1, 0 +; GFX6-NEXT: s_lshr_b64 s[10:11], s[0:1], s8 +; GFX6-NEXT: s_lshl_b64 s[12:13], s[4:5], s12 ; GFX6-NEXT: s_lshr_b64 s[6:7], s[4:5], s8 -; GFX6-NEXT: s_lshr_b64 s[8:9], s[0:1], s8 -; GFX6-NEXT: s_lshl_b64 s[10:11], s[4:5], s10 -; GFX6-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] -; GFX6-NEXT: s_lshr_b64 s[4:5], s[4:5], s12 -; GFX6-NEXT: s_cmp_lg_u32 s13, 0 -; GFX6-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] +; GFX6-NEXT: s_or_b64 s[10:11], s[10:11], s[12:13] +; GFX6-NEXT: s_lshr_b64 s[4:5], s[4:5], s9 ; GFX6-NEXT: s_cmp_lg_u32 s16, 0 +; GFX6-NEXT: s_cselect_b64 s[4:5], s[10:11], s[4:5] +; GFX6-NEXT: s_cmp_eq_u32 s8, 0 ; GFX6-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5] -; GFX6-NEXT: s_cmp_lg_u32 s13, 0 +; GFX6-NEXT: s_cmp_lg_u32 s16, 0 ; GFX6-NEXT: s_cselect_b64 s[4:5], s[6:7], 0 ; GFX6-NEXT: s_or_b64 s[0:1], s[14:15], s[0:1] ; GFX6-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] @@ -5877,40 +5873,36 @@ ; GFX8-NEXT: s_sub_i32 s9, s12, 64 ; GFX8-NEXT: s_sub_i32 s10, 64, s12 ; GFX8-NEXT: s_cmp_lt_u32 s12, 64 -; GFX8-NEXT: s_cselect_b32 s18, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s12, 0 -; GFX8-NEXT: s_cselect_b32 s19, 1, 0 -; GFX8-NEXT: s_lshl_b64 s[14:15], s[0:1], s12 +; GFX8-NEXT: s_cselect_b32 s13, 1, 0 ; GFX8-NEXT: s_lshr_b64 s[16:17], s[0:1], s10 -; GFX8-NEXT: s_lshl_b64 s[12:13], s[2:3], s12 -; GFX8-NEXT: s_or_b64 s[12:13], s[16:17], s[12:13] +; GFX8-NEXT: s_lshl_b64 s[18:19], s[2:3], s12 +; GFX8-NEXT: s_lshl_b64 s[14:15], s[0:1], s12 +; GFX8-NEXT: s_or_b64 s[16:17], s[16:17], s[18:19] ; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s9 -; GFX8-NEXT: s_cmp_lg_u32 s18, 0 +; GFX8-NEXT: s_cmp_lg_u32 s13, 0 ; GFX8-NEXT: s_cselect_b64 s[14:15], s[14:15], 0 -; GFX8-NEXT: s_cselect_b64 s[0:1], s[12:13], s[0:1] -; GFX8-NEXT: s_cmp_lg_u32 s19, 0 +; GFX8-NEXT: s_cselect_b64 s[0:1], s[16:17], s[0:1] +; GFX8-NEXT: s_cmp_eq_u32 s12, 0 ; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] ; GFX8-NEXT: s_lshr_b64 s[0:1], s[4:5], 1 ; GFX8-NEXT: s_lshl_b32 s5, s6, 31 ; GFX8-NEXT: s_mov_b32 s4, s11 ; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] ; GFX8-NEXT: s_lshr_b64 s[4:5], s[6:7], 1 -; GFX8-NEXT: s_sub_i32 s12, s8, 64 -; GFX8-NEXT: s_sub_i32 s10, 64, s8 +; GFX8-NEXT: s_sub_i32 s9, s8, 64 +; GFX8-NEXT: s_sub_i32 s12, 64, s8 ; GFX8-NEXT: s_cmp_lt_u32 s8, 64 -; GFX8-NEXT: s_cselect_b32 s13, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s8, 0 ; GFX8-NEXT: s_cselect_b32 s16, 1, 0 +; GFX8-NEXT: s_lshr_b64 s[10:11], s[0:1], s8 +; GFX8-NEXT: s_lshl_b64 s[12:13], s[4:5], s12 ; GFX8-NEXT: s_lshr_b64 s[6:7], s[4:5], s8 -; GFX8-NEXT: s_lshr_b64 s[8:9], s[0:1], s8 -; GFX8-NEXT: s_lshl_b64 s[10:11], s[4:5], s10 -; GFX8-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] -; GFX8-NEXT: s_lshr_b64 s[4:5], s[4:5], s12 -; GFX8-NEXT: s_cmp_lg_u32 s13, 0 -; GFX8-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] +; GFX8-NEXT: s_or_b64 s[10:11], s[10:11], s[12:13] +; GFX8-NEXT: s_lshr_b64 s[4:5], s[4:5], s9 ; GFX8-NEXT: s_cmp_lg_u32 s16, 0 +; GFX8-NEXT: s_cselect_b64 s[4:5], s[10:11], s[4:5] +; GFX8-NEXT: s_cmp_eq_u32 s8, 0 ; GFX8-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5] -; GFX8-NEXT: s_cmp_lg_u32 s13, 0 +; GFX8-NEXT: s_cmp_lg_u32 s16, 0 ; GFX8-NEXT: s_cselect_b64 s[4:5], s[6:7], 0 ; GFX8-NEXT: s_or_b64 s[0:1], s[14:15], s[0:1] ; GFX8-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] @@ -5924,40 +5916,36 @@ ; GFX9-NEXT: s_sub_i32 s9, s12, 64 ; GFX9-NEXT: s_sub_i32 s10, 64, s12 ; GFX9-NEXT: s_cmp_lt_u32 s12, 64 -; GFX9-NEXT: s_cselect_b32 s18, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s12, 0 -; GFX9-NEXT: s_cselect_b32 s19, 1, 0 -; GFX9-NEXT: s_lshl_b64 s[14:15], s[0:1], s12 +; GFX9-NEXT: s_cselect_b32 s13, 1, 0 ; GFX9-NEXT: s_lshr_b64 s[16:17], s[0:1], s10 -; GFX9-NEXT: s_lshl_b64 s[12:13], s[2:3], s12 -; GFX9-NEXT: s_or_b64 s[12:13], s[16:17], s[12:13] +; GFX9-NEXT: s_lshl_b64 s[18:19], s[2:3], s12 +; GFX9-NEXT: s_lshl_b64 s[14:15], s[0:1], s12 +; GFX9-NEXT: s_or_b64 s[16:17], s[16:17], s[18:19] ; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s9 -; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cmp_lg_u32 s13, 0 ; GFX9-NEXT: s_cselect_b64 s[14:15], s[14:15], 0 -; GFX9-NEXT: s_cselect_b64 s[0:1], s[12:13], s[0:1] -; GFX9-NEXT: s_cmp_lg_u32 s19, 0 +; GFX9-NEXT: s_cselect_b64 s[0:1], s[16:17], s[0:1] +; GFX9-NEXT: s_cmp_eq_u32 s12, 0 ; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] ; GFX9-NEXT: s_lshr_b64 s[0:1], s[4:5], 1 ; GFX9-NEXT: s_lshl_b32 s5, s6, 31 ; GFX9-NEXT: s_mov_b32 s4, s11 ; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] ; GFX9-NEXT: s_lshr_b64 s[4:5], s[6:7], 1 -; GFX9-NEXT: s_sub_i32 s12, s8, 64 -; GFX9-NEXT: s_sub_i32 s10, 64, s8 +; GFX9-NEXT: s_sub_i32 s9, s8, 64 +; GFX9-NEXT: s_sub_i32 s12, 64, s8 ; GFX9-NEXT: s_cmp_lt_u32 s8, 64 -; GFX9-NEXT: s_cselect_b32 s13, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s8, 0 ; GFX9-NEXT: s_cselect_b32 s16, 1, 0 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[0:1], s8 +; GFX9-NEXT: s_lshl_b64 s[12:13], s[4:5], s12 ; GFX9-NEXT: s_lshr_b64 s[6:7], s[4:5], s8 -; GFX9-NEXT: s_lshr_b64 s[8:9], s[0:1], s8 -; GFX9-NEXT: s_lshl_b64 s[10:11], s[4:5], s10 -; GFX9-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] -; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], s12 -; GFX9-NEXT: s_cmp_lg_u32 s13, 0 -; GFX9-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] +; GFX9-NEXT: s_or_b64 s[10:11], s[10:11], s[12:13] +; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], s9 ; GFX9-NEXT: s_cmp_lg_u32 s16, 0 +; GFX9-NEXT: s_cselect_b64 s[4:5], s[10:11], s[4:5] +; GFX9-NEXT: s_cmp_eq_u32 s8, 0 ; GFX9-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5] -; GFX9-NEXT: s_cmp_lg_u32 s13, 0 +; GFX9-NEXT: s_cmp_lg_u32 s16, 0 ; GFX9-NEXT: s_cselect_b64 s[4:5], s[6:7], 0 ; GFX9-NEXT: s_or_b64 s[0:1], s[14:15], s[0:1] ; GFX9-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] @@ -5971,42 +5959,38 @@ ; GFX10-NEXT: s_sub_i32 s9, s12, 64 ; GFX10-NEXT: s_sub_i32 s10, 64, s12 ; GFX10-NEXT: s_cmp_lt_u32 s12, 64 -; GFX10-NEXT: s_cselect_b32 s18, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s12, 0 -; GFX10-NEXT: s_cselect_b32 s19, 1, 0 +; GFX10-NEXT: s_cselect_b32 s13, 1, 0 ; GFX10-NEXT: s_lshr_b64 s[14:15], s[0:1], s10 ; GFX10-NEXT: s_lshl_b64 s[16:17], s[2:3], s12 -; GFX10-NEXT: s_lshl_b64 s[12:13], s[0:1], s12 +; GFX10-NEXT: s_lshl_b64 s[18:19], s[0:1], s12 ; GFX10-NEXT: s_or_b64 s[14:15], s[14:15], s[16:17] ; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s9 -; GFX10-NEXT: s_cmp_lg_u32 s18, 0 -; GFX10-NEXT: s_cselect_b64 s[12:13], s[12:13], 0 +; GFX10-NEXT: s_cmp_lg_u32 s13, 0 +; GFX10-NEXT: s_cselect_b64 s[16:17], s[18:19], 0 ; GFX10-NEXT: s_cselect_b64 s[0:1], s[14:15], s[0:1] -; GFX10-NEXT: s_cmp_lg_u32 s19, 0 +; GFX10-NEXT: s_cmp_eq_u32 s12, 0 ; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] ; GFX10-NEXT: s_lshr_b64 s[0:1], s[4:5], 1 ; GFX10-NEXT: s_lshl_b32 s5, s6, 31 ; GFX10-NEXT: s_mov_b32 s4, s11 -; GFX10-NEXT: s_sub_i32 s14, s8, 64 +; GFX10-NEXT: s_sub_i32 s9, s8, 64 ; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] ; GFX10-NEXT: s_lshr_b64 s[4:5], s[6:7], 1 -; GFX10-NEXT: s_sub_i32 s9, 64, s8 +; GFX10-NEXT: s_sub_i32 s10, 64, s8 ; GFX10-NEXT: s_cmp_lt_u32 s8, 64 -; GFX10-NEXT: s_cselect_b32 s15, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s8, 0 -; GFX10-NEXT: s_cselect_b32 s16, 1, 0 +; GFX10-NEXT: s_cselect_b32 s14, 1, 0 ; GFX10-NEXT: s_lshr_b64 s[6:7], s[0:1], s8 -; GFX10-NEXT: s_lshl_b64 s[10:11], s[4:5], s9 -; GFX10-NEXT: s_lshr_b64 s[8:9], s[4:5], s8 +; GFX10-NEXT: s_lshl_b64 s[10:11], s[4:5], s10 +; GFX10-NEXT: s_lshr_b64 s[12:13], s[4:5], s8 ; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11] -; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], s14 -; GFX10-NEXT: s_cmp_lg_u32 s15, 0 +; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], s9 +; GFX10-NEXT: s_cmp_lg_u32 s14, 0 ; GFX10-NEXT: s_cselect_b64 s[4:5], s[6:7], s[4:5] -; GFX10-NEXT: s_cmp_lg_u32 s16, 0 +; GFX10-NEXT: s_cmp_eq_u32 s8, 0 ; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5] -; GFX10-NEXT: s_cmp_lg_u32 s15, 0 -; GFX10-NEXT: s_cselect_b64 s[4:5], s[8:9], 0 -; GFX10-NEXT: s_or_b64 s[0:1], s[12:13], s[0:1] +; GFX10-NEXT: s_cmp_lg_u32 s14, 0 +; GFX10-NEXT: s_cselect_b64 s[4:5], s[12:13], 0 +; GFX10-NEXT: s_or_b64 s[0:1], s[16:17], s[0:1] ; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] ; GFX10-NEXT: ; return to shader part epilog ; @@ -6019,42 +6003,38 @@ ; GFX11-NEXT: s_sub_i32 s9, s12, 64 ; GFX11-NEXT: s_sub_i32 s10, 64, s12 ; GFX11-NEXT: s_cmp_lt_u32 s12, 64 -; GFX11-NEXT: s_cselect_b32 s18, 1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s12, 0 -; GFX11-NEXT: s_cselect_b32 s19, 1, 0 +; GFX11-NEXT: s_cselect_b32 s13, 1, 0 ; GFX11-NEXT: s_lshr_b64 s[14:15], s[0:1], s10 ; GFX11-NEXT: s_lshl_b64 s[16:17], s[2:3], s12 -; GFX11-NEXT: s_lshl_b64 s[12:13], s[0:1], s12 +; GFX11-NEXT: s_lshl_b64 s[18:19], s[0:1], s12 ; GFX11-NEXT: s_or_b64 s[14:15], s[14:15], s[16:17] ; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s9 -; GFX11-NEXT: s_cmp_lg_u32 s18, 0 -; GFX11-NEXT: s_cselect_b64 s[12:13], s[12:13], 0 +; GFX11-NEXT: s_cmp_lg_u32 s13, 0 +; GFX11-NEXT: s_cselect_b64 s[16:17], s[18:19], 0 ; GFX11-NEXT: s_cselect_b64 s[0:1], s[14:15], s[0:1] -; GFX11-NEXT: s_cmp_lg_u32 s19, 0 +; GFX11-NEXT: s_cmp_eq_u32 s12, 0 ; GFX11-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] ; GFX11-NEXT: s_lshr_b64 s[0:1], s[4:5], 1 ; GFX11-NEXT: s_lshl_b32 s5, s6, 31 ; GFX11-NEXT: s_mov_b32 s4, s11 -; GFX11-NEXT: s_sub_i32 s14, s8, 64 +; GFX11-NEXT: s_sub_i32 s9, s8, 64 ; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] ; GFX11-NEXT: s_lshr_b64 s[4:5], s[6:7], 1 -; GFX11-NEXT: s_sub_i32 s9, 64, s8 +; GFX11-NEXT: s_sub_i32 s10, 64, s8 ; GFX11-NEXT: s_cmp_lt_u32 s8, 64 -; GFX11-NEXT: s_cselect_b32 s15, 1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s8, 0 -; GFX11-NEXT: s_cselect_b32 s16, 1, 0 +; GFX11-NEXT: s_cselect_b32 s14, 1, 0 ; GFX11-NEXT: s_lshr_b64 s[6:7], s[0:1], s8 -; GFX11-NEXT: s_lshl_b64 s[10:11], s[4:5], s9 -; GFX11-NEXT: s_lshr_b64 s[8:9], s[4:5], s8 +; GFX11-NEXT: s_lshl_b64 s[10:11], s[4:5], s10 +; GFX11-NEXT: s_lshr_b64 s[12:13], s[4:5], s8 ; GFX11-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11] -; GFX11-NEXT: s_lshr_b64 s[4:5], s[4:5], s14 -; GFX11-NEXT: s_cmp_lg_u32 s15, 0 +; GFX11-NEXT: s_lshr_b64 s[4:5], s[4:5], s9 +; GFX11-NEXT: s_cmp_lg_u32 s14, 0 ; GFX11-NEXT: s_cselect_b64 s[4:5], s[6:7], s[4:5] -; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_cmp_eq_u32 s8, 0 ; GFX11-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5] -; GFX11-NEXT: s_cmp_lg_u32 s15, 0 -; GFX11-NEXT: s_cselect_b64 s[4:5], s[8:9], 0 -; GFX11-NEXT: s_or_b64 s[0:1], s[12:13], s[0:1] +; GFX11-NEXT: s_cmp_lg_u32 s14, 0 +; GFX11-NEXT: s_cselect_b64 s[4:5], s[12:13], 0 +; GFX11-NEXT: s_or_b64 s[0:1], s[16:17], s[0:1] ; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] ; GFX11-NEXT: ; return to shader part epilog %result = call i128 @llvm.fshl.i128(i128 %lhs, i128 %rhs, i128 %amt) @@ -6581,18 +6561,16 @@ ; GFX6-NEXT: s_sub_i32 s5, s8, 64 ; GFX6-NEXT: s_sub_i32 s9, 64, s8 ; GFX6-NEXT: s_cmp_lt_u32 s8, 64 -; GFX6-NEXT: s_cselect_b32 s12, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s8, 0 -; GFX6-NEXT: s_cselect_b32 s13, 1, 0 -; GFX6-NEXT: s_lshl_b64 s[6:7], s[0:1], s8 +; GFX6-NEXT: s_cselect_b32 s14, 1, 0 ; GFX6-NEXT: s_lshr_b64 s[10:11], s[0:1], s9 -; GFX6-NEXT: s_lshl_b64 s[8:9], s[2:3], s8 -; GFX6-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9] +; GFX6-NEXT: s_lshl_b64 s[12:13], s[2:3], s8 +; GFX6-NEXT: s_lshl_b64 s[6:7], s[0:1], s8 +; GFX6-NEXT: s_or_b64 s[10:11], s[10:11], s[12:13] ; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s5 -; GFX6-NEXT: s_cmp_lg_u32 s12, 0 +; GFX6-NEXT: s_cmp_lg_u32 s14, 0 ; GFX6-NEXT: s_cselect_b64 s[6:7], s[6:7], 0 -; GFX6-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1] -; GFX6-NEXT: s_cmp_lg_u32 s13, 0 +; GFX6-NEXT: s_cselect_b64 s[0:1], s[10:11], s[0:1] +; GFX6-NEXT: s_cmp_eq_u32 s8, 0 ; GFX6-NEXT: v_lshr_b64 v[0:1], v[0:1], 1 ; GFX6-NEXT: s_cselect_b64 s[0:1], s[2:3], s[0:1] ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 31, v2 @@ -6636,18 +6614,16 @@ ; GFX8-NEXT: s_sub_i32 s5, s8, 64 ; GFX8-NEXT: s_sub_i32 s9, 64, s8 ; GFX8-NEXT: s_cmp_lt_u32 s8, 64 -; GFX8-NEXT: s_cselect_b32 s12, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s8, 0 -; GFX8-NEXT: s_cselect_b32 s13, 1, 0 -; GFX8-NEXT: s_lshl_b64 s[6:7], s[0:1], s8 +; GFX8-NEXT: s_cselect_b32 s14, 1, 0 ; GFX8-NEXT: s_lshr_b64 s[10:11], s[0:1], s9 -; GFX8-NEXT: s_lshl_b64 s[8:9], s[2:3], s8 -; GFX8-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9] +; GFX8-NEXT: s_lshl_b64 s[12:13], s[2:3], s8 +; GFX8-NEXT: s_lshl_b64 s[6:7], s[0:1], s8 +; GFX8-NEXT: s_or_b64 s[10:11], s[10:11], s[12:13] ; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s5 -; GFX8-NEXT: s_cmp_lg_u32 s12, 0 +; GFX8-NEXT: s_cmp_lg_u32 s14, 0 ; GFX8-NEXT: s_cselect_b64 s[6:7], s[6:7], 0 -; GFX8-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1] -; GFX8-NEXT: s_cmp_lg_u32 s13, 0 +; GFX8-NEXT: s_cselect_b64 s[0:1], s[10:11], s[0:1] +; GFX8-NEXT: s_cmp_eq_u32 s8, 0 ; GFX8-NEXT: v_lshrrev_b64 v[0:1], 1, v[0:1] ; GFX8-NEXT: s_cselect_b64 s[0:1], s[2:3], s[0:1] ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 31, v2 @@ -6691,19 +6667,17 @@ ; GFX9-NEXT: s_sub_i32 s5, s8, 64 ; GFX9-NEXT: s_sub_i32 s9, 64, s8 ; GFX9-NEXT: s_cmp_lt_u32 s8, 64 -; GFX9-NEXT: s_cselect_b32 s12, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s8, 0 -; GFX9-NEXT: s_cselect_b32 s13, 1, 0 -; GFX9-NEXT: s_lshl_b64 s[6:7], s[0:1], s8 +; GFX9-NEXT: s_cselect_b32 s14, 1, 0 ; GFX9-NEXT: s_lshr_b64 s[10:11], s[0:1], s9 -; GFX9-NEXT: s_lshl_b64 s[8:9], s[2:3], s8 -; GFX9-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9] +; GFX9-NEXT: s_lshl_b64 s[12:13], s[2:3], s8 +; GFX9-NEXT: s_lshl_b64 s[6:7], s[0:1], s8 +; GFX9-NEXT: s_or_b64 s[10:11], s[10:11], s[12:13] ; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s5 -; GFX9-NEXT: s_cmp_lg_u32 s12, 0 +; GFX9-NEXT: s_cmp_lg_u32 s14, 0 ; GFX9-NEXT: v_lshrrev_b64 v[0:1], 1, v[0:1] ; GFX9-NEXT: s_cselect_b64 s[6:7], s[6:7], 0 -; GFX9-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1] -; GFX9-NEXT: s_cmp_lg_u32 s13, 0 +; GFX9-NEXT: s_cselect_b64 s[0:1], s[10:11], s[0:1] +; GFX9-NEXT: s_cmp_eq_u32 s8, 0 ; GFX9-NEXT: s_cselect_b64 s[0:1], s[2:3], s[0:1] ; GFX9-NEXT: v_lshl_or_b32 v1, v2, 31, v1 ; GFX9-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3] @@ -6747,20 +6721,18 @@ ; GFX10-NEXT: s_sub_i32 s6, 64, s8 ; GFX10-NEXT: s_cmp_lt_u32 s8, 64 ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 31, v1 -; GFX10-NEXT: s_cselect_b32 s12, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s8, 0 -; GFX10-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3] -; GFX10-NEXT: s_cselect_b32 s13, 1, 0 +; GFX10-NEXT: s_cselect_b32 s9, 1, 0 ; GFX10-NEXT: s_lshr_b64 s[6:7], s[0:1], s6 ; GFX10-NEXT: s_lshl_b64 s[10:11], s[2:3], s8 -; GFX10-NEXT: s_lshl_b64 s[8:9], s[0:1], s8 +; GFX10-NEXT: s_lshl_b64 s[12:13], s[0:1], s8 ; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11] ; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s5 -; GFX10-NEXT: s_cmp_lg_u32 s12, 0 -; GFX10-NEXT: v_lshrrev_b64 v[4:5], s4, v[0:1] -; GFX10-NEXT: s_cselect_b64 s[8:9], s[8:9], 0 +; GFX10-NEXT: s_cmp_lg_u32 s9, 0 +; GFX10-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3] +; GFX10-NEXT: s_cselect_b64 s[10:11], s[12:13], 0 ; GFX10-NEXT: s_cselect_b64 s[0:1], s[6:7], s[0:1] -; GFX10-NEXT: s_cmp_lg_u32 s13, 0 +; GFX10-NEXT: s_cmp_eq_u32 s8, 0 +; GFX10-NEXT: v_lshrrev_b64 v[4:5], s4, v[0:1] ; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] ; GFX10-NEXT: s_sub_i32 s0, 64, s4 ; GFX10-NEXT: v_lshlrev_b64 v[6:7], s0, v[2:3] @@ -6785,8 +6757,8 @@ ; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, v2, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, v3, s0 -; GFX10-NEXT: v_or_b32_e32 v0, s8, v0 -; GFX10-NEXT: v_or_b32_e32 v1, s9, v1 +; GFX10-NEXT: v_or_b32_e32 v0, s10, v0 +; GFX10-NEXT: v_or_b32_e32 v1, s11, v1 ; GFX10-NEXT: v_or_b32_e32 v2, s2, v2 ; GFX10-NEXT: v_or_b32_e32 v3, s3, v3 ; GFX10-NEXT: ; return to shader part epilog @@ -6801,20 +6773,18 @@ ; GFX11-NEXT: s_sub_i32 s6, 64, s8 ; GFX11-NEXT: s_cmp_lt_u32 s8, 64 ; GFX11-NEXT: v_lshl_or_b32 v1, v2, 31, v1 -; GFX11-NEXT: s_cselect_b32 s12, 1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s8, 0 -; GFX11-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3] -; GFX11-NEXT: s_cselect_b32 s13, 1, 0 +; GFX11-NEXT: s_cselect_b32 s9, 1, 0 ; GFX11-NEXT: s_lshr_b64 s[6:7], s[0:1], s6 ; GFX11-NEXT: s_lshl_b64 s[10:11], s[2:3], s8 -; GFX11-NEXT: s_lshl_b64 s[8:9], s[0:1], s8 +; GFX11-NEXT: s_lshl_b64 s[12:13], s[0:1], s8 ; GFX11-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11] ; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s5 -; GFX11-NEXT: s_cmp_lg_u32 s12, 0 -; GFX11-NEXT: v_lshrrev_b64 v[4:5], s4, v[0:1] -; GFX11-NEXT: s_cselect_b64 s[8:9], s[8:9], 0 +; GFX11-NEXT: s_cmp_lg_u32 s9, 0 +; GFX11-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3] +; GFX11-NEXT: s_cselect_b64 s[10:11], s[12:13], 0 ; GFX11-NEXT: s_cselect_b64 s[0:1], s[6:7], s[0:1] -; GFX11-NEXT: s_cmp_lg_u32 s13, 0 +; GFX11-NEXT: s_cmp_eq_u32 s8, 0 +; GFX11-NEXT: v_lshrrev_b64 v[4:5], s4, v[0:1] ; GFX11-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] ; GFX11-NEXT: s_sub_i32 s0, 64, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -6840,8 +6810,8 @@ ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, v2, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, v3, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_or_b32_e32 v0, s8, v0 -; GFX11-NEXT: v_or_b32_e32 v1, s9, v1 +; GFX11-NEXT: v_or_b32_e32 v0, s10, v0 +; GFX11-NEXT: v_or_b32_e32 v1, s11, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_or_b32_e32 v2, s2, v2 ; GFX11-NEXT: v_or_b32_e32 v3, s3, v3 @@ -6862,42 +6832,40 @@ ; GFX6-NEXT: s_cmp_lt_u32 s8, 64 ; GFX6-NEXT: s_cselect_b32 s9, 1, 0 ; GFX6-NEXT: s_cmp_eq_u32 s8, 0 -; GFX6-NEXT: s_cselect_b32 s10, 1, 0 ; GFX6-NEXT: v_lshr_b64 v[4:5], v[0:1], s6 ; GFX6-NEXT: v_lshl_b64 v[6:7], v[2:3], s8 +; GFX6-NEXT: s_cselect_b32 s10, 1, 0 ; GFX6-NEXT: v_lshl_b64 v[8:9], v[0:1], s8 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], s5 ; GFX6-NEXT: s_and_b32 s5, 1, s9 +; GFX6-NEXT: v_or_b32_e32 v4, v4, v6 +; GFX6-NEXT: v_or_b32_e32 v5, v5, v7 +; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5 +; GFX6-NEXT: s_and_b32 s5, 1, s10 ; GFX6-NEXT: s_lshr_b64 s[0:1], s[0:1], 1 ; GFX6-NEXT: s_lshl_b32 s9, s2, 31 ; GFX6-NEXT: s_mov_b32 s8, s7 -; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5 -; GFX6-NEXT: s_and_b32 s5, 1, s10 -; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] -; GFX6-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 -; GFX6-NEXT: s_sub_i32 s10, s4, 64 -; GFX6-NEXT: s_sub_i32 s8, 64, s4 -; GFX6-NEXT: s_cmp_lt_u32 s4, 64 -; GFX6-NEXT: v_or_b32_e32 v4, v4, v6 -; GFX6-NEXT: v_or_b32_e32 v5, v5, v7 -; GFX6-NEXT: s_cselect_b32 s11, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s4, 0 ; GFX6-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5 +; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] +; GFX6-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 +; GFX6-NEXT: s_sub_i32 s5, s4, 64 +; GFX6-NEXT: s_sub_i32 s10, 64, s4 +; GFX6-NEXT: s_cmp_lt_u32 s4, 64 ; GFX6-NEXT: s_cselect_b32 s12, 1, 0 +; GFX6-NEXT: s_lshr_b64 s[8:9], s[0:1], s4 +; GFX6-NEXT: s_lshl_b64 s[10:11], s[2:3], s10 ; GFX6-NEXT: s_lshr_b64 s[6:7], s[2:3], s4 -; GFX6-NEXT: s_lshr_b64 s[4:5], s[0:1], s4 -; GFX6-NEXT: s_lshl_b64 s[8:9], s[2:3], s8 -; GFX6-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] -; GFX6-NEXT: s_lshr_b64 s[2:3], s[2:3], s10 -; GFX6-NEXT: s_cmp_lg_u32 s11, 0 -; GFX6-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3] +; GFX6-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] +; GFX6-NEXT: s_lshr_b64 s[2:3], s[2:3], s5 ; GFX6-NEXT: s_cmp_lg_u32 s12, 0 +; GFX6-NEXT: s_cselect_b64 s[2:3], s[8:9], s[2:3] +; GFX6-NEXT: s_cmp_eq_u32 s4, 0 ; GFX6-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3] -; GFX6-NEXT: s_cmp_lg_u32 s11, 0 +; GFX6-NEXT: s_cmp_lg_u32 s12, 0 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc ; GFX6-NEXT: s_cselect_b64 s[2:3], s[6:7], 0 @@ -6917,42 +6885,40 @@ ; GFX8-NEXT: s_cmp_lt_u32 s8, 64 ; GFX8-NEXT: s_cselect_b32 s9, 1, 0 ; GFX8-NEXT: s_cmp_eq_u32 s8, 0 -; GFX8-NEXT: s_cselect_b32 s10, 1, 0 ; GFX8-NEXT: v_lshrrev_b64 v[4:5], s6, v[0:1] ; GFX8-NEXT: v_lshlrev_b64 v[6:7], s8, v[2:3] +; GFX8-NEXT: s_cselect_b32 s10, 1, 0 ; GFX8-NEXT: v_lshlrev_b64 v[8:9], s8, v[0:1] ; GFX8-NEXT: v_lshlrev_b64 v[0:1], s5, v[0:1] ; GFX8-NEXT: s_and_b32 s5, 1, s9 +; GFX8-NEXT: v_or_b32_e32 v4, v4, v6 +; GFX8-NEXT: v_or_b32_e32 v5, v5, v7 +; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5 +; GFX8-NEXT: s_and_b32 s5, 1, s10 ; GFX8-NEXT: s_lshr_b64 s[0:1], s[0:1], 1 ; GFX8-NEXT: s_lshl_b32 s9, s2, 31 ; GFX8-NEXT: s_mov_b32 s8, s7 -; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5 -; GFX8-NEXT: s_and_b32 s5, 1, s10 -; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] -; GFX8-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 -; GFX8-NEXT: s_sub_i32 s10, s4, 64 -; GFX8-NEXT: s_sub_i32 s8, 64, s4 -; GFX8-NEXT: s_cmp_lt_u32 s4, 64 -; GFX8-NEXT: v_or_b32_e32 v4, v4, v6 -; GFX8-NEXT: v_or_b32_e32 v5, v5, v7 -; GFX8-NEXT: s_cselect_b32 s11, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s4, 0 ; GFX8-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5 +; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] +; GFX8-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 +; GFX8-NEXT: s_sub_i32 s5, s4, 64 +; GFX8-NEXT: s_sub_i32 s10, 64, s4 +; GFX8-NEXT: s_cmp_lt_u32 s4, 64 ; GFX8-NEXT: s_cselect_b32 s12, 1, 0 +; GFX8-NEXT: s_lshr_b64 s[8:9], s[0:1], s4 +; GFX8-NEXT: s_lshl_b64 s[10:11], s[2:3], s10 ; GFX8-NEXT: s_lshr_b64 s[6:7], s[2:3], s4 -; GFX8-NEXT: s_lshr_b64 s[4:5], s[0:1], s4 -; GFX8-NEXT: s_lshl_b64 s[8:9], s[2:3], s8 -; GFX8-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] -; GFX8-NEXT: s_lshr_b64 s[2:3], s[2:3], s10 -; GFX8-NEXT: s_cmp_lg_u32 s11, 0 -; GFX8-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3] +; GFX8-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] +; GFX8-NEXT: s_lshr_b64 s[2:3], s[2:3], s5 ; GFX8-NEXT: s_cmp_lg_u32 s12, 0 +; GFX8-NEXT: s_cselect_b64 s[2:3], s[8:9], s[2:3] +; GFX8-NEXT: s_cmp_eq_u32 s4, 0 ; GFX8-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3] -; GFX8-NEXT: s_cmp_lg_u32 s11, 0 +; GFX8-NEXT: s_cmp_lg_u32 s12, 0 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc ; GFX8-NEXT: s_cselect_b64 s[2:3], s[6:7], 0 @@ -6972,42 +6938,40 @@ ; GFX9-NEXT: s_cmp_lt_u32 s8, 64 ; GFX9-NEXT: s_cselect_b32 s9, 1, 0 ; GFX9-NEXT: s_cmp_eq_u32 s8, 0 -; GFX9-NEXT: s_cselect_b32 s10, 1, 0 ; GFX9-NEXT: v_lshrrev_b64 v[4:5], s6, v[0:1] ; GFX9-NEXT: v_lshlrev_b64 v[6:7], s8, v[2:3] +; GFX9-NEXT: s_cselect_b32 s10, 1, 0 ; GFX9-NEXT: v_lshlrev_b64 v[8:9], s8, v[0:1] ; GFX9-NEXT: v_lshlrev_b64 v[0:1], s5, v[0:1] ; GFX9-NEXT: s_and_b32 s5, 1, s9 +; GFX9-NEXT: v_or_b32_e32 v4, v4, v6 +; GFX9-NEXT: v_or_b32_e32 v5, v5, v7 +; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5 +; GFX9-NEXT: s_and_b32 s5, 1, s10 ; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], 1 ; GFX9-NEXT: s_lshl_b32 s9, s2, 31 ; GFX9-NEXT: s_mov_b32 s8, s7 -; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5 -; GFX9-NEXT: s_and_b32 s5, 1, s10 -; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] -; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 -; GFX9-NEXT: s_sub_i32 s10, s4, 64 -; GFX9-NEXT: s_sub_i32 s8, 64, s4 -; GFX9-NEXT: s_cmp_lt_u32 s4, 64 -; GFX9-NEXT: v_or_b32_e32 v4, v4, v6 -; GFX9-NEXT: v_or_b32_e32 v5, v5, v7 -; GFX9-NEXT: s_cselect_b32 s11, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s4, 0 ; GFX9-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5 +; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] +; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 +; GFX9-NEXT: s_sub_i32 s5, s4, 64 +; GFX9-NEXT: s_sub_i32 s10, 64, s4 +; GFX9-NEXT: s_cmp_lt_u32 s4, 64 ; GFX9-NEXT: s_cselect_b32 s12, 1, 0 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[0:1], s4 +; GFX9-NEXT: s_lshl_b64 s[10:11], s[2:3], s10 ; GFX9-NEXT: s_lshr_b64 s[6:7], s[2:3], s4 -; GFX9-NEXT: s_lshr_b64 s[4:5], s[0:1], s4 -; GFX9-NEXT: s_lshl_b64 s[8:9], s[2:3], s8 -; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] -; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], s10 -; GFX9-NEXT: s_cmp_lg_u32 s11, 0 -; GFX9-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3] +; GFX9-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] +; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], s5 ; GFX9-NEXT: s_cmp_lg_u32 s12, 0 +; GFX9-NEXT: s_cselect_b64 s[2:3], s[8:9], s[2:3] +; GFX9-NEXT: s_cmp_eq_u32 s4, 0 ; GFX9-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u32 s11, 0 +; GFX9-NEXT: s_cmp_lg_u32 s12, 0 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc ; GFX9-NEXT: s_cselect_b64 s[2:3], s[6:7], 0 @@ -7022,8 +6986,8 @@ ; GFX10-NEXT: s_mov_b64 s[6:7], 0x7f ; GFX10-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7] ; GFX10-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5] -; GFX10-NEXT: s_sub_i32 s5, s8, 64 ; GFX10-NEXT: s_sub_i32 s6, 64, s8 +; GFX10-NEXT: s_sub_i32 s5, s8, 64 ; GFX10-NEXT: s_cmp_lt_u32 s8, 64 ; GFX10-NEXT: v_lshrrev_b64 v[4:5], s6, v[0:1] ; GFX10-NEXT: v_lshlrev_b64 v[6:7], s8, v[2:3] @@ -7032,41 +6996,39 @@ ; GFX10-NEXT: v_lshlrev_b64 v[8:9], s8, v[0:1] ; GFX10-NEXT: s_cselect_b32 s10, 1, 0 ; GFX10-NEXT: s_and_b32 s6, 1, s9 +; GFX10-NEXT: v_lshlrev_b64 v[0:1], s5, v[0:1] ; GFX10-NEXT: s_lshr_b64 s[0:1], s[0:1], 1 ; GFX10-NEXT: s_lshl_b32 s9, s2, 31 ; GFX10-NEXT: s_mov_b32 s8, s7 -; GFX10-NEXT: v_lshlrev_b64 v[0:1], s5, v[0:1] +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s6 +; GFX10-NEXT: v_or_b32_e32 v4, v4, v6 +; GFX10-NEXT: v_or_b32_e32 v5, v5, v7 ; GFX10-NEXT: s_and_b32 s5, 1, s10 ; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] ; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 -; GFX10-NEXT: s_sub_i32 s10, s4, 64 +; GFX10-NEXT: s_sub_i32 s12, s4, 64 ; GFX10-NEXT: s_sub_i32 s8, 64, s4 -; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s6 -; GFX10-NEXT: v_or_b32_e32 v4, v4, v6 -; GFX10-NEXT: v_or_b32_e32 v5, v5, v7 ; GFX10-NEXT: s_cmp_lt_u32 s4, 64 -; GFX10-NEXT: s_cselect_b32 s11, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s4, 0 ; GFX10-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc_lo -; GFX10-NEXT: s_cselect_b32 s12, 1, 0 +; GFX10-NEXT: s_cselect_b32 s13, 1, 0 ; GFX10-NEXT: s_lshr_b64 s[6:7], s[0:1], s4 ; GFX10-NEXT: s_lshl_b64 s[8:9], s[2:3], s8 ; GFX10-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s5 -; GFX10-NEXT: s_lshr_b64 s[4:5], s[2:3], s4 +; GFX10-NEXT: s_lshr_b64 s[10:11], s[2:3], s4 ; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] -; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], s10 -; GFX10-NEXT: s_cmp_lg_u32 s11, 0 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], s12 +; GFX10-NEXT: s_cmp_lg_u32 s13, 0 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc_lo ; GFX10-NEXT: s_cselect_b64 s[2:3], s[6:7], s[2:3] -; GFX10-NEXT: s_cmp_lg_u32 s12, 0 +; GFX10-NEXT: s_cmp_eq_u32 s4, 0 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc_lo ; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3] -; GFX10-NEXT: s_cmp_lg_u32 s11, 0 +; GFX10-NEXT: s_cmp_lg_u32 s13, 0 ; GFX10-NEXT: v_or_b32_e32 v0, s0, v6 -; GFX10-NEXT: s_cselect_b64 s[2:3], s[4:5], 0 +; GFX10-NEXT: s_cselect_b64 s[2:3], s[10:11], 0 ; GFX10-NEXT: v_or_b32_e32 v1, s1, v7 ; GFX10-NEXT: v_or_b32_e32 v2, s2, v2 ; GFX10-NEXT: v_or_b32_e32 v3, s3, v3 @@ -7078,8 +7040,8 @@ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7] ; GFX11-NEXT: s_and_not1_b64 s[4:5], s[6:7], s[4:5] -; GFX11-NEXT: s_sub_i32 s5, s8, 64 ; GFX11-NEXT: s_sub_i32 s6, 64, s8 +; GFX11-NEXT: s_sub_i32 s5, s8, 64 ; GFX11-NEXT: s_cmp_lt_u32 s8, 64 ; GFX11-NEXT: v_lshrrev_b64 v[4:5], s6, v[0:1] ; GFX11-NEXT: v_lshlrev_b64 v[6:7], s8, v[2:3] @@ -7088,38 +7050,36 @@ ; GFX11-NEXT: v_lshlrev_b64 v[8:9], s8, v[0:1] ; GFX11-NEXT: s_cselect_b32 s10, 1, 0 ; GFX11-NEXT: s_and_b32 s6, 1, s9 +; GFX11-NEXT: v_lshlrev_b64 v[0:1], s5, v[0:1] ; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], 1 ; GFX11-NEXT: s_lshl_b32 s9, s2, 31 ; GFX11-NEXT: s_mov_b32 s8, s7 -; GFX11-NEXT: v_lshlrev_b64 v[0:1], s5, v[0:1] +; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s6 +; GFX11-NEXT: v_or_b32_e32 v4, v4, v6 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v7 ; GFX11-NEXT: s_and_b32 s5, 1, s10 ; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] ; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 -; GFX11-NEXT: s_sub_i32 s10, s4, 64 +; GFX11-NEXT: s_sub_i32 s12, s4, 64 ; GFX11-NEXT: s_sub_i32 s8, 64, s4 -; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s6 -; GFX11-NEXT: v_or_b32_e32 v4, v4, v6 -; GFX11-NEXT: v_or_b32_e32 v5, v5, v7 ; GFX11-NEXT: s_cmp_lt_u32 s4, 64 -; GFX11-NEXT: s_cselect_b32 s11, 1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s4, 0 ; GFX11-NEXT: v_dual_cndmask_b32 v6, 0, v8 :: v_dual_cndmask_b32 v7, 0, v9 -; GFX11-NEXT: s_cselect_b32 s12, 1, 0 +; GFX11-NEXT: s_cselect_b32 s13, 1, 0 ; GFX11-NEXT: s_lshr_b64 s[6:7], s[0:1], s4 ; GFX11-NEXT: s_lshl_b64 s[8:9], s[2:3], s8 ; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v4 :: v_dual_cndmask_b32 v1, v1, v5 ; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s5 -; GFX11-NEXT: s_lshr_b64 s[4:5], s[2:3], s4 +; GFX11-NEXT: s_lshr_b64 s[10:11], s[2:3], s4 ; GFX11-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] -; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], s10 -; GFX11-NEXT: s_cmp_lg_u32 s11, 0 +; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], s12 +; GFX11-NEXT: s_cmp_lg_u32 s13, 0 ; GFX11-NEXT: v_dual_cndmask_b32 v2, v0, v2 :: v_dual_cndmask_b32 v3, v1, v3 ; GFX11-NEXT: s_cselect_b64 s[2:3], s[6:7], s[2:3] -; GFX11-NEXT: s_cmp_lg_u32 s12, 0 +; GFX11-NEXT: s_cmp_eq_u32 s4, 0 ; GFX11-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3] -; GFX11-NEXT: s_cmp_lg_u32 s11, 0 +; GFX11-NEXT: s_cmp_lg_u32 s13, 0 ; GFX11-NEXT: v_or_b32_e32 v0, s0, v6 -; GFX11-NEXT: s_cselect_b64 s[2:3], s[4:5], 0 +; GFX11-NEXT: s_cselect_b64 s[2:3], s[10:11], 0 ; GFX11-NEXT: v_or_b32_e32 v1, s1, v7 ; GFX11-NEXT: v_or_b32_e32 v2, s2, v2 ; GFX11-NEXT: v_or_b32_e32 v3, s3, v3 @@ -7258,82 +7218,74 @@ ; GFX6-NEXT: s_sub_i32 s17, s22, 64 ; GFX6-NEXT: s_sub_i32 s23, 64, s22 ; GFX6-NEXT: s_cmp_lt_u32 s22, 64 -; GFX6-NEXT: s_cselect_b32 s28, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s22, 0 -; GFX6-NEXT: s_cselect_b32 s29, 1, 0 -; GFX6-NEXT: s_lshl_b64 s[24:25], s[0:1], s22 +; GFX6-NEXT: s_cselect_b32 s30, 1, 0 ; GFX6-NEXT: s_lshr_b64 s[26:27], s[0:1], s23 -; GFX6-NEXT: s_lshl_b64 s[22:23], s[2:3], s22 -; GFX6-NEXT: s_or_b64 s[22:23], s[26:27], s[22:23] +; GFX6-NEXT: s_lshl_b64 s[28:29], s[2:3], s22 +; GFX6-NEXT: s_lshl_b64 s[24:25], s[0:1], s22 +; GFX6-NEXT: s_or_b64 s[26:27], s[26:27], s[28:29] ; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s17 -; GFX6-NEXT: s_cmp_lg_u32 s28, 0 +; GFX6-NEXT: s_cmp_lg_u32 s30, 0 ; GFX6-NEXT: s_cselect_b64 s[24:25], s[24:25], 0 -; GFX6-NEXT: s_cselect_b64 s[0:1], s[22:23], s[0:1] -; GFX6-NEXT: s_cmp_lg_u32 s29, 0 +; GFX6-NEXT: s_cselect_b64 s[0:1], s[26:27], s[0:1] +; GFX6-NEXT: s_cmp_eq_u32 s22, 0 ; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] ; GFX6-NEXT: s_lshr_b64 s[0:1], s[8:9], 1 ; GFX6-NEXT: s_lshl_b32 s9, s10, 31 ; GFX6-NEXT: s_mov_b32 s8, s19 ; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] ; GFX6-NEXT: s_lshr_b64 s[8:9], s[10:11], 1 -; GFX6-NEXT: s_sub_i32 s26, s16, 64 -; GFX6-NEXT: s_sub_i32 s22, 64, s16 +; GFX6-NEXT: s_sub_i32 s17, s16, 64 +; GFX6-NEXT: s_sub_i32 s26, 64, s16 ; GFX6-NEXT: s_cmp_lt_u32 s16, 64 -; GFX6-NEXT: s_cselect_b32 s27, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s16, 0 ; GFX6-NEXT: s_cselect_b32 s28, 1, 0 +; GFX6-NEXT: s_lshr_b64 s[22:23], s[0:1], s16 +; GFX6-NEXT: s_lshl_b64 s[26:27], s[8:9], s26 ; GFX6-NEXT: s_lshr_b64 s[10:11], s[8:9], s16 -; GFX6-NEXT: s_lshr_b64 s[16:17], s[0:1], s16 -; GFX6-NEXT: s_lshl_b64 s[22:23], s[8:9], s22 -; GFX6-NEXT: s_or_b64 s[16:17], s[16:17], s[22:23] -; GFX6-NEXT: s_lshr_b64 s[8:9], s[8:9], s26 -; GFX6-NEXT: s_cmp_lg_u32 s27, 0 -; GFX6-NEXT: s_cselect_b64 s[8:9], s[16:17], s[8:9] +; GFX6-NEXT: s_or_b64 s[22:23], s[22:23], s[26:27] +; GFX6-NEXT: s_lshr_b64 s[8:9], s[8:9], s17 ; GFX6-NEXT: s_cmp_lg_u32 s28, 0 +; GFX6-NEXT: s_cselect_b64 s[8:9], s[22:23], s[8:9] +; GFX6-NEXT: s_cmp_eq_u32 s16, 0 ; GFX6-NEXT: s_cselect_b64 s[0:1], s[0:1], s[8:9] -; GFX6-NEXT: s_cmp_lg_u32 s27, 0 +; GFX6-NEXT: s_cmp_lg_u32 s28, 0 ; GFX6-NEXT: s_cselect_b64 s[8:9], s[10:11], 0 ; GFX6-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] ; GFX6-NEXT: s_and_b64 s[8:9], s[20:21], s[18:19] ; GFX6-NEXT: s_andn2_b64 s[10:11], s[18:19], s[20:21] ; GFX6-NEXT: s_or_b64 s[0:1], s[24:25], s[0:1] -; GFX6-NEXT: s_sub_i32 s11, s8, 64 -; GFX6-NEXT: s_sub_i32 s9, 64, s8 +; GFX6-NEXT: s_sub_i32 s9, s8, 64 +; GFX6-NEXT: s_sub_i32 s11, 64, s8 ; GFX6-NEXT: s_cmp_lt_u32 s8, 64 ; GFX6-NEXT: s_cselect_b32 s18, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s8, 0 -; GFX6-NEXT: s_cselect_b32 s22, 1, 0 +; GFX6-NEXT: s_lshr_b64 s[20:21], s[4:5], s11 +; GFX6-NEXT: s_lshl_b64 s[22:23], s[6:7], s8 ; GFX6-NEXT: s_lshl_b64 s[16:17], s[4:5], s8 -; GFX6-NEXT: s_lshr_b64 s[20:21], s[4:5], s9 -; GFX6-NEXT: s_lshl_b64 s[8:9], s[6:7], s8 -; GFX6-NEXT: s_or_b64 s[8:9], s[20:21], s[8:9] -; GFX6-NEXT: s_lshl_b64 s[4:5], s[4:5], s11 +; GFX6-NEXT: s_or_b64 s[20:21], s[20:21], s[22:23] +; GFX6-NEXT: s_lshl_b64 s[4:5], s[4:5], s9 ; GFX6-NEXT: s_cmp_lg_u32 s18, 0 ; GFX6-NEXT: s_cselect_b64 s[16:17], s[16:17], 0 -; GFX6-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] -; GFX6-NEXT: s_cmp_lg_u32 s22, 0 +; GFX6-NEXT: s_cselect_b64 s[4:5], s[20:21], s[4:5] +; GFX6-NEXT: s_cmp_eq_u32 s8, 0 ; GFX6-NEXT: s_cselect_b64 s[6:7], s[6:7], s[4:5] ; GFX6-NEXT: s_lshr_b64 s[4:5], s[12:13], 1 ; GFX6-NEXT: s_lshl_b32 s9, s14, 31 ; GFX6-NEXT: s_mov_b32 s8, s19 ; GFX6-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] ; GFX6-NEXT: s_lshr_b64 s[8:9], s[14:15], 1 -; GFX6-NEXT: s_sub_i32 s18, s10, 64 -; GFX6-NEXT: s_sub_i32 s14, 64, s10 +; GFX6-NEXT: s_sub_i32 s11, s10, 64 +; GFX6-NEXT: s_sub_i32 s18, 64, s10 ; GFX6-NEXT: s_cmp_lt_u32 s10, 64 -; GFX6-NEXT: s_cselect_b32 s19, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s10, 0 ; GFX6-NEXT: s_cselect_b32 s20, 1, 0 +; GFX6-NEXT: s_lshr_b64 s[14:15], s[4:5], s10 +; GFX6-NEXT: s_lshl_b64 s[18:19], s[8:9], s18 ; GFX6-NEXT: s_lshr_b64 s[12:13], s[8:9], s10 -; GFX6-NEXT: s_lshr_b64 s[10:11], s[4:5], s10 -; GFX6-NEXT: s_lshl_b64 s[14:15], s[8:9], s14 -; GFX6-NEXT: s_or_b64 s[10:11], s[10:11], s[14:15] -; GFX6-NEXT: s_lshr_b64 s[8:9], s[8:9], s18 -; GFX6-NEXT: s_cmp_lg_u32 s19, 0 -; GFX6-NEXT: s_cselect_b64 s[8:9], s[10:11], s[8:9] +; GFX6-NEXT: s_or_b64 s[14:15], s[14:15], s[18:19] +; GFX6-NEXT: s_lshr_b64 s[8:9], s[8:9], s11 ; GFX6-NEXT: s_cmp_lg_u32 s20, 0 +; GFX6-NEXT: s_cselect_b64 s[8:9], s[14:15], s[8:9] +; GFX6-NEXT: s_cmp_eq_u32 s10, 0 ; GFX6-NEXT: s_cselect_b64 s[4:5], s[4:5], s[8:9] -; GFX6-NEXT: s_cmp_lg_u32 s19, 0 +; GFX6-NEXT: s_cmp_lg_u32 s20, 0 ; GFX6-NEXT: s_cselect_b64 s[8:9], s[12:13], 0 ; GFX6-NEXT: s_or_b64 s[4:5], s[16:17], s[4:5] ; GFX6-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] @@ -7347,82 +7299,74 @@ ; GFX8-NEXT: s_sub_i32 s17, s22, 64 ; GFX8-NEXT: s_sub_i32 s23, 64, s22 ; GFX8-NEXT: s_cmp_lt_u32 s22, 64 -; GFX8-NEXT: s_cselect_b32 s28, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s22, 0 -; GFX8-NEXT: s_cselect_b32 s29, 1, 0 -; GFX8-NEXT: s_lshl_b64 s[24:25], s[0:1], s22 +; GFX8-NEXT: s_cselect_b32 s30, 1, 0 ; GFX8-NEXT: s_lshr_b64 s[26:27], s[0:1], s23 -; GFX8-NEXT: s_lshl_b64 s[22:23], s[2:3], s22 -; GFX8-NEXT: s_or_b64 s[22:23], s[26:27], s[22:23] +; GFX8-NEXT: s_lshl_b64 s[28:29], s[2:3], s22 +; GFX8-NEXT: s_lshl_b64 s[24:25], s[0:1], s22 +; GFX8-NEXT: s_or_b64 s[26:27], s[26:27], s[28:29] ; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s17 -; GFX8-NEXT: s_cmp_lg_u32 s28, 0 +; GFX8-NEXT: s_cmp_lg_u32 s30, 0 ; GFX8-NEXT: s_cselect_b64 s[24:25], s[24:25], 0 -; GFX8-NEXT: s_cselect_b64 s[0:1], s[22:23], s[0:1] -; GFX8-NEXT: s_cmp_lg_u32 s29, 0 +; GFX8-NEXT: s_cselect_b64 s[0:1], s[26:27], s[0:1] +; GFX8-NEXT: s_cmp_eq_u32 s22, 0 ; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] ; GFX8-NEXT: s_lshr_b64 s[0:1], s[8:9], 1 ; GFX8-NEXT: s_lshl_b32 s9, s10, 31 ; GFX8-NEXT: s_mov_b32 s8, s19 ; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] ; GFX8-NEXT: s_lshr_b64 s[8:9], s[10:11], 1 -; GFX8-NEXT: s_sub_i32 s26, s16, 64 -; GFX8-NEXT: s_sub_i32 s22, 64, s16 +; GFX8-NEXT: s_sub_i32 s17, s16, 64 +; GFX8-NEXT: s_sub_i32 s26, 64, s16 ; GFX8-NEXT: s_cmp_lt_u32 s16, 64 -; GFX8-NEXT: s_cselect_b32 s27, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s16, 0 ; GFX8-NEXT: s_cselect_b32 s28, 1, 0 +; GFX8-NEXT: s_lshr_b64 s[22:23], s[0:1], s16 +; GFX8-NEXT: s_lshl_b64 s[26:27], s[8:9], s26 ; GFX8-NEXT: s_lshr_b64 s[10:11], s[8:9], s16 -; GFX8-NEXT: s_lshr_b64 s[16:17], s[0:1], s16 -; GFX8-NEXT: s_lshl_b64 s[22:23], s[8:9], s22 -; GFX8-NEXT: s_or_b64 s[16:17], s[16:17], s[22:23] -; GFX8-NEXT: s_lshr_b64 s[8:9], s[8:9], s26 -; GFX8-NEXT: s_cmp_lg_u32 s27, 0 -; GFX8-NEXT: s_cselect_b64 s[8:9], s[16:17], s[8:9] +; GFX8-NEXT: s_or_b64 s[22:23], s[22:23], s[26:27] +; GFX8-NEXT: s_lshr_b64 s[8:9], s[8:9], s17 ; GFX8-NEXT: s_cmp_lg_u32 s28, 0 +; GFX8-NEXT: s_cselect_b64 s[8:9], s[22:23], s[8:9] +; GFX8-NEXT: s_cmp_eq_u32 s16, 0 ; GFX8-NEXT: s_cselect_b64 s[0:1], s[0:1], s[8:9] -; GFX8-NEXT: s_cmp_lg_u32 s27, 0 +; GFX8-NEXT: s_cmp_lg_u32 s28, 0 ; GFX8-NEXT: s_cselect_b64 s[8:9], s[10:11], 0 ; GFX8-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] ; GFX8-NEXT: s_and_b64 s[8:9], s[20:21], s[18:19] ; GFX8-NEXT: s_andn2_b64 s[10:11], s[18:19], s[20:21] ; GFX8-NEXT: s_or_b64 s[0:1], s[24:25], s[0:1] -; GFX8-NEXT: s_sub_i32 s11, s8, 64 -; GFX8-NEXT: s_sub_i32 s9, 64, s8 +; GFX8-NEXT: s_sub_i32 s9, s8, 64 +; GFX8-NEXT: s_sub_i32 s11, 64, s8 ; GFX8-NEXT: s_cmp_lt_u32 s8, 64 ; GFX8-NEXT: s_cselect_b32 s18, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s8, 0 -; GFX8-NEXT: s_cselect_b32 s22, 1, 0 +; GFX8-NEXT: s_lshr_b64 s[20:21], s[4:5], s11 +; GFX8-NEXT: s_lshl_b64 s[22:23], s[6:7], s8 ; GFX8-NEXT: s_lshl_b64 s[16:17], s[4:5], s8 -; GFX8-NEXT: s_lshr_b64 s[20:21], s[4:5], s9 -; GFX8-NEXT: s_lshl_b64 s[8:9], s[6:7], s8 -; GFX8-NEXT: s_or_b64 s[8:9], s[20:21], s[8:9] -; GFX8-NEXT: s_lshl_b64 s[4:5], s[4:5], s11 +; GFX8-NEXT: s_or_b64 s[20:21], s[20:21], s[22:23] +; GFX8-NEXT: s_lshl_b64 s[4:5], s[4:5], s9 ; GFX8-NEXT: s_cmp_lg_u32 s18, 0 ; GFX8-NEXT: s_cselect_b64 s[16:17], s[16:17], 0 -; GFX8-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] -; GFX8-NEXT: s_cmp_lg_u32 s22, 0 +; GFX8-NEXT: s_cselect_b64 s[4:5], s[20:21], s[4:5] +; GFX8-NEXT: s_cmp_eq_u32 s8, 0 ; GFX8-NEXT: s_cselect_b64 s[6:7], s[6:7], s[4:5] ; GFX8-NEXT: s_lshr_b64 s[4:5], s[12:13], 1 ; GFX8-NEXT: s_lshl_b32 s9, s14, 31 ; GFX8-NEXT: s_mov_b32 s8, s19 ; GFX8-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] ; GFX8-NEXT: s_lshr_b64 s[8:9], s[14:15], 1 -; GFX8-NEXT: s_sub_i32 s18, s10, 64 -; GFX8-NEXT: s_sub_i32 s14, 64, s10 +; GFX8-NEXT: s_sub_i32 s11, s10, 64 +; GFX8-NEXT: s_sub_i32 s18, 64, s10 ; GFX8-NEXT: s_cmp_lt_u32 s10, 64 -; GFX8-NEXT: s_cselect_b32 s19, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s10, 0 ; GFX8-NEXT: s_cselect_b32 s20, 1, 0 +; GFX8-NEXT: s_lshr_b64 s[14:15], s[4:5], s10 +; GFX8-NEXT: s_lshl_b64 s[18:19], s[8:9], s18 ; GFX8-NEXT: s_lshr_b64 s[12:13], s[8:9], s10 -; GFX8-NEXT: s_lshr_b64 s[10:11], s[4:5], s10 -; GFX8-NEXT: s_lshl_b64 s[14:15], s[8:9], s14 -; GFX8-NEXT: s_or_b64 s[10:11], s[10:11], s[14:15] -; GFX8-NEXT: s_lshr_b64 s[8:9], s[8:9], s18 -; GFX8-NEXT: s_cmp_lg_u32 s19, 0 -; GFX8-NEXT: s_cselect_b64 s[8:9], s[10:11], s[8:9] +; GFX8-NEXT: s_or_b64 s[14:15], s[14:15], s[18:19] +; GFX8-NEXT: s_lshr_b64 s[8:9], s[8:9], s11 ; GFX8-NEXT: s_cmp_lg_u32 s20, 0 +; GFX8-NEXT: s_cselect_b64 s[8:9], s[14:15], s[8:9] +; GFX8-NEXT: s_cmp_eq_u32 s10, 0 ; GFX8-NEXT: s_cselect_b64 s[4:5], s[4:5], s[8:9] -; GFX8-NEXT: s_cmp_lg_u32 s19, 0 +; GFX8-NEXT: s_cmp_lg_u32 s20, 0 ; GFX8-NEXT: s_cselect_b64 s[8:9], s[12:13], 0 ; GFX8-NEXT: s_or_b64 s[4:5], s[16:17], s[4:5] ; GFX8-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] @@ -7436,82 +7380,74 @@ ; GFX9-NEXT: s_sub_i32 s17, s22, 64 ; GFX9-NEXT: s_sub_i32 s23, 64, s22 ; GFX9-NEXT: s_cmp_lt_u32 s22, 64 -; GFX9-NEXT: s_cselect_b32 s28, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s22, 0 -; GFX9-NEXT: s_cselect_b32 s29, 1, 0 -; GFX9-NEXT: s_lshl_b64 s[24:25], s[0:1], s22 +; GFX9-NEXT: s_cselect_b32 s30, 1, 0 ; GFX9-NEXT: s_lshr_b64 s[26:27], s[0:1], s23 -; GFX9-NEXT: s_lshl_b64 s[22:23], s[2:3], s22 -; GFX9-NEXT: s_or_b64 s[22:23], s[26:27], s[22:23] +; GFX9-NEXT: s_lshl_b64 s[28:29], s[2:3], s22 +; GFX9-NEXT: s_lshl_b64 s[24:25], s[0:1], s22 +; GFX9-NEXT: s_or_b64 s[26:27], s[26:27], s[28:29] ; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s17 -; GFX9-NEXT: s_cmp_lg_u32 s28, 0 +; GFX9-NEXT: s_cmp_lg_u32 s30, 0 ; GFX9-NEXT: s_cselect_b64 s[24:25], s[24:25], 0 -; GFX9-NEXT: s_cselect_b64 s[0:1], s[22:23], s[0:1] -; GFX9-NEXT: s_cmp_lg_u32 s29, 0 +; GFX9-NEXT: s_cselect_b64 s[0:1], s[26:27], s[0:1] +; GFX9-NEXT: s_cmp_eq_u32 s22, 0 ; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] ; GFX9-NEXT: s_lshr_b64 s[0:1], s[8:9], 1 ; GFX9-NEXT: s_lshl_b32 s9, s10, 31 ; GFX9-NEXT: s_mov_b32 s8, s19 ; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] ; GFX9-NEXT: s_lshr_b64 s[8:9], s[10:11], 1 -; GFX9-NEXT: s_sub_i32 s26, s16, 64 -; GFX9-NEXT: s_sub_i32 s22, 64, s16 +; GFX9-NEXT: s_sub_i32 s17, s16, 64 +; GFX9-NEXT: s_sub_i32 s26, 64, s16 ; GFX9-NEXT: s_cmp_lt_u32 s16, 64 -; GFX9-NEXT: s_cselect_b32 s27, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s16, 0 ; GFX9-NEXT: s_cselect_b32 s28, 1, 0 +; GFX9-NEXT: s_lshr_b64 s[22:23], s[0:1], s16 +; GFX9-NEXT: s_lshl_b64 s[26:27], s[8:9], s26 ; GFX9-NEXT: s_lshr_b64 s[10:11], s[8:9], s16 -; GFX9-NEXT: s_lshr_b64 s[16:17], s[0:1], s16 -; GFX9-NEXT: s_lshl_b64 s[22:23], s[8:9], s22 -; GFX9-NEXT: s_or_b64 s[16:17], s[16:17], s[22:23] -; GFX9-NEXT: s_lshr_b64 s[8:9], s[8:9], s26 -; GFX9-NEXT: s_cmp_lg_u32 s27, 0 -; GFX9-NEXT: s_cselect_b64 s[8:9], s[16:17], s[8:9] +; GFX9-NEXT: s_or_b64 s[22:23], s[22:23], s[26:27] +; GFX9-NEXT: s_lshr_b64 s[8:9], s[8:9], s17 ; GFX9-NEXT: s_cmp_lg_u32 s28, 0 +; GFX9-NEXT: s_cselect_b64 s[8:9], s[22:23], s[8:9] +; GFX9-NEXT: s_cmp_eq_u32 s16, 0 ; GFX9-NEXT: s_cselect_b64 s[0:1], s[0:1], s[8:9] -; GFX9-NEXT: s_cmp_lg_u32 s27, 0 +; GFX9-NEXT: s_cmp_lg_u32 s28, 0 ; GFX9-NEXT: s_cselect_b64 s[8:9], s[10:11], 0 ; GFX9-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] ; GFX9-NEXT: s_and_b64 s[8:9], s[20:21], s[18:19] ; GFX9-NEXT: s_andn2_b64 s[10:11], s[18:19], s[20:21] ; GFX9-NEXT: s_or_b64 s[0:1], s[24:25], s[0:1] -; GFX9-NEXT: s_sub_i32 s11, s8, 64 -; GFX9-NEXT: s_sub_i32 s9, 64, s8 +; GFX9-NEXT: s_sub_i32 s9, s8, 64 +; GFX9-NEXT: s_sub_i32 s11, 64, s8 ; GFX9-NEXT: s_cmp_lt_u32 s8, 64 ; GFX9-NEXT: s_cselect_b32 s18, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s8, 0 -; GFX9-NEXT: s_cselect_b32 s22, 1, 0 +; GFX9-NEXT: s_lshr_b64 s[20:21], s[4:5], s11 +; GFX9-NEXT: s_lshl_b64 s[22:23], s[6:7], s8 ; GFX9-NEXT: s_lshl_b64 s[16:17], s[4:5], s8 -; GFX9-NEXT: s_lshr_b64 s[20:21], s[4:5], s9 -; GFX9-NEXT: s_lshl_b64 s[8:9], s[6:7], s8 -; GFX9-NEXT: s_or_b64 s[8:9], s[20:21], s[8:9] -; GFX9-NEXT: s_lshl_b64 s[4:5], s[4:5], s11 +; GFX9-NEXT: s_or_b64 s[20:21], s[20:21], s[22:23] +; GFX9-NEXT: s_lshl_b64 s[4:5], s[4:5], s9 ; GFX9-NEXT: s_cmp_lg_u32 s18, 0 ; GFX9-NEXT: s_cselect_b64 s[16:17], s[16:17], 0 -; GFX9-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] -; GFX9-NEXT: s_cmp_lg_u32 s22, 0 +; GFX9-NEXT: s_cselect_b64 s[4:5], s[20:21], s[4:5] +; GFX9-NEXT: s_cmp_eq_u32 s8, 0 ; GFX9-NEXT: s_cselect_b64 s[6:7], s[6:7], s[4:5] ; GFX9-NEXT: s_lshr_b64 s[4:5], s[12:13], 1 ; GFX9-NEXT: s_lshl_b32 s9, s14, 31 ; GFX9-NEXT: s_mov_b32 s8, s19 ; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] ; GFX9-NEXT: s_lshr_b64 s[8:9], s[14:15], 1 -; GFX9-NEXT: s_sub_i32 s18, s10, 64 -; GFX9-NEXT: s_sub_i32 s14, 64, s10 +; GFX9-NEXT: s_sub_i32 s11, s10, 64 +; GFX9-NEXT: s_sub_i32 s18, 64, s10 ; GFX9-NEXT: s_cmp_lt_u32 s10, 64 -; GFX9-NEXT: s_cselect_b32 s19, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s10, 0 ; GFX9-NEXT: s_cselect_b32 s20, 1, 0 +; GFX9-NEXT: s_lshr_b64 s[14:15], s[4:5], s10 +; GFX9-NEXT: s_lshl_b64 s[18:19], s[8:9], s18 ; GFX9-NEXT: s_lshr_b64 s[12:13], s[8:9], s10 -; GFX9-NEXT: s_lshr_b64 s[10:11], s[4:5], s10 -; GFX9-NEXT: s_lshl_b64 s[14:15], s[8:9], s14 -; GFX9-NEXT: s_or_b64 s[10:11], s[10:11], s[14:15] -; GFX9-NEXT: s_lshr_b64 s[8:9], s[8:9], s18 -; GFX9-NEXT: s_cmp_lg_u32 s19, 0 -; GFX9-NEXT: s_cselect_b64 s[8:9], s[10:11], s[8:9] +; GFX9-NEXT: s_or_b64 s[14:15], s[14:15], s[18:19] +; GFX9-NEXT: s_lshr_b64 s[8:9], s[8:9], s11 ; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cselect_b64 s[8:9], s[14:15], s[8:9] +; GFX9-NEXT: s_cmp_eq_u32 s10, 0 ; GFX9-NEXT: s_cselect_b64 s[4:5], s[4:5], s[8:9] -; GFX9-NEXT: s_cmp_lg_u32 s19, 0 +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 ; GFX9-NEXT: s_cselect_b64 s[8:9], s[12:13], 0 ; GFX9-NEXT: s_or_b64 s[4:5], s[16:17], s[4:5] ; GFX9-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] @@ -7525,85 +7461,77 @@ ; GFX10-NEXT: s_sub_i32 s17, s22, 64 ; GFX10-NEXT: s_sub_i32 s23, 64, s22 ; GFX10-NEXT: s_cmp_lt_u32 s22, 64 -; GFX10-NEXT: s_cselect_b32 s28, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s22, 0 -; GFX10-NEXT: s_cselect_b32 s29, 1, 0 +; GFX10-NEXT: s_cselect_b32 s30, 1, 0 ; GFX10-NEXT: s_lshr_b64 s[24:25], s[0:1], s23 ; GFX10-NEXT: s_lshl_b64 s[26:27], s[2:3], s22 -; GFX10-NEXT: s_lshl_b64 s[22:23], s[0:1], s22 +; GFX10-NEXT: s_lshl_b64 s[28:29], s[0:1], s22 ; GFX10-NEXT: s_or_b64 s[24:25], s[24:25], s[26:27] ; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s17 -; GFX10-NEXT: s_cmp_lg_u32 s28, 0 -; GFX10-NEXT: s_cselect_b64 s[22:23], s[22:23], 0 +; GFX10-NEXT: s_cmp_lg_u32 s30, 0 +; GFX10-NEXT: s_cselect_b64 s[26:27], s[28:29], 0 ; GFX10-NEXT: s_cselect_b64 s[0:1], s[24:25], s[0:1] -; GFX10-NEXT: s_cmp_lg_u32 s29, 0 +; GFX10-NEXT: s_cmp_eq_u32 s22, 0 ; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] ; GFX10-NEXT: s_lshr_b64 s[0:1], s[8:9], 1 ; GFX10-NEXT: s_lshl_b32 s9, s10, 31 ; GFX10-NEXT: s_mov_b32 s8, s19 -; GFX10-NEXT: s_sub_i32 s26, s16, 64 +; GFX10-NEXT: s_sub_i32 s17, s16, 64 ; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] ; GFX10-NEXT: s_lshr_b64 s[8:9], s[10:11], 1 -; GFX10-NEXT: s_sub_i32 s17, 64, s16 +; GFX10-NEXT: s_sub_i32 s22, 64, s16 ; GFX10-NEXT: s_cmp_lt_u32 s16, 64 -; GFX10-NEXT: s_cselect_b32 s27, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s16, 0 ; GFX10-NEXT: s_cselect_b32 s28, 1, 0 ; GFX10-NEXT: s_lshr_b64 s[10:11], s[0:1], s16 -; GFX10-NEXT: s_lshl_b64 s[24:25], s[8:9], s17 -; GFX10-NEXT: s_lshr_b64 s[16:17], s[8:9], s16 -; GFX10-NEXT: s_or_b64 s[10:11], s[10:11], s[24:25] -; GFX10-NEXT: s_lshr_b64 s[8:9], s[8:9], s26 -; GFX10-NEXT: s_cmp_lg_u32 s27, 0 -; GFX10-NEXT: s_cselect_b64 s[8:9], s[10:11], s[8:9] +; GFX10-NEXT: s_lshl_b64 s[22:23], s[8:9], s22 +; GFX10-NEXT: s_lshr_b64 s[24:25], s[8:9], s16 +; GFX10-NEXT: s_or_b64 s[10:11], s[10:11], s[22:23] +; GFX10-NEXT: s_lshr_b64 s[8:9], s[8:9], s17 ; GFX10-NEXT: s_cmp_lg_u32 s28, 0 +; GFX10-NEXT: s_cselect_b64 s[8:9], s[10:11], s[8:9] +; GFX10-NEXT: s_cmp_eq_u32 s16, 0 ; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[8:9] -; GFX10-NEXT: s_cmp_lg_u32 s27, 0 -; GFX10-NEXT: s_cselect_b64 s[8:9], s[16:17], 0 +; GFX10-NEXT: s_cmp_lg_u32 s28, 0 +; GFX10-NEXT: s_cselect_b64 s[8:9], s[24:25], 0 ; GFX10-NEXT: s_andn2_b64 s[10:11], s[18:19], s[20:21] ; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] ; GFX10-NEXT: s_and_b64 s[8:9], s[20:21], s[18:19] -; GFX10-NEXT: s_or_b64 s[0:1], s[22:23], s[0:1] -; GFX10-NEXT: s_sub_i32 s11, s8, 64 -; GFX10-NEXT: s_sub_i32 s9, 64, s8 +; GFX10-NEXT: s_or_b64 s[0:1], s[26:27], s[0:1] +; GFX10-NEXT: s_sub_i32 s9, s8, 64 +; GFX10-NEXT: s_sub_i32 s11, 64, s8 ; GFX10-NEXT: s_cmp_lt_u32 s8, 64 ; GFX10-NEXT: s_cselect_b32 s18, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s8, 0 -; GFX10-NEXT: s_cselect_b32 s22, 1, 0 -; GFX10-NEXT: s_lshr_b64 s[16:17], s[4:5], s9 +; GFX10-NEXT: s_lshr_b64 s[16:17], s[4:5], s11 ; GFX10-NEXT: s_lshl_b64 s[20:21], s[6:7], s8 -; GFX10-NEXT: s_lshl_b64 s[8:9], s[4:5], s8 +; GFX10-NEXT: s_lshl_b64 s[22:23], s[4:5], s8 ; GFX10-NEXT: s_or_b64 s[16:17], s[16:17], s[20:21] -; GFX10-NEXT: s_lshl_b64 s[4:5], s[4:5], s11 +; GFX10-NEXT: s_lshl_b64 s[4:5], s[4:5], s9 ; GFX10-NEXT: s_cmp_lg_u32 s18, 0 -; GFX10-NEXT: s_cselect_b64 s[8:9], s[8:9], 0 +; GFX10-NEXT: s_cselect_b64 s[20:21], s[22:23], 0 ; GFX10-NEXT: s_cselect_b64 s[4:5], s[16:17], s[4:5] -; GFX10-NEXT: s_cmp_lg_u32 s22, 0 +; GFX10-NEXT: s_cmp_eq_u32 s8, 0 +; GFX10-NEXT: s_mov_b32 s8, s19 ; GFX10-NEXT: s_cselect_b64 s[6:7], s[6:7], s[4:5] ; GFX10-NEXT: s_lshr_b64 s[4:5], s[12:13], 1 -; GFX10-NEXT: s_lshl_b32 s13, s14, 31 -; GFX10-NEXT: s_mov_b32 s12, s19 -; GFX10-NEXT: s_sub_i32 s18, s10, 64 -; GFX10-NEXT: s_or_b64 s[4:5], s[4:5], s[12:13] -; GFX10-NEXT: s_lshr_b64 s[12:13], s[14:15], 1 -; GFX10-NEXT: s_sub_i32 s11, 64, s10 +; GFX10-NEXT: s_lshl_b32 s9, s14, 31 +; GFX10-NEXT: s_sub_i32 s11, s10, 64 +; GFX10-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] +; GFX10-NEXT: s_lshr_b64 s[8:9], s[14:15], 1 +; GFX10-NEXT: s_sub_i32 s14, 64, s10 ; GFX10-NEXT: s_cmp_lt_u32 s10, 64 -; GFX10-NEXT: s_cselect_b32 s19, 1, 0 +; GFX10-NEXT: s_cselect_b32 s18, 1, 0 +; GFX10-NEXT: s_lshr_b64 s[12:13], s[4:5], s10 +; GFX10-NEXT: s_lshl_b64 s[14:15], s[8:9], s14 +; GFX10-NEXT: s_lshr_b64 s[16:17], s[8:9], s10 +; GFX10-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15] +; GFX10-NEXT: s_lshr_b64 s[8:9], s[8:9], s11 +; GFX10-NEXT: s_cmp_lg_u32 s18, 0 +; GFX10-NEXT: s_cselect_b64 s[8:9], s[12:13], s[8:9] ; GFX10-NEXT: s_cmp_eq_u32 s10, 0 -; GFX10-NEXT: s_cselect_b32 s20, 1, 0 -; GFX10-NEXT: s_lshr_b64 s[14:15], s[4:5], s10 -; GFX10-NEXT: s_lshl_b64 s[16:17], s[12:13], s11 -; GFX10-NEXT: s_lshr_b64 s[10:11], s[12:13], s10 -; GFX10-NEXT: s_or_b64 s[14:15], s[14:15], s[16:17] -; GFX10-NEXT: s_lshr_b64 s[12:13], s[12:13], s18 -; GFX10-NEXT: s_cmp_lg_u32 s19, 0 -; GFX10-NEXT: s_cselect_b64 s[12:13], s[14:15], s[12:13] -; GFX10-NEXT: s_cmp_lg_u32 s20, 0 -; GFX10-NEXT: s_cselect_b64 s[4:5], s[4:5], s[12:13] -; GFX10-NEXT: s_cmp_lg_u32 s19, 0 -; GFX10-NEXT: s_cselect_b64 s[10:11], s[10:11], 0 -; GFX10-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] -; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11] +; GFX10-NEXT: s_cselect_b64 s[4:5], s[4:5], s[8:9] +; GFX10-NEXT: s_cmp_lg_u32 s18, 0 +; GFX10-NEXT: s_cselect_b64 s[8:9], s[16:17], 0 +; GFX10-NEXT: s_or_b64 s[4:5], s[20:21], s[4:5] +; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_fshl_v2i128: @@ -7615,85 +7543,77 @@ ; GFX11-NEXT: s_sub_i32 s17, s22, 64 ; GFX11-NEXT: s_sub_i32 s23, 64, s22 ; GFX11-NEXT: s_cmp_lt_u32 s22, 64 -; GFX11-NEXT: s_cselect_b32 s28, 1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s22, 0 -; GFX11-NEXT: s_cselect_b32 s29, 1, 0 +; GFX11-NEXT: s_cselect_b32 s30, 1, 0 ; GFX11-NEXT: s_lshr_b64 s[24:25], s[0:1], s23 ; GFX11-NEXT: s_lshl_b64 s[26:27], s[2:3], s22 -; GFX11-NEXT: s_lshl_b64 s[22:23], s[0:1], s22 +; GFX11-NEXT: s_lshl_b64 s[28:29], s[0:1], s22 ; GFX11-NEXT: s_or_b64 s[24:25], s[24:25], s[26:27] ; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s17 -; GFX11-NEXT: s_cmp_lg_u32 s28, 0 -; GFX11-NEXT: s_cselect_b64 s[22:23], s[22:23], 0 +; GFX11-NEXT: s_cmp_lg_u32 s30, 0 +; GFX11-NEXT: s_cselect_b64 s[26:27], s[28:29], 0 ; GFX11-NEXT: s_cselect_b64 s[0:1], s[24:25], s[0:1] -; GFX11-NEXT: s_cmp_lg_u32 s29, 0 +; GFX11-NEXT: s_cmp_eq_u32 s22, 0 ; GFX11-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] ; GFX11-NEXT: s_lshr_b64 s[0:1], s[8:9], 1 ; GFX11-NEXT: s_lshl_b32 s9, s10, 31 ; GFX11-NEXT: s_mov_b32 s8, s19 -; GFX11-NEXT: s_sub_i32 s26, s16, 64 +; GFX11-NEXT: s_sub_i32 s17, s16, 64 ; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] ; GFX11-NEXT: s_lshr_b64 s[8:9], s[10:11], 1 -; GFX11-NEXT: s_sub_i32 s17, 64, s16 +; GFX11-NEXT: s_sub_i32 s22, 64, s16 ; GFX11-NEXT: s_cmp_lt_u32 s16, 64 -; GFX11-NEXT: s_cselect_b32 s27, 1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s16, 0 ; GFX11-NEXT: s_cselect_b32 s28, 1, 0 ; GFX11-NEXT: s_lshr_b64 s[10:11], s[0:1], s16 -; GFX11-NEXT: s_lshl_b64 s[24:25], s[8:9], s17 -; GFX11-NEXT: s_lshr_b64 s[16:17], s[8:9], s16 -; GFX11-NEXT: s_or_b64 s[10:11], s[10:11], s[24:25] -; GFX11-NEXT: s_lshr_b64 s[8:9], s[8:9], s26 -; GFX11-NEXT: s_cmp_lg_u32 s27, 0 -; GFX11-NEXT: s_cselect_b64 s[8:9], s[10:11], s[8:9] +; GFX11-NEXT: s_lshl_b64 s[22:23], s[8:9], s22 +; GFX11-NEXT: s_lshr_b64 s[24:25], s[8:9], s16 +; GFX11-NEXT: s_or_b64 s[10:11], s[10:11], s[22:23] +; GFX11-NEXT: s_lshr_b64 s[8:9], s[8:9], s17 ; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_cselect_b64 s[8:9], s[10:11], s[8:9] +; GFX11-NEXT: s_cmp_eq_u32 s16, 0 ; GFX11-NEXT: s_cselect_b64 s[0:1], s[0:1], s[8:9] -; GFX11-NEXT: s_cmp_lg_u32 s27, 0 -; GFX11-NEXT: s_cselect_b64 s[8:9], s[16:17], 0 +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_cselect_b64 s[8:9], s[24:25], 0 ; GFX11-NEXT: s_and_not1_b64 s[10:11], s[18:19], s[20:21] ; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] ; GFX11-NEXT: s_and_b64 s[8:9], s[20:21], s[18:19] -; GFX11-NEXT: s_or_b64 s[0:1], s[22:23], s[0:1] -; GFX11-NEXT: s_sub_i32 s11, s8, 64 -; GFX11-NEXT: s_sub_i32 s9, 64, s8 +; GFX11-NEXT: s_or_b64 s[0:1], s[26:27], s[0:1] +; GFX11-NEXT: s_sub_i32 s9, s8, 64 +; GFX11-NEXT: s_sub_i32 s11, 64, s8 ; GFX11-NEXT: s_cmp_lt_u32 s8, 64 ; GFX11-NEXT: s_cselect_b32 s18, 1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s8, 0 -; GFX11-NEXT: s_cselect_b32 s22, 1, 0 -; GFX11-NEXT: s_lshr_b64 s[16:17], s[4:5], s9 +; GFX11-NEXT: s_lshr_b64 s[16:17], s[4:5], s11 ; GFX11-NEXT: s_lshl_b64 s[20:21], s[6:7], s8 -; GFX11-NEXT: s_lshl_b64 s[8:9], s[4:5], s8 +; GFX11-NEXT: s_lshl_b64 s[22:23], s[4:5], s8 ; GFX11-NEXT: s_or_b64 s[16:17], s[16:17], s[20:21] -; GFX11-NEXT: s_lshl_b64 s[4:5], s[4:5], s11 +; GFX11-NEXT: s_lshl_b64 s[4:5], s[4:5], s9 ; GFX11-NEXT: s_cmp_lg_u32 s18, 0 -; GFX11-NEXT: s_cselect_b64 s[8:9], s[8:9], 0 +; GFX11-NEXT: s_cselect_b64 s[20:21], s[22:23], 0 ; GFX11-NEXT: s_cselect_b64 s[4:5], s[16:17], s[4:5] -; GFX11-NEXT: s_cmp_lg_u32 s22, 0 +; GFX11-NEXT: s_cmp_eq_u32 s8, 0 +; GFX11-NEXT: s_mov_b32 s8, s19 ; GFX11-NEXT: s_cselect_b64 s[6:7], s[6:7], s[4:5] ; GFX11-NEXT: s_lshr_b64 s[4:5], s[12:13], 1 -; GFX11-NEXT: s_lshl_b32 s13, s14, 31 -; GFX11-NEXT: s_mov_b32 s12, s19 -; GFX11-NEXT: s_sub_i32 s18, s10, 64 -; GFX11-NEXT: s_or_b64 s[4:5], s[4:5], s[12:13] -; GFX11-NEXT: s_lshr_b64 s[12:13], s[14:15], 1 -; GFX11-NEXT: s_sub_i32 s11, 64, s10 +; GFX11-NEXT: s_lshl_b32 s9, s14, 31 +; GFX11-NEXT: s_sub_i32 s11, s10, 64 +; GFX11-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] +; GFX11-NEXT: s_lshr_b64 s[8:9], s[14:15], 1 +; GFX11-NEXT: s_sub_i32 s14, 64, s10 ; GFX11-NEXT: s_cmp_lt_u32 s10, 64 -; GFX11-NEXT: s_cselect_b32 s19, 1, 0 +; GFX11-NEXT: s_cselect_b32 s18, 1, 0 +; GFX11-NEXT: s_lshr_b64 s[12:13], s[4:5], s10 +; GFX11-NEXT: s_lshl_b64 s[14:15], s[8:9], s14 +; GFX11-NEXT: s_lshr_b64 s[16:17], s[8:9], s10 +; GFX11-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15] +; GFX11-NEXT: s_lshr_b64 s[8:9], s[8:9], s11 +; GFX11-NEXT: s_cmp_lg_u32 s18, 0 +; GFX11-NEXT: s_cselect_b64 s[8:9], s[12:13], s[8:9] ; GFX11-NEXT: s_cmp_eq_u32 s10, 0 -; GFX11-NEXT: s_cselect_b32 s20, 1, 0 -; GFX11-NEXT: s_lshr_b64 s[14:15], s[4:5], s10 -; GFX11-NEXT: s_lshl_b64 s[16:17], s[12:13], s11 -; GFX11-NEXT: s_lshr_b64 s[10:11], s[12:13], s10 -; GFX11-NEXT: s_or_b64 s[14:15], s[14:15], s[16:17] -; GFX11-NEXT: s_lshr_b64 s[12:13], s[12:13], s18 -; GFX11-NEXT: s_cmp_lg_u32 s19, 0 -; GFX11-NEXT: s_cselect_b64 s[12:13], s[14:15], s[12:13] -; GFX11-NEXT: s_cmp_lg_u32 s20, 0 -; GFX11-NEXT: s_cselect_b64 s[4:5], s[4:5], s[12:13] -; GFX11-NEXT: s_cmp_lg_u32 s19, 0 -; GFX11-NEXT: s_cselect_b64 s[10:11], s[10:11], 0 -; GFX11-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] -; GFX11-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11] +; GFX11-NEXT: s_cselect_b64 s[4:5], s[4:5], s[8:9] +; GFX11-NEXT: s_cmp_lg_u32 s18, 0 +; GFX11-NEXT: s_cselect_b64 s[8:9], s[16:17], 0 +; GFX11-NEXT: s_or_b64 s[4:5], s[20:21], s[4:5] +; GFX11-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] ; GFX11-NEXT: ; return to shader part epilog %result = call <2 x i128> @llvm.fshl.v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %amt) ret <2 x i128> %result Index: llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll @@ -5888,38 +5888,34 @@ ; GFX6-NEXT: s_lshr_b32 s10, s1, 31 ; GFX6-NEXT: s_lshl_b64 s[14:15], s[0:1], 1 ; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[10:11] -; GFX6-NEXT: s_sub_i32 s13, s8, 64 -; GFX6-NEXT: s_sub_i32 s9, 64, s8 +; GFX6-NEXT: s_sub_i32 s9, s8, 64 +; GFX6-NEXT: s_sub_i32 s10, 64, s8 ; GFX6-NEXT: s_cmp_lt_u32 s8, 64 -; GFX6-NEXT: s_cselect_b32 s16, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s8, 0 -; GFX6-NEXT: s_cselect_b32 s17, 1, 0 +; GFX6-NEXT: s_cselect_b32 s13, 1, 0 +; GFX6-NEXT: s_lshr_b64 s[10:11], s[14:15], s10 +; GFX6-NEXT: s_lshl_b64 s[16:17], s[0:1], s8 ; GFX6-NEXT: s_lshl_b64 s[2:3], s[14:15], s8 -; GFX6-NEXT: s_lshr_b64 s[10:11], s[14:15], s9 -; GFX6-NEXT: s_lshl_b64 s[8:9], s[0:1], s8 -; GFX6-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9] -; GFX6-NEXT: s_lshl_b64 s[10:11], s[14:15], s13 -; GFX6-NEXT: s_cmp_lg_u32 s16, 0 +; GFX6-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17] +; GFX6-NEXT: s_lshl_b64 s[14:15], s[14:15], s9 +; GFX6-NEXT: s_cmp_lg_u32 s13, 0 ; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 -; GFX6-NEXT: s_cselect_b64 s[8:9], s[8:9], s[10:11] -; GFX6-NEXT: s_cmp_lg_u32 s17, 0 -; GFX6-NEXT: s_cselect_b64 s[8:9], s[0:1], s[8:9] -; GFX6-NEXT: s_sub_i32 s14, s12, 64 -; GFX6-NEXT: s_sub_i32 s13, 64, s12 +; GFX6-NEXT: s_cselect_b64 s[10:11], s[10:11], s[14:15] +; GFX6-NEXT: s_cmp_eq_u32 s8, 0 +; GFX6-NEXT: s_cselect_b64 s[8:9], s[0:1], s[10:11] +; GFX6-NEXT: s_sub_i32 s13, s12, 64 +; GFX6-NEXT: s_sub_i32 s14, 64, s12 ; GFX6-NEXT: s_cmp_lt_u32 s12, 64 -; GFX6-NEXT: s_cselect_b32 s15, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s12, 0 ; GFX6-NEXT: s_cselect_b32 s16, 1, 0 -; GFX6-NEXT: s_lshr_b64 s[0:1], s[6:7], s12 ; GFX6-NEXT: s_lshr_b64 s[10:11], s[4:5], s12 -; GFX6-NEXT: s_lshl_b64 s[12:13], s[6:7], s13 -; GFX6-NEXT: s_or_b64 s[10:11], s[10:11], s[12:13] -; GFX6-NEXT: s_lshr_b64 s[6:7], s[6:7], s14 -; GFX6-NEXT: s_cmp_lg_u32 s15, 0 -; GFX6-NEXT: s_cselect_b64 s[6:7], s[10:11], s[6:7] +; GFX6-NEXT: s_lshl_b64 s[14:15], s[6:7], s14 +; GFX6-NEXT: s_lshr_b64 s[0:1], s[6:7], s12 +; GFX6-NEXT: s_or_b64 s[10:11], s[10:11], s[14:15] +; GFX6-NEXT: s_lshr_b64 s[6:7], s[6:7], s13 ; GFX6-NEXT: s_cmp_lg_u32 s16, 0 +; GFX6-NEXT: s_cselect_b64 s[6:7], s[10:11], s[6:7] +; GFX6-NEXT: s_cmp_eq_u32 s12, 0 ; GFX6-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7] -; GFX6-NEXT: s_cmp_lg_u32 s15, 0 +; GFX6-NEXT: s_cmp_lg_u32 s16, 0 ; GFX6-NEXT: s_cselect_b64 s[6:7], s[0:1], 0 ; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[4:5] ; GFX6-NEXT: s_or_b64 s[2:3], s[8:9], s[6:7] @@ -5935,38 +5931,34 @@ ; GFX8-NEXT: s_lshr_b32 s10, s1, 31 ; GFX8-NEXT: s_lshl_b64 s[14:15], s[0:1], 1 ; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[10:11] -; GFX8-NEXT: s_sub_i32 s13, s8, 64 -; GFX8-NEXT: s_sub_i32 s9, 64, s8 +; GFX8-NEXT: s_sub_i32 s9, s8, 64 +; GFX8-NEXT: s_sub_i32 s10, 64, s8 ; GFX8-NEXT: s_cmp_lt_u32 s8, 64 -; GFX8-NEXT: s_cselect_b32 s16, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s8, 0 -; GFX8-NEXT: s_cselect_b32 s17, 1, 0 +; GFX8-NEXT: s_cselect_b32 s13, 1, 0 +; GFX8-NEXT: s_lshr_b64 s[10:11], s[14:15], s10 +; GFX8-NEXT: s_lshl_b64 s[16:17], s[0:1], s8 ; GFX8-NEXT: s_lshl_b64 s[2:3], s[14:15], s8 -; GFX8-NEXT: s_lshr_b64 s[10:11], s[14:15], s9 -; GFX8-NEXT: s_lshl_b64 s[8:9], s[0:1], s8 -; GFX8-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9] -; GFX8-NEXT: s_lshl_b64 s[10:11], s[14:15], s13 -; GFX8-NEXT: s_cmp_lg_u32 s16, 0 +; GFX8-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17] +; GFX8-NEXT: s_lshl_b64 s[14:15], s[14:15], s9 +; GFX8-NEXT: s_cmp_lg_u32 s13, 0 ; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 -; GFX8-NEXT: s_cselect_b64 s[8:9], s[8:9], s[10:11] -; GFX8-NEXT: s_cmp_lg_u32 s17, 0 -; GFX8-NEXT: s_cselect_b64 s[8:9], s[0:1], s[8:9] -; GFX8-NEXT: s_sub_i32 s14, s12, 64 -; GFX8-NEXT: s_sub_i32 s13, 64, s12 +; GFX8-NEXT: s_cselect_b64 s[10:11], s[10:11], s[14:15] +; GFX8-NEXT: s_cmp_eq_u32 s8, 0 +; GFX8-NEXT: s_cselect_b64 s[8:9], s[0:1], s[10:11] +; GFX8-NEXT: s_sub_i32 s13, s12, 64 +; GFX8-NEXT: s_sub_i32 s14, 64, s12 ; GFX8-NEXT: s_cmp_lt_u32 s12, 64 -; GFX8-NEXT: s_cselect_b32 s15, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s12, 0 ; GFX8-NEXT: s_cselect_b32 s16, 1, 0 -; GFX8-NEXT: s_lshr_b64 s[0:1], s[6:7], s12 ; GFX8-NEXT: s_lshr_b64 s[10:11], s[4:5], s12 -; GFX8-NEXT: s_lshl_b64 s[12:13], s[6:7], s13 -; GFX8-NEXT: s_or_b64 s[10:11], s[10:11], s[12:13] -; GFX8-NEXT: s_lshr_b64 s[6:7], s[6:7], s14 -; GFX8-NEXT: s_cmp_lg_u32 s15, 0 -; GFX8-NEXT: s_cselect_b64 s[6:7], s[10:11], s[6:7] +; GFX8-NEXT: s_lshl_b64 s[14:15], s[6:7], s14 +; GFX8-NEXT: s_lshr_b64 s[0:1], s[6:7], s12 +; GFX8-NEXT: s_or_b64 s[10:11], s[10:11], s[14:15] +; GFX8-NEXT: s_lshr_b64 s[6:7], s[6:7], s13 ; GFX8-NEXT: s_cmp_lg_u32 s16, 0 +; GFX8-NEXT: s_cselect_b64 s[6:7], s[10:11], s[6:7] +; GFX8-NEXT: s_cmp_eq_u32 s12, 0 ; GFX8-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7] -; GFX8-NEXT: s_cmp_lg_u32 s15, 0 +; GFX8-NEXT: s_cmp_lg_u32 s16, 0 ; GFX8-NEXT: s_cselect_b64 s[6:7], s[0:1], 0 ; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[4:5] ; GFX8-NEXT: s_or_b64 s[2:3], s[8:9], s[6:7] @@ -5982,38 +5974,34 @@ ; GFX9-NEXT: s_lshr_b32 s10, s1, 31 ; GFX9-NEXT: s_lshl_b64 s[14:15], s[0:1], 1 ; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[10:11] -; GFX9-NEXT: s_sub_i32 s13, s8, 64 -; GFX9-NEXT: s_sub_i32 s9, 64, s8 +; GFX9-NEXT: s_sub_i32 s9, s8, 64 +; GFX9-NEXT: s_sub_i32 s10, 64, s8 ; GFX9-NEXT: s_cmp_lt_u32 s8, 64 -; GFX9-NEXT: s_cselect_b32 s16, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s8, 0 -; GFX9-NEXT: s_cselect_b32 s17, 1, 0 +; GFX9-NEXT: s_cselect_b32 s13, 1, 0 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[14:15], s10 +; GFX9-NEXT: s_lshl_b64 s[16:17], s[0:1], s8 ; GFX9-NEXT: s_lshl_b64 s[2:3], s[14:15], s8 -; GFX9-NEXT: s_lshr_b64 s[10:11], s[14:15], s9 -; GFX9-NEXT: s_lshl_b64 s[8:9], s[0:1], s8 -; GFX9-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9] -; GFX9-NEXT: s_lshl_b64 s[10:11], s[14:15], s13 -; GFX9-NEXT: s_cmp_lg_u32 s16, 0 +; GFX9-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17] +; GFX9-NEXT: s_lshl_b64 s[14:15], s[14:15], s9 +; GFX9-NEXT: s_cmp_lg_u32 s13, 0 ; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 -; GFX9-NEXT: s_cselect_b64 s[8:9], s[8:9], s[10:11] -; GFX9-NEXT: s_cmp_lg_u32 s17, 0 -; GFX9-NEXT: s_cselect_b64 s[8:9], s[0:1], s[8:9] -; GFX9-NEXT: s_sub_i32 s14, s12, 64 -; GFX9-NEXT: s_sub_i32 s13, 64, s12 +; GFX9-NEXT: s_cselect_b64 s[10:11], s[10:11], s[14:15] +; GFX9-NEXT: s_cmp_eq_u32 s8, 0 +; GFX9-NEXT: s_cselect_b64 s[8:9], s[0:1], s[10:11] +; GFX9-NEXT: s_sub_i32 s13, s12, 64 +; GFX9-NEXT: s_sub_i32 s14, 64, s12 ; GFX9-NEXT: s_cmp_lt_u32 s12, 64 -; GFX9-NEXT: s_cselect_b32 s15, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s12, 0 ; GFX9-NEXT: s_cselect_b32 s16, 1, 0 -; GFX9-NEXT: s_lshr_b64 s[0:1], s[6:7], s12 ; GFX9-NEXT: s_lshr_b64 s[10:11], s[4:5], s12 -; GFX9-NEXT: s_lshl_b64 s[12:13], s[6:7], s13 -; GFX9-NEXT: s_or_b64 s[10:11], s[10:11], s[12:13] -; GFX9-NEXT: s_lshr_b64 s[6:7], s[6:7], s14 -; GFX9-NEXT: s_cmp_lg_u32 s15, 0 -; GFX9-NEXT: s_cselect_b64 s[6:7], s[10:11], s[6:7] +; GFX9-NEXT: s_lshl_b64 s[14:15], s[6:7], s14 +; GFX9-NEXT: s_lshr_b64 s[0:1], s[6:7], s12 +; GFX9-NEXT: s_or_b64 s[10:11], s[10:11], s[14:15] +; GFX9-NEXT: s_lshr_b64 s[6:7], s[6:7], s13 ; GFX9-NEXT: s_cmp_lg_u32 s16, 0 +; GFX9-NEXT: s_cselect_b64 s[6:7], s[10:11], s[6:7] +; GFX9-NEXT: s_cmp_eq_u32 s12, 0 ; GFX9-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7] -; GFX9-NEXT: s_cmp_lg_u32 s15, 0 +; GFX9-NEXT: s_cmp_lg_u32 s16, 0 ; GFX9-NEXT: s_cselect_b64 s[6:7], s[0:1], 0 ; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[4:5] ; GFX9-NEXT: s_or_b64 s[2:3], s[8:9], s[6:7] @@ -6029,40 +6017,36 @@ ; GFX10-NEXT: s_lshr_b32 s10, s1, 31 ; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 ; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[10:11] -; GFX10-NEXT: s_sub_i32 s13, s8, 64 -; GFX10-NEXT: s_sub_i32 s9, 64, s8 +; GFX10-NEXT: s_sub_i32 s9, s8, 64 +; GFX10-NEXT: s_sub_i32 s10, 64, s8 ; GFX10-NEXT: s_cmp_lt_u32 s8, 64 -; GFX10-NEXT: s_cselect_b32 s16, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s8, 0 -; GFX10-NEXT: s_cselect_b32 s17, 1, 0 -; GFX10-NEXT: s_lshr_b64 s[10:11], s[0:1], s9 +; GFX10-NEXT: s_cselect_b32 s13, 1, 0 +; GFX10-NEXT: s_lshr_b64 s[10:11], s[0:1], s10 ; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], s8 -; GFX10-NEXT: s_lshl_b64 s[8:9], s[0:1], s8 +; GFX10-NEXT: s_lshl_b64 s[16:17], s[0:1], s8 ; GFX10-NEXT: s_or_b64 s[10:11], s[10:11], s[14:15] -; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s13 -; GFX10-NEXT: s_cmp_lg_u32 s16, 0 -; GFX10-NEXT: s_cselect_b64 s[8:9], s[8:9], 0 +; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s9 +; GFX10-NEXT: s_cmp_lg_u32 s13, 0 +; GFX10-NEXT: s_cselect_b64 s[14:15], s[16:17], 0 ; GFX10-NEXT: s_cselect_b64 s[0:1], s[10:11], s[0:1] -; GFX10-NEXT: s_cmp_lg_u32 s17, 0 +; GFX10-NEXT: s_cmp_eq_u32 s8, 0 ; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] -; GFX10-NEXT: s_sub_i32 s14, s12, 64 -; GFX10-NEXT: s_sub_i32 s10, 64, s12 +; GFX10-NEXT: s_sub_i32 s13, s12, 64 +; GFX10-NEXT: s_sub_i32 s8, 64, s12 ; GFX10-NEXT: s_cmp_lt_u32 s12, 64 -; GFX10-NEXT: s_cselect_b32 s15, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s12, 0 ; GFX10-NEXT: s_cselect_b32 s16, 1, 0 ; GFX10-NEXT: s_lshr_b64 s[0:1], s[4:5], s12 -; GFX10-NEXT: s_lshl_b64 s[10:11], s[6:7], s10 -; GFX10-NEXT: s_lshr_b64 s[12:13], s[6:7], s12 -; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[10:11] -; GFX10-NEXT: s_lshr_b64 s[6:7], s[6:7], s14 -; GFX10-NEXT: s_cmp_lg_u32 s15, 0 -; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[6:7] +; GFX10-NEXT: s_lshl_b64 s[8:9], s[6:7], s8 +; GFX10-NEXT: s_lshr_b64 s[10:11], s[6:7], s12 +; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] +; GFX10-NEXT: s_lshr_b64 s[6:7], s[6:7], s13 ; GFX10-NEXT: s_cmp_lg_u32 s16, 0 +; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[6:7] +; GFX10-NEXT: s_cmp_eq_u32 s12, 0 ; GFX10-NEXT: s_cselect_b64 s[0:1], s[4:5], s[0:1] -; GFX10-NEXT: s_cmp_lg_u32 s15, 0 -; GFX10-NEXT: s_cselect_b64 s[4:5], s[12:13], 0 -; GFX10-NEXT: s_or_b64 s[0:1], s[8:9], s[0:1] +; GFX10-NEXT: s_cmp_lg_u32 s16, 0 +; GFX10-NEXT: s_cselect_b64 s[4:5], s[10:11], 0 +; GFX10-NEXT: s_or_b64 s[0:1], s[14:15], s[0:1] ; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] ; GFX10-NEXT: ; return to shader part epilog ; @@ -6076,40 +6060,36 @@ ; GFX11-NEXT: s_lshr_b32 s10, s1, 31 ; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 ; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[10:11] -; GFX11-NEXT: s_sub_i32 s13, s8, 64 -; GFX11-NEXT: s_sub_i32 s9, 64, s8 +; GFX11-NEXT: s_sub_i32 s9, s8, 64 +; GFX11-NEXT: s_sub_i32 s10, 64, s8 ; GFX11-NEXT: s_cmp_lt_u32 s8, 64 -; GFX11-NEXT: s_cselect_b32 s16, 1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s8, 0 -; GFX11-NEXT: s_cselect_b32 s17, 1, 0 -; GFX11-NEXT: s_lshr_b64 s[10:11], s[0:1], s9 +; GFX11-NEXT: s_cselect_b32 s13, 1, 0 +; GFX11-NEXT: s_lshr_b64 s[10:11], s[0:1], s10 ; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], s8 -; GFX11-NEXT: s_lshl_b64 s[8:9], s[0:1], s8 +; GFX11-NEXT: s_lshl_b64 s[16:17], s[0:1], s8 ; GFX11-NEXT: s_or_b64 s[10:11], s[10:11], s[14:15] -; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s13 -; GFX11-NEXT: s_cmp_lg_u32 s16, 0 -; GFX11-NEXT: s_cselect_b64 s[8:9], s[8:9], 0 +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s9 +; GFX11-NEXT: s_cmp_lg_u32 s13, 0 +; GFX11-NEXT: s_cselect_b64 s[14:15], s[16:17], 0 ; GFX11-NEXT: s_cselect_b64 s[0:1], s[10:11], s[0:1] -; GFX11-NEXT: s_cmp_lg_u32 s17, 0 +; GFX11-NEXT: s_cmp_eq_u32 s8, 0 ; GFX11-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] -; GFX11-NEXT: s_sub_i32 s14, s12, 64 -; GFX11-NEXT: s_sub_i32 s10, 64, s12 +; GFX11-NEXT: s_sub_i32 s13, s12, 64 +; GFX11-NEXT: s_sub_i32 s8, 64, s12 ; GFX11-NEXT: s_cmp_lt_u32 s12, 64 -; GFX11-NEXT: s_cselect_b32 s15, 1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s12, 0 ; GFX11-NEXT: s_cselect_b32 s16, 1, 0 ; GFX11-NEXT: s_lshr_b64 s[0:1], s[4:5], s12 -; GFX11-NEXT: s_lshl_b64 s[10:11], s[6:7], s10 -; GFX11-NEXT: s_lshr_b64 s[12:13], s[6:7], s12 -; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[10:11] -; GFX11-NEXT: s_lshr_b64 s[6:7], s[6:7], s14 -; GFX11-NEXT: s_cmp_lg_u32 s15, 0 -; GFX11-NEXT: s_cselect_b64 s[0:1], s[0:1], s[6:7] +; GFX11-NEXT: s_lshl_b64 s[8:9], s[6:7], s8 +; GFX11-NEXT: s_lshr_b64 s[10:11], s[6:7], s12 +; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] +; GFX11-NEXT: s_lshr_b64 s[6:7], s[6:7], s13 ; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_cselect_b64 s[0:1], s[0:1], s[6:7] +; GFX11-NEXT: s_cmp_eq_u32 s12, 0 ; GFX11-NEXT: s_cselect_b64 s[0:1], s[4:5], s[0:1] -; GFX11-NEXT: s_cmp_lg_u32 s15, 0 -; GFX11-NEXT: s_cselect_b64 s[4:5], s[12:13], 0 -; GFX11-NEXT: s_or_b64 s[0:1], s[8:9], s[0:1] +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_cselect_b64 s[4:5], s[10:11], 0 +; GFX11-NEXT: s_or_b64 s[0:1], s[14:15], s[0:1] ; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] ; GFX11-NEXT: ; return to shader part epilog %result = call i128 @llvm.fshr.i128(i128 %lhs, i128 %rhs, i128 %amt) @@ -6636,22 +6616,20 @@ ; GFX6-NEXT: s_lshr_b32 s6, s1, 31 ; GFX6-NEXT: s_lshl_b64 s[10:11], s[0:1], 1 ; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[6:7] -; GFX6-NEXT: s_sub_i32 s9, s4, 64 -; GFX6-NEXT: s_sub_i32 s5, 64, s4 +; GFX6-NEXT: s_sub_i32 s5, s4, 64 +; GFX6-NEXT: s_sub_i32 s6, 64, s4 ; GFX6-NEXT: s_cmp_lt_u32 s4, 64 -; GFX6-NEXT: s_cselect_b32 s12, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s4, 0 -; GFX6-NEXT: s_cselect_b32 s13, 1, 0 +; GFX6-NEXT: s_cselect_b32 s9, 1, 0 +; GFX6-NEXT: s_lshr_b64 s[6:7], s[10:11], s6 +; GFX6-NEXT: s_lshl_b64 s[12:13], s[0:1], s4 ; GFX6-NEXT: s_lshl_b64 s[2:3], s[10:11], s4 -; GFX6-NEXT: s_lshr_b64 s[6:7], s[10:11], s5 -; GFX6-NEXT: s_lshl_b64 s[4:5], s[0:1], s4 -; GFX6-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] -; GFX6-NEXT: s_lshl_b64 s[6:7], s[10:11], s9 -; GFX6-NEXT: s_cmp_lg_u32 s12, 0 +; GFX6-NEXT: s_or_b64 s[6:7], s[6:7], s[12:13] +; GFX6-NEXT: s_lshl_b64 s[10:11], s[10:11], s5 +; GFX6-NEXT: s_cmp_lg_u32 s9, 0 ; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 -; GFX6-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7] -; GFX6-NEXT: s_cmp_lg_u32 s13, 0 -; GFX6-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5] +; GFX6-NEXT: s_cselect_b64 s[6:7], s[6:7], s[10:11] +; GFX6-NEXT: s_cmp_eq_u32 s4, 0 +; GFX6-NEXT: s_cselect_b64 s[0:1], s[0:1], s[6:7] ; GFX6-NEXT: s_sub_i32 s4, s8, 64 ; GFX6-NEXT: s_sub_i32 s5, 64, s8 ; GFX6-NEXT: s_cmp_lt_u32 s8, 64 @@ -6692,22 +6670,20 @@ ; GFX8-NEXT: s_lshr_b32 s6, s1, 31 ; GFX8-NEXT: s_lshl_b64 s[10:11], s[0:1], 1 ; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[6:7] -; GFX8-NEXT: s_sub_i32 s9, s4, 64 -; GFX8-NEXT: s_sub_i32 s5, 64, s4 +; GFX8-NEXT: s_sub_i32 s5, s4, 64 +; GFX8-NEXT: s_sub_i32 s6, 64, s4 ; GFX8-NEXT: s_cmp_lt_u32 s4, 64 -; GFX8-NEXT: s_cselect_b32 s12, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s4, 0 -; GFX8-NEXT: s_cselect_b32 s13, 1, 0 +; GFX8-NEXT: s_cselect_b32 s9, 1, 0 +; GFX8-NEXT: s_lshr_b64 s[6:7], s[10:11], s6 +; GFX8-NEXT: s_lshl_b64 s[12:13], s[0:1], s4 ; GFX8-NEXT: s_lshl_b64 s[2:3], s[10:11], s4 -; GFX8-NEXT: s_lshr_b64 s[6:7], s[10:11], s5 -; GFX8-NEXT: s_lshl_b64 s[4:5], s[0:1], s4 -; GFX8-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] -; GFX8-NEXT: s_lshl_b64 s[6:7], s[10:11], s9 -; GFX8-NEXT: s_cmp_lg_u32 s12, 0 +; GFX8-NEXT: s_or_b64 s[6:7], s[6:7], s[12:13] +; GFX8-NEXT: s_lshl_b64 s[10:11], s[10:11], s5 +; GFX8-NEXT: s_cmp_lg_u32 s9, 0 ; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 -; GFX8-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7] -; GFX8-NEXT: s_cmp_lg_u32 s13, 0 -; GFX8-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5] +; GFX8-NEXT: s_cselect_b64 s[6:7], s[6:7], s[10:11] +; GFX8-NEXT: s_cmp_eq_u32 s4, 0 +; GFX8-NEXT: s_cselect_b64 s[0:1], s[0:1], s[6:7] ; GFX8-NEXT: s_sub_i32 s4, s8, 64 ; GFX8-NEXT: s_sub_i32 s5, 64, s8 ; GFX8-NEXT: s_cmp_lt_u32 s8, 64 @@ -6748,22 +6724,20 @@ ; GFX9-NEXT: s_lshr_b32 s6, s1, 31 ; GFX9-NEXT: s_lshl_b64 s[10:11], s[0:1], 1 ; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[6:7] -; GFX9-NEXT: s_sub_i32 s9, s4, 64 -; GFX9-NEXT: s_sub_i32 s5, 64, s4 +; GFX9-NEXT: s_sub_i32 s5, s4, 64 +; GFX9-NEXT: s_sub_i32 s6, 64, s4 ; GFX9-NEXT: s_cmp_lt_u32 s4, 64 -; GFX9-NEXT: s_cselect_b32 s12, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s4, 0 -; GFX9-NEXT: s_cselect_b32 s13, 1, 0 +; GFX9-NEXT: s_cselect_b32 s9, 1, 0 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[10:11], s6 +; GFX9-NEXT: s_lshl_b64 s[12:13], s[0:1], s4 ; GFX9-NEXT: s_lshl_b64 s[2:3], s[10:11], s4 -; GFX9-NEXT: s_lshr_b64 s[6:7], s[10:11], s5 -; GFX9-NEXT: s_lshl_b64 s[4:5], s[0:1], s4 -; GFX9-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] -; GFX9-NEXT: s_lshl_b64 s[6:7], s[10:11], s9 -; GFX9-NEXT: s_cmp_lg_u32 s12, 0 +; GFX9-NEXT: s_or_b64 s[6:7], s[6:7], s[12:13] +; GFX9-NEXT: s_lshl_b64 s[10:11], s[10:11], s5 +; GFX9-NEXT: s_cmp_lg_u32 s9, 0 ; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 -; GFX9-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7] -; GFX9-NEXT: s_cmp_lg_u32 s13, 0 -; GFX9-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5] +; GFX9-NEXT: s_cselect_b64 s[6:7], s[6:7], s[10:11] +; GFX9-NEXT: s_cmp_eq_u32 s4, 0 +; GFX9-NEXT: s_cselect_b64 s[0:1], s[0:1], s[6:7] ; GFX9-NEXT: s_sub_i32 s4, s8, 64 ; GFX9-NEXT: s_sub_i32 s5, 64, s8 ; GFX9-NEXT: s_cmp_lt_u32 s8, 64 @@ -6804,22 +6778,20 @@ ; GFX10-NEXT: s_lshr_b32 s6, s1, 31 ; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 ; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7] -; GFX10-NEXT: s_sub_i32 s9, s4, 64 -; GFX10-NEXT: s_sub_i32 s5, 64, s4 +; GFX10-NEXT: s_sub_i32 s5, s4, 64 +; GFX10-NEXT: s_sub_i32 s6, 64, s4 ; GFX10-NEXT: s_cmp_lt_u32 s4, 64 ; GFX10-NEXT: v_lshrrev_b64 v[4:5], s8, v[0:1] -; GFX10-NEXT: s_cselect_b32 s12, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s4, 0 -; GFX10-NEXT: s_cselect_b32 s13, 1, 0 -; GFX10-NEXT: s_lshr_b64 s[6:7], s[0:1], s5 +; GFX10-NEXT: s_cselect_b32 s9, 1, 0 +; GFX10-NEXT: s_lshr_b64 s[6:7], s[0:1], s6 ; GFX10-NEXT: s_lshl_b64 s[10:11], s[2:3], s4 -; GFX10-NEXT: s_lshl_b64 s[4:5], s[0:1], s4 +; GFX10-NEXT: s_lshl_b64 s[12:13], s[0:1], s4 ; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11] -; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s9 -; GFX10-NEXT: s_cmp_lg_u32 s12, 0 -; GFX10-NEXT: s_cselect_b64 s[4:5], s[4:5], 0 +; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s5 +; GFX10-NEXT: s_cmp_lg_u32 s9, 0 +; GFX10-NEXT: s_cselect_b64 s[10:11], s[12:13], 0 ; GFX10-NEXT: s_cselect_b64 s[0:1], s[6:7], s[0:1] -; GFX10-NEXT: s_cmp_lg_u32 s13, 0 +; GFX10-NEXT: s_cmp_eq_u32 s4, 0 ; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] ; GFX10-NEXT: s_sub_i32 s0, 64, s8 ; GFX10-NEXT: v_lshlrev_b64 v[6:7], s0, v[2:3] @@ -6829,11 +6801,11 @@ ; GFX10-NEXT: s_cselect_b32 s1, 1, 0 ; GFX10-NEXT: s_cmp_eq_u32 s8, 0 ; GFX10-NEXT: v_or_b32_e32 v4, v4, v6 -; GFX10-NEXT: s_cselect_b32 s6, 1, 0 +; GFX10-NEXT: s_cselect_b32 s4, 1, 0 ; GFX10-NEXT: s_and_b32 s0, 1, s1 ; GFX10-NEXT: v_or_b32_e32 v5, v5, v7 ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 -; GFX10-NEXT: s_and_b32 s0, 1, s6 +; GFX10-NEXT: s_and_b32 s0, 1, s4 ; GFX10-NEXT: s_and_b32 s1, 1, s1 ; GFX10-NEXT: v_lshrrev_b64 v[2:3], s8, v[2:3] ; GFX10-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo @@ -6844,8 +6816,8 @@ ; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, v2, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, v3, s0 -; GFX10-NEXT: v_or_b32_e32 v0, s4, v0 -; GFX10-NEXT: v_or_b32_e32 v1, s5, v1 +; GFX10-NEXT: v_or_b32_e32 v0, s10, v0 +; GFX10-NEXT: v_or_b32_e32 v1, s11, v1 ; GFX10-NEXT: v_or_b32_e32 v2, s2, v2 ; GFX10-NEXT: v_or_b32_e32 v3, s3, v3 ; GFX10-NEXT: ; return to shader part epilog @@ -6860,22 +6832,20 @@ ; GFX11-NEXT: s_lshr_b32 s6, s1, 31 ; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 ; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7] -; GFX11-NEXT: s_sub_i32 s9, s4, 64 -; GFX11-NEXT: s_sub_i32 s5, 64, s4 +; GFX11-NEXT: s_sub_i32 s5, s4, 64 +; GFX11-NEXT: s_sub_i32 s6, 64, s4 ; GFX11-NEXT: s_cmp_lt_u32 s4, 64 ; GFX11-NEXT: v_lshrrev_b64 v[4:5], s8, v[0:1] -; GFX11-NEXT: s_cselect_b32 s12, 1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s4, 0 -; GFX11-NEXT: s_cselect_b32 s13, 1, 0 -; GFX11-NEXT: s_lshr_b64 s[6:7], s[0:1], s5 +; GFX11-NEXT: s_cselect_b32 s9, 1, 0 +; GFX11-NEXT: s_lshr_b64 s[6:7], s[0:1], s6 ; GFX11-NEXT: s_lshl_b64 s[10:11], s[2:3], s4 -; GFX11-NEXT: s_lshl_b64 s[4:5], s[0:1], s4 +; GFX11-NEXT: s_lshl_b64 s[12:13], s[0:1], s4 ; GFX11-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11] -; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s9 -; GFX11-NEXT: s_cmp_lg_u32 s12, 0 -; GFX11-NEXT: s_cselect_b64 s[4:5], s[4:5], 0 +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s5 +; GFX11-NEXT: s_cmp_lg_u32 s9, 0 +; GFX11-NEXT: s_cselect_b64 s[10:11], s[12:13], 0 ; GFX11-NEXT: s_cselect_b64 s[0:1], s[6:7], s[0:1] -; GFX11-NEXT: s_cmp_lg_u32 s13, 0 +; GFX11-NEXT: s_cmp_eq_u32 s4, 0 ; GFX11-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] ; GFX11-NEXT: s_sub_i32 s0, 64, s8 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -6886,11 +6856,11 @@ ; GFX11-NEXT: s_cselect_b32 s1, 1, 0 ; GFX11-NEXT: s_cmp_eq_u32 s8, 0 ; GFX11-NEXT: v_or_b32_e32 v4, v4, v6 -; GFX11-NEXT: s_cselect_b32 s6, 1, 0 +; GFX11-NEXT: s_cselect_b32 s4, 1, 0 ; GFX11-NEXT: s_and_b32 s0, 1, s1 ; GFX11-NEXT: v_or_b32_e32 v5, v5, v7 ; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 -; GFX11-NEXT: s_and_b32 s0, 1, s6 +; GFX11-NEXT: s_and_b32 s0, 1, s4 ; GFX11-NEXT: s_and_b32 s1, 1, s1 ; GFX11-NEXT: v_lshrrev_b64 v[2:3], s8, v[2:3] ; GFX11-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v5, v9, v5 @@ -6901,8 +6871,8 @@ ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, v2, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, v3, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_or_b32_e32 v0, s4, v0 -; GFX11-NEXT: v_or_b32_e32 v1, s5, v1 +; GFX11-NEXT: v_or_b32_e32 v0, s10, v0 +; GFX11-NEXT: v_or_b32_e32 v1, s11, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_or_b32_e32 v2, s2, v2 ; GFX11-NEXT: v_or_b32_e32 v3, s3, v3 @@ -6934,30 +6904,28 @@ ; GFX6-NEXT: s_and_b32 s4, 1, s7 ; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 ; GFX6-NEXT: s_and_b32 s4, 1, s9 -; GFX6-NEXT: s_sub_i32 s10, s8, 64 -; GFX6-NEXT: s_sub_i32 s9, 64, s8 +; GFX6-NEXT: s_sub_i32 s9, s8, 64 +; GFX6-NEXT: s_sub_i32 s10, 64, s8 ; GFX6-NEXT: v_or_b32_e32 v6, v0, v6 ; GFX6-NEXT: v_or_b32_e32 v7, v1, v7 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[4:5], s5 ; GFX6-NEXT: s_cmp_lt_u32 s8, 64 -; GFX6-NEXT: s_cselect_b32 s11, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s8, 0 +; GFX6-NEXT: s_cselect_b32 s12, 1, 0 +; GFX6-NEXT: s_lshr_b64 s[6:7], s[0:1], s8 +; GFX6-NEXT: s_lshl_b64 s[10:11], s[2:3], s10 ; GFX6-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v5, 0, v9, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc ; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 -; GFX6-NEXT: s_cselect_b32 s12, 1, 0 ; GFX6-NEXT: s_lshr_b64 s[4:5], s[2:3], s8 -; GFX6-NEXT: s_lshr_b64 s[6:7], s[0:1], s8 -; GFX6-NEXT: s_lshl_b64 s[8:9], s[2:3], s9 -; GFX6-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] -; GFX6-NEXT: s_lshr_b64 s[2:3], s[2:3], s10 -; GFX6-NEXT: s_cmp_lg_u32 s11, 0 -; GFX6-NEXT: s_cselect_b64 s[2:3], s[6:7], s[2:3] +; GFX6-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11] +; GFX6-NEXT: s_lshr_b64 s[2:3], s[2:3], s9 ; GFX6-NEXT: s_cmp_lg_u32 s12, 0 +; GFX6-NEXT: s_cselect_b64 s[2:3], s[6:7], s[2:3] +; GFX6-NEXT: s_cmp_eq_u32 s8, 0 ; GFX6-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3] -; GFX6-NEXT: s_cmp_lg_u32 s11, 0 +; GFX6-NEXT: s_cmp_lg_u32 s12, 0 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc ; GFX6-NEXT: s_cselect_b64 s[2:3], s[4:5], 0 @@ -6988,30 +6956,28 @@ ; GFX8-NEXT: s_and_b32 s4, 1, s7 ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 ; GFX8-NEXT: s_and_b32 s4, 1, s9 -; GFX8-NEXT: s_sub_i32 s10, s8, 64 -; GFX8-NEXT: s_sub_i32 s9, 64, s8 +; GFX8-NEXT: s_sub_i32 s9, s8, 64 +; GFX8-NEXT: s_sub_i32 s10, 64, s8 ; GFX8-NEXT: v_or_b32_e32 v6, v0, v6 ; GFX8-NEXT: v_or_b32_e32 v7, v1, v7 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], s5, v[4:5] ; GFX8-NEXT: s_cmp_lt_u32 s8, 64 -; GFX8-NEXT: s_cselect_b32 s11, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s8, 0 +; GFX8-NEXT: s_cselect_b32 s12, 1, 0 +; GFX8-NEXT: s_lshr_b64 s[6:7], s[0:1], s8 +; GFX8-NEXT: s_lshl_b64 s[10:11], s[2:3], s10 ; GFX8-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v5, 0, v9, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 -; GFX8-NEXT: s_cselect_b32 s12, 1, 0 ; GFX8-NEXT: s_lshr_b64 s[4:5], s[2:3], s8 -; GFX8-NEXT: s_lshr_b64 s[6:7], s[0:1], s8 -; GFX8-NEXT: s_lshl_b64 s[8:9], s[2:3], s9 -; GFX8-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] -; GFX8-NEXT: s_lshr_b64 s[2:3], s[2:3], s10 -; GFX8-NEXT: s_cmp_lg_u32 s11, 0 -; GFX8-NEXT: s_cselect_b64 s[2:3], s[6:7], s[2:3] +; GFX8-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11] +; GFX8-NEXT: s_lshr_b64 s[2:3], s[2:3], s9 ; GFX8-NEXT: s_cmp_lg_u32 s12, 0 +; GFX8-NEXT: s_cselect_b64 s[2:3], s[6:7], s[2:3] +; GFX8-NEXT: s_cmp_eq_u32 s8, 0 ; GFX8-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3] -; GFX8-NEXT: s_cmp_lg_u32 s11, 0 +; GFX8-NEXT: s_cmp_lg_u32 s12, 0 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc ; GFX8-NEXT: s_cselect_b64 s[2:3], s[4:5], 0 @@ -7042,30 +7008,28 @@ ; GFX9-NEXT: s_and_b32 s4, 1, s7 ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 ; GFX9-NEXT: s_and_b32 s4, 1, s9 -; GFX9-NEXT: s_sub_i32 s10, s8, 64 -; GFX9-NEXT: s_sub_i32 s9, 64, s8 +; GFX9-NEXT: s_sub_i32 s9, s8, 64 +; GFX9-NEXT: s_sub_i32 s10, 64, s8 ; GFX9-NEXT: v_or_b32_e32 v6, v0, v6 ; GFX9-NEXT: v_or_b32_e32 v7, v1, v7 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], s5, v[4:5] ; GFX9-NEXT: s_cmp_lt_u32 s8, 64 -; GFX9-NEXT: s_cselect_b32 s11, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s8, 0 +; GFX9-NEXT: s_cselect_b32 s12, 1, 0 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[0:1], s8 +; GFX9-NEXT: s_lshl_b64 s[10:11], s[2:3], s10 ; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v5, 0, v9, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 -; GFX9-NEXT: s_cselect_b32 s12, 1, 0 ; GFX9-NEXT: s_lshr_b64 s[4:5], s[2:3], s8 -; GFX9-NEXT: s_lshr_b64 s[6:7], s[0:1], s8 -; GFX9-NEXT: s_lshl_b64 s[8:9], s[2:3], s9 -; GFX9-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] -; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], s10 -; GFX9-NEXT: s_cmp_lg_u32 s11, 0 -; GFX9-NEXT: s_cselect_b64 s[2:3], s[6:7], s[2:3] +; GFX9-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11] +; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], s9 ; GFX9-NEXT: s_cmp_lg_u32 s12, 0 +; GFX9-NEXT: s_cselect_b64 s[2:3], s[6:7], s[2:3] +; GFX9-NEXT: s_cmp_eq_u32 s8, 0 ; GFX9-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3] -; GFX9-NEXT: s_cmp_lg_u32 s11, 0 +; GFX9-NEXT: s_cmp_lg_u32 s12, 0 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc ; GFX9-NEXT: s_cselect_b64 s[2:3], s[4:5], 0 @@ -7099,31 +7063,29 @@ ; GFX10-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX10-NEXT: v_or_b32_e32 v5, v5, v7 ; GFX10-NEXT: s_and_b32 s4, 1, s9 -; GFX10-NEXT: s_sub_i32 s10, s8, 64 +; GFX10-NEXT: s_sub_i32 s9, s8, 64 ; GFX10-NEXT: s_sub_i32 s6, 64, s8 ; GFX10-NEXT: s_cmp_lt_u32 s8, 64 ; GFX10-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc_lo -; GFX10-NEXT: s_cselect_b32 s11, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s8, 0 ; GFX10-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo -; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4 ; GFX10-NEXT: s_cselect_b32 s12, 1, 0 +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4 ; GFX10-NEXT: s_lshr_b64 s[4:5], s[0:1], s8 ; GFX10-NEXT: s_lshl_b64 s[6:7], s[2:3], s6 -; GFX10-NEXT: s_lshr_b64 s[8:9], s[2:3], s8 +; GFX10-NEXT: s_lshr_b64 s[10:11], s[2:3], s8 ; GFX10-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], s10 -; GFX10-NEXT: s_cmp_lg_u32 s11, 0 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], s9 +; GFX10-NEXT: s_cmp_lg_u32 s12, 0 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc_lo ; GFX10-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3] -; GFX10-NEXT: s_cmp_lg_u32 s12, 0 +; GFX10-NEXT: s_cmp_eq_u32 s8, 0 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc_lo ; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3] -; GFX10-NEXT: s_cmp_lg_u32 s11, 0 +; GFX10-NEXT: s_cmp_lg_u32 s12, 0 ; GFX10-NEXT: v_or_b32_e32 v0, s0, v6 -; GFX10-NEXT: s_cselect_b64 s[2:3], s[8:9], 0 +; GFX10-NEXT: s_cselect_b64 s[2:3], s[10:11], 0 ; GFX10-NEXT: v_or_b32_e32 v1, s1, v7 ; GFX10-NEXT: v_or_b32_e32 v2, s2, v2 ; GFX10-NEXT: v_or_b32_e32 v3, s3, v3 @@ -7153,28 +7115,26 @@ ; GFX11-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX11-NEXT: v_or_b32_e32 v5, v5, v7 ; GFX11-NEXT: s_and_b32 s4, 1, s9 -; GFX11-NEXT: s_sub_i32 s10, s8, 64 +; GFX11-NEXT: s_sub_i32 s9, s8, 64 ; GFX11-NEXT: s_sub_i32 s6, 64, s8 ; GFX11-NEXT: s_cmp_lt_u32 s8, 64 ; GFX11-NEXT: v_dual_cndmask_b32 v6, 0, v8 :: v_dual_cndmask_b32 v7, 0, v9 -; GFX11-NEXT: s_cselect_b32 s11, 1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s8, 0 ; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v4 :: v_dual_cndmask_b32 v1, v1, v5 -; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4 ; GFX11-NEXT: s_cselect_b32 s12, 1, 0 +; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4 ; GFX11-NEXT: s_lshr_b64 s[4:5], s[0:1], s8 ; GFX11-NEXT: s_lshl_b64 s[6:7], s[2:3], s6 -; GFX11-NEXT: s_lshr_b64 s[8:9], s[2:3], s8 +; GFX11-NEXT: s_lshr_b64 s[10:11], s[2:3], s8 ; GFX11-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], s10 -; GFX11-NEXT: s_cmp_lg_u32 s11, 0 +; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], s9 +; GFX11-NEXT: s_cmp_lg_u32 s12, 0 ; GFX11-NEXT: v_dual_cndmask_b32 v2, v0, v2 :: v_dual_cndmask_b32 v3, v1, v3 ; GFX11-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3] -; GFX11-NEXT: s_cmp_lg_u32 s12, 0 +; GFX11-NEXT: s_cmp_eq_u32 s8, 0 ; GFX11-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3] -; GFX11-NEXT: s_cmp_lg_u32 s11, 0 +; GFX11-NEXT: s_cmp_lg_u32 s12, 0 ; GFX11-NEXT: v_or_b32_e32 v0, s0, v6 -; GFX11-NEXT: s_cselect_b64 s[2:3], s[8:9], 0 +; GFX11-NEXT: s_cselect_b64 s[2:3], s[10:11], 0 ; GFX11-NEXT: v_or_b32_e32 v1, s1, v7 ; GFX11-NEXT: v_or_b32_e32 v2, s2, v2 ; GFX11-NEXT: v_or_b32_e32 v3, s3, v3 @@ -7321,38 +7281,34 @@ ; GFX6-NEXT: s_lshr_b32 s0, s1, 31 ; GFX6-NEXT: s_mov_b32 s1, s19 ; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] -; GFX6-NEXT: s_sub_i32 s23, s16, 64 -; GFX6-NEXT: s_sub_i32 s17, 64, s16 +; GFX6-NEXT: s_sub_i32 s17, s16, 64 +; GFX6-NEXT: s_sub_i32 s23, 64, s16 ; GFX6-NEXT: s_cmp_lt_u32 s16, 64 -; GFX6-NEXT: s_cselect_b32 s28, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s16, 0 -; GFX6-NEXT: s_cselect_b32 s29, 1, 0 +; GFX6-NEXT: s_cselect_b32 s30, 1, 0 +; GFX6-NEXT: s_lshr_b64 s[26:27], s[24:25], s23 +; GFX6-NEXT: s_lshl_b64 s[28:29], s[0:1], s16 ; GFX6-NEXT: s_lshl_b64 s[2:3], s[24:25], s16 -; GFX6-NEXT: s_lshr_b64 s[26:27], s[24:25], s17 -; GFX6-NEXT: s_lshl_b64 s[16:17], s[0:1], s16 -; GFX6-NEXT: s_or_b64 s[16:17], s[26:27], s[16:17] -; GFX6-NEXT: s_lshl_b64 s[24:25], s[24:25], s23 -; GFX6-NEXT: s_cmp_lg_u32 s28, 0 +; GFX6-NEXT: s_or_b64 s[26:27], s[26:27], s[28:29] +; GFX6-NEXT: s_lshl_b64 s[24:25], s[24:25], s17 +; GFX6-NEXT: s_cmp_lg_u32 s30, 0 ; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 -; GFX6-NEXT: s_cselect_b64 s[16:17], s[16:17], s[24:25] -; GFX6-NEXT: s_cmp_lg_u32 s29, 0 -; GFX6-NEXT: s_cselect_b64 s[16:17], s[0:1], s[16:17] -; GFX6-NEXT: s_sub_i32 s26, s22, 64 -; GFX6-NEXT: s_sub_i32 s24, 64, s22 +; GFX6-NEXT: s_cselect_b64 s[24:25], s[26:27], s[24:25] +; GFX6-NEXT: s_cmp_eq_u32 s16, 0 +; GFX6-NEXT: s_cselect_b64 s[16:17], s[0:1], s[24:25] +; GFX6-NEXT: s_sub_i32 s23, s22, 64 +; GFX6-NEXT: s_sub_i32 s26, 64, s22 ; GFX6-NEXT: s_cmp_lt_u32 s22, 64 -; GFX6-NEXT: s_cselect_b32 s27, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s22, 0 ; GFX6-NEXT: s_cselect_b32 s28, 1, 0 +; GFX6-NEXT: s_lshr_b64 s[24:25], s[8:9], s22 +; GFX6-NEXT: s_lshl_b64 s[26:27], s[10:11], s26 ; GFX6-NEXT: s_lshr_b64 s[0:1], s[10:11], s22 -; GFX6-NEXT: s_lshr_b64 s[22:23], s[8:9], s22 -; GFX6-NEXT: s_lshl_b64 s[24:25], s[10:11], s24 -; GFX6-NEXT: s_or_b64 s[22:23], s[22:23], s[24:25] -; GFX6-NEXT: s_lshr_b64 s[10:11], s[10:11], s26 -; GFX6-NEXT: s_cmp_lg_u32 s27, 0 -; GFX6-NEXT: s_cselect_b64 s[10:11], s[22:23], s[10:11] +; GFX6-NEXT: s_or_b64 s[24:25], s[24:25], s[26:27] +; GFX6-NEXT: s_lshr_b64 s[10:11], s[10:11], s23 ; GFX6-NEXT: s_cmp_lg_u32 s28, 0 +; GFX6-NEXT: s_cselect_b64 s[10:11], s[24:25], s[10:11] +; GFX6-NEXT: s_cmp_eq_u32 s22, 0 ; GFX6-NEXT: s_cselect_b64 s[8:9], s[8:9], s[10:11] -; GFX6-NEXT: s_cmp_lg_u32 s27, 0 +; GFX6-NEXT: s_cmp_lg_u32 s28, 0 ; GFX6-NEXT: s_cselect_b64 s[10:11], s[0:1], 0 ; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9] ; GFX6-NEXT: s_or_b64 s[2:3], s[16:17], s[10:11] @@ -7365,35 +7321,31 @@ ; GFX6-NEXT: s_sub_i32 s9, s10, 64 ; GFX6-NEXT: s_sub_i32 s11, 64, s10 ; GFX6-NEXT: s_cmp_lt_u32 s10, 64 -; GFX6-NEXT: s_cselect_b32 s20, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s10, 0 -; GFX6-NEXT: s_cselect_b32 s21, 1, 0 -; GFX6-NEXT: s_lshl_b64 s[6:7], s[16:17], s10 +; GFX6-NEXT: s_cselect_b32 s22, 1, 0 ; GFX6-NEXT: s_lshr_b64 s[18:19], s[16:17], s11 -; GFX6-NEXT: s_lshl_b64 s[10:11], s[4:5], s10 -; GFX6-NEXT: s_or_b64 s[10:11], s[18:19], s[10:11] +; GFX6-NEXT: s_lshl_b64 s[20:21], s[4:5], s10 +; GFX6-NEXT: s_lshl_b64 s[6:7], s[16:17], s10 +; GFX6-NEXT: s_or_b64 s[18:19], s[18:19], s[20:21] ; GFX6-NEXT: s_lshl_b64 s[16:17], s[16:17], s9 -; GFX6-NEXT: s_cmp_lg_u32 s20, 0 +; GFX6-NEXT: s_cmp_lg_u32 s22, 0 ; GFX6-NEXT: s_cselect_b64 s[6:7], s[6:7], 0 -; GFX6-NEXT: s_cselect_b64 s[10:11], s[10:11], s[16:17] -; GFX6-NEXT: s_cmp_lg_u32 s21, 0 -; GFX6-NEXT: s_cselect_b64 s[10:11], s[4:5], s[10:11] -; GFX6-NEXT: s_sub_i32 s18, s8, 64 -; GFX6-NEXT: s_sub_i32 s16, 64, s8 +; GFX6-NEXT: s_cselect_b64 s[16:17], s[18:19], s[16:17] +; GFX6-NEXT: s_cmp_eq_u32 s10, 0 +; GFX6-NEXT: s_cselect_b64 s[10:11], s[4:5], s[16:17] +; GFX6-NEXT: s_sub_i32 s9, s8, 64 +; GFX6-NEXT: s_sub_i32 s18, 64, s8 ; GFX6-NEXT: s_cmp_lt_u32 s8, 64 -; GFX6-NEXT: s_cselect_b32 s19, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s8, 0 ; GFX6-NEXT: s_cselect_b32 s20, 1, 0 +; GFX6-NEXT: s_lshr_b64 s[16:17], s[12:13], s8 +; GFX6-NEXT: s_lshl_b64 s[18:19], s[14:15], s18 ; GFX6-NEXT: s_lshr_b64 s[4:5], s[14:15], s8 -; GFX6-NEXT: s_lshr_b64 s[8:9], s[12:13], s8 -; GFX6-NEXT: s_lshl_b64 s[16:17], s[14:15], s16 -; GFX6-NEXT: s_or_b64 s[8:9], s[8:9], s[16:17] -; GFX6-NEXT: s_lshr_b64 s[14:15], s[14:15], s18 -; GFX6-NEXT: s_cmp_lg_u32 s19, 0 -; GFX6-NEXT: s_cselect_b64 s[8:9], s[8:9], s[14:15] +; GFX6-NEXT: s_or_b64 s[16:17], s[16:17], s[18:19] +; GFX6-NEXT: s_lshr_b64 s[14:15], s[14:15], s9 +; GFX6-NEXT: s_cmp_lg_u32 s20, 0 +; GFX6-NEXT: s_cselect_b64 s[14:15], s[16:17], s[14:15] +; GFX6-NEXT: s_cmp_eq_u32 s8, 0 +; GFX6-NEXT: s_cselect_b64 s[8:9], s[12:13], s[14:15] ; GFX6-NEXT: s_cmp_lg_u32 s20, 0 -; GFX6-NEXT: s_cselect_b64 s[8:9], s[12:13], s[8:9] -; GFX6-NEXT: s_cmp_lg_u32 s19, 0 ; GFX6-NEXT: s_cselect_b64 s[12:13], s[4:5], 0 ; GFX6-NEXT: s_or_b64 s[4:5], s[6:7], s[8:9] ; GFX6-NEXT: s_or_b64 s[6:7], s[10:11], s[12:13] @@ -7410,38 +7362,34 @@ ; GFX8-NEXT: s_lshr_b32 s0, s1, 31 ; GFX8-NEXT: s_mov_b32 s1, s19 ; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] -; GFX8-NEXT: s_sub_i32 s23, s16, 64 -; GFX8-NEXT: s_sub_i32 s17, 64, s16 +; GFX8-NEXT: s_sub_i32 s17, s16, 64 +; GFX8-NEXT: s_sub_i32 s23, 64, s16 ; GFX8-NEXT: s_cmp_lt_u32 s16, 64 -; GFX8-NEXT: s_cselect_b32 s28, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s16, 0 -; GFX8-NEXT: s_cselect_b32 s29, 1, 0 +; GFX8-NEXT: s_cselect_b32 s30, 1, 0 +; GFX8-NEXT: s_lshr_b64 s[26:27], s[24:25], s23 +; GFX8-NEXT: s_lshl_b64 s[28:29], s[0:1], s16 ; GFX8-NEXT: s_lshl_b64 s[2:3], s[24:25], s16 -; GFX8-NEXT: s_lshr_b64 s[26:27], s[24:25], s17 -; GFX8-NEXT: s_lshl_b64 s[16:17], s[0:1], s16 -; GFX8-NEXT: s_or_b64 s[16:17], s[26:27], s[16:17] -; GFX8-NEXT: s_lshl_b64 s[24:25], s[24:25], s23 -; GFX8-NEXT: s_cmp_lg_u32 s28, 0 +; GFX8-NEXT: s_or_b64 s[26:27], s[26:27], s[28:29] +; GFX8-NEXT: s_lshl_b64 s[24:25], s[24:25], s17 +; GFX8-NEXT: s_cmp_lg_u32 s30, 0 ; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 -; GFX8-NEXT: s_cselect_b64 s[16:17], s[16:17], s[24:25] -; GFX8-NEXT: s_cmp_lg_u32 s29, 0 -; GFX8-NEXT: s_cselect_b64 s[16:17], s[0:1], s[16:17] -; GFX8-NEXT: s_sub_i32 s26, s22, 64 -; GFX8-NEXT: s_sub_i32 s24, 64, s22 +; GFX8-NEXT: s_cselect_b64 s[24:25], s[26:27], s[24:25] +; GFX8-NEXT: s_cmp_eq_u32 s16, 0 +; GFX8-NEXT: s_cselect_b64 s[16:17], s[0:1], s[24:25] +; GFX8-NEXT: s_sub_i32 s23, s22, 64 +; GFX8-NEXT: s_sub_i32 s26, 64, s22 ; GFX8-NEXT: s_cmp_lt_u32 s22, 64 -; GFX8-NEXT: s_cselect_b32 s27, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s22, 0 ; GFX8-NEXT: s_cselect_b32 s28, 1, 0 +; GFX8-NEXT: s_lshr_b64 s[24:25], s[8:9], s22 +; GFX8-NEXT: s_lshl_b64 s[26:27], s[10:11], s26 ; GFX8-NEXT: s_lshr_b64 s[0:1], s[10:11], s22 -; GFX8-NEXT: s_lshr_b64 s[22:23], s[8:9], s22 -; GFX8-NEXT: s_lshl_b64 s[24:25], s[10:11], s24 -; GFX8-NEXT: s_or_b64 s[22:23], s[22:23], s[24:25] -; GFX8-NEXT: s_lshr_b64 s[10:11], s[10:11], s26 -; GFX8-NEXT: s_cmp_lg_u32 s27, 0 -; GFX8-NEXT: s_cselect_b64 s[10:11], s[22:23], s[10:11] +; GFX8-NEXT: s_or_b64 s[24:25], s[24:25], s[26:27] +; GFX8-NEXT: s_lshr_b64 s[10:11], s[10:11], s23 ; GFX8-NEXT: s_cmp_lg_u32 s28, 0 +; GFX8-NEXT: s_cselect_b64 s[10:11], s[24:25], s[10:11] +; GFX8-NEXT: s_cmp_eq_u32 s22, 0 ; GFX8-NEXT: s_cselect_b64 s[8:9], s[8:9], s[10:11] -; GFX8-NEXT: s_cmp_lg_u32 s27, 0 +; GFX8-NEXT: s_cmp_lg_u32 s28, 0 ; GFX8-NEXT: s_cselect_b64 s[10:11], s[0:1], 0 ; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9] ; GFX8-NEXT: s_or_b64 s[2:3], s[16:17], s[10:11] @@ -7454,35 +7402,31 @@ ; GFX8-NEXT: s_sub_i32 s9, s10, 64 ; GFX8-NEXT: s_sub_i32 s11, 64, s10 ; GFX8-NEXT: s_cmp_lt_u32 s10, 64 -; GFX8-NEXT: s_cselect_b32 s20, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s10, 0 -; GFX8-NEXT: s_cselect_b32 s21, 1, 0 -; GFX8-NEXT: s_lshl_b64 s[6:7], s[16:17], s10 +; GFX8-NEXT: s_cselect_b32 s22, 1, 0 ; GFX8-NEXT: s_lshr_b64 s[18:19], s[16:17], s11 -; GFX8-NEXT: s_lshl_b64 s[10:11], s[4:5], s10 -; GFX8-NEXT: s_or_b64 s[10:11], s[18:19], s[10:11] +; GFX8-NEXT: s_lshl_b64 s[20:21], s[4:5], s10 +; GFX8-NEXT: s_lshl_b64 s[6:7], s[16:17], s10 +; GFX8-NEXT: s_or_b64 s[18:19], s[18:19], s[20:21] ; GFX8-NEXT: s_lshl_b64 s[16:17], s[16:17], s9 -; GFX8-NEXT: s_cmp_lg_u32 s20, 0 +; GFX8-NEXT: s_cmp_lg_u32 s22, 0 ; GFX8-NEXT: s_cselect_b64 s[6:7], s[6:7], 0 -; GFX8-NEXT: s_cselect_b64 s[10:11], s[10:11], s[16:17] -; GFX8-NEXT: s_cmp_lg_u32 s21, 0 -; GFX8-NEXT: s_cselect_b64 s[10:11], s[4:5], s[10:11] -; GFX8-NEXT: s_sub_i32 s18, s8, 64 -; GFX8-NEXT: s_sub_i32 s16, 64, s8 +; GFX8-NEXT: s_cselect_b64 s[16:17], s[18:19], s[16:17] +; GFX8-NEXT: s_cmp_eq_u32 s10, 0 +; GFX8-NEXT: s_cselect_b64 s[10:11], s[4:5], s[16:17] +; GFX8-NEXT: s_sub_i32 s9, s8, 64 +; GFX8-NEXT: s_sub_i32 s18, 64, s8 ; GFX8-NEXT: s_cmp_lt_u32 s8, 64 -; GFX8-NEXT: s_cselect_b32 s19, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s8, 0 ; GFX8-NEXT: s_cselect_b32 s20, 1, 0 +; GFX8-NEXT: s_lshr_b64 s[16:17], s[12:13], s8 +; GFX8-NEXT: s_lshl_b64 s[18:19], s[14:15], s18 ; GFX8-NEXT: s_lshr_b64 s[4:5], s[14:15], s8 -; GFX8-NEXT: s_lshr_b64 s[8:9], s[12:13], s8 -; GFX8-NEXT: s_lshl_b64 s[16:17], s[14:15], s16 -; GFX8-NEXT: s_or_b64 s[8:9], s[8:9], s[16:17] -; GFX8-NEXT: s_lshr_b64 s[14:15], s[14:15], s18 -; GFX8-NEXT: s_cmp_lg_u32 s19, 0 -; GFX8-NEXT: s_cselect_b64 s[8:9], s[8:9], s[14:15] +; GFX8-NEXT: s_or_b64 s[16:17], s[16:17], s[18:19] +; GFX8-NEXT: s_lshr_b64 s[14:15], s[14:15], s9 +; GFX8-NEXT: s_cmp_lg_u32 s20, 0 +; GFX8-NEXT: s_cselect_b64 s[14:15], s[16:17], s[14:15] +; GFX8-NEXT: s_cmp_eq_u32 s8, 0 +; GFX8-NEXT: s_cselect_b64 s[8:9], s[12:13], s[14:15] ; GFX8-NEXT: s_cmp_lg_u32 s20, 0 -; GFX8-NEXT: s_cselect_b64 s[8:9], s[12:13], s[8:9] -; GFX8-NEXT: s_cmp_lg_u32 s19, 0 ; GFX8-NEXT: s_cselect_b64 s[12:13], s[4:5], 0 ; GFX8-NEXT: s_or_b64 s[4:5], s[6:7], s[8:9] ; GFX8-NEXT: s_or_b64 s[6:7], s[10:11], s[12:13] @@ -7499,38 +7443,34 @@ ; GFX9-NEXT: s_lshr_b32 s0, s1, 31 ; GFX9-NEXT: s_mov_b32 s1, s19 ; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] -; GFX9-NEXT: s_sub_i32 s23, s16, 64 -; GFX9-NEXT: s_sub_i32 s17, 64, s16 +; GFX9-NEXT: s_sub_i32 s17, s16, 64 +; GFX9-NEXT: s_sub_i32 s23, 64, s16 ; GFX9-NEXT: s_cmp_lt_u32 s16, 64 -; GFX9-NEXT: s_cselect_b32 s28, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s16, 0 -; GFX9-NEXT: s_cselect_b32 s29, 1, 0 +; GFX9-NEXT: s_cselect_b32 s30, 1, 0 +; GFX9-NEXT: s_lshr_b64 s[26:27], s[24:25], s23 +; GFX9-NEXT: s_lshl_b64 s[28:29], s[0:1], s16 ; GFX9-NEXT: s_lshl_b64 s[2:3], s[24:25], s16 -; GFX9-NEXT: s_lshr_b64 s[26:27], s[24:25], s17 -; GFX9-NEXT: s_lshl_b64 s[16:17], s[0:1], s16 -; GFX9-NEXT: s_or_b64 s[16:17], s[26:27], s[16:17] -; GFX9-NEXT: s_lshl_b64 s[24:25], s[24:25], s23 -; GFX9-NEXT: s_cmp_lg_u32 s28, 0 +; GFX9-NEXT: s_or_b64 s[26:27], s[26:27], s[28:29] +; GFX9-NEXT: s_lshl_b64 s[24:25], s[24:25], s17 +; GFX9-NEXT: s_cmp_lg_u32 s30, 0 ; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 -; GFX9-NEXT: s_cselect_b64 s[16:17], s[16:17], s[24:25] -; GFX9-NEXT: s_cmp_lg_u32 s29, 0 -; GFX9-NEXT: s_cselect_b64 s[16:17], s[0:1], s[16:17] -; GFX9-NEXT: s_sub_i32 s26, s22, 64 -; GFX9-NEXT: s_sub_i32 s24, 64, s22 +; GFX9-NEXT: s_cselect_b64 s[24:25], s[26:27], s[24:25] +; GFX9-NEXT: s_cmp_eq_u32 s16, 0 +; GFX9-NEXT: s_cselect_b64 s[16:17], s[0:1], s[24:25] +; GFX9-NEXT: s_sub_i32 s23, s22, 64 +; GFX9-NEXT: s_sub_i32 s26, 64, s22 ; GFX9-NEXT: s_cmp_lt_u32 s22, 64 -; GFX9-NEXT: s_cselect_b32 s27, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s22, 0 ; GFX9-NEXT: s_cselect_b32 s28, 1, 0 +; GFX9-NEXT: s_lshr_b64 s[24:25], s[8:9], s22 +; GFX9-NEXT: s_lshl_b64 s[26:27], s[10:11], s26 ; GFX9-NEXT: s_lshr_b64 s[0:1], s[10:11], s22 -; GFX9-NEXT: s_lshr_b64 s[22:23], s[8:9], s22 -; GFX9-NEXT: s_lshl_b64 s[24:25], s[10:11], s24 -; GFX9-NEXT: s_or_b64 s[22:23], s[22:23], s[24:25] -; GFX9-NEXT: s_lshr_b64 s[10:11], s[10:11], s26 -; GFX9-NEXT: s_cmp_lg_u32 s27, 0 -; GFX9-NEXT: s_cselect_b64 s[10:11], s[22:23], s[10:11] +; GFX9-NEXT: s_or_b64 s[24:25], s[24:25], s[26:27] +; GFX9-NEXT: s_lshr_b64 s[10:11], s[10:11], s23 ; GFX9-NEXT: s_cmp_lg_u32 s28, 0 +; GFX9-NEXT: s_cselect_b64 s[10:11], s[24:25], s[10:11] +; GFX9-NEXT: s_cmp_eq_u32 s22, 0 ; GFX9-NEXT: s_cselect_b64 s[8:9], s[8:9], s[10:11] -; GFX9-NEXT: s_cmp_lg_u32 s27, 0 +; GFX9-NEXT: s_cmp_lg_u32 s28, 0 ; GFX9-NEXT: s_cselect_b64 s[10:11], s[0:1], 0 ; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9] ; GFX9-NEXT: s_or_b64 s[2:3], s[16:17], s[10:11] @@ -7543,35 +7483,31 @@ ; GFX9-NEXT: s_sub_i32 s9, s10, 64 ; GFX9-NEXT: s_sub_i32 s11, 64, s10 ; GFX9-NEXT: s_cmp_lt_u32 s10, 64 -; GFX9-NEXT: s_cselect_b32 s20, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s10, 0 -; GFX9-NEXT: s_cselect_b32 s21, 1, 0 -; GFX9-NEXT: s_lshl_b64 s[6:7], s[16:17], s10 +; GFX9-NEXT: s_cselect_b32 s22, 1, 0 ; GFX9-NEXT: s_lshr_b64 s[18:19], s[16:17], s11 -; GFX9-NEXT: s_lshl_b64 s[10:11], s[4:5], s10 -; GFX9-NEXT: s_or_b64 s[10:11], s[18:19], s[10:11] +; GFX9-NEXT: s_lshl_b64 s[20:21], s[4:5], s10 +; GFX9-NEXT: s_lshl_b64 s[6:7], s[16:17], s10 +; GFX9-NEXT: s_or_b64 s[18:19], s[18:19], s[20:21] ; GFX9-NEXT: s_lshl_b64 s[16:17], s[16:17], s9 -; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cmp_lg_u32 s22, 0 ; GFX9-NEXT: s_cselect_b64 s[6:7], s[6:7], 0 -; GFX9-NEXT: s_cselect_b64 s[10:11], s[10:11], s[16:17] -; GFX9-NEXT: s_cmp_lg_u32 s21, 0 -; GFX9-NEXT: s_cselect_b64 s[10:11], s[4:5], s[10:11] -; GFX9-NEXT: s_sub_i32 s18, s8, 64 -; GFX9-NEXT: s_sub_i32 s16, 64, s8 +; GFX9-NEXT: s_cselect_b64 s[16:17], s[18:19], s[16:17] +; GFX9-NEXT: s_cmp_eq_u32 s10, 0 +; GFX9-NEXT: s_cselect_b64 s[10:11], s[4:5], s[16:17] +; GFX9-NEXT: s_sub_i32 s9, s8, 64 +; GFX9-NEXT: s_sub_i32 s18, 64, s8 ; GFX9-NEXT: s_cmp_lt_u32 s8, 64 -; GFX9-NEXT: s_cselect_b32 s19, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s8, 0 ; GFX9-NEXT: s_cselect_b32 s20, 1, 0 +; GFX9-NEXT: s_lshr_b64 s[16:17], s[12:13], s8 +; GFX9-NEXT: s_lshl_b64 s[18:19], s[14:15], s18 ; GFX9-NEXT: s_lshr_b64 s[4:5], s[14:15], s8 -; GFX9-NEXT: s_lshr_b64 s[8:9], s[12:13], s8 -; GFX9-NEXT: s_lshl_b64 s[16:17], s[14:15], s16 -; GFX9-NEXT: s_or_b64 s[8:9], s[8:9], s[16:17] -; GFX9-NEXT: s_lshr_b64 s[14:15], s[14:15], s18 -; GFX9-NEXT: s_cmp_lg_u32 s19, 0 -; GFX9-NEXT: s_cselect_b64 s[8:9], s[8:9], s[14:15] +; GFX9-NEXT: s_or_b64 s[16:17], s[16:17], s[18:19] +; GFX9-NEXT: s_lshr_b64 s[14:15], s[14:15], s9 +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cselect_b64 s[14:15], s[16:17], s[14:15] +; GFX9-NEXT: s_cmp_eq_u32 s8, 0 +; GFX9-NEXT: s_cselect_b64 s[8:9], s[12:13], s[14:15] ; GFX9-NEXT: s_cmp_lg_u32 s20, 0 -; GFX9-NEXT: s_cselect_b64 s[8:9], s[12:13], s[8:9] -; GFX9-NEXT: s_cmp_lg_u32 s19, 0 ; GFX9-NEXT: s_cselect_b64 s[12:13], s[4:5], 0 ; GFX9-NEXT: s_or_b64 s[4:5], s[6:7], s[8:9] ; GFX9-NEXT: s_or_b64 s[6:7], s[10:11], s[12:13] @@ -7588,81 +7524,73 @@ ; GFX10-NEXT: s_mov_b32 s25, s19 ; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 ; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[24:25] -; GFX10-NEXT: s_sub_i32 s23, s16, 64 -; GFX10-NEXT: s_sub_i32 s17, 64, s16 +; GFX10-NEXT: s_sub_i32 s17, s16, 64 +; GFX10-NEXT: s_sub_i32 s23, 64, s16 ; GFX10-NEXT: s_cmp_lt_u32 s16, 64 -; GFX10-NEXT: s_cselect_b32 s28, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s16, 0 -; GFX10-NEXT: s_cselect_b32 s29, 1, 0 -; GFX10-NEXT: s_lshr_b64 s[24:25], s[0:1], s17 +; GFX10-NEXT: s_cselect_b32 s30, 1, 0 +; GFX10-NEXT: s_lshr_b64 s[24:25], s[0:1], s23 ; GFX10-NEXT: s_lshl_b64 s[26:27], s[2:3], s16 -; GFX10-NEXT: s_lshl_b64 s[16:17], s[0:1], s16 +; GFX10-NEXT: s_lshl_b64 s[28:29], s[0:1], s16 ; GFX10-NEXT: s_or_b64 s[24:25], s[24:25], s[26:27] -; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s23 -; GFX10-NEXT: s_cmp_lg_u32 s28, 0 -; GFX10-NEXT: s_cselect_b64 s[16:17], s[16:17], 0 +; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s17 +; GFX10-NEXT: s_cmp_lg_u32 s30, 0 +; GFX10-NEXT: s_cselect_b64 s[26:27], s[28:29], 0 ; GFX10-NEXT: s_cselect_b64 s[0:1], s[24:25], s[0:1] -; GFX10-NEXT: s_cmp_lg_u32 s29, 0 +; GFX10-NEXT: s_cmp_eq_u32 s16, 0 ; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] -; GFX10-NEXT: s_sub_i32 s26, s22, 64 -; GFX10-NEXT: s_sub_i32 s23, 64, s22 +; GFX10-NEXT: s_sub_i32 s23, s22, 64 +; GFX10-NEXT: s_sub_i32 s16, 64, s22 ; GFX10-NEXT: s_cmp_lt_u32 s22, 64 -; GFX10-NEXT: s_cselect_b32 s27, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s22, 0 ; GFX10-NEXT: s_cselect_b32 s28, 1, 0 ; GFX10-NEXT: s_lshr_b64 s[0:1], s[8:9], s22 -; GFX10-NEXT: s_lshl_b64 s[24:25], s[10:11], s23 -; GFX10-NEXT: s_lshr_b64 s[22:23], s[10:11], s22 -; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[24:25] -; GFX10-NEXT: s_lshr_b64 s[10:11], s[10:11], s26 -; GFX10-NEXT: s_cmp_lg_u32 s27, 0 -; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[10:11] +; GFX10-NEXT: s_lshl_b64 s[16:17], s[10:11], s16 +; GFX10-NEXT: s_lshr_b64 s[24:25], s[10:11], s22 +; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[16:17] +; GFX10-NEXT: s_lshr_b64 s[10:11], s[10:11], s23 ; GFX10-NEXT: s_cmp_lg_u32 s28, 0 +; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[10:11] +; GFX10-NEXT: s_cmp_eq_u32 s22, 0 ; GFX10-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1] -; GFX10-NEXT: s_cmp_lg_u32 s27, 0 -; GFX10-NEXT: s_cselect_b64 s[8:9], s[22:23], 0 +; GFX10-NEXT: s_cmp_lg_u32 s28, 0 +; GFX10-NEXT: s_cselect_b64 s[8:9], s[24:25], 0 ; GFX10-NEXT: s_andn2_b64 s[10:11], s[18:19], s[20:21] ; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] ; GFX10-NEXT: s_and_b64 s[8:9], s[20:21], s[18:19] ; GFX10-NEXT: s_lshl_b64 s[6:7], s[6:7], 1 ; GFX10-NEXT: s_lshr_b32 s18, s5, 31 -; GFX10-NEXT: s_or_b64 s[0:1], s[16:17], s[0:1] +; GFX10-NEXT: s_or_b64 s[0:1], s[26:27], s[0:1] ; GFX10-NEXT: s_lshl_b64 s[4:5], s[4:5], 1 ; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[18:19] ; GFX10-NEXT: s_sub_i32 s9, s10, 64 ; GFX10-NEXT: s_sub_i32 s11, 64, s10 ; GFX10-NEXT: s_cmp_lt_u32 s10, 64 -; GFX10-NEXT: s_cselect_b32 s20, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s10, 0 -; GFX10-NEXT: s_cselect_b32 s21, 1, 0 +; GFX10-NEXT: s_cselect_b32 s22, 1, 0 ; GFX10-NEXT: s_lshr_b64 s[16:17], s[4:5], s11 ; GFX10-NEXT: s_lshl_b64 s[18:19], s[6:7], s10 -; GFX10-NEXT: s_lshl_b64 s[10:11], s[4:5], s10 +; GFX10-NEXT: s_lshl_b64 s[20:21], s[4:5], s10 ; GFX10-NEXT: s_or_b64 s[16:17], s[16:17], s[18:19] ; GFX10-NEXT: s_lshl_b64 s[4:5], s[4:5], s9 -; GFX10-NEXT: s_cmp_lg_u32 s20, 0 -; GFX10-NEXT: s_cselect_b64 s[10:11], s[10:11], 0 +; GFX10-NEXT: s_cmp_lg_u32 s22, 0 +; GFX10-NEXT: s_cselect_b64 s[18:19], s[20:21], 0 ; GFX10-NEXT: s_cselect_b64 s[4:5], s[16:17], s[4:5] -; GFX10-NEXT: s_cmp_lg_u32 s21, 0 +; GFX10-NEXT: s_cmp_eq_u32 s10, 0 ; GFX10-NEXT: s_cselect_b64 s[6:7], s[6:7], s[4:5] -; GFX10-NEXT: s_sub_i32 s18, s8, 64 -; GFX10-NEXT: s_sub_i32 s9, 64, s8 +; GFX10-NEXT: s_sub_i32 s9, s8, 64 +; GFX10-NEXT: s_sub_i32 s10, 64, s8 ; GFX10-NEXT: s_cmp_lt_u32 s8, 64 -; GFX10-NEXT: s_cselect_b32 s19, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s8, 0 ; GFX10-NEXT: s_cselect_b32 s20, 1, 0 ; GFX10-NEXT: s_lshr_b64 s[4:5], s[12:13], s8 -; GFX10-NEXT: s_lshl_b64 s[16:17], s[14:15], s9 -; GFX10-NEXT: s_lshr_b64 s[8:9], s[14:15], s8 -; GFX10-NEXT: s_or_b64 s[4:5], s[4:5], s[16:17] -; GFX10-NEXT: s_lshr_b64 s[14:15], s[14:15], s18 -; GFX10-NEXT: s_cmp_lg_u32 s19, 0 -; GFX10-NEXT: s_cselect_b64 s[4:5], s[4:5], s[14:15] +; GFX10-NEXT: s_lshl_b64 s[10:11], s[14:15], s10 +; GFX10-NEXT: s_lshr_b64 s[16:17], s[14:15], s8 +; GFX10-NEXT: s_or_b64 s[4:5], s[4:5], s[10:11] +; GFX10-NEXT: s_lshr_b64 s[10:11], s[14:15], s9 ; GFX10-NEXT: s_cmp_lg_u32 s20, 0 +; GFX10-NEXT: s_cselect_b64 s[4:5], s[4:5], s[10:11] +; GFX10-NEXT: s_cmp_eq_u32 s8, 0 ; GFX10-NEXT: s_cselect_b64 s[4:5], s[12:13], s[4:5] -; GFX10-NEXT: s_cmp_lg_u32 s19, 0 -; GFX10-NEXT: s_cselect_b64 s[8:9], s[8:9], 0 -; GFX10-NEXT: s_or_b64 s[4:5], s[10:11], s[4:5] +; GFX10-NEXT: s_cmp_lg_u32 s20, 0 +; GFX10-NEXT: s_cselect_b64 s[8:9], s[16:17], 0 +; GFX10-NEXT: s_or_b64 s[4:5], s[18:19], s[4:5] ; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] ; GFX10-NEXT: ; return to shader part epilog ; @@ -7677,81 +7605,73 @@ ; GFX11-NEXT: s_mov_b32 s25, s19 ; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 ; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[24:25] -; GFX11-NEXT: s_sub_i32 s23, s16, 64 -; GFX11-NEXT: s_sub_i32 s17, 64, s16 +; GFX11-NEXT: s_sub_i32 s17, s16, 64 +; GFX11-NEXT: s_sub_i32 s23, 64, s16 ; GFX11-NEXT: s_cmp_lt_u32 s16, 64 -; GFX11-NEXT: s_cselect_b32 s28, 1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s16, 0 -; GFX11-NEXT: s_cselect_b32 s29, 1, 0 -; GFX11-NEXT: s_lshr_b64 s[24:25], s[0:1], s17 +; GFX11-NEXT: s_cselect_b32 s30, 1, 0 +; GFX11-NEXT: s_lshr_b64 s[24:25], s[0:1], s23 ; GFX11-NEXT: s_lshl_b64 s[26:27], s[2:3], s16 -; GFX11-NEXT: s_lshl_b64 s[16:17], s[0:1], s16 +; GFX11-NEXT: s_lshl_b64 s[28:29], s[0:1], s16 ; GFX11-NEXT: s_or_b64 s[24:25], s[24:25], s[26:27] -; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s23 -; GFX11-NEXT: s_cmp_lg_u32 s28, 0 -; GFX11-NEXT: s_cselect_b64 s[16:17], s[16:17], 0 +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s17 +; GFX11-NEXT: s_cmp_lg_u32 s30, 0 +; GFX11-NEXT: s_cselect_b64 s[26:27], s[28:29], 0 ; GFX11-NEXT: s_cselect_b64 s[0:1], s[24:25], s[0:1] -; GFX11-NEXT: s_cmp_lg_u32 s29, 0 +; GFX11-NEXT: s_cmp_eq_u32 s16, 0 ; GFX11-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] -; GFX11-NEXT: s_sub_i32 s26, s22, 64 -; GFX11-NEXT: s_sub_i32 s23, 64, s22 +; GFX11-NEXT: s_sub_i32 s23, s22, 64 +; GFX11-NEXT: s_sub_i32 s16, 64, s22 ; GFX11-NEXT: s_cmp_lt_u32 s22, 64 -; GFX11-NEXT: s_cselect_b32 s27, 1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s22, 0 ; GFX11-NEXT: s_cselect_b32 s28, 1, 0 ; GFX11-NEXT: s_lshr_b64 s[0:1], s[8:9], s22 -; GFX11-NEXT: s_lshl_b64 s[24:25], s[10:11], s23 -; GFX11-NEXT: s_lshr_b64 s[22:23], s[10:11], s22 -; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[24:25] -; GFX11-NEXT: s_lshr_b64 s[10:11], s[10:11], s26 -; GFX11-NEXT: s_cmp_lg_u32 s27, 0 -; GFX11-NEXT: s_cselect_b64 s[0:1], s[0:1], s[10:11] +; GFX11-NEXT: s_lshl_b64 s[16:17], s[10:11], s16 +; GFX11-NEXT: s_lshr_b64 s[24:25], s[10:11], s22 +; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[16:17] +; GFX11-NEXT: s_lshr_b64 s[10:11], s[10:11], s23 ; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_cselect_b64 s[0:1], s[0:1], s[10:11] +; GFX11-NEXT: s_cmp_eq_u32 s22, 0 ; GFX11-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1] -; GFX11-NEXT: s_cmp_lg_u32 s27, 0 -; GFX11-NEXT: s_cselect_b64 s[8:9], s[22:23], 0 +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_cselect_b64 s[8:9], s[24:25], 0 ; GFX11-NEXT: s_and_not1_b64 s[10:11], s[18:19], s[20:21] ; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] ; GFX11-NEXT: s_and_b64 s[8:9], s[20:21], s[18:19] ; GFX11-NEXT: s_lshl_b64 s[6:7], s[6:7], 1 ; GFX11-NEXT: s_lshr_b32 s18, s5, 31 -; GFX11-NEXT: s_or_b64 s[0:1], s[16:17], s[0:1] +; GFX11-NEXT: s_or_b64 s[0:1], s[26:27], s[0:1] ; GFX11-NEXT: s_lshl_b64 s[4:5], s[4:5], 1 ; GFX11-NEXT: s_or_b64 s[6:7], s[6:7], s[18:19] ; GFX11-NEXT: s_sub_i32 s9, s10, 64 ; GFX11-NEXT: s_sub_i32 s11, 64, s10 ; GFX11-NEXT: s_cmp_lt_u32 s10, 64 -; GFX11-NEXT: s_cselect_b32 s20, 1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s10, 0 -; GFX11-NEXT: s_cselect_b32 s21, 1, 0 +; GFX11-NEXT: s_cselect_b32 s22, 1, 0 ; GFX11-NEXT: s_lshr_b64 s[16:17], s[4:5], s11 ; GFX11-NEXT: s_lshl_b64 s[18:19], s[6:7], s10 -; GFX11-NEXT: s_lshl_b64 s[10:11], s[4:5], s10 +; GFX11-NEXT: s_lshl_b64 s[20:21], s[4:5], s10 ; GFX11-NEXT: s_or_b64 s[16:17], s[16:17], s[18:19] ; GFX11-NEXT: s_lshl_b64 s[4:5], s[4:5], s9 -; GFX11-NEXT: s_cmp_lg_u32 s20, 0 -; GFX11-NEXT: s_cselect_b64 s[10:11], s[10:11], 0 +; GFX11-NEXT: s_cmp_lg_u32 s22, 0 +; GFX11-NEXT: s_cselect_b64 s[18:19], s[20:21], 0 ; GFX11-NEXT: s_cselect_b64 s[4:5], s[16:17], s[4:5] -; GFX11-NEXT: s_cmp_lg_u32 s21, 0 +; GFX11-NEXT: s_cmp_eq_u32 s10, 0 ; GFX11-NEXT: s_cselect_b64 s[6:7], s[6:7], s[4:5] -; GFX11-NEXT: s_sub_i32 s18, s8, 64 -; GFX11-NEXT: s_sub_i32 s9, 64, s8 +; GFX11-NEXT: s_sub_i32 s9, s8, 64 +; GFX11-NEXT: s_sub_i32 s10, 64, s8 ; GFX11-NEXT: s_cmp_lt_u32 s8, 64 -; GFX11-NEXT: s_cselect_b32 s19, 1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s8, 0 ; GFX11-NEXT: s_cselect_b32 s20, 1, 0 ; GFX11-NEXT: s_lshr_b64 s[4:5], s[12:13], s8 -; GFX11-NEXT: s_lshl_b64 s[16:17], s[14:15], s9 -; GFX11-NEXT: s_lshr_b64 s[8:9], s[14:15], s8 -; GFX11-NEXT: s_or_b64 s[4:5], s[4:5], s[16:17] -; GFX11-NEXT: s_lshr_b64 s[14:15], s[14:15], s18 -; GFX11-NEXT: s_cmp_lg_u32 s19, 0 -; GFX11-NEXT: s_cselect_b64 s[4:5], s[4:5], s[14:15] +; GFX11-NEXT: s_lshl_b64 s[10:11], s[14:15], s10 +; GFX11-NEXT: s_lshr_b64 s[16:17], s[14:15], s8 +; GFX11-NEXT: s_or_b64 s[4:5], s[4:5], s[10:11] +; GFX11-NEXT: s_lshr_b64 s[10:11], s[14:15], s9 ; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_cselect_b64 s[4:5], s[4:5], s[10:11] +; GFX11-NEXT: s_cmp_eq_u32 s8, 0 ; GFX11-NEXT: s_cselect_b64 s[4:5], s[12:13], s[4:5] -; GFX11-NEXT: s_cmp_lg_u32 s19, 0 -; GFX11-NEXT: s_cselect_b64 s[8:9], s[8:9], 0 -; GFX11-NEXT: s_or_b64 s[4:5], s[10:11], s[4:5] +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_cselect_b64 s[8:9], s[16:17], 0 +; GFX11-NEXT: s_or_b64 s[4:5], s[18:19], s[4:5] ; GFX11-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] ; GFX11-NEXT: ; return to shader part epilog %result = call <2 x i128> @llvm.fshr.v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %amt) Index: llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-icmp.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-icmp.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-icmp.mir @@ -18,12 +18,8 @@ # GCN: [[SGPR6:%[0-9]+]]:sreg_32 = COPY $sgpr6 # GCN: [[SGPR7:%[0-9]+]]:sreg_32 = COPY $sgpr7 # GCN: S_CMP_LG_U32 [[SGPR0]], [[SGPR1]], implicit-def $scc -# GCN-NEXT: [[COND0:%[0-9]+]]:sreg_32 = COPY $scc -# GCN: S_CMP_LG_U32 [[SGPR4]], [[SGPR5]], implicit-def $scc -# GCN-NEXT: [[COND1:%[0-9]+]]:sreg_32 = COPY $scc -# GCN: $scc = COPY [[COND0]] # GCN-NEXT: S_CSELECT_B32 [[SGPR6]], [[SGPR7]], implicit $scc -# GCN: $scc = COPY [[COND1]] +# GCN: S_CMP_LG_U32 [[SGPR4]], [[SGPR5]], implicit-def $scc # GCN-NEXT: S_CSELECT_B32 [[SGPR2]], [[SGPR3]], implicit $scc body: | Index: llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-select.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-select.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-select.mir @@ -18,8 +18,6 @@ ; GCN-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; GCN-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GCN-NEXT: S_CMP_EQ_U32 [[COPY]], [[COPY1]], implicit-def $scc - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $scc - ; GCN-NEXT: $scc = COPY [[COPY4]] ; GCN-NEXT: [[S_CSELECT_B32_:%[0-9]+]]:sreg_32 = S_CSELECT_B32 [[COPY2]], [[COPY3]], implicit $scc ; GCN-NEXT: S_ENDPGM 0, implicit [[S_CSELECT_B32_]] %0:sgpr(s32) = COPY $sgpr0 @@ -49,8 +47,6 @@ ; GCN-NEXT: [[COPY2:%[0-9]+]]:sreg_64 = COPY $sgpr2_sgpr3 ; GCN-NEXT: [[COPY3:%[0-9]+]]:sreg_64 = COPY $sgpr4_sgpr5 ; GCN-NEXT: S_CMP_EQ_U32 [[COPY]], [[COPY1]], implicit-def $scc - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $scc - ; GCN-NEXT: $scc = COPY [[COPY4]] ; GCN-NEXT: [[S_CSELECT_B64_:%[0-9]+]]:sreg_64 = S_CSELECT_B64 [[COPY2]], [[COPY3]], implicit $scc ; GCN-NEXT: S_ENDPGM 0, implicit [[S_CSELECT_B64_]] %0:sgpr(s32) = COPY $sgpr0 @@ -80,8 +76,6 @@ ; GCN-NEXT: [[COPY2:%[0-9]+]]:sreg_64 = COPY $sgpr2_sgpr3 ; GCN-NEXT: [[COPY3:%[0-9]+]]:sreg_64 = COPY $sgpr4_sgpr5 ; GCN-NEXT: S_CMP_EQ_U32 [[COPY]], [[COPY1]], implicit-def $scc - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $scc - ; GCN-NEXT: $scc = COPY [[COPY4]] ; GCN-NEXT: [[S_CSELECT_B64_:%[0-9]+]]:sreg_64 = S_CSELECT_B64 [[COPY2]], [[COPY3]], implicit $scc ; GCN-NEXT: S_ENDPGM 0, implicit [[S_CSELECT_B64_]] %0:sgpr(s32) = COPY $sgpr0 @@ -111,8 +105,6 @@ ; GCN-NEXT: [[COPY2:%[0-9]+]]:sreg_64 = COPY $sgpr2_sgpr3 ; GCN-NEXT: [[COPY3:%[0-9]+]]:sreg_64 = COPY $sgpr4_sgpr5 ; GCN-NEXT: S_CMP_EQ_U32 [[COPY]], [[COPY1]], implicit-def $scc - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $scc - ; GCN-NEXT: $scc = COPY [[COPY4]] ; GCN-NEXT: [[S_CSELECT_B64_:%[0-9]+]]:sreg_64 = S_CSELECT_B64 [[COPY2]], [[COPY3]], implicit $scc ; GCN-NEXT: S_ENDPGM 0, implicit [[S_CSELECT_B64_]] %0:sgpr(s32) = COPY $sgpr0 @@ -142,8 +134,6 @@ ; GCN-NEXT: [[COPY2:%[0-9]+]]:sreg_64 = COPY $sgpr2_sgpr3 ; GCN-NEXT: [[COPY3:%[0-9]+]]:sreg_64 = COPY $sgpr4_sgpr5 ; GCN-NEXT: S_CMP_EQ_U32 [[COPY]], [[COPY1]], implicit-def $scc - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $scc - ; GCN-NEXT: $scc = COPY [[COPY4]] ; GCN-NEXT: [[S_CSELECT_B64_:%[0-9]+]]:sreg_64 = S_CSELECT_B64 [[COPY2]], [[COPY3]], implicit $scc ; GCN-NEXT: S_ENDPGM 0, implicit [[S_CSELECT_B64_]] %0:sgpr(s32) = COPY $sgpr0 @@ -173,8 +163,6 @@ ; GCN-NEXT: [[COPY2:%[0-9]+]]:sreg_64 = COPY $sgpr2_sgpr3 ; GCN-NEXT: [[COPY3:%[0-9]+]]:sreg_64 = COPY $sgpr4_sgpr5 ; GCN-NEXT: S_CMP_EQ_U32 [[COPY]], [[COPY1]], implicit-def $scc - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $scc - ; GCN-NEXT: $scc = COPY [[COPY4]] ; GCN-NEXT: [[S_CSELECT_B64_:%[0-9]+]]:sreg_64 = S_CSELECT_B64 [[COPY2]], [[COPY3]], implicit $scc ; GCN-NEXT: S_ENDPGM 0, implicit [[S_CSELECT_B64_]] %0:sgpr(s32) = COPY $sgpr0 @@ -204,8 +192,6 @@ ; GCN-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; GCN-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GCN-NEXT: S_CMP_EQ_U32 [[COPY2]], [[COPY3]], implicit-def $scc - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $scc - ; GCN-NEXT: $scc = COPY [[COPY4]] ; GCN-NEXT: [[S_CSELECT_B32_:%[0-9]+]]:sreg_32 = S_CSELECT_B32 [[COPY]], [[COPY1]], implicit $scc ; GCN-NEXT: S_ENDPGM 0, implicit [[S_CSELECT_B32_]] %0:sgpr(s32) = COPY $sgpr0 @@ -237,8 +223,6 @@ ; GCN-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; GCN-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3 ; GCN-NEXT: S_CMP_EQ_U32 [[COPY]], [[COPY1]], implicit-def $scc - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $scc - ; GCN-NEXT: $scc = COPY [[COPY4]] ; GCN-NEXT: [[S_CSELECT_B32_:%[0-9]+]]:sreg_32 = S_CSELECT_B32 [[COPY2]], [[COPY3]], implicit $scc ; GCN-NEXT: S_ENDPGM 0, implicit [[S_CSELECT_B32_]] %0:sgpr(s32) = COPY $sgpr0 @@ -550,8 +534,6 @@ ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648 ; GCN-NEXT: [[S_XOR_B32_:%[0-9]+]]:sreg_32 = S_XOR_B32 [[COPY2]], [[S_MOV_B32_]], implicit-def $scc ; GCN-NEXT: S_CMP_EQ_U32 [[COPY]], [[COPY1]], implicit-def $scc - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $scc - ; GCN-NEXT: $scc = COPY [[COPY4]] ; GCN-NEXT: [[S_CSELECT_B32_:%[0-9]+]]:sreg_32 = S_CSELECT_B32 [[S_XOR_B32_]], [[COPY3]], implicit $scc ; GCN-NEXT: S_ENDPGM 0, implicit [[S_CSELECT_B32_]] %0:sgpr(s32) = COPY $sgpr0 @@ -584,8 +566,6 @@ ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648 ; GCN-NEXT: [[S_XOR_B32_:%[0-9]+]]:sreg_32 = S_XOR_B32 [[COPY3]], [[S_MOV_B32_]], implicit-def $scc ; GCN-NEXT: S_CMP_EQ_U32 [[COPY]], [[COPY1]], implicit-def $scc - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $scc - ; GCN-NEXT: $scc = COPY [[COPY4]] ; GCN-NEXT: [[S_CSELECT_B32_:%[0-9]+]]:sreg_32 = S_CSELECT_B32 [[COPY2]], [[S_XOR_B32_]], implicit $scc ; GCN-NEXT: S_ENDPGM 0, implicit [[S_CSELECT_B32_]] %0:sgpr(s32) = COPY $sgpr0 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll @@ -1750,45 +1750,41 @@ ; GCN-LABEL: s_lshr_i65: ; GCN: ; %bb.0: ; GCN-NEXT: s_and_b64 s[4:5], s[2:3], 1 -; GCN-NEXT: s_sub_i32 s10, s3, 64 -; GCN-NEXT: s_sub_i32 s8, 64, s3 +; GCN-NEXT: s_sub_i32 s2, s3, 64 +; GCN-NEXT: s_sub_i32 s10, 64, s3 ; GCN-NEXT: s_cmp_lt_u32 s3, 64 -; GCN-NEXT: s_cselect_b32 s11, 1, 0 -; GCN-NEXT: s_cmp_eq_u32 s3, 0 ; GCN-NEXT: s_cselect_b32 s12, 1, 0 +; GCN-NEXT: s_lshr_b64 s[8:9], s[0:1], s3 +; GCN-NEXT: s_lshl_b64 s[10:11], s[4:5], s10 ; GCN-NEXT: s_lshr_b64 s[6:7], s[4:5], s3 -; GCN-NEXT: s_lshr_b64 s[2:3], s[0:1], s3 -; GCN-NEXT: s_lshl_b64 s[8:9], s[4:5], s8 -; GCN-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] -; GCN-NEXT: s_lshr_b64 s[4:5], s[4:5], s10 -; GCN-NEXT: s_cmp_lg_u32 s11, 0 -; GCN-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] +; GCN-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] +; GCN-NEXT: s_lshr_b64 s[4:5], s[4:5], s2 +; GCN-NEXT: s_cmp_lg_u32 s12, 0 +; GCN-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] +; GCN-NEXT: s_cmp_eq_u32 s3, 0 +; GCN-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5] ; GCN-NEXT: s_cmp_lg_u32 s12, 0 -; GCN-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3] -; GCN-NEXT: s_cmp_lg_u32 s11, 0 ; GCN-NEXT: s_cselect_b64 s[2:3], s[6:7], 0 ; GCN-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_lshr_i65: ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: s_and_b64 s[4:5], s[2:3], 1 -; GFX10PLUS-NEXT: s_sub_i32 s10, s3, 64 -; GFX10PLUS-NEXT: s_sub_i32 s2, 64, s3 +; GFX10PLUS-NEXT: s_sub_i32 s2, s3, 64 +; GFX10PLUS-NEXT: s_sub_i32 s8, 64, s3 ; GFX10PLUS-NEXT: s_cmp_lt_u32 s3, 64 -; GFX10PLUS-NEXT: s_cselect_b32 s11, 1, 0 -; GFX10PLUS-NEXT: s_cmp_eq_u32 s3, 0 ; GFX10PLUS-NEXT: s_cselect_b32 s12, 1, 0 ; GFX10PLUS-NEXT: s_lshr_b64 s[6:7], s[0:1], s3 -; GFX10PLUS-NEXT: s_lshl_b64 s[8:9], s[4:5], s2 -; GFX10PLUS-NEXT: s_lshr_b64 s[2:3], s[4:5], s3 +; GFX10PLUS-NEXT: s_lshl_b64 s[8:9], s[4:5], s8 +; GFX10PLUS-NEXT: s_lshr_b64 s[10:11], s[4:5], s3 ; GFX10PLUS-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] -; GFX10PLUS-NEXT: s_lshr_b64 s[4:5], s[4:5], s10 -; GFX10PLUS-NEXT: s_cmp_lg_u32 s11, 0 -; GFX10PLUS-NEXT: s_cselect_b64 s[4:5], s[6:7], s[4:5] +; GFX10PLUS-NEXT: s_lshr_b64 s[4:5], s[4:5], s2 ; GFX10PLUS-NEXT: s_cmp_lg_u32 s12, 0 +; GFX10PLUS-NEXT: s_cselect_b64 s[4:5], s[6:7], s[4:5] +; GFX10PLUS-NEXT: s_cmp_eq_u32 s3, 0 ; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5] -; GFX10PLUS-NEXT: s_cmp_lg_u32 s11, 0 -; GFX10PLUS-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 +; GFX10PLUS-NEXT: s_cmp_lg_u32 s12, 0 +; GFX10PLUS-NEXT: s_cselect_b64 s[2:3], s[10:11], 0 ; GFX10PLUS-NEXT: ; return to shader part epilog %result = lshr i65 %value, %amount ret i65 %result Index: llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll @@ -1724,8 +1724,6 @@ ; GCN-NEXT: s_sub_i32 s6, 64, s3 ; GCN-NEXT: s_cmp_lt_u32 s3, 64 ; GCN-NEXT: s_cselect_b32 s11, 1, 0 -; GCN-NEXT: s_cmp_eq_u32 s3, 0 -; GCN-NEXT: s_cselect_b32 s12, 1, 0 ; GCN-NEXT: s_lshr_b64 s[6:7], s[0:1], s6 ; GCN-NEXT: s_lshl_b64 s[8:9], s[2:3], s3 ; GCN-NEXT: s_lshl_b64 s[4:5], s[0:1], s3 @@ -1734,7 +1732,7 @@ ; GCN-NEXT: s_cmp_lg_u32 s11, 0 ; GCN-NEXT: s_cselect_b64 s[0:1], s[4:5], 0 ; GCN-NEXT: s_cselect_b64 s[4:5], s[6:7], s[8:9] -; GCN-NEXT: s_cmp_lg_u32 s12, 0 +; GCN-NEXT: s_cmp_eq_u32 s3, 0 ; GCN-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] ; GCN-NEXT: ; return to shader part epilog ; @@ -1744,8 +1742,6 @@ ; GFX10PLUS-NEXT: s_sub_i32 s4, 64, s3 ; GFX10PLUS-NEXT: s_cmp_lt_u32 s3, 64 ; GFX10PLUS-NEXT: s_cselect_b32 s11, 1, 0 -; GFX10PLUS-NEXT: s_cmp_eq_u32 s3, 0 -; GFX10PLUS-NEXT: s_cselect_b32 s12, 1, 0 ; GFX10PLUS-NEXT: s_lshr_b64 s[4:5], s[0:1], s4 ; GFX10PLUS-NEXT: s_lshl_b64 s[6:7], s[2:3], s3 ; GFX10PLUS-NEXT: s_lshl_b64 s[8:9], s[0:1], s3 @@ -1754,7 +1750,7 @@ ; GFX10PLUS-NEXT: s_cmp_lg_u32 s11, 0 ; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], s[8:9], 0 ; GFX10PLUS-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7] -; GFX10PLUS-NEXT: s_cmp_lg_u32 s12, 0 +; GFX10PLUS-NEXT: s_cmp_eq_u32 s3, 0 ; GFX10PLUS-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] ; GFX10PLUS-NEXT: ; return to shader part epilog %result = shl i65 %value, %amount Index: llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll @@ -2661,14 +2661,12 @@ ; GFX6-NEXT: s_and_b32 s3, s3, 0xffff ; GFX6-NEXT: s_cmp_lg_u32 s2, 0 ; GFX6-NEXT: s_addc_u32 s2, s1, s3 -; GFX6-NEXT: s_and_b32 s1, s2, 0xffff -; GFX6-NEXT: s_cmp_lg_u32 s2, s1 -; GFX6-NEXT: s_cselect_b32 s2, 1, 0 ; GFX6-NEXT: s_lshr_b32 s3, s0, 16 ; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 +; GFX6-NEXT: s_and_b32 s1, s2, 0xffff ; GFX6-NEXT: s_or_b32 s0, s0, s3 -; GFX6-NEXT: s_cmp_lg_u32 s2, 0 +; GFX6-NEXT: s_cmp_lg_u32 s2, s1 ; GFX6-NEXT: s_cselect_b64 s[0:1], -1, s[0:1] ; GFX6-NEXT: ; return to shader part epilog ; Index: llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll @@ -2531,14 +2531,12 @@ ; GFX6-NEXT: s_and_b32 s3, s3, 0xffff ; GFX6-NEXT: s_cmp_lg_u32 s2, 0 ; GFX6-NEXT: s_subb_u32 s2, s1, s3 -; GFX6-NEXT: s_and_b32 s1, s2, 0xffff -; GFX6-NEXT: s_cmp_lg_u32 s2, s1 -; GFX6-NEXT: s_cselect_b32 s2, 1, 0 ; GFX6-NEXT: s_lshr_b32 s3, s0, 16 ; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 +; GFX6-NEXT: s_and_b32 s1, s2, 0xffff ; GFX6-NEXT: s_or_b32 s0, s0, s3 -; GFX6-NEXT: s_cmp_lg_u32 s2, 0 +; GFX6-NEXT: s_cmp_lg_u32 s2, s1 ; GFX6-NEXT: s_cselect_b64 s[0:1], 0, s[0:1] ; GFX6-NEXT: ; return to shader part epilog ; Index: llvm/test/CodeGen/AMDGPU/fptrunc.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/fptrunc.ll +++ llvm/test/CodeGen/AMDGPU/fptrunc.ll @@ -229,8 +229,8 @@ ; VI-SAFE-GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; VI-SAFE-GISEL-NEXT: s_cselect_b32 s2, 1, 0 ; VI-SAFE-GISEL-NEXT: s_or_b32 s2, s5, s2 -; VI-SAFE-GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; VI-SAFE-GISEL-NEXT: s_movk_i32 s5, 0x7e00 +; VI-SAFE-GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; VI-SAFE-GISEL-NEXT: s_cselect_b32 s5, s5, 0x7c00 ; VI-SAFE-GISEL-NEXT: s_sub_i32 s7, 1, s4 ; VI-SAFE-GISEL-NEXT: s_lshl_b32 s6, s4, 12