diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -146,30 +146,6 @@ return AMDGPU::INSTRUCTION_LIST_END; } -// Wrapper around isInlineConstant that understands special cases when -// instruction types are replaced during operand folding. -static bool isInlineConstantIfFolded(const SIInstrInfo *TII, - const MachineInstr &UseMI, - unsigned OpNo, - const MachineOperand &OpToFold) { - if (TII->isInlineConstant(UseMI, OpNo, OpToFold)) - return true; - - unsigned Opc = UseMI.getOpcode(); - unsigned NewOpc = macToMad(Opc); - if (NewOpc != AMDGPU::INSTRUCTION_LIST_END) { - // Special case for mac. Since this is replaced with mad when folded into - // src2, we need to check the legality for the final instruction. - int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2); - if (static_cast(OpNo) == Src2Idx) { - const MCInstrDesc &MadDesc = TII->get(NewOpc); - return TII->isInlineConstant(OpToFold, MadDesc.OpInfo[OpNo].OperandType); - } - } - - return false; -} - // TODO: Add heuristic that the frame index might not fit in the addressing mode // immediate offset to avoid materializing in loops. static bool frameIndexMayFold(const SIInstrInfo *TII, @@ -1267,59 +1243,13 @@ } } - bool FoldingImm = OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal(); - if (FoldingImm) { - unsigned NumLiteralUses = 0; - MachineOperand *NonInlineUse = nullptr; - int NonInlineUseOpNo = -1; - - for (auto &Use : - make_early_inc_range(MRI->use_nodbg_operands(Dst.getReg()))) { - MachineInstr *UseMI = Use.getParent(); - unsigned OpNo = UseMI->getOperandNo(&Use); - - // Try to fold any inline immediate uses, and then only fold other - // constants if they have one use. - // - // The legality of the inline immediate must be checked based on the use - // operand, not the defining instruction, because 32-bit instructions - // with 32-bit inline immediate sources may be used to materialize - // constants used in 16-bit operands. - // - // e.g. it is unsafe to fold: - // s_mov_b32 s0, 1.0 // materializes 0x3f800000 - // v_add_f16 v0, v1, s0 // 1.0 f16 inline immediate sees 0x00003c00 - - // Folding immediates with more than one use will increase program size. - // FIXME: This will also reduce register usage, which may be better - // in some cases. A better heuristic is needed. - if (isInlineConstantIfFolded(TII, *UseMI, OpNo, OpToFold)) { - foldOperand(OpToFold, UseMI, OpNo, FoldList, CopiesToReplace); - } else if (frameIndexMayFold(TII, *UseMI, OpNo, OpToFold)) { - foldOperand(OpToFold, UseMI, OpNo, FoldList, CopiesToReplace); - } else { - if (++NumLiteralUses == 1) { - NonInlineUse = &Use; - NonInlineUseOpNo = OpNo; - } - } - } - - if (NumLiteralUses == 1) { - MachineInstr *UseMI = NonInlineUse->getParent(); - foldOperand(OpToFold, UseMI, NonInlineUseOpNo, FoldList, CopiesToReplace); - } - } else { - // Folding register. - SmallVector UsesToProcess; - for (auto &Use : MRI->use_nodbg_operands(Dst.getReg())) - UsesToProcess.push_back(&Use); - for (auto U : UsesToProcess) { - MachineInstr *UseMI = U->getParent(); - - foldOperand(OpToFold, UseMI, UseMI->getOperandNo(U), - FoldList, CopiesToReplace); - } + SmallVector UsesToProcess; + for (auto &Use : MRI->use_nodbg_operands(Dst.getReg())) + UsesToProcess.push_back(&Use); + for (auto U : UsesToProcess) { + MachineInstr *UseMI = U->getParent(); + foldOperand(OpToFold, UseMI, UseMI->getOperandNo(U), FoldList, + CopiesToReplace); } if (CopiesToReplace.empty() && FoldList.empty()) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll @@ -127,9 +127,8 @@ ; GFX8-LABEL: v_add_v2i16_neg_inline_imm_splat: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_movk_i32 s4, 0xffc0 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_add_u16_e32 v1, s4, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, 0xffffffc0 +; GFX8-NEXT: v_add_u16_e32 v1, 0xffc0, v0 ; GFX8-NEXT: v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -209,14 +208,12 @@ ; ; GFX8-LABEL: s_add_v2i16_neg_inline_imm_splat: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b32 s3, 0xffff -; GFX8-NEXT: s_mov_b32 s1, 0xffc0 -; GFX8-NEXT: s_lshr_b32 s2, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, s3 -; GFX8-NEXT: s_add_i32 s0, s0, s1 -; GFX8-NEXT: s_add_i32 s2, s2, s1 -; GFX8-NEXT: s_lshl_b32 s1, s2, 16 -; GFX8-NEXT: s_and_b32 s0, s0, s3 +; GFX8-NEXT: s_lshr_b32 s1, s0, 16 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff +; GFX8-NEXT: s_add_i32 s0, s0, 0xffc0 +; GFX8-NEXT: s_add_i32 s1, s1, 0xffc0 +; GFX8-NEXT: s_lshl_b32 s1, s1, 16 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_or_b32 s0, s1, s0 ; GFX8-NEXT: ; return to shader part epilog ; @@ -243,13 +240,12 @@ ; ; GFX8-LABEL: s_add_v2i16_neg_inline_imm_lo: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b32 s2, 0xffff ; GFX8-NEXT: s_lshr_b32 s1, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, s2 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_add_i32 s0, s0, 0xffc0 ; GFX8-NEXT: s_add_i32 s1, s1, 4 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_and_b32 s0, s0, s2 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_or_b32 s0, s1, s0 ; GFX8-NEXT: ; return to shader part epilog ; @@ -276,13 +272,12 @@ ; ; GFX8-LABEL: s_add_v2i16_neg_inline_imm_hi: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b32 s2, 0xffff ; GFX8-NEXT: s_lshr_b32 s1, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, s2 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_add_i32 s0, s0, 4 ; GFX8-NEXT: s_add_i32 s1, s1, 0xffc0 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_and_b32 s0, s0, s2 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_or_b32 s0, s1, s0 ; GFX8-NEXT: ; return to shader part epilog ; @@ -310,15 +305,14 @@ ; ; GFX8-LABEL: s_add_v2i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b32 s3, 0xffff ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, s3 -; GFX8-NEXT: s_lshr_b32 s4, s1, 16 -; GFX8-NEXT: s_and_b32 s1, s1, s3 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff +; GFX8-NEXT: s_lshr_b32 s3, s1, 16 +; GFX8-NEXT: s_and_b32 s1, s1, 0xffff ; GFX8-NEXT: s_add_i32 s0, s0, s1 -; GFX8-NEXT: s_add_i32 s2, s2, s4 +; GFX8-NEXT: s_add_i32 s2, s2, s3 ; GFX8-NEXT: s_lshl_b32 s1, s2, 16 -; GFX8-NEXT: s_and_b32 s0, s0, s3 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_or_b32 s0, s1, s0 ; GFX8-NEXT: ; return to shader part epilog ; @@ -349,15 +343,14 @@ ; GFX8-LABEL: s_add_v2i16_fneg_lhs: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_xor_b32 s0, s0, 0x80008000 -; GFX8-NEXT: s_mov_b32 s3, 0xffff ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, s3 -; GFX8-NEXT: s_lshr_b32 s4, s1, 16 -; GFX8-NEXT: s_and_b32 s1, s1, s3 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff +; GFX8-NEXT: s_lshr_b32 s3, s1, 16 +; GFX8-NEXT: s_and_b32 s1, s1, 0xffff ; GFX8-NEXT: s_add_i32 s0, s0, s1 -; GFX8-NEXT: s_add_i32 s2, s2, s4 +; GFX8-NEXT: s_add_i32 s2, s2, s3 ; GFX8-NEXT: s_lshl_b32 s1, s2, 16 -; GFX8-NEXT: s_and_b32 s0, s0, s3 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_or_b32 s0, s1, s0 ; GFX8-NEXT: ; return to shader part epilog ; @@ -391,15 +384,14 @@ ; GFX8-LABEL: s_add_v2i16_fneg_rhs: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_xor_b32 s1, s1, 0x80008000 -; GFX8-NEXT: s_mov_b32 s3, 0xffff ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, s3 -; GFX8-NEXT: s_lshr_b32 s4, s1, 16 -; GFX8-NEXT: s_and_b32 s1, s1, s3 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff +; GFX8-NEXT: s_lshr_b32 s3, s1, 16 +; GFX8-NEXT: s_and_b32 s1, s1, 0xffff ; GFX8-NEXT: s_add_i32 s0, s0, s1 -; GFX8-NEXT: s_add_i32 s2, s2, s4 +; GFX8-NEXT: s_add_i32 s2, s2, s3 ; GFX8-NEXT: s_lshl_b32 s1, s2, 16 -; GFX8-NEXT: s_and_b32 s0, s0, s3 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_or_b32 s0, s1, s0 ; GFX8-NEXT: ; return to shader part epilog ; @@ -422,9 +414,8 @@ define amdgpu_ps i32 @s_add_v2i16_fneg_lhs_fneg_rhs(<2 x half> inreg %a, <2 x half> inreg %b) { ; GFX9-LABEL: s_add_v2i16_fneg_lhs_fneg_rhs: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s2, 0x80008000 -; GFX9-NEXT: s_xor_b32 s0, s0, s2 -; GFX9-NEXT: s_xor_b32 s1, s1, s2 +; GFX9-NEXT: s_xor_b32 s0, s0, 0x80008000 +; GFX9-NEXT: s_xor_b32 s1, s1, 0x80008000 ; GFX9-NEXT: s_lshr_b32 s2, s0, 16 ; GFX9-NEXT: s_lshr_b32 s3, s1, 16 ; GFX9-NEXT: s_add_i32 s0, s0, s1 @@ -434,26 +425,23 @@ ; ; GFX8-LABEL: s_add_v2i16_fneg_lhs_fneg_rhs: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b32 s2, 0x80008000 -; GFX8-NEXT: s_xor_b32 s0, s0, s2 -; GFX8-NEXT: s_xor_b32 s1, s1, s2 -; GFX8-NEXT: s_mov_b32 s3, 0xffff +; GFX8-NEXT: s_xor_b32 s0, s0, 0x80008000 +; GFX8-NEXT: s_xor_b32 s1, s1, 0x80008000 ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, s3 -; GFX8-NEXT: s_lshr_b32 s4, s1, 16 -; GFX8-NEXT: s_and_b32 s1, s1, s3 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff +; GFX8-NEXT: s_lshr_b32 s3, s1, 16 +; GFX8-NEXT: s_and_b32 s1, s1, 0xffff ; GFX8-NEXT: s_add_i32 s0, s0, s1 -; GFX8-NEXT: s_add_i32 s2, s2, s4 +; GFX8-NEXT: s_add_i32 s2, s2, s3 ; GFX8-NEXT: s_lshl_b32 s1, s2, 16 -; GFX8-NEXT: s_and_b32 s0, s0, s3 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_or_b32 s0, s1, s0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_add_v2i16_fneg_lhs_fneg_rhs: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s2, 0x80008000 -; GFX10-NEXT: s_xor_b32 s0, s0, s2 -; GFX10-NEXT: s_xor_b32 s1, s1, s2 +; GFX10-NEXT: s_xor_b32 s0, s0, 0x80008000 +; GFX10-NEXT: s_xor_b32 s1, s1, 0x80008000 ; GFX10-NEXT: s_lshr_b32 s2, s0, 16 ; GFX10-NEXT: s_lshr_b32 s3, s1, 16 ; GFX10-NEXT: s_add_i32 s0, s0, s1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll @@ -967,7 +967,7 @@ ; GFX7-LABEL: uaddo_i16_sv: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_mov_b32 s1, 0xffff -; GFX7-NEXT: s_and_b32 s0, s0, s1 +; GFX7-NEXT: s_and_b32 s0, s0, 0xffff ; GFX7-NEXT: v_and_b32_e32 v0, s1, v0 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, s0, v0 ; GFX7-NEXT: v_and_b32_e32 v1, s1, v0 @@ -980,7 +980,7 @@ ; GFX8-LABEL: uaddo_i16_sv: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_mov_b32 s1, 0xffff -; GFX8-NEXT: s_and_b32 s0, s0, s1 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: v_and_b32_e32 v0, s1, v0 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; GFX8-NEXT: v_and_b32_e32 v1, s1, v0 @@ -992,8 +992,7 @@ ; ; GFX9-LABEL: uaddo_i16_sv: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s1, 0xffff -; GFX9-NEXT: s_and_b32 s0, s0, s1 +; GFX9-NEXT: s_and_b32 s0, s0, 0xffff ; GFX9-NEXT: v_add_u32_sdwa v0, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX9-NEXT: v_cmp_ne_u32_sdwa s[0:1], v0, v0 src0_sel:DWORD src1_sel:WORD_0 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll @@ -429,13 +429,12 @@ define amdgpu_ps i32 @s_andn2_v2i16(<2 x i16> inreg %src0, <2 x i16> inreg %src1) { ; GFX6-LABEL: s_andn2_v2i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s1, 0xffff ; GFX6-NEXT: s_lshl_b32 s0, s3, 16 -; GFX6-NEXT: s_and_b32 s2, s2, s1 -; GFX6-NEXT: s_or_b32 s0, s0, s2 -; GFX6-NEXT: s_lshl_b32 s2, s5, 16 -; GFX6-NEXT: s_and_b32 s1, s4, s1 -; GFX6-NEXT: s_or_b32 s1, s2, s1 +; GFX6-NEXT: s_and_b32 s1, s2, 0xffff +; GFX6-NEXT: s_or_b32 s0, s0, s1 +; GFX6-NEXT: s_lshl_b32 s1, s5, 16 +; GFX6-NEXT: s_and_b32 s2, s4, 0xffff +; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: s_xor_b32 s1, s1, -1 ; GFX6-NEXT: s_and_b32 s0, s0, s1 ; GFX6-NEXT: ; return to shader part epilog @@ -458,13 +457,12 @@ define amdgpu_ps i32 @s_andn2_v2i16_commute(<2 x i16> inreg %src0, <2 x i16> inreg %src1) { ; GFX6-LABEL: s_andn2_v2i16_commute: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s1, 0xffff ; GFX6-NEXT: s_lshl_b32 s0, s3, 16 -; GFX6-NEXT: s_and_b32 s2, s2, s1 -; GFX6-NEXT: s_or_b32 s0, s0, s2 -; GFX6-NEXT: s_lshl_b32 s2, s5, 16 -; GFX6-NEXT: s_and_b32 s1, s4, s1 -; GFX6-NEXT: s_or_b32 s1, s2, s1 +; GFX6-NEXT: s_and_b32 s1, s2, 0xffff +; GFX6-NEXT: s_or_b32 s0, s0, s1 +; GFX6-NEXT: s_lshl_b32 s1, s5, 16 +; GFX6-NEXT: s_and_b32 s2, s4, 0xffff +; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: s_xor_b32 s1, s1, -1 ; GFX6-NEXT: s_and_b32 s0, s1, s0 ; GFX6-NEXT: ; return to shader part epilog @@ -487,13 +485,12 @@ define amdgpu_ps { i32, i32 } @s_andn2_v2i16_multi_use(<2 x i16> inreg %src0, <2 x i16> inreg %src1) { ; GFX6-LABEL: s_andn2_v2i16_multi_use: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s1, 0xffff ; GFX6-NEXT: s_lshl_b32 s0, s3, 16 -; GFX6-NEXT: s_and_b32 s2, s2, s1 -; GFX6-NEXT: s_or_b32 s0, s0, s2 -; GFX6-NEXT: s_lshl_b32 s2, s5, 16 -; GFX6-NEXT: s_and_b32 s1, s4, s1 -; GFX6-NEXT: s_or_b32 s1, s2, s1 +; GFX6-NEXT: s_and_b32 s1, s2, 0xffff +; GFX6-NEXT: s_or_b32 s0, s0, s1 +; GFX6-NEXT: s_lshl_b32 s1, s5, 16 +; GFX6-NEXT: s_and_b32 s2, s4, 0xffff +; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: s_xor_b32 s1, s1, -1 ; GFX6-NEXT: s_and_b32 s0, s0, s1 ; GFX6-NEXT: ; return to shader part epilog @@ -522,19 +519,18 @@ define amdgpu_ps { i32, i32 } @s_andn2_v2i16_multi_foldable_use(<2 x i16> inreg %src0, <2 x i16> inreg %src1, <2 x i16> inreg %src2) { ; GFX6-LABEL: s_andn2_v2i16_multi_foldable_use: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s1, 0xffff ; GFX6-NEXT: s_lshl_b32 s0, s3, 16 -; GFX6-NEXT: s_and_b32 s2, s2, s1 -; GFX6-NEXT: s_or_b32 s0, s0, s2 -; GFX6-NEXT: s_lshl_b32 s2, s5, 16 -; GFX6-NEXT: s_and_b32 s3, s4, s1 +; GFX6-NEXT: s_and_b32 s1, s2, 0xffff +; GFX6-NEXT: s_or_b32 s0, s0, s1 +; GFX6-NEXT: s_lshl_b32 s1, s5, 16 +; GFX6-NEXT: s_and_b32 s2, s4, 0xffff +; GFX6-NEXT: s_or_b32 s1, s1, s2 +; GFX6-NEXT: s_lshl_b32 s2, s7, 16 +; GFX6-NEXT: s_and_b32 s3, s6, 0xffff ; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_lshl_b32 s3, s7, 16 -; GFX6-NEXT: s_and_b32 s1, s6, s1 -; GFX6-NEXT: s_or_b32 s1, s3, s1 -; GFX6-NEXT: s_xor_b32 s1, s1, -1 -; GFX6-NEXT: s_and_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s2, s1 +; GFX6-NEXT: s_xor_b32 s2, s2, -1 +; GFX6-NEXT: s_and_b32 s0, s0, s2 +; GFX6-NEXT: s_and_b32 s1, s1, s2 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_andn2_v2i16_multi_foldable_use: @@ -630,18 +626,17 @@ ; GFX6-LABEL: s_andn2_v4i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_lshl_b32 s0, s3, 16 -; GFX6-NEXT: s_mov_b32 s3, 0xffff -; GFX6-NEXT: s_and_b32 s1, s2, s3 +; GFX6-NEXT: s_and_b32 s1, s2, 0xffff ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: s_lshl_b32 s1, s5, 16 -; GFX6-NEXT: s_and_b32 s2, s4, s3 +; GFX6-NEXT: s_and_b32 s2, s4, 0xffff ; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: s_lshl_b32 s2, s7, 16 -; GFX6-NEXT: s_and_b32 s4, s6, s3 -; GFX6-NEXT: s_or_b32 s2, s2, s4 -; GFX6-NEXT: s_lshl_b32 s4, s9, 16 -; GFX6-NEXT: s_and_b32 s3, s8, s3 -; GFX6-NEXT: s_or_b32 s3, s4, s3 +; GFX6-NEXT: s_and_b32 s3, s6, 0xffff +; GFX6-NEXT: s_or_b32 s2, s2, s3 +; GFX6-NEXT: s_lshl_b32 s3, s9, 16 +; GFX6-NEXT: s_and_b32 s4, s8, 0xffff +; GFX6-NEXT: s_or_b32 s3, s3, s4 ; GFX6-NEXT: s_mov_b32 s4, -1 ; GFX6-NEXT: s_mov_b32 s5, s4 ; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5] @@ -673,18 +668,17 @@ ; GFX6-LABEL: s_andn2_v4i16_commute: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_lshl_b32 s0, s3, 16 -; GFX6-NEXT: s_mov_b32 s3, 0xffff -; GFX6-NEXT: s_and_b32 s1, s2, s3 +; GFX6-NEXT: s_and_b32 s1, s2, 0xffff ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: s_lshl_b32 s1, s5, 16 -; GFX6-NEXT: s_and_b32 s2, s4, s3 +; GFX6-NEXT: s_and_b32 s2, s4, 0xffff ; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: s_lshl_b32 s2, s7, 16 -; GFX6-NEXT: s_and_b32 s4, s6, s3 -; GFX6-NEXT: s_or_b32 s2, s2, s4 -; GFX6-NEXT: s_lshl_b32 s4, s9, 16 -; GFX6-NEXT: s_and_b32 s3, s8, s3 -; GFX6-NEXT: s_or_b32 s3, s4, s3 +; GFX6-NEXT: s_and_b32 s3, s6, 0xffff +; GFX6-NEXT: s_or_b32 s2, s2, s3 +; GFX6-NEXT: s_lshl_b32 s3, s9, 16 +; GFX6-NEXT: s_and_b32 s4, s8, 0xffff +; GFX6-NEXT: s_or_b32 s3, s3, s4 ; GFX6-NEXT: s_mov_b32 s4, -1 ; GFX6-NEXT: s_mov_b32 s5, s4 ; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5] @@ -716,18 +710,17 @@ ; GFX6-LABEL: s_andn2_v4i16_multi_use: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_lshl_b32 s0, s3, 16 -; GFX6-NEXT: s_mov_b32 s3, 0xffff -; GFX6-NEXT: s_and_b32 s1, s2, s3 +; GFX6-NEXT: s_and_b32 s1, s2, 0xffff ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: s_lshl_b32 s1, s5, 16 -; GFX6-NEXT: s_and_b32 s2, s4, s3 +; GFX6-NEXT: s_and_b32 s2, s4, 0xffff ; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: s_lshl_b32 s2, s7, 16 -; GFX6-NEXT: s_and_b32 s4, s6, s3 -; GFX6-NEXT: s_or_b32 s2, s2, s4 -; GFX6-NEXT: s_lshl_b32 s4, s9, 16 -; GFX6-NEXT: s_and_b32 s3, s8, s3 -; GFX6-NEXT: s_or_b32 s3, s4, s3 +; GFX6-NEXT: s_and_b32 s3, s6, 0xffff +; GFX6-NEXT: s_or_b32 s2, s2, s3 +; GFX6-NEXT: s_lshl_b32 s3, s9, 16 +; GFX6-NEXT: s_and_b32 s4, s8, 0xffff +; GFX6-NEXT: s_or_b32 s3, s3, s4 ; GFX6-NEXT: s_mov_b32 s4, -1 ; GFX6-NEXT: s_mov_b32 s5, s4 ; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5] @@ -766,24 +759,23 @@ define amdgpu_ps { i64, i64 } @s_andn2_v4i16_multi_foldable_use(<4 x i16> inreg %src0, <4 x i16> inreg %src1, <4 x i16> inreg %src2) { ; GFX6-LABEL: s_andn2_v4i16_multi_foldable_use: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s14, 0xffff ; GFX6-NEXT: s_lshl_b32 s0, s3, 16 -; GFX6-NEXT: s_and_b32 s1, s2, s14 +; GFX6-NEXT: s_and_b32 s1, s2, 0xffff ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: s_lshl_b32 s1, s5, 16 -; GFX6-NEXT: s_and_b32 s2, s4, s14 +; GFX6-NEXT: s_and_b32 s2, s4, 0xffff ; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: s_lshl_b32 s2, s7, 16 -; GFX6-NEXT: s_and_b32 s3, s6, s14 +; GFX6-NEXT: s_and_b32 s3, s6, 0xffff ; GFX6-NEXT: s_or_b32 s2, s2, s3 ; GFX6-NEXT: s_lshl_b32 s3, s9, 16 -; GFX6-NEXT: s_and_b32 s4, s8, s14 +; GFX6-NEXT: s_and_b32 s4, s8, 0xffff ; GFX6-NEXT: s_or_b32 s3, s3, s4 ; GFX6-NEXT: s_lshl_b32 s4, s11, 16 -; GFX6-NEXT: s_and_b32 s5, s10, s14 +; GFX6-NEXT: s_and_b32 s5, s10, 0xffff ; GFX6-NEXT: s_or_b32 s4, s4, s5 ; GFX6-NEXT: s_lshl_b32 s5, s13, 16 -; GFX6-NEXT: s_and_b32 s6, s12, s14 +; GFX6-NEXT: s_and_b32 s6, s12, 0xffff ; GFX6-NEXT: s_or_b32 s5, s5, s6 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s7, s6 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll @@ -794,24 +794,22 @@ ; GFX6-LABEL: s_ashr_v2i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_sext_i32_i16 s1, s1 -; GFX6-NEXT: s_mov_b32 s4, 0xffff ; GFX6-NEXT: s_sext_i32_i16 s0, s0 ; GFX6-NEXT: s_ashr_i32 s1, s1, s3 ; GFX6-NEXT: s_ashr_i32 s0, s0, s2 -; GFX6-NEXT: s_and_b32 s1, s1, s4 -; GFX6-NEXT: s_and_b32 s0, s0, s4 +; GFX6-NEXT: s_and_b32 s1, s1, 0xffff +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_ashr_v2i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b32 s3, 0x100010 ; GFX8-NEXT: s_sext_i32_i16 s2, s0 -; GFX8-NEXT: s_bfe_i32 s0, s0, s3 -; GFX8-NEXT: s_sext_i32_i16 s4, s1 -; GFX8-NEXT: s_bfe_i32 s1, s1, s3 -; GFX8-NEXT: s_ashr_i32 s2, s2, s4 +; GFX8-NEXT: s_bfe_i32 s0, s0, 0x100010 +; GFX8-NEXT: s_sext_i32_i16 s3, s1 +; GFX8-NEXT: s_bfe_i32 s1, s1, 0x100010 +; GFX8-NEXT: s_ashr_i32 s2, s2, s3 ; GFX8-NEXT: s_ashr_i32 s0, s0, s1 ; GFX8-NEXT: s_lshl_b32 s0, s0, 16 ; GFX8-NEXT: s_and_b32 s1, s2, 0xffff @@ -886,12 +884,12 @@ define amdgpu_ps float @ashr_v2i16_vs(<2 x i16> %value, <2 x i16> inreg %amount) { ; GFX6-LABEL: ashr_v2i16_vs: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s2, 0xffff -; GFX6-NEXT: s_and_b32 s0, s0, s2 +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, s0, v0 -; GFX6-NEXT: s_and_b32 s0, s1, s2 +; GFX6-NEXT: s_and_b32 s0, s1, 0xffff ; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 +; GFX6-NEXT: s_mov_b32 s2, 0xffff ; GFX6-NEXT: v_ashrrev_i32_e32 v1, s0, v1 ; GFX6-NEXT: v_and_b32_e32 v1, s2, v1 ; GFX6-NEXT: v_and_b32_e32 v0, s2, v0 @@ -994,45 +992,42 @@ ; GFX6-LABEL: s_ashr_v4i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_sext_i32_i16 s1, s1 -; GFX6-NEXT: s_mov_b32 s8, 0xffff ; GFX6-NEXT: s_sext_i32_i16 s0, s0 ; GFX6-NEXT: s_ashr_i32 s1, s1, s5 ; GFX6-NEXT: s_ashr_i32 s0, s0, s4 ; GFX6-NEXT: s_sext_i32_i16 s2, s2 ; GFX6-NEXT: s_sext_i32_i16 s3, s3 -; GFX6-NEXT: s_and_b32 s1, s1, s8 +; GFX6-NEXT: s_and_b32 s1, s1, 0xffff ; GFX6-NEXT: s_ashr_i32 s2, s2, s6 ; GFX6-NEXT: s_ashr_i32 s3, s3, s7 -; GFX6-NEXT: s_and_b32 s0, s0, s8 +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s2, s8 -; GFX6-NEXT: s_and_b32 s2, s3, s8 +; GFX6-NEXT: s_and_b32 s1, s2, 0xffff +; GFX6-NEXT: s_and_b32 s2, s3, 0xffff ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 ; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_ashr_v4i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b32 s5, 0x100010 ; GFX8-NEXT: s_sext_i32_i16 s4, s0 -; GFX8-NEXT: s_bfe_i32 s0, s0, s5 -; GFX8-NEXT: s_sext_i32_i16 s6, s1 -; GFX8-NEXT: s_bfe_i32 s1, s1, s5 -; GFX8-NEXT: s_sext_i32_i16 s7, s2 -; GFX8-NEXT: s_bfe_i32 s2, s2, s5 -; GFX8-NEXT: s_sext_i32_i16 s8, s3 -; GFX8-NEXT: s_bfe_i32 s3, s3, s5 -; GFX8-NEXT: s_ashr_i32 s4, s4, s7 +; GFX8-NEXT: s_bfe_i32 s0, s0, 0x100010 +; GFX8-NEXT: s_sext_i32_i16 s5, s1 +; GFX8-NEXT: s_bfe_i32 s1, s1, 0x100010 +; GFX8-NEXT: s_sext_i32_i16 s6, s2 +; GFX8-NEXT: s_bfe_i32 s2, s2, 0x100010 +; GFX8-NEXT: s_sext_i32_i16 s7, s3 +; GFX8-NEXT: s_bfe_i32 s3, s3, 0x100010 +; GFX8-NEXT: s_ashr_i32 s4, s4, s6 ; GFX8-NEXT: s_ashr_i32 s0, s0, s2 -; GFX8-NEXT: s_ashr_i32 s2, s6, s8 +; GFX8-NEXT: s_ashr_i32 s2, s5, s7 ; GFX8-NEXT: s_ashr_i32 s1, s1, s3 -; GFX8-NEXT: s_mov_b32 s3, 0xffff ; GFX8-NEXT: s_lshl_b32 s0, s0, 16 -; GFX8-NEXT: s_and_b32 s4, s4, s3 +; GFX8-NEXT: s_and_b32 s3, s4, 0xffff ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_and_b32 s2, s2, s3 -; GFX8-NEXT: s_or_b32 s0, s0, s4 +; GFX8-NEXT: s_and_b32 s2, s2, 0xffff +; GFX8-NEXT: s_or_b32 s0, s0, s3 ; GFX8-NEXT: s_or_b32 s1, s1, s2 ; GFX8-NEXT: ; return to shader part epilog ; @@ -1191,79 +1186,76 @@ ; GFX6-LABEL: s_ashr_v8i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_sext_i32_i16 s1, s1 -; GFX6-NEXT: s_mov_b32 s16, 0xffff ; GFX6-NEXT: s_sext_i32_i16 s0, s0 ; GFX6-NEXT: s_ashr_i32 s1, s1, s9 ; GFX6-NEXT: s_ashr_i32 s0, s0, s8 ; GFX6-NEXT: s_sext_i32_i16 s2, s2 ; GFX6-NEXT: s_sext_i32_i16 s3, s3 -; GFX6-NEXT: s_and_b32 s1, s1, s16 +; GFX6-NEXT: s_and_b32 s1, s1, 0xffff ; GFX6-NEXT: s_ashr_i32 s2, s2, s10 ; GFX6-NEXT: s_ashr_i32 s3, s3, s11 ; GFX6-NEXT: s_sext_i32_i16 s5, s5 -; GFX6-NEXT: s_and_b32 s0, s0, s16 +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_sext_i32_i16 s4, s4 ; GFX6-NEXT: s_ashr_i32 s5, s5, s13 ; GFX6-NEXT: s_sext_i32_i16 s7, s7 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s2, s16 -; GFX6-NEXT: s_and_b32 s2, s3, s16 +; GFX6-NEXT: s_and_b32 s1, s2, 0xffff +; GFX6-NEXT: s_and_b32 s2, s3, 0xffff ; GFX6-NEXT: s_ashr_i32 s4, s4, s12 ; GFX6-NEXT: s_sext_i32_i16 s6, s6 ; GFX6-NEXT: s_ashr_i32 s7, s7, s15 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_and_b32 s3, s5, s16 +; GFX6-NEXT: s_and_b32 s3, s5, 0xffff ; GFX6-NEXT: s_ashr_i32 s6, s6, s14 ; GFX6-NEXT: s_or_b32 s1, s1, s2 -; GFX6-NEXT: s_and_b32 s2, s4, s16 +; GFX6-NEXT: s_and_b32 s2, s4, 0xffff ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 -; GFX6-NEXT: s_and_b32 s4, s7, s16 +; GFX6-NEXT: s_and_b32 s4, s7, 0xffff ; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_and_b32 s3, s6, s16 +; GFX6-NEXT: s_and_b32 s3, s6, 0xffff ; GFX6-NEXT: s_lshl_b32 s4, s4, 16 ; GFX6-NEXT: s_or_b32 s3, s3, s4 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_ashr_v8i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b32 s9, 0x100010 ; GFX8-NEXT: s_sext_i32_i16 s8, s0 -; GFX8-NEXT: s_bfe_i32 s0, s0, s9 -; GFX8-NEXT: s_sext_i32_i16 s10, s1 -; GFX8-NEXT: s_bfe_i32 s1, s1, s9 -; GFX8-NEXT: s_sext_i32_i16 s12, s3 -; GFX8-NEXT: s_bfe_i32 s3, s3, s9 -; GFX8-NEXT: s_sext_i32_i16 s13, s4 -; GFX8-NEXT: s_bfe_i32 s4, s4, s9 -; GFX8-NEXT: s_sext_i32_i16 s14, s5 -; GFX8-NEXT: s_bfe_i32 s5, s5, s9 -; GFX8-NEXT: s_sext_i32_i16 s16, s7 -; GFX8-NEXT: s_bfe_i32 s7, s7, s9 -; GFX8-NEXT: s_sext_i32_i16 s11, s2 -; GFX8-NEXT: s_bfe_i32 s2, s2, s9 -; GFX8-NEXT: s_sext_i32_i16 s15, s6 -; GFX8-NEXT: s_bfe_i32 s6, s6, s9 +; GFX8-NEXT: s_bfe_i32 s0, s0, 0x100010 +; GFX8-NEXT: s_sext_i32_i16 s9, s1 +; GFX8-NEXT: s_bfe_i32 s1, s1, 0x100010 +; GFX8-NEXT: s_sext_i32_i16 s12, s4 +; GFX8-NEXT: s_bfe_i32 s4, s4, 0x100010 +; GFX8-NEXT: s_sext_i32_i16 s13, s5 +; GFX8-NEXT: s_bfe_i32 s5, s5, 0x100010 +; GFX8-NEXT: s_sext_i32_i16 s10, s2 +; GFX8-NEXT: s_bfe_i32 s2, s2, 0x100010 +; GFX8-NEXT: s_sext_i32_i16 s14, s6 +; GFX8-NEXT: s_bfe_i32 s6, s6, 0x100010 ; GFX8-NEXT: s_ashr_i32 s0, s0, s4 -; GFX8-NEXT: s_ashr_i32 s4, s10, s14 +; GFX8-NEXT: s_ashr_i32 s4, s9, s13 ; GFX8-NEXT: s_ashr_i32 s1, s1, s5 -; GFX8-NEXT: s_ashr_i32 s3, s3, s7 -; GFX8-NEXT: s_mov_b32 s7, 0xffff -; GFX8-NEXT: s_ashr_i32 s5, s11, s15 +; GFX8-NEXT: s_sext_i32_i16 s11, s3 +; GFX8-NEXT: s_bfe_i32 s3, s3, 0x100010 +; GFX8-NEXT: s_sext_i32_i16 s15, s7 +; GFX8-NEXT: s_bfe_i32 s7, s7, 0x100010 +; GFX8-NEXT: s_ashr_i32 s5, s10, s14 ; GFX8-NEXT: s_ashr_i32 s2, s2, s6 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_and_b32 s4, s4, s7 -; GFX8-NEXT: s_ashr_i32 s8, s8, s13 -; GFX8-NEXT: s_ashr_i32 s6, s12, s16 +; GFX8-NEXT: s_and_b32 s4, s4, 0xffff +; GFX8-NEXT: s_ashr_i32 s8, s8, s12 +; GFX8-NEXT: s_ashr_i32 s6, s11, s15 +; GFX8-NEXT: s_ashr_i32 s3, s3, s7 ; GFX8-NEXT: s_or_b32 s1, s1, s4 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16 -; GFX8-NEXT: s_and_b32 s4, s5, s7 +; GFX8-NEXT: s_and_b32 s4, s5, 0xffff ; GFX8-NEXT: s_lshl_b32 s0, s0, 16 -; GFX8-NEXT: s_and_b32 s8, s8, s7 +; GFX8-NEXT: s_and_b32 s7, s8, 0xffff ; GFX8-NEXT: s_or_b32 s2, s2, s4 ; GFX8-NEXT: s_lshl_b32 s3, s3, 16 -; GFX8-NEXT: s_and_b32 s4, s6, s7 -; GFX8-NEXT: s_or_b32 s0, s0, s8 +; GFX8-NEXT: s_and_b32 s4, s6, 0xffff +; GFX8-NEXT: s_or_b32 s0, s0, s7 ; GFX8-NEXT: s_or_b32 s3, s3, s4 ; GFX8-NEXT: ; return to shader part epilog ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll @@ -111,9 +111,8 @@ ; ; GFX10-LABEL: s_bswap_v2i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s2, 0x10203 -; GFX10-NEXT: v_perm_b32 v0, 0, s0, s2 -; GFX10-NEXT: v_perm_b32 v1, 0, s1, s2 +; GFX10-NEXT: v_perm_b32 v0, 0, s0, 0x10203 +; GFX10-NEXT: v_perm_b32 v1, 0, s1, 0x10203 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: v_readfirstlane_b32 s1, v1 ; GFX10-NEXT: ; return to shader part epilog @@ -154,9 +153,8 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_mov_b32 s4, 0x10203 -; GFX10-NEXT: v_perm_b32 v0, 0, v0, s4 -; GFX10-NEXT: v_perm_b32 v1, 0, v1, s4 +; GFX10-NEXT: v_perm_b32 v0, 0, v0, 0x10203 +; GFX10-NEXT: v_perm_b32 v1, 0, v1, 0x10203 ; GFX10-NEXT: s_setpc_b64 s[30:31] %bswap = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %src) ret <2 x i32> %bswap @@ -200,9 +198,8 @@ ; ; GFX10-LABEL: s_bswap_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s2, 0x10203 -; GFX10-NEXT: v_perm_b32 v0, 0, s1, s2 -; GFX10-NEXT: v_perm_b32 v1, 0, s0, s2 +; GFX10-NEXT: v_perm_b32 v0, 0, s1, 0x10203 +; GFX10-NEXT: v_perm_b32 v1, 0, s0, 0x10203 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: v_readfirstlane_b32 s1, v1 ; GFX10-NEXT: ; return to shader part epilog @@ -246,9 +243,8 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_mov_b32 s4, 0x10203 -; GFX10-NEXT: v_perm_b32 v2, 0, v1, s4 -; GFX10-NEXT: v_perm_b32 v1, 0, v0, s4 +; GFX10-NEXT: v_perm_b32 v2, 0, v1, 0x10203 +; GFX10-NEXT: v_perm_b32 v1, 0, v0, 0x10203 ; GFX10-NEXT: v_mov_b32_e32 v0, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] %bswap = call i64 @llvm.bswap.i64(i64 %src) @@ -313,11 +309,10 @@ ; ; GFX10-LABEL: s_bswap_v2i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s4, 0x10203 -; GFX10-NEXT: v_perm_b32 v0, 0, s1, s4 -; GFX10-NEXT: v_perm_b32 v1, 0, s0, s4 -; GFX10-NEXT: v_perm_b32 v2, 0, s3, s4 -; GFX10-NEXT: v_perm_b32 v3, 0, s2, s4 +; GFX10-NEXT: v_perm_b32 v0, 0, s1, 0x10203 +; GFX10-NEXT: v_perm_b32 v1, 0, s0, 0x10203 +; GFX10-NEXT: v_perm_b32 v2, 0, s3, 0x10203 +; GFX10-NEXT: v_perm_b32 v3, 0, s2, 0x10203 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: v_readfirstlane_b32 s1, v1 ; GFX10-NEXT: v_readfirstlane_b32 s2, v2 @@ -376,11 +371,10 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_mov_b32 s4, 0x10203 -; GFX10-NEXT: v_perm_b32 v4, 0, v1, s4 -; GFX10-NEXT: v_perm_b32 v5, 0, v3, s4 -; GFX10-NEXT: v_perm_b32 v1, 0, v0, s4 -; GFX10-NEXT: v_perm_b32 v3, 0, v2, s4 +; GFX10-NEXT: v_perm_b32 v4, 0, v1, 0x10203 +; GFX10-NEXT: v_perm_b32 v5, 0, v3, 0x10203 +; GFX10-NEXT: v_perm_b32 v1, 0, v0, 0x10203 +; GFX10-NEXT: v_perm_b32 v3, 0, v2, 0x10203 ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: v_mov_b32_e32 v2, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -457,12 +451,11 @@ define amdgpu_ps i32 @s_bswap_v2i16(<2 x i16> inreg %src) { ; GFX7-LABEL: s_bswap_v2i16: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_mov_b32 s3, 0x80008 ; GFX7-NEXT: s_lshl_b32 s2, s0, 8 -; GFX7-NEXT: s_bfe_u32 s0, s0, s3 +; GFX7-NEXT: s_bfe_u32 s0, s0, 0x80008 ; GFX7-NEXT: s_or_b32 s0, s0, s2 ; GFX7-NEXT: s_lshl_b32 s2, s1, 8 -; GFX7-NEXT: s_bfe_u32 s1, s1, s3 +; GFX7-NEXT: s_bfe_u32 s1, s1, 0x80008 ; GFX7-NEXT: s_or_b32 s1, s1, s2 ; GFX7-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX7-NEXT: s_bfe_u32 s0, s0, 0x100000 @@ -647,9 +640,8 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_mov_b32 s4, 0x10203 -; GFX10-NEXT: v_perm_b32 v1, 0, v1, s4 -; GFX10-NEXT: v_perm_b32 v2, 0, v0, s4 +; GFX10-NEXT: v_perm_b32 v1, 0, v1, 0x10203 +; GFX10-NEXT: v_perm_b32 v2, 0, v0, 0x10203 ; GFX10-NEXT: v_lshrrev_b64 v[0:1], 16, v[1:2] ; GFX10-NEXT: s_setpc_b64 s[30:31] %trunc = trunc i64 %src to i48 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll @@ -901,30 +901,29 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; GFX10-NEXT: v_mov_b32_e32 v8, 0xffff ; GFX10-NEXT: s_lshl_b32 s4, s4, 16 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, s4 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX10-NEXT: v_and_or_b32 v1, v1, v8, s4 -; GFX10-NEXT: v_and_or_b32 v3, v3, v8, s4 -; GFX10-NEXT: v_and_or_b32 v0, v0, v8, v6 -; GFX10-NEXT: v_and_or_b32 v2, v2, v8, v7 +; GFX10-NEXT: v_and_or_b32 v3, 0xffff, v3, s4 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v6 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v2, v7 ; GFX10-NEXT: v_pk_mul_f16 v1, v1, v3 ; GFX10-NEXT: v_pk_mul_f16 v0, v0, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; GFX10-NEXT: v_and_or_b32 v1, v1, v8, s4 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX10-NEXT: v_and_or_b32 v2, v4, v8, v2 -; GFX10-NEXT: v_and_or_b32 v0, v0, v8, v6 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v4, v2 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v6 ; GFX10-NEXT: v_pk_add_f16 v0, v2, v0 -; GFX10-NEXT: v_and_or_b32 v2, v5, v8, s4 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v5, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX10-NEXT: v_pk_add_f16 v1, v2, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX10-NEXT: v_and_or_b32 v1, v1, v8, s4 -; GFX10-NEXT: v_and_or_b32 v0, v0, v8, v3 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, s4 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-CONTRACT-LABEL: test_3xhalf_add_mul_rhs: @@ -934,23 +933,22 @@ ; GFX10-CONTRACT-NEXT: v_lshrrev_b32_e32 v6, 16, v0 ; GFX10-CONTRACT-NEXT: v_lshrrev_b32_e32 v7, 16, v2 ; GFX10-CONTRACT-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; GFX10-CONTRACT-NEXT: v_mov_b32_e32 v9, 0xffff ; GFX10-CONTRACT-NEXT: s_lshl_b32 s4, s4, 16 +; GFX10-CONTRACT-NEXT: v_and_or_b32 v1, 0xffff, v1, s4 ; GFX10-CONTRACT-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; GFX10-CONTRACT-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; GFX10-CONTRACT-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX10-CONTRACT-NEXT: v_and_or_b32 v1, v1, v9, s4 -; GFX10-CONTRACT-NEXT: v_and_or_b32 v0, v0, v9, v6 -; GFX10-CONTRACT-NEXT: v_and_or_b32 v2, v2, v9, v7 -; GFX10-CONTRACT-NEXT: v_and_or_b32 v4, v4, v9, v8 +; GFX10-CONTRACT-NEXT: v_and_or_b32 v0, 0xffff, v0, v6 +; GFX10-CONTRACT-NEXT: v_and_or_b32 v2, 0xffff, v2, v7 +; GFX10-CONTRACT-NEXT: v_and_or_b32 v4, 0xffff, v4, v8 ; GFX10-CONTRACT-NEXT: v_pk_fma_f16 v0, v0, v2, v4 -; GFX10-CONTRACT-NEXT: v_and_or_b32 v2, v3, v9, s4 -; GFX10-CONTRACT-NEXT: v_and_or_b32 v4, v5, v9, s4 +; GFX10-CONTRACT-NEXT: v_and_or_b32 v2, 0xffff, v3, s4 +; GFX10-CONTRACT-NEXT: v_and_or_b32 v4, 0xffff, v5, s4 ; GFX10-CONTRACT-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX10-CONTRACT-NEXT: v_pk_fma_f16 v1, v1, v2, v4 ; GFX10-CONTRACT-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX10-CONTRACT-NEXT: v_and_or_b32 v1, v1, v9, s4 -; GFX10-CONTRACT-NEXT: v_and_or_b32 v0, v0, v9, v3 +; GFX10-CONTRACT-NEXT: v_and_or_b32 v1, 0xffff, v1, s4 +; GFX10-CONTRACT-NEXT: v_and_or_b32 v0, 0xffff, v0, v3 ; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-DENORM-LABEL: test_3xhalf_add_mul_rhs: @@ -959,30 +957,29 @@ ; GFX10-DENORM-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-DENORM-NEXT: v_lshrrev_b32_e32 v6, 16, v0 ; GFX10-DENORM-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; GFX10-DENORM-NEXT: v_mov_b32_e32 v8, 0xffff ; GFX10-DENORM-NEXT: s_lshl_b32 s4, s4, 16 +; GFX10-DENORM-NEXT: v_and_or_b32 v1, 0xffff, v1, s4 ; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX10-DENORM-NEXT: v_and_or_b32 v1, v1, v8, s4 -; GFX10-DENORM-NEXT: v_and_or_b32 v3, v3, v8, s4 -; GFX10-DENORM-NEXT: v_and_or_b32 v0, v0, v8, v6 -; GFX10-DENORM-NEXT: v_and_or_b32 v2, v2, v8, v7 +; GFX10-DENORM-NEXT: v_and_or_b32 v3, 0xffff, v3, s4 +; GFX10-DENORM-NEXT: v_and_or_b32 v0, 0xffff, v0, v6 +; GFX10-DENORM-NEXT: v_and_or_b32 v2, 0xffff, v2, v7 ; GFX10-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3 ; GFX10-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2 ; GFX10-DENORM-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; GFX10-DENORM-NEXT: v_and_or_b32 v1, v1, v8, s4 +; GFX10-DENORM-NEXT: v_and_or_b32 v1, 0xffff, v1, s4 ; GFX10-DENORM-NEXT: v_lshrrev_b32_e32 v6, 16, v0 ; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX10-DENORM-NEXT: v_and_or_b32 v2, v4, v8, v2 -; GFX10-DENORM-NEXT: v_and_or_b32 v0, v0, v8, v6 +; GFX10-DENORM-NEXT: v_and_or_b32 v2, 0xffff, v4, v2 +; GFX10-DENORM-NEXT: v_and_or_b32 v0, 0xffff, v0, v6 ; GFX10-DENORM-NEXT: v_pk_add_f16 v0, v2, v0 -; GFX10-DENORM-NEXT: v_and_or_b32 v2, v5, v8, s4 +; GFX10-DENORM-NEXT: v_and_or_b32 v2, 0xffff, v5, s4 ; GFX10-DENORM-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX10-DENORM-NEXT: v_pk_add_f16 v1, v2, v1 ; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX10-DENORM-NEXT: v_and_or_b32 v1, v1, v8, s4 -; GFX10-DENORM-NEXT: v_and_or_b32 v0, v0, v8, v3 +; GFX10-DENORM-NEXT: v_and_or_b32 v1, 0xffff, v1, s4 +; GFX10-DENORM-NEXT: v_and_or_b32 v0, 0xffff, v0, v3 ; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-UNSAFE-LABEL: test_3xhalf_add_mul_rhs: @@ -992,23 +989,22 @@ ; GFX10-UNSAFE-NEXT: v_lshrrev_b32_e32 v6, 16, v0 ; GFX10-UNSAFE-NEXT: v_lshrrev_b32_e32 v7, 16, v2 ; GFX10-UNSAFE-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; GFX10-UNSAFE-NEXT: v_mov_b32_e32 v9, 0xffff ; GFX10-UNSAFE-NEXT: s_lshl_b32 s4, s4, 16 +; GFX10-UNSAFE-NEXT: v_and_or_b32 v1, 0xffff, v1, s4 ; GFX10-UNSAFE-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; GFX10-UNSAFE-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; GFX10-UNSAFE-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX10-UNSAFE-NEXT: v_and_or_b32 v1, v1, v9, s4 -; GFX10-UNSAFE-NEXT: v_and_or_b32 v0, v0, v9, v6 -; GFX10-UNSAFE-NEXT: v_and_or_b32 v2, v2, v9, v7 -; GFX10-UNSAFE-NEXT: v_and_or_b32 v4, v4, v9, v8 +; GFX10-UNSAFE-NEXT: v_and_or_b32 v0, 0xffff, v0, v6 +; GFX10-UNSAFE-NEXT: v_and_or_b32 v2, 0xffff, v2, v7 +; GFX10-UNSAFE-NEXT: v_and_or_b32 v4, 0xffff, v4, v8 ; GFX10-UNSAFE-NEXT: v_pk_fma_f16 v0, v0, v2, v4 -; GFX10-UNSAFE-NEXT: v_and_or_b32 v2, v3, v9, s4 -; GFX10-UNSAFE-NEXT: v_and_or_b32 v4, v5, v9, s4 +; GFX10-UNSAFE-NEXT: v_and_or_b32 v2, 0xffff, v3, s4 +; GFX10-UNSAFE-NEXT: v_and_or_b32 v4, 0xffff, v5, s4 ; GFX10-UNSAFE-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX10-UNSAFE-NEXT: v_pk_fma_f16 v1, v1, v2, v4 ; GFX10-UNSAFE-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX10-UNSAFE-NEXT: v_and_or_b32 v1, v1, v9, s4 -; GFX10-UNSAFE-NEXT: v_and_or_b32 v0, v0, v9, v3 +; GFX10-UNSAFE-NEXT: v_and_or_b32 v1, 0xffff, v1, s4 +; GFX10-UNSAFE-NEXT: v_and_or_b32 v0, 0xffff, v0, v3 ; GFX10-UNSAFE-NEXT: s_setpc_b64 s[30:31] .entry: %a = fmul <3 x half> %x, %y diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-ext-neg-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-ext-neg-mul.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-ext-neg-mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-ext-neg-mul.ll @@ -113,10 +113,9 @@ ; ; GFX10-DENORM-LABEL: test_v4f16_to_v4f32_sub_ext_neg_mul: ; GFX10-DENORM: ; %bb.0: ; %entry -; GFX10-DENORM-NEXT: s_mov_b32 s0, 0x80008000 +; GFX10-DENORM-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 +; GFX10-DENORM-NEXT: v_xor_b32_e32 v3, 0x80008000, v3 ; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v8, v0 -; GFX10-DENORM-NEXT: v_xor_b32_e32 v2, s0, v2 -; GFX10-DENORM-NEXT: v_xor_b32_e32 v3, s0, v3 ; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v10, v1 ; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 @@ -155,10 +154,9 @@ ; ; GFX10-DENORM-LABEL: test_v4f16_to_v4f32_sub_neg_ext_mul: ; GFX10-DENORM: ; %bb.0: ; %entry -; GFX10-DENORM-NEXT: s_mov_b32 s0, 0x80008000 +; GFX10-DENORM-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 +; GFX10-DENORM-NEXT: v_xor_b32_e32 v3, 0x80008000, v3 ; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v8, v0 -; GFX10-DENORM-NEXT: v_xor_b32_e32 v2, s0, v2 -; GFX10-DENORM-NEXT: v_xor_b32_e32 v3, s0, v3 ; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v10, v1 ; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 @@ -198,10 +196,9 @@ ; ; GFX10-DENORM-LABEL: test_v4f16_to_v4f32_sub_ext_neg_mul2: ; GFX10-DENORM: ; %bb.0: ; %entry -; GFX10-DENORM-NEXT: s_mov_b32 s0, 0x80008000 +; GFX10-DENORM-NEXT: v_xor_b32_e32 v6, 0x80008000, v6 +; GFX10-DENORM-NEXT: v_xor_b32_e32 v7, 0x80008000, v7 ; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v8, v4 -; GFX10-DENORM-NEXT: v_xor_b32_e32 v6, s0, v6 -; GFX10-DENORM-NEXT: v_xor_b32_e32 v7, s0, v7 ; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v9, v5 ; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 @@ -240,10 +237,9 @@ ; ; GFX10-DENORM-LABEL: test_v4f16_to_v4f32_sub_neg_ext_mul2: ; GFX10-DENORM: ; %bb.0: ; %entry -; GFX10-DENORM-NEXT: s_mov_b32 s0, 0x80008000 +; GFX10-DENORM-NEXT: v_xor_b32_e32 v6, 0x80008000, v6 +; GFX10-DENORM-NEXT: v_xor_b32_e32 v7, 0x80008000, v7 ; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v8, v4 -; GFX10-DENORM-NEXT: v_xor_b32_e32 v6, s0, v6 -; GFX10-DENORM-NEXT: v_xor_b32_e32 v7, s0, v7 ; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v9, v5 ; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll @@ -488,10 +488,9 @@ ; GFX10-NEXT: v_add_f16_e64 v2, v0, -v4 ; GFX10-NEXT: v_add_f16_sdwa v0, v0, -v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX10-NEXT: v_add_f16_e64 v3, v1, -v5 -; GFX10-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10-NEXT: v_add_f16_sdwa v1, v1, -v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v0, v2, v4, v0 -; GFX10-NEXT: v_and_or_b32 v1, v3, v4, v1 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v2, v0 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v3, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-CONTRACT-LABEL: test_v4f16_sub_mul: @@ -511,10 +510,9 @@ ; GFX10-DENORM-NEXT: v_add_f16_e64 v2, v0, -v4 ; GFX10-DENORM-NEXT: v_add_f16_sdwa v0, v0, -v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX10-DENORM-NEXT: v_add_f16_e64 v3, v1, -v5 -; GFX10-DENORM-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10-DENORM-NEXT: v_add_f16_sdwa v1, v1, -v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-DENORM-NEXT: v_and_or_b32 v0, v2, v4, v0 -; GFX10-DENORM-NEXT: v_and_or_b32 v1, v3, v4, v1 +; GFX10-DENORM-NEXT: v_and_or_b32 v0, 0xffff, v2, v0 +; GFX10-DENORM-NEXT: v_and_or_b32 v1, 0xffff, v3, v1 ; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] .entry: %a = fmul <4 x half> %x, %y @@ -567,10 +565,9 @@ ; GFX10-NEXT: v_add_f16_e64 v2, v4, -v0 ; GFX10-NEXT: v_add_f16_sdwa v0, v4, -v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX10-NEXT: v_add_f16_e64 v3, v5, -v1 -; GFX10-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10-NEXT: v_add_f16_sdwa v1, v5, -v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v0, v2, v4, v0 -; GFX10-NEXT: v_and_or_b32 v1, v3, v4, v1 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v2, v0 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v3, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-CONTRACT-LABEL: test_v4f16_sub_mul_rhs: @@ -590,10 +587,9 @@ ; GFX10-DENORM-NEXT: v_add_f16_e64 v2, v4, -v0 ; GFX10-DENORM-NEXT: v_add_f16_sdwa v0, v4, -v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX10-DENORM-NEXT: v_add_f16_e64 v3, v5, -v1 -; GFX10-DENORM-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10-DENORM-NEXT: v_add_f16_sdwa v1, v5, -v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-DENORM-NEXT: v_and_or_b32 v0, v2, v4, v0 -; GFX10-DENORM-NEXT: v_and_or_b32 v1, v3, v4, v1 +; GFX10-DENORM-NEXT: v_and_or_b32 v0, 0xffff, v2, v0 +; GFX10-DENORM-NEXT: v_and_or_b32 v1, 0xffff, v3, v1 ; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] .entry: %a = fmul <4 x half> %x, %y diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll @@ -90,9 +90,8 @@ ; GFX10-CONTRACT: ; %bb.0: ; %entry ; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-CONTRACT-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CONTRACT-NEXT: s_mov_b32 s4, 0x8000 -; GFX10-CONTRACT-NEXT: v_xor_b32_e32 v1, s4, v1 -; GFX10-CONTRACT-NEXT: v_xor_b32_e32 v2, s4, v2 +; GFX10-CONTRACT-NEXT: v_xor_b32_e32 v1, 0x8000, v1 +; GFX10-CONTRACT-NEXT: v_xor_b32_e32 v2, 0x8000, v2 ; GFX10-CONTRACT-NEXT: v_fma_f16 v0, v0, v1, v2 ; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31] ; @@ -278,10 +277,9 @@ ; GFX10-NEXT: v_add_f16_e64 v2, v0, -v4 ; GFX10-NEXT: v_add_f16_sdwa v0, v0, -v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX10-NEXT: v_add_f16_e64 v3, v1, -v5 -; GFX10-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10-NEXT: v_add_f16_sdwa v1, v1, -v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v0, v2, v4, v0 -; GFX10-NEXT: v_and_or_b32 v1, v3, v4, v1 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v2, v0 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v3, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-CONTRACT-LABEL: test_v4f16_sub_ext_neg_mul: @@ -301,10 +299,9 @@ ; GFX10-DENORM-NEXT: v_add_f16_e64 v2, v0, -v4 ; GFX10-DENORM-NEXT: v_add_f16_sdwa v0, v0, -v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX10-DENORM-NEXT: v_add_f16_e64 v3, v1, -v5 -; GFX10-DENORM-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10-DENORM-NEXT: v_add_f16_sdwa v1, v1, -v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-DENORM-NEXT: v_and_or_b32 v0, v2, v4, v0 -; GFX10-DENORM-NEXT: v_and_or_b32 v1, v3, v4, v1 +; GFX10-DENORM-NEXT: v_and_or_b32 v0, 0xffff, v2, v0 +; GFX10-DENORM-NEXT: v_and_or_b32 v1, 0xffff, v3, v1 ; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] entry: %a = fmul <4 x half> %x, %y diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll @@ -763,29 +763,26 @@ ; GCN-LABEL: extractelement_sgpr_v8i8_sgpr_idx: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GCN-NEXT: s_mov_b32 s7, 0x80008 -; GCN-NEXT: s_movk_i32 s2, 0xff ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_bfe_u32 s8, s0, s7 -; GCN-NEXT: s_and_b32 s6, s0, s2 -; GCN-NEXT: s_lshl_b32 s8, s8, 8 -; GCN-NEXT: s_or_b32 s6, s6, s8 -; GCN-NEXT: s_mov_b32 s8, 0x80010 -; GCN-NEXT: s_lshr_b32 s3, s0, 24 -; GCN-NEXT: s_bfe_u32 s0, s0, s8 +; GCN-NEXT: s_bfe_u32 s6, s0, 0x80008 +; GCN-NEXT: s_lshr_b32 s2, s0, 24 +; GCN-NEXT: s_and_b32 s5, s0, 0xff +; GCN-NEXT: s_lshl_b32 s6, s6, 8 +; GCN-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GCN-NEXT: s_or_b32 s5, s5, s6 ; GCN-NEXT: s_lshl_b32 s0, s0, 16 -; GCN-NEXT: s_or_b32 s0, s6, s0 -; GCN-NEXT: s_lshl_b32 s3, s3, 24 -; GCN-NEXT: s_or_b32 s0, s0, s3 -; GCN-NEXT: s_bfe_u32 s3, s1, s7 -; GCN-NEXT: s_lshr_b32 s5, s1, 24 -; GCN-NEXT: s_and_b32 s2, s1, s2 -; GCN-NEXT: s_lshl_b32 s3, s3, 8 -; GCN-NEXT: s_bfe_u32 s1, s1, s8 -; GCN-NEXT: s_or_b32 s2, s2, s3 +; GCN-NEXT: s_or_b32 s0, s5, s0 +; GCN-NEXT: s_lshl_b32 s2, s2, 24 +; GCN-NEXT: s_bfe_u32 s5, s1, 0x80008 +; GCN-NEXT: s_lshr_b32 s3, s1, 24 +; GCN-NEXT: s_or_b32 s0, s0, s2 +; GCN-NEXT: s_and_b32 s2, s1, 0xff +; GCN-NEXT: s_lshl_b32 s5, s5, 8 +; GCN-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GCN-NEXT: s_or_b32 s2, s2, s5 ; GCN-NEXT: s_lshl_b32 s1, s1, 16 ; GCN-NEXT: s_or_b32 s1, s2, s1 -; GCN-NEXT: s_lshl_b32 s2, s5, 24 +; GCN-NEXT: s_lshl_b32 s2, s3, 24 ; GCN-NEXT: s_or_b32 s1, s1, s2 ; GCN-NEXT: s_lshr_b32 s2, s4, 2 ; GCN-NEXT: s_cmp_eq_u32 s2, 1 @@ -798,32 +795,29 @@ ; GFX10-LABEL: extractelement_sgpr_v8i8_sgpr_idx: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX10-NEXT: s_mov_b32 s3, 0x80008 -; GFX10-NEXT: s_movk_i32 s2, 0xff -; GFX10-NEXT: s_mov_b32 s5, 0x80010 -; GFX10-NEXT: s_lshr_b32 s6, s4, 2 +; GFX10-NEXT: s_lshr_b32 s2, s4, 2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_bfe_u32 s10, s0, s3 -; GFX10-NEXT: s_bfe_u32 s3, s1, s3 -; GFX10-NEXT: s_lshr_b32 s7, s0, 24 -; GFX10-NEXT: s_lshr_b32 s8, s1, 24 -; GFX10-NEXT: s_and_b32 s9, s0, s2 -; GFX10-NEXT: s_bfe_u32 s0, s0, s5 -; GFX10-NEXT: s_and_b32 s2, s1, s2 -; GFX10-NEXT: s_bfe_u32 s1, s1, s5 -; GFX10-NEXT: s_lshl_b32 s5, s10, 8 -; GFX10-NEXT: s_lshl_b32 s3, s3, 8 +; GFX10-NEXT: s_bfe_u32 s7, s0, 0x80008 +; GFX10-NEXT: s_bfe_u32 s9, s1, 0x80008 +; GFX10-NEXT: s_lshr_b32 s3, s0, 24 +; GFX10-NEXT: s_lshr_b32 s5, s1, 24 +; GFX10-NEXT: s_and_b32 s6, s0, 0xff +; GFX10-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX10-NEXT: s_and_b32 s8, s1, 0xff +; GFX10-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX10-NEXT: s_lshl_b32 s7, s7, 8 +; GFX10-NEXT: s_lshl_b32 s9, s9, 8 ; GFX10-NEXT: s_lshl_b32 s0, s0, 16 ; GFX10-NEXT: s_lshl_b32 s1, s1, 16 -; GFX10-NEXT: s_or_b32 s5, s9, s5 -; GFX10-NEXT: s_or_b32 s2, s2, s3 -; GFX10-NEXT: s_lshl_b32 s7, s7, 24 -; GFX10-NEXT: s_lshl_b32 s8, s8, 24 -; GFX10-NEXT: s_or_b32 s0, s5, s0 -; GFX10-NEXT: s_or_b32 s1, s2, s1 -; GFX10-NEXT: s_or_b32 s0, s0, s7 -; GFX10-NEXT: s_or_b32 s1, s1, s8 -; GFX10-NEXT: s_cmp_eq_u32 s6, 1 +; GFX10-NEXT: s_or_b32 s6, s6, s7 +; GFX10-NEXT: s_or_b32 s7, s8, s9 +; GFX10-NEXT: s_lshl_b32 s3, s3, 24 +; GFX10-NEXT: s_lshl_b32 s5, s5, 24 +; GFX10-NEXT: s_or_b32 s0, s6, s0 +; GFX10-NEXT: s_or_b32 s1, s7, s1 +; GFX10-NEXT: s_or_b32 s0, s0, s3 +; GFX10-NEXT: s_or_b32 s1, s1, s5 +; GFX10-NEXT: s_cmp_eq_u32 s2, 1 ; GFX10-NEXT: s_cselect_b32 s0, s1, s0 ; GFX10-NEXT: s_and_b32 s1, s4, 3 ; GFX10-NEXT: s_lshl_b32 s1, s1, 3 @@ -934,7 +928,6 @@ ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX10-NEXT: s_mov_b32 s0, 8 ; GFX10-NEXT: s_mov_b32 s1, 16 -; GFX10-NEXT: s_movk_i32 s3, 0xff ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 @@ -942,9 +935,9 @@ ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v6, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v7, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v0, v0, s3, v3 +; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX10-NEXT: v_and_or_b32 v1, v1, s3, v5 +; GFX10-NEXT: v_and_or_b32 v1, v1, 0xff, v5 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v4 ; GFX10-NEXT: s_lshr_b32 s0, s2, 2 ; GFX10-NEXT: v_or3_b32 v0, v0, v6, v2 @@ -1063,7 +1056,6 @@ ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX10-NEXT: s_mov_b32 s4, 8 ; GFX10-NEXT: s_mov_b32 s5, 16 -; GFX10-NEXT: s_movk_i32 s6, 0xff ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX10-NEXT: v_lshlrev_b32_sdwa v4, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 @@ -1071,9 +1063,9 @@ ; GFX10-NEXT: v_lshlrev_b32_sdwa v6, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v7, s5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v8, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v0, v0, s6, v4 +; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v4 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GFX10-NEXT: v_and_or_b32 v1, v1, s6, v6 +; GFX10-NEXT: v_and_or_b32 v1, v1, 0xff, v6 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v5 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 2, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 3, v2 @@ -1093,37 +1085,34 @@ ; GCN-LABEL: extractelement_sgpr_v8i8_vgpr_idx: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GCN-NEXT: s_mov_b32 s6, 0x80008 -; GCN-NEXT: s_movk_i32 s2, 0xff ; GCN-NEXT: v_lshrrev_b32_e32 v1, 2, v0 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GCN-NEXT: v_and_b32_e32 v0, 3, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_bfe_u32 s7, s0, s6 -; GCN-NEXT: s_and_b32 s5, s0, s2 -; GCN-NEXT: s_lshl_b32 s7, s7, 8 -; GCN-NEXT: s_or_b32 s5, s5, s7 -; GCN-NEXT: s_mov_b32 s7, 0x80010 -; GCN-NEXT: s_lshr_b32 s3, s0, 24 -; GCN-NEXT: s_bfe_u32 s0, s0, s7 +; GCN-NEXT: s_bfe_u32 s5, s0, 0x80008 +; GCN-NEXT: s_lshr_b32 s2, s0, 24 +; GCN-NEXT: s_and_b32 s4, s0, 0xff +; GCN-NEXT: s_lshl_b32 s5, s5, 8 +; GCN-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GCN-NEXT: s_or_b32 s4, s4, s5 ; GCN-NEXT: s_lshl_b32 s0, s0, 16 -; GCN-NEXT: s_or_b32 s0, s5, s0 -; GCN-NEXT: s_lshl_b32 s3, s3, 24 -; GCN-NEXT: s_or_b32 s0, s0, s3 -; GCN-NEXT: s_bfe_u32 s3, s1, s6 -; GCN-NEXT: s_lshr_b32 s4, s1, 24 -; GCN-NEXT: s_and_b32 s2, s1, s2 -; GCN-NEXT: s_lshl_b32 s3, s3, 8 -; GCN-NEXT: s_bfe_u32 s1, s1, s7 -; GCN-NEXT: s_or_b32 s2, s2, s3 +; GCN-NEXT: s_or_b32 s0, s4, s0 +; GCN-NEXT: s_lshl_b32 s2, s2, 24 +; GCN-NEXT: s_bfe_u32 s4, s1, 0x80008 +; GCN-NEXT: s_lshr_b32 s3, s1, 24 +; GCN-NEXT: s_or_b32 s0, s0, s2 +; GCN-NEXT: s_and_b32 s2, s1, 0xff +; GCN-NEXT: s_lshl_b32 s4, s4, 8 +; GCN-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GCN-NEXT: s_or_b32 s2, s2, s4 ; GCN-NEXT: s_lshl_b32 s1, s1, 16 ; GCN-NEXT: s_or_b32 s1, s2, s1 -; GCN-NEXT: s_lshl_b32 s2, s4, 24 +; GCN-NEXT: s_lshl_b32 s2, s3, 24 ; GCN-NEXT: s_or_b32 s1, s1, s2 ; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: v_mov_b32_e32 v3, s1 -; GCN-NEXT: v_and_b32_e32 v0, 3, v0 ; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GCN-NEXT: v_lshrrev_b32_e32 v0, v0, v1 ; GCN-NEXT: v_readfirstlane_b32 s0, v0 ; GCN-NEXT: ; return to shader part epilog @@ -1131,33 +1120,30 @@ ; GFX10-LABEL: extractelement_sgpr_v8i8_vgpr_idx: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX10-NEXT: s_mov_b32 s3, 0x80008 -; GFX10-NEXT: s_movk_i32 s2, 0xff -; GFX10-NEXT: s_mov_b32 s4, 0x80010 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 2, v0 ; GFX10-NEXT: v_and_b32_e32 v0, 3, v0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_bfe_u32 s8, s0, s3 -; GFX10-NEXT: s_bfe_u32 s3, s1, s3 -; GFX10-NEXT: s_lshr_b32 s6, s1, 24 -; GFX10-NEXT: s_and_b32 s7, s0, s2 -; GFX10-NEXT: s_and_b32 s2, s1, s2 -; GFX10-NEXT: s_bfe_u32 s1, s1, s4 -; GFX10-NEXT: s_lshl_b32 s3, s3, 8 +; GFX10-NEXT: s_bfe_u32 s7, s1, 0x80008 +; GFX10-NEXT: s_lshr_b32 s3, s1, 24 +; GFX10-NEXT: s_and_b32 s6, s1, 0xff +; GFX10-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX10-NEXT: s_lshl_b32 s7, s7, 8 ; GFX10-NEXT: s_lshl_b32 s1, s1, 16 -; GFX10-NEXT: s_or_b32 s2, s2, s3 -; GFX10-NEXT: s_lshl_b32 s3, s6, 24 -; GFX10-NEXT: s_or_b32 s1, s2, s1 -; GFX10-NEXT: s_lshr_b32 s5, s0, 24 -; GFX10-NEXT: s_bfe_u32 s0, s0, s4 -; GFX10-NEXT: s_lshl_b32 s4, s8, 8 +; GFX10-NEXT: s_or_b32 s6, s6, s7 +; GFX10-NEXT: s_bfe_u32 s5, s0, 0x80008 +; GFX10-NEXT: s_lshl_b32 s3, s3, 24 +; GFX10-NEXT: s_or_b32 s1, s6, s1 +; GFX10-NEXT: s_lshr_b32 s2, s0, 24 +; GFX10-NEXT: s_and_b32 s4, s0, 0xff +; GFX10-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX10-NEXT: s_lshl_b32 s5, s5, 8 ; GFX10-NEXT: s_or_b32 s1, s1, s3 ; GFX10-NEXT: s_lshl_b32 s0, s0, 16 -; GFX10-NEXT: s_or_b32 s3, s7, s4 +; GFX10-NEXT: s_or_b32 s3, s4, s5 ; GFX10-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-NEXT: s_lshl_b32 s2, s5, 24 +; GFX10-NEXT: s_lshl_b32 s2, s2, 24 ; GFX10-NEXT: s_or_b32 s0, s3, s0 ; GFX10-NEXT: s_or_b32 s0, s0, s2 ; GFX10-NEXT: v_cndmask_b32_e32 v1, s0, v2, vcc_lo @@ -2089,45 +2075,42 @@ ; GCN-LABEL: extractelement_sgpr_v16i8_sgpr_idx: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GCN-NEXT: s_mov_b32 s11, 0x80008 -; GCN-NEXT: s_movk_i32 s9, 0xff ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_bfe_u32 s12, s0, s11 -; GCN-NEXT: s_and_b32 s10, s0, s9 -; GCN-NEXT: s_lshl_b32 s12, s12, 8 -; GCN-NEXT: s_or_b32 s10, s10, s12 -; GCN-NEXT: s_mov_b32 s12, 0x80010 +; GCN-NEXT: s_bfe_u32 s10, s0, 0x80008 ; GCN-NEXT: s_lshr_b32 s5, s0, 24 -; GCN-NEXT: s_bfe_u32 s0, s0, s12 +; GCN-NEXT: s_and_b32 s9, s0, 0xff +; GCN-NEXT: s_lshl_b32 s10, s10, 8 +; GCN-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GCN-NEXT: s_or_b32 s9, s9, s10 ; GCN-NEXT: s_lshl_b32 s0, s0, 16 -; GCN-NEXT: s_or_b32 s0, s10, s0 +; GCN-NEXT: s_or_b32 s0, s9, s0 ; GCN-NEXT: s_lshl_b32 s5, s5, 24 -; GCN-NEXT: s_bfe_u32 s10, s1, s11 +; GCN-NEXT: s_bfe_u32 s9, s1, 0x80008 ; GCN-NEXT: s_lshr_b32 s6, s1, 24 ; GCN-NEXT: s_or_b32 s0, s0, s5 -; GCN-NEXT: s_and_b32 s5, s1, s9 -; GCN-NEXT: s_lshl_b32 s10, s10, 8 -; GCN-NEXT: s_bfe_u32 s1, s1, s12 -; GCN-NEXT: s_or_b32 s5, s5, s10 +; GCN-NEXT: s_and_b32 s5, s1, 0xff +; GCN-NEXT: s_lshl_b32 s9, s9, 8 +; GCN-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GCN-NEXT: s_or_b32 s5, s5, s9 ; GCN-NEXT: s_lshl_b32 s1, s1, 16 ; GCN-NEXT: s_or_b32 s1, s5, s1 ; GCN-NEXT: s_lshl_b32 s5, s6, 24 -; GCN-NEXT: s_bfe_u32 s6, s2, s11 +; GCN-NEXT: s_bfe_u32 s6, s2, 0x80008 ; GCN-NEXT: s_lshr_b32 s7, s2, 24 ; GCN-NEXT: s_or_b32 s1, s1, s5 -; GCN-NEXT: s_and_b32 s5, s2, s9 +; GCN-NEXT: s_and_b32 s5, s2, 0xff ; GCN-NEXT: s_lshl_b32 s6, s6, 8 -; GCN-NEXT: s_bfe_u32 s2, s2, s12 +; GCN-NEXT: s_bfe_u32 s2, s2, 0x80010 ; GCN-NEXT: s_or_b32 s5, s5, s6 ; GCN-NEXT: s_lshl_b32 s2, s2, 16 ; GCN-NEXT: s_or_b32 s2, s5, s2 ; GCN-NEXT: s_lshl_b32 s5, s7, 24 -; GCN-NEXT: s_bfe_u32 s6, s3, s11 +; GCN-NEXT: s_bfe_u32 s6, s3, 0x80008 ; GCN-NEXT: s_lshr_b32 s8, s3, 24 ; GCN-NEXT: s_or_b32 s2, s2, s5 -; GCN-NEXT: s_and_b32 s5, s3, s9 +; GCN-NEXT: s_and_b32 s5, s3, 0xff ; GCN-NEXT: s_lshl_b32 s6, s6, 8 -; GCN-NEXT: s_bfe_u32 s3, s3, s12 +; GCN-NEXT: s_bfe_u32 s3, s3, 0x80010 ; GCN-NEXT: s_or_b32 s5, s5, s6 ; GCN-NEXT: s_lshl_b32 s3, s3, 16 ; GCN-NEXT: s_or_b32 s3, s5, s3 @@ -2148,50 +2131,47 @@ ; GFX10-LABEL: extractelement_sgpr_v16i8_sgpr_idx: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GFX10-NEXT: s_mov_b32 s6, 0x80008 -; GFX10-NEXT: s_movk_i32 s5, 0xff -; GFX10-NEXT: s_mov_b32 s7, 0x80010 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_bfe_u32 s13, s0, s6 -; GFX10-NEXT: s_lshr_b32 s8, s0, 24 -; GFX10-NEXT: s_and_b32 s12, s0, s5 -; GFX10-NEXT: s_bfe_u32 s0, s0, s7 -; GFX10-NEXT: s_lshl_b32 s13, s13, 8 -; GFX10-NEXT: s_bfe_u32 s15, s1, s6 -; GFX10-NEXT: s_bfe_u32 s17, s2, s6 -; GFX10-NEXT: s_lshl_b32 s0, s0, 16 -; GFX10-NEXT: s_or_b32 s12, s12, s13 -; GFX10-NEXT: s_bfe_u32 s6, s3, s6 -; GFX10-NEXT: s_lshr_b32 s9, s1, 24 -; GFX10-NEXT: s_lshr_b32 s10, s2, 24 -; GFX10-NEXT: s_lshr_b32 s11, s3, 24 -; GFX10-NEXT: s_and_b32 s14, s1, s5 -; GFX10-NEXT: s_bfe_u32 s1, s1, s7 -; GFX10-NEXT: s_and_b32 s16, s2, s5 -; GFX10-NEXT: s_lshl_b32 s8, s8, 24 -; GFX10-NEXT: s_lshl_b32 s15, s15, 8 -; GFX10-NEXT: s_lshl_b32 s17, s17, 8 -; GFX10-NEXT: s_or_b32 s0, s12, s0 -; GFX10-NEXT: s_bfe_u32 s2, s2, s7 -; GFX10-NEXT: s_and_b32 s5, s3, s5 -; GFX10-NEXT: s_lshl_b32 s6, s6, 8 -; GFX10-NEXT: s_bfe_u32 s3, s3, s7 +; GFX10-NEXT: s_bfe_u32 s10, s0, 0x80008 +; GFX10-NEXT: s_bfe_u32 s12, s1, 0x80008 +; GFX10-NEXT: s_lshr_b32 s6, s1, 24 +; GFX10-NEXT: s_and_b32 s9, s0, 0xff +; GFX10-NEXT: s_and_b32 s11, s1, 0xff +; GFX10-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX10-NEXT: s_lshl_b32 s10, s10, 8 +; GFX10-NEXT: s_lshl_b32 s12, s12, 8 +; GFX10-NEXT: s_lshr_b32 s5, s0, 24 +; GFX10-NEXT: s_bfe_u32 s0, s0, 0x80010 ; GFX10-NEXT: s_lshl_b32 s1, s1, 16 -; GFX10-NEXT: s_or_b32 s13, s14, s15 -; GFX10-NEXT: s_or_b32 s0, s0, s8 -; GFX10-NEXT: s_or_b32 s8, s16, s17 +; GFX10-NEXT: s_or_b32 s9, s9, s10 +; GFX10-NEXT: s_or_b32 s10, s11, s12 +; GFX10-NEXT: s_bfe_u32 s14, s2, 0x80008 +; GFX10-NEXT: s_lshl_b32 s0, s0, 16 +; GFX10-NEXT: s_lshl_b32 s6, s6, 24 +; GFX10-NEXT: s_or_b32 s1, s10, s1 +; GFX10-NEXT: s_lshr_b32 s7, s2, 24 +; GFX10-NEXT: s_and_b32 s13, s2, 0xff +; GFX10-NEXT: s_bfe_u32 s2, s2, 0x80010 +; GFX10-NEXT: s_lshl_b32 s5, s5, 24 +; GFX10-NEXT: s_lshl_b32 s14, s14, 8 +; GFX10-NEXT: s_or_b32 s0, s9, s0 +; GFX10-NEXT: s_or_b32 s1, s1, s6 +; GFX10-NEXT: s_bfe_u32 s6, s3, 0x80008 +; GFX10-NEXT: s_lshr_b32 s8, s3, 24 ; GFX10-NEXT: s_lshl_b32 s2, s2, 16 -; GFX10-NEXT: s_or_b32 s5, s5, s6 +; GFX10-NEXT: s_or_b32 s11, s13, s14 +; GFX10-NEXT: s_or_b32 s0, s0, s5 +; GFX10-NEXT: s_lshl_b32 s5, s7, 24 +; GFX10-NEXT: s_and_b32 s7, s3, 0xff +; GFX10-NEXT: s_lshl_b32 s6, s6, 8 +; GFX10-NEXT: s_bfe_u32 s3, s3, 0x80010 +; GFX10-NEXT: s_or_b32 s2, s11, s2 +; GFX10-NEXT: s_or_b32 s6, s7, s6 ; GFX10-NEXT: s_lshl_b32 s3, s3, 16 -; GFX10-NEXT: s_lshl_b32 s9, s9, 24 -; GFX10-NEXT: s_or_b32 s1, s13, s1 -; GFX10-NEXT: s_or_b32 s2, s8, s2 -; GFX10-NEXT: s_lshl_b32 s8, s10, 24 -; GFX10-NEXT: s_or_b32 s3, s5, s3 -; GFX10-NEXT: s_lshl_b32 s5, s11, 24 +; GFX10-NEXT: s_or_b32 s2, s2, s5 +; GFX10-NEXT: s_or_b32 s3, s6, s3 +; GFX10-NEXT: s_lshl_b32 s5, s8, 24 ; GFX10-NEXT: s_lshr_b32 s6, s4, 2 -; GFX10-NEXT: s_or_b32 s1, s1, s9 -; GFX10-NEXT: s_or_b32 s2, s2, s8 ; GFX10-NEXT: s_or_b32 s3, s3, s5 ; GFX10-NEXT: s_cmp_eq_u32 s6, 1 ; GFX10-NEXT: s_cselect_b32 s0, s1, s0 @@ -2371,37 +2351,35 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off ; GFX10-NEXT: s_mov_b32 s0, 8 -; GFX10-NEXT: v_mov_b32_e32 v5, 8 +; GFX10-NEXT: v_mov_b32_e32 v4, 8 ; GFX10-NEXT: s_mov_b32 s1, 16 -; GFX10-NEXT: s_movk_i32 s3, 0xff -; GFX10-NEXT: v_mov_b32_e32 v4, 0xff -; GFX10-NEXT: v_mov_b32_e32 v6, 16 +; GFX10-NEXT: v_mov_b32_e32 v5, 16 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v8, 24, v1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v10, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v12, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshrrev_b32_e32 v9, 24, v2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v11, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v13, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v14, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_and_or_b32 v0, v0, s3, v10 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v9, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v11, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshrrev_b32_e32 v8, 24, v2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v10, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v12, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v13, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v9 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX10-NEXT: v_and_or_b32 v1, v1, 0xff, v11 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; GFX10-NEXT: v_and_or_b32 v1, v1, s3, v12 -; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8 ; GFX10-NEXT: s_lshr_b32 s0, s2, 2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v15, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v2, v2, v4, v14 -; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v9 -; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshrrev_b32_e32 v10, 24, v3 -; GFX10-NEXT: v_or3_b32 v0, v0, v11, v7 -; GFX10-NEXT: v_or3_b32 v1, v1, v13, v8 +; GFX10-NEXT: v_lshlrev_b32_sdwa v14, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_and_or_b32 v2, 0xff, v2, v13 +; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshrrev_b32_e32 v9, 24, v3 +; GFX10-NEXT: v_or3_b32 v0, v0, v10, v6 +; GFX10-NEXT: v_or3_b32 v1, v1, v12, v7 ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 1 -; GFX10-NEXT: v_or3_b32 v2, v2, v15, v9 -; GFX10-NEXT: v_and_or_b32 v4, v3, v4, v5 -; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v10 +; GFX10-NEXT: v_or3_b32 v2, v2, v14, v8 +; GFX10-NEXT: v_and_or_b32 v4, 0xff, v3, v4 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v9 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 2 ; GFX10-NEXT: v_or3_b32 v1, v4, v3, v5 @@ -2583,43 +2561,41 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_load_dwordx4 v[3:6], v[0:1], off ; GFX10-NEXT: s_mov_b32 s4, 8 -; GFX10-NEXT: v_mov_b32_e32 v1, 8 +; GFX10-NEXT: v_mov_b32_e32 v0, 8 ; GFX10-NEXT: s_mov_b32 s5, 16 -; GFX10-NEXT: s_movk_i32 s6, 0xff -; GFX10-NEXT: v_mov_b32_e32 v0, 0xff -; GFX10-NEXT: v_mov_b32_e32 v7, 16 -; GFX10-NEXT: v_lshrrev_b32_e32 v8, 2, v2 +; GFX10-NEXT: v_mov_b32_e32 v1, 16 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 2, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 3, v2 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v8 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v9, 24, v3 -; GFX10-NEXT: v_lshrrev_b32_e32 v10, 24, v4 -; GFX10-NEXT: v_lshlrev_b32_sdwa v13, s4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v15, s4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshrrev_b32_e32 v11, 24, v5 -; GFX10-NEXT: v_lshlrev_b32_sdwa v14, s5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v16, s5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v17, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_and_or_b32 v3, v3, s6, v13 +; GFX10-NEXT: v_lshrrev_b32_e32 v8, 24, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v9, 24, v4 +; GFX10-NEXT: v_lshlrev_b32_sdwa v12, s4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v14, s4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshrrev_b32_e32 v10, 24, v5 +; GFX10-NEXT: v_lshlrev_b32_sdwa v13, s5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v15, s5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v16, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_and_or_b32 v3, v3, 0xff, v12 +; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; GFX10-NEXT: v_and_or_b32 v4, v4, 0xff, v14 ; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v9 -; GFX10-NEXT: v_and_or_b32 v4, v4, s6, v15 +; GFX10-NEXT: v_lshrrev_b32_e32 v11, 24, v6 +; GFX10-NEXT: v_lshlrev_b32_sdwa v17, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_and_or_b32 v5, 0xff, v5, v16 ; GFX10-NEXT: v_lshlrev_b32_e32 v10, 24, v10 -; GFX10-NEXT: v_lshrrev_b32_e32 v12, 24, v6 -; GFX10-NEXT: v_lshlrev_b32_sdwa v18, v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_and_or_b32 v5, v5, v0, v17 -; GFX10-NEXT: v_lshlrev_b32_e32 v11, 24, v11 -; GFX10-NEXT: v_or3_b32 v3, v3, v14, v9 -; GFX10-NEXT: v_or3_b32 v4, v4, v16, v10 -; GFX10-NEXT: v_lshlrev_b32_sdwa v7, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v0, v6, v0, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 24, v12 -; GFX10-NEXT: v_or3_b32 v5, v5, v18, v11 +; GFX10-NEXT: v_or3_b32 v3, v3, v13, v8 +; GFX10-NEXT: v_or3_b32 v4, v4, v15, v9 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_and_or_b32 v0, 0xff, v6, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v11 +; GFX10-NEXT: v_or3_b32 v5, v5, v17, v10 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v8 -; GFX10-NEXT: v_or3_b32 v0, v0, v7, v1 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v7 +; GFX10-NEXT: v_or3_b32 v0, v0, v1, v6 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v8 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v7 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v1, v0 @@ -2633,47 +2609,46 @@ ; GCN-LABEL: extractelement_sgpr_v16i8_vgpr_idx: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GCN-NEXT: s_mov_b32 s10, 0x80008 -; GCN-NEXT: s_movk_i32 s8, 0xff ; GCN-NEXT: v_lshrrev_b32_e32 v1, 2, v0 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GCN-NEXT: v_and_b32_e32 v0, 3, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_bfe_u32 s11, s0, s10 -; GCN-NEXT: s_and_b32 s9, s0, s8 -; GCN-NEXT: s_lshl_b32 s11, s11, 8 -; GCN-NEXT: s_or_b32 s9, s9, s11 -; GCN-NEXT: s_mov_b32 s11, 0x80010 +; GCN-NEXT: s_bfe_u32 s9, s0, 0x80008 ; GCN-NEXT: s_lshr_b32 s4, s0, 24 -; GCN-NEXT: s_bfe_u32 s0, s0, s11 +; GCN-NEXT: s_and_b32 s8, s0, 0xff +; GCN-NEXT: s_lshl_b32 s9, s9, 8 +; GCN-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GCN-NEXT: s_or_b32 s8, s8, s9 ; GCN-NEXT: s_lshl_b32 s0, s0, 16 -; GCN-NEXT: s_or_b32 s0, s9, s0 +; GCN-NEXT: s_or_b32 s0, s8, s0 ; GCN-NEXT: s_lshl_b32 s4, s4, 24 -; GCN-NEXT: s_bfe_u32 s9, s1, s10 +; GCN-NEXT: s_bfe_u32 s8, s1, 0x80008 ; GCN-NEXT: s_lshr_b32 s5, s1, 24 ; GCN-NEXT: s_or_b32 s0, s0, s4 -; GCN-NEXT: s_and_b32 s4, s1, s8 -; GCN-NEXT: s_lshl_b32 s9, s9, 8 -; GCN-NEXT: s_bfe_u32 s1, s1, s11 -; GCN-NEXT: s_or_b32 s4, s4, s9 +; GCN-NEXT: s_and_b32 s4, s1, 0xff +; GCN-NEXT: s_lshl_b32 s8, s8, 8 +; GCN-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GCN-NEXT: s_or_b32 s4, s4, s8 ; GCN-NEXT: s_lshl_b32 s1, s1, 16 ; GCN-NEXT: s_or_b32 s1, s4, s1 ; GCN-NEXT: s_lshl_b32 s4, s5, 24 -; GCN-NEXT: s_bfe_u32 s5, s2, s10 +; GCN-NEXT: s_bfe_u32 s5, s2, 0x80008 ; GCN-NEXT: s_lshr_b32 s6, s2, 24 ; GCN-NEXT: s_or_b32 s1, s1, s4 -; GCN-NEXT: s_and_b32 s4, s2, s8 +; GCN-NEXT: s_and_b32 s4, s2, 0xff ; GCN-NEXT: s_lshl_b32 s5, s5, 8 -; GCN-NEXT: s_bfe_u32 s2, s2, s11 +; GCN-NEXT: s_bfe_u32 s2, s2, 0x80010 ; GCN-NEXT: s_or_b32 s4, s4, s5 ; GCN-NEXT: s_lshl_b32 s2, s2, 16 ; GCN-NEXT: s_or_b32 s2, s4, s2 ; GCN-NEXT: s_lshl_b32 s4, s6, 24 -; GCN-NEXT: s_bfe_u32 s5, s3, s10 +; GCN-NEXT: s_bfe_u32 s5, s3, 0x80008 ; GCN-NEXT: s_lshr_b32 s7, s3, 24 ; GCN-NEXT: s_or_b32 s2, s2, s4 -; GCN-NEXT: s_and_b32 s4, s3, s8 +; GCN-NEXT: s_and_b32 s4, s3, 0xff ; GCN-NEXT: s_lshl_b32 s5, s5, 8 -; GCN-NEXT: s_bfe_u32 s3, s3, s11 +; GCN-NEXT: s_bfe_u32 s3, s3, 0x80010 ; GCN-NEXT: s_or_b32 s4, s4, s5 ; GCN-NEXT: s_lshl_b32 s3, s3, 16 ; GCN-NEXT: s_or_b32 s3, s4, s3 @@ -2687,9 +2662,7 @@ ; GCN-NEXT: v_mov_b32_e32 v5, s3 ; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v0, 3, v0 ; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v5, vcc -; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GCN-NEXT: v_lshrrev_b32_e32 v0, v0, v1 ; GCN-NEXT: v_readfirstlane_b32 s0, v0 ; GCN-NEXT: ; return to shader part epilog @@ -2697,58 +2670,55 @@ ; GFX10-LABEL: extractelement_sgpr_v16i8_vgpr_idx: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GFX10-NEXT: s_mov_b32 s5, 0x80008 -; GFX10-NEXT: s_movk_i32 s4, 0xff -; GFX10-NEXT: s_mov_b32 s6, 0x80010 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 2, v0 ; GFX10-NEXT: v_and_b32_e32 v0, 3, v0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_bfe_u32 s12, s0, s5 -; GFX10-NEXT: s_bfe_u32 s14, s1, s5 -; GFX10-NEXT: s_lshr_b32 s8, s1, 24 -; GFX10-NEXT: s_and_b32 s11, s0, s4 -; GFX10-NEXT: s_and_b32 s13, s1, s4 -; GFX10-NEXT: s_bfe_u32 s1, s1, s6 -; GFX10-NEXT: s_lshl_b32 s12, s12, 8 -; GFX10-NEXT: s_lshl_b32 s14, s14, 8 +; GFX10-NEXT: s_bfe_u32 s9, s0, 0x80008 +; GFX10-NEXT: s_bfe_u32 s11, s1, 0x80008 +; GFX10-NEXT: s_lshr_b32 s5, s1, 24 +; GFX10-NEXT: s_and_b32 s8, s0, 0xff +; GFX10-NEXT: s_and_b32 s10, s1, 0xff +; GFX10-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX10-NEXT: s_lshl_b32 s9, s9, 8 +; GFX10-NEXT: s_lshl_b32 s11, s11, 8 ; GFX10-NEXT: s_lshl_b32 s1, s1, 16 -; GFX10-NEXT: s_or_b32 s11, s11, s12 -; GFX10-NEXT: s_or_b32 s12, s13, s14 -; GFX10-NEXT: s_lshl_b32 s8, s8, 24 -; GFX10-NEXT: s_or_b32 s1, s12, s1 -; GFX10-NEXT: s_lshr_b32 s7, s0, 24 -; GFX10-NEXT: s_bfe_u32 s0, s0, s6 -; GFX10-NEXT: s_or_b32 s1, s1, s8 +; GFX10-NEXT: s_or_b32 s8, s8, s9 +; GFX10-NEXT: s_or_b32 s9, s10, s11 +; GFX10-NEXT: s_lshl_b32 s5, s5, 24 +; GFX10-NEXT: s_or_b32 s1, s9, s1 +; GFX10-NEXT: s_lshr_b32 s4, s0, 24 +; GFX10-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX10-NEXT: s_or_b32 s1, s1, s5 ; GFX10-NEXT: s_lshl_b32 s0, s0, 16 -; GFX10-NEXT: s_bfe_u32 s16, s2, s5 +; GFX10-NEXT: s_bfe_u32 s13, s2, 0x80008 ; GFX10-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-NEXT: s_lshl_b32 s7, s7, 24 -; GFX10-NEXT: s_or_b32 s0, s11, s0 -; GFX10-NEXT: s_lshr_b32 s9, s2, 24 -; GFX10-NEXT: s_and_b32 s15, s2, s4 -; GFX10-NEXT: s_lshl_b32 s16, s16, 8 -; GFX10-NEXT: s_bfe_u32 s2, s2, s6 -; GFX10-NEXT: s_or_b32 s0, s0, s7 -; GFX10-NEXT: s_or_b32 s7, s15, s16 +; GFX10-NEXT: s_lshl_b32 s4, s4, 24 +; GFX10-NEXT: s_or_b32 s0, s8, s0 +; GFX10-NEXT: s_lshr_b32 s6, s2, 24 +; GFX10-NEXT: s_and_b32 s12, s2, 0xff +; GFX10-NEXT: s_bfe_u32 s2, s2, 0x80010 +; GFX10-NEXT: s_lshl_b32 s13, s13, 8 +; GFX10-NEXT: s_or_b32 s0, s0, s4 ; GFX10-NEXT: s_lshl_b32 s2, s2, 16 -; GFX10-NEXT: s_bfe_u32 s5, s3, s5 +; GFX10-NEXT: s_or_b32 s10, s12, s13 +; GFX10-NEXT: s_bfe_u32 s5, s3, 0x80008 ; GFX10-NEXT: v_cndmask_b32_e32 v2, s0, v2, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v1 -; GFX10-NEXT: s_or_b32 s2, s7, s2 -; GFX10-NEXT: s_lshl_b32 s7, s9, 24 -; GFX10-NEXT: s_and_b32 s4, s3, s4 +; GFX10-NEXT: s_or_b32 s2, s10, s2 +; GFX10-NEXT: s_lshl_b32 s4, s6, 24 +; GFX10-NEXT: s_and_b32 s6, s3, 0xff ; GFX10-NEXT: s_lshl_b32 s5, s5, 8 -; GFX10-NEXT: s_bfe_u32 s1, s3, s6 -; GFX10-NEXT: s_or_b32 s2, s2, s7 -; GFX10-NEXT: s_lshr_b32 s10, s3, 24 -; GFX10-NEXT: s_or_b32 s3, s4, s5 +; GFX10-NEXT: s_bfe_u32 s1, s3, 0x80010 +; GFX10-NEXT: s_or_b32 s2, s2, s4 +; GFX10-NEXT: s_lshr_b32 s7, s3, 24 +; GFX10-NEXT: s_or_b32 s3, s6, s5 ; GFX10-NEXT: s_lshl_b32 s1, s1, 16 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s2, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v1 ; GFX10-NEXT: s_or_b32 s0, s3, s1 -; GFX10-NEXT: s_lshl_b32 s1, s10, 24 +; GFX10-NEXT: s_lshl_b32 s1, s7, 24 ; GFX10-NEXT: s_or_b32 s3, s0, s1 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s3, vcc_lo ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v0, v1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll @@ -901,19 +901,17 @@ ; GFX10-FLUSH: ; %bb.0: ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-FLUSH-NEXT: s_mov_b32 s4, 0x6f800000 -; GFX10-FLUSH-NEXT: s_mov_b32 s5, 0x2f800000 -; GFX10-FLUSH-NEXT: v_cmp_gt_f32_e64 s6, |v2|, s4 -; GFX10-FLUSH-NEXT: v_cmp_gt_f32_e64 s4, |v3|, s4 -; GFX10-FLUSH-NEXT: v_cndmask_b32_e64 v4, 1.0, s5, s6 -; GFX10-FLUSH-NEXT: v_cndmask_b32_e64 v5, 1.0, s5, s4 +; GFX10-FLUSH-NEXT: v_cmp_lt_f32_e64 s4, 0x6f800000, |v2| +; GFX10-FLUSH-NEXT: v_cndmask_b32_e64 v4, 1.0, 0x2f800000, s4 +; GFX10-FLUSH-NEXT: v_cmp_lt_f32_e64 s4, 0x6f800000, |v3| ; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4 -; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5 +; GFX10-FLUSH-NEXT: v_cndmask_b32_e64 v5, 1.0, 0x2f800000, s4 ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v2, v2 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5 ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v3, v3 ; GFX10-FLUSH-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX10-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v3 ; GFX10-FLUSH-NEXT: v_mul_f32_e32 v0, v4, v0 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v3 ; GFX10-FLUSH-NEXT: v_mul_f32_e32 v1, v5, v1 ; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv <2 x float> %a, %b, !fpmath !0 @@ -1334,19 +1332,17 @@ ; GFX10-IEEE: ; %bb.0: ; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-IEEE-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-IEEE-NEXT: s_mov_b32 s4, 0x6f800000 -; GFX10-IEEE-NEXT: s_mov_b32 s5, 0x2f800000 -; GFX10-IEEE-NEXT: v_cmp_gt_f32_e64 s6, |v0|, s4 -; GFX10-IEEE-NEXT: v_cmp_gt_f32_e64 s4, |v1|, s4 -; GFX10-IEEE-NEXT: v_cndmask_b32_e64 v2, 1.0, s5, s6 -; GFX10-IEEE-NEXT: v_cndmask_b32_e64 v3, 1.0, s5, s4 +; GFX10-IEEE-NEXT: v_cmp_lt_f32_e64 s4, 0x6f800000, |v0| +; GFX10-IEEE-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x2f800000, s4 +; GFX10-IEEE-NEXT: v_cmp_lt_f32_e64 s4, 0x6f800000, |v1| ; GFX10-IEEE-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX10-IEEE-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x2f800000, s4 ; GFX10-IEEE-NEXT: v_rcp_f32_e32 v0, v0 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v1, v1, v3 ; GFX10-IEEE-NEXT: v_rcp_f32_e32 v1, v1 ; GFX10-IEEE-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX10-IEEE-NEXT: v_mul_f32_e32 v0, v2, v0 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX10-IEEE-NEXT: v_mul_f32_e32 v1, v3, v1 ; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31] ; @@ -1492,19 +1488,17 @@ ; GFX10-FLUSH: ; %bb.0: ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-FLUSH-NEXT: s_mov_b32 s4, 0x6f800000 -; GFX10-FLUSH-NEXT: s_mov_b32 s5, 0x2f800000 -; GFX10-FLUSH-NEXT: v_cmp_gt_f32_e64 s6, |v2|, s4 -; GFX10-FLUSH-NEXT: v_cmp_gt_f32_e64 s4, |v3|, s4 -; GFX10-FLUSH-NEXT: v_cndmask_b32_e64 v4, 1.0, s5, s6 -; GFX10-FLUSH-NEXT: v_cndmask_b32_e64 v5, 1.0, s5, s4 +; GFX10-FLUSH-NEXT: v_cmp_lt_f32_e64 s4, 0x6f800000, |v2| +; GFX10-FLUSH-NEXT: v_cndmask_b32_e64 v4, 1.0, 0x2f800000, s4 +; GFX10-FLUSH-NEXT: v_cmp_lt_f32_e64 s4, 0x6f800000, |v3| ; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4 -; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5 +; GFX10-FLUSH-NEXT: v_cndmask_b32_e64 v5, 1.0, 0x2f800000, s4 ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v2, v2 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5 ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v3, v3 ; GFX10-FLUSH-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX10-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v3 ; GFX10-FLUSH-NEXT: v_mul_f32_e32 v0, v4, v0 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v3 ; GFX10-FLUSH-NEXT: v_mul_f32_e32 v1, v5, v1 ; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv arcp <2 x float> %a, %b, !fpmath !0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll @@ -96,12 +96,11 @@ ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, 0, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: v_mov_b32_e32 v2, 4 -; GFX10-NEXT: v_mov_b32_e32 v3, 15 +; GFX10-NEXT: v_mov_b32_e32 v2, 15 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1 -; GFX10-NEXT: v_add_nc_u32_e32 v0, v2, v0 -; GFX10-NEXT: v_add_nc_u32_e32 v1, v2, v1 -; GFX10-NEXT: scratch_store_dword v0, v3, off +; GFX10-NEXT: v_add_nc_u32_e32 v0, 4, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v1, 4, v1 +; GFX10-NEXT: scratch_store_dword v0, v2, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, v1, off offset:124 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -156,12 +155,11 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_and_b32_e32 v1, 15, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: v_mov_b32_e32 v2, s32 -; GFX10-NEXT: v_mov_b32_e32 v3, 15 +; GFX10-NEXT: v_mov_b32_e32 v2, 15 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1 -; GFX10-NEXT: v_add_nc_u32_e32 v0, v2, v0 -; GFX10-NEXT: v_add_nc_u32_e32 v1, v2, v1 -; GFX10-NEXT: scratch_store_dword v0, v3, off +; GFX10-NEXT: v_add_nc_u32_e32 v0, s32, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v1, s32, v1 +; GFX10-NEXT: scratch_store_dword v0, v2, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, v1, off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -327,14 +325,13 @@ ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, 0, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: v_mov_b32_e32 v2, 0x104 -; GFX10-NEXT: v_mov_b32_e32 v3, 15 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1 -; GFX10-NEXT: v_add_nc_u32_e32 v0, v2, v0 -; GFX10-NEXT: v_add_nc_u32_e32 v1, v2, v1 -; GFX10-NEXT: scratch_load_dword v2, off, off offset:4 glc dlc +; GFX10-NEXT: v_mov_b32_e32 v2, 15 +; GFX10-NEXT: scratch_load_dword v3, off, off offset:4 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: scratch_store_dword v0, v3, off +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1 +; GFX10-NEXT: v_add_nc_u32_e32 v0, 0x104, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x104, v1 +; GFX10-NEXT: scratch_store_dword v0, v2, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, v1, off offset:124 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -396,16 +393,16 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_and_b32_e32 v1, 15, v0 -; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x100 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: v_mov_b32_e32 v2, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v3, 15 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1 -; GFX10-NEXT: v_add_nc_u32_e32 v0, v2, v0 -; GFX10-NEXT: v_add_nc_u32_e32 v1, v2, v1 -; GFX10-NEXT: scratch_load_dword v2, off, s32 glc dlc +; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x100 +; GFX10-NEXT: v_mov_b32_e32 v2, 15 +; GFX10-NEXT: scratch_load_dword v3, off, s32 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: scratch_store_dword v0, v3, off +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1 +; GFX10-NEXT: v_add_nc_u32_e32 v0, vcc_lo, v0 +; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x100 +; GFX10-NEXT: v_add_nc_u32_e32 v1, vcc_lo, v1 +; GFX10-NEXT: scratch_store_dword v0, v2, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, v1, off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -548,14 +545,13 @@ ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, 0, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: v_mov_b32_e32 v2, 0x4004 -; GFX10-NEXT: v_mov_b32_e32 v3, 15 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1 -; GFX10-NEXT: v_add_nc_u32_e32 v0, v2, v0 -; GFX10-NEXT: v_add_nc_u32_e32 v1, v2, v1 -; GFX10-NEXT: scratch_load_dword v2, off, off offset:4 glc dlc +; GFX10-NEXT: v_mov_b32_e32 v2, 15 +; GFX10-NEXT: scratch_load_dword v3, off, off offset:4 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: scratch_store_dword v0, v3, off +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1 +; GFX10-NEXT: v_add_nc_u32_e32 v0, 0x4004, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x4004, v1 +; GFX10-NEXT: scratch_store_dword v0, v2, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, v1, off offset:124 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -619,16 +615,16 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_and_b32_e32 v1, 15, v0 -; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x4004 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: v_mov_b32_e32 v2, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v3, 15 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1 -; GFX10-NEXT: v_add_nc_u32_e32 v0, v2, v0 -; GFX10-NEXT: v_add_nc_u32_e32 v1, v2, v1 -; GFX10-NEXT: scratch_load_dword v2, off, s32 offset:4 glc dlc +; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x4004 +; GFX10-NEXT: v_mov_b32_e32 v2, 15 +; GFX10-NEXT: scratch_load_dword v3, off, s32 offset:4 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: scratch_store_dword v0, v3, off +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1 +; GFX10-NEXT: v_add_nc_u32_e32 v0, vcc_lo, v0 +; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x4004 +; GFX10-NEXT: v_add_nc_u32_e32 v1, vcc_lo, v1 +; GFX10-NEXT: scratch_store_dword v0, v2, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, v1, off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll @@ -325,10 +325,8 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_load_dword v3, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 s2, 0x80000000 -; GFX10-NEXT: v_sub_f32_e32 v1, s2, v1 -; GFX10-NEXT: v_sub_f32_e64 v3, s2, |v3| +; GFX10-NEXT: v_sub_f32_e32 v1, 0x80000000, v1 +; GFX10-NEXT: v_sub_f32_e64 v3, 0x80000000, |v3| ; GFX10-NEXT: v_med3_f32 v1, v1, |v2|, v3 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm @@ -447,11 +445,9 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_load_dword v3, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 s2, 0x80000000 -; GFX10-NEXT: v_sub_f32_e64 v1, s2, |v1| -; GFX10-NEXT: v_sub_f32_e64 v2, s2, |v2| -; GFX10-NEXT: v_sub_f32_e64 v3, s2, |v3| +; GFX10-NEXT: v_sub_f32_e64 v1, 0x80000000, |v1| +; GFX10-NEXT: v_sub_f32_e64 v2, 0x80000000, |v2| +; GFX10-NEXT: v_sub_f32_e64 v3, 0x80000000, |v3| ; GFX10-NEXT: v_med3_f32 v1, v1, v2, v3 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll @@ -473,9 +473,8 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_mov_b32 s4, 0x80008000 -; GFX10-NEXT: v_xor_b32_e32 v0, s4, v0 -; GFX10-NEXT: v_xor_b32_e32 v1, s4, v1 +; GFX10-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 +; GFX10-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 ; GFX10-NEXT: v_log_f16_e32 v2, v0 ; GFX10-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll @@ -9,8 +9,8 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v0, 7 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX6-NEXT: s_and_b32 s2, s2, 0x7f ; GFX6-NEXT: s_movk_i32 s3, 0x7f -; GFX6-NEXT: s_and_b32 s2, s2, s3 ; GFX6-NEXT: s_bfe_u32 s1, s1, 0x60001 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -39,9 +39,9 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, 7 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX8-NEXT: s_and_b32 s2, s2, 0x7f +; GFX8-NEXT: s_and_b32 s1, s1, 0x7f ; GFX8-NEXT: s_movk_i32 s3, 0x7f -; GFX8-NEXT: s_and_b32 s2, s2, s3 -; GFX8-NEXT: s_and_b32 s1, s1, s3 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 @@ -71,9 +71,9 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, 7 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: s_and_b32 s2, s2, 0x7f +; GFX9-NEXT: s_and_b32 s1, s1, 0x7f ; GFX9-NEXT: s_movk_i32 s3, 0x7f -; GFX9-NEXT: s_and_b32 s2, s2, s3 -; GFX9-NEXT: s_and_b32 s1, s1, s3 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 @@ -102,11 +102,10 @@ ; GFX10-LABEL: s_fshl_i7: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, 7 -; GFX10-NEXT: s_movk_i32 s3, 0x7f -; GFX10-NEXT: s_and_b32 s2, s2, s3 -; GFX10-NEXT: s_and_b32 s1, s1, s3 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX10-NEXT: s_and_b32 s2, s2, 0x7f +; GFX10-NEXT: s_and_b32 s1, s1, 0x7f ; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: s_lshr_b32 s1, s1, 1 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -123,8 +122,8 @@ ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX10-NEXT: v_sub_nc_u16 v1, 6, v0 -; GFX10-NEXT: v_and_b32_e32 v0, s3, v0 -; GFX10-NEXT: v_and_b32_e32 v1, s3, v1 +; GFX10-NEXT: v_and_b32_e32 v0, 0x7f, v0 +; GFX10-NEXT: v_and_b32_e32 v1, 0x7f, v1 ; GFX10-NEXT: v_lshlrev_b16 v0, v0, s0 ; GFX10-NEXT: v_lshrrev_b16 v1, v1, s1 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 @@ -250,10 +249,9 @@ ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 7, v2 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v2 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v3, 0x7f -; GFX10-NEXT: v_sub_nc_u16 v4, 6, v2 -; GFX10-NEXT: v_and_b32_e32 v2, v2, v3 -; GFX10-NEXT: v_and_b32_e32 v3, v4, v3 +; GFX10-NEXT: v_sub_nc_u16 v3, 6, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0x7f, v2 +; GFX10-NEXT: v_and_b32_e32 v3, 0x7f, v3 ; GFX10-NEXT: v_lshlrev_b16 v0, v2, v0 ; GFX10-NEXT: v_lshrrev_b16 v1, v3, v1 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 @@ -541,68 +539,65 @@ ; GFX6-NEXT: s_lshr_b32 s1, s1, 1 ; GFX6-NEXT: s_lshl_b32 s2, s3, s2 ; GFX6-NEXT: s_lshr_b32 s1, s1, s4 -; GFX6-NEXT: s_movk_i32 s6, 0xff ; GFX6-NEXT: s_or_b32 s1, s2, s1 -; GFX6-NEXT: s_and_b32 s1, s1, s6 -; GFX6-NEXT: s_and_b32 s0, s0, s6 +; GFX6-NEXT: s_and_b32 s1, s1, 0xff +; GFX6-NEXT: s_and_b32 s0, s0, 0xff ; GFX6-NEXT: s_lshl_b32 s1, s1, 8 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_fshl_v2i8: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_and_b32 s6, s2, 7 -; GFX8-NEXT: s_lshr_b32 s3, s0, 8 -; GFX8-NEXT: s_lshl_b32 s0, s0, s6 -; GFX8-NEXT: s_movk_i32 s6, 0xff ; GFX8-NEXT: s_lshr_b32 s4, s1, 8 -; GFX8-NEXT: s_and_b32 s1, s1, s6 +; GFX8-NEXT: s_and_b32 s1, s1, 0xff ; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX8-NEXT: s_lshr_b32 s5, s2, 8 +; GFX8-NEXT: s_and_b32 s6, s2, 7 ; GFX8-NEXT: s_andn2_b32 s2, 7, s2 ; GFX8-NEXT: s_lshr_b32 s1, s1, 1 +; GFX8-NEXT: s_lshr_b32 s3, s0, 8 +; GFX8-NEXT: s_lshl_b32 s0, s0, s6 ; GFX8-NEXT: s_lshr_b32 s1, s1, s2 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: s_and_b32 s1, s5, 7 ; GFX8-NEXT: s_lshl_b32 s1, s3, s1 -; GFX8-NEXT: s_and_b32 s3, s4, s6 +; GFX8-NEXT: s_and_b32 s3, s4, 0xff ; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 ; GFX8-NEXT: s_andn2_b32 s2, 7, s5 ; GFX8-NEXT: s_lshr_b32 s3, s3, 1 ; GFX8-NEXT: s_lshr_b32 s2, s3, s2 ; GFX8-NEXT: s_or_b32 s1, s1, s2 -; GFX8-NEXT: s_and_b32 s1, s1, s6 +; GFX8-NEXT: s_and_b32 s1, s1, 0xff ; GFX8-NEXT: s_bfe_u32 s2, 8, 0x100000 -; GFX8-NEXT: s_and_b32 s0, s0, s6 +; GFX8-NEXT: s_and_b32 s0, s0, 0xff ; GFX8-NEXT: s_lshl_b32 s1, s1, s2 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_fshl_v2i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_and_b32 s6, s2, 7 -; GFX9-NEXT: s_lshr_b32 s3, s0, 8 -; GFX9-NEXT: s_lshl_b32 s0, s0, s6 -; GFX9-NEXT: s_movk_i32 s6, 0xff ; GFX9-NEXT: s_lshr_b32 s4, s1, 8 -; GFX9-NEXT: s_and_b32 s1, s1, s6 +; GFX9-NEXT: s_and_b32 s1, s1, 0xff ; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX9-NEXT: s_lshr_b32 s5, s2, 8 +; GFX9-NEXT: s_and_b32 s6, s2, 7 ; GFX9-NEXT: s_andn2_b32 s2, 7, s2 ; GFX9-NEXT: s_lshr_b32 s1, s1, 1 +; GFX9-NEXT: s_lshr_b32 s3, s0, 8 +; GFX9-NEXT: s_lshl_b32 s0, s0, s6 ; GFX9-NEXT: s_lshr_b32 s1, s1, s2 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: s_and_b32 s1, s5, 7 ; GFX9-NEXT: s_lshl_b32 s1, s3, s1 -; GFX9-NEXT: s_and_b32 s3, s4, s6 +; GFX9-NEXT: s_and_b32 s3, s4, 0xff ; GFX9-NEXT: s_bfe_u32 s3, s3, 0x100000 ; GFX9-NEXT: s_andn2_b32 s2, 7, s5 ; GFX9-NEXT: s_lshr_b32 s3, s3, 1 ; GFX9-NEXT: s_lshr_b32 s2, s3, s2 ; GFX9-NEXT: s_or_b32 s1, s1, s2 -; GFX9-NEXT: s_and_b32 s1, s1, s6 +; GFX9-NEXT: s_and_b32 s1, s1, 0xff ; GFX9-NEXT: s_bfe_u32 s2, 8, 0x100000 -; GFX9-NEXT: s_and_b32 s0, s0, s6 +; GFX9-NEXT: s_and_b32 s0, s0, 0xff ; GFX9-NEXT: s_lshl_b32 s1, s1, s2 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog @@ -610,28 +605,27 @@ ; GFX10-LABEL: s_fshl_v2i8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_lshr_b32 s4, s1, 8 -; GFX10-NEXT: s_movk_i32 s6, 0xff ; GFX10-NEXT: s_lshr_b32 s5, s2, 8 -; GFX10-NEXT: s_and_b32 s4, s4, s6 -; GFX10-NEXT: s_and_b32 s7, s2, 7 -; GFX10-NEXT: s_and_b32 s1, s1, s6 +; GFX10-NEXT: s_and_b32 s4, s4, 0xff +; GFX10-NEXT: s_and_b32 s6, s2, 7 +; GFX10-NEXT: s_and_b32 s1, s1, 0xff ; GFX10-NEXT: s_bfe_u32 s4, s4, 0x100000 ; GFX10-NEXT: s_lshr_b32 s3, s0, 8 ; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX10-NEXT: s_lshl_b32 s0, s0, s7 -; GFX10-NEXT: s_and_b32 s7, s5, 7 +; GFX10-NEXT: s_lshl_b32 s0, s0, s6 +; GFX10-NEXT: s_and_b32 s6, s5, 7 ; GFX10-NEXT: s_andn2_b32 s5, 7, s5 ; GFX10-NEXT: s_lshr_b32 s4, s4, 1 ; GFX10-NEXT: s_andn2_b32 s2, 7, s2 ; GFX10-NEXT: s_lshr_b32 s1, s1, 1 -; GFX10-NEXT: s_lshl_b32 s3, s3, s7 +; GFX10-NEXT: s_lshl_b32 s3, s3, s6 ; GFX10-NEXT: s_lshr_b32 s4, s4, s5 ; GFX10-NEXT: s_lshr_b32 s1, s1, s2 ; GFX10-NEXT: s_or_b32 s2, s3, s4 ; GFX10-NEXT: s_or_b32 s0, s0, s1 -; GFX10-NEXT: s_and_b32 s1, s2, s6 +; GFX10-NEXT: s_and_b32 s1, s2, 0xff ; GFX10-NEXT: s_bfe_u32 s2, 8, 0x100000 -; GFX10-NEXT: s_and_b32 s0, s0, s6 +; GFX10-NEXT: s_and_b32 s0, s0, 0xff ; GFX10-NEXT: s_lshl_b32 s1, s1, s2 ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: ; return to shader part epilog @@ -729,20 +723,20 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v1 -; GFX10-NEXT: s_movk_i32 s4, 0xff ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v0 ; GFX10-NEXT: v_xor_b32_e32 v7, -1, v2 +; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX10-NEXT: v_xor_b32_e32 v6, -1, v3 -; GFX10-NEXT: v_and_b32_e32 v4, s4, v4 -; GFX10-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX10-NEXT: v_and_b32_e32 v4, 0xff, v4 ; GFX10-NEXT: v_and_b32_e32 v3, 7, v3 ; GFX10-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX10-NEXT: v_and_b32_e32 v7, 7, v7 ; GFX10-NEXT: v_and_b32_e32 v6, 7, v6 ; GFX10-NEXT: v_lshrrev_b16 v4, 1, v4 -; GFX10-NEXT: v_and_b32_e32 v7, 7, v7 ; GFX10-NEXT: v_lshrrev_b16 v1, 1, v1 ; GFX10-NEXT: v_lshlrev_b16 v3, v3, v5 ; GFX10-NEXT: v_lshlrev_b16 v0, v2, v0 +; GFX10-NEXT: s_movk_i32 s4, 0xff ; GFX10-NEXT: v_lshrrev_b16 v4, v6, v4 ; GFX10-NEXT: v_lshrrev_b16 v1, v7, v1 ; GFX10-NEXT: v_or_b32_e32 v2, v3, v4 @@ -785,22 +779,21 @@ ; GFX6-NEXT: s_bfe_u32 s4, s1, 0x80010 ; GFX6-NEXT: s_andn2_b32 s6, 7, s7 ; GFX6-NEXT: s_lshr_b32 s4, s4, 1 -; GFX6-NEXT: s_movk_i32 s10, 0xff ; GFX6-NEXT: s_lshr_b32 s4, s4, s6 ; GFX6-NEXT: s_or_b32 s3, s3, s4 ; GFX6-NEXT: s_and_b32 s4, s8, 7 ; GFX6-NEXT: s_andn2_b32 s6, 7, s8 ; GFX6-NEXT: s_lshr_b32 s1, s1, 25 -; GFX6-NEXT: s_and_b32 s2, s2, s10 +; GFX6-NEXT: s_and_b32 s2, s2, 0xff ; GFX6-NEXT: s_lshl_b32 s4, s5, s4 ; GFX6-NEXT: s_lshr_b32 s1, s1, s6 -; GFX6-NEXT: s_and_b32 s0, s0, s10 +; GFX6-NEXT: s_and_b32 s0, s0, 0xff ; GFX6-NEXT: s_lshl_b32 s2, s2, 8 ; GFX6-NEXT: s_or_b32 s1, s4, s1 ; GFX6-NEXT: s_or_b32 s0, s0, s2 -; GFX6-NEXT: s_and_b32 s2, s3, s10 +; GFX6-NEXT: s_and_b32 s2, s3, 0xff ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_and_b32 s1, s1, s10 +; GFX6-NEXT: s_and_b32 s1, s1, 0xff ; GFX6-NEXT: s_or_b32 s0, s0, s2 ; GFX6-NEXT: s_lshl_b32 s1, s1, 24 ; GFX6-NEXT: s_or_b32 s0, s0, s1 @@ -808,11 +801,10 @@ ; ; GFX8-LABEL: s_fshl_v4i8: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_movk_i32 s13, 0xff ; GFX8-NEXT: s_lshr_b32 s6, s1, 8 ; GFX8-NEXT: s_lshr_b32 s7, s1, 16 ; GFX8-NEXT: s_lshr_b32 s8, s1, 24 -; GFX8-NEXT: s_and_b32 s1, s1, s13 +; GFX8-NEXT: s_and_b32 s1, s1, 0xff ; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX8-NEXT: s_lshr_b32 s9, s2, 8 ; GFX8-NEXT: s_lshr_b32 s10, s2, 16 @@ -828,7 +820,7 @@ ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: s_and_b32 s1, s9, 7 ; GFX8-NEXT: s_lshl_b32 s1, s3, s1 -; GFX8-NEXT: s_and_b32 s3, s6, s13 +; GFX8-NEXT: s_and_b32 s3, s6, 0xff ; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 ; GFX8-NEXT: s_andn2_b32 s2, 7, s9 ; GFX8-NEXT: s_lshr_b32 s3, s3, 1 @@ -836,37 +828,36 @@ ; GFX8-NEXT: s_or_b32 s1, s1, s2 ; GFX8-NEXT: s_and_b32 s2, s10, 7 ; GFX8-NEXT: s_lshl_b32 s2, s4, s2 -; GFX8-NEXT: s_and_b32 s4, s7, s13 +; GFX8-NEXT: s_and_b32 s4, s7, 0xff ; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 ; GFX8-NEXT: s_andn2_b32 s3, 7, s10 ; GFX8-NEXT: s_lshr_b32 s4, s4, 1 ; GFX8-NEXT: s_lshr_b32 s3, s4, s3 ; GFX8-NEXT: s_or_b32 s2, s2, s3 ; GFX8-NEXT: s_and_b32 s3, s11, 7 -; GFX8-NEXT: s_and_b32 s1, s1, s13 +; GFX8-NEXT: s_and_b32 s1, s1, 0xff ; GFX8-NEXT: s_andn2_b32 s4, 7, s11 ; GFX8-NEXT: s_lshl_b32 s3, s5, s3 ; GFX8-NEXT: s_lshr_b32 s5, s8, 1 -; GFX8-NEXT: s_and_b32 s0, s0, s13 +; GFX8-NEXT: s_and_b32 s0, s0, 0xff ; GFX8-NEXT: s_lshl_b32 s1, s1, 8 ; GFX8-NEXT: s_lshr_b32 s4, s5, s4 ; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s1, s2, s13 +; GFX8-NEXT: s_and_b32 s1, s2, 0xff ; GFX8-NEXT: s_or_b32 s3, s3, s4 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s1, s3, s13 +; GFX8-NEXT: s_and_b32 s1, s3, 0xff ; GFX8-NEXT: s_lshl_b32 s1, s1, 24 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_fshl_v4i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_movk_i32 s13, 0xff ; GFX9-NEXT: s_lshr_b32 s6, s1, 8 ; GFX9-NEXT: s_lshr_b32 s7, s1, 16 ; GFX9-NEXT: s_lshr_b32 s8, s1, 24 -; GFX9-NEXT: s_and_b32 s1, s1, s13 +; GFX9-NEXT: s_and_b32 s1, s1, 0xff ; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX9-NEXT: s_lshr_b32 s9, s2, 8 ; GFX9-NEXT: s_lshr_b32 s10, s2, 16 @@ -882,7 +873,7 @@ ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: s_and_b32 s1, s9, 7 ; GFX9-NEXT: s_lshl_b32 s1, s3, s1 -; GFX9-NEXT: s_and_b32 s3, s6, s13 +; GFX9-NEXT: s_and_b32 s3, s6, 0xff ; GFX9-NEXT: s_bfe_u32 s3, s3, 0x100000 ; GFX9-NEXT: s_andn2_b32 s2, 7, s9 ; GFX9-NEXT: s_lshr_b32 s3, s3, 1 @@ -890,79 +881,78 @@ ; GFX9-NEXT: s_or_b32 s1, s1, s2 ; GFX9-NEXT: s_and_b32 s2, s10, 7 ; GFX9-NEXT: s_lshl_b32 s2, s4, s2 -; GFX9-NEXT: s_and_b32 s4, s7, s13 +; GFX9-NEXT: s_and_b32 s4, s7, 0xff ; GFX9-NEXT: s_bfe_u32 s4, s4, 0x100000 ; GFX9-NEXT: s_andn2_b32 s3, 7, s10 ; GFX9-NEXT: s_lshr_b32 s4, s4, 1 ; GFX9-NEXT: s_lshr_b32 s3, s4, s3 ; GFX9-NEXT: s_or_b32 s2, s2, s3 ; GFX9-NEXT: s_and_b32 s3, s11, 7 -; GFX9-NEXT: s_and_b32 s1, s1, s13 +; GFX9-NEXT: s_and_b32 s1, s1, 0xff ; GFX9-NEXT: s_andn2_b32 s4, 7, s11 ; GFX9-NEXT: s_lshl_b32 s3, s5, s3 ; GFX9-NEXT: s_lshr_b32 s5, s8, 1 -; GFX9-NEXT: s_and_b32 s0, s0, s13 +; GFX9-NEXT: s_and_b32 s0, s0, 0xff ; GFX9-NEXT: s_lshl_b32 s1, s1, 8 ; GFX9-NEXT: s_lshr_b32 s4, s5, s4 ; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: s_and_b32 s1, s2, s13 +; GFX9-NEXT: s_and_b32 s1, s2, 0xff ; GFX9-NEXT: s_or_b32 s3, s3, s4 ; GFX9-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: s_and_b32 s1, s3, s13 +; GFX9-NEXT: s_and_b32 s1, s3, 0xff ; GFX9-NEXT: s_lshl_b32 s1, s1, 24 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_fshl_v4i8: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_movk_i32 s11, 0xff ; GFX10-NEXT: s_lshr_b32 s6, s1, 8 ; GFX10-NEXT: s_lshr_b32 s7, s1, 16 ; GFX10-NEXT: s_lshr_b32 s8, s1, 24 -; GFX10-NEXT: s_and_b32 s1, s1, s11 +; GFX10-NEXT: s_and_b32 s1, s1, 0xff ; GFX10-NEXT: s_lshr_b32 s9, s2, 8 ; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX10-NEXT: s_lshr_b32 s10, s2, 16 -; GFX10-NEXT: s_lshr_b32 s12, s2, 24 -; GFX10-NEXT: s_and_b32 s13, s2, 7 +; GFX10-NEXT: s_lshr_b32 s11, s2, 24 +; GFX10-NEXT: s_and_b32 s12, s2, 7 ; GFX10-NEXT: s_andn2_b32 s2, 7, s2 ; GFX10-NEXT: s_lshr_b32 s1, s1, 1 ; GFX10-NEXT: s_lshr_b32 s3, s0, 8 ; GFX10-NEXT: s_lshr_b32 s1, s1, s2 -; GFX10-NEXT: s_and_b32 s2, s6, s11 +; GFX10-NEXT: s_and_b32 s2, s6, 0xff ; GFX10-NEXT: s_and_b32 s6, s9, 7 ; GFX10-NEXT: s_bfe_u32 s2, s2, 0x100000 ; GFX10-NEXT: s_andn2_b32 s9, 7, s9 ; GFX10-NEXT: s_lshr_b32 s2, s2, 1 ; GFX10-NEXT: s_lshr_b32 s4, s0, 16 ; GFX10-NEXT: s_lshr_b32 s5, s0, 24 -; GFX10-NEXT: s_lshl_b32 s0, s0, s13 +; GFX10-NEXT: s_lshl_b32 s0, s0, s12 ; GFX10-NEXT: s_lshl_b32 s3, s3, s6 ; GFX10-NEXT: s_lshr_b32 s2, s2, s9 ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: s_or_b32 s1, s3, s2 -; GFX10-NEXT: s_and_b32 s2, s7, s11 +; GFX10-NEXT: s_and_b32 s2, s7, 0xff ; GFX10-NEXT: s_and_b32 s3, s10, 7 ; GFX10-NEXT: s_bfe_u32 s2, s2, 0x100000 ; GFX10-NEXT: s_andn2_b32 s6, 7, s10 ; GFX10-NEXT: s_lshr_b32 s2, s2, 1 ; GFX10-NEXT: s_lshl_b32 s3, s4, s3 ; GFX10-NEXT: s_lshr_b32 s2, s2, s6 -; GFX10-NEXT: s_and_b32 s4, s12, 7 -; GFX10-NEXT: s_andn2_b32 s6, 7, s12 +; GFX10-NEXT: s_and_b32 s4, s11, 7 +; GFX10-NEXT: s_andn2_b32 s6, 7, s11 ; GFX10-NEXT: s_lshr_b32 s7, s8, 1 ; GFX10-NEXT: s_lshl_b32 s4, s5, s4 ; GFX10-NEXT: s_lshr_b32 s5, s7, s6 ; GFX10-NEXT: s_or_b32 s2, s3, s2 -; GFX10-NEXT: s_and_b32 s1, s1, s11 +; GFX10-NEXT: s_and_b32 s1, s1, 0xff ; GFX10-NEXT: s_or_b32 s3, s4, s5 -; GFX10-NEXT: s_and_b32 s0, s0, s11 +; GFX10-NEXT: s_and_b32 s0, s0, 0xff ; GFX10-NEXT: s_lshl_b32 s1, s1, 8 -; GFX10-NEXT: s_and_b32 s2, s2, s11 +; GFX10-NEXT: s_and_b32 s2, s2, 0xff ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: s_lshl_b32 s1, s2, 16 -; GFX10-NEXT: s_and_b32 s2, s3, s11 +; GFX10-NEXT: s_and_b32 s2, s3, 0xff ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: s_lshl_b32 s1, s2, 24 ; GFX10-NEXT: s_or_b32 s0, s0, s1 @@ -1136,51 +1126,50 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_lshrrev_b32_e32 v8, 8, v2 -; GFX10-NEXT: v_and_b32_e32 v11, 7, v2 +; GFX10-NEXT: v_and_b32_e32 v10, 7, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 8, v1 -; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v2 -; GFX10-NEXT: v_xor_b32_e32 v10, -1, v2 +; GFX10-NEXT: v_xor_b32_e32 v9, -1, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v2 -; GFX10-NEXT: v_lshlrev_b16 v0, v11, v0 -; GFX10-NEXT: v_xor_b32_e32 v11, -1, v8 +; GFX10-NEXT: v_lshlrev_b16 v0, v10, v0 +; GFX10-NEXT: v_xor_b32_e32 v10, -1, v8 ; GFX10-NEXT: v_and_b32_e32 v8, 7, v8 ; GFX10-NEXT: v_mov_b32_e32 v13, 0xff -; GFX10-NEXT: s_movk_i32 s4, 0xff ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v1 -; GFX10-NEXT: v_and_b32_e32 v12, s4, v1 -; GFX10-NEXT: v_and_b32_e32 v6, s4, v6 +; GFX10-NEXT: v_and_b32_e32 v12, 0xff, v1 +; GFX10-NEXT: v_and_b32_e32 v6, 0xff, v6 ; GFX10-NEXT: v_lshlrev_b16 v3, v8, v3 -; GFX10-NEXT: v_xor_b32_e32 v8, -1, v9 +; GFX10-NEXT: v_xor_b32_e32 v8, -1, v11 ; GFX10-NEXT: v_and_b32_sdwa v1, v1, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_xor_b32_e32 v13, -1, v2 -; GFX10-NEXT: v_and_b32_e32 v11, 7, v11 +; GFX10-NEXT: v_and_b32_e32 v10, 7, v10 ; GFX10-NEXT: v_lshrrev_b16 v6, 1, v6 -; GFX10-NEXT: v_and_b32_e32 v9, 7, v9 +; GFX10-NEXT: v_and_b32_e32 v11, 7, v11 ; GFX10-NEXT: v_and_b32_e32 v8, 7, v8 ; GFX10-NEXT: v_lshrrev_b16 v1, 1, v1 ; GFX10-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX10-NEXT: v_and_b32_e32 v13, 7, v13 ; GFX10-NEXT: v_lshrrev_b16 v7, 1, v7 -; GFX10-NEXT: v_and_b32_e32 v10, 7, v10 +; GFX10-NEXT: v_and_b32_e32 v9, 7, v9 ; GFX10-NEXT: v_lshrrev_b16 v12, 1, v12 -; GFX10-NEXT: v_lshrrev_b16 v6, v11, v6 -; GFX10-NEXT: v_lshlrev_b16 v4, v9, v4 +; GFX10-NEXT: v_lshrrev_b16 v6, v10, v6 +; GFX10-NEXT: v_lshlrev_b16 v4, v11, v4 ; GFX10-NEXT: v_lshrrev_b16 v1, v8, v1 ; GFX10-NEXT: v_lshlrev_b16 v2, v2, v5 ; GFX10-NEXT: v_lshrrev_b16 v5, v13, v7 -; GFX10-NEXT: v_lshrrev_b16 v7, v10, v12 +; GFX10-NEXT: v_lshrrev_b16 v7, v9, v12 ; GFX10-NEXT: v_or_b32_e32 v3, v3, v6 ; GFX10-NEXT: v_mov_b32_e32 v6, 8 ; GFX10-NEXT: v_or_b32_e32 v1, v4, v1 ; GFX10-NEXT: v_or_b32_e32 v2, v2, v5 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v7 ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX10-NEXT: v_and_b32_e32 v2, s4, v2 -; GFX10-NEXT: v_and_or_b32 v0, v0, s4, v3 +; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX10-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX10-NEXT: v_or3_b32 v0, v0, v1, v2 @@ -1199,8 +1188,8 @@ ; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_mov_b32_e32 v1, 0xffffffe8 +; GFX6-NEXT: s_and_b32 s2, s2, 0xffffff ; GFX6-NEXT: s_mov_b32 s3, 0xffffff -; GFX6-NEXT: s_and_b32 s2, s2, s3 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: s_bfe_u32 s1, s1, 0x170001 @@ -1230,8 +1219,8 @@ ; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0xffffffe8 +; GFX8-NEXT: s_and_b32 s2, s2, 0xffffff ; GFX8-NEXT: s_mov_b32 s3, 0xffffff -; GFX8-NEXT: s_and_b32 s2, s2, s3 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: s_bfe_u32 s1, s1, 0x170001 @@ -1261,8 +1250,8 @@ ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffffffe8 +; GFX9-NEXT: s_and_b32 s2, s2, 0xffffff ; GFX9-NEXT: s_mov_b32 s3, 0xffffff -; GFX9-NEXT: s_and_b32 s2, s2, s3 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: s_bfe_u32 s1, s1, 0x170001 @@ -1289,9 +1278,8 @@ ; GFX10-LABEL: s_fshl_i24: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 -; GFX10-NEXT: s_mov_b32 s3, 0xffffff +; GFX10-NEXT: s_and_b32 s2, s2, 0xffffff ; GFX10-NEXT: s_bfe_u32 s1, s1, 0x170001 -; GFX10-NEXT: s_and_b32 s2, s2, s3 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -1308,8 +1296,8 @@ ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX10-NEXT: v_sub_nc_u32_e32 v1, 23, v0 -; GFX10-NEXT: v_and_b32_e32 v0, s3, v0 -; GFX10-NEXT: v_and_b32_e32 v1, s3, v1 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffffff, v1 ; GFX10-NEXT: v_lshrrev_b32_e64 v1, v1, s1 ; GFX10-NEXT: v_lshl_or_b32 v0, s0, v0, v1 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 @@ -1433,11 +1421,10 @@ ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 24, v2 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v2 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v3, 0xffffff -; GFX10-NEXT: v_sub_nc_u32_e32 v4, 23, v2 -; GFX10-NEXT: v_and_b32_e32 v2, v2, v3 -; GFX10-NEXT: v_and_b32_e32 v4, v4, v3 -; GFX10-NEXT: v_lshrrev_b32_e32 v1, v4, v1 +; GFX10-NEXT: v_sub_nc_u32_e32 v3, 23, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffffff, v2 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffffff, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, v3, v1 ; GFX10-NEXT: v_lshl_or_b32 v0, v0, v2, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call i24 @llvm.fshl.i24(i24 %lhs, i24 %rhs, i24 %amt) @@ -1449,25 +1436,23 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX6-NEXT: s_movk_i32 s9, 0xff -; GFX6-NEXT: s_mov_b32 s11, 0x80008 ; GFX6-NEXT: s_lshr_b32 s6, s0, 16 ; GFX6-NEXT: s_lshr_b32 s7, s0, 24 -; GFX6-NEXT: s_and_b32 s10, s0, s9 -; GFX6-NEXT: s_bfe_u32 s0, s0, s11 +; GFX6-NEXT: s_and_b32 s10, s0, 0xff +; GFX6-NEXT: s_bfe_u32 s0, s0, 0x80008 ; GFX6-NEXT: s_lshl_b32 s0, s0, 8 -; GFX6-NEXT: s_and_b32 s6, s6, s9 +; GFX6-NEXT: s_and_b32 s6, s6, 0xff ; GFX6-NEXT: s_or_b32 s0, s10, s0 ; GFX6-NEXT: s_bfe_u32 s6, s6, 0x100000 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: s_lshr_b32 s8, s1, 8 ; GFX6-NEXT: s_bfe_u32 s0, s0, 0x100000 ; GFX6-NEXT: s_lshl_b32 s6, s6, 16 -; GFX6-NEXT: s_and_b32 s1, s1, s9 +; GFX6-NEXT: s_and_b32 s1, s1, 0xff ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: s_or_b32 s0, s0, s6 ; GFX6-NEXT: s_lshl_b32 s1, s1, 8 -; GFX6-NEXT: s_and_b32 s6, s8, s9 +; GFX6-NEXT: s_and_b32 s6, s8, 0xff ; GFX6-NEXT: s_or_b32 s1, s7, s1 ; GFX6-NEXT: s_bfe_u32 s6, s6, 0x100000 ; GFX6-NEXT: s_bfe_u32 s1, s1, 0x100000 @@ -1476,20 +1461,20 @@ ; GFX6-NEXT: s_or_b32 s1, s1, s6 ; GFX6-NEXT: s_lshr_b32 s6, s2, 16 ; GFX6-NEXT: s_lshr_b32 s7, s2, 24 -; GFX6-NEXT: s_and_b32 s10, s2, s9 -; GFX6-NEXT: s_bfe_u32 s2, s2, s11 +; GFX6-NEXT: s_and_b32 s10, s2, 0xff +; GFX6-NEXT: s_bfe_u32 s2, s2, 0x80008 ; GFX6-NEXT: v_mul_lo_u32 v2, v1, v0 ; GFX6-NEXT: s_lshl_b32 s2, s2, 8 -; GFX6-NEXT: s_and_b32 s6, s6, s9 +; GFX6-NEXT: s_and_b32 s6, s6, 0xff ; GFX6-NEXT: s_or_b32 s2, s10, s2 ; GFX6-NEXT: s_bfe_u32 s6, s6, 0x100000 ; GFX6-NEXT: s_lshr_b32 s8, s3, 8 ; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000 ; GFX6-NEXT: s_lshl_b32 s6, s6, 16 -; GFX6-NEXT: s_and_b32 s3, s3, s9 +; GFX6-NEXT: s_and_b32 s3, s3, 0xff ; GFX6-NEXT: s_or_b32 s2, s2, s6 ; GFX6-NEXT: s_lshl_b32 s3, s3, 8 -; GFX6-NEXT: s_and_b32 s6, s8, s9 +; GFX6-NEXT: s_and_b32 s6, s8, 0xff ; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX6-NEXT: s_or_b32 s3, s7, s3 ; GFX6-NEXT: s_bfe_u32 s6, s6, 0x100000 @@ -1498,10 +1483,10 @@ ; GFX6-NEXT: s_or_b32 s3, s3, s6 ; GFX6-NEXT: s_lshr_b32 s6, s4, 16 ; GFX6-NEXT: s_lshr_b32 s7, s4, 24 -; GFX6-NEXT: s_and_b32 s10, s4, s9 -; GFX6-NEXT: s_bfe_u32 s4, s4, s11 +; GFX6-NEXT: s_and_b32 s10, s4, 0xff +; GFX6-NEXT: s_bfe_u32 s4, s4, 0x80008 ; GFX6-NEXT: s_lshl_b32 s4, s4, 8 -; GFX6-NEXT: s_and_b32 s6, s6, s9 +; GFX6-NEXT: s_and_b32 s6, s6, 0xff ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, 24 ; GFX6-NEXT: s_or_b32 s4, s10, s4 @@ -1515,14 +1500,14 @@ ; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX6-NEXT: s_lshr_b32 s8, s5, 8 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, 24 -; GFX6-NEXT: s_and_b32 s5, s5, s9 +; GFX6-NEXT: s_and_b32 s5, s5, 0xff ; GFX6-NEXT: v_mul_lo_u32 v1, v1, v2 ; GFX6-NEXT: s_lshl_b32 s5, s5, 8 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 ; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 24, v0 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX6-NEXT: v_mul_hi_u32 v1, v2, v1 -; GFX6-NEXT: s_and_b32 s6, s8, s9 +; GFX6-NEXT: s_and_b32 s6, s8, 0xff ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX6-NEXT: s_or_b32 s5, s7, s5 ; GFX6-NEXT: s_bfe_u32 s6, s6, 0x100000 @@ -1555,6 +1540,7 @@ ; GFX6-NEXT: v_and_b32_e32 v1, v1, v4 ; GFX6-NEXT: s_lshr_b32 s0, s3, 1 ; GFX6-NEXT: v_and_b32_e32 v2, v2, v4 +; GFX6-NEXT: s_movk_i32 s9, 0xff ; GFX6-NEXT: v_lshl_b32_e32 v1, s1, v1 ; GFX6-NEXT: v_lshr_b32_e32 v2, s0, v2 ; GFX6-NEXT: v_bfe_u32 v3, v0, 8, 8 @@ -1579,25 +1565,24 @@ ; GFX8-LABEL: s_fshl_v2i24: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s6, s0, 8 -; GFX8-NEXT: s_movk_i32 s10, 0xff -; GFX8-NEXT: s_and_b32 s6, s6, s10 -; GFX8-NEXT: s_bfe_u32 s11, 8, 0x100000 +; GFX8-NEXT: s_and_b32 s6, s6, 0xff +; GFX8-NEXT: s_bfe_u32 s10, 8, 0x100000 ; GFX8-NEXT: s_lshr_b32 s7, s0, 16 ; GFX8-NEXT: s_lshr_b32 s8, s0, 24 -; GFX8-NEXT: s_and_b32 s0, s0, s10 -; GFX8-NEXT: s_lshl_b32 s6, s6, s11 +; GFX8-NEXT: s_and_b32 s0, s0, 0xff +; GFX8-NEXT: s_lshl_b32 s6, s6, s10 ; GFX8-NEXT: s_or_b32 s0, s0, s6 -; GFX8-NEXT: s_and_b32 s6, s7, s10 +; GFX8-NEXT: s_and_b32 s6, s7, 0xff ; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 ; GFX8-NEXT: s_bfe_u32 s6, s6, 0x100000 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: s_lshr_b32 s9, s1, 8 ; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 ; GFX8-NEXT: s_lshl_b32 s6, s6, 16 -; GFX8-NEXT: s_and_b32 s1, s1, s10 +; GFX8-NEXT: s_and_b32 s1, s1, 0xff ; GFX8-NEXT: s_or_b32 s0, s0, s6 -; GFX8-NEXT: s_lshl_b32 s1, s1, s11 -; GFX8-NEXT: s_and_b32 s6, s9, s10 +; GFX8-NEXT: s_lshl_b32 s1, s1, s10 +; GFX8-NEXT: s_and_b32 s6, s9, 0xff ; GFX8-NEXT: s_or_b32 s1, s8, s1 ; GFX8-NEXT: s_bfe_u32 s6, s6, 0x100000 ; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 @@ -1606,23 +1591,23 @@ ; GFX8-NEXT: s_or_b32 s1, s1, s6 ; GFX8-NEXT: s_lshr_b32 s6, s2, 8 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX8-NEXT: s_and_b32 s6, s6, s10 +; GFX8-NEXT: s_and_b32 s6, s6, 0xff ; GFX8-NEXT: s_lshr_b32 s7, s2, 16 ; GFX8-NEXT: s_lshr_b32 s8, s2, 24 -; GFX8-NEXT: s_and_b32 s2, s2, s10 -; GFX8-NEXT: s_lshl_b32 s6, s6, s11 +; GFX8-NEXT: s_and_b32 s2, s2, 0xff +; GFX8-NEXT: s_lshl_b32 s6, s6, s10 ; GFX8-NEXT: s_or_b32 s2, s2, s6 -; GFX8-NEXT: s_and_b32 s6, s7, s10 +; GFX8-NEXT: s_and_b32 s6, s7, 0xff ; GFX8-NEXT: v_mov_b32_e32 v1, 0xffffffe8 ; GFX8-NEXT: s_bfe_u32 s6, s6, 0x100000 ; GFX8-NEXT: v_mul_lo_u32 v2, v1, v0 ; GFX8-NEXT: s_lshr_b32 s9, s3, 8 ; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 ; GFX8-NEXT: s_lshl_b32 s6, s6, 16 -; GFX8-NEXT: s_and_b32 s3, s3, s10 +; GFX8-NEXT: s_and_b32 s3, s3, 0xff ; GFX8-NEXT: s_or_b32 s2, s2, s6 -; GFX8-NEXT: s_lshl_b32 s3, s3, s11 -; GFX8-NEXT: s_and_b32 s6, s9, s10 +; GFX8-NEXT: s_lshl_b32 s3, s3, s10 +; GFX8-NEXT: s_and_b32 s6, s9, 0xff ; GFX8-NEXT: s_or_b32 s3, s8, s3 ; GFX8-NEXT: s_bfe_u32 s6, s6, 0x100000 ; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 @@ -1630,13 +1615,13 @@ ; GFX8-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX8-NEXT: s_or_b32 s3, s3, s6 ; GFX8-NEXT: s_lshr_b32 s6, s4, 8 -; GFX8-NEXT: s_and_b32 s6, s6, s10 +; GFX8-NEXT: s_and_b32 s6, s6, 0xff ; GFX8-NEXT: s_lshr_b32 s7, s4, 16 ; GFX8-NEXT: s_lshr_b32 s8, s4, 24 -; GFX8-NEXT: s_and_b32 s4, s4, s10 -; GFX8-NEXT: s_lshl_b32 s6, s6, s11 +; GFX8-NEXT: s_and_b32 s4, s4, 0xff +; GFX8-NEXT: s_lshl_b32 s6, s6, s10 ; GFX8-NEXT: s_or_b32 s4, s4, s6 -; GFX8-NEXT: s_and_b32 s6, s7, s10 +; GFX8-NEXT: s_and_b32 s6, s7, 0xff ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v2, 24 ; GFX8-NEXT: s_bfe_u32 s6, s6, 0x100000 @@ -1649,14 +1634,14 @@ ; GFX8-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX8-NEXT: s_lshr_b32 s9, s5, 8 ; GFX8-NEXT: v_mul_lo_u32 v0, v0, 24 -; GFX8-NEXT: s_and_b32 s5, s5, s10 +; GFX8-NEXT: s_and_b32 s5, s5, 0xff ; GFX8-NEXT: v_mul_lo_u32 v1, v1, v2 -; GFX8-NEXT: s_lshl_b32 s5, s5, s11 +; GFX8-NEXT: s_lshl_b32 s5, s5, s10 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0 ; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, 24, v0 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX8-NEXT: v_mul_hi_u32 v1, v2, v1 -; GFX8-NEXT: s_and_b32 s6, s9, s10 +; GFX8-NEXT: s_and_b32 s6, s9, 0xff ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX8-NEXT: s_or_b32 s5, s8, s5 ; GFX8-NEXT: s_bfe_u32 s6, s6, 0x100000 @@ -1698,7 +1683,7 @@ ; GFX8-NEXT: v_or_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX8-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX8-NEXT: v_and_b32_e32 v3, s10, v1 +; GFX8-NEXT: v_and_b32_e32 v3, 0xff, v1 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v3 @@ -1712,27 +1697,26 @@ ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: s_lshr_b32 s7, s0, 8 -; GFX9-NEXT: s_movk_i32 s12, 0xff -; GFX9-NEXT: s_and_b32 s7, s7, s12 +; GFX9-NEXT: s_and_b32 s7, s7, 0xff +; GFX9-NEXT: s_bfe_u32 s12, 8, 0x100000 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 -; GFX9-NEXT: s_bfe_u32 s13, 8, 0x100000 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: s_lshr_b32 s9, s0, 16 ; GFX9-NEXT: s_lshr_b32 s10, s0, 24 -; GFX9-NEXT: s_and_b32 s0, s0, s12 -; GFX9-NEXT: s_lshl_b32 s7, s7, s13 +; GFX9-NEXT: s_and_b32 s0, s0, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s7, s12 ; GFX9-NEXT: s_or_b32 s0, s0, s7 -; GFX9-NEXT: s_and_b32 s7, s9, s12 +; GFX9-NEXT: s_and_b32 s7, s9, 0xff ; GFX9-NEXT: s_bfe_u32 s7, s7, 0x100000 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffffffe8 ; GFX9-NEXT: s_lshr_b32 s11, s1, 8 ; GFX9-NEXT: s_bfe_u32 s0, s0, 0x100000 ; GFX9-NEXT: s_lshl_b32 s7, s7, 16 -; GFX9-NEXT: s_and_b32 s1, s1, s12 +; GFX9-NEXT: s_and_b32 s1, s1, 0xff ; GFX9-NEXT: v_mul_lo_u32 v2, v1, v0 ; GFX9-NEXT: s_or_b32 s0, s0, s7 -; GFX9-NEXT: s_lshl_b32 s1, s1, s13 -; GFX9-NEXT: s_and_b32 s7, s11, s12 +; GFX9-NEXT: s_lshl_b32 s1, s1, s12 +; GFX9-NEXT: s_and_b32 s7, s11, 0xff ; GFX9-NEXT: s_or_b32 s1, s10, s1 ; GFX9-NEXT: s_bfe_u32 s7, s7, 0x100000 ; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 @@ -1740,24 +1724,24 @@ ; GFX9-NEXT: s_or_b32 s1, s1, s7 ; GFX9-NEXT: s_lshr_b32 s7, s2, 8 ; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX9-NEXT: s_and_b32 s7, s7, s12 +; GFX9-NEXT: s_and_b32 s7, s7, 0xff ; GFX9-NEXT: s_lshr_b32 s9, s2, 16 ; GFX9-NEXT: s_lshr_b32 s10, s2, 24 -; GFX9-NEXT: s_and_b32 s2, s2, s12 -; GFX9-NEXT: s_lshl_b32 s7, s7, s13 +; GFX9-NEXT: s_and_b32 s2, s2, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s7, s12 ; GFX9-NEXT: s_or_b32 s2, s2, s7 -; GFX9-NEXT: s_and_b32 s7, s9, s12 +; GFX9-NEXT: s_and_b32 s7, s9, 0xff ; GFX9-NEXT: s_bfe_u32 s7, s7, 0x100000 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, 24 ; GFX9-NEXT: s_lshr_b32 s11, s3, 8 ; GFX9-NEXT: s_bfe_u32 s2, s2, 0x100000 ; GFX9-NEXT: s_lshl_b32 s7, s7, 16 -; GFX9-NEXT: s_and_b32 s3, s3, s12 +; GFX9-NEXT: s_and_b32 s3, s3, 0xff ; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX9-NEXT: s_or_b32 s2, s2, s7 -; GFX9-NEXT: s_lshl_b32 s3, s3, s13 -; GFX9-NEXT: s_and_b32 s7, s11, s12 +; GFX9-NEXT: s_lshl_b32 s3, s3, s12 +; GFX9-NEXT: s_and_b32 s7, s11, 0xff ; GFX9-NEXT: s_or_b32 s3, s10, s3 ; GFX9-NEXT: s_bfe_u32 s7, s7, 0x100000 ; GFX9-NEXT: s_bfe_u32 s3, s3, 0x100000 @@ -1765,14 +1749,14 @@ ; GFX9-NEXT: s_or_b32 s3, s3, s7 ; GFX9-NEXT: s_lshr_b32 s7, s4, 8 ; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 -; GFX9-NEXT: s_and_b32 s7, s7, s12 +; GFX9-NEXT: s_and_b32 s7, s7, 0xff ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX9-NEXT: s_lshr_b32 s9, s4, 16 ; GFX9-NEXT: s_lshr_b32 s10, s4, 24 -; GFX9-NEXT: s_and_b32 s4, s4, s12 -; GFX9-NEXT: s_lshl_b32 s7, s7, s13 +; GFX9-NEXT: s_and_b32 s4, s4, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s7, s12 ; GFX9-NEXT: s_or_b32 s4, s4, s7 -; GFX9-NEXT: s_and_b32 s7, s9, s12 +; GFX9-NEXT: s_and_b32 s7, s9, 0xff ; GFX9-NEXT: s_bfe_u32 s7, s7, 0x100000 ; GFX9-NEXT: s_bfe_u32 s4, s4, 0x100000 ; GFX9-NEXT: s_lshl_b32 s7, s7, 16 @@ -1780,10 +1764,10 @@ ; GFX9-NEXT: s_or_b32 s4, s4, s7 ; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX9-NEXT: s_lshr_b32 s11, s5, 8 -; GFX9-NEXT: s_and_b32 s5, s5, s12 +; GFX9-NEXT: s_and_b32 s5, s5, 0xff ; GFX9-NEXT: v_mul_hi_u32 v1, v2, v1 -; GFX9-NEXT: s_lshl_b32 s5, s5, s13 -; GFX9-NEXT: s_and_b32 s7, s11, s12 +; GFX9-NEXT: s_lshl_b32 s5, s5, s12 +; GFX9-NEXT: s_and_b32 s7, s11, 0xff ; GFX9-NEXT: s_or_b32 s5, s10, s5 ; GFX9-NEXT: s_bfe_u32 s7, s7, 0x100000 ; GFX9-NEXT: v_mul_lo_u32 v0, v0, 24 @@ -1822,10 +1806,11 @@ ; GFX9-NEXT: v_lshrrev_b32_e64 v2, v2, s0 ; GFX9-NEXT: s_mov_b32 s6, 8 ; GFX9-NEXT: v_lshl_or_b32 v1, s1, v1, v2 +; GFX9-NEXT: s_movk_i32 s0, 0xff ; GFX9-NEXT: s_mov_b32 s8, 16 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_and_b32_e32 v3, s12, v1 -; GFX9-NEXT: v_and_or_b32 v2, v0, s12, v2 +; GFX9-NEXT: v_and_b32_e32 v3, s0, v1 +; GFX9-NEXT: v_and_or_b32 v2, v0, s0, v2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; GFX9-NEXT: v_or3_b32 v0, v2, v0, v3 @@ -1840,120 +1825,117 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, 24 -; GFX10-NEXT: s_movk_i32 s9, 0xff -; GFX10-NEXT: s_lshr_b32 s10, s1, 8 -; GFX10-NEXT: s_bfe_u32 s11, 8, 0x100000 +; GFX10-NEXT: s_lshr_b32 s6, s0, 8 +; GFX10-NEXT: s_lshr_b32 s7, s0, 16 +; GFX10-NEXT: s_and_b32 s6, s6, 0xff ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX10-NEXT: s_and_b32 s1, s1, s9 -; GFX10-NEXT: s_lshr_b32 s6, s0, 8 +; GFX10-NEXT: s_bfe_u32 s10, 8, 0x100000 ; GFX10-NEXT: s_lshr_b32 s8, s0, 24 -; GFX10-NEXT: s_lshl_b32 s1, s1, s11 -; GFX10-NEXT: s_and_b32 s6, s6, s9 -; GFX10-NEXT: s_or_b32 s1, s8, s1 -; GFX10-NEXT: s_lshr_b32 s8, s4, 8 -; GFX10-NEXT: s_lshr_b32 s7, s0, 16 +; GFX10-NEXT: s_and_b32 s0, s0, 0xff +; GFX10-NEXT: s_lshl_b32 s6, s6, s10 +; GFX10-NEXT: s_and_b32 s7, s7, 0xff +; GFX10-NEXT: s_or_b32 s0, s0, s6 +; GFX10-NEXT: s_bfe_u32 s6, s7, 0x100000 +; GFX10-NEXT: s_lshr_b32 s7, s4, 8 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 -; GFX10-NEXT: s_and_b32 s0, s0, s9 -; GFX10-NEXT: s_lshl_b32 s6, s6, s11 -; GFX10-NEXT: s_and_b32 s8, s8, s9 +; GFX10-NEXT: s_and_b32 s7, s7, 0xff +; GFX10-NEXT: s_lshr_b32 s11, s4, 16 +; GFX10-NEXT: s_lshr_b32 s12, s4, 24 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX10-NEXT: s_or_b32 s0, s0, s6 -; GFX10-NEXT: s_and_b32 s6, s7, s9 -; GFX10-NEXT: s_and_b32 s7, s10, s9 +; GFX10-NEXT: s_and_b32 s4, s4, 0xff +; GFX10-NEXT: s_lshl_b32 s7, s7, s10 +; GFX10-NEXT: s_lshr_b32 s13, s5, 8 ; GFX10-NEXT: v_mul_lo_u32 v2, 0xffffffe8, v0 ; GFX10-NEXT: v_mul_lo_u32 v3, 0xffffffe8, v1 -; GFX10-NEXT: s_lshr_b32 s10, s4, 16 -; GFX10-NEXT: s_lshr_b32 s12, s4, 24 -; GFX10-NEXT: s_and_b32 s4, s4, s9 -; GFX10-NEXT: s_lshl_b32 s8, s8, s11 -; GFX10-NEXT: s_lshr_b32 s13, s5, 8 -; GFX10-NEXT: s_or_b32 s4, s4, s8 +; GFX10-NEXT: s_or_b32 s4, s4, s7 +; GFX10-NEXT: s_and_b32 s7, s11, 0xff +; GFX10-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX10-NEXT: s_bfe_u32 s7, s7, 0x100000 +; GFX10-NEXT: s_and_b32 s5, s5, 0xff +; GFX10-NEXT: s_lshl_b32 s7, s7, 16 ; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX10-NEXT: s_and_b32 s8, s10, s9 ; GFX10-NEXT: v_mul_hi_u32 v3, v1, v3 -; GFX10-NEXT: s_bfe_u32 s8, s8, 0x100000 -; GFX10-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX10-NEXT: s_lshl_b32 s8, s8, 16 -; GFX10-NEXT: s_and_b32 s5, s5, s9 -; GFX10-NEXT: s_or_b32 s4, s4, s8 -; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2 -; GFX10-NEXT: s_lshl_b32 s5, s5, s11 -; GFX10-NEXT: s_and_b32 s8, s13, s9 +; GFX10-NEXT: s_lshl_b32 s5, s5, s10 +; GFX10-NEXT: s_or_b32 s4, s4, s7 +; GFX10-NEXT: s_and_b32 s7, s13, 0xff ; GFX10-NEXT: s_or_b32 s5, s12, s5 -; GFX10-NEXT: s_bfe_u32 s8, s8, 0x100000 +; GFX10-NEXT: s_bfe_u32 s7, s7, 0x100000 +; GFX10-NEXT: s_bfe_u32 s5, s5, 0x100000 +; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3 +; GFX10-NEXT: s_lshl_b32 s7, s7, 16 +; GFX10-NEXT: s_lshr_b32 s9, s1, 8 +; GFX10-NEXT: s_or_b32 s5, s5, s7 ; GFX10-NEXT: v_mul_hi_u32 v0, s4, v0 -; GFX10-NEXT: s_bfe_u32 s5, s5, 0x100000 -; GFX10-NEXT: s_lshl_b32 s8, s8, 16 -; GFX10-NEXT: s_lshr_b32 s10, s2, 16 -; GFX10-NEXT: s_or_b32 s5, s5, s8 -; GFX10-NEXT: s_lshr_b32 s8, s2, 8 ; GFX10-NEXT: v_mul_hi_u32 v1, s5, v1 -; GFX10-NEXT: s_and_b32 s8, s8, s9 +; GFX10-NEXT: s_and_b32 s1, s1, 0xff +; GFX10-NEXT: s_and_b32 s7, s9, 0xff +; GFX10-NEXT: s_lshl_b32 s1, s1, s10 +; GFX10-NEXT: s_lshr_b32 s9, s2, 16 +; GFX10-NEXT: s_or_b32 s1, s8, s1 +; GFX10-NEXT: s_lshr_b32 s8, s2, 8 ; GFX10-NEXT: v_mul_lo_u32 v0, v0, 24 -; GFX10-NEXT: s_and_b32 s12, s2, s9 -; GFX10-NEXT: s_lshl_b32 s8, s8, s11 -; GFX10-NEXT: s_and_b32 s10, s10, s9 -; GFX10-NEXT: s_or_b32 s8, s12, s8 -; GFX10-NEXT: s_lshr_b32 s2, s2, 24 ; GFX10-NEXT: v_mul_lo_u32 v1, v1, 24 -; GFX10-NEXT: s_bfe_u32 s6, s6, 0x100000 -; GFX10-NEXT: v_sub_nc_u32_e32 v0, s4, v0 -; GFX10-NEXT: s_bfe_u32 s4, s8, 0x100000 -; GFX10-NEXT: s_bfe_u32 s8, s10, 0x100000 +; GFX10-NEXT: s_and_b32 s8, s8, 0xff +; GFX10-NEXT: s_lshr_b32 s11, s2, 24 +; GFX10-NEXT: s_and_b32 s2, s2, 0xff +; GFX10-NEXT: s_lshl_b32 s8, s8, s10 ; GFX10-NEXT: s_bfe_u32 s7, s7, 0x100000 -; GFX10-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 24, v0 +; GFX10-NEXT: s_or_b32 s2, s2, s8 +; GFX10-NEXT: v_sub_nc_u32_e32 v0, s4, v0 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, s5, v1 +; GFX10-NEXT: s_lshr_b32 s4, s3, 8 +; GFX10-NEXT: s_and_b32 s5, s9, 0xff +; GFX10-NEXT: s_and_b32 s3, s3, 0xff +; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 24, v0 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 -; GFX10-NEXT: s_lshl_b32 s5, s8, 16 -; GFX10-NEXT: s_lshr_b32 s8, s3, 8 -; GFX10-NEXT: s_and_b32 s3, s3, s9 ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 24, v1 +; GFX10-NEXT: s_bfe_u32 s5, s5, 0x100000 +; GFX10-NEXT: s_lshl_b32 s3, s3, s10 +; GFX10-NEXT: s_and_b32 s4, s4, 0xff ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1 -; GFX10-NEXT: s_lshl_b32 s3, s3, s11 -; GFX10-NEXT: s_or_b32 s4, s4, s5 -; GFX10-NEXT: s_or_b32 s2, s2, s3 +; GFX10-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX10-NEXT: s_lshl_b32 s5, s5, 16 +; GFX10-NEXT: s_or_b32 s3, s11, s3 ; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 24, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 -; GFX10-NEXT: s_and_b32 s3, s8, s9 -; GFX10-NEXT: s_mov_b32 s5, 0xffffff +; GFX10-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX10-NEXT: s_or_b32 s2, s2, s5 ; GFX10-NEXT: s_bfe_u32 s3, s3, 0x100000 ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 24, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1 -; GFX10-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX10-NEXT: s_lshl_b32 s3, s3, 16 -; GFX10-NEXT: s_lshl_b32 s6, s6, 16 +; GFX10-NEXT: s_lshl_b32 s4, s4, 16 +; GFX10-NEXT: s_lshr_b32 s2, s2, 1 +; GFX10-NEXT: s_or_b32 s3, s3, s4 ; GFX10-NEXT: v_sub_nc_u32_e32 v2, 23, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v3, 0xffffff -; GFX10-NEXT: s_or_b32 s2, s2, s3 -; GFX10-NEXT: s_lshr_b32 s3, s4, 1 -; GFX10-NEXT: v_and_b32_e32 v2, s5, v2 -; GFX10-NEXT: v_sub_nc_u32_e32 v4, 23, v1 -; GFX10-NEXT: s_lshr_b32 s2, s2, 1 -; GFX10-NEXT: v_and_b32_e32 v0, s5, v0 -; GFX10-NEXT: v_and_b32_e32 v1, v1, v3 -; GFX10-NEXT: v_lshrrev_b32_e64 v2, v2, s3 -; GFX10-NEXT: v_and_b32_e32 v4, v4, v3 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GFX10-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX10-NEXT: s_lshl_b32 s6, s6, 16 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffffff, v2 +; GFX10-NEXT: v_sub_nc_u32_e32 v3, 23, v1 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffffff, v1 ; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX10-NEXT: s_lshl_b32 s7, s7, 16 +; GFX10-NEXT: v_lshrrev_b32_e64 v2, v2, s2 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffffff, v3 +; GFX10-NEXT: s_lshr_b32 s2, s3, 1 ; GFX10-NEXT: s_or_b32 s0, s0, s6 ; GFX10-NEXT: s_or_b32 s1, s1, s7 -; GFX10-NEXT: v_lshrrev_b32_e64 v3, v4, s2 ; GFX10-NEXT: v_lshl_or_b32 v0, s0, v0, v2 +; GFX10-NEXT: v_lshrrev_b32_e64 v3, v3, s2 ; GFX10-NEXT: s_mov_b32 s0, 8 -; GFX10-NEXT: v_lshl_or_b32 v1, s1, v1, v3 ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshl_or_b32 v1, s1, v1, v3 ; GFX10-NEXT: s_mov_b32 s0, 16 -; GFX10-NEXT: v_and_b32_e32 v3, s9, v1 -; GFX10-NEXT: v_and_or_b32 v2, v0, s9, v2 +; GFX10-NEXT: v_and_or_b32 v2, v0, 0xff, v2 +; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_bfe_u32 v4, v1, 8, 8 ; GFX10-NEXT: v_bfe_u32 v1, v1, 16, 8 @@ -2141,12 +2123,11 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v6, 24 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v7, 24 -; GFX10-NEXT: v_mov_b32_e32 v10, 0xffffff ; GFX10-NEXT: v_and_b32_e32 v4, 0xffffff, v4 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffffff, v5 ; GFX10-NEXT: v_bfe_u32 v2, v2, 1, 23 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v6, v6 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v7, v7 -; GFX10-NEXT: v_and_b32_e32 v5, v5, v10 ; GFX10-NEXT: v_bfe_u32 v3, v3, 1, 23 ; GFX10-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 ; GFX10-NEXT: v_mul_f32_e32 v7, 0x4f7ffffe, v7 @@ -2177,12 +2158,12 @@ ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v5 ; GFX10-NEXT: v_sub_nc_u32_e32 v6, 23, v4 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc_lo -; GFX10-NEXT: v_and_b32_e32 v4, v4, v10 -; GFX10-NEXT: v_and_b32_e32 v6, v6, v10 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffffff, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffffff, v6 ; GFX10-NEXT: v_sub_nc_u32_e32 v7, 23, v5 -; GFX10-NEXT: v_and_b32_e32 v5, v5, v10 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffffff, v5 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, v6, v2 -; GFX10-NEXT: v_and_b32_e32 v7, v7, v10 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffffff, v7 ; GFX10-NEXT: v_lshl_or_b32 v0, v0, v4, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, v7, v3 ; GFX10-NEXT: v_lshl_or_b32 v1, v1, v5, v3 @@ -3176,19 +3157,18 @@ ; GFX6-LABEL: s_fshl_v2i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_and_b32 s6, s4, 15 -; GFX6-NEXT: s_bfe_u32 s6, s6, 0x100000 ; GFX6-NEXT: s_andn2_b32 s4, 15, s4 -; GFX6-NEXT: s_lshl_b32 s0, s0, s6 -; GFX6-NEXT: s_mov_b32 s6, 0xf0001 -; GFX6-NEXT: s_bfe_u32 s2, s2, s6 +; GFX6-NEXT: s_bfe_u32 s6, s6, 0x100000 +; GFX6-NEXT: s_bfe_u32 s2, s2, 0xf0001 ; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX6-NEXT: s_lshl_b32 s0, s0, s6 ; GFX6-NEXT: s_lshr_b32 s2, s2, s4 ; GFX6-NEXT: s_or_b32 s0, s0, s2 ; GFX6-NEXT: s_and_b32 s2, s5, 15 ; GFX6-NEXT: s_andn2_b32 s4, 15, s5 ; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000 ; GFX6-NEXT: s_lshl_b32 s1, s1, s2 -; GFX6-NEXT: s_bfe_u32 s2, s3, s6 +; GFX6-NEXT: s_bfe_u32 s2, s3, 0xf0001 ; GFX6-NEXT: s_bfe_u32 s3, s4, 0x100000 ; GFX6-NEXT: s_lshr_b32 s2, s2, s3 ; GFX6-NEXT: s_or_b32 s1, s1, s2 @@ -3229,22 +3209,20 @@ ; ; GFX9-LABEL: s_fshl_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s3, 0xf000f -; GFX9-NEXT: s_and_b32 s4, s2, s3 -; GFX9-NEXT: s_andn2_b32 s2, s3, s2 -; GFX9-NEXT: s_lshr_b32 s3, s0, 16 -; GFX9-NEXT: s_lshr_b32 s5, s4, 16 -; GFX9-NEXT: s_lshl_b32 s0, s0, s4 -; GFX9-NEXT: s_lshl_b32 s3, s3, s5 -; GFX9-NEXT: s_mov_b32 s4, 0xffff +; GFX9-NEXT: s_and_b32 s3, s2, 0xf000f +; GFX9-NEXT: s_lshr_b32 s4, s0, 16 +; GFX9-NEXT: s_lshr_b32 s5, s3, 16 +; GFX9-NEXT: s_lshl_b32 s0, s0, s3 +; GFX9-NEXT: s_lshl_b32 s3, s4, s5 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s3 ; GFX9-NEXT: s_lshr_b32 s3, s1, 16 -; GFX9-NEXT: s_and_b32 s1, s1, s4 +; GFX9-NEXT: s_and_b32 s1, s1, 0xffff ; GFX9-NEXT: s_lshr_b32 s1, s1, 0x10001 ; GFX9-NEXT: s_lshr_b32 s3, s3, 1 +; GFX9-NEXT: s_andn2_b32 s2, 0xf000f, s2 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s3 ; GFX9-NEXT: s_lshr_b32 s3, s1, 16 -; GFX9-NEXT: s_and_b32 s1, s1, s4 +; GFX9-NEXT: s_and_b32 s1, s1, 0xffff ; GFX9-NEXT: s_lshr_b32 s4, s2, 16 ; GFX9-NEXT: s_lshr_b32 s1, s1, s2 ; GFX9-NEXT: s_lshr_b32 s2, s3, s4 @@ -3254,22 +3232,20 @@ ; ; GFX10-LABEL: s_fshl_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s5, 0xffff -; GFX10-NEXT: s_mov_b32 s3, 0xf000f -; GFX10-NEXT: s_and_b32 s7, s1, s5 +; GFX10-NEXT: s_and_b32 s6, s1, 0xffff ; GFX10-NEXT: s_lshr_b32 s1, s1, 16 -; GFX10-NEXT: s_lshr_b32 s7, s7, 0x10001 +; GFX10-NEXT: s_and_b32 s3, s2, 0xf000f +; GFX10-NEXT: s_lshr_b32 s6, s6, 0x10001 ; GFX10-NEXT: s_lshr_b32 s1, s1, 1 -; GFX10-NEXT: s_and_b32 s4, s2, s3 -; GFX10-NEXT: s_andn2_b32 s2, s3, s2 -; GFX10-NEXT: s_pack_ll_b32_b16 s1, s7, s1 -; GFX10-NEXT: s_lshr_b32 s3, s0, 16 -; GFX10-NEXT: s_lshr_b32 s6, s4, 16 -; GFX10-NEXT: s_lshl_b32 s0, s0, s4 +; GFX10-NEXT: s_andn2_b32 s2, 0xf000f, s2 +; GFX10-NEXT: s_lshr_b32 s4, s0, 16 +; GFX10-NEXT: s_lshr_b32 s5, s3, 16 +; GFX10-NEXT: s_pack_ll_b32_b16 s1, s6, s1 +; GFX10-NEXT: s_lshl_b32 s0, s0, s3 +; GFX10-NEXT: s_lshl_b32 s3, s4, s5 ; GFX10-NEXT: s_lshr_b32 s4, s1, 16 -; GFX10-NEXT: s_and_b32 s1, s1, s5 +; GFX10-NEXT: s_and_b32 s1, s1, 0xffff ; GFX10-NEXT: s_lshr_b32 s5, s2, 16 -; GFX10-NEXT: s_lshl_b32 s3, s3, s6 ; GFX10-NEXT: s_lshr_b32 s1, s1, s2 ; GFX10-NEXT: s_lshr_b32 s2, s4, s5 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s3 @@ -3347,10 +3323,9 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_xor_b32_e32 v3, -1, v2 -; GFX10-NEXT: s_mov_b32 s4, 0xf000f +; GFX10-NEXT: v_and_b32_e32 v2, 0xf000f, v2 ; GFX10-NEXT: v_pk_lshrrev_b16 v1, 1, v1 op_sel_hi:[0,1] -; GFX10-NEXT: v_and_b32_e32 v2, s4, v2 -; GFX10-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 0xf000f, v3 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, v2, v0 ; GFX10-NEXT: v_pk_lshrrev_b16 v1, v3, v1 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 @@ -3420,19 +3395,18 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: v_and_b32_e32 v2, 15, v0 ; GFX6-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX6-NEXT: v_bfe_u32 v2, v2, 0, 16 ; GFX6-NEXT: v_and_b32_e32 v0, 15, v0 +; GFX6-NEXT: v_bfe_u32 v2, v2, 0, 16 ; GFX6-NEXT: v_lshl_b32_e32 v2, s0, v2 -; GFX6-NEXT: s_mov_b32 s0, 0xf0001 -; GFX6-NEXT: s_bfe_u32 s2, s2, s0 +; GFX6-NEXT: s_bfe_u32 s0, s2, 0xf0001 ; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16 -; GFX6-NEXT: v_lshr_b32_e32 v0, s2, v0 +; GFX6-NEXT: v_lshr_b32_e32 v0, s0, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX6-NEXT: v_and_b32_e32 v2, 15, v1 ; GFX6-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX6-NEXT: v_and_b32_e32 v1, 15, v1 ; GFX6-NEXT: v_bfe_u32 v2, v2, 0, 16 -; GFX6-NEXT: s_bfe_u32 s0, s3, s0 +; GFX6-NEXT: s_bfe_u32 s0, s3, 0xf0001 ; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16 ; GFX6-NEXT: v_lshl_b32_e32 v2, s1, v2 ; GFX6-NEXT: v_lshr_b32_e32 v1, s0, v1 @@ -3488,13 +3462,12 @@ ; GFX10-LABEL: v_fshl_v2i16_ssv: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_xor_b32_e32 v1, -1, v0 -; GFX10-NEXT: s_mov_b32 s2, 0xf000f -; GFX10-NEXT: s_lshr_b32 s3, s1, 16 +; GFX10-NEXT: s_lshr_b32 s2, s1, 16 ; GFX10-NEXT: s_and_b32 s1, s1, 0xffff -; GFX10-NEXT: v_and_b32_e32 v0, s2, v0 -; GFX10-NEXT: v_and_b32_e32 v1, s2, v1 +; GFX10-NEXT: v_and_b32_e32 v0, 0xf000f, v0 ; GFX10-NEXT: s_lshr_b32 s1, s1, 0x10001 -; GFX10-NEXT: s_lshr_b32 s2, s3, 1 +; GFX10-NEXT: v_and_b32_e32 v1, 0xf000f, v1 +; GFX10-NEXT: s_lshr_b32 s2, s2, 1 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s2 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, v0, s0 ; GFX10-NEXT: v_pk_lshrrev_b16 v1, v1, s1 @@ -3556,13 +3529,12 @@ ; ; GFX9-LABEL: v_fshl_v2i16_svs: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s2, 0xf000f -; GFX9-NEXT: s_and_b32 s3, s1, s2 -; GFX9-NEXT: s_andn2_b32 s1, s2, s1 -; GFX9-NEXT: s_lshr_b32 s2, s0, 16 -; GFX9-NEXT: s_lshr_b32 s4, s3, 16 -; GFX9-NEXT: s_lshl_b32 s0, s0, s3 -; GFX9-NEXT: s_lshl_b32 s2, s2, s4 +; GFX9-NEXT: s_and_b32 s2, s1, 0xf000f +; GFX9-NEXT: s_lshr_b32 s3, s0, 16 +; GFX9-NEXT: s_lshr_b32 s4, s2, 16 +; GFX9-NEXT: s_andn2_b32 s1, 0xf000f, s1 +; GFX9-NEXT: s_lshl_b32 s0, s0, s2 +; GFX9-NEXT: s_lshl_b32 s2, s3, s4 ; GFX9-NEXT: v_pk_lshrrev_b16 v0, 1, v0 op_sel_hi:[0,1] ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 ; GFX9-NEXT: v_pk_lshrrev_b16 v0, s1, v0 @@ -3571,15 +3543,14 @@ ; ; GFX10-LABEL: v_fshl_v2i16_svs: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s2, 0xf000f ; GFX10-NEXT: v_pk_lshrrev_b16 v0, 1, v0 op_sel_hi:[0,1] -; GFX10-NEXT: s_and_b32 s3, s1, s2 -; GFX10-NEXT: s_andn2_b32 s1, s2, s1 -; GFX10-NEXT: s_lshr_b32 s2, s0, 16 -; GFX10-NEXT: s_lshr_b32 s4, s3, 16 +; GFX10-NEXT: s_and_b32 s2, s1, 0xf000f +; GFX10-NEXT: s_andn2_b32 s1, 0xf000f, s1 +; GFX10-NEXT: s_lshr_b32 s3, s0, 16 +; GFX10-NEXT: s_lshr_b32 s4, s2, 16 ; GFX10-NEXT: v_pk_lshrrev_b16 v0, s1, v0 -; GFX10-NEXT: s_lshl_b32 s0, s0, s3 -; GFX10-NEXT: s_lshl_b32 s1, s2, s4 +; GFX10-NEXT: s_lshl_b32 s0, s0, s2 +; GFX10-NEXT: s_lshl_b32 s1, s3, s4 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s1 ; GFX10-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX10-NEXT: ; return to shader part epilog @@ -3592,19 +3563,18 @@ ; GFX6-LABEL: v_fshl_v2i16_vss: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_and_b32 s4, s2, 15 -; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000 ; GFX6-NEXT: s_andn2_b32 s2, 15, s2 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX6-NEXT: s_mov_b32 s4, 0xf0001 -; GFX6-NEXT: s_bfe_u32 s0, s0, s4 +; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX6-NEXT: s_bfe_u32 s0, s0, 0xf0001 ; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX6-NEXT: s_lshr_b32 s0, s0, s2 ; GFX6-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX6-NEXT: s_and_b32 s0, s3, 15 ; GFX6-NEXT: s_andn2_b32 s2, 15, s3 ; GFX6-NEXT: s_bfe_u32 s0, s0, 0x100000 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, s0, v1 -; GFX6-NEXT: s_bfe_u32 s0, s1, s4 +; GFX6-NEXT: s_bfe_u32 s0, s1, 0xf0001 ; GFX6-NEXT: s_bfe_u32 s1, s2, 0x100000 ; GFX6-NEXT: s_lshr_b32 s0, s0, s1 ; GFX6-NEXT: v_or_b32_e32 v1, s0, v1 @@ -3642,18 +3612,16 @@ ; ; GFX9-LABEL: v_fshl_v2i16_vss: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s2, 0xf000f -; GFX9-NEXT: s_and_b32 s3, s1, s2 -; GFX9-NEXT: v_pk_lshlrev_b16 v0, s3, v0 -; GFX9-NEXT: s_mov_b32 s3, 0xffff -; GFX9-NEXT: s_andn2_b32 s1, s2, s1 +; GFX9-NEXT: s_and_b32 s2, s1, 0xf000f +; GFX9-NEXT: v_pk_lshlrev_b16 v0, s2, v0 ; GFX9-NEXT: s_lshr_b32 s2, s0, 16 -; GFX9-NEXT: s_and_b32 s0, s0, s3 +; GFX9-NEXT: s_and_b32 s0, s0, 0xffff ; GFX9-NEXT: s_lshr_b32 s0, s0, 0x10001 ; GFX9-NEXT: s_lshr_b32 s2, s2, 1 +; GFX9-NEXT: s_andn2_b32 s1, 0xf000f, s1 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 ; GFX9-NEXT: s_lshr_b32 s2, s0, 16 -; GFX9-NEXT: s_and_b32 s0, s0, s3 +; GFX9-NEXT: s_and_b32 s0, s0, 0xffff ; GFX9-NEXT: s_lshr_b32 s3, s1, 16 ; GFX9-NEXT: s_lshr_b32 s0, s0, s1 ; GFX9-NEXT: s_lshr_b32 s1, s2, s3 @@ -3663,18 +3631,16 @@ ; ; GFX10-LABEL: v_fshl_v2i16_vss: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s3, 0xffff -; GFX10-NEXT: s_mov_b32 s2, 0xf000f -; GFX10-NEXT: s_and_b32 s5, s0, s3 +; GFX10-NEXT: s_and_b32 s3, s0, 0xffff ; GFX10-NEXT: s_lshr_b32 s0, s0, 16 -; GFX10-NEXT: s_lshr_b32 s5, s5, 0x10001 +; GFX10-NEXT: s_lshr_b32 s3, s3, 0x10001 ; GFX10-NEXT: s_lshr_b32 s0, s0, 1 -; GFX10-NEXT: s_and_b32 s4, s1, s2 -; GFX10-NEXT: s_andn2_b32 s1, s2, s1 -; GFX10-NEXT: s_pack_ll_b32_b16 s0, s5, s0 -; GFX10-NEXT: v_pk_lshlrev_b16 v0, s4, v0 +; GFX10-NEXT: s_and_b32 s2, s1, 0xf000f +; GFX10-NEXT: s_andn2_b32 s1, 0xf000f, s1 +; GFX10-NEXT: s_pack_ll_b32_b16 s0, s3, s0 +; GFX10-NEXT: v_pk_lshlrev_b16 v0, s2, v0 ; GFX10-NEXT: s_lshr_b32 s2, s0, 16 -; GFX10-NEXT: s_and_b32 s0, s0, s3 +; GFX10-NEXT: s_and_b32 s0, s0, 0xffff ; GFX10-NEXT: s_lshr_b32 s3, s1, 16 ; GFX10-NEXT: s_lshr_b32 s0, s0, s1 ; GFX10-NEXT: s_lshr_b32 s1, s2, s3 @@ -3704,19 +3670,18 @@ ; GFX6-LABEL: s_fshl_v4i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_and_b32 s12, s8, 15 -; GFX6-NEXT: s_bfe_u32 s12, s12, 0x100000 ; GFX6-NEXT: s_andn2_b32 s8, 15, s8 -; GFX6-NEXT: s_lshl_b32 s0, s0, s12 -; GFX6-NEXT: s_mov_b32 s12, 0xf0001 -; GFX6-NEXT: s_bfe_u32 s4, s4, s12 +; GFX6-NEXT: s_bfe_u32 s12, s12, 0x100000 +; GFX6-NEXT: s_bfe_u32 s4, s4, 0xf0001 ; GFX6-NEXT: s_bfe_u32 s8, s8, 0x100000 +; GFX6-NEXT: s_lshl_b32 s0, s0, s12 ; GFX6-NEXT: s_lshr_b32 s4, s4, s8 ; GFX6-NEXT: s_or_b32 s0, s0, s4 ; GFX6-NEXT: s_and_b32 s4, s9, 15 ; GFX6-NEXT: s_andn2_b32 s8, 15, s9 ; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000 ; GFX6-NEXT: s_lshl_b32 s1, s1, s4 -; GFX6-NEXT: s_bfe_u32 s4, s5, s12 +; GFX6-NEXT: s_bfe_u32 s4, s5, 0xf0001 ; GFX6-NEXT: s_bfe_u32 s5, s8, 0x100000 ; GFX6-NEXT: s_lshr_b32 s4, s4, s5 ; GFX6-NEXT: s_or_b32 s1, s1, s4 @@ -3724,7 +3689,7 @@ ; GFX6-NEXT: s_andn2_b32 s5, 15, s10 ; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000 ; GFX6-NEXT: s_lshl_b32 s2, s2, s4 -; GFX6-NEXT: s_bfe_u32 s4, s6, s12 +; GFX6-NEXT: s_bfe_u32 s4, s6, 0xf0001 ; GFX6-NEXT: s_bfe_u32 s5, s5, 0x100000 ; GFX6-NEXT: s_lshr_b32 s4, s4, s5 ; GFX6-NEXT: s_or_b32 s2, s2, s4 @@ -3732,7 +3697,7 @@ ; GFX6-NEXT: s_andn2_b32 s5, 15, s11 ; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000 ; GFX6-NEXT: s_lshl_b32 s3, s3, s4 -; GFX6-NEXT: s_bfe_u32 s4, s7, s12 +; GFX6-NEXT: s_bfe_u32 s4, s7, 0xf0001 ; GFX6-NEXT: s_bfe_u32 s5, s5, 0x100000 ; GFX6-NEXT: s_lshr_b32 s4, s4, s5 ; GFX6-NEXT: s_bfe_u32 s1, s1, 0x100000 @@ -3801,42 +3766,39 @@ ; ; GFX9-LABEL: s_fshl_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s6, 0xf000f -; GFX9-NEXT: s_and_b32 s7, s4, s6 -; GFX9-NEXT: s_lshr_b32 s9, s0, 16 -; GFX9-NEXT: s_lshr_b32 s10, s7, 16 -; GFX9-NEXT: s_lshl_b32 s0, s0, s7 -; GFX9-NEXT: s_lshl_b32 s7, s9, s10 -; GFX9-NEXT: s_mov_b32 s9, 0xffff -; GFX9-NEXT: s_mov_b32 s8, 0x10001 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s7 -; GFX9-NEXT: s_lshr_b32 s7, s2, 16 -; GFX9-NEXT: s_and_b32 s2, s2, s9 -; GFX9-NEXT: s_lshr_b32 s2, s2, s8 -; GFX9-NEXT: s_lshr_b32 s7, s7, 1 -; GFX9-NEXT: s_andn2_b32 s4, s6, s4 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s7 -; GFX9-NEXT: s_lshr_b32 s7, s2, 16 -; GFX9-NEXT: s_and_b32 s2, s2, s9 -; GFX9-NEXT: s_lshr_b32 s10, s4, 16 +; GFX9-NEXT: s_and_b32 s6, s4, 0xf000f +; GFX9-NEXT: s_lshr_b32 s7, s0, 16 +; GFX9-NEXT: s_lshr_b32 s8, s6, 16 +; GFX9-NEXT: s_lshl_b32 s0, s0, s6 +; GFX9-NEXT: s_lshl_b32 s6, s7, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s6 +; GFX9-NEXT: s_lshr_b32 s6, s2, 16 +; GFX9-NEXT: s_and_b32 s2, s2, 0xffff +; GFX9-NEXT: s_lshr_b32 s2, s2, 0x10001 +; GFX9-NEXT: s_lshr_b32 s6, s6, 1 +; GFX9-NEXT: s_andn2_b32 s4, 0xf000f, s4 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s6 +; GFX9-NEXT: s_lshr_b32 s6, s2, 16 +; GFX9-NEXT: s_and_b32 s2, s2, 0xffff +; GFX9-NEXT: s_lshr_b32 s7, s4, 16 ; GFX9-NEXT: s_lshr_b32 s2, s2, s4 -; GFX9-NEXT: s_lshr_b32 s4, s7, s10 +; GFX9-NEXT: s_lshr_b32 s4, s6, s7 ; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s4 ; GFX9-NEXT: s_or_b32 s0, s0, s2 -; GFX9-NEXT: s_and_b32 s2, s5, s6 -; GFX9-NEXT: s_andn2_b32 s4, s6, s5 +; GFX9-NEXT: s_and_b32 s2, s5, 0xf000f +; GFX9-NEXT: s_andn2_b32 s4, 0xf000f, s5 ; GFX9-NEXT: s_lshr_b32 s5, s1, 16 ; GFX9-NEXT: s_lshr_b32 s6, s2, 16 ; GFX9-NEXT: s_lshl_b32 s1, s1, s2 ; GFX9-NEXT: s_lshl_b32 s2, s5, s6 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s2 ; GFX9-NEXT: s_lshr_b32 s2, s3, 16 -; GFX9-NEXT: s_and_b32 s3, s3, s9 -; GFX9-NEXT: s_lshr_b32 s3, s3, s8 +; GFX9-NEXT: s_and_b32 s3, s3, 0xffff +; GFX9-NEXT: s_lshr_b32 s3, s3, 0x10001 ; GFX9-NEXT: s_lshr_b32 s2, s2, 1 ; GFX9-NEXT: s_pack_ll_b32_b16 s2, s3, s2 ; GFX9-NEXT: s_lshr_b32 s3, s2, 16 -; GFX9-NEXT: s_and_b32 s2, s2, s9 +; GFX9-NEXT: s_and_b32 s2, s2, 0xffff ; GFX9-NEXT: s_lshr_b32 s5, s4, 16 ; GFX9-NEXT: s_lshr_b32 s2, s2, s4 ; GFX9-NEXT: s_lshr_b32 s3, s3, s5 @@ -3846,40 +3808,37 @@ ; ; GFX10-LABEL: s_fshl_v4i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s10, 0xffff -; GFX10-NEXT: s_mov_b32 s6, 0xf000f -; GFX10-NEXT: s_mov_b32 s8, 0x10001 -; GFX10-NEXT: s_and_b32 s12, s2, s10 +; GFX10-NEXT: s_and_b32 s9, s2, 0xffff ; GFX10-NEXT: s_lshr_b32 s2, s2, 16 -; GFX10-NEXT: s_and_b32 s7, s4, s6 -; GFX10-NEXT: s_lshr_b32 s12, s12, s8 +; GFX10-NEXT: s_and_b32 s6, s4, 0xf000f +; GFX10-NEXT: s_lshr_b32 s9, s9, 0x10001 ; GFX10-NEXT: s_lshr_b32 s2, s2, 1 -; GFX10-NEXT: s_andn2_b32 s4, s6, s4 -; GFX10-NEXT: s_lshr_b32 s9, s0, 16 -; GFX10-NEXT: s_lshr_b32 s11, s7, 16 -; GFX10-NEXT: s_pack_ll_b32_b16 s2, s12, s2 -; GFX10-NEXT: s_lshl_b32 s0, s0, s7 -; GFX10-NEXT: s_lshl_b32 s7, s9, s11 -; GFX10-NEXT: s_lshr_b32 s9, s2, 16 -; GFX10-NEXT: s_and_b32 s2, s2, s10 -; GFX10-NEXT: s_lshr_b32 s11, s4, 16 +; GFX10-NEXT: s_andn2_b32 s4, 0xf000f, s4 +; GFX10-NEXT: s_lshr_b32 s7, s0, 16 +; GFX10-NEXT: s_lshr_b32 s8, s6, 16 +; GFX10-NEXT: s_pack_ll_b32_b16 s2, s9, s2 +; GFX10-NEXT: s_lshl_b32 s0, s0, s6 +; GFX10-NEXT: s_lshl_b32 s6, s7, s8 +; GFX10-NEXT: s_lshr_b32 s7, s2, 16 +; GFX10-NEXT: s_and_b32 s2, s2, 0xffff +; GFX10-NEXT: s_lshr_b32 s8, s4, 16 ; GFX10-NEXT: s_lshr_b32 s2, s2, s4 -; GFX10-NEXT: s_lshr_b32 s4, s9, s11 -; GFX10-NEXT: s_and_b32 s9, s3, s10 +; GFX10-NEXT: s_lshr_b32 s4, s7, s8 +; GFX10-NEXT: s_and_b32 s8, s3, 0xffff ; GFX10-NEXT: s_lshr_b32 s3, s3, 16 ; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s4 -; GFX10-NEXT: s_and_b32 s4, s5, s6 -; GFX10-NEXT: s_lshr_b32 s8, s9, s8 +; GFX10-NEXT: s_and_b32 s4, s5, 0xf000f +; GFX10-NEXT: s_lshr_b32 s8, s8, 0x10001 ; GFX10-NEXT: s_lshr_b32 s3, s3, 1 -; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s7 -; GFX10-NEXT: s_andn2_b32 s5, s6, s5 +; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s6 +; GFX10-NEXT: s_andn2_b32 s5, 0xf000f, s5 ; GFX10-NEXT: s_lshr_b32 s6, s1, 16 ; GFX10-NEXT: s_lshr_b32 s7, s4, 16 ; GFX10-NEXT: s_pack_ll_b32_b16 s3, s8, s3 ; GFX10-NEXT: s_lshl_b32 s1, s1, s4 ; GFX10-NEXT: s_lshl_b32 s4, s6, s7 ; GFX10-NEXT: s_lshr_b32 s6, s3, 16 -; GFX10-NEXT: s_and_b32 s3, s3, s10 +; GFX10-NEXT: s_and_b32 s3, s3, 0xffff ; GFX10-NEXT: s_lshr_b32 s7, s5, 16 ; GFX10-NEXT: s_lshr_b32 s3, s3, s5 ; GFX10-NEXT: s_lshr_b32 s5, s6, s7 @@ -4003,16 +3962,15 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_xor_b32_e32 v6, -1, v4 ; GFX10-NEXT: v_xor_b32_e32 v7, -1, v5 -; GFX10-NEXT: s_mov_b32 s4, 0xf000f +; GFX10-NEXT: v_and_b32_e32 v4, 0xf000f, v4 ; GFX10-NEXT: v_pk_lshrrev_b16 v2, 1, v2 op_sel_hi:[0,1] -; GFX10-NEXT: v_and_b32_e32 v4, s4, v4 -; GFX10-NEXT: v_and_b32_e32 v6, s4, v6 -; GFX10-NEXT: v_and_b32_e32 v5, s4, v5 -; GFX10-NEXT: v_and_b32_e32 v7, s4, v7 +; GFX10-NEXT: v_and_b32_e32 v5, 0xf000f, v5 +; GFX10-NEXT: v_and_b32_e32 v6, 0xf000f, v6 ; GFX10-NEXT: v_pk_lshrrev_b16 v3, 1, v3 op_sel_hi:[0,1] +; GFX10-NEXT: v_and_b32_e32 v7, 0xf000f, v7 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, v4, v0 -; GFX10-NEXT: v_pk_lshrrev_b16 v2, v6, v2 ; GFX10-NEXT: v_pk_lshlrev_b16 v1, v5, v1 +; GFX10-NEXT: v_pk_lshrrev_b16 v2, v6, v2 ; GFX10-NEXT: v_pk_lshrrev_b16 v3, v7, v3 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX10-NEXT: v_or_b32_e32 v1, v1, v3 @@ -4893,14 +4851,13 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_movk_i32 s4, 0x7f -; GFX10-NEXT: v_lshrrev_b64 v[4:5], 1, v[4:5] -; GFX10-NEXT: v_and_b32_e32 v18, s4, v8 +; GFX10-NEXT: v_and_b32_e32 v18, 0x7f, v8 ; GFX10-NEXT: v_xor_b32_e32 v8, -1, v8 +; GFX10-NEXT: v_lshrrev_b64 v[4:5], 1, v[4:5] ; GFX10-NEXT: v_lshlrev_b32_e32 v12, 31, v6 ; GFX10-NEXT: v_lshrrev_b64 v[6:7], 1, v[6:7] ; GFX10-NEXT: v_sub_nc_u32_e32 v9, 64, v18 -; GFX10-NEXT: v_and_b32_e32 v19, s4, v8 +; GFX10-NEXT: v_and_b32_e32 v19, 0x7f, v8 ; GFX10-NEXT: v_lshlrev_b64 v[10:11], v18, v[2:3] ; GFX10-NEXT: v_or_b32_e32 v5, v5, v12 ; GFX10-NEXT: v_subrev_nc_u32_e32 v20, 64, v18 @@ -5104,44 +5061,43 @@ ; ; GFX10-LABEL: v_fshl_i128_ssv: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_movk_i32 s9, 0x7f -; GFX10-NEXT: v_xor_b32_e32 v4, -1, v0 -; GFX10-NEXT: v_and_b32_e32 v12, s9, v0 +; GFX10-NEXT: v_and_b32_e32 v12, 0x7f, v0 +; GFX10-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX10-NEXT: s_mov_b32 s8, 0 ; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], 1 -; GFX10-NEXT: v_and_b32_e32 v13, s9, v4 +; GFX10-NEXT: s_lshl_b32 s9, s6, 31 ; GFX10-NEXT: v_sub_nc_u32_e32 v2, 64, v12 +; GFX10-NEXT: v_and_b32_e32 v13, 0x7f, v0 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v12, s[2:3] -; GFX10-NEXT: s_lshl_b32 s9, s6, 31 -; GFX10-NEXT: s_lshr_b64 s[6:7], s[6:7], 1 ; GFX10-NEXT: s_or_b64 s[8:9], s[4:5], s[8:9] +; GFX10-NEXT: s_lshr_b64 s[6:7], s[6:7], 1 ; GFX10-NEXT: v_lshrrev_b64 v[2:3], v2, s[0:1] +; GFX10-NEXT: v_sub_nc_u32_e32 v8, 64, v13 ; GFX10-NEXT: v_subrev_nc_u32_e32 v10, 64, v12 -; GFX10-NEXT: v_subrev_nc_u32_e32 v14, 64, v13 ; GFX10-NEXT: v_lshrrev_b64 v[6:7], v13, s[8:9] ; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v12 ; GFX10-NEXT: v_lshlrev_b64 v[4:5], v12, s[0:1] ; GFX10-NEXT: v_or_b32_e32 v2, v2, v0 -; GFX10-NEXT: v_sub_nc_u32_e32 v0, 64, v13 +; GFX10-NEXT: v_subrev_nc_u32_e32 v0, 64, v13 +; GFX10-NEXT: v_lshlrev_b64 v[8:9], v8, s[6:7] ; GFX10-NEXT: v_lshlrev_b64 v[10:11], v10, s[0:1] ; GFX10-NEXT: v_or_b32_e32 v3, v3, v1 ; GFX10-NEXT: v_cmp_gt_u32_e64 s0, 64, v13 +; GFX10-NEXT: v_lshrrev_b64 v[0:1], v0, s[6:7] ; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 0, v13 -; GFX10-NEXT: v_lshlrev_b64 v[8:9], v0, s[6:7] -; GFX10-NEXT: v_lshrrev_b64 v[0:1], v14, s[6:7] -; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 0, v12 -; GFX10-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc_lo ; GFX10-NEXT: v_or_b32_e32 v6, v6, v8 ; GFX10-NEXT: v_or_b32_e32 v7, v7, v9 ; GFX10-NEXT: v_cndmask_b32_e32 v8, v10, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v10, v11, v3, vcc_lo ; GFX10-NEXT: v_lshrrev_b64 v[2:3], v13, s[6:7] ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v6, s0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 0, v12 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v7, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s8, s1 ; GFX10-NEXT: v_cndmask_b32_e64 v6, v8, s2, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v7, v10, s3, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s8, s1 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s9, s1 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, v2, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, v3, s0 @@ -6343,14 +6299,13 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_movk_i32 s7, 0x7f -; GFX10-NEXT: v_lshrrev_b64 v[8:9], 1, v[8:9] -; GFX10-NEXT: v_and_b32_e32 v27, s7, v16 +; GFX10-NEXT: v_and_b32_e32 v27, 0x7f, v16 ; GFX10-NEXT: v_xor_b32_e32 v16, -1, v16 +; GFX10-NEXT: v_lshrrev_b64 v[8:9], 1, v[8:9] ; GFX10-NEXT: v_lshlrev_b32_e32 v21, 31, v10 ; GFX10-NEXT: v_lshrrev_b64 v[10:11], 1, v[10:11] ; GFX10-NEXT: v_sub_nc_u32_e32 v17, 64, v27 -; GFX10-NEXT: v_and_b32_e32 v28, s7, v16 +; GFX10-NEXT: v_and_b32_e32 v28, 0x7f, v16 ; GFX10-NEXT: v_lshlrev_b64 v[18:19], v27, v[2:3] ; GFX10-NEXT: v_or_b32_e32 v9, v9, v21 ; GFX10-NEXT: v_subrev_nc_u32_e32 v29, 64, v27 @@ -6382,7 +6337,7 @@ ; GFX10-NEXT: v_cndmask_b32_e64 v2, v18, v2, s6 ; GFX10-NEXT: v_cndmask_b32_e64 v8, v10, v9, s5 ; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, v0, s4 -; GFX10-NEXT: v_and_b32_e32 v23, s7, v20 +; GFX10-NEXT: v_and_b32_e32 v23, 0x7f, v20 ; GFX10-NEXT: v_or_b32_e32 v0, v21, v3 ; GFX10-NEXT: v_xor_b32_e32 v3, -1, v20 ; GFX10-NEXT: v_cndmask_b32_e64 v24, 0, v1, s4 @@ -6391,7 +6346,7 @@ ; GFX10-NEXT: v_or_b32_e32 v2, v2, v9 ; GFX10-NEXT: v_lshrrev_b64 v[8:9], 1, v[12:13] ; GFX10-NEXT: v_lshlrev_b32_e32 v16, 31, v14 -; GFX10-NEXT: v_and_b32_e32 v25, s7, v3 +; GFX10-NEXT: v_and_b32_e32 v25, 0x7f, v3 ; GFX10-NEXT: v_lshrrev_b64 v[10:11], v10, v[4:5] ; GFX10-NEXT: v_lshlrev_b64 v[12:13], v23, v[6:7] ; GFX10-NEXT: v_lshrrev_b64 v[14:15], 1, v[14:15] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll @@ -9,12 +9,12 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v0, 7 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX6-NEXT: s_and_b32 s2, s2, 0x7f ; GFX6-NEXT: s_movk_i32 s3, 0x7f -; GFX6-NEXT: s_and_b32 s2, s2, s3 ; GFX6-NEXT: s_lshl_b32 s0, s0, 1 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: s_and_b32 s1, s1, s3 +; GFX6-NEXT: s_and_b32 s1, s1, 0x7f ; GFX6-NEXT: v_mul_lo_u32 v1, -7, v0 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 @@ -40,12 +40,12 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, 7 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX8-NEXT: s_and_b32 s2, s2, 0x7f ; GFX8-NEXT: s_movk_i32 s3, 0x7f -; GFX8-NEXT: s_and_b32 s2, s2, s3 ; GFX8-NEXT: s_lshl_b32 s0, s0, 1 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX8-NEXT: s_and_b32 s1, s1, s3 +; GFX8-NEXT: s_and_b32 s1, s1, 0x7f ; GFX8-NEXT: v_mul_lo_u32 v1, -7, v0 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 @@ -71,12 +71,12 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, 7 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: s_and_b32 s2, s2, 0x7f ; GFX9-NEXT: s_movk_i32 s3, 0x7f -; GFX9-NEXT: s_and_b32 s2, s2, s3 ; GFX9-NEXT: s_lshl_b32 s0, s0, 1 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: s_and_b32 s1, s1, s3 +; GFX9-NEXT: s_and_b32 s1, s1, 0x7f ; GFX9-NEXT: v_mul_lo_u32 v1, -7, v0 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 @@ -101,10 +101,9 @@ ; GFX10-LABEL: s_fshr_i7: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, 7 -; GFX10-NEXT: s_movk_i32 s3, 0x7f +; GFX10-NEXT: s_and_b32 s2, s2, 0x7f ; GFX10-NEXT: s_lshl_b32 s0, s0, 1 -; GFX10-NEXT: s_and_b32 s2, s2, s3 -; GFX10-NEXT: s_and_b32 s1, s1, s3 +; GFX10-NEXT: s_and_b32 s1, s1, 0x7f ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -121,8 +120,8 @@ ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX10-NEXT: v_sub_nc_u16 v1, 6, v0 -; GFX10-NEXT: v_and_b32_e32 v0, s3, v0 -; GFX10-NEXT: v_and_b32_e32 v1, s3, v1 +; GFX10-NEXT: v_and_b32_e32 v0, 0x7f, v0 +; GFX10-NEXT: v_and_b32_e32 v1, 0x7f, v1 ; GFX10-NEXT: v_lshrrev_b16 v0, v0, s1 ; GFX10-NEXT: v_lshlrev_b16 v1, v1, s0 ; GFX10-NEXT: v_or_b32_e32 v0, v1, v0 @@ -249,12 +248,11 @@ ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 7, v2 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v2 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v3, 0x7f -; GFX10-NEXT: v_sub_nc_u16 v4, 6, v2 -; GFX10-NEXT: v_and_b32_e32 v2, v2, v3 -; GFX10-NEXT: v_and_b32_e32 v4, v4, v3 +; GFX10-NEXT: v_sub_nc_u16 v3, 6, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0x7f, v2 +; GFX10-NEXT: v_and_b32_e32 v3, 0x7f, v3 ; GFX10-NEXT: v_lshrrev_b16 v1, v2, v1 -; GFX10-NEXT: v_lshlrev_b16 v0, v4, v0 +; GFX10-NEXT: v_lshlrev_b16 v0, v3, v0 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call i7 @llvm.fshr.i7(i7 %lhs, i7 %rhs, i7 %amt) @@ -530,10 +528,9 @@ ; GFX6-NEXT: s_lshr_b32 s4, s2, 8 ; GFX6-NEXT: s_and_b32 s5, s2, 7 ; GFX6-NEXT: s_andn2_b32 s2, 7, s2 -; GFX6-NEXT: s_movk_i32 s6, 0xff ; GFX6-NEXT: s_lshl_b32 s0, s0, 1 ; GFX6-NEXT: s_lshl_b32 s0, s0, s2 -; GFX6-NEXT: s_and_b32 s2, s1, s6 +; GFX6-NEXT: s_and_b32 s2, s1, 0xff ; GFX6-NEXT: s_lshr_b32 s2, s2, s5 ; GFX6-NEXT: s_or_b32 s0, s0, s2 ; GFX6-NEXT: s_and_b32 s2, s4, 7 @@ -543,8 +540,8 @@ ; GFX6-NEXT: s_lshl_b32 s3, s3, s4 ; GFX6-NEXT: s_lshr_b32 s1, s1, s2 ; GFX6-NEXT: s_or_b32 s1, s3, s1 -; GFX6-NEXT: s_and_b32 s1, s1, s6 -; GFX6-NEXT: s_and_b32 s0, s0, s6 +; GFX6-NEXT: s_and_b32 s1, s1, 0xff +; GFX6-NEXT: s_and_b32 s0, s0, 0xff ; GFX6-NEXT: s_lshl_b32 s1, s1, 8 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: ; return to shader part epilog @@ -552,28 +549,27 @@ ; GFX8-LABEL: s_fshr_v2i8: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s3, s0, 8 +; GFX8-NEXT: s_lshr_b32 s4, s1, 8 ; GFX8-NEXT: s_lshr_b32 s5, s2, 8 ; GFX8-NEXT: s_and_b32 s6, s2, 7 ; GFX8-NEXT: s_andn2_b32 s2, 7, s2 ; GFX8-NEXT: s_lshl_b32 s0, s0, 1 +; GFX8-NEXT: s_and_b32 s1, s1, 0xff ; GFX8-NEXT: s_lshl_b32 s0, s0, s2 -; GFX8-NEXT: s_movk_i32 s2, 0xff -; GFX8-NEXT: s_lshr_b32 s4, s1, 8 -; GFX8-NEXT: s_and_b32 s1, s1, s2 ; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX8-NEXT: s_andn2_b32 s2, 7, s5 +; GFX8-NEXT: s_lshl_b32 s3, s3, 1 ; GFX8-NEXT: s_lshr_b32 s1, s1, s6 -; GFX8-NEXT: s_and_b32 s4, s4, s2 +; GFX8-NEXT: s_lshl_b32 s2, s3, s2 +; GFX8-NEXT: s_and_b32 s3, s4, 0xff ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: s_and_b32 s1, s5, 7 -; GFX8-NEXT: s_andn2_b32 s5, 7, s5 -; GFX8-NEXT: s_lshl_b32 s3, s3, 1 -; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX8-NEXT: s_lshl_b32 s3, s3, s5 -; GFX8-NEXT: s_lshr_b32 s1, s4, s1 -; GFX8-NEXT: s_or_b32 s1, s3, s1 -; GFX8-NEXT: s_and_b32 s0, s0, s2 -; GFX8-NEXT: s_and_b32 s1, s1, s2 +; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX8-NEXT: s_lshr_b32 s1, s3, s1 +; GFX8-NEXT: s_or_b32 s1, s2, s1 +; GFX8-NEXT: s_and_b32 s1, s1, 0xff ; GFX8-NEXT: s_bfe_u32 s2, 8, 0x100000 +; GFX8-NEXT: s_and_b32 s0, s0, 0xff ; GFX8-NEXT: s_lshl_b32 s1, s1, s2 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog @@ -581,28 +577,27 @@ ; GFX9-LABEL: s_fshr_v2i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_lshr_b32 s3, s0, 8 +; GFX9-NEXT: s_lshr_b32 s4, s1, 8 ; GFX9-NEXT: s_lshr_b32 s5, s2, 8 ; GFX9-NEXT: s_and_b32 s6, s2, 7 ; GFX9-NEXT: s_andn2_b32 s2, 7, s2 ; GFX9-NEXT: s_lshl_b32 s0, s0, 1 +; GFX9-NEXT: s_and_b32 s1, s1, 0xff ; GFX9-NEXT: s_lshl_b32 s0, s0, s2 -; GFX9-NEXT: s_movk_i32 s2, 0xff -; GFX9-NEXT: s_lshr_b32 s4, s1, 8 -; GFX9-NEXT: s_and_b32 s1, s1, s2 ; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX9-NEXT: s_andn2_b32 s2, 7, s5 +; GFX9-NEXT: s_lshl_b32 s3, s3, 1 ; GFX9-NEXT: s_lshr_b32 s1, s1, s6 -; GFX9-NEXT: s_and_b32 s4, s4, s2 +; GFX9-NEXT: s_lshl_b32 s2, s3, s2 +; GFX9-NEXT: s_and_b32 s3, s4, 0xff ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: s_and_b32 s1, s5, 7 -; GFX9-NEXT: s_andn2_b32 s5, 7, s5 -; GFX9-NEXT: s_lshl_b32 s3, s3, 1 -; GFX9-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX9-NEXT: s_lshl_b32 s3, s3, s5 -; GFX9-NEXT: s_lshr_b32 s1, s4, s1 -; GFX9-NEXT: s_or_b32 s1, s3, s1 -; GFX9-NEXT: s_and_b32 s0, s0, s2 -; GFX9-NEXT: s_and_b32 s1, s1, s2 +; GFX9-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX9-NEXT: s_lshr_b32 s1, s3, s1 +; GFX9-NEXT: s_or_b32 s1, s2, s1 +; GFX9-NEXT: s_and_b32 s1, s1, 0xff ; GFX9-NEXT: s_bfe_u32 s2, 8, 0x100000 +; GFX9-NEXT: s_and_b32 s0, s0, 0xff ; GFX9-NEXT: s_lshl_b32 s1, s1, s2 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog @@ -610,14 +605,13 @@ ; GFX10-LABEL: s_fshr_v2i8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_lshr_b32 s4, s1, 8 -; GFX10-NEXT: s_movk_i32 s7, 0xff ; GFX10-NEXT: s_lshr_b32 s3, s0, 8 ; GFX10-NEXT: s_lshr_b32 s5, s2, 8 ; GFX10-NEXT: s_and_b32 s6, s2, 7 ; GFX10-NEXT: s_andn2_b32 s2, 7, s2 ; GFX10-NEXT: s_lshl_b32 s0, s0, 1 -; GFX10-NEXT: s_and_b32 s4, s4, s7 -; GFX10-NEXT: s_and_b32 s1, s1, s7 +; GFX10-NEXT: s_and_b32 s4, s4, 0xff +; GFX10-NEXT: s_and_b32 s1, s1, 0xff ; GFX10-NEXT: s_lshl_b32 s0, s0, s2 ; GFX10-NEXT: s_and_b32 s2, s5, 7 ; GFX10-NEXT: s_andn2_b32 s5, 7, s5 @@ -629,9 +623,9 @@ ; GFX10-NEXT: s_lshr_b32 s1, s1, s6 ; GFX10-NEXT: s_or_b32 s2, s3, s2 ; GFX10-NEXT: s_or_b32 s0, s0, s1 -; GFX10-NEXT: s_and_b32 s1, s2, s7 +; GFX10-NEXT: s_and_b32 s1, s2, 0xff ; GFX10-NEXT: s_bfe_u32 s2, 8, 0x100000 -; GFX10-NEXT: s_and_b32 s0, s0, s7 +; GFX10-NEXT: s_and_b32 s0, s0, 0xff ; GFX10-NEXT: s_lshl_b32 s1, s1, s2 ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: ; return to shader part epilog @@ -729,18 +723,18 @@ ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; GFX10-NEXT: s_movk_i32 s4, 0xff ; GFX10-NEXT: v_and_b32_e32 v7, 7, v2 -; GFX10-NEXT: v_xor_b32_e32 v6, -1, v3 ; GFX10-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX10-NEXT: v_xor_b32_e32 v6, -1, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 7, v3 ; GFX10-NEXT: v_lshlrev_b16 v4, 1, v4 -; GFX10-NEXT: v_and_b32_e32 v5, s4, v5 -; GFX10-NEXT: v_and_b32_e32 v6, 7, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xff, v5 ; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0 -; GFX10-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX10-NEXT: v_and_b32_e32 v6, 7, v6 +; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX10-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX10-NEXT: v_lshrrev_b16 v3, v3, v5 +; GFX10-NEXT: s_movk_i32 s4, 0xff ; GFX10-NEXT: v_lshlrev_b16 v4, v6, v4 ; GFX10-NEXT: v_lshrrev_b16 v1, v7, v1 ; GFX10-NEXT: v_lshlrev_b16 v0, v2, v0 @@ -768,10 +762,9 @@ ; GFX6-NEXT: s_lshr_b32 s9, s2, 24 ; GFX6-NEXT: s_and_b32 s10, s2, 7 ; GFX6-NEXT: s_andn2_b32 s2, 7, s2 -; GFX6-NEXT: s_movk_i32 s11, 0xff ; GFX6-NEXT: s_lshl_b32 s0, s0, 1 ; GFX6-NEXT: s_lshl_b32 s0, s0, s2 -; GFX6-NEXT: s_and_b32 s2, s1, s11 +; GFX6-NEXT: s_and_b32 s2, s1, 0xff ; GFX6-NEXT: s_lshr_b32 s2, s2, s10 ; GFX6-NEXT: s_or_b32 s0, s0, s2 ; GFX6-NEXT: s_and_b32 s2, s7, 7 @@ -792,24 +785,23 @@ ; GFX6-NEXT: s_and_b32 s3, s9, 7 ; GFX6-NEXT: s_andn2_b32 s4, 7, s9 ; GFX6-NEXT: s_lshl_b32 s5, s5, 1 -; GFX6-NEXT: s_and_b32 s2, s2, s11 +; GFX6-NEXT: s_and_b32 s2, s2, 0xff ; GFX6-NEXT: s_lshl_b32 s4, s5, s4 ; GFX6-NEXT: s_lshr_b32 s3, s6, s3 -; GFX6-NEXT: s_and_b32 s0, s0, s11 +; GFX6-NEXT: s_and_b32 s0, s0, 0xff ; GFX6-NEXT: s_lshl_b32 s2, s2, 8 -; GFX6-NEXT: s_and_b32 s1, s1, s11 +; GFX6-NEXT: s_and_b32 s1, s1, 0xff ; GFX6-NEXT: s_or_b32 s3, s4, s3 ; GFX6-NEXT: s_or_b32 s0, s0, s2 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s3, s11 +; GFX6-NEXT: s_and_b32 s1, s3, 0xff ; GFX6-NEXT: s_lshl_b32 s1, s1, 24 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_fshr_v4i8: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_movk_i32 s13, 0xff ; GFX8-NEXT: s_lshr_b32 s3, s0, 8 ; GFX8-NEXT: s_lshr_b32 s4, s0, 16 ; GFX8-NEXT: s_lshr_b32 s5, s0, 24 @@ -822,14 +814,14 @@ ; GFX8-NEXT: s_and_b32 s12, s2, 7 ; GFX8-NEXT: s_andn2_b32 s2, 7, s2 ; GFX8-NEXT: s_lshl_b32 s0, s0, 1 -; GFX8-NEXT: s_and_b32 s1, s1, s13 +; GFX8-NEXT: s_and_b32 s1, s1, 0xff ; GFX8-NEXT: s_lshl_b32 s0, s0, s2 ; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX8-NEXT: s_andn2_b32 s2, 7, s9 ; GFX8-NEXT: s_lshl_b32 s3, s3, 1 ; GFX8-NEXT: s_lshr_b32 s1, s1, s12 ; GFX8-NEXT: s_lshl_b32 s2, s3, s2 -; GFX8-NEXT: s_and_b32 s3, s6, s13 +; GFX8-NEXT: s_and_b32 s3, s6, 0xff ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: s_and_b32 s1, s9, 7 ; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 @@ -837,33 +829,32 @@ ; GFX8-NEXT: s_andn2_b32 s3, 7, s10 ; GFX8-NEXT: s_lshl_b32 s4, s4, 1 ; GFX8-NEXT: s_lshl_b32 s3, s4, s3 -; GFX8-NEXT: s_and_b32 s4, s7, s13 +; GFX8-NEXT: s_and_b32 s4, s7, 0xff ; GFX8-NEXT: s_or_b32 s1, s2, s1 ; GFX8-NEXT: s_and_b32 s2, s10, 7 ; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 ; GFX8-NEXT: s_lshr_b32 s2, s4, s2 -; GFX8-NEXT: s_and_b32 s1, s1, s13 +; GFX8-NEXT: s_and_b32 s1, s1, 0xff ; GFX8-NEXT: s_or_b32 s2, s3, s2 ; GFX8-NEXT: s_and_b32 s3, s11, 7 ; GFX8-NEXT: s_andn2_b32 s4, 7, s11 ; GFX8-NEXT: s_lshl_b32 s5, s5, 1 -; GFX8-NEXT: s_and_b32 s0, s0, s13 +; GFX8-NEXT: s_and_b32 s0, s0, 0xff ; GFX8-NEXT: s_lshl_b32 s1, s1, 8 ; GFX8-NEXT: s_lshl_b32 s4, s5, s4 ; GFX8-NEXT: s_lshr_b32 s3, s8, s3 ; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s1, s2, s13 +; GFX8-NEXT: s_and_b32 s1, s2, 0xff ; GFX8-NEXT: s_or_b32 s3, s4, s3 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s1, s3, s13 +; GFX8-NEXT: s_and_b32 s1, s3, 0xff ; GFX8-NEXT: s_lshl_b32 s1, s1, 24 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_fshr_v4i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_movk_i32 s13, 0xff ; GFX9-NEXT: s_lshr_b32 s3, s0, 8 ; GFX9-NEXT: s_lshr_b32 s4, s0, 16 ; GFX9-NEXT: s_lshr_b32 s5, s0, 24 @@ -876,14 +867,14 @@ ; GFX9-NEXT: s_and_b32 s12, s2, 7 ; GFX9-NEXT: s_andn2_b32 s2, 7, s2 ; GFX9-NEXT: s_lshl_b32 s0, s0, 1 -; GFX9-NEXT: s_and_b32 s1, s1, s13 +; GFX9-NEXT: s_and_b32 s1, s1, 0xff ; GFX9-NEXT: s_lshl_b32 s0, s0, s2 ; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX9-NEXT: s_andn2_b32 s2, 7, s9 ; GFX9-NEXT: s_lshl_b32 s3, s3, 1 ; GFX9-NEXT: s_lshr_b32 s1, s1, s12 ; GFX9-NEXT: s_lshl_b32 s2, s3, s2 -; GFX9-NEXT: s_and_b32 s3, s6, s13 +; GFX9-NEXT: s_and_b32 s3, s6, 0xff ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: s_and_b32 s1, s9, 7 ; GFX9-NEXT: s_bfe_u32 s3, s3, 0x100000 @@ -891,26 +882,26 @@ ; GFX9-NEXT: s_andn2_b32 s3, 7, s10 ; GFX9-NEXT: s_lshl_b32 s4, s4, 1 ; GFX9-NEXT: s_lshl_b32 s3, s4, s3 -; GFX9-NEXT: s_and_b32 s4, s7, s13 +; GFX9-NEXT: s_and_b32 s4, s7, 0xff ; GFX9-NEXT: s_or_b32 s1, s2, s1 ; GFX9-NEXT: s_and_b32 s2, s10, 7 ; GFX9-NEXT: s_bfe_u32 s4, s4, 0x100000 ; GFX9-NEXT: s_lshr_b32 s2, s4, s2 -; GFX9-NEXT: s_and_b32 s1, s1, s13 +; GFX9-NEXT: s_and_b32 s1, s1, 0xff ; GFX9-NEXT: s_or_b32 s2, s3, s2 ; GFX9-NEXT: s_and_b32 s3, s11, 7 ; GFX9-NEXT: s_andn2_b32 s4, 7, s11 ; GFX9-NEXT: s_lshl_b32 s5, s5, 1 -; GFX9-NEXT: s_and_b32 s0, s0, s13 +; GFX9-NEXT: s_and_b32 s0, s0, 0xff ; GFX9-NEXT: s_lshl_b32 s1, s1, 8 ; GFX9-NEXT: s_lshl_b32 s4, s5, s4 ; GFX9-NEXT: s_lshr_b32 s3, s8, s3 ; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: s_and_b32 s1, s2, s13 +; GFX9-NEXT: s_and_b32 s1, s2, 0xff ; GFX9-NEXT: s_or_b32 s3, s4, s3 ; GFX9-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: s_and_b32 s1, s3, s13 +; GFX9-NEXT: s_and_b32 s1, s3, 0xff ; GFX9-NEXT: s_lshl_b32 s1, s1, 24 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog @@ -918,7 +909,6 @@ ; GFX10-LABEL: s_fshr_v4i8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_lshr_b32 s6, s1, 8 -; GFX10-NEXT: s_movk_i32 s13, 0xff ; GFX10-NEXT: s_lshr_b32 s3, s0, 8 ; GFX10-NEXT: s_lshr_b32 s4, s0, 16 ; GFX10-NEXT: s_lshr_b32 s5, s0, 24 @@ -929,9 +919,9 @@ ; GFX10-NEXT: s_lshr_b32 s11, s2, 24 ; GFX10-NEXT: s_and_b32 s12, s2, 7 ; GFX10-NEXT: s_andn2_b32 s2, 7, s2 -; GFX10-NEXT: s_and_b32 s1, s1, s13 +; GFX10-NEXT: s_and_b32 s1, s1, 0xff ; GFX10-NEXT: s_lshl_b32 s0, s0, 1 -; GFX10-NEXT: s_and_b32 s6, s6, s13 +; GFX10-NEXT: s_and_b32 s6, s6, 0xff ; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX10-NEXT: s_lshl_b32 s0, s0, s2 ; GFX10-NEXT: s_and_b32 s2, s9, 7 @@ -941,7 +931,7 @@ ; GFX10-NEXT: s_lshr_b32 s1, s1, s12 ; GFX10-NEXT: s_lshl_b32 s3, s3, s9 ; GFX10-NEXT: s_lshr_b32 s2, s6, s2 -; GFX10-NEXT: s_and_b32 s6, s7, s13 +; GFX10-NEXT: s_and_b32 s6, s7, 0xff ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: s_or_b32 s1, s3, s2 ; GFX10-NEXT: s_and_b32 s2, s10, 7 @@ -956,14 +946,14 @@ ; GFX10-NEXT: s_lshl_b32 s4, s5, s4 ; GFX10-NEXT: s_lshr_b32 s5, s8, s6 ; GFX10-NEXT: s_or_b32 s2, s3, s2 -; GFX10-NEXT: s_and_b32 s1, s1, s13 +; GFX10-NEXT: s_and_b32 s1, s1, 0xff ; GFX10-NEXT: s_or_b32 s3, s4, s5 -; GFX10-NEXT: s_and_b32 s0, s0, s13 +; GFX10-NEXT: s_and_b32 s0, s0, 0xff ; GFX10-NEXT: s_lshl_b32 s1, s1, 8 -; GFX10-NEXT: s_and_b32 s2, s2, s13 +; GFX10-NEXT: s_and_b32 s2, s2, 0xff ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: s_lshl_b32 s1, s2, 16 -; GFX10-NEXT: s_and_b32 s2, s3, s13 +; GFX10-NEXT: s_and_b32 s2, s3, 0xff ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: s_lshl_b32 s1, s2, 24 ; GFX10-NEXT: s_or_b32 s0, s0, s1 @@ -1136,52 +1126,51 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshrrev_b32_e32 v6, 8, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX10-NEXT: v_xor_b32_e32 v8, -1, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v12, 24, v2 -; GFX10-NEXT: v_xor_b32_e32 v11, -1, v6 +; GFX10-NEXT: v_xor_b32_e32 v10, -1, v5 ; GFX10-NEXT: v_lshlrev_b16 v3, 1, v3 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 8, v1 -; GFX10-NEXT: v_and_b32_e32 v11, 7, v11 -; GFX10-NEXT: v_and_b32_e32 v8, 7, v8 +; GFX10-NEXT: v_and_b32_e32 v10, 7, v10 ; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0 +; GFX10-NEXT: v_and_b32_e32 v8, 7, v8 ; GFX10-NEXT: v_mov_b32_e32 v13, 0xff ; GFX10-NEXT: v_xor_b32_e32 v14, -1, v12 -; GFX10-NEXT: v_lshlrev_b16 v3, v11, v3 -; GFX10-NEXT: v_xor_b32_e32 v11, -1, v10 -; GFX10-NEXT: s_movk_i32 s4, 0xff +; GFX10-NEXT: v_lshlrev_b16 v3, v10, v3 +; GFX10-NEXT: v_xor_b32_e32 v10, -1, v11 ; GFX10-NEXT: v_lshrrev_b32_e32 v9, 24, v1 ; GFX10-NEXT: v_lshlrev_b16 v0, v8, v0 -; GFX10-NEXT: v_and_b32_e32 v8, s4, v1 -; GFX10-NEXT: v_and_b32_e32 v6, 7, v6 -; GFX10-NEXT: v_and_b32_e32 v7, s4, v7 -; GFX10-NEXT: v_and_b32_e32 v10, 7, v10 +; GFX10-NEXT: v_and_b32_e32 v8, 0xff, v1 +; GFX10-NEXT: v_and_b32_e32 v5, 7, v5 +; GFX10-NEXT: v_and_b32_e32 v7, 0xff, v7 ; GFX10-NEXT: v_and_b32_e32 v11, 7, v11 +; GFX10-NEXT: v_and_b32_e32 v10, 7, v10 ; GFX10-NEXT: v_lshlrev_b16 v4, 1, v4 ; GFX10-NEXT: v_and_b32_sdwa v1, v1, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_and_b32_e32 v13, 7, v14 -; GFX10-NEXT: v_lshlrev_b16 v5, 1, v5 +; GFX10-NEXT: v_lshlrev_b16 v6, 1, v6 ; GFX10-NEXT: v_and_b32_e32 v12, 7, v12 ; GFX10-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX10-NEXT: v_lshrrev_b16 v6, v6, v7 -; GFX10-NEXT: v_lshlrev_b16 v4, v11, v4 -; GFX10-NEXT: v_lshrrev_b16 v1, v10, v1 -; GFX10-NEXT: v_lshlrev_b16 v5, v13, v5 +; GFX10-NEXT: v_lshrrev_b16 v5, v5, v7 +; GFX10-NEXT: v_lshlrev_b16 v4, v10, v4 +; GFX10-NEXT: v_lshrrev_b16 v1, v11, v1 +; GFX10-NEXT: v_lshlrev_b16 v6, v13, v6 ; GFX10-NEXT: v_lshrrev_b16 v7, v12, v9 ; GFX10-NEXT: v_lshrrev_b16 v2, v2, v8 -; GFX10-NEXT: v_or_b32_e32 v3, v3, v6 -; GFX10-NEXT: v_mov_b32_e32 v6, 8 +; GFX10-NEXT: v_or_b32_e32 v3, v3, v5 +; GFX10-NEXT: v_mov_b32_e32 v5, 8 ; GFX10-NEXT: v_or_b32_e32 v1, v4, v1 -; GFX10-NEXT: v_or_b32_e32 v4, v5, v7 +; GFX10-NEXT: v_or_b32_e32 v4, v6, v7 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX10-NEXT: v_and_b32_e32 v3, s4, v4 -; GFX10-NEXT: v_and_or_b32 v0, v0, s4, v2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v3 ; GFX10-NEXT: v_or3_b32 v0, v0, v1, v2 @@ -1200,12 +1189,12 @@ ; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_mov_b32_e32 v1, 0xffffffe8 +; GFX6-NEXT: s_and_b32 s2, s2, 0xffffff ; GFX6-NEXT: s_mov_b32 s3, 0xffffff -; GFX6-NEXT: s_and_b32 s2, s2, s3 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: s_lshl_b32 s0, s0, 1 -; GFX6-NEXT: s_and_b32 s1, s1, s3 +; GFX6-NEXT: s_and_b32 s1, s1, 0xffffff ; GFX6-NEXT: v_mul_lo_u32 v1, v1, v0 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 @@ -1232,12 +1221,12 @@ ; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0xffffffe8 +; GFX8-NEXT: s_and_b32 s2, s2, 0xffffff ; GFX8-NEXT: s_mov_b32 s3, 0xffffff -; GFX8-NEXT: s_and_b32 s2, s2, s3 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: s_lshl_b32 s0, s0, 1 -; GFX8-NEXT: s_and_b32 s1, s1, s3 +; GFX8-NEXT: s_and_b32 s1, s1, 0xffffff ; GFX8-NEXT: v_mul_lo_u32 v1, v1, v0 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 @@ -1264,11 +1253,11 @@ ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffffffe8 +; GFX9-NEXT: s_and_b32 s2, s2, 0xffffff ; GFX9-NEXT: s_mov_b32 s3, 0xffffff -; GFX9-NEXT: s_and_b32 s2, s2, s3 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: s_and_b32 s1, s1, s3 +; GFX9-NEXT: s_and_b32 s1, s1, 0xffffff ; GFX9-NEXT: s_lshl_b32 s0, s0, 1 ; GFX9-NEXT: v_mul_lo_u32 v1, v1, v0 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 @@ -1293,10 +1282,9 @@ ; GFX10-LABEL: s_fshr_i24: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 -; GFX10-NEXT: s_mov_b32 s3, 0xffffff +; GFX10-NEXT: s_and_b32 s2, s2, 0xffffff +; GFX10-NEXT: s_and_b32 s1, s1, 0xffffff ; GFX10-NEXT: s_lshl_b32 s0, s0, 1 -; GFX10-NEXT: s_and_b32 s2, s2, s3 -; GFX10-NEXT: s_and_b32 s1, s1, s3 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -1313,8 +1301,8 @@ ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX10-NEXT: v_sub_nc_u32_e32 v1, 23, v0 -; GFX10-NEXT: v_and_b32_e32 v0, s3, v0 -; GFX10-NEXT: v_and_b32_e32 v1, s3, v1 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffffff, v1 ; GFX10-NEXT: v_lshrrev_b32_e64 v0, v0, s1 ; GFX10-NEXT: v_lshl_or_b32 v0, s0, v1, v0 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 @@ -1425,6 +1413,7 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v3, 24 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffffff, v2 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffffff, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v3, v3 ; GFX10-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 @@ -1432,9 +1421,7 @@ ; GFX10-NEXT: v_mul_lo_u32 v4, 0xffffffe8, v3 ; GFX10-NEXT: v_mul_hi_u32 v4, v3, v4 ; GFX10-NEXT: v_add_nc_u32_e32 v3, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, 0xffffff ; GFX10-NEXT: v_mul_hi_u32 v3, v2, v3 -; GFX10-NEXT: v_and_b32_e32 v1, v1, v4 ; GFX10-NEXT: v_mul_lo_u32 v3, v3, 24 ; GFX10-NEXT: v_sub_nc_u32_e32 v2, v2, v3 ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 24, v2 @@ -1444,8 +1431,8 @@ ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v2 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo ; GFX10-NEXT: v_sub_nc_u32_e32 v3, 23, v2 -; GFX10-NEXT: v_and_b32_e32 v2, v2, v4 -; GFX10-NEXT: v_and_b32_e32 v3, v3, v4 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffffff, v2 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffffff, v3 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, v2, v1 ; GFX10-NEXT: v_lshl_or_b32 v0, v0, v3, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -1458,38 +1445,36 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX6-NEXT: s_movk_i32 s9, 0xff -; GFX6-NEXT: s_mov_b32 s11, 0x80008 ; GFX6-NEXT: s_lshr_b32 s6, s0, 16 -; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 -; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: s_lshr_b32 s7, s0, 24 ; GFX6-NEXT: s_lshr_b32 s8, s1, 8 -; GFX6-NEXT: s_and_b32 s10, s0, s9 -; GFX6-NEXT: s_bfe_u32 s0, s0, s11 -; GFX6-NEXT: s_and_b32 s1, s1, s9 +; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX6-NEXT: s_and_b32 s10, s0, 0xff +; GFX6-NEXT: s_bfe_u32 s0, s0, 0x80008 +; GFX6-NEXT: s_and_b32 s1, s1, 0xff ; GFX6-NEXT: s_lshl_b32 s0, s0, 8 ; GFX6-NEXT: s_lshl_b32 s1, s1, 8 ; GFX6-NEXT: v_mov_b32_e32 v1, 0xffffffe8 ; GFX6-NEXT: s_or_b32 s0, s10, s0 ; GFX6-NEXT: s_or_b32 s1, s7, s1 -; GFX6-NEXT: s_and_b32 s7, s8, s9 +; GFX6-NEXT: s_and_b32 s7, s8, 0xff ; GFX6-NEXT: s_lshr_b32 s8, s2, 16 ; GFX6-NEXT: s_lshr_b32 s10, s2, 24 -; GFX6-NEXT: s_and_b32 s13, s2, s9 -; GFX6-NEXT: s_bfe_u32 s2, s2, s11 +; GFX6-NEXT: s_and_b32 s12, s2, 0xff +; GFX6-NEXT: s_bfe_u32 s2, s2, 0x80008 ; GFX6-NEXT: v_mul_lo_u32 v2, v1, v0 ; GFX6-NEXT: s_lshl_b32 s2, s2, 8 -; GFX6-NEXT: s_and_b32 s8, s8, s9 -; GFX6-NEXT: s_or_b32 s2, s13, s2 +; GFX6-NEXT: s_and_b32 s8, s8, 0xff +; GFX6-NEXT: s_or_b32 s2, s12, s2 ; GFX6-NEXT: s_bfe_u32 s8, s8, 0x100000 -; GFX6-NEXT: s_lshr_b32 s12, s3, 8 +; GFX6-NEXT: s_lshr_b32 s11, s3, 8 ; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000 ; GFX6-NEXT: s_lshl_b32 s8, s8, 16 -; GFX6-NEXT: s_and_b32 s3, s3, s9 +; GFX6-NEXT: s_and_b32 s3, s3, 0xff ; GFX6-NEXT: s_or_b32 s2, s2, s8 ; GFX6-NEXT: s_lshl_b32 s3, s3, 8 -; GFX6-NEXT: s_and_b32 s8, s12, s9 +; GFX6-NEXT: s_and_b32 s8, s11, 0xff ; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX6-NEXT: s_or_b32 s3, s10, s3 ; GFX6-NEXT: s_bfe_u32 s8, s8, 0x100000 @@ -1498,13 +1483,13 @@ ; GFX6-NEXT: s_or_b32 s3, s3, s8 ; GFX6-NEXT: s_lshr_b32 s8, s4, 16 ; GFX6-NEXT: s_lshr_b32 s10, s4, 24 -; GFX6-NEXT: s_and_b32 s13, s4, s9 -; GFX6-NEXT: s_bfe_u32 s4, s4, s11 +; GFX6-NEXT: s_and_b32 s12, s4, 0xff +; GFX6-NEXT: s_bfe_u32 s4, s4, 0x80008 ; GFX6-NEXT: s_lshl_b32 s4, s4, 8 -; GFX6-NEXT: s_and_b32 s8, s8, s9 +; GFX6-NEXT: s_and_b32 s8, s8, 0xff ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, 24 -; GFX6-NEXT: s_or_b32 s4, s13, s4 +; GFX6-NEXT: s_or_b32 s4, s12, s4 ; GFX6-NEXT: s_bfe_u32 s8, s8, 0x100000 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000 @@ -1513,16 +1498,16 @@ ; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX6-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX6-NEXT: s_lshr_b32 s12, s5, 8 +; GFX6-NEXT: s_lshr_b32 s11, s5, 8 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, 24 -; GFX6-NEXT: s_and_b32 s5, s5, s9 +; GFX6-NEXT: s_and_b32 s5, s5, 0xff ; GFX6-NEXT: v_mul_lo_u32 v1, v1, v2 ; GFX6-NEXT: s_lshl_b32 s5, s5, 8 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 ; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 24, v0 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX6-NEXT: v_mul_hi_u32 v1, v2, v1 -; GFX6-NEXT: s_and_b32 s8, s12, s9 +; GFX6-NEXT: s_and_b32 s8, s11, 0xff ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX6-NEXT: s_or_b32 s5, s10, s5 ; GFX6-NEXT: s_bfe_u32 s8, s8, 0x100000 @@ -1534,7 +1519,7 @@ ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; GFX6-NEXT: v_mul_hi_u32 v1, s5, v1 -; GFX6-NEXT: s_and_b32 s6, s6, s9 +; GFX6-NEXT: s_and_b32 s6, s6, 0xff ; GFX6-NEXT: s_bfe_u32 s0, s0, 0x100000 ; GFX6-NEXT: s_bfe_u32 s6, s6, 0x100000 ; GFX6-NEXT: v_mul_lo_u32 v1, v1, 24 @@ -1564,6 +1549,7 @@ ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: v_and_b32_e32 v2, v2, v4 ; GFX6-NEXT: v_and_b32_e32 v1, v1, v4 +; GFX6-NEXT: s_movk_i32 s9, 0xff ; GFX6-NEXT: v_lshl_b32_e32 v2, s0, v2 ; GFX6-NEXT: v_lshr_b32_e32 v1, s3, v1 ; GFX6-NEXT: v_bfe_u32 v3, v0, 8, 8 @@ -1589,55 +1575,54 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX8-NEXT: s_movk_i32 s10, 0xff ; GFX8-NEXT: s_lshr_b32 s9, s1, 8 -; GFX8-NEXT: s_bfe_u32 s11, 8, 0x100000 -; GFX8-NEXT: s_and_b32 s1, s1, s10 +; GFX8-NEXT: s_bfe_u32 s10, 8, 0x100000 +; GFX8-NEXT: s_and_b32 s1, s1, 0xff ; GFX8-NEXT: s_lshr_b32 s6, s0, 8 ; GFX8-NEXT: s_lshr_b32 s8, s0, 24 -; GFX8-NEXT: s_lshl_b32 s1, s1, s11 +; GFX8-NEXT: s_lshl_b32 s1, s1, s10 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 -; GFX8-NEXT: s_and_b32 s6, s6, s10 +; GFX8-NEXT: s_and_b32 s6, s6, 0xff ; GFX8-NEXT: s_or_b32 s1, s8, s1 ; GFX8-NEXT: s_lshr_b32 s8, s2, 8 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: s_lshr_b32 s7, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, s10 -; GFX8-NEXT: s_lshl_b32 s6, s6, s11 -; GFX8-NEXT: s_and_b32 s8, s8, s10 +; GFX8-NEXT: s_and_b32 s0, s0, 0xff +; GFX8-NEXT: s_lshl_b32 s6, s6, s10 +; GFX8-NEXT: s_and_b32 s8, s8, 0xff ; GFX8-NEXT: s_or_b32 s0, s0, s6 -; GFX8-NEXT: s_and_b32 s6, s7, s10 -; GFX8-NEXT: s_and_b32 s7, s9, s10 +; GFX8-NEXT: s_and_b32 s6, s7, 0xff +; GFX8-NEXT: s_and_b32 s7, s9, 0xff ; GFX8-NEXT: s_lshr_b32 s9, s2, 16 -; GFX8-NEXT: s_lshr_b32 s12, s2, 24 -; GFX8-NEXT: s_and_b32 s2, s2, s10 -; GFX8-NEXT: s_lshl_b32 s8, s8, s11 +; GFX8-NEXT: s_lshr_b32 s11, s2, 24 +; GFX8-NEXT: s_and_b32 s2, s2, 0xff +; GFX8-NEXT: s_lshl_b32 s8, s8, s10 ; GFX8-NEXT: s_or_b32 s2, s2, s8 -; GFX8-NEXT: s_and_b32 s8, s9, s10 +; GFX8-NEXT: s_and_b32 s8, s9, 0xff ; GFX8-NEXT: v_mov_b32_e32 v1, 0xffffffe8 ; GFX8-NEXT: s_bfe_u32 s8, s8, 0x100000 ; GFX8-NEXT: v_mul_lo_u32 v2, v1, v0 -; GFX8-NEXT: s_lshr_b32 s13, s3, 8 +; GFX8-NEXT: s_lshr_b32 s12, s3, 8 ; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 ; GFX8-NEXT: s_lshl_b32 s8, s8, 16 -; GFX8-NEXT: s_and_b32 s3, s3, s10 +; GFX8-NEXT: s_and_b32 s3, s3, 0xff ; GFX8-NEXT: s_or_b32 s2, s2, s8 -; GFX8-NEXT: s_lshl_b32 s3, s3, s11 -; GFX8-NEXT: s_and_b32 s8, s13, s10 -; GFX8-NEXT: s_or_b32 s3, s12, s3 +; GFX8-NEXT: s_lshl_b32 s3, s3, s10 +; GFX8-NEXT: s_and_b32 s8, s12, 0xff +; GFX8-NEXT: s_or_b32 s3, s11, s3 ; GFX8-NEXT: s_bfe_u32 s8, s8, 0x100000 ; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 ; GFX8-NEXT: s_lshl_b32 s8, s8, 16 ; GFX8-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX8-NEXT: s_or_b32 s3, s3, s8 ; GFX8-NEXT: s_lshr_b32 s8, s4, 8 -; GFX8-NEXT: s_and_b32 s8, s8, s10 +; GFX8-NEXT: s_and_b32 s8, s8, 0xff ; GFX8-NEXT: s_lshr_b32 s9, s4, 16 -; GFX8-NEXT: s_lshr_b32 s12, s4, 24 -; GFX8-NEXT: s_and_b32 s4, s4, s10 -; GFX8-NEXT: s_lshl_b32 s8, s8, s11 +; GFX8-NEXT: s_lshr_b32 s11, s4, 24 +; GFX8-NEXT: s_and_b32 s4, s4, 0xff +; GFX8-NEXT: s_lshl_b32 s8, s8, s10 ; GFX8-NEXT: s_or_b32 s4, s4, s8 -; GFX8-NEXT: s_and_b32 s8, s9, s10 +; GFX8-NEXT: s_and_b32 s8, s9, 0xff ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v2, 24 ; GFX8-NEXT: s_bfe_u32 s8, s8, 0x100000 @@ -1648,18 +1633,18 @@ ; GFX8-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX8-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GFX8-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX8-NEXT: s_lshr_b32 s13, s5, 8 +; GFX8-NEXT: s_lshr_b32 s12, s5, 8 ; GFX8-NEXT: v_mul_lo_u32 v0, v0, 24 -; GFX8-NEXT: s_and_b32 s5, s5, s10 +; GFX8-NEXT: s_and_b32 s5, s5, 0xff ; GFX8-NEXT: v_mul_lo_u32 v1, v1, v2 -; GFX8-NEXT: s_lshl_b32 s5, s5, s11 +; GFX8-NEXT: s_lshl_b32 s5, s5, s10 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0 ; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, 24, v0 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX8-NEXT: v_mul_hi_u32 v1, v2, v1 -; GFX8-NEXT: s_and_b32 s8, s13, s10 +; GFX8-NEXT: s_and_b32 s8, s12, 0xff ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX8-NEXT: s_or_b32 s5, s12, s5 +; GFX8-NEXT: s_or_b32 s5, s11, s5 ; GFX8-NEXT: s_bfe_u32 s8, s8, 0x100000 ; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, 24, v0 ; GFX8-NEXT: s_bfe_u32 s5, s5, 0x100000 @@ -1707,7 +1692,7 @@ ; GFX8-NEXT: v_or_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX8-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX8-NEXT: v_and_b32_e32 v3, s10, v1 +; GFX8-NEXT: v_and_b32_e32 v3, 0xff, v1 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v3 @@ -1721,45 +1706,44 @@ ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffffffe8 -; GFX9-NEXT: s_movk_i32 s12, 0xff ; GFX9-NEXT: s_lshr_b32 s11, s1, 8 +; GFX9-NEXT: s_bfe_u32 s12, 8, 0x100000 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: s_bfe_u32 s13, 8, 0x100000 -; GFX9-NEXT: s_and_b32 s1, s1, s12 +; GFX9-NEXT: s_and_b32 s1, s1, 0xff ; GFX9-NEXT: s_lshr_b32 s7, s0, 8 -; GFX9-NEXT: v_mul_lo_u32 v2, v1, v0 ; GFX9-NEXT: s_lshr_b32 s10, s0, 24 -; GFX9-NEXT: s_lshl_b32 s1, s1, s13 -; GFX9-NEXT: s_and_b32 s7, s7, s12 -; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 +; GFX9-NEXT: v_mul_lo_u32 v2, v1, v0 +; GFX9-NEXT: s_lshl_b32 s1, s1, s12 +; GFX9-NEXT: s_and_b32 s7, s7, 0xff ; GFX9-NEXT: s_or_b32 s1, s10, s1 +; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX9-NEXT: s_lshr_b32 s10, s2, 8 ; GFX9-NEXT: s_lshr_b32 s9, s0, 16 -; GFX9-NEXT: s_and_b32 s0, s0, s12 -; GFX9-NEXT: s_lshl_b32 s7, s7, s13 -; GFX9-NEXT: s_and_b32 s10, s10, s12 +; GFX9-NEXT: s_and_b32 s0, s0, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s7, s12 +; GFX9-NEXT: s_and_b32 s10, s10, 0xff ; GFX9-NEXT: s_or_b32 s0, s0, s7 -; GFX9-NEXT: s_and_b32 s7, s9, s12 -; GFX9-NEXT: s_and_b32 s9, s11, s12 +; GFX9-NEXT: s_and_b32 s7, s9, 0xff +; GFX9-NEXT: s_and_b32 s9, s11, 0xff ; GFX9-NEXT: s_lshr_b32 s11, s2, 16 -; GFX9-NEXT: s_lshr_b32 s14, s2, 24 -; GFX9-NEXT: s_and_b32 s2, s2, s12 -; GFX9-NEXT: s_lshl_b32 s10, s10, s13 +; GFX9-NEXT: s_lshr_b32 s13, s2, 24 +; GFX9-NEXT: s_and_b32 s2, s2, 0xff +; GFX9-NEXT: s_lshl_b32 s10, s10, s12 ; GFX9-NEXT: s_or_b32 s2, s2, s10 -; GFX9-NEXT: s_and_b32 s10, s11, s12 +; GFX9-NEXT: s_and_b32 s10, s11, 0xff ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, 24 ; GFX9-NEXT: s_bfe_u32 s10, s10, 0x100000 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GFX9-NEXT: s_lshr_b32 s15, s3, 8 +; GFX9-NEXT: s_lshr_b32 s14, s3, 8 ; GFX9-NEXT: s_bfe_u32 s2, s2, 0x100000 ; GFX9-NEXT: s_lshl_b32 s10, s10, 16 -; GFX9-NEXT: s_and_b32 s3, s3, s12 +; GFX9-NEXT: s_and_b32 s3, s3, 0xff ; GFX9-NEXT: s_or_b32 s2, s2, s10 -; GFX9-NEXT: s_lshl_b32 s3, s3, s13 -; GFX9-NEXT: s_and_b32 s10, s15, s12 -; GFX9-NEXT: s_or_b32 s3, s14, s3 +; GFX9-NEXT: s_lshl_b32 s3, s3, s12 +; GFX9-NEXT: s_and_b32 s10, s14, 0xff +; GFX9-NEXT: s_or_b32 s3, s13, s3 ; GFX9-NEXT: s_bfe_u32 s10, s10, 0x100000 ; GFX9-NEXT: s_bfe_u32 s3, s3, 0x100000 ; GFX9-NEXT: s_lshl_b32 s10, s10, 16 @@ -1767,25 +1751,25 @@ ; GFX9-NEXT: s_or_b32 s3, s3, s10 ; GFX9-NEXT: s_lshr_b32 s10, s4, 8 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX9-NEXT: s_and_b32 s10, s10, s12 +; GFX9-NEXT: s_and_b32 s10, s10, 0xff ; GFX9-NEXT: s_lshr_b32 s11, s4, 16 -; GFX9-NEXT: s_lshr_b32 s14, s4, 24 -; GFX9-NEXT: s_and_b32 s4, s4, s12 -; GFX9-NEXT: s_lshl_b32 s10, s10, s13 +; GFX9-NEXT: s_lshr_b32 s13, s4, 24 +; GFX9-NEXT: s_and_b32 s4, s4, 0xff +; GFX9-NEXT: s_lshl_b32 s10, s10, s12 ; GFX9-NEXT: s_or_b32 s4, s4, s10 -; GFX9-NEXT: s_and_b32 s10, s11, s12 +; GFX9-NEXT: s_and_b32 s10, s11, 0xff ; GFX9-NEXT: s_bfe_u32 s10, s10, 0x100000 ; GFX9-NEXT: v_mul_lo_u32 v1, v1, v2 ; GFX9-NEXT: s_bfe_u32 s4, s4, 0x100000 ; GFX9-NEXT: s_lshl_b32 s10, s10, 16 ; GFX9-NEXT: s_or_b32 s4, s4, s10 ; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 -; GFX9-NEXT: s_lshr_b32 s15, s5, 8 -; GFX9-NEXT: s_and_b32 s5, s5, s12 +; GFX9-NEXT: s_lshr_b32 s14, s5, 8 +; GFX9-NEXT: s_and_b32 s5, s5, 0xff ; GFX9-NEXT: v_mul_hi_u32 v1, v2, v1 -; GFX9-NEXT: s_lshl_b32 s5, s5, s13 -; GFX9-NEXT: s_and_b32 s10, s15, s12 -; GFX9-NEXT: s_or_b32 s5, s14, s5 +; GFX9-NEXT: s_lshl_b32 s5, s5, s12 +; GFX9-NEXT: s_and_b32 s10, s14, 0xff +; GFX9-NEXT: s_or_b32 s5, s13, s5 ; GFX9-NEXT: s_bfe_u32 s10, s10, 0x100000 ; GFX9-NEXT: s_bfe_u32 s5, s5, 0x100000 ; GFX9-NEXT: s_lshl_b32 s10, s10, 16 @@ -1831,10 +1815,11 @@ ; GFX9-NEXT: v_lshrrev_b32_e64 v1, v1, s3 ; GFX9-NEXT: s_mov_b32 s6, 8 ; GFX9-NEXT: v_lshl_or_b32 v1, s0, v3, v1 +; GFX9-NEXT: s_movk_i32 s0, 0xff ; GFX9-NEXT: s_mov_b32 s8, 16 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_and_b32_e32 v3, s12, v1 -; GFX9-NEXT: v_and_or_b32 v2, v0, s12, v2 +; GFX9-NEXT: v_and_b32_e32 v3, s0, v1 +; GFX9-NEXT: v_and_or_b32 v2, v0, s0, v2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; GFX9-NEXT: v_or3_b32 v0, v2, v0, v3 @@ -1849,122 +1834,119 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, 24 -; GFX10-NEXT: s_movk_i32 s9, 0xff -; GFX10-NEXT: s_lshr_b32 s12, s4, 8 +; GFX10-NEXT: s_lshr_b32 s9, s1, 8 ; GFX10-NEXT: s_bfe_u32 s10, 8, 0x100000 +; GFX10-NEXT: s_and_b32 s1, s1, 0xff ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX10-NEXT: s_lshr_b32 s13, s4, 16 -; GFX10-NEXT: s_and_b32 s12, s12, s9 -; GFX10-NEXT: s_lshr_b32 s14, s4, 24 -; GFX10-NEXT: s_and_b32 s4, s4, s9 -; GFX10-NEXT: s_lshl_b32 s12, s12, s10 -; GFX10-NEXT: s_and_b32 s13, s13, s9 -; GFX10-NEXT: s_or_b32 s4, s4, s12 -; GFX10-NEXT: s_bfe_u32 s12, s13, 0x100000 +; GFX10-NEXT: s_lshr_b32 s6, s0, 8 +; GFX10-NEXT: s_lshr_b32 s8, s0, 24 +; GFX10-NEXT: s_lshl_b32 s1, s1, s10 +; GFX10-NEXT: s_and_b32 s6, s6, 0xff +; GFX10-NEXT: s_or_b32 s1, s8, s1 +; GFX10-NEXT: s_lshr_b32 s8, s4, 8 +; GFX10-NEXT: s_lshr_b32 s7, s0, 16 +; GFX10-NEXT: s_and_b32 s0, s0, 0xff ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 -; GFX10-NEXT: s_lshr_b32 s15, s5, 8 -; GFX10-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX10-NEXT: s_lshl_b32 s12, s12, 16 +; GFX10-NEXT: s_lshl_b32 s6, s6, s10 +; GFX10-NEXT: s_and_b32 s8, s8, 0xff +; GFX10-NEXT: s_or_b32 s0, s0, s6 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX10-NEXT: s_and_b32 s5, s5, s9 -; GFX10-NEXT: s_or_b32 s4, s4, s12 -; GFX10-NEXT: s_lshl_b32 s5, s5, s10 +; GFX10-NEXT: s_and_b32 s6, s7, 0xff +; GFX10-NEXT: s_and_b32 s7, s9, 0xff +; GFX10-NEXT: s_lshr_b32 s9, s4, 16 ; GFX10-NEXT: v_mul_lo_u32 v2, 0xffffffe8, v0 ; GFX10-NEXT: v_mul_lo_u32 v3, 0xffffffe8, v1 -; GFX10-NEXT: s_and_b32 s12, s15, s9 -; GFX10-NEXT: s_or_b32 s5, s14, s5 -; GFX10-NEXT: s_bfe_u32 s12, s12, 0x100000 -; GFX10-NEXT: s_bfe_u32 s5, s5, 0x100000 -; GFX10-NEXT: s_lshl_b32 s12, s12, 16 -; GFX10-NEXT: s_lshr_b32 s11, s1, 8 +; GFX10-NEXT: s_lshr_b32 s11, s4, 24 +; GFX10-NEXT: s_and_b32 s4, s4, 0xff +; GFX10-NEXT: s_lshl_b32 s8, s8, s10 +; GFX10-NEXT: s_lshr_b32 s12, s5, 8 +; GFX10-NEXT: s_or_b32 s4, s4, s8 +; GFX10-NEXT: s_and_b32 s8, s9, 0xff ; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX10-NEXT: s_or_b32 s5, s5, s12 -; GFX10-NEXT: s_and_b32 s1, s1, s9 -; GFX10-NEXT: s_lshr_b32 s6, s0, 8 -; GFX10-NEXT: s_lshr_b32 s8, s0, 24 -; GFX10-NEXT: s_lshl_b32 s1, s1, s10 -; GFX10-NEXT: s_and_b32 s6, s6, s9 -; GFX10-NEXT: s_or_b32 s1, s8, s1 +; GFX10-NEXT: v_mul_hi_u32 v3, v1, v3 +; GFX10-NEXT: s_bfe_u32 s8, s8, 0x100000 +; GFX10-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX10-NEXT: s_and_b32 s5, s5, 0xff +; GFX10-NEXT: s_lshl_b32 s8, s8, 16 +; GFX10-NEXT: s_lshl_b32 s5, s5, s10 +; GFX10-NEXT: s_or_b32 s4, s4, s8 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2 -; GFX10-NEXT: v_mul_hi_u32 v2, v1, v3 -; GFX10-NEXT: s_lshr_b32 s8, s2, 8 -; GFX10-NEXT: s_lshr_b32 s7, s0, 16 -; GFX10-NEXT: s_and_b32 s0, s0, s9 +; GFX10-NEXT: s_and_b32 s8, s12, 0xff +; GFX10-NEXT: s_or_b32 s5, s11, s5 +; GFX10-NEXT: s_bfe_u32 s8, s8, 0x100000 +; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3 ; GFX10-NEXT: v_mul_hi_u32 v0, s4, v0 -; GFX10-NEXT: s_lshl_b32 s6, s6, s10 -; GFX10-NEXT: s_and_b32 s8, s8, s9 -; GFX10-NEXT: s_or_b32 s0, s0, s6 -; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v2 -; GFX10-NEXT: s_and_b32 s6, s7, s9 -; GFX10-NEXT: s_and_b32 s7, s11, s9 -; GFX10-NEXT: s_lshr_b32 s11, s2, 16 -; GFX10-NEXT: v_mul_lo_u32 v0, v0, 24 +; GFX10-NEXT: s_bfe_u32 s5, s5, 0x100000 +; GFX10-NEXT: s_lshl_b32 s8, s8, 16 +; GFX10-NEXT: s_lshr_b32 s9, s2, 8 +; GFX10-NEXT: s_or_b32 s5, s5, s8 +; GFX10-NEXT: s_lshr_b32 s8, s2, 16 ; GFX10-NEXT: v_mul_hi_u32 v1, s5, v1 -; GFX10-NEXT: s_lshr_b32 s13, s2, 24 -; GFX10-NEXT: s_and_b32 s2, s2, s9 -; GFX10-NEXT: s_lshl_b32 s8, s8, s10 +; GFX10-NEXT: s_and_b32 s9, s9, 0xff +; GFX10-NEXT: v_mul_lo_u32 v0, v0, 24 +; GFX10-NEXT: s_lshr_b32 s11, s2, 24 ; GFX10-NEXT: s_lshr_b32 s12, s3, 8 -; GFX10-NEXT: s_or_b32 s2, s2, s8 -; GFX10-NEXT: s_and_b32 s8, s11, s9 -; GFX10-NEXT: v_sub_nc_u32_e32 v0, s4, v0 +; GFX10-NEXT: s_and_b32 s2, s2, 0xff +; GFX10-NEXT: s_lshl_b32 s9, s9, s10 +; GFX10-NEXT: s_and_b32 s8, s8, 0xff ; GFX10-NEXT: v_mul_lo_u32 v1, v1, 24 +; GFX10-NEXT: s_and_b32 s3, s3, 0xff +; GFX10-NEXT: v_sub_nc_u32_e32 v0, s4, v0 +; GFX10-NEXT: s_or_b32 s2, s2, s9 ; GFX10-NEXT: s_bfe_u32 s4, s8, 0x100000 +; GFX10-NEXT: s_lshl_b32 s3, s3, s10 ; GFX10-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX10-NEXT: s_lshl_b32 s4, s4, 16 ; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 24, v0 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 -; GFX10-NEXT: s_and_b32 s3, s3, s9 -; GFX10-NEXT: s_or_b32 s2, s2, s4 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, s5, v1 -; GFX10-NEXT: s_mov_b32 s4, 0xffffff +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 +; GFX10-NEXT: s_and_b32 s5, s12, 0xff +; GFX10-NEXT: s_lshl_b32 s4, s4, 16 +; GFX10-NEXT: s_or_b32 s3, s11, s3 +; GFX10-NEXT: s_bfe_u32 s5, s5, 0x100000 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo -; GFX10-NEXT: s_lshl_b32 s3, s3, s10 -; GFX10-NEXT: s_and_b32 s5, s12, s9 ; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 24, v1 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 24, v0 -; GFX10-NEXT: s_or_b32 s3, s13, s3 -; GFX10-NEXT: s_bfe_u32 s5, s5, 0x100000 ; GFX10-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX10-NEXT: s_lshl_b32 s5, s5, 16 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 24, v0 +; GFX10-NEXT: s_or_b32 s2, s2, s4 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 -; GFX10-NEXT: s_lshl_b32 s5, s5, 16 ; GFX10-NEXT: s_bfe_u32 s0, s0, 0x100000 ; GFX10-NEXT: s_bfe_u32 s6, s6, 0x100000 +; GFX10-NEXT: s_or_b32 s3, s3, s5 ; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 24, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1 -; GFX10-NEXT: s_or_b32 s3, s3, s5 ; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX10-NEXT: s_bfe_u32 s7, s7, 0x100000 +; GFX10-NEXT: s_lshl_b32 s4, s6, 17 ; GFX10-NEXT: v_sub_nc_u32_e32 v3, 23, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v2, 0xffffff -; GFX10-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX10-NEXT: s_lshl_b32 s5, s6, 17 -; GFX10-NEXT: v_and_b32_e32 v3, s4, v3 -; GFX10-NEXT: v_sub_nc_u32_e32 v4, 23, v1 -; GFX10-NEXT: v_and_b32_e32 v1, v1, v2 -; GFX10-NEXT: v_lshrrev_b32_e64 v0, v0, s2 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffffff, v0 ; GFX10-NEXT: s_lshl_b32 s0, s0, 1 +; GFX10-NEXT: s_lshl_b32 s1, s1, 1 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffffff, v3 +; GFX10-NEXT: v_sub_nc_u32_e32 v2, 23, v1 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffffff, v1 +; GFX10-NEXT: v_lshrrev_b32_e64 v0, v0, s2 +; GFX10-NEXT: s_or_b32 s0, s4, s0 ; GFX10-NEXT: s_lshl_b32 s2, s7, 17 -; GFX10-NEXT: v_and_b32_e32 v2, v4, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; GFX10-NEXT: v_lshrrev_b32_e64 v1, v1, s3 -; GFX10-NEXT: s_or_b32 s0, s5, s0 -; GFX10-NEXT: s_lshl_b32 s1, s1, 1 ; GFX10-NEXT: v_lshl_or_b32 v0, s0, v3, v0 ; GFX10-NEXT: s_or_b32 s0, s2, s1 ; GFX10-NEXT: v_lshl_or_b32 v1, s0, v2, v1 ; GFX10-NEXT: s_mov_b32 s0, 8 ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: s_mov_b32 s0, 16 -; GFX10-NEXT: v_and_b32_e32 v3, s9, v1 +; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v1 ; GFX10-NEXT: v_bfe_u32 v4, v1, 8, 8 ; GFX10-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX10-NEXT: v_and_or_b32 v2, v0, s9, v2 +; GFX10-NEXT: v_and_or_b32 v2, v0, 0xff, v2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; GFX10-NEXT: v_lshl_or_b32 v1, v1, 8, v4 @@ -2156,14 +2138,13 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v6, 24 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v7, 24 -; GFX10-NEXT: v_mov_b32_e32 v10, 0xffffff ; GFX10-NEXT: v_and_b32_e32 v4, 0xffffff, v4 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffffff, v5 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v6, v6 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v7, v7 -; GFX10-NEXT: v_and_b32_e32 v5, v5, v10 -; GFX10-NEXT: v_and_b32_e32 v2, v2, v10 -; GFX10-NEXT: v_and_b32_e32 v3, v3, v10 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffffff, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 1, v1 ; GFX10-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 ; GFX10-NEXT: v_mul_f32_e32 v7, 0x4f7ffffe, v7 @@ -2194,12 +2175,12 @@ ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v5 ; GFX10-NEXT: v_sub_nc_u32_e32 v6, 23, v4 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc_lo -; GFX10-NEXT: v_and_b32_e32 v4, v4, v10 -; GFX10-NEXT: v_and_b32_e32 v6, v6, v10 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffffff, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffffff, v6 ; GFX10-NEXT: v_sub_nc_u32_e32 v7, 23, v5 -; GFX10-NEXT: v_and_b32_e32 v5, v5, v10 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffffff, v5 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v2 -; GFX10-NEXT: v_and_b32_e32 v4, v7, v10 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffffff, v7 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v3 ; GFX10-NEXT: v_lshl_or_b32 v0, v0, v6, v2 ; GFX10-NEXT: v_lshl_or_b32 v1, v1, v4, v3 @@ -3032,25 +3013,24 @@ ; GFX6-NEXT: s_and_b32 s4, s4, 0xffff ; GFX6-NEXT: s_or_b32 s4, s5, s4 ; GFX6-NEXT: s_bfe_u32 s5, 1, 0x100000 -; GFX6-NEXT: s_mov_b32 s6, 0xf0001 ; GFX6-NEXT: s_lshl_b32 s0, s0, s5 -; GFX6-NEXT: s_bfe_u32 s7, s2, s6 -; GFX6-NEXT: s_bfe_u32 s8, 14, 0x100000 +; GFX6-NEXT: s_bfe_u32 s6, s2, 0xf0001 +; GFX6-NEXT: s_bfe_u32 s7, 14, 0x100000 ; GFX6-NEXT: s_lshl_b32 s1, s1, s5 -; GFX6-NEXT: s_bfe_u32 s5, s3, s6 -; GFX6-NEXT: s_lshr_b32 s7, s7, s8 -; GFX6-NEXT: s_lshr_b32 s5, s5, s8 +; GFX6-NEXT: s_bfe_u32 s5, s3, 0xf0001 +; GFX6-NEXT: s_lshr_b32 s6, s6, s7 +; GFX6-NEXT: s_lshr_b32 s5, s5, s7 ; GFX6-NEXT: s_xor_b32 s4, s4, -1 -; GFX6-NEXT: s_or_b32 s0, s0, s7 +; GFX6-NEXT: s_or_b32 s0, s0, s6 ; GFX6-NEXT: s_or_b32 s1, s1, s5 ; GFX6-NEXT: s_lshl_b32 s2, s2, 1 ; GFX6-NEXT: s_lshr_b32 s5, s4, 16 -; GFX6-NEXT: s_and_b32 s7, s4, 15 +; GFX6-NEXT: s_and_b32 s6, s4, 15 ; GFX6-NEXT: s_andn2_b32 s4, 15, s4 -; GFX6-NEXT: s_bfe_u32 s7, s7, 0x100000 -; GFX6-NEXT: s_bfe_u32 s2, s2, s6 +; GFX6-NEXT: s_bfe_u32 s6, s6, 0x100000 +; GFX6-NEXT: s_bfe_u32 s2, s2, 0xf0001 ; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX6-NEXT: s_lshl_b32 s0, s0, s7 +; GFX6-NEXT: s_lshl_b32 s0, s0, s6 ; GFX6-NEXT: s_lshr_b32 s2, s2, s4 ; GFX6-NEXT: s_or_b32 s0, s0, s2 ; GFX6-NEXT: s_and_b32 s2, s5, 15 @@ -3058,7 +3038,7 @@ ; GFX6-NEXT: s_andn2_b32 s4, 15, s5 ; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000 ; GFX6-NEXT: s_lshl_b32 s1, s1, s2 -; GFX6-NEXT: s_bfe_u32 s2, s3, s6 +; GFX6-NEXT: s_bfe_u32 s2, s3, 0xf0001 ; GFX6-NEXT: s_bfe_u32 s3, s4, 0x100000 ; GFX6-NEXT: s_lshr_b32 s2, s2, s3 ; GFX6-NEXT: s_or_b32 s1, s1, s2 @@ -3111,45 +3091,43 @@ ; ; GFX9-LABEL: s_fshr_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s3, 0xf000f -; GFX9-NEXT: s_and_b32 s4, s2, s3 -; GFX9-NEXT: s_andn2_b32 s2, s3, s2 -; GFX9-NEXT: s_lshr_b32 s3, s0, 16 +; GFX9-NEXT: s_lshr_b32 s4, s0, 16 ; GFX9-NEXT: s_lshl_b32 s0, s0, 0x10001 -; GFX9-NEXT: s_lshl_b32 s3, s3, 1 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s3 -; GFX9-NEXT: s_lshr_b32 s3, s0, 16 +; GFX9-NEXT: s_lshl_b32 s4, s4, 1 +; GFX9-NEXT: s_and_b32 s3, s2, 0xf000f +; GFX9-NEXT: s_andn2_b32 s2, 0xf000f, s2 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s4 +; GFX9-NEXT: s_lshr_b32 s4, s0, 16 ; GFX9-NEXT: s_lshr_b32 s5, s2, 16 ; GFX9-NEXT: s_lshl_b32 s0, s0, s2 -; GFX9-NEXT: s_lshl_b32 s2, s3, s5 +; GFX9-NEXT: s_lshl_b32 s2, s4, s5 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 ; GFX9-NEXT: s_lshr_b32 s2, s1, 16 ; GFX9-NEXT: s_and_b32 s1, s1, 0xffff -; GFX9-NEXT: s_lshr_b32 s3, s4, 16 -; GFX9-NEXT: s_lshr_b32 s1, s1, s4 -; GFX9-NEXT: s_lshr_b32 s2, s2, s3 +; GFX9-NEXT: s_lshr_b32 s4, s3, 16 +; GFX9-NEXT: s_lshr_b32 s1, s1, s3 +; GFX9-NEXT: s_lshr_b32 s2, s2, s4 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s2 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_fshr_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_lshr_b32 s4, s0, 16 -; GFX10-NEXT: s_mov_b32 s3, 0xf000f +; GFX10-NEXT: s_lshr_b32 s3, s0, 16 ; GFX10-NEXT: s_lshl_b32 s0, s0, 0x10001 -; GFX10-NEXT: s_lshl_b32 s4, s4, 1 -; GFX10-NEXT: s_and_b32 s5, s2, s3 -; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s4 -; GFX10-NEXT: s_andn2_b32 s2, s3, s2 +; GFX10-NEXT: s_lshl_b32 s3, s3, 1 +; GFX10-NEXT: s_and_b32 s4, s2, 0xf000f +; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s3 +; GFX10-NEXT: s_andn2_b32 s2, 0xf000f, s2 ; GFX10-NEXT: s_lshr_b32 s3, s0, 16 -; GFX10-NEXT: s_lshr_b32 s4, s2, 16 +; GFX10-NEXT: s_lshr_b32 s5, s2, 16 ; GFX10-NEXT: s_lshl_b32 s0, s0, s2 -; GFX10-NEXT: s_lshl_b32 s2, s3, s4 +; GFX10-NEXT: s_lshl_b32 s2, s3, s5 ; GFX10-NEXT: s_lshr_b32 s3, s1, 16 ; GFX10-NEXT: s_and_b32 s1, s1, 0xffff -; GFX10-NEXT: s_lshr_b32 s4, s5, 16 -; GFX10-NEXT: s_lshr_b32 s1, s1, s5 -; GFX10-NEXT: s_lshr_b32 s3, s3, s4 +; GFX10-NEXT: s_lshr_b32 s5, s4, 16 +; GFX10-NEXT: s_lshr_b32 s1, s1, s4 +; GFX10-NEXT: s_lshr_b32 s3, s3, s5 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s3 ; GFX10-NEXT: s_or_b32 s0, s0, s1 @@ -3252,10 +3230,9 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_xor_b32_e32 v3, -1, v2 -; GFX10-NEXT: s_mov_b32 s4, 0xf000f ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1] -; GFX10-NEXT: v_and_b32_e32 v2, s4, v2 -; GFX10-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX10-NEXT: v_and_b32_e32 v2, 0xf000f, v2 +; GFX10-NEXT: v_and_b32_e32 v3, 0xf000f, v3 ; GFX10-NEXT: v_pk_lshrrev_b16 v1, v2, v1 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, v3, v0 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 @@ -3326,35 +3303,34 @@ ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX6-NEXT: s_mov_b32 s5, 0xf0001 ; GFX6-NEXT: s_bfe_u32 s4, 1, 0x100000 -; GFX6-NEXT: s_bfe_u32 s6, s2, s5 -; GFX6-NEXT: s_bfe_u32 s7, 14, 0x100000 +; GFX6-NEXT: s_bfe_u32 s5, s2, 0xf0001 +; GFX6-NEXT: s_bfe_u32 s6, 14, 0x100000 ; GFX6-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX6-NEXT: s_lshl_b32 s0, s0, s4 -; GFX6-NEXT: s_lshr_b32 s6, s6, s7 +; GFX6-NEXT: s_lshr_b32 s5, s5, s6 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX6-NEXT: v_and_b32_e32 v2, 15, v0 ; GFX6-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX6-NEXT: s_or_b32 s0, s0, s6 +; GFX6-NEXT: s_or_b32 s0, s0, s5 ; GFX6-NEXT: s_lshl_b32 s2, s2, 1 ; GFX6-NEXT: v_and_b32_e32 v0, 15, v0 ; GFX6-NEXT: v_bfe_u32 v2, v2, 0, 16 ; GFX6-NEXT: v_lshl_b32_e32 v2, s0, v2 -; GFX6-NEXT: s_bfe_u32 s0, s2, s5 +; GFX6-NEXT: s_bfe_u32 s0, s2, 0xf0001 ; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16 ; GFX6-NEXT: v_lshr_b32_e32 v0, s0, v0 ; GFX6-NEXT: s_lshl_b32 s1, s1, s4 -; GFX6-NEXT: s_bfe_u32 s4, s3, s5 +; GFX6-NEXT: s_bfe_u32 s4, s3, 0xf0001 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX6-NEXT: v_and_b32_e32 v2, 15, v1 ; GFX6-NEXT: v_xor_b32_e32 v1, -1, v1 -; GFX6-NEXT: s_lshr_b32 s4, s4, s7 +; GFX6-NEXT: s_lshr_b32 s4, s4, s6 ; GFX6-NEXT: s_lshl_b32 s3, s3, 1 ; GFX6-NEXT: v_and_b32_e32 v1, 15, v1 ; GFX6-NEXT: s_or_b32 s1, s1, s4 ; GFX6-NEXT: v_bfe_u32 v2, v2, 0, 16 -; GFX6-NEXT: s_bfe_u32 s0, s3, s5 +; GFX6-NEXT: s_bfe_u32 s0, s3, 0xf0001 ; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16 ; GFX6-NEXT: v_lshl_b32_e32 v2, s1, v2 ; GFX6-NEXT: v_lshr_b32_e32 v1, s0, v1 @@ -3421,12 +3397,11 @@ ; GFX10-LABEL: v_fshr_v2i16_ssv: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_xor_b32_e32 v1, -1, v0 -; GFX10-NEXT: s_mov_b32 s2, 0xf000f -; GFX10-NEXT: s_lshr_b32 s3, s0, 16 -; GFX10-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX10-NEXT: s_lshr_b32 s2, s0, 16 +; GFX10-NEXT: v_and_b32_e32 v0, 0xf000f, v0 ; GFX10-NEXT: s_lshl_b32 s0, s0, 0x10001 -; GFX10-NEXT: v_and_b32_e32 v1, s2, v1 -; GFX10-NEXT: s_lshl_b32 s2, s3, 1 +; GFX10-NEXT: s_lshl_b32 s2, s2, 1 +; GFX10-NEXT: v_and_b32_e32 v1, 0xf000f, v1 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 ; GFX10-NEXT: v_pk_lshrrev_b16 v0, v0, s1 ; GFX10-NEXT: v_pk_lshlrev_b16 v1, v1, s0 @@ -3514,36 +3489,34 @@ ; ; GFX9-LABEL: v_fshr_v2i16_svs: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s2, 0xf000f -; GFX9-NEXT: s_and_b32 s3, s1, s2 -; GFX9-NEXT: s_andn2_b32 s1, s2, s1 -; GFX9-NEXT: s_lshr_b32 s2, s0, 16 +; GFX9-NEXT: s_lshr_b32 s3, s0, 16 ; GFX9-NEXT: s_lshl_b32 s0, s0, 0x10001 -; GFX9-NEXT: s_lshl_b32 s2, s2, 1 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 -; GFX9-NEXT: s_lshr_b32 s2, s0, 16 +; GFX9-NEXT: s_lshl_b32 s3, s3, 1 +; GFX9-NEXT: s_and_b32 s2, s1, 0xf000f +; GFX9-NEXT: s_andn2_b32 s1, 0xf000f, s1 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s3 +; GFX9-NEXT: s_lshr_b32 s3, s0, 16 ; GFX9-NEXT: s_lshr_b32 s4, s1, 16 ; GFX9-NEXT: s_lshl_b32 s0, s0, s1 -; GFX9-NEXT: s_lshl_b32 s1, s2, s4 +; GFX9-NEXT: s_lshl_b32 s1, s3, s4 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s1 -; GFX9-NEXT: v_pk_lshrrev_b16 v0, s3, v0 +; GFX9-NEXT: v_pk_lshrrev_b16 v0, s2, v0 ; GFX9-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: v_fshr_v2i16_svs: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_lshr_b32 s3, s0, 16 -; GFX10-NEXT: s_mov_b32 s2, 0xf000f +; GFX10-NEXT: s_lshr_b32 s2, s0, 16 ; GFX10-NEXT: s_lshl_b32 s0, s0, 0x10001 -; GFX10-NEXT: s_lshl_b32 s3, s3, 1 -; GFX10-NEXT: s_and_b32 s4, s1, s2 -; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s3 -; GFX10-NEXT: s_andn2_b32 s1, s2, s1 +; GFX10-NEXT: s_lshl_b32 s2, s2, 1 +; GFX10-NEXT: s_and_b32 s3, s1, 0xf000f +; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX10-NEXT: s_andn2_b32 s1, 0xf000f, s1 ; GFX10-NEXT: s_lshr_b32 s2, s0, 16 -; GFX10-NEXT: s_lshr_b32 s3, s1, 16 -; GFX10-NEXT: v_pk_lshrrev_b16 v0, s4, v0 +; GFX10-NEXT: s_lshr_b32 s4, s1, 16 +; GFX10-NEXT: v_pk_lshrrev_b16 v0, s3, v0 ; GFX10-NEXT: s_lshl_b32 s0, s0, s1 -; GFX10-NEXT: s_lshl_b32 s1, s2, s3 +; GFX10-NEXT: s_lshl_b32 s1, s2, s4 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s1 ; GFX10-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX10-NEXT: ; return to shader part epilog @@ -3559,25 +3532,24 @@ ; GFX6-NEXT: s_and_b32 s2, s2, 0xffff ; GFX6-NEXT: s_or_b32 s2, s3, s2 ; GFX6-NEXT: s_bfe_u32 s3, 1, 0x100000 -; GFX6-NEXT: s_mov_b32 s4, 0xf0001 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, s3, v0 -; GFX6-NEXT: s_bfe_u32 s5, s0, s4 -; GFX6-NEXT: s_bfe_u32 s6, 14, 0x100000 +; GFX6-NEXT: s_bfe_u32 s4, s0, 0xf0001 +; GFX6-NEXT: s_bfe_u32 s5, 14, 0x100000 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, s3, v1 -; GFX6-NEXT: s_bfe_u32 s3, s1, s4 -; GFX6-NEXT: s_lshr_b32 s5, s5, s6 -; GFX6-NEXT: s_lshr_b32 s3, s3, s6 +; GFX6-NEXT: s_bfe_u32 s3, s1, 0xf0001 +; GFX6-NEXT: s_lshr_b32 s4, s4, s5 +; GFX6-NEXT: s_lshr_b32 s3, s3, s5 ; GFX6-NEXT: s_xor_b32 s2, s2, -1 -; GFX6-NEXT: v_or_b32_e32 v0, s5, v0 +; GFX6-NEXT: v_or_b32_e32 v0, s4, v0 ; GFX6-NEXT: v_or_b32_e32 v1, s3, v1 ; GFX6-NEXT: s_lshl_b32 s0, s0, 1 ; GFX6-NEXT: s_lshr_b32 s3, s2, 16 -; GFX6-NEXT: s_and_b32 s5, s2, 15 +; GFX6-NEXT: s_and_b32 s4, s2, 15 ; GFX6-NEXT: s_andn2_b32 s2, 15, s2 -; GFX6-NEXT: s_bfe_u32 s5, s5, 0x100000 -; GFX6-NEXT: s_bfe_u32 s0, s0, s4 +; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX6-NEXT: s_bfe_u32 s0, s0, 0xf0001 ; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s5, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX6-NEXT: s_lshr_b32 s0, s0, s2 ; GFX6-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX6-NEXT: s_and_b32 s0, s3, 15 @@ -3585,7 +3557,7 @@ ; GFX6-NEXT: s_andn2_b32 s2, 15, s3 ; GFX6-NEXT: s_bfe_u32 s0, s0, 0x100000 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, s0, v1 -; GFX6-NEXT: s_bfe_u32 s0, s1, s4 +; GFX6-NEXT: s_bfe_u32 s0, s1, 0xf0001 ; GFX6-NEXT: s_bfe_u32 s1, s2, 0x100000 ; GFX6-NEXT: s_lshr_b32 s0, s0, s1 ; GFX6-NEXT: v_or_b32_e32 v1, s0, v1 @@ -3635,32 +3607,30 @@ ; ; GFX9-LABEL: v_fshr_v2i16_vss: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s2, 0xf000f -; GFX9-NEXT: s_and_b32 s3, s1, s2 -; GFX9-NEXT: s_andn2_b32 s1, s2, s1 +; GFX9-NEXT: s_and_b32 s2, s1, 0xf000f +; GFX9-NEXT: s_andn2_b32 s1, 0xf000f, s1 ; GFX9-NEXT: v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_lshlrev_b16 v0, s1, v0 ; GFX9-NEXT: s_lshr_b32 s1, s0, 16 ; GFX9-NEXT: s_and_b32 s0, s0, 0xffff -; GFX9-NEXT: s_lshr_b32 s2, s3, 16 -; GFX9-NEXT: s_lshr_b32 s0, s0, s3 -; GFX9-NEXT: s_lshr_b32 s1, s1, s2 +; GFX9-NEXT: s_lshr_b32 s3, s2, 16 +; GFX9-NEXT: s_lshr_b32 s0, s0, s2 +; GFX9-NEXT: s_lshr_b32 s1, s1, s3 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s1 ; GFX9-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: v_fshr_v2i16_vss: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s2, 0xf000f ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1] -; GFX10-NEXT: s_and_b32 s3, s1, s2 -; GFX10-NEXT: s_andn2_b32 s1, s2, s1 -; GFX10-NEXT: s_lshr_b32 s2, s0, 16 +; GFX10-NEXT: s_and_b32 s2, s1, 0xf000f +; GFX10-NEXT: s_andn2_b32 s1, 0xf000f, s1 +; GFX10-NEXT: s_lshr_b32 s3, s0, 16 ; GFX10-NEXT: s_and_b32 s0, s0, 0xffff -; GFX10-NEXT: s_lshr_b32 s4, s3, 16 +; GFX10-NEXT: s_lshr_b32 s4, s2, 16 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, s1, v0 -; GFX10-NEXT: s_lshr_b32 s0, s0, s3 -; GFX10-NEXT: s_lshr_b32 s1, s2, s4 +; GFX10-NEXT: s_lshr_b32 s0, s0, s2 +; GFX10-NEXT: s_lshr_b32 s1, s3, s4 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s1 ; GFX10-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX10-NEXT: ; return to shader part epilog @@ -3686,41 +3656,39 @@ define amdgpu_ps <2 x i32> @s_fshr_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %rhs, <4 x i16> inreg %amt) { ; GFX6-LABEL: s_fshr_v4i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s12, 0xffff ; GFX6-NEXT: s_lshl_b32 s9, s9, 16 -; GFX6-NEXT: s_and_b32 s8, s8, s12 +; GFX6-NEXT: s_and_b32 s8, s8, 0xffff ; GFX6-NEXT: s_or_b32 s8, s9, s8 ; GFX6-NEXT: s_lshl_b32 s9, s11, 16 -; GFX6-NEXT: s_and_b32 s10, s10, s12 -; GFX6-NEXT: s_mov_b32 s11, 0xf0001 +; GFX6-NEXT: s_and_b32 s10, s10, 0xffff ; GFX6-NEXT: s_or_b32 s9, s9, s10 ; GFX6-NEXT: s_bfe_u32 s10, 1, 0x100000 -; GFX6-NEXT: s_bfe_u32 s12, s4, s11 -; GFX6-NEXT: s_bfe_u32 s13, 14, 0x100000 +; GFX6-NEXT: s_bfe_u32 s11, s4, 0xf0001 +; GFX6-NEXT: s_bfe_u32 s12, 14, 0x100000 ; GFX6-NEXT: s_lshl_b32 s0, s0, s10 -; GFX6-NEXT: s_lshr_b32 s12, s12, s13 -; GFX6-NEXT: s_or_b32 s0, s0, s12 -; GFX6-NEXT: s_bfe_u32 s12, s5, s11 +; GFX6-NEXT: s_lshr_b32 s11, s11, s12 +; GFX6-NEXT: s_or_b32 s0, s0, s11 +; GFX6-NEXT: s_bfe_u32 s11, s5, 0xf0001 ; GFX6-NEXT: s_lshl_b32 s1, s1, s10 -; GFX6-NEXT: s_lshr_b32 s12, s12, s13 +; GFX6-NEXT: s_lshr_b32 s11, s11, s12 ; GFX6-NEXT: s_xor_b32 s8, s8, -1 -; GFX6-NEXT: s_or_b32 s1, s1, s12 +; GFX6-NEXT: s_or_b32 s1, s1, s11 ; GFX6-NEXT: s_lshl_b32 s4, s4, 1 -; GFX6-NEXT: s_lshr_b32 s12, s8, 16 -; GFX6-NEXT: s_and_b32 s14, s8, 15 +; GFX6-NEXT: s_lshr_b32 s11, s8, 16 +; GFX6-NEXT: s_and_b32 s13, s8, 15 ; GFX6-NEXT: s_andn2_b32 s8, 15, s8 -; GFX6-NEXT: s_bfe_u32 s14, s14, 0x100000 -; GFX6-NEXT: s_bfe_u32 s4, s4, s11 +; GFX6-NEXT: s_bfe_u32 s13, s13, 0x100000 +; GFX6-NEXT: s_bfe_u32 s4, s4, 0xf0001 ; GFX6-NEXT: s_bfe_u32 s8, s8, 0x100000 -; GFX6-NEXT: s_lshl_b32 s0, s0, s14 +; GFX6-NEXT: s_lshl_b32 s0, s0, s13 ; GFX6-NEXT: s_lshr_b32 s4, s4, s8 ; GFX6-NEXT: s_or_b32 s0, s0, s4 -; GFX6-NEXT: s_and_b32 s4, s12, 15 +; GFX6-NEXT: s_and_b32 s4, s11, 15 ; GFX6-NEXT: s_lshl_b32 s5, s5, 1 -; GFX6-NEXT: s_andn2_b32 s8, 15, s12 +; GFX6-NEXT: s_andn2_b32 s8, 15, s11 ; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000 ; GFX6-NEXT: s_lshl_b32 s1, s1, s4 -; GFX6-NEXT: s_bfe_u32 s4, s5, s11 +; GFX6-NEXT: s_bfe_u32 s4, s5, 0xf0001 ; GFX6-NEXT: s_bfe_u32 s5, s8, 0x100000 ; GFX6-NEXT: s_lshr_b32 s4, s4, s5 ; GFX6-NEXT: s_or_b32 s1, s1, s4 @@ -3729,12 +3697,12 @@ ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: s_lshl_b32 s1, s2, s10 -; GFX6-NEXT: s_bfe_u32 s2, s6, s11 -; GFX6-NEXT: s_lshr_b32 s2, s2, s13 +; GFX6-NEXT: s_bfe_u32 s2, s6, 0xf0001 +; GFX6-NEXT: s_lshr_b32 s2, s2, s12 ; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: s_lshl_b32 s2, s3, s10 -; GFX6-NEXT: s_bfe_u32 s3, s7, s11 -; GFX6-NEXT: s_lshr_b32 s3, s3, s13 +; GFX6-NEXT: s_bfe_u32 s3, s7, 0xf0001 +; GFX6-NEXT: s_lshr_b32 s3, s3, s12 ; GFX6-NEXT: s_xor_b32 s5, s9, -1 ; GFX6-NEXT: s_or_b32 s2, s2, s3 ; GFX6-NEXT: s_lshl_b32 s3, s6, 1 @@ -3743,7 +3711,7 @@ ; GFX6-NEXT: s_and_b32 s7, s5, 15 ; GFX6-NEXT: s_andn2_b32 s5, 15, s5 ; GFX6-NEXT: s_bfe_u32 s7, s7, 0x100000 -; GFX6-NEXT: s_bfe_u32 s3, s3, s11 +; GFX6-NEXT: s_bfe_u32 s3, s3, 0xf0001 ; GFX6-NEXT: s_bfe_u32 s5, s5, 0x100000 ; GFX6-NEXT: s_lshl_b32 s1, s1, s7 ; GFX6-NEXT: s_lshr_b32 s3, s3, s5 @@ -3752,7 +3720,7 @@ ; GFX6-NEXT: s_andn2_b32 s5, 15, s6 ; GFX6-NEXT: s_bfe_u32 s3, s3, 0x100000 ; GFX6-NEXT: s_lshl_b32 s2, s2, s3 -; GFX6-NEXT: s_bfe_u32 s3, s4, s11 +; GFX6-NEXT: s_bfe_u32 s3, s4, 0xf0001 ; GFX6-NEXT: s_bfe_u32 s4, s5, 0x100000 ; GFX6-NEXT: s_lshr_b32 s3, s3, s4 ; GFX6-NEXT: s_or_b32 s2, s2, s3 @@ -3840,31 +3808,28 @@ ; ; GFX9-LABEL: s_fshr_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s8, 0x10001 -; GFX9-NEXT: s_lshr_b32 s9, s0, 16 -; GFX9-NEXT: s_mov_b32 s6, 0xf000f -; GFX9-NEXT: s_lshl_b32 s0, s0, s8 -; GFX9-NEXT: s_lshl_b32 s9, s9, 1 -; GFX9-NEXT: s_and_b32 s7, s4, s6 -; GFX9-NEXT: s_andn2_b32 s4, s6, s4 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s9 -; GFX9-NEXT: s_lshr_b32 s9, s0, 16 -; GFX9-NEXT: s_lshr_b32 s10, s4, 16 +; GFX9-NEXT: s_lshr_b32 s7, s0, 16 +; GFX9-NEXT: s_lshl_b32 s0, s0, 0x10001 +; GFX9-NEXT: s_lshl_b32 s7, s7, 1 +; GFX9-NEXT: s_and_b32 s6, s4, 0xf000f +; GFX9-NEXT: s_andn2_b32 s4, 0xf000f, s4 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GFX9-NEXT: s_lshr_b32 s7, s0, 16 +; GFX9-NEXT: s_lshr_b32 s8, s4, 16 ; GFX9-NEXT: s_lshl_b32 s0, s0, s4 -; GFX9-NEXT: s_lshl_b32 s4, s9, s10 -; GFX9-NEXT: s_mov_b32 s9, 0xffff +; GFX9-NEXT: s_lshl_b32 s4, s7, s8 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s4 ; GFX9-NEXT: s_lshr_b32 s4, s2, 16 -; GFX9-NEXT: s_and_b32 s2, s2, s9 -; GFX9-NEXT: s_lshr_b32 s10, s7, 16 -; GFX9-NEXT: s_lshr_b32 s2, s2, s7 -; GFX9-NEXT: s_lshr_b32 s4, s4, s10 +; GFX9-NEXT: s_and_b32 s2, s2, 0xffff +; GFX9-NEXT: s_lshr_b32 s7, s6, 16 +; GFX9-NEXT: s_lshr_b32 s2, s2, s6 +; GFX9-NEXT: s_lshr_b32 s4, s4, s7 ; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s4 ; GFX9-NEXT: s_or_b32 s0, s0, s2 -; GFX9-NEXT: s_and_b32 s2, s5, s6 -; GFX9-NEXT: s_andn2_b32 s4, s6, s5 +; GFX9-NEXT: s_and_b32 s2, s5, 0xf000f +; GFX9-NEXT: s_andn2_b32 s4, 0xf000f, s5 ; GFX9-NEXT: s_lshr_b32 s5, s1, 16 -; GFX9-NEXT: s_lshl_b32 s1, s1, s8 +; GFX9-NEXT: s_lshl_b32 s1, s1, 0x10001 ; GFX9-NEXT: s_lshl_b32 s5, s5, 1 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s5 ; GFX9-NEXT: s_lshr_b32 s5, s1, 16 @@ -3873,7 +3838,7 @@ ; GFX9-NEXT: s_lshl_b32 s4, s5, s6 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s4 ; GFX9-NEXT: s_lshr_b32 s4, s3, 16 -; GFX9-NEXT: s_and_b32 s3, s3, s9 +; GFX9-NEXT: s_and_b32 s3, s3, 0xffff ; GFX9-NEXT: s_lshr_b32 s5, s2, 16 ; GFX9-NEXT: s_lshr_b32 s2, s3, s2 ; GFX9-NEXT: s_lshr_b32 s3, s4, s5 @@ -3883,41 +3848,38 @@ ; ; GFX10-LABEL: s_fshr_v4i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s7, 0x10001 -; GFX10-NEXT: s_lshr_b32 s8, s0, 16 -; GFX10-NEXT: s_mov_b32 s6, 0xf000f -; GFX10-NEXT: s_lshl_b32 s0, s0, s7 -; GFX10-NEXT: s_lshl_b32 s8, s8, 1 -; GFX10-NEXT: s_and_b32 s9, s4, s6 -; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s8 -; GFX10-NEXT: s_andn2_b32 s4, s6, s4 -; GFX10-NEXT: s_lshr_b32 s8, s0, 16 -; GFX10-NEXT: s_lshr_b32 s10, s4, 16 +; GFX10-NEXT: s_lshr_b32 s6, s0, 16 +; GFX10-NEXT: s_lshl_b32 s0, s0, 0x10001 +; GFX10-NEXT: s_lshl_b32 s6, s6, 1 +; GFX10-NEXT: s_and_b32 s7, s4, 0xf000f +; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s6 +; GFX10-NEXT: s_andn2_b32 s4, 0xf000f, s4 +; GFX10-NEXT: s_lshr_b32 s6, s0, 16 +; GFX10-NEXT: s_lshr_b32 s8, s4, 16 ; GFX10-NEXT: s_lshl_b32 s0, s0, s4 -; GFX10-NEXT: s_lshl_b32 s4, s8, s10 -; GFX10-NEXT: s_mov_b32 s8, 0xffff +; GFX10-NEXT: s_lshl_b32 s4, s6, s8 +; GFX10-NEXT: s_lshr_b32 s6, s2, 16 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s4 ; GFX10-NEXT: s_lshr_b32 s4, s1, 16 -; GFX10-NEXT: s_lshl_b32 s1, s1, s7 +; GFX10-NEXT: s_and_b32 s2, s2, 0xffff +; GFX10-NEXT: s_lshr_b32 s8, s7, 16 +; GFX10-NEXT: s_lshl_b32 s1, s1, 0x10001 ; GFX10-NEXT: s_lshl_b32 s4, s4, 1 -; GFX10-NEXT: s_and_b32 s7, s5, s6 +; GFX10-NEXT: s_lshr_b32 s2, s2, s7 +; GFX10-NEXT: s_lshr_b32 s6, s6, s8 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s4 -; GFX10-NEXT: s_andn2_b32 s4, s6, s5 +; GFX10-NEXT: s_andn2_b32 s4, 0xf000f, s5 +; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s6 +; GFX10-NEXT: s_and_b32 s6, s5, 0xf000f ; GFX10-NEXT: s_lshr_b32 s5, s1, 16 -; GFX10-NEXT: s_lshr_b32 s6, s4, 16 -; GFX10-NEXT: s_lshr_b32 s10, s2, 16 -; GFX10-NEXT: s_and_b32 s2, s2, s8 -; GFX10-NEXT: s_lshr_b32 s11, s9, 16 +; GFX10-NEXT: s_lshr_b32 s7, s4, 16 ; GFX10-NEXT: s_lshl_b32 s1, s1, s4 -; GFX10-NEXT: s_lshl_b32 s4, s5, s6 +; GFX10-NEXT: s_lshl_b32 s4, s5, s7 ; GFX10-NEXT: s_lshr_b32 s5, s3, 16 -; GFX10-NEXT: s_and_b32 s3, s3, s8 -; GFX10-NEXT: s_lshr_b32 s6, s7, 16 -; GFX10-NEXT: s_lshr_b32 s2, s2, s9 -; GFX10-NEXT: s_lshr_b32 s9, s10, s11 -; GFX10-NEXT: s_lshr_b32 s3, s3, s7 -; GFX10-NEXT: s_lshr_b32 s5, s5, s6 -; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s9 +; GFX10-NEXT: s_and_b32 s3, s3, 0xffff +; GFX10-NEXT: s_lshr_b32 s7, s6, 16 +; GFX10-NEXT: s_lshr_b32 s3, s3, s6 +; GFX10-NEXT: s_lshr_b32 s5, s5, s7 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s4 ; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s5 ; GFX10-NEXT: s_or_b32 s0, s0, s2 @@ -4090,16 +4052,15 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_xor_b32_e32 v6, -1, v4 ; GFX10-NEXT: v_xor_b32_e32 v7, -1, v5 -; GFX10-NEXT: s_mov_b32 s4, 0xf000f +; GFX10-NEXT: v_and_b32_e32 v4, 0xf000f, v4 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1] -; GFX10-NEXT: v_and_b32_e32 v4, s4, v4 -; GFX10-NEXT: v_and_b32_e32 v6, s4, v6 -; GFX10-NEXT: v_and_b32_e32 v5, s4, v5 +; GFX10-NEXT: v_and_b32_e32 v5, 0xf000f, v5 +; GFX10-NEXT: v_and_b32_e32 v6, 0xf000f, v6 ; GFX10-NEXT: v_pk_lshlrev_b16 v1, 1, v1 op_sel_hi:[0,1] -; GFX10-NEXT: v_and_b32_e32 v7, s4, v7 +; GFX10-NEXT: v_and_b32_e32 v7, 0xf000f, v7 ; GFX10-NEXT: v_pk_lshrrev_b16 v2, v4, v2 -; GFX10-NEXT: v_pk_lshlrev_b16 v0, v6, v0 ; GFX10-NEXT: v_pk_lshrrev_b16 v3, v5, v3 +; GFX10-NEXT: v_pk_lshlrev_b16 v0, v6, v0 ; GFX10-NEXT: v_pk_lshlrev_b16 v1, v7, v1 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX10-NEXT: v_or_b32_e32 v1, v1, v3 @@ -5008,35 +4969,34 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_xor_b32_e32 v9, -1, v8 -; GFX10-NEXT: s_movk_i32 s4, 0x7f ; GFX10-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] ; GFX10-NEXT: v_lshrrev_b32_e32 v10, 31, v1 -; GFX10-NEXT: v_and_b32_e32 v19, s4, v8 -; GFX10-NEXT: v_and_b32_e32 v18, s4, v9 +; GFX10-NEXT: v_and_b32_e32 v19, 0x7f, v8 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] +; GFX10-NEXT: v_and_b32_e32 v18, 0x7f, v9 ; GFX10-NEXT: v_or_b32_e32 v2, v2, v10 ; GFX10-NEXT: v_sub_nc_u32_e32 v16, 64, v19 +; GFX10-NEXT: v_subrev_nc_u32_e32 v21, 64, v19 ; GFX10-NEXT: v_sub_nc_u32_e32 v10, 64, v18 -; GFX10-NEXT: v_subrev_nc_u32_e32 v21, 64, v18 -; GFX10-NEXT: v_subrev_nc_u32_e32 v20, 64, v19 +; GFX10-NEXT: v_subrev_nc_u32_e32 v20, 64, v18 ; GFX10-NEXT: v_lshlrev_b64 v[8:9], v18, v[2:3] ; GFX10-NEXT: v_lshrrev_b64 v[12:13], v19, v[4:5] -; GFX10-NEXT: v_lshrrev_b64 v[10:11], v10, v[0:1] ; GFX10-NEXT: v_lshlrev_b64 v[16:17], v16, v[6:7] +; GFX10-NEXT: v_lshrrev_b64 v[10:11], v10, v[0:1] ; GFX10-NEXT: v_lshlrev_b64 v[14:15], v18, v[0:1] -; GFX10-NEXT: v_lshlrev_b64 v[0:1], v21, v[0:1] +; GFX10-NEXT: v_lshlrev_b64 v[0:1], v20, v[0:1] ; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v18 ; GFX10-NEXT: v_cmp_gt_u32_e64 s4, 64, v19 +; GFX10-NEXT: v_or_b32_e32 v12, v12, v16 ; GFX10-NEXT: v_or_b32_e32 v10, v10, v8 ; GFX10-NEXT: v_or_b32_e32 v11, v11, v9 -; GFX10-NEXT: v_lshrrev_b64 v[8:9], v20, v[6:7] -; GFX10-NEXT: v_or_b32_e32 v12, v12, v16 +; GFX10-NEXT: v_lshrrev_b64 v[8:9], v21, v[6:7] ; GFX10-NEXT: v_or_b32_e32 v13, v13, v17 +; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 0, v19 ; GFX10-NEXT: v_cndmask_b32_e32 v10, v0, v10, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v11, v1, v11, vcc_lo ; GFX10-NEXT: v_lshrrev_b64 v[0:1], v19, v[6:7] ; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v12, s4 -; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 0, v19 ; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v18 ; GFX10-NEXT: v_cndmask_b32_e64 v6, v9, v13, s4 ; GFX10-NEXT: v_cndmask_b32_e32 v14, 0, v14, vcc_lo @@ -5219,36 +5179,35 @@ ; GFX10-LABEL: v_fshr_i128_ssv: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_xor_b32_e32 v1, -1, v0 -; GFX10-NEXT: s_movk_i32 s10, 0x7f +; GFX10-NEXT: v_and_b32_e32 v13, 0x7f, v0 ; GFX10-NEXT: s_mov_b32 s9, 0 -; GFX10-NEXT: v_and_b32_e32 v13, s10, v0 ; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 -; GFX10-NEXT: v_and_b32_e32 v12, s10, v1 ; GFX10-NEXT: s_lshr_b32 s8, s1, 31 -; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 +; GFX10-NEXT: v_and_b32_e32 v12, 0x7f, v1 ; GFX10-NEXT: v_sub_nc_u32_e32 v8, 64, v13 +; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 ; GFX10-NEXT: s_or_b64 s[8:9], s[2:3], s[8:9] +; GFX10-NEXT: v_subrev_nc_u32_e32 v14, 64, v13 ; GFX10-NEXT: v_sub_nc_u32_e32 v2, 64, v12 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v12, s[8:9] ; GFX10-NEXT: v_subrev_nc_u32_e32 v10, 64, v12 -; GFX10-NEXT: v_subrev_nc_u32_e32 v14, 64, v13 ; GFX10-NEXT: v_lshrrev_b64 v[4:5], v13, s[4:5] -; GFX10-NEXT: v_lshrrev_b64 v[2:3], v2, s[0:1] ; GFX10-NEXT: v_lshlrev_b64 v[8:9], v8, s[6:7] -; GFX10-NEXT: v_lshlrev_b64 v[10:11], v10, s[0:1] +; GFX10-NEXT: v_lshrrev_b64 v[2:3], v2, s[0:1] ; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v12 +; GFX10-NEXT: v_lshlrev_b64 v[10:11], v10, s[0:1] ; GFX10-NEXT: v_lshlrev_b64 v[6:7], v12, s[0:1] ; GFX10-NEXT: v_cmp_gt_u32_e64 s0, 64, v13 +; GFX10-NEXT: v_or_b32_e32 v4, v4, v8 ; GFX10-NEXT: v_or_b32_e32 v2, v2, v0 ; GFX10-NEXT: v_or_b32_e32 v3, v3, v1 ; GFX10-NEXT: v_lshrrev_b64 v[0:1], v14, s[6:7] -; GFX10-NEXT: v_or_b32_e32 v4, v4, v8 ; GFX10-NEXT: v_or_b32_e32 v5, v5, v9 +; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 0, v13 ; GFX10-NEXT: v_cndmask_b32_e32 v8, v10, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v10, v11, v3, vcc_lo ; GFX10-NEXT: v_lshrrev_b64 v[2:3], v13, s[6:7] ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 0, v13 ; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v12 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v5, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc_lo @@ -6464,11 +6423,10 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_xor_b32_e32 v17, -1, v16 -; GFX10-NEXT: s_movk_i32 s5, 0x7f ; GFX10-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] -; GFX10-NEXT: v_and_b32_e32 v26, s5, v16 +; GFX10-NEXT: v_and_b32_e32 v26, 0x7f, v16 ; GFX10-NEXT: v_lshlrev_b64 v[6:7], 1, v[6:7] -; GFX10-NEXT: v_and_b32_e32 v25, s5, v17 +; GFX10-NEXT: v_and_b32_e32 v25, 0x7f, v17 ; GFX10-NEXT: v_lshrrev_b32_e32 v17, 31, v1 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] ; GFX10-NEXT: v_subrev_nc_u32_e32 v27, 64, v26 @@ -6502,14 +6460,14 @@ ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v17, s4 ; GFX10-NEXT: v_lshrrev_b64 v[2:3], v26, v[10:11] ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo -; GFX10-NEXT: v_and_b32_e32 v25, s5, v16 +; GFX10-NEXT: v_and_b32_e32 v25, 0x7f, v16 ; GFX10-NEXT: v_lshrrev_b32_e32 v8, 31, v5 ; GFX10-NEXT: v_lshlrev_b64 v[4:5], 1, v[4:5] ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo ; GFX10-NEXT: v_or_b32_e32 v0, v23, v0 ; GFX10-NEXT: v_sub_nc_u32_e32 v9, 64, v25 ; GFX10-NEXT: v_or_b32_e32 v6, v6, v8 -; GFX10-NEXT: v_and_b32_e32 v23, s5, v20 +; GFX10-NEXT: v_and_b32_e32 v23, 0x7f, v20 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, v2, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v26, 0, v3, s4 ; GFX10-NEXT: v_lshrrev_b64 v[8:9], v9, v[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll @@ -9,14 +9,13 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NEXT: s_and_b32 s1, s5, 1 -; GFX9-NEXT: s_mov_b32 s2, 0xffff ; GFX9-NEXT: s_lshl_b32 s1, s1, 4 -; GFX9-NEXT: s_and_b32 s3, s4, s2 -; GFX9-NEXT: s_lshl_b32 s3, s3, s1 -; GFX9-NEXT: s_lshl_b32 s1, s2, s1 +; GFX9-NEXT: s_and_b32 s2, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s2, s2, s1 +; GFX9-NEXT: s_lshl_b32 s1, 0xffff, s1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_andn2_b32 s0, s0, s1 -; GFX9-NEXT: s_or_b32 s0, s0, s3 +; GFX9-NEXT: s_or_b32 s0, s0, s2 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 @@ -27,14 +26,13 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX8-NEXT: s_and_b32 s1, s5, 1 -; GFX8-NEXT: s_mov_b32 s2, 0xffff ; GFX8-NEXT: s_lshl_b32 s1, s1, 4 -; GFX8-NEXT: s_and_b32 s3, s4, s2 -; GFX8-NEXT: s_lshl_b32 s3, s3, s1 -; GFX8-NEXT: s_lshl_b32 s1, s2, s1 +; GFX8-NEXT: s_and_b32 s2, s4, 0xffff +; GFX8-NEXT: s_lshl_b32 s2, s2, s1 +; GFX8-NEXT: s_lshl_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_andn2_b32 s0, s0, s1 -; GFX8-NEXT: s_or_b32 s0, s0, s3 +; GFX8-NEXT: s_or_b32 s0, s0, s2 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 @@ -45,14 +43,13 @@ ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX7-NEXT: s_and_b32 s1, s5, 1 -; GFX7-NEXT: s_mov_b32 s2, 0xffff ; GFX7-NEXT: s_lshl_b32 s1, s1, 4 -; GFX7-NEXT: s_and_b32 s3, s4, s2 -; GFX7-NEXT: s_lshl_b32 s3, s3, s1 -; GFX7-NEXT: s_lshl_b32 s1, s2, s1 +; GFX7-NEXT: s_and_b32 s2, s4, 0xffff +; GFX7-NEXT: s_lshl_b32 s2, s2, s1 +; GFX7-NEXT: s_lshl_b32 s1, 0xffff, s1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_andn2_b32 s0, s0, s1 -; GFX7-NEXT: s_or_b32 s0, s0, s3 +; GFX7-NEXT: s_or_b32 s0, s0, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, 0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 @@ -63,15 +60,14 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX10-NEXT: s_and_b32 s1, s5, 1 -; GFX10-NEXT: s_mov_b32 s2, 0xffff +; GFX10-NEXT: s_and_b32 s2, s4, 0xffff ; GFX10-NEXT: s_lshl_b32 s1, s1, 4 -; GFX10-NEXT: s_and_b32 s3, s4, s2 -; GFX10-NEXT: s_lshl_b32 s2, s2, s1 -; GFX10-NEXT: s_lshl_b32 s1, s3, s1 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_lshl_b32 s3, 0xffff, s1 +; GFX10-NEXT: s_lshl_b32 s1, s2, s1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_andn2_b32 s0, s0, s2 +; GFX10-NEXT: s_andn2_b32 s0, s0, s3 ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s0 ; GFX10-NEXT: global_store_dword v[0:1], v2, off @@ -87,13 +83,12 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dword v2, v[0:1], off ; GFX9-NEXT: s_and_b32 s0, s3, 1 -; GFX9-NEXT: s_mov_b32 s1, 0xffff ; GFX9-NEXT: s_lshl_b32 s0, s0, 4 -; GFX9-NEXT: s_and_b32 s2, s2, s1 -; GFX9-NEXT: s_lshl_b32 s2, s2, s0 -; GFX9-NEXT: s_lshl_b32 s0, s1, s0 +; GFX9-NEXT: s_and_b32 s1, s2, 0xffff +; GFX9-NEXT: s_lshl_b32 s1, s1, s0 +; GFX9-NEXT: s_lshl_b32 s0, 0xffff, s0 ; GFX9-NEXT: s_not_b32 s0, s0 -; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -104,36 +99,34 @@ ; GFX8-LABEL: insertelement_v_v2i16_s_s: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_and_b32 s1, s3, 1 -; GFX8-NEXT: s_mov_b32 s0, 0xffff -; GFX8-NEXT: s_lshl_b32 s1, s1, 4 -; GFX8-NEXT: s_and_b32 s2, s2, s0 -; GFX8-NEXT: s_lshl_b32 s0, s0, s1 +; GFX8-NEXT: s_and_b32 s0, s3, 1 +; GFX8-NEXT: s_and_b32 s1, s2, 0xffff +; GFX8-NEXT: s_lshl_b32 s0, s0, 4 +; GFX8-NEXT: s_lshl_b32 s1, s1, s0 +; GFX8-NEXT: s_lshl_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_not_b32 s0, s0 -; GFX8-NEXT: s_lshl_b32 s2, s2, s1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v2, s0, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: v_or_b32_e32 v2, s2, v2 +; GFX8-NEXT: v_or_b32_e32 v2, s1, v2 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX7-LABEL: insertelement_v_v2i16_s_s: ; GFX7: ; %bb.0: ; GFX7-NEXT: flat_load_dword v0, v[0:1] -; GFX7-NEXT: s_and_b32 s1, s3, 1 -; GFX7-NEXT: s_mov_b32 s0, 0xffff -; GFX7-NEXT: s_lshl_b32 s1, s1, 4 -; GFX7-NEXT: s_and_b32 s2, s2, s0 -; GFX7-NEXT: s_lshl_b32 s0, s0, s1 +; GFX7-NEXT: s_and_b32 s0, s3, 1 +; GFX7-NEXT: s_and_b32 s1, s2, 0xffff +; GFX7-NEXT: s_lshl_b32 s0, s0, 4 +; GFX7-NEXT: s_lshl_b32 s1, s1, s0 +; GFX7-NEXT: s_lshl_b32 s0, 0xffff, s0 ; GFX7-NEXT: s_not_b32 s0, s0 -; GFX7-NEXT: s_lshl_b32 s2, s2, s1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v2, s0, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, 0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: v_or_b32_e32 v2, s2, v2 +; GFX7-NEXT: v_or_b32_e32 v2, s1, v2 ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -141,13 +134,12 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_dword v2, v[0:1], off ; GFX10-NEXT: s_and_b32 s0, s3, 1 -; GFX10-NEXT: s_mov_b32 s1, 0xffff +; GFX10-NEXT: s_and_b32 s1, s2, 0xffff ; GFX10-NEXT: s_lshl_b32 s0, s0, 4 -; GFX10-NEXT: s_and_b32 s2, s2, s1 -; GFX10-NEXT: s_lshl_b32 s1, s1, s0 -; GFX10-NEXT: s_lshl_b32 s0, s2, s0 -; GFX10-NEXT: s_not_b32 s1, s1 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_lshl_b32 s2, 0xffff, s0 +; GFX10-NEXT: s_lshl_b32 s0, s1, s0 +; GFX10-NEXT: s_not_b32 s1, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_and_or_b32 v2, v2, s1, s0 @@ -165,9 +157,8 @@ ; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NEXT: s_and_b32 s1, s4, 1 ; GFX9-NEXT: s_lshl_b32 s1, s1, 4 -; GFX9-NEXT: s_mov_b32 s2, 0xffff -; GFX9-NEXT: v_and_b32_e32 v2, s2, v0 -; GFX9-NEXT: s_lshl_b32 s2, s2, s1 +; GFX9-NEXT: s_lshl_b32 s2, 0xffff, s1 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_andn2_b32 s0, s0, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s0 @@ -182,9 +173,8 @@ ; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX8-NEXT: s_and_b32 s1, s4, 1 ; GFX8-NEXT: s_lshl_b32 s1, s1, 4 -; GFX8-NEXT: s_mov_b32 s2, 0xffff ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: s_lshl_b32 s1, s2, s1 +; GFX8-NEXT: s_lshl_b32 s1, 0xffff, s1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_andn2_b32 s0, s0, s1 @@ -198,11 +188,10 @@ ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX7-NEXT: s_and_b32 s1, s4, 1 -; GFX7-NEXT: s_mov_b32 s2, 0xffff ; GFX7-NEXT: s_lshl_b32 s1, s1, 4 -; GFX7-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, s1, v0 -; GFX7-NEXT: s_lshl_b32 s1, s2, s1 +; GFX7-NEXT: s_lshl_b32 s1, 0xffff, s1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_andn2_b32 s0, s0, s1 ; GFX7-NEXT: v_mov_b32_e32 v0, 0 @@ -215,11 +204,10 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX10-NEXT: s_and_b32 s1, s4, 1 -; GFX10-NEXT: s_mov_b32 s2, 0xffff +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v0 ; GFX10-NEXT: s_lshl_b32 s1, s1, 4 -; GFX10-NEXT: v_and_b32_e32 v2, s2, v0 -; GFX10-NEXT: s_lshl_b32 s2, s2, s1 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_lshl_b32 s2, 0xffff, s1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_andn2_b32 s0, s0, s2 @@ -237,9 +225,9 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: s_mov_b32 s1, 0xffff ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX9-NEXT: s_and_b32 s2, s4, s1 +; GFX9-NEXT: s_mov_b32 s1, 0xffff +; GFX9-NEXT: s_and_b32 s2, s4, 0xffff ; GFX9-NEXT: v_lshlrev_b32_e64 v2, v0, s2 ; GFX9-NEXT: v_lshlrev_b32_e64 v0, v0, s1 ; GFX9-NEXT: v_xor_b32_e32 v3, -1, v0 @@ -254,9 +242,9 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: s_mov_b32 s1, 0xffff ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX8-NEXT: s_and_b32 s2, s4, s1 +; GFX8-NEXT: s_mov_b32 s1, 0xffff +; GFX8-NEXT: s_and_b32 s2, s4, 0xffff ; GFX8-NEXT: v_lshlrev_b32_e64 v2, v0, s2 ; GFX8-NEXT: v_lshlrev_b32_e64 v0, v0, s1 ; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 @@ -272,11 +260,10 @@ ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX7-NEXT: s_mov_b32 s1, 0xffff ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX7-NEXT: s_and_b32 s2, s4, s1 -; GFX7-NEXT: v_lshl_b32_e32 v2, s2, v0 -; GFX7-NEXT: v_lshl_b32_e32 v0, s1, v0 +; GFX7-NEXT: s_and_b32 s1, s4, 0xffff +; GFX7-NEXT: v_lshl_b32_e32 v2, s1, v0 +; GFX7-NEXT: v_lshl_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v3, s0, v0 @@ -290,10 +277,9 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX10-NEXT: s_mov_b32 s1, 0xffff +; GFX10-NEXT: s_and_b32 s1, s4, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX10-NEXT: v_lshlrev_b32_e64 v1, v0, s1 -; GFX10-NEXT: s_and_b32 s1, s4, s1 +; GFX10-NEXT: v_lshlrev_b32_e64 v1, v0, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e64 v2, v0, s1 ; GFX10-NEXT: v_xor_b32_e32 v3, -1, v1 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 @@ -364,9 +350,8 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX10-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX10-NEXT: s_mov_b32 s1, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 4, v1 -; GFX10-NEXT: v_lshlrev_b32_e64 v2, v1, s1 +; GFX10-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -386,9 +371,9 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dword v3, v[0:1], off ; GFX9-NEXT: v_and_b32_e32 v0, 1, v2 -; GFX9-NEXT: s_mov_b32 s0, 0xffff ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX9-NEXT: s_and_b32 s1, s2, s0 +; GFX9-NEXT: s_mov_b32 s0, 0xffff +; GFX9-NEXT: s_and_b32 s1, s2, 0xffff ; GFX9-NEXT: v_lshlrev_b32_e64 v2, v0, s1 ; GFX9-NEXT: v_lshlrev_b32_e64 v0, v0, s0 ; GFX9-NEXT: v_xor_b32_e32 v4, -1, v0 @@ -402,9 +387,9 @@ ; GFX8-LABEL: insertelement_v_v2i16_s_v: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_mov_b32 s0, 0xffff ; GFX8-NEXT: v_and_b32_e32 v1, 1, v2 -; GFX8-NEXT: s_and_b32 s1, s2, s0 +; GFX8-NEXT: s_mov_b32 s0, 0xffff +; GFX8-NEXT: s_and_b32 s1, s2, 0xffff ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX8-NEXT: v_lshlrev_b32_e64 v2, v1, s1 ; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s0 @@ -420,12 +405,11 @@ ; GFX7-LABEL: insertelement_v_v2i16_s_v: ; GFX7: ; %bb.0: ; GFX7-NEXT: flat_load_dword v0, v[0:1] -; GFX7-NEXT: s_mov_b32 s0, 0xffff ; GFX7-NEXT: v_and_b32_e32 v1, 1, v2 -; GFX7-NEXT: s_and_b32 s1, s2, s0 +; GFX7-NEXT: s_and_b32 s0, s2, 0xffff ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 -; GFX7-NEXT: v_lshl_b32_e32 v2, s1, v1 -; GFX7-NEXT: v_lshl_b32_e32 v1, s0, v1 +; GFX7-NEXT: v_lshl_b32_e32 v2, s0, v1 +; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1 ; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v3, v0, v1 @@ -439,10 +423,9 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: v_and_b32_e32 v0, 1, v2 -; GFX10-NEXT: s_mov_b32 s0, 0xffff +; GFX10-NEXT: s_and_b32 s0, s2, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX10-NEXT: v_lshlrev_b32_e64 v1, v0, s0 -; GFX10-NEXT: s_and_b32 s0, s2, s0 +; GFX10-NEXT: v_lshlrev_b32_e64 v1, v0, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e64 v2, v0, s0 ; GFX10-NEXT: v_xor_b32_e32 v4, -1, v1 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 @@ -463,9 +446,8 @@ ; GFX9-NEXT: global_load_dword v3, v[0:1], off ; GFX9-NEXT: s_and_b32 s0, s2, 1 ; GFX9-NEXT: s_lshl_b32 s0, s0, 4 -; GFX9-NEXT: s_mov_b32 s1, 0xffff ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX9-NEXT: s_lshl_b32 s0, s1, s0 +; GFX9-NEXT: s_lshl_b32 s0, 0xffff, s0 ; GFX9-NEXT: s_not_b32 s0, s0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 @@ -477,11 +459,10 @@ ; GFX8-LABEL: insertelement_v_v2i16_v_s: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_and_b32 s1, s2, 1 -; GFX8-NEXT: s_mov_b32 s0, 0xffff -; GFX8-NEXT: s_lshl_b32 s1, s1, 4 -; GFX8-NEXT: s_lshl_b32 s0, s0, s1 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_and_b32 s0, s2, 1 +; GFX8-NEXT: s_lshl_b32 s0, s0, 4 +; GFX8-NEXT: v_mov_b32_e32 v1, s0 +; GFX8-NEXT: s_lshl_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_not_b32 s0, s0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -495,13 +476,12 @@ ; GFX7-LABEL: insertelement_v_v2i16_v_s: ; GFX7: ; %bb.0: ; GFX7-NEXT: flat_load_dword v0, v[0:1] -; GFX7-NEXT: s_and_b32 s1, s2, 1 -; GFX7-NEXT: s_mov_b32 s0, 0xffff -; GFX7-NEXT: s_lshl_b32 s1, s1, 4 -; GFX7-NEXT: v_and_b32_e32 v1, s0, v2 -; GFX7-NEXT: s_lshl_b32 s0, s0, s1 +; GFX7-NEXT: s_and_b32 s0, s2, 1 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX7-NEXT: s_lshl_b32 s0, s0, 4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, s0, v1 +; GFX7-NEXT: s_lshl_b32 s0, 0xffff, s0 ; GFX7-NEXT: s_not_b32 s0, s0 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, s1, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v3, s0, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, 0 @@ -514,12 +494,11 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: s_and_b32 s0, s2, 1 -; GFX10-NEXT: s_mov_b32 s1, 0xffff -; GFX10-NEXT: s_lshl_b32 s0, s0, 4 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: s_lshl_b32 s0, s1, s0 +; GFX10-NEXT: s_lshl_b32 s0, s0, 4 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: s_lshl_b32 s0, 0xffff, s0 ; GFX10-NEXT: s_not_b32 s0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_and_or_b32 v2, v3, s0, v2 @@ -587,9 +566,8 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_dword v4, v[0:1], off ; GFX10-NEXT: v_and_b32_e32 v0, 1, v3 -; GFX10-NEXT: s_mov_b32 s0, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX10-NEXT: v_lshlrev_b32_e64 v1, v0, s0 +; GFX10-NEXT: v_lshlrev_b32_e64 v1, v0, 0xffff ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_xor_b32_e32 v3, -1, v1 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 @@ -665,22 +643,21 @@ ; GFX9-LABEL: insertelement_v_v4i16_s_s: ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: s_mov_b32 s0, 0xffff -; GFX9-NEXT: s_lshr_b32 s1, s3, 1 -; GFX9-NEXT: s_and_b32 s3, s3, 1 -; GFX9-NEXT: s_and_b32 s2, s2, s0 -; GFX9-NEXT: s_lshl_b32 s3, s3, 4 -; GFX9-NEXT: s_lshl_b32 s2, s2, s3 -; GFX9-NEXT: s_lshl_b32 s0, s0, s3 -; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s1, 1 -; GFX9-NEXT: s_not_b32 s0, s0 +; GFX9-NEXT: s_and_b32 s1, s3, 1 +; GFX9-NEXT: s_lshr_b32 s0, s3, 1 +; GFX9-NEXT: s_and_b32 s2, s2, 0xffff +; GFX9-NEXT: s_lshl_b32 s1, s1, 4 +; GFX9-NEXT: s_lshl_b32 s2, s2, s1 +; GFX9-NEXT: s_lshl_b32 s1, 0xffff, s1 +; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 +; GFX9-NEXT: s_not_b32 s1, s1 ; GFX9-NEXT: v_mov_b32_e32 v4, s2 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc -; GFX9-NEXT: v_and_or_b32 v4, v5, s0, v4 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, 0 +; GFX9-NEXT: v_and_or_b32 v4, v5, s1, v4 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off @@ -689,22 +666,21 @@ ; GFX8-LABEL: insertelement_v_v4i16_s_s: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX8-NEXT: s_lshr_b32 s1, s3, 1 -; GFX8-NEXT: s_and_b32 s3, s3, 1 -; GFX8-NEXT: s_mov_b32 s0, 0xffff -; GFX8-NEXT: s_lshl_b32 s3, s3, 4 -; GFX8-NEXT: s_and_b32 s2, s2, s0 -; GFX8-NEXT: s_lshl_b32 s0, s0, s3 -; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s1, 1 -; GFX8-NEXT: s_not_b32 s0, s0 -; GFX8-NEXT: s_lshl_b32 s2, s2, s3 +; GFX8-NEXT: s_and_b32 s1, s3, 1 +; GFX8-NEXT: s_lshr_b32 s0, s3, 1 +; GFX8-NEXT: s_and_b32 s2, s2, 0xffff +; GFX8-NEXT: s_lshl_b32 s1, s1, 4 +; GFX8-NEXT: s_lshl_b32 s2, s2, s1 +; GFX8-NEXT: s_lshl_b32 s1, 0xffff, s1 +; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 +; GFX8-NEXT: s_not_b32 s1, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v4, s0, v4 +; GFX8-NEXT: v_and_b32_e32 v4, s1, v4 ; GFX8-NEXT: v_or_b32_e32 v4, s2, v4 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, 0 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -713,22 +689,21 @@ ; GFX7-LABEL: insertelement_v_v4i16_s_s: ; GFX7: ; %bb.0: ; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX7-NEXT: s_lshr_b32 s1, s3, 1 -; GFX7-NEXT: s_and_b32 s3, s3, 1 -; GFX7-NEXT: s_mov_b32 s0, 0xffff -; GFX7-NEXT: s_lshl_b32 s3, s3, 4 -; GFX7-NEXT: s_and_b32 s2, s2, s0 -; GFX7-NEXT: s_lshl_b32 s0, s0, s3 -; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s1, 1 -; GFX7-NEXT: s_not_b32 s0, s0 -; GFX7-NEXT: s_lshl_b32 s2, s2, s3 +; GFX7-NEXT: s_and_b32 s1, s3, 1 +; GFX7-NEXT: s_lshr_b32 s0, s3, 1 +; GFX7-NEXT: s_and_b32 s2, s2, 0xffff +; GFX7-NEXT: s_lshl_b32 s1, s1, 4 +; GFX7-NEXT: s_lshl_b32 s2, s2, s1 +; GFX7-NEXT: s_lshl_b32 s1, 0xffff, s1 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 +; GFX7-NEXT: s_not_b32 s1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, 0 ; GFX7-NEXT: v_mov_b32_e32 v3, 0 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v4, s0, v4 +; GFX7-NEXT: v_and_b32_e32 v4, s1, v4 ; GFX7-NEXT: v_or_b32_e32 v4, s2, v4 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, 0 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -737,23 +712,22 @@ ; GFX10-LABEL: insertelement_v_v4i16_s_s: ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX10-NEXT: s_lshr_b32 s1, s3, 1 -; GFX10-NEXT: s_and_b32 s3, s3, 1 -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s1, 1 -; GFX10-NEXT: s_mov_b32 s0, 0xffff -; GFX10-NEXT: s_lshl_b32 s3, s3, 4 -; GFX10-NEXT: s_and_b32 s2, s2, s0 -; GFX10-NEXT: s_lshl_b32 s0, s0, s3 -; GFX10-NEXT: s_lshl_b32 s2, s2, s3 -; GFX10-NEXT: s_not_b32 s0, s0 +; GFX10-NEXT: s_lshr_b32 s0, s3, 1 +; GFX10-NEXT: s_and_b32 s1, s3, 1 +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 1 +; GFX10-NEXT: s_lshl_b32 s1, s1, 4 +; GFX10-NEXT: s_and_b32 s2, s2, 0xffff +; GFX10-NEXT: s_lshl_b32 s3, 0xffff, s1 +; GFX10-NEXT: s_lshl_b32 s1, s2, s1 +; GFX10-NEXT: s_not_b32 s2, s3 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s0, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc_lo -; GFX10-NEXT: v_and_or_b32 v4, v2, s0, s2 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s1, 0 +; GFX10-NEXT: v_and_or_b32 v4, v2, s2, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo ; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX10-NEXT: s_endpgm %vec = load <4 x i16>, <4 x i16> addrspace(1 )* %ptr @@ -768,18 +742,17 @@ ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX9-NEXT: s_lshr_b32 s2, s4, 1 ; GFX9-NEXT: s_cmp_eq_u32 s2, 1 -; GFX9-NEXT: s_mov_b32 s5, 0xffff -; GFX9-NEXT: v_and_b32_e32 v0, s5, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_cselect_b32 s3, s1, s0 ; GFX9-NEXT: s_and_b32 s4, s4, 1 ; GFX9-NEXT: s_lshl_b32 s4, s4, 4 -; GFX9-NEXT: s_lshl_b32 s5, s5, s4 +; GFX9-NEXT: s_lshl_b32 s5, 0xffff, s4 ; GFX9-NEXT: s_andn2_b32 s3, s3, s5 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_lshl_or_b32 v4, v0, s4, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -794,21 +767,20 @@ ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX8-NEXT: s_lshr_b32 s2, s4, 1 ; GFX8-NEXT: s_cmp_eq_u32 s2, 1 -; GFX8-NEXT: s_mov_b32 s5, 0xffff ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_cselect_b32 s3, s1, s0 ; GFX8-NEXT: s_and_b32 s4, s4, 1 ; GFX8-NEXT: s_lshl_b32 s4, s4, 4 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 -; GFX8-NEXT: s_lshl_b32 s4, s5, s4 +; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s4 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: s_andn2_b32 s3, s3, s4 ; GFX8-NEXT: v_or_b32_e32 v4, s3, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s2, 1 ; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc @@ -820,18 +792,17 @@ ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX7-NEXT: s_lshr_b32 s2, s4, 1 ; GFX7-NEXT: s_cmp_eq_u32 s2, 1 -; GFX7-NEXT: s_mov_b32 s5, 0xffff -; GFX7-NEXT: v_and_b32_e32 v0, s5, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_cselect_b32 s3, s1, s0 ; GFX7-NEXT: s_and_b32 s4, s4, 1 ; GFX7-NEXT: s_lshl_b32 s4, s4, 4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX7-NEXT: s_lshl_b32 s4, s5, s4 +; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s4 ; GFX7-NEXT: s_andn2_b32 s3, s3, s4 ; GFX7-NEXT: v_or_b32_e32 v4, s3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX7-NEXT: v_mov_b32_e32 v2, 0 @@ -845,9 +816,8 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX10-NEXT: s_lshr_b32 s2, s4, 1 -; GFX10-NEXT: s_mov_b32 s5, 0xffff +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v0 ; GFX10-NEXT: s_cmp_eq_u32 s2, 1 -; GFX10-NEXT: v_and_b32_e32 v2, s5, v0 ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_cselect_b32 s3, s1, s0 @@ -855,7 +825,7 @@ ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: s_lshl_b32 s4, s4, 4 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: s_lshl_b32 s5, s5, s4 +; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 ; GFX10-NEXT: s_andn2_b32 s3, s3, s5 ; GFX10-NEXT: v_lshl_or_b32 v4, v2, s4, s3 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -877,13 +847,13 @@ ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: s_mov_b32 s2, 0xffff ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX9-NEXT: s_and_b32 s3, s4, s2 +; GFX9-NEXT: s_mov_b32 s2, 0xffff +; GFX9-NEXT: s_and_b32 s3, s4, 0xffff ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_lshlrev_b32_e64 v3, v0, s3 ; GFX9-NEXT: v_lshlrev_b32_e64 v0, v0, s2 @@ -904,13 +874,13 @@ ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: s_mov_b32 s2, 0xffff ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s0 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX8-NEXT: s_and_b32 s3, s4, s2 +; GFX8-NEXT: s_mov_b32 s2, 0xffff +; GFX8-NEXT: s_and_b32 s3, s4, 0xffff ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX8-NEXT: v_lshlrev_b32_e64 v3, v0, s3 ; GFX8-NEXT: v_lshlrev_b32_e64 v0, v0, s2 @@ -932,16 +902,15 @@ ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 1, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX7-NEXT: s_mov_b32 s2, 0xffff ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s0 ; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX7-NEXT: s_and_b32 s3, s4, s2 +; GFX7-NEXT: s_and_b32 s2, s4, 0xffff ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX7-NEXT: v_lshl_b32_e32 v3, s3, v0 -; GFX7-NEXT: v_lshl_b32_e32 v0, s2, v0 +; GFX7-NEXT: v_lshl_b32_e32 v3, s2, v0 +; GFX7-NEXT: v_lshl_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX7-NEXT: v_and_b32_e32 v0, v1, v0 ; GFX7-NEXT: v_or_b32_e32 v4, v0, v3 @@ -960,12 +929,11 @@ ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX10-NEXT: v_and_b32_e32 v1, 1, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 1, v0 -; GFX10-NEXT: s_mov_b32 s2, 0xffff -; GFX10-NEXT: s_and_b32 s3, s4, s2 +; GFX10-NEXT: s_and_b32 s2, s4, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 -; GFX10-NEXT: v_lshlrev_b32_e64 v2, v1, s2 -; GFX10-NEXT: v_lshlrev_b32_e64 v3, v1, s3 +; GFX10-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff +; GFX10-NEXT: v_lshlrev_b32_e64 v3, v1, s2 ; GFX10-NEXT: v_xor_b32_e32 v2, -1, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s1 @@ -1073,10 +1041,9 @@ ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX10-NEXT: v_and_b32_e32 v2, 1, v1 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 1, v1 -; GFX10-NEXT: s_mov_b32 s2, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 4, v2 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 -; GFX10-NEXT: v_lshlrev_b32_e64 v3, v2, s2 +; GFX10-NEXT: v_lshlrev_b32_e64 v3, v2, 0xffff ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_xor_b32_e32 v3, -1, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1102,10 +1069,10 @@ ; GFX9-LABEL: insertelement_v_v4i16_s_v: ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: s_mov_b32 s0, 0xffff ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 1, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX9-NEXT: s_and_b32 s1, s2, s0 +; GFX9-NEXT: s_mov_b32 s0, 0xffff +; GFX9-NEXT: s_and_b32 s1, s2, 0xffff ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 4, v2 ; GFX9-NEXT: v_lshlrev_b32_e64 v6, v2, s1 ; GFX9-NEXT: v_lshlrev_b32_e64 v2, v2, s0 @@ -1125,10 +1092,10 @@ ; GFX8-LABEL: insertelement_v_v4i16_s_v: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX8-NEXT: s_mov_b32 s0, 0xffff ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 1, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX8-NEXT: s_and_b32 s1, s2, s0 +; GFX8-NEXT: s_mov_b32 s0, 0xffff +; GFX8-NEXT: s_and_b32 s1, s2, 0xffff ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 4, v2 ; GFX8-NEXT: v_lshlrev_b32_e64 v6, v2, s1 ; GFX8-NEXT: v_lshlrev_b32_e64 v2, v2, s0 @@ -1149,13 +1116,12 @@ ; GFX7-LABEL: insertelement_v_v4i16_s_v: ; GFX7: ; %bb.0: ; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX7-NEXT: s_mov_b32 s0, 0xffff ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 1, v2 ; GFX7-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX7-NEXT: s_and_b32 s1, s2, s0 +; GFX7-NEXT: s_and_b32 s0, s2, 0xffff ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 4, v2 -; GFX7-NEXT: v_lshl_b32_e32 v6, s1, v2 -; GFX7-NEXT: v_lshl_b32_e32 v2, s0, v2 +; GFX7-NEXT: v_lshl_b32_e32 v6, s0, v2 +; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v2 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 ; GFX7-NEXT: v_xor_b32_e32 v2, -1, v2 ; GFX7-NEXT: v_mov_b32_e32 v3, 0 @@ -1175,11 +1141,10 @@ ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX10-NEXT: v_and_b32_e32 v3, 1, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 1, v2 -; GFX10-NEXT: s_mov_b32 s0, 0xffff +; GFX10-NEXT: s_and_b32 s0, s2, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 4, v3 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5 -; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX10-NEXT: s_and_b32 s0, s2, s0 +; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e64 v2, v3, s0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v5 ; GFX10-NEXT: v_xor_b32_e32 v3, -1, v4 @@ -1202,20 +1167,19 @@ ; GFX9-LABEL: insertelement_v_v4i16_v_s: ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: s_lshr_b32 s1, s2, 1 -; GFX9-NEXT: s_and_b32 s2, s2, 1 -; GFX9-NEXT: s_mov_b32 s0, 0xffff -; GFX9-NEXT: s_lshl_b32 s2, s2, 4 -; GFX9-NEXT: s_lshl_b32 s0, s0, s2 -; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s1, 1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX9-NEXT: s_not_b32 s0, s0 +; GFX9-NEXT: s_and_b32 s1, s2, 1 +; GFX9-NEXT: s_lshr_b32 s0, s2, 1 +; GFX9-NEXT: s_lshl_b32 s1, s1, 4 +; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX9-NEXT: s_lshl_b32 s1, 0xffff, s1 +; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 +; GFX9-NEXT: s_not_b32 s1, s1 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc -; GFX9-NEXT: v_and_or_b32 v2, v5, s0, v2 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, 0 +; GFX9-NEXT: v_and_or_b32 v2, v5, s1, v2 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX9-NEXT: global_store_dwordx2 v[3:4], v[0:1], off @@ -1224,22 +1188,21 @@ ; GFX8-LABEL: insertelement_v_v4i16_v_s: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX8-NEXT: s_lshr_b32 s1, s2, 1 -; GFX8-NEXT: s_and_b32 s2, s2, 1 -; GFX8-NEXT: s_mov_b32 s0, 0xffff -; GFX8-NEXT: s_lshl_b32 s2, s2, 4 -; GFX8-NEXT: v_mov_b32_e32 v5, s2 -; GFX8-NEXT: s_lshl_b32 s0, s0, s2 -; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s1, 1 +; GFX8-NEXT: s_and_b32 s1, s2, 1 +; GFX8-NEXT: s_lshr_b32 s0, s2, 1 +; GFX8-NEXT: s_lshl_b32 s1, s1, 4 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: s_lshl_b32 s1, 0xffff, s1 +; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: s_not_b32 s0, s0 +; GFX8-NEXT: s_not_b32 s1, s1 ; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: v_mov_b32_e32 v4, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v5, s0, v5 +; GFX8-NEXT: v_and_b32_e32 v5, s1, v5 ; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, 0 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX8-NEXT: flat_store_dwordx2 v[3:4], v[0:1] @@ -1248,22 +1211,21 @@ ; GFX7-LABEL: insertelement_v_v4i16_v_s: ; GFX7: ; %bb.0: ; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX7-NEXT: s_lshr_b32 s1, s2, 1 -; GFX7-NEXT: s_and_b32 s2, s2, 1 -; GFX7-NEXT: s_mov_b32 s0, 0xffff -; GFX7-NEXT: s_lshl_b32 s2, s2, 4 -; GFX7-NEXT: v_and_b32_e32 v2, s0, v2 -; GFX7-NEXT: s_lshl_b32 s0, s0, s2 -; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s1, 1 -; GFX7-NEXT: s_not_b32 s0, s0 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, s2, v2 +; GFX7-NEXT: s_and_b32 s1, s2, 1 +; GFX7-NEXT: s_lshr_b32 s0, s2, 1 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX7-NEXT: s_lshl_b32 s1, s1, 4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, s1, v2 +; GFX7-NEXT: s_lshl_b32 s1, 0xffff, s1 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 +; GFX7-NEXT: s_not_b32 s1, s1 ; GFX7-NEXT: v_mov_b32_e32 v3, 0 ; GFX7-NEXT: v_mov_b32_e32 v4, 0 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v5, s0, v5 +; GFX7-NEXT: v_and_b32_e32 v5, s1, v5 ; GFX7-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, 0 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX7-NEXT: flat_store_dwordx2 v[3:4], v[0:1] @@ -1272,18 +1234,17 @@ ; GFX10-LABEL: insertelement_v_v4i16_v_s: ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX10-NEXT: s_lshr_b32 s1, s2, 1 ; GFX10-NEXT: s_and_b32 s0, s2, 1 -; GFX10-NEXT: s_lshr_b32 s2, s2, 1 +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s1, 1 ; GFX10-NEXT: s_lshl_b32 s0, s0, 4 -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 1 -; GFX10-NEXT: s_mov_b32 s1, 0xffff ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: s_lshl_b32 s0, s1, s0 +; GFX10-NEXT: s_lshl_b32 s0, 0xffff, s0 ; GFX10-NEXT: s_not_b32 s0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc_lo ; GFX10-NEXT: v_and_or_b32 v4, v3, s0, v2 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s2, 0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s1, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo @@ -1371,12 +1332,11 @@ ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX10-NEXT: v_and_b32_e32 v4, 1, v3 ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 1, v3 -; GFX10-NEXT: s_mov_b32 s0, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 4, v4 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 -; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, s0 -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v6 +; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_xor_b32_e32 v3, -1, v5 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc_lo @@ -1399,8 +1359,8 @@ ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX9-NEXT: s_lshr_b32 s6, s5, 1 ; GFX9-NEXT: s_cmp_eq_u32 s6, 1 -; GFX9-NEXT: s_mov_b32 s8, 0xffff ; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_cselect_b32 s7, s1, s0 ; GFX9-NEXT: s_cmp_eq_u32 s6, 2 @@ -1409,9 +1369,9 @@ ; GFX9-NEXT: s_cselect_b32 s7, s3, s7 ; GFX9-NEXT: s_and_b32 s5, s5, 1 ; GFX9-NEXT: s_lshl_b32 s5, s5, 4 -; GFX9-NEXT: s_and_b32 s4, s4, s8 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff ; GFX9-NEXT: s_lshl_b32 s4, s4, s5 -; GFX9-NEXT: s_lshl_b32 s5, s8, s5 +; GFX9-NEXT: s_lshl_b32 s5, 0xffff, s5 ; GFX9-NEXT: s_andn2_b32 s5, s7, s5 ; GFX9-NEXT: s_or_b32 s4, s5, s4 ; GFX9-NEXT: s_cmp_eq_u32 s6, 0 @@ -1423,7 +1383,6 @@ ; GFX9-NEXT: s_cmp_eq_u32 s6, 3 ; GFX9-NEXT: s_cselect_b32 s3, s4, s3 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 @@ -1435,8 +1394,8 @@ ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX8-NEXT: s_lshr_b32 s6, s5, 1 ; GFX8-NEXT: s_cmp_eq_u32 s6, 1 -; GFX8-NEXT: s_mov_b32 s8, 0xffff ; GFX8-NEXT: v_mov_b32_e32 v4, 0 +; GFX8-NEXT: v_mov_b32_e32 v5, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_cselect_b32 s7, s1, s0 ; GFX8-NEXT: s_cmp_eq_u32 s6, 2 @@ -1445,9 +1404,9 @@ ; GFX8-NEXT: s_cselect_b32 s7, s3, s7 ; GFX8-NEXT: s_and_b32 s5, s5, 1 ; GFX8-NEXT: s_lshl_b32 s5, s5, 4 -; GFX8-NEXT: s_and_b32 s4, s4, s8 +; GFX8-NEXT: s_and_b32 s4, s4, 0xffff ; GFX8-NEXT: s_lshl_b32 s4, s4, s5 -; GFX8-NEXT: s_lshl_b32 s5, s8, s5 +; GFX8-NEXT: s_lshl_b32 s5, 0xffff, s5 ; GFX8-NEXT: s_andn2_b32 s5, s7, s5 ; GFX8-NEXT: s_or_b32 s4, s5, s4 ; GFX8-NEXT: s_cmp_eq_u32 s6, 0 @@ -1459,7 +1418,6 @@ ; GFX8-NEXT: s_cmp_eq_u32 s6, 3 ; GFX8-NEXT: s_cselect_b32 s3, s4, s3 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v5, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 @@ -1471,7 +1429,6 @@ ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX7-NEXT: s_lshr_b32 s6, s5, 1 ; GFX7-NEXT: s_cmp_eq_u32 s6, 1 -; GFX7-NEXT: s_mov_b32 s8, 0xffff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_cselect_b32 s7, s1, s0 ; GFX7-NEXT: s_cmp_eq_u32 s6, 2 @@ -1480,9 +1437,9 @@ ; GFX7-NEXT: s_cselect_b32 s7, s3, s7 ; GFX7-NEXT: s_and_b32 s5, s5, 1 ; GFX7-NEXT: s_lshl_b32 s5, s5, 4 -; GFX7-NEXT: s_and_b32 s4, s4, s8 +; GFX7-NEXT: s_and_b32 s4, s4, 0xffff ; GFX7-NEXT: s_lshl_b32 s4, s4, s5 -; GFX7-NEXT: s_lshl_b32 s5, s8, s5 +; GFX7-NEXT: s_lshl_b32 s5, 0xffff, s5 ; GFX7-NEXT: s_andn2_b32 s5, s7, s5 ; GFX7-NEXT: s_or_b32 s4, s5, s4 ; GFX7-NEXT: s_cmp_eq_u32 s6, 0 @@ -1507,9 +1464,8 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX10-NEXT: s_lshr_b32 s6, s5, 1 -; GFX10-NEXT: s_mov_b32 s8, 0xffff -; GFX10-NEXT: s_cmp_eq_u32 s6, 1 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: s_cmp_eq_u32 s6, 1 ; GFX10-NEXT: v_mov_b32_e32 v5, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_cselect_b32 s7, s1, s0 @@ -1518,9 +1474,9 @@ ; GFX10-NEXT: s_cmp_eq_u32 s6, 3 ; GFX10-NEXT: s_cselect_b32 s7, s3, s7 ; GFX10-NEXT: s_and_b32 s5, s5, 1 -; GFX10-NEXT: s_and_b32 s4, s4, s8 +; GFX10-NEXT: s_and_b32 s4, s4, 0xffff ; GFX10-NEXT: s_lshl_b32 s5, s5, 4 -; GFX10-NEXT: s_lshl_b32 s8, s8, s5 +; GFX10-NEXT: s_lshl_b32 s8, 0xffff, s5 ; GFX10-NEXT: s_lshl_b32 s4, s4, s5 ; GFX10-NEXT: s_andn2_b32 s5, s7, s8 ; GFX10-NEXT: s_or_b32 s4, s5, s4 @@ -1548,17 +1504,16 @@ ; GFX9-LABEL: insertelement_v_v8i16_s_s: ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX9-NEXT: s_and_b32 s1, s3, 1 -; GFX9-NEXT: s_mov_b32 s0, 0xffff +; GFX9-NEXT: s_and_b32 s0, s3, 1 ; GFX9-NEXT: s_lshr_b32 s4, s3, 1 -; GFX9-NEXT: s_lshl_b32 s1, s1, 4 -; GFX9-NEXT: s_and_b32 s2, s2, s0 -; GFX9-NEXT: s_lshl_b32 s0, s0, s1 +; GFX9-NEXT: s_and_b32 s1, s2, 0xffff +; GFX9-NEXT: s_lshl_b32 s0, s0, 4 +; GFX9-NEXT: s_lshl_b32 s1, s1, s0 +; GFX9-NEXT: s_lshl_b32 s0, 0xffff, s0 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 -; GFX9-NEXT: s_lshl_b32 s2, s2, s1 ; GFX9-NEXT: s_not_b32 s5, s0 +; GFX9-NEXT: v_mov_b32_e32 v6, s1 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 -; GFX9-NEXT: v_mov_b32_e32 v6, s2 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 @@ -1578,14 +1533,13 @@ ; GFX8-LABEL: insertelement_v_v8i16_s_s: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; GFX8-NEXT: s_and_b32 s1, s3, 1 -; GFX8-NEXT: s_mov_b32 s0, 0xffff +; GFX8-NEXT: s_and_b32 s0, s3, 1 ; GFX8-NEXT: s_lshr_b32 s4, s3, 1 -; GFX8-NEXT: s_lshl_b32 s1, s1, 4 -; GFX8-NEXT: s_and_b32 s2, s2, s0 -; GFX8-NEXT: s_lshl_b32 s0, s0, s1 +; GFX8-NEXT: s_and_b32 s1, s2, 0xffff +; GFX8-NEXT: s_lshl_b32 s0, s0, 4 +; GFX8-NEXT: s_lshl_b32 s5, s1, s0 +; GFX8-NEXT: s_lshl_b32 s0, 0xffff, s0 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 -; GFX8-NEXT: s_lshl_b32 s5, s2, s1 ; GFX8-NEXT: s_not_b32 s6, s0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 @@ -1611,14 +1565,13 @@ ; GFX7-NEXT: s_mov_b32 s11, 0xf000 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_and_b32 s1, s3, 1 -; GFX7-NEXT: s_mov_b32 s0, 0xffff +; GFX7-NEXT: s_and_b32 s0, s3, 1 ; GFX7-NEXT: s_lshr_b32 s4, s3, 1 -; GFX7-NEXT: s_lshl_b32 s1, s1, 4 -; GFX7-NEXT: s_and_b32 s2, s2, s0 -; GFX7-NEXT: s_lshl_b32 s0, s0, s1 +; GFX7-NEXT: s_and_b32 s1, s2, 0xffff +; GFX7-NEXT: s_lshl_b32 s0, s0, 4 +; GFX7-NEXT: s_lshl_b32 s5, s1, s0 +; GFX7-NEXT: s_lshl_b32 s0, 0xffff, s0 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 -; GFX7-NEXT: s_lshl_b32 s5, s2, s1 ; GFX7-NEXT: s_not_b32 s6, s0 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 @@ -1646,9 +1599,8 @@ ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s4, 2 ; GFX10-NEXT: s_lshl_b32 s3, s1, 4 ; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s4, 3 -; GFX10-NEXT: s_mov_b32 s5, 0xffff -; GFX10-NEXT: s_and_b32 s2, s2, s5 -; GFX10-NEXT: s_lshl_b32 s5, s5, s3 +; GFX10-NEXT: s_and_b32 s2, s2, 0xffff +; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s3 ; GFX10-NEXT: s_lshl_b32 s2, s2, s3 ; GFX10-NEXT: s_not_b32 s3, s5 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -1677,8 +1629,8 @@ ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX9-NEXT: s_lshr_b32 s5, s4, 1 ; GFX9-NEXT: s_cmp_eq_u32 s5, 1 -; GFX9-NEXT: s_mov_b32 s7, 0xffff -; GFX9-NEXT: v_and_b32_e32 v0, s7, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s5, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_cselect_b32 s6, s1, s0 ; GFX9-NEXT: s_cmp_eq_u32 s5, 2 @@ -1687,12 +1639,11 @@ ; GFX9-NEXT: s_cselect_b32 s6, s3, s6 ; GFX9-NEXT: s_and_b32 s4, s4, 1 ; GFX9-NEXT: s_lshl_b32 s4, s4, 4 -; GFX9-NEXT: s_lshl_b32 s7, s7, s4 +; GFX9-NEXT: s_lshl_b32 s7, 0xffff, s4 ; GFX9-NEXT: s_andn2_b32 s6, s6, s7 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: v_lshl_or_b32 v6, v0, s4, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s5, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s5, 1 @@ -1713,8 +1664,8 @@ ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX8-NEXT: s_lshr_b32 s5, s4, 1 ; GFX8-NEXT: s_cmp_eq_u32 s5, 1 -; GFX8-NEXT: s_mov_b32 s7, 0xffff ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_cselect_b32 s6, s1, s0 ; GFX8-NEXT: s_cmp_eq_u32 s5, 2 @@ -1724,7 +1675,7 @@ ; GFX8-NEXT: s_and_b32 s4, s4, 1 ; GFX8-NEXT: s_lshl_b32 s4, s4, 4 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 -; GFX8-NEXT: s_lshl_b32 s4, s7, s4 +; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s4 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: s_andn2_b32 s4, s6, s4 ; GFX8-NEXT: v_or_b32_e32 v6, s4, v0 @@ -1737,7 +1688,6 @@ ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s5, 2 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc -; GFX8-NEXT: v_mov_b32_e32 v4, 0 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s5, 3 ; GFX8-NEXT: v_mov_b32_e32 v5, 0 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc @@ -1749,8 +1699,8 @@ ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX7-NEXT: s_lshr_b32 s5, s4, 1 ; GFX7-NEXT: s_cmp_eq_u32 s5, 1 -; GFX7-NEXT: s_mov_b32 s7, 0xffff -; GFX7-NEXT: v_and_b32_e32 v0, s7, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s5, 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_cselect_b32 s6, s1, s0 ; GFX7-NEXT: s_cmp_eq_u32 s5, 2 @@ -1760,11 +1710,10 @@ ; GFX7-NEXT: s_and_b32 s4, s4, 1 ; GFX7-NEXT: s_lshl_b32 s4, s4, 4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX7-NEXT: s_lshl_b32 s4, s7, s4 +; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s4 ; GFX7-NEXT: s_andn2_b32 s4, s6, s4 ; GFX7-NEXT: v_or_b32_e32 v4, s4, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s5, 0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s5, 1 @@ -1785,9 +1734,8 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX10-NEXT: s_lshr_b32 s5, s4, 1 -; GFX10-NEXT: s_mov_b32 s7, 0xffff +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff, v0 ; GFX10-NEXT: s_cmp_eq_u32 s5, 1 -; GFX10-NEXT: v_and_b32_e32 v4, s7, v0 ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s5, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_cselect_b32 s6, s1, s0 @@ -1801,7 +1749,7 @@ ; GFX10-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-NEXT: s_lshl_b32 s4, s4, 4 ; GFX10-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-NEXT: s_lshl_b32 s7, s7, s4 +; GFX10-NEXT: s_lshl_b32 s7, 0xffff, s4 ; GFX10-NEXT: s_andn2_b32 s6, s6, s7 ; GFX10-NEXT: v_lshl_or_b32 v6, v4, s4, s6 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 @@ -1828,15 +1776,15 @@ ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: s_mov_b32 s5, 0xffff +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s8 ; GFX9-NEXT: v_mov_b32_e32 v2, s9 ; GFX9-NEXT: v_mov_b32_e32 v3, s10 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX9-NEXT: s_and_b32 s4, s4, s5 +; GFX9-NEXT: s_mov_b32 s5, 0xffff +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff ; GFX9-NEXT: v_mov_b32_e32 v5, s11 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4 @@ -1865,15 +1813,15 @@ ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 1, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: s_mov_b32 s5, 0xffff +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s8 ; GFX8-NEXT: v_mov_b32_e32 v2, s9 ; GFX8-NEXT: v_mov_b32_e32 v3, s10 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX8-NEXT: s_and_b32 s4, s4, s5 +; GFX8-NEXT: s_mov_b32 s5, 0xffff +; GFX8-NEXT: s_and_b32 s4, s4, 0xffff ; GFX8-NEXT: v_mov_b32_e32 v5, s11 ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4 @@ -1903,20 +1851,19 @@ ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 1, v0 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX7-NEXT: s_mov_b32 s5, 0xffff +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s9 ; GFX7-NEXT: v_mov_b32_e32 v3, s10 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX7-NEXT: s_and_b32 s4, s4, s5 +; GFX7-NEXT: s_and_b32 s4, s4, 0xffff ; GFX7-NEXT: v_mov_b32_e32 v5, s11 ; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] ; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v2, s4, v0 -; GFX7-NEXT: v_lshl_b32_e32 v0, s5, v0 +; GFX7-NEXT: v_lshl_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3] ; GFX7-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX7-NEXT: v_and_b32_e32 v0, v1, v0 @@ -1939,15 +1886,14 @@ ; GFX10-LABEL: insertelement_s_v8i16_s_v: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0 -; GFX10-NEXT: v_and_b32_e32 v1, 1, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 1, v0 -; GFX10-NEXT: s_mov_b32 s0, 0xffff -; GFX10-NEXT: s_and_b32 s1, s4, s0 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 4, v1 +; GFX10-NEXT: v_and_b32_e32 v1, 1, v0 +; GFX10-NEXT: s_and_b32 s1, s4, 0xffff ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 -; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v6 -; GFX10-NEXT: v_lshlrev_b32_e64 v2, v1, s0 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v6 +; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v6 +; GFX10-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v1, s1 ; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v6 ; GFX10-NEXT: v_xor_b32_e32 v5, -1, v2 @@ -2091,15 +2037,15 @@ ; GFX10-LABEL: insertelement_s_v8i16_v_v: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 -; GFX10-NEXT: v_and_b32_e32 v2, 1, v1 ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 1, v1 -; GFX10-NEXT: s_mov_b32 s0, 0xffff -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 4, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 1, v1 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 4, v2 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v6 ; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v6 +; GFX10-NEXT: s_mov_b32 null, 0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v6 -; GFX10-NEXT: v_lshlrev_b32_e64 v3, v2, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v6 +; GFX10-NEXT: v_lshlrev_b32_e64 v3, v2, 0xffff ; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_xor_b32_e32 v5, -1, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2130,10 +2076,10 @@ ; GFX9-LABEL: insertelement_v_v8i16_s_v: ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dwordx4 v[3:6], v[0:1], off -; GFX9-NEXT: s_mov_b32 s0, 0xffff ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 1, v2 ; GFX9-NEXT: v_and_b32_e32 v1, 1, v2 -; GFX9-NEXT: s_and_b32 s1, s2, s0 +; GFX9-NEXT: s_mov_b32 s0, 0xffff +; GFX9-NEXT: s_and_b32 s1, s2, 0xffff ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-NEXT: v_lshlrev_b32_e64 v2, v1, s1 @@ -2159,10 +2105,10 @@ ; GFX8-LABEL: insertelement_v_v8i16_s_v: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dwordx4 v[3:6], v[0:1] -; GFX8-NEXT: s_mov_b32 s0, 0xffff ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 1, v2 ; GFX8-NEXT: v_and_b32_e32 v1, 1, v2 -; GFX8-NEXT: s_and_b32 s1, s2, s0 +; GFX8-NEXT: s_mov_b32 s0, 0xffff +; GFX8-NEXT: s_and_b32 s1, s2, 0xffff ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX8-NEXT: v_lshlrev_b32_e64 v2, v1, s1 @@ -2192,15 +2138,14 @@ ; GFX7-NEXT: s_mov_b32 s11, 0xf000 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b32 s0, 0xffff ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 1, v2 ; GFX7-NEXT: v_and_b32_e32 v1, 1, v2 -; GFX7-NEXT: s_and_b32 s1, s2, s0 +; GFX7-NEXT: s_and_b32 s0, s2, 0xffff ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX7-NEXT: v_lshl_b32_e32 v2, s1, v1 -; GFX7-NEXT: v_lshl_b32_e32 v1, s0, v1 +; GFX7-NEXT: v_lshl_b32_e32 v2, s0, v1 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0 +; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0 ; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 @@ -2221,15 +2166,14 @@ ; GFX10-LABEL: insertelement_v_v8i16_s_v: ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_dwordx4 v[3:6], v[0:1], off -; GFX10-NEXT: v_and_b32_e32 v0, 1, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 1, v2 -; GFX10-NEXT: s_mov_b32 s0, 0xffff -; GFX10-NEXT: s_and_b32 s1, s2, s0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 1, v2 +; GFX10-NEXT: s_and_b32 s1, s2, 0xffff ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v1 -; GFX10-NEXT: v_lshlrev_b32_e64 v7, v0, s0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v1 +; GFX10-NEXT: v_lshlrev_b32_e64 v7, v0, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e64 v0, v0, s1 ; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v1 ; GFX10-NEXT: v_xor_b32_e32 v7, -1, v7 @@ -2256,13 +2200,12 @@ ; GFX9-LABEL: insertelement_v_v8i16_v_s: ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dwordx4 v[3:6], v[0:1], off -; GFX9-NEXT: s_and_b32 s1, s2, 1 -; GFX9-NEXT: s_mov_b32 s0, 0xffff +; GFX9-NEXT: s_and_b32 s0, s2, 1 ; GFX9-NEXT: s_lshr_b32 s4, s2, 1 -; GFX9-NEXT: s_lshl_b32 s1, s1, 4 -; GFX9-NEXT: s_lshl_b32 s0, s0, s1 +; GFX9-NEXT: s_lshl_b32 s0, s0, 4 +; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX9-NEXT: s_lshl_b32 s0, 0xffff, s0 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX9-NEXT: s_not_b32 s5, s0 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 @@ -2284,13 +2227,12 @@ ; GFX8-LABEL: insertelement_v_v8i16_v_s: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dwordx4 v[3:6], v[0:1] -; GFX8-NEXT: s_and_b32 s1, s2, 1 -; GFX8-NEXT: s_mov_b32 s0, 0xffff +; GFX8-NEXT: s_and_b32 s0, s2, 1 ; GFX8-NEXT: s_lshr_b32 s4, s2, 1 -; GFX8-NEXT: s_lshl_b32 s1, s1, 4 -; GFX8-NEXT: s_lshl_b32 s0, s0, s1 +; GFX8-NEXT: s_lshl_b32 s0, s0, 4 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: s_lshl_b32 s0, 0xffff, s0 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 -; GFX8-NEXT: v_mov_b32_e32 v0, s1 ; GFX8-NEXT: s_not_b32 s5, s0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 @@ -2317,14 +2259,13 @@ ; GFX7-NEXT: s_mov_b32 s11, 0xf000 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_and_b32 s1, s2, 1 -; GFX7-NEXT: s_mov_b32 s0, 0xffff +; GFX7-NEXT: s_and_b32 s0, s2, 1 ; GFX7-NEXT: s_lshr_b32 s4, s2, 1 -; GFX7-NEXT: s_lshl_b32 s1, s1, 4 -; GFX7-NEXT: v_and_b32_e32 v0, s0, v2 -; GFX7-NEXT: s_lshl_b32 s0, s0, s1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX7-NEXT: s_lshl_b32 s0, s0, 4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s0, v0 +; GFX7-NEXT: s_lshl_b32 s0, 0xffff, s0 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s1, v0 ; GFX7-NEXT: s_not_b32 s5, s0 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 @@ -2352,9 +2293,8 @@ ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s3, 2 ; GFX10-NEXT: s_lshl_b32 s2, s1, 4 ; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s3, 3 -; GFX10-NEXT: s_mov_b32 s4, 0xffff ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: s_lshl_b32 s2, s4, s2 +; GFX10-NEXT: s_lshl_b32 s2, 0xffff, s2 ; GFX10-NEXT: v_mov_b32_e32 v7, 0 ; GFX10-NEXT: s_not_b32 s2, s2 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 @@ -2469,15 +2409,14 @@ ; GFX10-LABEL: insertelement_v_v8i16_v_v: ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_dwordx4 v[4:7], v[0:1], off -; GFX10-NEXT: v_and_b32_e32 v0, 1, v3 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 1, v3 -; GFX10-NEXT: s_mov_b32 s0, 0xffff -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 1, v3 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v1 ; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v1 ; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v1 -; GFX10-NEXT: v_lshlrev_b32_e64 v8, v0, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v1 +; GFX10-NEXT: v_lshlrev_b32_e64 v8, v0, 0xffff ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_xor_b32_e32 v2, -1, v8 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 @@ -2505,8 +2444,8 @@ ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 ; GFX9-NEXT: s_lshr_b32 s7, s5, 1 ; GFX9-NEXT: s_cmp_eq_u32 s7, 1 -; GFX9-NEXT: s_mov_b32 s2, 0xffff ; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_cselect_b32 s0, s9, s8 ; GFX9-NEXT: s_cmp_eq_u32 s7, 2 @@ -2523,11 +2462,11 @@ ; GFX9-NEXT: s_cselect_b32 s0, s15, s0 ; GFX9-NEXT: s_and_b32 s1, s5, 1 ; GFX9-NEXT: s_lshl_b32 s1, s1, 4 -; GFX9-NEXT: s_and_b32 s3, s4, s2 -; GFX9-NEXT: s_lshl_b32 s3, s3, s1 -; GFX9-NEXT: s_lshl_b32 s1, s2, s1 +; GFX9-NEXT: s_and_b32 s2, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s2, s2, s1 +; GFX9-NEXT: s_lshl_b32 s1, 0xffff, s1 ; GFX9-NEXT: s_andn2_b32 s0, s0, s1 -; GFX9-NEXT: s_or_b32 s16, s0, s3 +; GFX9-NEXT: s_or_b32 s16, s0, s2 ; GFX9-NEXT: s_cmp_eq_u32 s7, 0 ; GFX9-NEXT: s_cselect_b32 s0, s16, s8 ; GFX9-NEXT: s_cmp_eq_u32 s7, 1 @@ -2544,7 +2483,6 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: s_cselect_b32 s6, s16, s14 ; GFX9-NEXT: s_cmp_eq_u32 s7, 7 -; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 @@ -2564,8 +2502,8 @@ ; GFX8-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 ; GFX8-NEXT: s_lshr_b32 s7, s5, 1 ; GFX8-NEXT: s_cmp_eq_u32 s7, 1 -; GFX8-NEXT: s_mov_b32 s2, 0xffff ; GFX8-NEXT: v_mov_b32_e32 v4, 0 +; GFX8-NEXT: v_mov_b32_e32 v5, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_cselect_b32 s0, s9, s8 ; GFX8-NEXT: s_cmp_eq_u32 s7, 2 @@ -2582,11 +2520,11 @@ ; GFX8-NEXT: s_cselect_b32 s0, s15, s0 ; GFX8-NEXT: s_and_b32 s1, s5, 1 ; GFX8-NEXT: s_lshl_b32 s1, s1, 4 -; GFX8-NEXT: s_and_b32 s3, s4, s2 -; GFX8-NEXT: s_lshl_b32 s3, s3, s1 -; GFX8-NEXT: s_lshl_b32 s1, s2, s1 +; GFX8-NEXT: s_and_b32 s2, s4, 0xffff +; GFX8-NEXT: s_lshl_b32 s2, s2, s1 +; GFX8-NEXT: s_lshl_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_andn2_b32 s0, s0, s1 -; GFX8-NEXT: s_or_b32 s16, s0, s3 +; GFX8-NEXT: s_or_b32 s16, s0, s2 ; GFX8-NEXT: s_cmp_eq_u32 s7, 0 ; GFX8-NEXT: s_cselect_b32 s0, s16, s8 ; GFX8-NEXT: s_cmp_eq_u32 s7, 1 @@ -2603,7 +2541,6 @@ ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: s_cselect_b32 s6, s16, s14 ; GFX8-NEXT: s_cmp_eq_u32 s7, 7 -; GFX8-NEXT: v_mov_b32_e32 v5, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 @@ -2623,7 +2560,6 @@ ; GFX7-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 ; GFX7-NEXT: s_lshr_b32 s7, s5, 1 ; GFX7-NEXT: s_cmp_eq_u32 s7, 1 -; GFX7-NEXT: s_mov_b32 s2, 0xffff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_cselect_b32 s0, s9, s8 ; GFX7-NEXT: s_cmp_eq_u32 s7, 2 @@ -2640,11 +2576,11 @@ ; GFX7-NEXT: s_cselect_b32 s0, s15, s0 ; GFX7-NEXT: s_and_b32 s1, s5, 1 ; GFX7-NEXT: s_lshl_b32 s1, s1, 4 -; GFX7-NEXT: s_and_b32 s3, s4, s2 -; GFX7-NEXT: s_lshl_b32 s3, s3, s1 -; GFX7-NEXT: s_lshl_b32 s1, s2, s1 +; GFX7-NEXT: s_and_b32 s2, s4, 0xffff +; GFX7-NEXT: s_lshl_b32 s2, s2, s1 +; GFX7-NEXT: s_lshl_b32 s1, 0xffff, s1 ; GFX7-NEXT: s_andn2_b32 s0, s0, s1 -; GFX7-NEXT: s_or_b32 s16, s0, s3 +; GFX7-NEXT: s_or_b32 s16, s0, s2 ; GFX7-NEXT: s_cmp_eq_u32 s7, 0 ; GFX7-NEXT: s_cselect_b32 s0, s16, s8 ; GFX7-NEXT: s_cmp_eq_u32 s7, 1 @@ -2681,9 +2617,8 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 ; GFX10-NEXT: s_lshr_b32 s7, s5, 1 -; GFX10-NEXT: s_mov_b32 s2, 0xffff -; GFX10-NEXT: s_cmp_eq_u32 s7, 1 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 +; GFX10-NEXT: s_cmp_eq_u32 s7, 1 ; GFX10-NEXT: v_mov_b32_e32 v9, 0 ; GFX10-NEXT: v_mov_b32_e32 v10, 16 ; GFX10-NEXT: v_mov_b32_e32 v11, 0 @@ -2702,11 +2637,11 @@ ; GFX10-NEXT: s_cmp_eq_u32 s7, 7 ; GFX10-NEXT: s_cselect_b32 s0, s15, s0 ; GFX10-NEXT: s_and_b32 s1, s5, 1 -; GFX10-NEXT: s_and_b32 s3, s4, s2 +; GFX10-NEXT: s_and_b32 s2, s4, 0xffff ; GFX10-NEXT: s_lshl_b32 s1, s1, 4 -; GFX10-NEXT: s_lshl_b32 s2, s2, s1 -; GFX10-NEXT: s_lshl_b32 s1, s3, s1 -; GFX10-NEXT: s_andn2_b32 s0, s0, s2 +; GFX10-NEXT: s_lshl_b32 s3, 0xffff, s1 +; GFX10-NEXT: s_lshl_b32 s1, s2, s1 +; GFX10-NEXT: s_andn2_b32 s0, s0, s3 ; GFX10-NEXT: s_or_b32 s16, s0, s1 ; GFX10-NEXT: s_cmp_eq_u32 s7, 0 ; GFX10-NEXT: s_cselect_b32 s0, s16, s8 @@ -2746,17 +2681,16 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off ; GFX9-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16 -; GFX9-NEXT: s_and_b32 s1, s3, 1 -; GFX9-NEXT: s_mov_b32 s0, 0xffff +; GFX9-NEXT: s_and_b32 s0, s3, 1 ; GFX9-NEXT: s_lshr_b32 s12, s3, 1 -; GFX9-NEXT: s_lshl_b32 s1, s1, 4 -; GFX9-NEXT: s_and_b32 s2, s2, s0 -; GFX9-NEXT: s_lshl_b32 s0, s0, s1 +; GFX9-NEXT: s_and_b32 s1, s2, 0xffff +; GFX9-NEXT: s_lshl_b32 s0, s0, 4 +; GFX9-NEXT: s_lshl_b32 s1, s1, s0 +; GFX9-NEXT: s_lshl_b32 s0, 0xffff, s0 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s12, 1 -; GFX9-NEXT: s_lshl_b32 s2, s2, s1 ; GFX9-NEXT: s_not_b32 s13, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s12, 2 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s12, 3 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], s12, 4 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], s12, 5 @@ -2795,14 +2729,13 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dwordx4 v[6:9], v[0:1] -; GFX8-NEXT: s_and_b32 s1, s3, 1 -; GFX8-NEXT: s_mov_b32 s0, 0xffff +; GFX8-NEXT: s_and_b32 s0, s3, 1 ; GFX8-NEXT: s_lshr_b32 s12, s3, 1 -; GFX8-NEXT: s_lshl_b32 s1, s1, 4 -; GFX8-NEXT: s_and_b32 s2, s2, s0 -; GFX8-NEXT: s_lshl_b32 s0, s0, s1 +; GFX8-NEXT: s_and_b32 s1, s2, 0xffff +; GFX8-NEXT: s_lshl_b32 s0, s0, 4 +; GFX8-NEXT: s_lshl_b32 s13, s1, s0 +; GFX8-NEXT: s_lshl_b32 s0, 0xffff, s0 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s12, 1 -; GFX8-NEXT: s_lshl_b32 s13, s2, s1 ; GFX8-NEXT: s_not_b32 s14, s0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s12, 2 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], s12, 3 @@ -2845,14 +2778,13 @@ ; GFX7-NEXT: s_mov_b64 s[16:17], 0 ; GFX7-NEXT: buffer_load_dwordx4 v[2:5], v[0:1], s[16:19], 0 addr64 ; GFX7-NEXT: buffer_load_dwordx4 v[6:9], v[0:1], s[16:19], 0 addr64 offset:16 -; GFX7-NEXT: s_and_b32 s1, s3, 1 -; GFX7-NEXT: s_mov_b32 s0, 0xffff +; GFX7-NEXT: s_and_b32 s0, s3, 1 ; GFX7-NEXT: s_lshr_b32 s12, s3, 1 -; GFX7-NEXT: s_lshl_b32 s1, s1, 4 -; GFX7-NEXT: s_and_b32 s2, s2, s0 -; GFX7-NEXT: s_lshl_b32 s0, s0, s1 +; GFX7-NEXT: s_and_b32 s1, s2, 0xffff +; GFX7-NEXT: s_lshl_b32 s0, s0, 4 +; GFX7-NEXT: s_lshl_b32 s13, s1, s0 +; GFX7-NEXT: s_lshl_b32 s0, 0xffff, s0 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s12, 1 -; GFX7-NEXT: s_lshl_b32 s13, s2, s1 ; GFX7-NEXT: s_not_b32 s14, s0 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s12, 2 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], s12, 3 @@ -2892,22 +2824,21 @@ ; GFX10-NEXT: global_load_dwordx4 v[2:5], v[0:1], off ; GFX10-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16 ; GFX10-NEXT: s_lshr_b32 s7, s3, 1 -; GFX10-NEXT: s_mov_b32 s8, 0xffff +; GFX10-NEXT: s_and_b32 s8, s2, 0xffff ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s7, 1 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s7, 2 ; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s7, 3 ; GFX10-NEXT: v_cmp_eq_u32_e64 s4, s7, 4 ; GFX10-NEXT: v_cmp_eq_u32_e64 s5, s7, 5 ; GFX10-NEXT: v_cmp_eq_u32_e64 s6, s7, 6 -; GFX10-NEXT: s_and_b32 s9, s2, s8 ; GFX10-NEXT: v_cmp_eq_u32_e64 s2, s7, 7 ; GFX10-NEXT: s_and_b32 s3, s3, 1 ; GFX10-NEXT: v_mov_b32_e32 v10, 0 ; GFX10-NEXT: s_lshl_b32 s3, s3, 4 ; GFX10-NEXT: v_mov_b32_e32 v11, 0 -; GFX10-NEXT: s_lshl_b32 s8, s8, s3 -; GFX10-NEXT: s_lshl_b32 s3, s9, s3 -; GFX10-NEXT: s_not_b32 s8, s8 +; GFX10-NEXT: s_lshl_b32 s9, 0xffff, s3 +; GFX10-NEXT: s_lshl_b32 s3, s8, s3 +; GFX10-NEXT: s_not_b32 s8, s9 ; GFX10-NEXT: v_mov_b32_e32 v12, 16 ; GFX10-NEXT: v_mov_b32_e32 v13, 0 ; GFX10-NEXT: s_waitcnt vmcnt(1) @@ -2944,8 +2875,8 @@ ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 ; GFX9-NEXT: s_lshr_b32 s2, s4, 1 ; GFX9-NEXT: s_cmp_eq_u32 s2, 1 -; GFX9-NEXT: s_mov_b32 s3, 0xffff -; GFX9-NEXT: v_and_b32_e32 v0, s3, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_cselect_b32 s0, s9, s8 ; GFX9-NEXT: s_cmp_eq_u32 s2, 2 @@ -2962,12 +2893,11 @@ ; GFX9-NEXT: s_cselect_b32 s0, s15, s0 ; GFX9-NEXT: s_and_b32 s1, s4, 1 ; GFX9-NEXT: s_lshl_b32 s1, s1, 4 -; GFX9-NEXT: s_lshl_b32 s3, s3, s1 +; GFX9-NEXT: s_lshl_b32 s3, 0xffff, s1 ; GFX9-NEXT: s_andn2_b32 s0, s0, s3 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: v_lshl_or_b32 v8, v0, s1, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, s9 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 1 @@ -3003,8 +2933,8 @@ ; GFX8-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 ; GFX8-NEXT: s_lshr_b32 s2, s4, 1 ; GFX8-NEXT: s_cmp_eq_u32 s2, 1 -; GFX8-NEXT: s_mov_b32 s3, 0xffff ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0 +; GFX8-NEXT: v_mov_b32_e32 v10, 16 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_cselect_b32 s0, s9, s8 ; GFX8-NEXT: s_cmp_eq_u32 s2, 2 @@ -3022,7 +2952,7 @@ ; GFX8-NEXT: s_and_b32 s1, s4, 1 ; GFX8-NEXT: s_lshl_b32 s1, s1, 4 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: s_lshl_b32 s1, s3, s1 +; GFX8-NEXT: s_lshl_b32 s1, 0xffff, s1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: s_andn2_b32 s0, s0, s1 ; GFX8-NEXT: v_or_b32_e32 v8, s0, v0 @@ -3051,7 +2981,6 @@ ; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc ; GFX8-NEXT: v_mov_b32_e32 v8, 0 ; GFX8-NEXT: v_mov_b32_e32 v9, 0 -; GFX8-NEXT: v_mov_b32_e32 v10, 16 ; GFX8-NEXT: v_mov_b32_e32 v11, 0 ; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[4:7] @@ -3062,8 +2991,8 @@ ; GFX7-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 ; GFX7-NEXT: s_lshr_b32 s2, s4, 1 ; GFX7-NEXT: s_cmp_eq_u32 s2, 1 -; GFX7-NEXT: s_mov_b32 s3, 0xffff -; GFX7-NEXT: v_and_b32_e32 v0, s3, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_cselect_b32 s0, s9, s8 ; GFX7-NEXT: s_cmp_eq_u32 s2, 2 @@ -3081,11 +3010,10 @@ ; GFX7-NEXT: s_and_b32 s1, s4, 1 ; GFX7-NEXT: s_lshl_b32 s1, s1, 4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, s1, v0 -; GFX7-NEXT: s_lshl_b32 s1, s3, s1 +; GFX7-NEXT: s_lshl_b32 s1, 0xffff, s1 ; GFX7-NEXT: s_andn2_b32 s0, s0, s1 ; GFX7-NEXT: v_or_b32_e32 v8, s0, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s8 -; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0 ; GFX7-NEXT: v_mov_b32_e32 v1, s9 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s2, 1 @@ -3120,9 +3048,8 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 ; GFX10-NEXT: s_lshr_b32 s0, s4, 1 -; GFX10-NEXT: s_mov_b32 s3, 0xffff +; GFX10-NEXT: v_and_b32_e32 v8, 0xffff, v0 ; GFX10-NEXT: s_cmp_eq_u32 s0, 1 -; GFX10-NEXT: v_and_b32_e32 v8, s3, v0 ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 0 ; GFX10-NEXT: v_mov_b32_e32 v10, 16 ; GFX10-NEXT: v_mov_b32_e32 v11, 0 @@ -3150,7 +3077,7 @@ ; GFX10-NEXT: v_mov_b32_e32 v6, s14 ; GFX10-NEXT: s_lshl_b32 s2, s2, 4 ; GFX10-NEXT: v_mov_b32_e32 v7, s15 -; GFX10-NEXT: s_lshl_b32 s3, s3, s2 +; GFX10-NEXT: s_lshl_b32 s3, 0xffff, s2 ; GFX10-NEXT: s_andn2_b32 s1, s1, s3 ; GFX10-NEXT: v_lshl_or_b32 v12, v8, s2, s1 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 @@ -3201,12 +3128,12 @@ ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[14:15] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v8 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: s_mov_b32 s5, 0xffff ; GFX9-NEXT: v_mov_b32_e32 v7, s22 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[6:7] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v8 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX9-NEXT: s_and_b32 s4, s4, s5 +; GFX9-NEXT: s_mov_b32 s5, 0xffff +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff ; GFX9-NEXT: v_mov_b32_e32 v9, s23 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[8:9] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v8 @@ -3261,12 +3188,12 @@ ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[14:15] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v8 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: s_mov_b32 s5, 0xffff ; GFX8-NEXT: v_mov_b32_e32 v7, s22 ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[6:7] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v8 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX8-NEXT: s_and_b32 s4, s4, s5 +; GFX8-NEXT: s_mov_b32 s5, 0xffff +; GFX8-NEXT: s_and_b32 s4, s4, 0xffff ; GFX8-NEXT: v_mov_b32_e32 v9, s23 ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[8:9] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v8 @@ -3322,17 +3249,16 @@ ; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[14:15] ; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v8 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX7-NEXT: s_mov_b32 s5, 0xffff ; GFX7-NEXT: v_mov_b32_e32 v7, s22 ; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[6:7] ; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v8 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX7-NEXT: s_and_b32 s4, s4, s5 +; GFX7-NEXT: s_and_b32 s4, s4, 0xffff ; GFX7-NEXT: v_mov_b32_e32 v9, s23 ; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[8:9] ; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v8 ; GFX7-NEXT: v_lshl_b32_e32 v2, s4, v0 -; GFX7-NEXT: v_lshl_b32_e32 v0, s5, v0 +; GFX7-NEXT: v_lshl_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[10:11] ; GFX7-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX7-NEXT: v_and_b32_e32 v0, v1, v0 @@ -3367,8 +3293,7 @@ ; GFX10-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 ; GFX10-NEXT: v_lshrrev_b32_e32 v12, 1, v0 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX10-NEXT: s_mov_b32 s5, 0xffff -; GFX10-NEXT: s_and_b32 s6, s4, s5 +; GFX10-NEXT: s_and_b32 s5, s4, 0xffff ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v12 ; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v12 @@ -3376,10 +3301,10 @@ ; GFX10-NEXT: v_cmp_eq_u32_e64 s3, 5, v12 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 6, v12 -; GFX10-NEXT: v_lshlrev_b32_e64 v2, v0, s5 -; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 7, v12 -; GFX10-NEXT: v_lshlrev_b32_e64 v8, v0, s6 ; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v12 +; GFX10-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff +; GFX10-NEXT: v_lshlrev_b32_e64 v8, v0, s5 +; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 7, v12 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s9 ; GFX10-NEXT: v_xor_b32_e32 v9, -1, v2 @@ -3606,21 +3531,21 @@ ; GFX10-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 ; GFX10-NEXT: v_lshrrev_b32_e32 v12, 1, v1 ; GFX10-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX10-NEXT: s_mov_b32 s4, 0xffff ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v12 ; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v12 +; GFX10-NEXT: s_mov_b32 null, 0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 4, v12 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX10-NEXT: v_cmp_eq_u32_e64 s3, 5, v12 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 4, v1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 6, v12 ; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 7, v12 ; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v12 -; GFX10-NEXT: v_lshlrev_b32_e64 v3, v1, s4 -; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 6, v12 +; GFX10-NEXT: v_lshlrev_b32_e64 v3, v1, 0xffff ; GFX10-NEXT: v_lshlrev_b32_sdwa v8, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_xor_b32_e32 v9, -1, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, s9 +; GFX10-NEXT: v_xor_b32_e32 v9, -1, v3 ; GFX10-NEXT: v_cndmask_b32_e32 v2, s8, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s10, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s11, s1 @@ -3663,10 +3588,10 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dwordx4 v[3:6], v[0:1], off ; GFX9-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:16 -; GFX9-NEXT: s_mov_b32 s0, 0xffff ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 1, v2 ; GFX9-NEXT: v_and_b32_e32 v1, 1, v2 -; GFX9-NEXT: s_and_b32 s1, s2, s0 +; GFX9-NEXT: s_mov_b32 s0, 0xffff +; GFX9-NEXT: s_and_b32 s1, s2, 0xffff ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-NEXT: v_lshlrev_b32_e64 v2, v1, s1 @@ -3711,10 +3636,10 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dwordx4 v[7:10], v[0:1] -; GFX8-NEXT: s_mov_b32 s0, 0xffff ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 1, v2 ; GFX8-NEXT: v_and_b32_e32 v1, 1, v2 -; GFX8-NEXT: s_and_b32 s1, s2, s0 +; GFX8-NEXT: s_mov_b32 s0, 0xffff +; GFX8-NEXT: s_and_b32 s1, s2, 0xffff ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX8-NEXT: v_lshlrev_b32_e64 v2, v1, s1 @@ -3761,19 +3686,18 @@ ; GFX7-NEXT: s_mov_b64 s[16:17], 0 ; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[16:19], 0 addr64 ; GFX7-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[16:19], 0 addr64 offset:16 -; GFX7-NEXT: s_mov_b32 s0, 0xffff ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 1, v2 ; GFX7-NEXT: v_and_b32_e32 v1, 1, v2 -; GFX7-NEXT: s_and_b32 s1, s2, s0 +; GFX7-NEXT: s_and_b32 s0, s2, 0xffff ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX7-NEXT: v_lshl_b32_e32 v2, s1, v1 -; GFX7-NEXT: v_lshl_b32_e32 v1, s0, v1 +; GFX7-NEXT: v_lshl_b32_e32 v2, s0, v1 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 4, v0 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v0 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v0 +; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v0 ; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v0 @@ -3809,9 +3733,9 @@ ; GFX10-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:16 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 1, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX10-NEXT: s_mov_b32 s5, 0xffff +; GFX10-NEXT: s_and_b32 s5, s2, 0xffff ; GFX10-NEXT: v_mov_b32_e32 v13, 16 -; GFX10-NEXT: s_and_b32 s6, s2, s5 +; GFX10-NEXT: v_mov_b32_e32 v14, 0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v0 @@ -3819,11 +3743,10 @@ ; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 5, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 4, v2 ; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 6, v0 -; GFX10-NEXT: v_mov_b32_e32 v14, 0 -; GFX10-NEXT: v_lshlrev_b32_e64 v11, v2, s5 -; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 7, v0 -; GFX10-NEXT: v_lshlrev_b32_e64 v2, v2, s6 ; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v0 +; GFX10-NEXT: v_lshlrev_b32_e64 v11, v2, 0xffff +; GFX10-NEXT: v_lshlrev_b32_e64 v2, v2, s5 +; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 7, v0 ; GFX10-NEXT: v_xor_b32_e32 v11, -1, v11 ; GFX10-NEXT: s_waitcnt vmcnt(1) ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo @@ -3859,13 +3782,12 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dwordx4 v[3:6], v[0:1], off ; GFX9-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:16 -; GFX9-NEXT: s_and_b32 s1, s2, 1 -; GFX9-NEXT: s_mov_b32 s0, 0xffff +; GFX9-NEXT: s_and_b32 s0, s2, 1 ; GFX9-NEXT: s_lshr_b32 s12, s2, 1 -; GFX9-NEXT: s_lshl_b32 s1, s1, 4 -; GFX9-NEXT: s_lshl_b32 s0, s0, s1 +; GFX9-NEXT: s_lshl_b32 s0, s0, 4 +; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX9-NEXT: s_lshl_b32 s0, 0xffff, s0 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s12, 1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX9-NEXT: s_not_b32 s13, s0 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s12, 2 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s12, 3 @@ -3906,13 +3828,12 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dwordx4 v[7:10], v[0:1] -; GFX8-NEXT: s_and_b32 s1, s2, 1 -; GFX8-NEXT: s_mov_b32 s0, 0xffff +; GFX8-NEXT: s_and_b32 s0, s2, 1 ; GFX8-NEXT: s_lshr_b32 s12, s2, 1 -; GFX8-NEXT: s_lshl_b32 s1, s1, 4 -; GFX8-NEXT: s_lshl_b32 s0, s0, s1 +; GFX8-NEXT: s_lshl_b32 s0, s0, 4 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: s_lshl_b32 s0, 0xffff, s0 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s12, 1 -; GFX8-NEXT: v_mov_b32_e32 v0, s1 ; GFX8-NEXT: s_not_b32 s13, s0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s12, 2 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], s12, 3 @@ -3956,14 +3877,13 @@ ; GFX7-NEXT: s_mov_b64 s[16:17], 0 ; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[16:19], 0 addr64 ; GFX7-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[16:19], 0 addr64 offset:16 -; GFX7-NEXT: s_and_b32 s1, s2, 1 -; GFX7-NEXT: s_mov_b32 s0, 0xffff +; GFX7-NEXT: s_and_b32 s0, s2, 1 ; GFX7-NEXT: s_lshr_b32 s12, s2, 1 -; GFX7-NEXT: s_lshl_b32 s1, s1, 4 -; GFX7-NEXT: v_and_b32_e32 v0, s0, v2 -; GFX7-NEXT: s_lshl_b32 s0, s0, s1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX7-NEXT: s_lshl_b32 s0, s0, 4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s0, v0 +; GFX7-NEXT: s_lshl_b32 s0, 0xffff, s0 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s12, 1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s1, v0 ; GFX7-NEXT: s_not_b32 s13, s0 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s12, 2 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], s12, 3 @@ -4012,9 +3932,8 @@ ; GFX10-NEXT: v_cmp_eq_u32_e64 s2, s6, 6 ; GFX10-NEXT: s_lshl_b32 s7, s5, 4 ; GFX10-NEXT: v_cmp_eq_u32_e64 s5, s6, 7 -; GFX10-NEXT: s_mov_b32 s8, 0xffff ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: s_lshl_b32 s7, s8, s7 +; GFX10-NEXT: s_lshl_b32 s7, 0xffff, s7 ; GFX10-NEXT: v_cmp_eq_u32_e64 s6, s6, 0 ; GFX10-NEXT: s_not_b32 s7, s7 ; GFX10-NEXT: v_mov_b32_e32 v11, 0 @@ -4197,19 +4116,18 @@ ; GFX10-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:16 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 1, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX10-NEXT: s_mov_b32 s4, 0xffff ; GFX10-NEXT: v_mov_b32_e32 v14, 16 ; GFX10-NEXT: v_mov_b32_e32 v15, 0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 4, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 4, v3 ; GFX10-NEXT: v_cmp_eq_u32_e64 s3, 5, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 6, v0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 7, v0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v0 -; GFX10-NEXT: v_lshlrev_b32_e64 v12, v3, s4 -; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 6, v0 +; GFX10-NEXT: v_lshlrev_b32_e64 v12, v3, 0xffff ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_xor_b32_e32 v3, -1, v12 ; GFX10-NEXT: v_mov_b32_e32 v12, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll @@ -722,33 +722,34 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dword v0, v[0:1], off ; GFX9-NEXT: s_mov_b32 s0, 8 -; GFX9-NEXT: s_movk_i32 s4, 0xff ; GFX9-NEXT: s_and_b32 s3, s3, 3 ; GFX9-NEXT: s_mov_b32 s1, 16 -; GFX9-NEXT: s_and_b32 s2, s2, s4 +; GFX9-NEXT: s_movk_i32 s4, 0xff +; GFX9-NEXT: s_and_b32 s2, s2, 0xff ; GFX9-NEXT: s_lshl_b32 s3, s3, 3 ; GFX9-NEXT: s_lshl_b32 s2, s2, s3 -; GFX9-NEXT: s_lshl_b32 s3, s4, s3 +; GFX9-NEXT: s_lshl_b32 s3, 0xff, s3 ; GFX9-NEXT: s_not_b32 s3, s3 -; GFX9-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, 8 -; GFX9-NEXT: v_mov_b32_e32 v2, 16 +; GFX9-NEXT: v_mov_b32_e32 v4, s2 +; GFX9-NEXT: v_mov_b32_e32 v2, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, 0xff +; GFX9-NEXT: v_mov_b32_e32 v3, 16 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v6, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v7, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v6 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX9-NEXT: v_or3_b32 v0, v0, v7, v5 +; GFX9-NEXT: v_and_or_b32 v0, v0, s3, v4 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v5, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v6, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v5 +; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v2, v0, v1, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX9-NEXT: v_or3_b32 v0, v0, v6, v4 -; GFX9-NEXT: v_and_or_b32 v0, v0, s3, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v4, v0, s4, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_or3_b32 v2, v4, v2, v3 +; GFX9-NEXT: v_or3_b32 v2, v2, v3, v4 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -757,13 +758,12 @@ ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v1, 8 ; GFX8-NEXT: v_mov_b32_e32 v2, 16 -; GFX8-NEXT: s_and_b32 s1, s3, 3 -; GFX8-NEXT: s_movk_i32 s0, 0xff -; GFX8-NEXT: s_lshl_b32 s1, s1, 3 -; GFX8-NEXT: s_and_b32 s2, s2, s0 -; GFX8-NEXT: s_lshl_b32 s0, s0, s1 +; GFX8-NEXT: s_and_b32 s0, s3, 3 +; GFX8-NEXT: s_and_b32 s1, s2, 0xff +; GFX8-NEXT: s_lshl_b32 s0, s0, 3 +; GFX8-NEXT: s_lshl_b32 s1, s1, s0 +; GFX8-NEXT: s_lshl_b32 s0, 0xff, s0 ; GFX8-NEXT: s_not_b32 s0, s0 -; GFX8-NEXT: s_lshl_b32 s2, s2, s1 ; GFX8-NEXT: v_mov_b32_e32 v3, 8 ; GFX8-NEXT: v_mov_b32_e32 v4, 16 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -775,7 +775,7 @@ ; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_and_b32_e32 v0, s0, v0 -; GFX8-NEXT: v_or_b32_e32 v0, s2, v0 +; GFX8-NEXT: v_or_b32_e32 v0, s1, v0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 24, v0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 @@ -794,18 +794,17 @@ ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_movk_i32 s0, 0xff -; GFX7-NEXT: s_and_b32 s1, s3, 3 -; GFX7-NEXT: s_and_b32 s2, s2, s0 -; GFX7-NEXT: s_lshl_b32 s1, s1, 3 -; GFX7-NEXT: s_lshl_b32 s2, s2, s1 -; GFX7-NEXT: s_lshl_b32 s1, s0, s1 -; GFX7-NEXT: s_not_b32 s1, s1 +; GFX7-NEXT: s_and_b32 s0, s3, 3 +; GFX7-NEXT: s_and_b32 s1, s2, 0xff +; GFX7-NEXT: s_lshl_b32 s0, s0, 3 +; GFX7-NEXT: s_lshl_b32 s1, s1, s0 +; GFX7-NEXT: s_lshl_b32 s0, 0xff, s0 +; GFX7-NEXT: s_not_b32 s0, s0 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_bfe_u32 v3, v0, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v0 -; GFX7-NEXT: v_and_b32_e32 v2, s0, v0 +; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v0 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 @@ -813,11 +812,11 @@ ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v0, s1, v0 -; GFX7-NEXT: v_or_b32_e32 v0, s2, v0 +; GFX7-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX7-NEXT: v_or_b32_e32 v0, s1, v0 ; GFX7-NEXT: v_bfe_u32 v3, v0, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v0 -; GFX7-NEXT: v_and_b32_e32 v2, s0, v0 +; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v0 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 @@ -832,28 +831,27 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: s_mov_b32 s0, 8 -; GFX10-NEXT: s_movk_i32 s1, 0xff -; GFX10-NEXT: s_and_b32 s2, s2, s1 +; GFX10-NEXT: s_and_b32 s1, s2, 0xff ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX10-NEXT: s_mov_b32 s0, 16 ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v0, v0, s1, v1 +; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 24, v2 ; GFX10-NEXT: s_and_b32 s0, s3, 3 ; GFX10-NEXT: v_mov_b32_e32 v2, 16 ; GFX10-NEXT: s_lshl_b32 s0, s0, 3 ; GFX10-NEXT: v_or3_b32 v0, v0, v3, v1 -; GFX10-NEXT: s_lshl_b32 s3, s1, s0 -; GFX10-NEXT: s_lshl_b32 s0, s2, s0 -; GFX10-NEXT: s_not_b32 s2, s3 +; GFX10-NEXT: s_lshl_b32 s2, 0xff, s0 +; GFX10-NEXT: s_lshl_b32 s0, s1, s0 +; GFX10-NEXT: s_not_b32 s1, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, 8 -; GFX10-NEXT: v_and_or_b32 v0, v0, s2, s0 +; GFX10-NEXT: v_and_or_b32 v0, v0, s1, s0 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v4, v0, s1, v1 +; GFX10-NEXT: v_and_or_b32 v4, 0xff, v0, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -877,7 +875,7 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bfe_u32 s7, s0, 0x80008 ; GFX9-NEXT: s_lshr_b32 s3, s0, 24 -; GFX9-NEXT: s_and_b32 s6, s0, s5 +; GFX9-NEXT: s_and_b32 s6, s0, 0xff ; GFX9-NEXT: s_lshl_b32 s7, s7, 8 ; GFX9-NEXT: s_bfe_u32 s0, s0, 0x80010 ; GFX9-NEXT: s_or_b32 s6, s6, s7 @@ -887,7 +885,7 @@ ; GFX9-NEXT: s_or_b32 s0, s0, s3 ; GFX9-NEXT: s_and_b32 s3, s4, 3 ; GFX9-NEXT: s_lshl_b32 s3, s3, 3 -; GFX9-NEXT: s_lshl_b32 s4, s5, s3 +; GFX9-NEXT: s_lshl_b32 s4, 0xff, s3 ; GFX9-NEXT: s_andn2_b32 s0, s0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: v_lshl_or_b32 v0, v0, s3, v1 @@ -905,24 +903,23 @@ ; GFX8-LABEL: insertelement_s_v4i8_v_s: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX8-NEXT: s_movk_i32 s1, 0xff ; GFX8-NEXT: v_mov_b32_e32 v2, 8 ; GFX8-NEXT: v_mov_b32_e32 v3, 16 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_u32 s5, s0, 0x80008 -; GFX8-NEXT: s_lshr_b32 s2, s0, 24 -; GFX8-NEXT: s_and_b32 s3, s0, s1 -; GFX8-NEXT: s_lshl_b32 s5, s5, 8 +; GFX8-NEXT: s_bfe_u32 s3, s0, 0x80008 +; GFX8-NEXT: s_lshr_b32 s1, s0, 24 +; GFX8-NEXT: s_and_b32 s2, s0, 0xff +; GFX8-NEXT: s_lshl_b32 s3, s3, 8 ; GFX8-NEXT: s_bfe_u32 s0, s0, 0x80010 -; GFX8-NEXT: s_or_b32 s3, s3, s5 +; GFX8-NEXT: s_or_b32 s2, s2, s3 ; GFX8-NEXT: s_lshl_b32 s0, s0, 16 -; GFX8-NEXT: s_or_b32 s0, s3, s0 -; GFX8-NEXT: s_lshl_b32 s2, s2, 24 -; GFX8-NEXT: s_or_b32 s0, s0, s2 -; GFX8-NEXT: s_and_b32 s2, s4, 3 -; GFX8-NEXT: s_lshl_b32 s2, s2, 3 -; GFX8-NEXT: v_mov_b32_e32 v1, s2 -; GFX8-NEXT: s_lshl_b32 s1, s1, s2 +; GFX8-NEXT: s_or_b32 s0, s2, s0 +; GFX8-NEXT: s_lshl_b32 s1, s1, 24 +; GFX8-NEXT: s_or_b32 s0, s0, s1 +; GFX8-NEXT: s_and_b32 s1, s4, 3 +; GFX8-NEXT: s_lshl_b32 s1, s1, 3 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_lshl_b32 s1, 0xff, s1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: s_andn2_b32 s0, s0, s1 ; GFX8-NEXT: v_or_b32_e32 v0, s0, v0 @@ -946,7 +943,7 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_bfe_u32 s5, s0, 0x80008 ; GFX7-NEXT: s_lshr_b32 s1, s0, 24 -; GFX7-NEXT: s_and_b32 s3, s0, s2 +; GFX7-NEXT: s_and_b32 s3, s0, 0xff ; GFX7-NEXT: s_lshl_b32 s5, s5, 8 ; GFX7-NEXT: s_bfe_u32 s0, s0, 0x80010 ; GFX7-NEXT: s_or_b32 s3, s3, s5 @@ -957,7 +954,7 @@ ; GFX7-NEXT: s_and_b32 s1, s4, 3 ; GFX7-NEXT: s_lshl_b32 s1, s1, 3 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, s1, v0 -; GFX7-NEXT: s_lshl_b32 s1, s2, s1 +; GFX7-NEXT: s_lshl_b32 s1, 0xff, s1 ; GFX7-NEXT: s_andn2_b32 s0, s0, s1 ; GFX7-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX7-NEXT: v_bfe_u32 v3, v0, 8, 8 @@ -979,30 +976,29 @@ ; GFX10-LABEL: insertelement_s_v4i8_v_s: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX10-NEXT: s_movk_i32 s2, 0xff ; GFX10-NEXT: s_and_b32 s1, s4, 3 -; GFX10-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX10-NEXT: s_lshl_b32 s1, s1, 3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_bfe_u32 s5, s0, 0x80008 -; GFX10-NEXT: s_lshr_b32 s3, s0, 24 -; GFX10-NEXT: s_and_b32 s4, s0, s2 +; GFX10-NEXT: s_bfe_u32 s4, s0, 0x80008 +; GFX10-NEXT: s_lshr_b32 s2, s0, 24 +; GFX10-NEXT: s_and_b32 s3, s0, 0xff ; GFX10-NEXT: s_bfe_u32 s0, s0, 0x80010 -; GFX10-NEXT: s_lshl_b32 s5, s5, 8 +; GFX10-NEXT: s_lshl_b32 s4, s4, 8 ; GFX10-NEXT: s_lshl_b32 s0, s0, 16 -; GFX10-NEXT: s_or_b32 s4, s4, s5 -; GFX10-NEXT: s_lshl_b32 s3, s3, 24 -; GFX10-NEXT: s_or_b32 s0, s4, s0 -; GFX10-NEXT: s_lshl_b32 s4, s2, s1 -; GFX10-NEXT: s_or_b32 s0, s0, s3 -; GFX10-NEXT: s_andn2_b32 s0, s0, s4 +; GFX10-NEXT: s_or_b32 s3, s3, s4 +; GFX10-NEXT: s_lshl_b32 s2, s2, 24 +; GFX10-NEXT: s_or_b32 s0, s3, s0 +; GFX10-NEXT: s_lshl_b32 s3, 0xff, s1 +; GFX10-NEXT: s_or_b32 s0, s0, s2 +; GFX10-NEXT: s_andn2_b32 s0, s0, s3 ; GFX10-NEXT: v_lshl_or_b32 v0, v0, s1, s0 ; GFX10-NEXT: s_mov_b32 s0, 8 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX10-NEXT: s_mov_b32 s0, 16 ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v4, v0, s2, v1 +; GFX10-NEXT: v_and_or_b32 v4, v0, 0xff, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -1019,14 +1015,14 @@ ; GFX9-LABEL: insertelement_s_v4i8_s_v: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX9-NEXT: s_movk_i32 s5, 0xff ; GFX9-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX9-NEXT: s_movk_i32 s5, 0xff ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NEXT: s_mov_b32 s1, 8 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bfe_u32 s7, s0, 0x80008 ; GFX9-NEXT: s_lshr_b32 s3, s0, 24 -; GFX9-NEXT: s_and_b32 s6, s0, s5 +; GFX9-NEXT: s_and_b32 s6, s0, 0xff ; GFX9-NEXT: s_lshl_b32 s7, s7, 8 ; GFX9-NEXT: s_bfe_u32 s0, s0, 0x80010 ; GFX9-NEXT: s_or_b32 s6, s6, s7 @@ -1034,7 +1030,7 @@ ; GFX9-NEXT: s_or_b32 s0, s6, s0 ; GFX9-NEXT: s_lshl_b32 s3, s3, 24 ; GFX9-NEXT: s_or_b32 s0, s0, s3 -; GFX9-NEXT: s_and_b32 s3, s4, s5 +; GFX9-NEXT: s_and_b32 s3, s4, 0xff ; GFX9-NEXT: v_lshlrev_b32_e64 v1, v0, s3 ; GFX9-NEXT: v_lshlrev_b32_e64 v0, v0, s5 ; GFX9-NEXT: v_xor_b32_e32 v0, -1, v0 @@ -1054,14 +1050,14 @@ ; GFX8-LABEL: insertelement_s_v4i8_s_v: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX8-NEXT: s_movk_i32 s1, 0xff ; GFX8-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX8-NEXT: s_movk_i32 s1, 0xff ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_bfe_u32 s5, s0, 0x80008 ; GFX8-NEXT: s_lshr_b32 s2, s0, 24 -; GFX8-NEXT: s_and_b32 s3, s0, s1 +; GFX8-NEXT: s_and_b32 s3, s0, 0xff ; GFX8-NEXT: s_lshl_b32 s5, s5, 8 ; GFX8-NEXT: s_bfe_u32 s0, s0, 0x80010 ; GFX8-NEXT: s_or_b32 s3, s3, s5 @@ -1069,7 +1065,7 @@ ; GFX8-NEXT: s_or_b32 s0, s3, s0 ; GFX8-NEXT: s_lshl_b32 s2, s2, 24 ; GFX8-NEXT: s_or_b32 s0, s0, s2 -; GFX8-NEXT: s_and_b32 s2, s4, s1 +; GFX8-NEXT: s_and_b32 s2, s4, 0xff ; GFX8-NEXT: v_lshlrev_b32_e64 v1, v0, s2 ; GFX8-NEXT: v_lshlrev_b32_e64 v0, v0, s1 ; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 @@ -1091,13 +1087,13 @@ ; GFX7-LABEL: insertelement_s_v4i8_s_v: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX7-NEXT: s_movk_i32 s2, 0xff ; GFX7-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX7-NEXT: s_movk_i32 s2, 0xff ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_bfe_u32 s5, s0, 0x80008 ; GFX7-NEXT: s_lshr_b32 s1, s0, 24 -; GFX7-NEXT: s_and_b32 s3, s0, s2 +; GFX7-NEXT: s_and_b32 s3, s0, 0xff ; GFX7-NEXT: s_lshl_b32 s5, s5, 8 ; GFX7-NEXT: s_bfe_u32 s0, s0, 0x80010 ; GFX7-NEXT: s_or_b32 s3, s3, s5 @@ -1105,7 +1101,7 @@ ; GFX7-NEXT: s_or_b32 s0, s3, s0 ; GFX7-NEXT: s_lshl_b32 s1, s1, 24 ; GFX7-NEXT: s_or_b32 s0, s0, s1 -; GFX7-NEXT: s_and_b32 s1, s4, s2 +; GFX7-NEXT: s_and_b32 s1, s4, 0xff ; GFX7-NEXT: v_lshl_b32_e32 v1, s1, v0 ; GFX7-NEXT: v_lshl_b32_e32 v0, s2, v0 ; GFX7-NEXT: v_xor_b32_e32 v0, -1, v0 @@ -1131,30 +1127,29 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX10-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX10-NEXT: s_movk_i32 s1, 0xff -; GFX10-NEXT: s_and_b32 s2, s4, s1 +; GFX10-NEXT: s_and_b32 s1, s4, 0xff ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX10-NEXT: v_lshlrev_b32_e64 v1, v0, s1 -; GFX10-NEXT: v_lshlrev_b32_e64 v0, v0, s2 +; GFX10-NEXT: v_lshlrev_b32_e64 v1, v0, 0xff +; GFX10-NEXT: v_lshlrev_b32_e64 v0, v0, s1 ; GFX10-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_bfe_u32 s4, s0, 0x80008 -; GFX10-NEXT: s_lshr_b32 s2, s0, 24 -; GFX10-NEXT: s_and_b32 s3, s0, s1 +; GFX10-NEXT: s_bfe_u32 s3, s0, 0x80008 +; GFX10-NEXT: s_lshr_b32 s1, s0, 24 +; GFX10-NEXT: s_and_b32 s2, s0, 0xff ; GFX10-NEXT: s_bfe_u32 s0, s0, 0x80010 -; GFX10-NEXT: s_lshl_b32 s4, s4, 8 +; GFX10-NEXT: s_lshl_b32 s3, s3, 8 ; GFX10-NEXT: s_lshl_b32 s0, s0, 16 -; GFX10-NEXT: s_or_b32 s3, s3, s4 -; GFX10-NEXT: s_lshl_b32 s2, s2, 24 -; GFX10-NEXT: s_or_b32 s0, s3, s0 -; GFX10-NEXT: s_or_b32 s0, s0, s2 +; GFX10-NEXT: s_or_b32 s2, s2, s3 +; GFX10-NEXT: s_lshl_b32 s1, s1, 24 +; GFX10-NEXT: s_or_b32 s0, s2, s0 +; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: v_and_or_b32 v0, s0, v1, v0 ; GFX10-NEXT: s_mov_b32 s0, 8 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX10-NEXT: s_mov_b32 s0, 16 ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v4, v0, s1, v1 +; GFX10-NEXT: v_and_or_b32 v4, v0, 0xff, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -1171,14 +1166,14 @@ ; GFX9-LABEL: insertelement_s_v4i8_v_v: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX9-NEXT: s_movk_i32 s4, 0xff ; GFX9-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX9-NEXT: s_movk_i32 s4, 0xff ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bfe_u32 s6, s0, 0x80008 ; GFX9-NEXT: s_lshr_b32 s3, s0, 24 -; GFX9-NEXT: s_and_b32 s5, s0, s4 +; GFX9-NEXT: s_and_b32 s5, s0, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s6, 8 ; GFX9-NEXT: s_bfe_u32 s0, s0, 0x80010 ; GFX9-NEXT: s_or_b32 s5, s5, s6 @@ -1205,14 +1200,14 @@ ; GFX8-LABEL: insertelement_s_v4i8_v_v: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX8-NEXT: s_movk_i32 s1, 0xff ; GFX8-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX8-NEXT: s_movk_i32 s1, 0xff ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_bfe_u32 s4, s0, 0x80008 ; GFX8-NEXT: s_lshr_b32 s2, s0, 24 -; GFX8-NEXT: s_and_b32 s3, s0, s1 +; GFX8-NEXT: s_and_b32 s3, s0, 0xff ; GFX8-NEXT: s_lshl_b32 s4, s4, 8 ; GFX8-NEXT: s_bfe_u32 s0, s0, 0x80010 ; GFX8-NEXT: s_or_b32 s3, s3, s4 @@ -1248,7 +1243,7 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_bfe_u32 s4, s0, 0x80008 ; GFX7-NEXT: s_lshr_b32 s1, s0, 24 -; GFX7-NEXT: s_and_b32 s3, s0, s2 +; GFX7-NEXT: s_and_b32 s3, s0, 0xff ; GFX7-NEXT: s_lshl_b32 s4, s4, 8 ; GFX7-NEXT: s_bfe_u32 s0, s0, 0x80010 ; GFX7-NEXT: s_or_b32 s3, s3, s4 @@ -1281,29 +1276,28 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX10-NEXT: v_and_b32_e32 v1, 3, v1 -; GFX10-NEXT: s_movk_i32 s1, 0xff ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v1 -; GFX10-NEXT: v_lshlrev_b32_e64 v2, v1, s1 +; GFX10-NEXT: v_lshlrev_b32_e64 v2, v1, 0xff ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: v_xor_b32_e32 v1, -1, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_bfe_u32 s4, s0, 0x80008 -; GFX10-NEXT: s_lshr_b32 s2, s0, 24 -; GFX10-NEXT: s_and_b32 s3, s0, s1 +; GFX10-NEXT: s_bfe_u32 s3, s0, 0x80008 +; GFX10-NEXT: s_lshr_b32 s1, s0, 24 +; GFX10-NEXT: s_and_b32 s2, s0, 0xff ; GFX10-NEXT: s_bfe_u32 s0, s0, 0x80010 -; GFX10-NEXT: s_lshl_b32 s4, s4, 8 +; GFX10-NEXT: s_lshl_b32 s3, s3, 8 ; GFX10-NEXT: s_lshl_b32 s0, s0, 16 -; GFX10-NEXT: s_or_b32 s3, s3, s4 -; GFX10-NEXT: s_lshl_b32 s2, s2, 24 -; GFX10-NEXT: s_or_b32 s0, s3, s0 -; GFX10-NEXT: s_or_b32 s0, s0, s2 +; GFX10-NEXT: s_or_b32 s2, s2, s3 +; GFX10-NEXT: s_lshl_b32 s1, s1, 24 +; GFX10-NEXT: s_or_b32 s0, s2, s0 +; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: v_and_or_b32 v0, s0, v1, v0 ; GFX10-NEXT: s_mov_b32 s0, 8 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX10-NEXT: s_mov_b32 s0, 16 ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v4, v0, s1, v1 +; GFX10-NEXT: v_and_or_b32 v4, v0, 0xff, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -1321,62 +1315,63 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dword v0, v[0:1], off ; GFX9-NEXT: s_mov_b32 s0, 8 -; GFX9-NEXT: s_movk_i32 s3, 0xff ; GFX9-NEXT: v_and_b32_e32 v2, 3, v2 ; GFX9-NEXT: s_mov_b32 s1, 16 -; GFX9-NEXT: s_and_b32 s2, s2, s3 +; GFX9-NEXT: s_movk_i32 s3, 0xff +; GFX9-NEXT: s_and_b32 s2, s2, 0xff ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX9-NEXT: v_lshlrev_b32_e64 v4, v2, s2 +; GFX9-NEXT: v_lshlrev_b32_e64 v5, v2, s2 ; GFX9-NEXT: v_lshlrev_b32_e64 v2, v2, s3 ; GFX9-NEXT: v_xor_b32_e32 v2, -1, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, 8 -; GFX9-NEXT: v_mov_b32_e32 v3, 16 +; GFX9-NEXT: v_mov_b32_e32 v3, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, 0xff +; GFX9-NEXT: v_mov_b32_e32 v4, 16 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v6, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v7, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v0, v0, s3, v6 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v5 -; GFX9-NEXT: v_or3_b32 v0, v0, v7, v5 -; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 24, v0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v7, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v8, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v0, v0, s3, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX9-NEXT: v_or3_b32 v0, v0, v8, v6 +; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v5 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v4, v0, s3, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v3, v0, v1, v3 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_or3_b32 v2, v4, v3, v2 +; GFX9-NEXT: v_or3_b32 v2, v3, v4, v2 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_v_v4i8_s_v: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, 8 -; GFX8-NEXT: s_movk_i32 s0, 0xff -; GFX8-NEXT: v_mov_b32_e32 v3, 16 +; GFX8-NEXT: v_mov_b32_e32 v3, 8 ; GFX8-NEXT: v_and_b32_e32 v2, 3, v2 -; GFX8-NEXT: s_and_b32 s1, s2, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0xff +; GFX8-NEXT: v_mov_b32_e32 v4, 16 +; GFX8-NEXT: s_and_b32 s0, s2, 0xff ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX8-NEXT: v_lshlrev_b32_e64 v6, v2, s1 -; GFX8-NEXT: v_lshlrev_b32_e64 v2, v2, s0 -; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2 -; GFX8-NEXT: v_mov_b32_e32 v4, 8 -; GFX8-NEXT: v_mov_b32_e32 v5, 16 +; GFX8-NEXT: v_lshlrev_b32_e64 v7, v2, s0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, v2, v1 +; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX8-NEXT: v_mov_b32_e32 v5, 8 +; GFX8-NEXT: v_mov_b32_e32 v6, 16 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v7 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v3 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX8-NEXT: v_and_b32_e32 v0, v0, v2 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v6 -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX8-NEXT: v_and_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v7 +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 24, v0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 24, v1 ; GFX8-NEXT: v_or_b32_e32 v3, v0, v3 @@ -1392,37 +1387,37 @@ ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_movk_i32 s0, 0xff -; GFX7-NEXT: v_and_b32_e32 v1, 3, v2 -; GFX7-NEXT: s_and_b32 s1, s2, s0 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v1 -; GFX7-NEXT: v_lshl_b32_e32 v2, s1, v1 -; GFX7-NEXT: v_lshl_b32_e32 v1, s0, v1 -; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX7-NEXT: v_and_b32_e32 v2, 3, v2 +; GFX7-NEXT: v_mov_b32_e32 v1, 0xff +; GFX7-NEXT: s_and_b32 s0, s2, 0xff +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX7-NEXT: v_lshl_b32_e32 v3, s0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, v2, v1 +; GFX7-NEXT: v_xor_b32_e32 v2, -1, v2 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_bfe_u32 v5, v0, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 -; GFX7-NEXT: v_and_b32_e32 v4, s0, v0 +; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v0 +; GFX7-NEXT: v_and_b32_e32 v5, 0xff, v0 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v5, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX7-NEXT: v_and_b32_e32 v0, v0, v2 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 -; GFX7-NEXT: v_and_b32_e32 v0, v0, v1 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: v_bfe_u32 v3, v0, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v0 -; GFX7-NEXT: v_and_b32_e32 v2, s0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v0 +; GFX7-NEXT: v_and_b32_e32 v1, v0, v1 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; @@ -1431,17 +1426,16 @@ ; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: v_and_b32_e32 v1, 3, v2 ; GFX10-NEXT: s_mov_b32 s0, 8 -; GFX10-NEXT: s_movk_i32 s1, 0xff ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v1 -; GFX10-NEXT: v_lshlrev_b32_e64 v4, v1, s1 +; GFX10-NEXT: v_lshlrev_b32_e64 v4, v1, 0xff ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX10-NEXT: s_mov_b32 s0, 16 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v0, v0, s1, v2 +; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v3 -; GFX10-NEXT: s_and_b32 s0, s2, s1 +; GFX10-NEXT: s_and_b32 s0, s2, 0xff ; GFX10-NEXT: v_xor_b32_e32 v3, -1, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v1, v1, s0 ; GFX10-NEXT: v_or3_b32 v0, v0, v5, v2 @@ -1451,7 +1445,7 @@ ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v2, v0, s1, v2 +; GFX10-NEXT: v_and_or_b32 v2, 0xff, v0, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -1474,40 +1468,40 @@ ; GFX9-NEXT: s_movk_i32 s3, 0xff ; GFX9-NEXT: s_lshl_b32 s2, s2, 3 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: s_lshl_b32 s2, s3, s2 +; GFX9-NEXT: s_lshl_b32 s2, 0xff, s2 ; GFX9-NEXT: s_not_b32 s2, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, 8 -; GFX9-NEXT: v_mov_b32_e32 v3, 16 +; GFX9-NEXT: v_mov_b32_e32 v3, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, 0xff +; GFX9-NEXT: v_mov_b32_e32 v4, 16 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v5, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v6, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v0, v0, s3, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX9-NEXT: v_or3_b32 v0, v0, v6, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v6, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v7, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v0, v0, s3, v6 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX9-NEXT: v_or3_b32 v0, v0, v7, v5 ; GFX9-NEXT: v_and_or_b32 v0, v0, s2, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v4, v0, s3, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v3, v0, v1, v3 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_or3_b32 v2, v4, v3, v2 +; GFX9-NEXT: v_or3_b32 v2, v3, v4, v2 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_v_v4i8_v_s: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_and_b32 s1, s2, 3 +; GFX8-NEXT: s_and_b32 s0, s2, 3 ; GFX8-NEXT: v_mov_b32_e32 v1, 8 -; GFX8-NEXT: s_lshl_b32 s1, s1, 3 +; GFX8-NEXT: s_lshl_b32 s0, s0, 3 ; GFX8-NEXT: v_mov_b32_e32 v3, 16 -; GFX8-NEXT: v_mov_b32_e32 v6, s1 -; GFX8-NEXT: s_movk_i32 s0, 0xff +; GFX8-NEXT: v_mov_b32_e32 v6, s0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: s_lshl_b32 s0, s0, s1 +; GFX8-NEXT: s_lshl_b32 s0, 0xff, s0 ; GFX8-NEXT: s_not_b32 s0, s0 ; GFX8-NEXT: v_mov_b32_e32 v4, 8 ; GFX8-NEXT: v_mov_b32_e32 v5, 16 @@ -1544,7 +1538,7 @@ ; GFX7-NEXT: v_and_b32_e32 v1, s0, v2 ; GFX7-NEXT: s_lshl_b32 s1, s1, 3 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, s1, v1 -; GFX7-NEXT: s_lshl_b32 s1, s0, s1 +; GFX7-NEXT: s_lshl_b32 s1, 0xff, s1 ; GFX7-NEXT: s_not_b32 s1, s1 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -1562,7 +1556,7 @@ ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: v_bfe_u32 v3, v0, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v0 -; GFX7-NEXT: v_and_b32_e32 v2, s0, v0 +; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v0 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 @@ -1577,27 +1571,26 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: s_mov_b32 s0, 8 -; GFX10-NEXT: s_mov_b32 s1, 16 -; GFX10-NEXT: s_and_b32 s2, s2, 3 +; GFX10-NEXT: s_and_b32 s1, s2, 3 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0 -; GFX10-NEXT: s_movk_i32 s0, 0xff -; GFX10-NEXT: v_lshlrev_b32_sdwa v4, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: s_lshl_b32 s1, s2, 3 -; GFX10-NEXT: v_and_or_b32 v0, v0, s0, v1 +; GFX10-NEXT: s_mov_b32 s0, 16 +; GFX10-NEXT: v_lshlrev_b32_sdwa v4, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 24, v3 -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: s_lshl_b32 s1, s0, s1 -; GFX10-NEXT: s_not_b32 s1, s1 +; GFX10-NEXT: s_lshl_b32 s0, s1, 3 +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: s_lshl_b32 s0, 0xff, s0 ; GFX10-NEXT: v_or3_b32 v0, v0, v4, v1 +; GFX10-NEXT: s_not_b32 s0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, 8 -; GFX10-NEXT: v_and_or_b32 v0, v0, s1, v2 +; GFX10-NEXT: v_and_or_b32 v0, v0, s0, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, 16 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v4, v0, s0, v1 +; GFX10-NEXT: v_and_or_b32 v4, 0xff, v0, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -1725,9 +1718,8 @@ ; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: v_and_b32_e32 v1, 3, v3 ; GFX10-NEXT: s_mov_b32 s0, 8 -; GFX10-NEXT: s_movk_i32 s1, 0xff ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v1 -; GFX10-NEXT: v_lshlrev_b32_e64 v5, v1, s1 +; GFX10-NEXT: v_lshlrev_b32_e64 v5, v1, 0xff ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: v_xor_b32_e32 v2, -1, v5 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -1735,7 +1727,7 @@ ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v0 ; GFX10-NEXT: s_mov_b32 s0, 16 ; GFX10-NEXT: v_lshlrev_b32_sdwa v6, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v0, v0, s1, v3 +; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v4 ; GFX10-NEXT: v_or3_b32 v0, v0, v6, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, 8 @@ -1761,28 +1753,25 @@ ; GFX9-LABEL: insertelement_s_v8i8_s_s: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX9-NEXT: s_mov_b32 s8, 0x80008 -; GFX9-NEXT: s_movk_i32 s6, 0xff ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_u32 s9, s0, s8 -; GFX9-NEXT: s_and_b32 s7, s0, s6 -; GFX9-NEXT: s_lshl_b32 s9, s9, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: s_mov_b32 s9, 0x80010 +; GFX9-NEXT: s_bfe_u32 s7, s0, 0x80008 ; GFX9-NEXT: s_lshr_b32 s2, s0, 24 -; GFX9-NEXT: s_bfe_u32 s0, s0, s9 +; GFX9-NEXT: s_and_b32 s6, s0, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s7, 8 +; GFX9-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX9-NEXT: s_or_b32 s6, s6, s7 ; GFX9-NEXT: s_lshl_b32 s0, s0, 16 -; GFX9-NEXT: s_or_b32 s0, s7, s0 +; GFX9-NEXT: s_or_b32 s0, s6, s0 ; GFX9-NEXT: s_lshl_b32 s2, s2, 24 -; GFX9-NEXT: s_bfe_u32 s7, s1, s8 +; GFX9-NEXT: s_bfe_u32 s6, s1, 0x80008 ; GFX9-NEXT: s_lshr_b32 s3, s1, 24 ; GFX9-NEXT: s_or_b32 s0, s0, s2 -; GFX9-NEXT: s_and_b32 s2, s1, s6 -; GFX9-NEXT: s_lshl_b32 s7, s7, 8 -; GFX9-NEXT: s_bfe_u32 s1, s1, s9 -; GFX9-NEXT: s_or_b32 s2, s2, s7 +; GFX9-NEXT: s_and_b32 s2, s1, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s6, 8 +; GFX9-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX9-NEXT: s_or_b32 s2, s2, s6 ; GFX9-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9-NEXT: s_or_b32 s1, s2, s1 ; GFX9-NEXT: s_lshl_b32 s2, s3, 24 @@ -1792,30 +1781,30 @@ ; GFX9-NEXT: s_cselect_b32 s3, s1, s0 ; GFX9-NEXT: s_and_b32 s5, s5, 3 ; GFX9-NEXT: s_lshl_b32 s5, s5, 3 -; GFX9-NEXT: s_and_b32 s4, s4, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xff ; GFX9-NEXT: s_lshl_b32 s4, s4, s5 -; GFX9-NEXT: s_lshl_b32 s5, s6, s5 +; GFX9-NEXT: s_lshl_b32 s5, 0xff, s5 ; GFX9-NEXT: s_andn2_b32 s3, s3, s5 ; GFX9-NEXT: s_or_b32 s3, s3, s4 ; GFX9-NEXT: s_cmp_eq_u32 s2, 0 ; GFX9-NEXT: s_cselect_b32 s0, s3, s0 ; GFX9-NEXT: s_cmp_eq_u32 s2, 1 ; GFX9-NEXT: s_cselect_b32 s1, s3, s1 -; GFX9-NEXT: s_bfe_u32 s5, s0, s8 +; GFX9-NEXT: s_bfe_u32 s5, s0, 0x80008 ; GFX9-NEXT: s_lshr_b32 s2, s0, 24 -; GFX9-NEXT: s_and_b32 s4, s0, s6 +; GFX9-NEXT: s_and_b32 s4, s0, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s5, 8 -; GFX9-NEXT: s_bfe_u32 s0, s0, s9 +; GFX9-NEXT: s_bfe_u32 s0, s0, 0x80010 ; GFX9-NEXT: s_or_b32 s4, s4, s5 ; GFX9-NEXT: s_lshl_b32 s0, s0, 16 ; GFX9-NEXT: s_or_b32 s0, s4, s0 ; GFX9-NEXT: s_lshl_b32 s2, s2, 24 -; GFX9-NEXT: s_bfe_u32 s4, s1, s8 +; GFX9-NEXT: s_bfe_u32 s4, s1, 0x80008 ; GFX9-NEXT: s_lshr_b32 s3, s1, 24 ; GFX9-NEXT: s_or_b32 s0, s0, s2 -; GFX9-NEXT: s_and_b32 s2, s1, s6 +; GFX9-NEXT: s_and_b32 s2, s1, 0xff ; GFX9-NEXT: s_lshl_b32 s4, s4, 8 -; GFX9-NEXT: s_bfe_u32 s1, s1, s9 +; GFX9-NEXT: s_bfe_u32 s1, s1, 0x80010 ; GFX9-NEXT: s_or_b32 s2, s2, s4 ; GFX9-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9-NEXT: s_or_b32 s1, s2, s1 @@ -1829,28 +1818,25 @@ ; GFX8-LABEL: insertelement_s_v8i8_s_s: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX8-NEXT: s_mov_b32 s8, 0x80008 -; GFX8-NEXT: s_movk_i32 s6, 0xff ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_u32 s9, s0, s8 -; GFX8-NEXT: s_and_b32 s7, s0, s6 -; GFX8-NEXT: s_lshl_b32 s9, s9, 8 -; GFX8-NEXT: s_or_b32 s7, s7, s9 -; GFX8-NEXT: s_mov_b32 s9, 0x80010 +; GFX8-NEXT: s_bfe_u32 s7, s0, 0x80008 ; GFX8-NEXT: s_lshr_b32 s2, s0, 24 -; GFX8-NEXT: s_bfe_u32 s0, s0, s9 +; GFX8-NEXT: s_and_b32 s6, s0, 0xff +; GFX8-NEXT: s_lshl_b32 s7, s7, 8 +; GFX8-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX8-NEXT: s_or_b32 s6, s6, s7 ; GFX8-NEXT: s_lshl_b32 s0, s0, 16 -; GFX8-NEXT: s_or_b32 s0, s7, s0 +; GFX8-NEXT: s_or_b32 s0, s6, s0 ; GFX8-NEXT: s_lshl_b32 s2, s2, 24 -; GFX8-NEXT: s_bfe_u32 s7, s1, s8 +; GFX8-NEXT: s_bfe_u32 s6, s1, 0x80008 ; GFX8-NEXT: s_lshr_b32 s3, s1, 24 ; GFX8-NEXT: s_or_b32 s0, s0, s2 -; GFX8-NEXT: s_and_b32 s2, s1, s6 -; GFX8-NEXT: s_lshl_b32 s7, s7, 8 -; GFX8-NEXT: s_bfe_u32 s1, s1, s9 -; GFX8-NEXT: s_or_b32 s2, s2, s7 +; GFX8-NEXT: s_and_b32 s2, s1, 0xff +; GFX8-NEXT: s_lshl_b32 s6, s6, 8 +; GFX8-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX8-NEXT: s_or_b32 s2, s2, s6 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s1, s2, s1 ; GFX8-NEXT: s_lshl_b32 s2, s3, 24 @@ -1860,30 +1846,30 @@ ; GFX8-NEXT: s_cselect_b32 s3, s1, s0 ; GFX8-NEXT: s_and_b32 s5, s5, 3 ; GFX8-NEXT: s_lshl_b32 s5, s5, 3 -; GFX8-NEXT: s_and_b32 s4, s4, s6 +; GFX8-NEXT: s_and_b32 s4, s4, 0xff ; GFX8-NEXT: s_lshl_b32 s4, s4, s5 -; GFX8-NEXT: s_lshl_b32 s5, s6, s5 +; GFX8-NEXT: s_lshl_b32 s5, 0xff, s5 ; GFX8-NEXT: s_andn2_b32 s3, s3, s5 ; GFX8-NEXT: s_or_b32 s3, s3, s4 ; GFX8-NEXT: s_cmp_eq_u32 s2, 0 ; GFX8-NEXT: s_cselect_b32 s0, s3, s0 ; GFX8-NEXT: s_cmp_eq_u32 s2, 1 ; GFX8-NEXT: s_cselect_b32 s1, s3, s1 -; GFX8-NEXT: s_bfe_u32 s5, s0, s8 +; GFX8-NEXT: s_bfe_u32 s5, s0, 0x80008 ; GFX8-NEXT: s_lshr_b32 s2, s0, 24 -; GFX8-NEXT: s_and_b32 s4, s0, s6 +; GFX8-NEXT: s_and_b32 s4, s0, 0xff ; GFX8-NEXT: s_lshl_b32 s5, s5, 8 -; GFX8-NEXT: s_bfe_u32 s0, s0, s9 +; GFX8-NEXT: s_bfe_u32 s0, s0, 0x80010 ; GFX8-NEXT: s_or_b32 s4, s4, s5 ; GFX8-NEXT: s_lshl_b32 s0, s0, 16 ; GFX8-NEXT: s_or_b32 s0, s4, s0 ; GFX8-NEXT: s_lshl_b32 s2, s2, 24 -; GFX8-NEXT: s_bfe_u32 s4, s1, s8 +; GFX8-NEXT: s_bfe_u32 s4, s1, 0x80008 ; GFX8-NEXT: s_lshr_b32 s3, s1, 24 ; GFX8-NEXT: s_or_b32 s0, s0, s2 -; GFX8-NEXT: s_and_b32 s2, s1, s6 +; GFX8-NEXT: s_and_b32 s2, s1, 0xff ; GFX8-NEXT: s_lshl_b32 s4, s4, 8 -; GFX8-NEXT: s_bfe_u32 s1, s1, s9 +; GFX8-NEXT: s_bfe_u32 s1, s1, 0x80010 ; GFX8-NEXT: s_or_b32 s2, s2, s4 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s1, s2, s1 @@ -1897,26 +1883,23 @@ ; GFX7-LABEL: insertelement_s_v8i8_s_s: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX7-NEXT: s_mov_b32 s8, 0x80008 -; GFX7-NEXT: s_movk_i32 s6, 0xff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_bfe_u32 s9, s0, s8 -; GFX7-NEXT: s_and_b32 s7, s0, s6 -; GFX7-NEXT: s_lshl_b32 s9, s9, 8 -; GFX7-NEXT: s_or_b32 s7, s7, s9 -; GFX7-NEXT: s_mov_b32 s9, 0x80010 +; GFX7-NEXT: s_bfe_u32 s7, s0, 0x80008 ; GFX7-NEXT: s_lshr_b32 s2, s0, 24 -; GFX7-NEXT: s_bfe_u32 s0, s0, s9 +; GFX7-NEXT: s_and_b32 s6, s0, 0xff +; GFX7-NEXT: s_lshl_b32 s7, s7, 8 +; GFX7-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX7-NEXT: s_or_b32 s6, s6, s7 ; GFX7-NEXT: s_lshl_b32 s0, s0, 16 -; GFX7-NEXT: s_or_b32 s0, s7, s0 +; GFX7-NEXT: s_or_b32 s0, s6, s0 ; GFX7-NEXT: s_lshl_b32 s2, s2, 24 -; GFX7-NEXT: s_bfe_u32 s7, s1, s8 +; GFX7-NEXT: s_bfe_u32 s6, s1, 0x80008 ; GFX7-NEXT: s_lshr_b32 s3, s1, 24 ; GFX7-NEXT: s_or_b32 s0, s0, s2 -; GFX7-NEXT: s_and_b32 s2, s1, s6 -; GFX7-NEXT: s_lshl_b32 s7, s7, 8 -; GFX7-NEXT: s_bfe_u32 s1, s1, s9 -; GFX7-NEXT: s_or_b32 s2, s2, s7 +; GFX7-NEXT: s_and_b32 s2, s1, 0xff +; GFX7-NEXT: s_lshl_b32 s6, s6, 8 +; GFX7-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX7-NEXT: s_or_b32 s2, s2, s6 ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 ; GFX7-NEXT: s_or_b32 s1, s2, s1 ; GFX7-NEXT: s_lshl_b32 s2, s3, 24 @@ -1926,30 +1909,30 @@ ; GFX7-NEXT: s_cselect_b32 s3, s1, s0 ; GFX7-NEXT: s_and_b32 s5, s5, 3 ; GFX7-NEXT: s_lshl_b32 s5, s5, 3 -; GFX7-NEXT: s_and_b32 s4, s4, s6 +; GFX7-NEXT: s_and_b32 s4, s4, 0xff ; GFX7-NEXT: s_lshl_b32 s4, s4, s5 -; GFX7-NEXT: s_lshl_b32 s5, s6, s5 +; GFX7-NEXT: s_lshl_b32 s5, 0xff, s5 ; GFX7-NEXT: s_andn2_b32 s3, s3, s5 ; GFX7-NEXT: s_or_b32 s3, s3, s4 ; GFX7-NEXT: s_cmp_eq_u32 s2, 0 ; GFX7-NEXT: s_cselect_b32 s4, s3, s0 ; GFX7-NEXT: s_cmp_eq_u32 s2, 1 ; GFX7-NEXT: s_cselect_b32 s3, s3, s1 -; GFX7-NEXT: s_bfe_u32 s10, s4, s8 +; GFX7-NEXT: s_bfe_u32 s7, s4, 0x80008 ; GFX7-NEXT: s_lshr_b32 s2, s4, 24 -; GFX7-NEXT: s_and_b32 s7, s4, s6 -; GFX7-NEXT: s_lshl_b32 s10, s10, 8 -; GFX7-NEXT: s_bfe_u32 s4, s4, s9 -; GFX7-NEXT: s_or_b32 s7, s7, s10 +; GFX7-NEXT: s_and_b32 s6, s4, 0xff +; GFX7-NEXT: s_lshl_b32 s7, s7, 8 +; GFX7-NEXT: s_bfe_u32 s4, s4, 0x80010 +; GFX7-NEXT: s_or_b32 s6, s6, s7 ; GFX7-NEXT: s_lshl_b32 s4, s4, 16 -; GFX7-NEXT: s_or_b32 s4, s7, s4 +; GFX7-NEXT: s_or_b32 s4, s6, s4 ; GFX7-NEXT: s_lshl_b32 s2, s2, 24 -; GFX7-NEXT: s_or_b32 s2, s4, s2 -; GFX7-NEXT: s_and_b32 s4, s3, s6 -; GFX7-NEXT: s_bfe_u32 s6, s3, s8 +; GFX7-NEXT: s_bfe_u32 s6, s3, 0x80008 ; GFX7-NEXT: s_lshr_b32 s5, s3, 24 +; GFX7-NEXT: s_or_b32 s2, s4, s2 +; GFX7-NEXT: s_and_b32 s4, s3, 0xff ; GFX7-NEXT: s_lshl_b32 s6, s6, 8 -; GFX7-NEXT: s_bfe_u32 s3, s3, s9 +; GFX7-NEXT: s_bfe_u32 s3, s3, 0x80010 ; GFX7-NEXT: s_or_b32 s4, s4, s6 ; GFX7-NEXT: s_lshl_b32 s3, s3, 16 ; GFX7-NEXT: s_or_b32 s3, s4, s3 @@ -1966,68 +1949,65 @@ ; GFX10-LABEL: insertelement_s_v8i8_s_s: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX10-NEXT: s_mov_b32 s3, 0x80008 -; GFX10-NEXT: s_movk_i32 s2, 0xff -; GFX10-NEXT: s_mov_b32 s6, 0x80010 -; GFX10-NEXT: s_lshr_b32 s7, s5, 2 +; GFX10-NEXT: s_lshr_b32 s2, s5, 2 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_bfe_u32 s11, s0, s3 -; GFX10-NEXT: s_bfe_u32 s13, s1, s3 -; GFX10-NEXT: s_lshr_b32 s8, s0, 24 -; GFX10-NEXT: s_lshr_b32 s9, s1, 24 -; GFX10-NEXT: s_and_b32 s10, s0, s2 -; GFX10-NEXT: s_bfe_u32 s0, s0, s6 -; GFX10-NEXT: s_and_b32 s12, s1, s2 -; GFX10-NEXT: s_bfe_u32 s1, s1, s6 -; GFX10-NEXT: s_lshl_b32 s11, s11, 8 -; GFX10-NEXT: s_lshl_b32 s13, s13, 8 +; GFX10-NEXT: s_bfe_u32 s8, s0, 0x80008 +; GFX10-NEXT: s_bfe_u32 s10, s1, 0x80008 +; GFX10-NEXT: s_lshr_b32 s3, s0, 24 +; GFX10-NEXT: s_lshr_b32 s6, s1, 24 +; GFX10-NEXT: s_and_b32 s7, s0, 0xff +; GFX10-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX10-NEXT: s_and_b32 s9, s1, 0xff +; GFX10-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX10-NEXT: s_lshl_b32 s8, s8, 8 +; GFX10-NEXT: s_lshl_b32 s10, s10, 8 ; GFX10-NEXT: s_lshl_b32 s0, s0, 16 ; GFX10-NEXT: s_lshl_b32 s1, s1, 16 -; GFX10-NEXT: s_or_b32 s10, s10, s11 -; GFX10-NEXT: s_or_b32 s11, s12, s13 -; GFX10-NEXT: s_lshl_b32 s8, s8, 24 -; GFX10-NEXT: s_lshl_b32 s9, s9, 24 -; GFX10-NEXT: s_or_b32 s0, s10, s0 -; GFX10-NEXT: s_or_b32 s1, s11, s1 -; GFX10-NEXT: s_or_b32 s0, s0, s8 -; GFX10-NEXT: s_or_b32 s1, s1, s9 -; GFX10-NEXT: s_cmp_eq_u32 s7, 1 -; GFX10-NEXT: s_cselect_b32 s8, s1, s0 +; GFX10-NEXT: s_or_b32 s7, s7, s8 +; GFX10-NEXT: s_or_b32 s8, s9, s10 +; GFX10-NEXT: s_lshl_b32 s3, s3, 24 +; GFX10-NEXT: s_lshl_b32 s6, s6, 24 +; GFX10-NEXT: s_or_b32 s0, s7, s0 +; GFX10-NEXT: s_or_b32 s1, s8, s1 +; GFX10-NEXT: s_or_b32 s0, s0, s3 +; GFX10-NEXT: s_or_b32 s1, s1, s6 +; GFX10-NEXT: s_cmp_eq_u32 s2, 1 +; GFX10-NEXT: s_cselect_b32 s3, s1, s0 ; GFX10-NEXT: s_and_b32 s5, s5, 3 -; GFX10-NEXT: s_and_b32 s4, s4, s2 +; GFX10-NEXT: s_and_b32 s4, s4, 0xff ; GFX10-NEXT: s_lshl_b32 s5, s5, 3 -; GFX10-NEXT: s_lshl_b32 s9, s2, s5 +; GFX10-NEXT: s_lshl_b32 s6, 0xff, s5 ; GFX10-NEXT: s_lshl_b32 s4, s4, s5 -; GFX10-NEXT: s_andn2_b32 s5, s8, s9 -; GFX10-NEXT: s_or_b32 s4, s5, s4 -; GFX10-NEXT: s_cmp_eq_u32 s7, 0 -; GFX10-NEXT: s_cselect_b32 s0, s4, s0 -; GFX10-NEXT: s_cmp_eq_u32 s7, 1 -; GFX10-NEXT: s_cselect_b32 s1, s4, s1 -; GFX10-NEXT: s_bfe_u32 s7, s0, s3 -; GFX10-NEXT: s_bfe_u32 s3, s1, s3 -; GFX10-NEXT: s_and_b32 s5, s0, s2 -; GFX10-NEXT: s_lshr_b32 s8, s1, 24 -; GFX10-NEXT: s_and_b32 s2, s1, s2 -; GFX10-NEXT: s_lshl_b32 s3, s3, 8 -; GFX10-NEXT: s_bfe_u32 s1, s1, s6 -; GFX10-NEXT: s_lshr_b32 s4, s0, 24 -; GFX10-NEXT: s_bfe_u32 s0, s0, s6 -; GFX10-NEXT: s_lshl_b32 s7, s7, 8 -; GFX10-NEXT: s_or_b32 s2, s2, s3 -; GFX10-NEXT: s_lshl_b32 s1, s1, 16 +; GFX10-NEXT: s_andn2_b32 s3, s3, s6 +; GFX10-NEXT: s_or_b32 s3, s3, s4 +; GFX10-NEXT: s_cmp_eq_u32 s2, 0 +; GFX10-NEXT: s_cselect_b32 s0, s3, s0 +; GFX10-NEXT: s_cmp_eq_u32 s2, 1 +; GFX10-NEXT: s_cselect_b32 s1, s3, s1 +; GFX10-NEXT: s_bfe_u32 s4, s0, 0x80008 +; GFX10-NEXT: s_lshr_b32 s2, s0, 24 +; GFX10-NEXT: s_and_b32 s3, s0, 0xff +; GFX10-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX10-NEXT: s_lshl_b32 s4, s4, 8 +; GFX10-NEXT: s_bfe_u32 s7, s1, 0x80008 +; GFX10-NEXT: s_lshr_b32 s5, s1, 24 ; GFX10-NEXT: s_lshl_b32 s0, s0, 16 -; GFX10-NEXT: s_or_b32 s5, s5, s7 -; GFX10-NEXT: s_or_b32 s1, s2, s1 -; GFX10-NEXT: s_lshl_b32 s2, s8, 24 -; GFX10-NEXT: s_or_b32 s0, s5, s0 -; GFX10-NEXT: s_lshl_b32 s3, s4, 24 -; GFX10-NEXT: s_or_b32 s1, s1, s2 -; GFX10-NEXT: s_or_b32 s0, s0, s3 -; GFX10-NEXT: v_mov_b32_e32 v3, s1 -; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: s_and_b32 s6, s1, 0xff +; GFX10-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX10-NEXT: s_or_b32 s3, s3, s4 +; GFX10-NEXT: s_lshl_b32 s4, s7, 8 +; GFX10-NEXT: s_or_b32 s0, s3, s0 +; GFX10-NEXT: s_or_b32 s3, s6, s4 +; GFX10-NEXT: s_lshl_b32 s1, s1, 16 +; GFX10-NEXT: s_lshl_b32 s2, s2, 24 +; GFX10-NEXT: s_or_b32 s1, s3, s1 +; GFX10-NEXT: s_lshl_b32 s3, s5, 24 +; GFX10-NEXT: s_or_b32 s0, s0, s2 +; GFX10-NEXT: s_or_b32 s1, s1, s3 +; GFX10-NEXT: v_mov_b32_e32 v3, s1 +; GFX10-NEXT: v_mov_b32_e32 v2, s0 ; GFX10-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GFX10-NEXT: s_endpgm %vec = load <8 x i8>, <8 x i8> addrspace(4)* %ptr @@ -2045,47 +2025,48 @@ ; GFX9-NEXT: s_movk_i32 s4, 0xff ; GFX9-NEXT: s_lshr_b32 s5, s3, 2 ; GFX9-NEXT: s_and_b32 s3, s3, 3 -; GFX9-NEXT: s_and_b32 s2, s2, s4 +; GFX9-NEXT: s_and_b32 s2, s2, 0xff ; GFX9-NEXT: s_lshl_b32 s3, s3, 3 ; GFX9-NEXT: s_lshl_b32 s2, s2, s3 -; GFX9-NEXT: s_lshl_b32 s3, s4, s3 +; GFX9-NEXT: s_lshl_b32 s3, 0xff, s3 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s5, 1 ; GFX9-NEXT: s_not_b32 s3, s3 -; GFX9-NEXT: v_mov_b32_e32 v6, s2 -; GFX9-NEXT: v_mov_b32_e32 v4, 8 -; GFX9-NEXT: v_mov_b32_e32 v5, 16 +; GFX9-NEXT: v_mov_b32_e32 v7, s2 +; GFX9-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-NEXT: v_mov_b32_e32 v4, 0xff +; GFX9-NEXT: v_mov_b32_e32 v6, 16 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 24, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v10, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v12, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v11, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v13, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v10 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; GFX9-NEXT: v_and_or_b32 v1, v1, s4, v12 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; GFX9-NEXT: v_or3_b32 v0, v0, v11, v8 +; GFX9-NEXT: v_or3_b32 v1, v1, v13, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v0, v1, vcc +; GFX9-NEXT: v_and_or_b32 v7, v8, s3, v7 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s5, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v7, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v8, 24, v1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v9, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v11, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v10, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v12, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v9 +; GFX9-NEXT: v_lshlrev_b32_sdwa v9, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v10, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v6, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v0, v0, v4, v9 ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; GFX9-NEXT: v_and_or_b32 v1, v1, s4, v11 -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; GFX9-NEXT: v_and_or_b32 v1, v1, v4, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 24, v8 ; GFX9-NEXT: v_or3_b32 v0, v0, v10, v7 -; GFX9-NEXT: v_or3_b32 v1, v1, v12, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v0, v1, vcc -; GFX9-NEXT: v_and_or_b32 v6, v7, s3, v6 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s5, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 24, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v8, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v4, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v9, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v8 -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 24, v6 -; GFX9-NEXT: v_and_or_b32 v1, v1, s4, v4 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 24, v7 -; GFX9-NEXT: v_or3_b32 v0, v0, v9, v6 -; GFX9-NEXT: v_or3_b32 v1, v1, v5, v4 +; GFX9-NEXT: v_or3_b32 v1, v1, v6, v4 ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX9-NEXT: s_endpgm ; @@ -2094,15 +2075,14 @@ ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v4, 8 ; GFX8-NEXT: v_mov_b32_e32 v5, 16 -; GFX8-NEXT: s_lshr_b32 s1, s3, 2 -; GFX8-NEXT: s_and_b32 s3, s3, 3 -; GFX8-NEXT: s_movk_i32 s0, 0xff -; GFX8-NEXT: s_lshl_b32 s3, s3, 3 -; GFX8-NEXT: s_and_b32 s2, s2, s0 -; GFX8-NEXT: s_lshl_b32 s0, s0, s3 -; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s1, 1 -; GFX8-NEXT: s_not_b32 s0, s0 -; GFX8-NEXT: s_lshl_b32 s2, s2, s3 +; GFX8-NEXT: s_and_b32 s1, s3, 3 +; GFX8-NEXT: s_lshr_b32 s0, s3, 2 +; GFX8-NEXT: s_and_b32 s2, s2, 0xff +; GFX8-NEXT: s_lshl_b32 s1, s1, 3 +; GFX8-NEXT: s_lshl_b32 s2, s2, s1 +; GFX8-NEXT: s_lshl_b32 s1, 0xff, s1 +; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 +; GFX8-NEXT: s_not_b32 s1, s1 ; GFX8-NEXT: v_mov_b32_e32 v6, 8 ; GFX8-NEXT: v_mov_b32_e32 v7, 16 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 @@ -2123,9 +2103,9 @@ ; GFX8-NEXT: v_or_b32_e32 v0, v0, v8 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v4, s0, v4 +; GFX8-NEXT: v_and_b32_e32 v4, s1, v4 ; GFX8-NEXT: v_or_b32_e32 v4, s2, v4 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, 0 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX8-NEXT: v_lshlrev_b32_sdwa v8, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 @@ -2151,63 +2131,64 @@ ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_movk_i32 s6, 0xff -; GFX7-NEXT: s_and_b32 s1, s3, 3 -; GFX7-NEXT: s_lshr_b32 s0, s3, 2 -; GFX7-NEXT: s_and_b32 s2, s2, s6 -; GFX7-NEXT: s_lshl_b32 s1, s1, 3 -; GFX7-NEXT: s_lshl_b32 s2, s2, s1 -; GFX7-NEXT: s_lshl_b32 s1, s6, s1 -; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 -; GFX7-NEXT: s_not_b32 s1, s1 +; GFX7-NEXT: s_movk_i32 s0, 0xff +; GFX7-NEXT: s_lshr_b32 s1, s3, 2 +; GFX7-NEXT: s_and_b32 s3, s3, 3 +; GFX7-NEXT: s_and_b32 s2, s2, 0xff +; GFX7-NEXT: s_lshl_b32 s3, s3, 3 +; GFX7-NEXT: s_lshl_b32 s2, s2, s3 +; GFX7-NEXT: s_lshl_b32 s3, 0xff, s3 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s1, 1 +; GFX7-NEXT: s_not_b32 s3, s3 +; GFX7-NEXT: v_mov_b32_e32 v2, 0xff +; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_bfe_u32 v5, v0, 8, 8 -; GFX7-NEXT: v_bfe_u32 v7, v1, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v1 -; GFX7-NEXT: v_and_b32_e32 v4, s6, v0 +; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8 +; GFX7-NEXT: v_bfe_u32 v8, v1, 8, 8 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v1 +; GFX7-NEXT: v_and_b32_e32 v5, s0, v0 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_and_b32_e32 v6, s6, v1 +; GFX7-NEXT: v_and_b32_e32 v7, s0, v1 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 8, v8 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 -; GFX7-NEXT: v_or_b32_e32 v5, v6, v7 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v7, v8 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX7-NEXT: v_or_b32_e32 v1, v5, v1 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v2, s1, v2 -; GFX7-NEXT: v_or_b32_e32 v2, s2, v2 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0 -; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX7-NEXT: v_bfe_u32 v5, v0, 8, 8 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v5, v0 +; GFX7-NEXT: v_or_b32_e32 v1, v6, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX7-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc +; GFX7-NEXT: v_and_b32_e32 v3, s3, v3 +; GFX7-NEXT: v_or_b32_e32 v3, s2, v3 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, 0 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8 ; GFX7-NEXT: v_bfe_u32 v7, v1, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v1 -; GFX7-NEXT: v_and_b32_e32 v4, s6, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v1 +; GFX7-NEXT: v_and_b32_e32 v5, v0, v2 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_and_b32_e32 v6, s6, v1 +; GFX7-NEXT: v_and_b32_e32 v2, v1, v2 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v7, 8, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 -; GFX7-NEXT: v_or_b32_e32 v5, v6, v7 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX7-NEXT: v_or_b32_e32 v1, v5, v1 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v5, v0 +; GFX7-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 ; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; @@ -2216,8 +2197,7 @@ ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX10-NEXT: s_mov_b32 s0, 8 ; GFX10-NEXT: s_mov_b32 s1, 16 -; GFX10-NEXT: s_movk_i32 s4, 0xff -; GFX10-NEXT: s_and_b32 s2, s2, s4 +; GFX10-NEXT: s_and_b32 s2, s2, 0xff ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 @@ -2225,9 +2205,9 @@ ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v6, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v7, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v0, v0, s4, v3 +; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX10-NEXT: v_and_or_b32 v1, v1, s4, v5 +; GFX10-NEXT: v_and_or_b32 v1, v1, 0xff, v5 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v4 ; GFX10-NEXT: s_lshr_b32 s0, s3, 2 ; GFX10-NEXT: s_and_b32 s1, s3, 3 @@ -2236,7 +2216,7 @@ ; GFX10-NEXT: v_or3_b32 v1, v1, v7, v3 ; GFX10-NEXT: s_lshl_b32 s1, s1, 3 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s0, 0 -; GFX10-NEXT: s_lshl_b32 s3, s4, s1 +; GFX10-NEXT: s_lshl_b32 s3, 0xff, s1 ; GFX10-NEXT: s_lshl_b32 s1, s2, s1 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc_lo ; GFX10-NEXT: s_not_b32 s2, s3 @@ -2251,9 +2231,9 @@ ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v7, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v8, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v2, v0, s4, v5 +; GFX10-NEXT: v_and_or_b32 v2, 0xff, v0, v5 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX10-NEXT: v_and_or_b32 v3, v1, s4, v3 +; GFX10-NEXT: v_and_or_b32 v3, 0xff, v1, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v6 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -2271,28 +2251,27 @@ ; GFX9-LABEL: insertelement_s_v8i8_v_s: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX9-NEXT: s_mov_b32 s9, 0x80008 -; GFX9-NEXT: s_movk_i32 s7, 0xff -; GFX9-NEXT: v_and_b32_e32 v0, s7, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX9-NEXT: s_mov_b32 s2, 8 +; GFX9-NEXT: s_mov_b32 s3, 16 +; GFX9-NEXT: v_mov_b32_e32 v2, 0xff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_u32 s10, s0, s9 -; GFX9-NEXT: s_and_b32 s8, s0, s7 -; GFX9-NEXT: s_lshl_b32 s10, s10, 8 -; GFX9-NEXT: s_or_b32 s8, s8, s10 -; GFX9-NEXT: s_mov_b32 s10, 0x80010 +; GFX9-NEXT: s_bfe_u32 s8, s0, 0x80008 ; GFX9-NEXT: s_lshr_b32 s5, s0, 24 -; GFX9-NEXT: s_bfe_u32 s0, s0, s10 +; GFX9-NEXT: s_and_b32 s7, s0, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s8, 8 +; GFX9-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX9-NEXT: s_or_b32 s7, s7, s8 ; GFX9-NEXT: s_lshl_b32 s0, s0, 16 -; GFX9-NEXT: s_or_b32 s0, s8, s0 +; GFX9-NEXT: s_or_b32 s0, s7, s0 ; GFX9-NEXT: s_lshl_b32 s5, s5, 24 -; GFX9-NEXT: s_bfe_u32 s8, s1, s9 +; GFX9-NEXT: s_bfe_u32 s7, s1, 0x80008 ; GFX9-NEXT: s_lshr_b32 s6, s1, 24 ; GFX9-NEXT: s_or_b32 s0, s0, s5 -; GFX9-NEXT: s_and_b32 s5, s1, s7 -; GFX9-NEXT: s_lshl_b32 s8, s8, 8 -; GFX9-NEXT: s_bfe_u32 s1, s1, s10 -; GFX9-NEXT: s_or_b32 s5, s5, s8 +; GFX9-NEXT: s_and_b32 s5, s1, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s7, 8 +; GFX9-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX9-NEXT: s_or_b32 s5, s5, s7 ; GFX9-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9-NEXT: s_or_b32 s1, s5, s1 ; GFX9-NEXT: s_lshl_b32 s5, s6, 24 @@ -2302,59 +2281,55 @@ ; GFX9-NEXT: s_cselect_b32 s6, s1, s0 ; GFX9-NEXT: s_and_b32 s4, s4, 3 ; GFX9-NEXT: s_lshl_b32 s4, s4, 3 -; GFX9-NEXT: s_lshl_b32 s8, s7, s4 -; GFX9-NEXT: s_andn2_b32 s6, s6, s8 +; GFX9-NEXT: s_lshl_b32 s7, 0xff, s4 +; GFX9-NEXT: s_andn2_b32 s6, s6, s7 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: v_lshl_or_b32 v2, v0, s4, v1 +; GFX9-NEXT: v_lshl_or_b32 v3, v0, s4, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s5, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s5, 1 -; GFX9-NEXT: s_mov_b32 s3, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v4, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_and_or_b32 v4, v0, s7, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v5, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_and_or_b32 v5, v0, v2, v5 ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v1 -; GFX9-NEXT: v_or3_b32 v0, v4, v0, v2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_and_or_b32 v4, v1, s7, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v1 +; GFX9-NEXT: v_or3_b32 v0, v5, v0, v3 +; GFX9-NEXT: v_lshlrev_b32_sdwa v3, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_and_or_b32 v5, v1, v2, v3 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: v_or3_b32 v1, v4, v1, v5 +; GFX9-NEXT: v_or3_b32 v1, v5, v1, v4 ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_s_v8i8_v_s: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX8-NEXT: s_mov_b32 s7, 0x80008 -; GFX8-NEXT: s_movk_i32 s5, 0xff ; GFX8-NEXT: v_mov_b32_e32 v4, 8 ; GFX8-NEXT: v_mov_b32_e32 v6, 16 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_u32 s8, s0, s7 -; GFX8-NEXT: s_and_b32 s6, s0, s5 -; GFX8-NEXT: s_lshl_b32 s8, s8, 8 -; GFX8-NEXT: s_or_b32 s6, s6, s8 -; GFX8-NEXT: s_mov_b32 s8, 0x80010 +; GFX8-NEXT: s_bfe_u32 s6, s0, 0x80008 ; GFX8-NEXT: s_lshr_b32 s2, s0, 24 -; GFX8-NEXT: s_bfe_u32 s0, s0, s8 +; GFX8-NEXT: s_and_b32 s5, s0, 0xff +; GFX8-NEXT: s_lshl_b32 s6, s6, 8 +; GFX8-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX8-NEXT: s_or_b32 s5, s5, s6 ; GFX8-NEXT: s_lshl_b32 s0, s0, 16 -; GFX8-NEXT: s_or_b32 s0, s6, s0 +; GFX8-NEXT: s_or_b32 s0, s5, s0 ; GFX8-NEXT: s_lshl_b32 s2, s2, 24 -; GFX8-NEXT: s_bfe_u32 s6, s1, s7 +; GFX8-NEXT: s_bfe_u32 s5, s1, 0x80008 ; GFX8-NEXT: s_lshr_b32 s3, s1, 24 ; GFX8-NEXT: s_or_b32 s0, s0, s2 -; GFX8-NEXT: s_and_b32 s2, s1, s5 -; GFX8-NEXT: s_lshl_b32 s6, s6, 8 -; GFX8-NEXT: s_bfe_u32 s1, s1, s8 -; GFX8-NEXT: s_or_b32 s2, s2, s6 +; GFX8-NEXT: s_and_b32 s2, s1, 0xff +; GFX8-NEXT: s_lshl_b32 s5, s5, 8 +; GFX8-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX8-NEXT: s_or_b32 s2, s2, s5 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s1, s2, s1 ; GFX8-NEXT: s_lshl_b32 s2, s3, 24 @@ -2365,7 +2340,7 @@ ; GFX8-NEXT: s_and_b32 s4, s4, 3 ; GFX8-NEXT: s_lshl_b32 s4, s4, 3 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 -; GFX8-NEXT: s_lshl_b32 s4, s5, s4 +; GFX8-NEXT: s_lshl_b32 s4, 0xff, s4 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: s_andn2_b32 s3, s3, s4 ; GFX8-NEXT: v_or_b32_e32 v2, s3, v0 @@ -2397,27 +2372,25 @@ ; GFX7-LABEL: insertelement_s_v8i8_v_s: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX7-NEXT: s_mov_b32 s7, 0x80008 -; GFX7-NEXT: s_movk_i32 s5, 0xff -; GFX7-NEXT: v_and_b32_e32 v0, s5, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX7-NEXT: v_mov_b32_e32 v2, 0xff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_bfe_u32 s8, s0, s7 -; GFX7-NEXT: s_and_b32 s6, s0, s5 -; GFX7-NEXT: s_lshl_b32 s8, s8, 8 -; GFX7-NEXT: s_or_b32 s6, s6, s8 -; GFX7-NEXT: s_mov_b32 s8, 0x80010 +; GFX7-NEXT: s_bfe_u32 s6, s0, 0x80008 ; GFX7-NEXT: s_lshr_b32 s2, s0, 24 -; GFX7-NEXT: s_bfe_u32 s0, s0, s8 +; GFX7-NEXT: s_and_b32 s5, s0, 0xff +; GFX7-NEXT: s_lshl_b32 s6, s6, 8 +; GFX7-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX7-NEXT: s_or_b32 s5, s5, s6 ; GFX7-NEXT: s_lshl_b32 s0, s0, 16 -; GFX7-NEXT: s_or_b32 s0, s6, s0 +; GFX7-NEXT: s_or_b32 s0, s5, s0 ; GFX7-NEXT: s_lshl_b32 s2, s2, 24 -; GFX7-NEXT: s_bfe_u32 s6, s1, s7 +; GFX7-NEXT: s_bfe_u32 s5, s1, 0x80008 ; GFX7-NEXT: s_lshr_b32 s3, s1, 24 ; GFX7-NEXT: s_or_b32 s0, s0, s2 -; GFX7-NEXT: s_and_b32 s2, s1, s5 -; GFX7-NEXT: s_lshl_b32 s6, s6, 8 -; GFX7-NEXT: s_bfe_u32 s1, s1, s8 -; GFX7-NEXT: s_or_b32 s2, s2, s6 +; GFX7-NEXT: s_and_b32 s2, s1, 0xff +; GFX7-NEXT: s_lshl_b32 s5, s5, 8 +; GFX7-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX7-NEXT: s_or_b32 s2, s2, s5 ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 ; GFX7-NEXT: s_or_b32 s1, s2, s1 ; GFX7-NEXT: s_lshl_b32 s2, s3, 24 @@ -2428,34 +2401,34 @@ ; GFX7-NEXT: s_and_b32 s4, s4, 3 ; GFX7-NEXT: s_lshl_b32 s4, s4, 3 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX7-NEXT: s_lshl_b32 s4, s5, s4 +; GFX7-NEXT: s_lshl_b32 s4, 0xff, s4 ; GFX7-NEXT: s_andn2_b32 s3, s3, s4 -; GFX7-NEXT: v_or_b32_e32 v2, s3, v0 +; GFX7-NEXT: v_or_b32_e32 v3, s3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s2, 1 -; GFX7-NEXT: v_bfe_u32 v5, v0, 8, 8 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v0 -; GFX7-NEXT: v_and_b32_e32 v4, s5, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX7-NEXT: v_and_b32_e32 v5, v0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX7-NEXT: v_bfe_u32 v4, v1, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v1 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_and_b32_e32 v2, s5, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v5, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX7-NEXT: v_bfe_u32 v3, v1, 8, 8 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v1 +; GFX7-NEXT: v_and_b32_e32 v2, v1, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_or_b32_e32 v1, v2, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v4 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-NEXT: s_mov_b32 s2, -1 @@ -2466,57 +2439,54 @@ ; GFX10-LABEL: insertelement_s_v8i8_v_s: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX10-NEXT: s_mov_b32 s3, 0x80008 -; GFX10-NEXT: s_movk_i32 s2, 0xff -; GFX10-NEXT: s_mov_b32 s5, 0x80010 -; GFX10-NEXT: s_lshr_b32 s6, s4, 2 -; GFX10-NEXT: v_and_b32_e32 v2, s2, v0 -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s6, 0 +; GFX10-NEXT: s_lshr_b32 s2, s4, 2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xff, v0 +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_bfe_u32 s10, s0, s3 -; GFX10-NEXT: s_bfe_u32 s3, s1, s3 -; GFX10-NEXT: s_lshr_b32 s7, s0, 24 -; GFX10-NEXT: s_lshr_b32 s8, s1, 24 -; GFX10-NEXT: s_and_b32 s9, s0, s2 -; GFX10-NEXT: s_bfe_u32 s0, s0, s5 -; GFX10-NEXT: s_and_b32 s11, s1, s2 -; GFX10-NEXT: s_bfe_u32 s1, s1, s5 -; GFX10-NEXT: s_lshl_b32 s5, s10, 8 -; GFX10-NEXT: s_lshl_b32 s3, s3, 8 +; GFX10-NEXT: s_bfe_u32 s7, s0, 0x80008 +; GFX10-NEXT: s_bfe_u32 s9, s1, 0x80008 +; GFX10-NEXT: s_lshr_b32 s3, s0, 24 +; GFX10-NEXT: s_lshr_b32 s5, s1, 24 +; GFX10-NEXT: s_and_b32 s6, s0, 0xff +; GFX10-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX10-NEXT: s_and_b32 s8, s1, 0xff +; GFX10-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX10-NEXT: s_lshl_b32 s7, s7, 8 +; GFX10-NEXT: s_lshl_b32 s9, s9, 8 ; GFX10-NEXT: s_lshl_b32 s0, s0, 16 ; GFX10-NEXT: s_lshl_b32 s1, s1, 16 -; GFX10-NEXT: s_or_b32 s5, s9, s5 -; GFX10-NEXT: s_or_b32 s3, s11, s3 -; GFX10-NEXT: s_lshl_b32 s7, s7, 24 -; GFX10-NEXT: s_lshl_b32 s8, s8, 24 -; GFX10-NEXT: s_or_b32 s0, s5, s0 -; GFX10-NEXT: s_or_b32 s1, s3, s1 -; GFX10-NEXT: s_or_b32 s0, s0, s7 -; GFX10-NEXT: s_or_b32 s1, s1, s8 -; GFX10-NEXT: s_cmp_eq_u32 s6, 1 +; GFX10-NEXT: s_or_b32 s6, s6, s7 +; GFX10-NEXT: s_or_b32 s7, s8, s9 +; GFX10-NEXT: s_lshl_b32 s3, s3, 24 +; GFX10-NEXT: s_lshl_b32 s5, s5, 24 +; GFX10-NEXT: s_or_b32 s0, s6, s0 +; GFX10-NEXT: s_or_b32 s1, s7, s1 +; GFX10-NEXT: s_or_b32 s0, s0, s3 +; GFX10-NEXT: s_or_b32 s1, s1, s5 +; GFX10-NEXT: s_cmp_eq_u32 s2, 1 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: s_cselect_b32 s3, s1, s0 ; GFX10-NEXT: s_and_b32 s4, s4, 3 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: s_lshl_b32 s4, s4, 3 ; GFX10-NEXT: s_mov_b32 s0, 8 -; GFX10-NEXT: s_lshl_b32 s5, s2, s4 +; GFX10-NEXT: s_lshl_b32 s5, 0xff, s4 ; GFX10-NEXT: s_mov_b32 s1, 16 ; GFX10-NEXT: s_andn2_b32 s3, s3, s5 ; GFX10-NEXT: v_lshl_or_b32 v2, v2, s4, s3 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s6, 1 +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX10-NEXT: v_lshlrev_b32_sdwa v6, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v3, v0, s2, v3 +; GFX10-NEXT: v_and_or_b32 v3, 0xff, v0, v3 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v7, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX10-NEXT: v_and_or_b32 v5, v1, s2, v5 +; GFX10-NEXT: v_and_or_b32 v5, 0xff, v1, v5 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_or3_b32 v2, v3, v6, v2 @@ -2533,27 +2503,26 @@ ; GFX9-LABEL: insertelement_s_v8i8_s_v: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX9-NEXT: s_mov_b32 s9, 0x80008 -; GFX9-NEXT: s_movk_i32 s7, 0xff ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX9-NEXT: s_movk_i32 s7, 0xff +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_u32 s10, s0, s9 -; GFX9-NEXT: s_and_b32 s8, s0, s7 -; GFX9-NEXT: s_lshl_b32 s10, s10, 8 -; GFX9-NEXT: s_or_b32 s8, s8, s10 -; GFX9-NEXT: s_mov_b32 s10, 0x80010 +; GFX9-NEXT: s_bfe_u32 s9, s0, 0x80008 ; GFX9-NEXT: s_lshr_b32 s5, s0, 24 -; GFX9-NEXT: s_bfe_u32 s0, s0, s10 +; GFX9-NEXT: s_and_b32 s8, s0, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s9, 8 +; GFX9-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX9-NEXT: s_or_b32 s8, s8, s9 ; GFX9-NEXT: s_lshl_b32 s0, s0, 16 ; GFX9-NEXT: s_or_b32 s0, s8, s0 ; GFX9-NEXT: s_lshl_b32 s5, s5, 24 -; GFX9-NEXT: s_bfe_u32 s8, s1, s9 +; GFX9-NEXT: s_bfe_u32 s8, s1, 0x80008 ; GFX9-NEXT: s_lshr_b32 s6, s1, 24 ; GFX9-NEXT: s_or_b32 s0, s0, s5 -; GFX9-NEXT: s_and_b32 s5, s1, s7 +; GFX9-NEXT: s_and_b32 s5, s1, 0xff ; GFX9-NEXT: s_lshl_b32 s8, s8, 8 -; GFX9-NEXT: s_bfe_u32 s1, s1, s10 +; GFX9-NEXT: s_bfe_u32 s1, s1, 0x80010 ; GFX9-NEXT: s_or_b32 s5, s5, s8 ; GFX9-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9-NEXT: s_or_b32 s1, s5, s1 @@ -2561,9 +2530,8 @@ ; GFX9-NEXT: s_or_b32 s1, s1, s5 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX9-NEXT: s_and_b32 s4, s4, s7 +; GFX9-NEXT: s_and_b32 s4, s4, 0xff ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_lshlrev_b32_e64 v3, v0, s4 ; GFX9-NEXT: v_lshlrev_b32_e64 v0, v0, s7 @@ -2575,16 +2543,17 @@ ; GFX9-NEXT: s_mov_b32 s2, 8 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] ; GFX9-NEXT: s_mov_b32 s3, 16 +; GFX9-NEXT: v_mov_b32_e32 v4, 0xff ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v4, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v5, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX9-NEXT: v_and_or_b32 v4, v0, s7, v4 +; GFX9-NEXT: v_and_or_b32 v5, v0, v4, v5 ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v1 -; GFX9-NEXT: v_or3_b32 v0, v4, v0, v2 +; GFX9-NEXT: v_or3_b32 v0, v5, v0, v2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_and_or_b32 v4, v1, s7, v2 +; GFX9-NEXT: v_and_or_b32 v4, v1, v4, v2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v3 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -2596,27 +2565,26 @@ ; GFX8-LABEL: insertelement_s_v8i8_s_v: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX8-NEXT: s_mov_b32 s7, 0x80008 -; GFX8-NEXT: s_movk_i32 s5, 0xff ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX8-NEXT: s_movk_i32 s5, 0xff +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_u32 s8, s0, s7 -; GFX8-NEXT: s_and_b32 s6, s0, s5 -; GFX8-NEXT: s_lshl_b32 s8, s8, 8 -; GFX8-NEXT: s_or_b32 s6, s6, s8 -; GFX8-NEXT: s_mov_b32 s8, 0x80010 +; GFX8-NEXT: s_bfe_u32 s7, s0, 0x80008 ; GFX8-NEXT: s_lshr_b32 s2, s0, 24 -; GFX8-NEXT: s_bfe_u32 s0, s0, s8 +; GFX8-NEXT: s_and_b32 s6, s0, 0xff +; GFX8-NEXT: s_lshl_b32 s7, s7, 8 +; GFX8-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX8-NEXT: s_or_b32 s6, s6, s7 ; GFX8-NEXT: s_lshl_b32 s0, s0, 16 ; GFX8-NEXT: s_or_b32 s0, s6, s0 ; GFX8-NEXT: s_lshl_b32 s2, s2, 24 -; GFX8-NEXT: s_bfe_u32 s6, s1, s7 +; GFX8-NEXT: s_bfe_u32 s6, s1, 0x80008 ; GFX8-NEXT: s_lshr_b32 s3, s1, 24 ; GFX8-NEXT: s_or_b32 s0, s0, s2 -; GFX8-NEXT: s_and_b32 s2, s1, s5 +; GFX8-NEXT: s_and_b32 s2, s1, 0xff ; GFX8-NEXT: s_lshl_b32 s6, s6, 8 -; GFX8-NEXT: s_bfe_u32 s1, s1, s8 +; GFX8-NEXT: s_bfe_u32 s1, s1, 0x80010 ; GFX8-NEXT: s_or_b32 s2, s2, s6 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s1, s2, s1 @@ -2624,9 +2592,8 @@ ; GFX8-NEXT: s_or_b32 s1, s1, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s0 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX8-NEXT: s_and_b32 s2, s4, s5 +; GFX8-NEXT: s_and_b32 s2, s4, 0xff ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX8-NEXT: v_lshlrev_b32_e64 v3, v0, s2 ; GFX8-NEXT: v_lshlrev_b32_e64 v0, v0, s5 @@ -2662,27 +2629,26 @@ ; GFX7-LABEL: insertelement_s_v8i8_s_v: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX7-NEXT: s_mov_b32 s7, 0x80008 -; GFX7-NEXT: s_movk_i32 s5, 0xff ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 2, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX7-NEXT: s_movk_i32 s5, 0xff +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_bfe_u32 s8, s0, s7 -; GFX7-NEXT: s_and_b32 s6, s0, s5 -; GFX7-NEXT: s_lshl_b32 s8, s8, 8 -; GFX7-NEXT: s_or_b32 s6, s6, s8 -; GFX7-NEXT: s_mov_b32 s8, 0x80010 +; GFX7-NEXT: s_bfe_u32 s7, s0, 0x80008 ; GFX7-NEXT: s_lshr_b32 s2, s0, 24 -; GFX7-NEXT: s_bfe_u32 s0, s0, s8 +; GFX7-NEXT: s_and_b32 s6, s0, 0xff +; GFX7-NEXT: s_lshl_b32 s7, s7, 8 +; GFX7-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX7-NEXT: s_or_b32 s6, s6, s7 ; GFX7-NEXT: s_lshl_b32 s0, s0, 16 ; GFX7-NEXT: s_or_b32 s0, s6, s0 ; GFX7-NEXT: s_lshl_b32 s2, s2, 24 -; GFX7-NEXT: s_bfe_u32 s6, s1, s7 +; GFX7-NEXT: s_bfe_u32 s6, s1, 0x80008 ; GFX7-NEXT: s_lshr_b32 s3, s1, 24 ; GFX7-NEXT: s_or_b32 s0, s0, s2 -; GFX7-NEXT: s_and_b32 s2, s1, s5 +; GFX7-NEXT: s_and_b32 s2, s1, 0xff ; GFX7-NEXT: s_lshl_b32 s6, s6, 8 -; GFX7-NEXT: s_bfe_u32 s1, s1, s8 +; GFX7-NEXT: s_bfe_u32 s1, s1, 0x80010 ; GFX7-NEXT: s_or_b32 s2, s2, s6 ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 ; GFX7-NEXT: s_or_b32 s1, s2, s1 @@ -2690,9 +2656,8 @@ ; GFX7-NEXT: s_or_b32 s1, s1, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s0 ; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX7-NEXT: s_and_b32 s2, s4, s5 +; GFX7-NEXT: s_and_b32 s2, s4, 0xff ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX7-NEXT: v_lshl_b32_e32 v3, s2, v0 ; GFX7-NEXT: v_lshl_b32_e32 v0, s5, v0 @@ -2716,7 +2681,7 @@ ; GFX7-NEXT: v_bfe_u32 v4, v1, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_and_b32_e32 v2, s5, v1 +; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v4 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 @@ -2734,38 +2699,35 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX10-NEXT: v_and_b32_e32 v1, 3, v0 -; GFX10-NEXT: s_mov_b32 s3, 0x80008 -; GFX10-NEXT: s_movk_i32 s2, 0xff -; GFX10-NEXT: s_mov_b32 s5, 0x80010 -; GFX10-NEXT: s_and_b32 s4, s4, s2 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX10-NEXT: s_and_b32 s2, s4, 0xff ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 2, v0 -; GFX10-NEXT: v_lshlrev_b32_e64 v3, v1, s4 -; GFX10-NEXT: v_lshlrev_b32_e64 v0, v1, s2 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 +; GFX10-NEXT: v_lshlrev_b32_e64 v3, v1, s2 +; GFX10-NEXT: v_lshlrev_b32_e64 v0, v1, 0xff ; GFX10-NEXT: v_xor_b32_e32 v4, -1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_bfe_u32 s8, s0, s3 -; GFX10-NEXT: s_bfe_u32 s3, s1, s3 -; GFX10-NEXT: s_lshr_b32 s6, s1, 24 -; GFX10-NEXT: s_and_b32 s9, s1, s2 -; GFX10-NEXT: s_bfe_u32 s1, s1, s5 -; GFX10-NEXT: s_lshl_b32 s3, s3, 8 +; GFX10-NEXT: s_bfe_u32 s7, s1, 0x80008 +; GFX10-NEXT: s_lshr_b32 s3, s1, 24 +; GFX10-NEXT: s_and_b32 s6, s1, 0xff +; GFX10-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX10-NEXT: s_lshl_b32 s7, s7, 8 ; GFX10-NEXT: s_lshl_b32 s1, s1, 16 -; GFX10-NEXT: s_or_b32 s3, s9, s3 -; GFX10-NEXT: s_lshl_b32 s6, s6, 24 -; GFX10-NEXT: s_or_b32 s1, s3, s1 -; GFX10-NEXT: s_lshr_b32 s4, s0, 24 -; GFX10-NEXT: s_and_b32 s7, s0, s2 -; GFX10-NEXT: s_bfe_u32 s0, s0, s5 -; GFX10-NEXT: s_lshl_b32 s5, s8, 8 -; GFX10-NEXT: s_or_b32 s1, s1, s6 +; GFX10-NEXT: s_or_b32 s6, s6, s7 +; GFX10-NEXT: s_bfe_u32 s5, s0, 0x80008 +; GFX10-NEXT: s_lshl_b32 s3, s3, 24 +; GFX10-NEXT: s_or_b32 s1, s6, s1 +; GFX10-NEXT: s_lshr_b32 s2, s0, 24 +; GFX10-NEXT: s_and_b32 s4, s0, 0xff +; GFX10-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX10-NEXT: s_lshl_b32 s5, s5, 8 +; GFX10-NEXT: s_or_b32 s1, s1, s3 ; GFX10-NEXT: s_lshl_b32 s0, s0, 16 -; GFX10-NEXT: s_lshl_b32 s3, s4, 24 -; GFX10-NEXT: s_or_b32 s4, s7, s5 +; GFX10-NEXT: s_or_b32 s3, s4, s5 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: s_or_b32 s0, s4, s0 -; GFX10-NEXT: s_or_b32 s0, s0, s3 +; GFX10-NEXT: s_lshl_b32 s2, s2, 24 +; GFX10-NEXT: s_or_b32 s0, s3, s0 +; GFX10-NEXT: s_or_b32 s0, s0, s2 ; GFX10-NEXT: v_cndmask_b32_e32 v5, s0, v1, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 @@ -2781,9 +2743,9 @@ ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v6, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v7, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v3, v0, s2, v3 +; GFX10-NEXT: v_and_or_b32 v3, 0xff, v0, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX10-NEXT: v_and_or_b32 v5, v1, s2, v5 +; GFX10-NEXT: v_and_or_b32 v5, 0xff, v1, v5 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -2801,33 +2763,31 @@ ; GFX9-LABEL: insertelement_s_v8i8_v_v: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX9-NEXT: s_mov_b32 s8, 0x80008 -; GFX9-NEXT: s_movk_i32 s6, 0xff ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 2, v1 ; GFX9-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX9-NEXT: s_movk_i32 s6, 0xff +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_u32 s9, s0, s8 -; GFX9-NEXT: s_and_b32 s7, s0, s6 -; GFX9-NEXT: s_lshl_b32 s9, s9, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: s_mov_b32 s9, 0x80010 +; GFX9-NEXT: s_bfe_u32 s8, s0, 0x80008 ; GFX9-NEXT: s_lshr_b32 s4, s0, 24 -; GFX9-NEXT: s_bfe_u32 s0, s0, s9 +; GFX9-NEXT: s_and_b32 s7, s0, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s8, 8 +; GFX9-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX9-NEXT: s_or_b32 s7, s7, s8 ; GFX9-NEXT: s_lshl_b32 s0, s0, 16 ; GFX9-NEXT: s_or_b32 s0, s7, s0 ; GFX9-NEXT: s_lshl_b32 s4, s4, 24 -; GFX9-NEXT: s_bfe_u32 s7, s1, s8 +; GFX9-NEXT: s_bfe_u32 s7, s1, 0x80008 ; GFX9-NEXT: s_lshr_b32 s5, s1, 24 ; GFX9-NEXT: s_or_b32 s0, s0, s4 -; GFX9-NEXT: s_and_b32 s4, s1, s6 +; GFX9-NEXT: s_and_b32 s4, s1, 0xff ; GFX9-NEXT: s_lshl_b32 s7, s7, 8 -; GFX9-NEXT: s_bfe_u32 s1, s1, s9 +; GFX9-NEXT: s_bfe_u32 s1, s1, 0x80010 ; GFX9-NEXT: s_or_b32 s4, s4, s7 ; GFX9-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9-NEXT: s_or_b32 s1, s4, s1 ; GFX9-NEXT: s_lshl_b32 s4, s5, 24 ; GFX9-NEXT: s_or_b32 s1, s1, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX9-NEXT: v_mov_b32_e32 v3, s0 ; GFX9-NEXT: v_mov_b32_e32 v4, s1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 @@ -2842,16 +2802,17 @@ ; GFX9-NEXT: s_mov_b32 s2, 8 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] ; GFX9-NEXT: s_mov_b32 s3, 16 +; GFX9-NEXT: v_mov_b32_e32 v4, 0xff ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v4, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v5, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX9-NEXT: v_and_or_b32 v4, v0, s6, v4 +; GFX9-NEXT: v_and_or_b32 v5, v0, v4, v5 ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v1 -; GFX9-NEXT: v_or3_b32 v0, v4, v0, v2 +; GFX9-NEXT: v_or3_b32 v0, v5, v0, v2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_and_or_b32 v4, v1, s6, v2 +; GFX9-NEXT: v_and_or_b32 v4, v1, v4, v2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v3 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -2863,33 +2824,31 @@ ; GFX8-LABEL: insertelement_s_v8i8_v_v: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX8-NEXT: s_mov_b32 s6, 0x80008 -; GFX8-NEXT: s_movk_i32 s4, 0xff ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 2, v1 ; GFX8-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX8-NEXT: s_movk_i32 s4, 0xff +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_u32 s7, s0, s6 -; GFX8-NEXT: s_and_b32 s5, s0, s4 -; GFX8-NEXT: s_lshl_b32 s7, s7, 8 -; GFX8-NEXT: s_or_b32 s5, s5, s7 -; GFX8-NEXT: s_mov_b32 s7, 0x80010 +; GFX8-NEXT: s_bfe_u32 s6, s0, 0x80008 ; GFX8-NEXT: s_lshr_b32 s2, s0, 24 -; GFX8-NEXT: s_bfe_u32 s0, s0, s7 +; GFX8-NEXT: s_and_b32 s5, s0, 0xff +; GFX8-NEXT: s_lshl_b32 s6, s6, 8 +; GFX8-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX8-NEXT: s_or_b32 s5, s5, s6 ; GFX8-NEXT: s_lshl_b32 s0, s0, 16 ; GFX8-NEXT: s_or_b32 s0, s5, s0 ; GFX8-NEXT: s_lshl_b32 s2, s2, 24 -; GFX8-NEXT: s_bfe_u32 s5, s1, s6 +; GFX8-NEXT: s_bfe_u32 s5, s1, 0x80008 ; GFX8-NEXT: s_lshr_b32 s3, s1, 24 ; GFX8-NEXT: s_or_b32 s0, s0, s2 -; GFX8-NEXT: s_and_b32 s2, s1, s4 +; GFX8-NEXT: s_and_b32 s2, s1, 0xff ; GFX8-NEXT: s_lshl_b32 s5, s5, 8 -; GFX8-NEXT: s_bfe_u32 s1, s1, s7 +; GFX8-NEXT: s_bfe_u32 s1, s1, 0x80010 ; GFX8-NEXT: s_or_b32 s2, s2, s5 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s1, s2, s1 ; GFX8-NEXT: s_lshl_b32 s2, s3, 24 ; GFX8-NEXT: s_or_b32 s1, s1, s2 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX8-NEXT: v_mov_b32_e32 v3, s0 ; GFX8-NEXT: v_mov_b32_e32 v4, s1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 @@ -2928,33 +2887,31 @@ ; GFX7-LABEL: insertelement_s_v8i8_v_v: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX7-NEXT: s_mov_b32 s6, 0x80008 ; GFX7-NEXT: s_movk_i32 s4, 0xff ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 2, v1 ; GFX7-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_bfe_u32 s7, s0, s6 -; GFX7-NEXT: s_and_b32 s5, s0, s4 -; GFX7-NEXT: s_lshl_b32 s7, s7, 8 -; GFX7-NEXT: s_or_b32 s5, s5, s7 -; GFX7-NEXT: s_mov_b32 s7, 0x80010 +; GFX7-NEXT: s_bfe_u32 s6, s0, 0x80008 ; GFX7-NEXT: s_lshr_b32 s2, s0, 24 -; GFX7-NEXT: s_bfe_u32 s0, s0, s7 +; GFX7-NEXT: s_and_b32 s5, s0, 0xff +; GFX7-NEXT: s_lshl_b32 s6, s6, 8 +; GFX7-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX7-NEXT: s_or_b32 s5, s5, s6 ; GFX7-NEXT: s_lshl_b32 s0, s0, 16 ; GFX7-NEXT: s_or_b32 s0, s5, s0 ; GFX7-NEXT: s_lshl_b32 s2, s2, 24 -; GFX7-NEXT: s_bfe_u32 s5, s1, s6 +; GFX7-NEXT: s_bfe_u32 s5, s1, 0x80008 ; GFX7-NEXT: s_lshr_b32 s3, s1, 24 ; GFX7-NEXT: s_or_b32 s0, s0, s2 -; GFX7-NEXT: s_and_b32 s2, s1, s4 +; GFX7-NEXT: s_and_b32 s2, s1, 0xff ; GFX7-NEXT: s_lshl_b32 s5, s5, 8 -; GFX7-NEXT: s_bfe_u32 s1, s1, s7 +; GFX7-NEXT: s_bfe_u32 s1, s1, 0x80010 ; GFX7-NEXT: s_or_b32 s2, s2, s5 ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 ; GFX7-NEXT: s_or_b32 s1, s2, s1 ; GFX7-NEXT: s_lshl_b32 s2, s3, 24 ; GFX7-NEXT: s_or_b32 s1, s1, s2 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 ; GFX7-NEXT: v_mov_b32_e32 v3, s0 ; GFX7-NEXT: v_mov_b32_e32 v4, s1 @@ -2982,7 +2939,7 @@ ; GFX7-NEXT: v_bfe_u32 v4, v1, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_and_b32_e32 v2, s4, v1 +; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v4 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 @@ -2999,38 +2956,35 @@ ; GFX10-LABEL: insertelement_s_v8i8_v_v: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX10-NEXT: s_mov_b32 s3, 0x80008 ; GFX10-NEXT: v_and_b32_e32 v2, 3, v1 -; GFX10-NEXT: s_movk_i32 s2, 0xff -; GFX10-NEXT: s_mov_b32 s4, 0x80010 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 2, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v2 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 ; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_lshlrev_b32_e64 v0, v2, s2 +; GFX10-NEXT: v_lshlrev_b32_e64 v0, v2, 0xff ; GFX10-NEXT: v_xor_b32_e32 v2, -1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_bfe_u32 s8, s0, s3 -; GFX10-NEXT: s_bfe_u32 s3, s1, s3 -; GFX10-NEXT: s_lshr_b32 s6, s1, 24 -; GFX10-NEXT: s_and_b32 s9, s1, s2 -; GFX10-NEXT: s_bfe_u32 s1, s1, s4 -; GFX10-NEXT: s_lshl_b32 s3, s3, 8 +; GFX10-NEXT: s_bfe_u32 s7, s1, 0x80008 +; GFX10-NEXT: s_lshr_b32 s3, s1, 24 +; GFX10-NEXT: s_and_b32 s6, s1, 0xff +; GFX10-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX10-NEXT: s_lshl_b32 s7, s7, 8 ; GFX10-NEXT: s_lshl_b32 s1, s1, 16 -; GFX10-NEXT: s_or_b32 s3, s9, s3 -; GFX10-NEXT: s_lshl_b32 s6, s6, 24 -; GFX10-NEXT: s_or_b32 s1, s3, s1 -; GFX10-NEXT: s_lshr_b32 s5, s0, 24 -; GFX10-NEXT: s_and_b32 s7, s0, s2 -; GFX10-NEXT: s_bfe_u32 s0, s0, s4 -; GFX10-NEXT: s_lshl_b32 s4, s8, 8 -; GFX10-NEXT: s_or_b32 s1, s1, s6 +; GFX10-NEXT: s_or_b32 s6, s6, s7 +; GFX10-NEXT: s_bfe_u32 s5, s0, 0x80008 +; GFX10-NEXT: s_lshl_b32 s3, s3, 24 +; GFX10-NEXT: s_or_b32 s1, s6, s1 +; GFX10-NEXT: s_lshr_b32 s2, s0, 24 +; GFX10-NEXT: s_and_b32 s4, s0, 0xff +; GFX10-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX10-NEXT: s_lshl_b32 s5, s5, 8 +; GFX10-NEXT: s_or_b32 s1, s1, s3 ; GFX10-NEXT: s_lshl_b32 s0, s0, 16 -; GFX10-NEXT: s_or_b32 s4, s7, s4 +; GFX10-NEXT: s_or_b32 s3, s4, s5 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: s_lshl_b32 s3, s5, 24 -; GFX10-NEXT: s_or_b32 s0, s4, s0 -; GFX10-NEXT: s_or_b32 s0, s0, s3 +; GFX10-NEXT: s_lshl_b32 s2, s2, 24 +; GFX10-NEXT: s_or_b32 s0, s3, s0 +; GFX10-NEXT: s_or_b32 s0, s0, s2 ; GFX10-NEXT: v_cndmask_b32_e32 v5, s0, v1, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 @@ -3046,9 +3000,9 @@ ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v6, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v7, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v3, v0, s2, v3 +; GFX10-NEXT: v_and_or_b32 v3, 0xff, v0, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX10-NEXT: v_and_or_b32 v5, v1, s2, v5 +; GFX10-NEXT: v_and_or_b32 v5, 0xff, v1, v5 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -3069,102 +3023,103 @@ ; GFX9-NEXT: s_mov_b32 s0, 8 ; GFX9-NEXT: s_mov_b32 s1, 16 ; GFX9-NEXT: s_movk_i32 s3, 0xff -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 2, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 2, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 3, v2 -; GFX9-NEXT: s_and_b32 s2, s2, s3 +; GFX9-NEXT: v_mov_b32_e32 v5, 0xff +; GFX9-NEXT: s_and_b32 s2, s2, 0xff ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX9-NEXT: v_lshlrev_b32_e64 v8, v2, s2 -; GFX9-NEXT: v_lshlrev_b32_e64 v2, v2, s3 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 +; GFX9-NEXT: v_lshlrev_b32_e64 v9, v2, s2 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, v2, v5 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 ; GFX9-NEXT: v_xor_b32_e32 v2, -1, v2 -; GFX9-NEXT: v_mov_b32_e32 v5, 8 -; GFX9-NEXT: v_mov_b32_e32 v6, 16 +; GFX9-NEXT: v_mov_b32_e32 v6, 8 +; GFX9-NEXT: v_mov_b32_e32 v7, 16 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 24, v1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v11, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v13, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v12, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v14, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v0, v0, s3, v11 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 24, v9 -; GFX9-NEXT: v_and_or_b32 v1, v1, s3, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 24, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v12, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v14, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v13, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v15, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v0, v0, s3, v12 ; GFX9-NEXT: v_lshlrev_b32_e32 v10, 24, v10 -; GFX9-NEXT: v_or3_b32 v0, v0, v12, v9 -; GFX9-NEXT: v_or3_b32 v1, v1, v14, v10 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v0, v1, vcc -; GFX9-NEXT: v_and_or_b32 v2, v9, v2, v8 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v7 +; GFX9-NEXT: v_and_or_b32 v1, v1, s3, v14 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; GFX9-NEXT: v_or3_b32 v0, v0, v13, v10 +; GFX9-NEXT: v_or3_b32 v1, v1, v15, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v0, v1, vcc +; GFX9-NEXT: v_and_or_b32 v2, v10, v2, v9 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v8, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v9, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v6, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v0, v0, s3, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 24, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v9, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v6, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v10, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v7, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v0, v0, v5, v9 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX9-NEXT: v_and_or_b32 v1, v1, s3, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v7 -; GFX9-NEXT: v_or3_b32 v0, v0, v9, v2 -; GFX9-NEXT: v_or3_b32 v1, v1, v6, v5 +; GFX9-NEXT: v_and_or_b32 v1, v1, v5, v6 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v8 +; GFX9-NEXT: v_or3_b32 v0, v0, v10, v2 +; GFX9-NEXT: v_or3_b32 v1, v1, v7, v5 ; GFX9-NEXT: global_store_dwordx2 v[3:4], v[0:1], off ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_v_v8i8_s_v: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v5, 8 -; GFX8-NEXT: v_mov_b32_e32 v6, 16 -; GFX8-NEXT: s_movk_i32 s0, 0xff -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 2, v2 +; GFX8-NEXT: v_mov_b32_e32 v6, 8 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 2, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 3, v2 -; GFX8-NEXT: s_and_b32 s1, s2, s0 +; GFX8-NEXT: v_mov_b32_e32 v5, 0xff +; GFX8-NEXT: v_mov_b32_e32 v7, 16 +; GFX8-NEXT: s_and_b32 s0, s2, 0xff ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX8-NEXT: v_lshlrev_b32_e64 v10, v2, s1 -; GFX8-NEXT: v_lshlrev_b32_e64 v2, v2, s0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9 +; GFX8-NEXT: v_lshlrev_b32_e64 v11, v2, s0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, v2, v5 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10 ; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v9 -; GFX8-NEXT: v_mov_b32_e32 v7, 8 -; GFX8-NEXT: v_mov_b32_e32 v8, 16 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v10 +; GFX8-NEXT: v_mov_b32_e32 v8, 8 +; GFX8-NEXT: v_mov_b32_e32 v9, 16 ; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: v_mov_b32_e32 v4, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_sdwa v13, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v13, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 24, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v12, 24, v1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v14, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v14, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v7, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v11, 24, v11 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 24, v12 +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 24, v12 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v14 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v7 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v6 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v11 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v5 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc ; GFX8-NEXT: v_and_b32_e32 v2, v5, v2 -; GFX8-NEXT: v_or_b32_e32 v2, v2, v10 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v11 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v7, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v8, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 24, v1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v9, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v8, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v7, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v9, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 24, v5 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v9 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v8 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v7 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v9 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v5 ; GFX8-NEXT: flat_store_dwordx2 v[3:4], v[0:1] @@ -3176,63 +3131,64 @@ ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_movk_i32 s3, 0xff -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 2, v2 +; GFX7-NEXT: s_movk_i32 s0, 0xff +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 2, v2 ; GFX7-NEXT: v_and_b32_e32 v2, 3, v2 -; GFX7-NEXT: s_and_b32 s0, s2, s3 +; GFX7-NEXT: v_mov_b32_e32 v3, 0xff +; GFX7-NEXT: s_and_b32 s1, s2, 0xff ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX7-NEXT: v_lshl_b32_e32 v4, s0, v2 -; GFX7-NEXT: v_lshl_b32_e32 v2, s3, v2 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 +; GFX7-NEXT: v_lshl_b32_e32 v5, s1, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, v2, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 ; GFX7-NEXT: v_xor_b32_e32 v2, -1, v2 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v3 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_bfe_u32 v8, v0, 8, 8 -; GFX7-NEXT: v_bfe_u32 v10, v1, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v1 -; GFX7-NEXT: v_and_b32_e32 v7, s3, v0 +; GFX7-NEXT: v_bfe_u32 v9, v0, 8, 8 +; GFX7-NEXT: v_bfe_u32 v11, v1, 8, 8 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX7-NEXT: v_and_b32_e32 v8, s0, v0 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_and_b32_e32 v9, s3, v1 +; GFX7-NEXT: v_and_b32_e32 v10, s0, v1 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v8, 8, v8 -; GFX7-NEXT: v_lshlrev_b32_e32 v10, 8, v10 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX7-NEXT: v_lshlrev_b32_e32 v11, 8, v11 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_or_b32_e32 v7, v7, v8 -; GFX7-NEXT: v_or_b32_e32 v8, v9, v10 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX7-NEXT: v_or_b32_e32 v8, v8, v9 +; GFX7-NEXT: v_or_b32_e32 v9, v10, v11 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 24, v6 -; GFX7-NEXT: v_or_b32_e32 v0, v7, v0 -; GFX7-NEXT: v_or_b32_e32 v1, v8, v1 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v5 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v6 -; GFX7-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v2, v5, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX7-NEXT: v_or_b32_e32 v0, v8, v0 +; GFX7-NEXT: v_or_b32_e32 v1, v9, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v6 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v7 +; GFX7-NEXT: v_cndmask_b32_e32 v6, v0, v1, vcc +; GFX7-NEXT: v_and_b32_e32 v2, v6, v2 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v5 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v4 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX7-NEXT: v_bfe_u32 v5, v0, 8, 8 +; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8 ; GFX7-NEXT: v_bfe_u32 v7, v1, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v1 -; GFX7-NEXT: v_and_b32_e32 v4, s3, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v1 +; GFX7-NEXT: v_and_b32_e32 v5, v0, v3 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_and_b32_e32 v6, s3, v1 +; GFX7-NEXT: v_and_b32_e32 v3, v1, v3 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v7, 8, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 -; GFX7-NEXT: v_or_b32_e32 v5, v6, v7 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX7-NEXT: v_or_b32_e32 v1, v5, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v5, v0 +; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 ; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; @@ -3242,7 +3198,6 @@ ; GFX10-NEXT: s_mov_b32 s0, 8 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v2 ; GFX10-NEXT: s_mov_b32 s1, 16 -; GFX10-NEXT: s_movk_i32 s3, 0xff ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 2, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 @@ -3253,12 +3208,12 @@ ; GFX10-NEXT: v_lshlrev_b32_sdwa v7, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v8, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v9, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v0, v0, s3, v5 +; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v5 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX10-NEXT: v_and_or_b32 v1, v1, s3, v7 +; GFX10-NEXT: v_and_or_b32 v1, v1, 0xff, v7 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v6 -; GFX10-NEXT: v_lshlrev_b32_e64 v6, v3, s3 -; GFX10-NEXT: s_and_b32 s0, s2, s3 +; GFX10-NEXT: v_lshlrev_b32_e64 v6, v3, 0xff +; GFX10-NEXT: s_and_b32 s0, s2, 0xff ; GFX10-NEXT: v_or3_b32 v0, v0, v8, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v3, v3, s0 ; GFX10-NEXT: v_or3_b32 v1, v1, v9, v5 @@ -3276,9 +3231,9 @@ ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v7, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v5, v0, s3, v5 +; GFX10-NEXT: v_and_or_b32 v5, 0xff, v0, v5 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX10-NEXT: v_and_or_b32 v8, v1, s3, v2 +; GFX10-NEXT: v_and_or_b32 v8, 0xff, v1, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -3303,60 +3258,60 @@ ; GFX9-NEXT: s_and_b32 s2, s2, 3 ; GFX9-NEXT: s_lshl_b32 s2, s2, 3 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: s_lshl_b32 s2, s3, s2 +; GFX9-NEXT: s_lshl_b32 s2, 0xff, s2 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 ; GFX9-NEXT: s_not_b32 s2, s2 -; GFX9-NEXT: v_mov_b32_e32 v5, 8 -; GFX9-NEXT: v_mov_b32_e32 v6, 16 +; GFX9-NEXT: v_mov_b32_e32 v6, 8 +; GFX9-NEXT: v_mov_b32_e32 v5, 0xff +; GFX9-NEXT: v_mov_b32_e32 v7, 16 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 24, v1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v9, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v11, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v10, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v12, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v0, v0, s3, v9 -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; GFX9-NEXT: v_and_or_b32 v1, v1, s3, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 24, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v10, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v12, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v11, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v13, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v0, v0, s3, v10 ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 24, v8 -; GFX9-NEXT: v_or3_b32 v0, v0, v10, v7 -; GFX9-NEXT: v_or3_b32 v1, v1, v12, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v0, v1, vcc -; GFX9-NEXT: v_and_or_b32 v2, v7, s2, v2 +; GFX9-NEXT: v_and_or_b32 v1, v1, s3, v12 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; GFX9-NEXT: v_or3_b32 v0, v0, v11, v8 +; GFX9-NEXT: v_or3_b32 v1, v1, v13, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v0, v1, vcc +; GFX9-NEXT: v_and_or_b32 v2, v8, s2, v2 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v8, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v9, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v6, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v0, v0, s3, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 24, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v9, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v6, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v10, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v7, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v0, v0, v5, v9 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX9-NEXT: v_and_or_b32 v1, v1, s3, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v7 -; GFX9-NEXT: v_or3_b32 v0, v0, v9, v2 -; GFX9-NEXT: v_or3_b32 v1, v1, v6, v5 +; GFX9-NEXT: v_and_or_b32 v1, v1, v5, v6 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v8 +; GFX9-NEXT: v_or3_b32 v0, v0, v10, v2 +; GFX9-NEXT: v_or3_b32 v1, v1, v7, v5 ; GFX9-NEXT: global_store_dwordx2 v[3:4], v[0:1], off ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_v_v8i8_v_s: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX8-NEXT: s_lshr_b32 s1, s2, 2 -; GFX8-NEXT: s_and_b32 s2, s2, 3 +; GFX8-NEXT: s_and_b32 s1, s2, 3 ; GFX8-NEXT: v_mov_b32_e32 v5, 8 -; GFX8-NEXT: s_lshl_b32 s2, s2, 3 +; GFX8-NEXT: s_lshl_b32 s1, s1, 3 ; GFX8-NEXT: v_mov_b32_e32 v6, 16 -; GFX8-NEXT: v_mov_b32_e32 v9, s2 +; GFX8-NEXT: v_mov_b32_e32 v9, s1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: s_movk_i32 s0, 0xff -; GFX8-NEXT: s_lshl_b32 s0, s0, s2 -; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s1, 1 -; GFX8-NEXT: s_not_b32 s0, s0 +; GFX8-NEXT: s_lshr_b32 s0, s2, 2 +; GFX8-NEXT: s_lshl_b32 s1, 0xff, s1 +; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 +; GFX8-NEXT: s_not_b32 s1, s1 ; GFX8-NEXT: v_mov_b32_e32 v7, 8 ; GFX8-NEXT: v_mov_b32_e32 v8, 16 ; GFX8-NEXT: v_mov_b32_e32 v3, 0 @@ -3377,9 +3332,9 @@ ; GFX8-NEXT: v_or_b32_e32 v0, v0, v9 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v5 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v5, s0, v5 +; GFX8-NEXT: v_and_b32_e32 v5, s1, v5 ; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, 0 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 @@ -3405,63 +3360,64 @@ ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_movk_i32 s3, 0xff -; GFX7-NEXT: s_and_b32 s1, s2, 3 -; GFX7-NEXT: s_lshr_b32 s0, s2, 2 -; GFX7-NEXT: v_and_b32_e32 v2, s3, v2 -; GFX7-NEXT: s_lshl_b32 s1, s1, 3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, s1, v2 -; GFX7-NEXT: s_lshl_b32 s1, s3, s1 -; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 -; GFX7-NEXT: s_not_b32 s1, s1 +; GFX7-NEXT: s_movk_i32 s0, 0xff +; GFX7-NEXT: v_mov_b32_e32 v3, 0xff +; GFX7-NEXT: s_lshr_b32 s1, s2, 2 +; GFX7-NEXT: s_and_b32 s2, s2, 3 +; GFX7-NEXT: v_and_b32_e32 v2, v2, v3 +; GFX7-NEXT: s_lshl_b32 s2, s2, 3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, s2, v2 +; GFX7-NEXT: s_lshl_b32 s2, 0xff, s2 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s1, 1 +; GFX7-NEXT: s_not_b32 s2, s2 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8 -; GFX7-NEXT: v_bfe_u32 v8, v1, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v1 -; GFX7-NEXT: v_and_b32_e32 v5, s3, v0 +; GFX7-NEXT: v_bfe_u32 v7, v0, 8, 8 +; GFX7-NEXT: v_bfe_u32 v9, v1, 8, 8 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v1 +; GFX7-NEXT: v_and_b32_e32 v6, s0, v0 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_and_b32_e32 v7, s3, v1 +; GFX7-NEXT: v_and_b32_e32 v8, s0, v1 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_or_b32_e32 v5, v5, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v7, v8 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX7-NEXT: v_or_b32_e32 v6, v6, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v8, v9 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX7-NEXT: v_or_b32_e32 v0, v5, v0 -; GFX7-NEXT: v_or_b32_e32 v1, v6, v1 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v3, s1, v3 -; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX7-NEXT: v_or_b32_e32 v0, v6, v0 +; GFX7-NEXT: v_or_b32_e32 v1, v7, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v5 +; GFX7-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc +; GFX7-NEXT: v_and_b32_e32 v4, s2, v4 +; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, 0 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX7-NEXT: v_bfe_u32 v5, v0, 8, 8 +; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8 ; GFX7-NEXT: v_bfe_u32 v7, v1, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v1 -; GFX7-NEXT: v_and_b32_e32 v4, s3, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v1 +; GFX7-NEXT: v_and_b32_e32 v5, v0, v3 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_and_b32_e32 v6, s3, v1 +; GFX7-NEXT: v_and_b32_e32 v3, v1, v3 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v7, 8, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 -; GFX7-NEXT: v_or_b32_e32 v5, v6, v7 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX7-NEXT: v_or_b32_e32 v1, v5, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v5, v0 +; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 ; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; @@ -3470,7 +3426,6 @@ ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX10-NEXT: s_mov_b32 s0, 8 ; GFX10-NEXT: s_mov_b32 s1, 16 -; GFX10-NEXT: s_movk_i32 s3, 0xff ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX10-NEXT: v_lshlrev_b32_sdwa v4, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 @@ -3478,9 +3433,9 @@ ; GFX10-NEXT: v_lshlrev_b32_sdwa v6, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v7, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v8, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v0, v0, s3, v4 +; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v4 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GFX10-NEXT: v_and_or_b32 v1, v1, s3, v6 +; GFX10-NEXT: v_and_or_b32 v1, v1, 0xff, v6 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v5 ; GFX10-NEXT: s_lshr_b32 s1, s2, 2 ; GFX10-NEXT: s_and_b32 s0, s2, 3 @@ -3489,7 +3444,7 @@ ; GFX10-NEXT: v_or3_b32 v1, v1, v8, v4 ; GFX10-NEXT: s_lshl_b32 s0, s0, 3 ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: s_lshl_b32 s0, s3, s0 +; GFX10-NEXT: s_lshl_b32 s0, 0xff, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc_lo ; GFX10-NEXT: s_not_b32 s0, s0 ; GFX10-NEXT: v_and_or_b32 v2, v3, s0, v2 @@ -3505,8 +3460,8 @@ ; GFX10-NEXT: v_lshlrev_b32_sdwa v7, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v8, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX10-NEXT: v_and_or_b32 v2, v0, s3, v5 -; GFX10-NEXT: v_and_or_b32 v3, v1, s3, v3 +; GFX10-NEXT: v_and_or_b32 v2, 0xff, v0, v5 +; GFX10-NEXT: v_and_or_b32 v3, 0xff, v1, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v6 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -3700,48 +3655,46 @@ ; GFX10-NEXT: s_mov_b32 s0, 8 ; GFX10-NEXT: v_and_b32_e32 v4, 3, v3 ; GFX10-NEXT: s_mov_b32 s1, 16 -; GFX10-NEXT: s_movk_i32 s2, 0xff -; GFX10-NEXT: v_mov_b32_e32 v5, 0xff ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 2, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v0 -; GFX10-NEXT: v_lshlrev_b32_sdwa v7, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshrrev_b32_e32 v8, 24, v1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v9, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v10, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v11, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v0, v0, s2, v7 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6 -; GFX10-NEXT: v_and_or_b32 v1, v1, s2, v9 -; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v8 -; GFX10-NEXT: v_lshlrev_b32_e32 v8, v4, v5 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v6, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v8, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v9, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v10, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX10-NEXT: v_and_or_b32 v1, v1, 0xff, v8 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v7 +; GFX10-NEXT: v_lshlrev_b32_e64 v7, v4, 0xff ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v3 -; GFX10-NEXT: v_or3_b32 v0, v0, v10, v6 +; GFX10-NEXT: v_or3_b32 v0, v0, v9, v5 ; GFX10-NEXT: v_mov_b32_e32 v3, 8 -; GFX10-NEXT: v_or3_b32 v1, v1, v11, v7 -; GFX10-NEXT: v_xor_b32_e32 v4, -1, v8 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v0, v1, vcc_lo -; GFX10-NEXT: v_and_or_b32 v2, v6, v4, v2 +; GFX10-NEXT: v_or3_b32 v1, v1, v10, v6 +; GFX10-NEXT: v_xor_b32_e32 v4, -1, v7 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc_lo +; GFX10-NEXT: v_and_or_b32 v2, v5, v4, v2 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v2, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v2, 16 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX10-NEXT: v_lshlrev_b32_sdwa v6, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v8, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v9, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v2, v0, v5, v6 +; GFX10-NEXT: v_lshlrev_b32_sdwa v7, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v8, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_and_or_b32 v2, 0xff, v0, v5 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX10-NEXT: v_and_or_b32 v3, v1, v5, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v7 +; GFX10-NEXT: v_and_or_b32 v3, 0xff, v1, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v6 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_or3_b32 v2, v2, v8, v4 -; GFX10-NEXT: v_or3_b32 v3, v3, v9, v5 +; GFX10-NEXT: v_or3_b32 v2, v2, v7, v4 +; GFX10-NEXT: v_or3_b32 v3, v3, v8, v5 ; GFX10-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GFX10-NEXT: s_endpgm %vec = load <8 x i8>, <8 x i8> addrspace(1)* %ptr @@ -3754,47 +3707,44 @@ ; GFX9-LABEL: insertelement_s_v16i8_s_s: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GFX9-NEXT: s_mov_b32 s12, 0x80008 -; GFX9-NEXT: s_movk_i32 s10, 0xff ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_u32 s13, s0, s12 -; GFX9-NEXT: s_and_b32 s11, s0, s10 -; GFX9-NEXT: s_lshl_b32 s13, s13, 8 -; GFX9-NEXT: s_or_b32 s11, s11, s13 -; GFX9-NEXT: s_mov_b32 s13, 0x80010 +; GFX9-NEXT: s_bfe_u32 s11, s0, 0x80008 ; GFX9-NEXT: s_lshr_b32 s6, s0, 24 -; GFX9-NEXT: s_bfe_u32 s0, s0, s13 +; GFX9-NEXT: s_and_b32 s10, s0, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s11, 8 +; GFX9-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX9-NEXT: s_or_b32 s10, s10, s11 ; GFX9-NEXT: s_lshl_b32 s0, s0, 16 -; GFX9-NEXT: s_or_b32 s0, s11, s0 +; GFX9-NEXT: s_or_b32 s0, s10, s0 ; GFX9-NEXT: s_lshl_b32 s6, s6, 24 -; GFX9-NEXT: s_bfe_u32 s11, s1, s12 +; GFX9-NEXT: s_bfe_u32 s10, s1, 0x80008 ; GFX9-NEXT: s_lshr_b32 s7, s1, 24 ; GFX9-NEXT: s_or_b32 s0, s0, s6 -; GFX9-NEXT: s_and_b32 s6, s1, s10 -; GFX9-NEXT: s_lshl_b32 s11, s11, 8 -; GFX9-NEXT: s_bfe_u32 s1, s1, s13 -; GFX9-NEXT: s_or_b32 s6, s6, s11 +; GFX9-NEXT: s_and_b32 s6, s1, 0xff +; GFX9-NEXT: s_lshl_b32 s10, s10, 8 +; GFX9-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX9-NEXT: s_or_b32 s6, s6, s10 ; GFX9-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9-NEXT: s_or_b32 s1, s6, s1 ; GFX9-NEXT: s_lshl_b32 s6, s7, 24 -; GFX9-NEXT: s_bfe_u32 s7, s2, s12 +; GFX9-NEXT: s_bfe_u32 s7, s2, 0x80008 ; GFX9-NEXT: s_lshr_b32 s8, s2, 24 ; GFX9-NEXT: s_or_b32 s1, s1, s6 -; GFX9-NEXT: s_and_b32 s6, s2, s10 +; GFX9-NEXT: s_and_b32 s6, s2, 0xff ; GFX9-NEXT: s_lshl_b32 s7, s7, 8 -; GFX9-NEXT: s_bfe_u32 s2, s2, s13 +; GFX9-NEXT: s_bfe_u32 s2, s2, 0x80010 ; GFX9-NEXT: s_or_b32 s6, s6, s7 ; GFX9-NEXT: s_lshl_b32 s2, s2, 16 ; GFX9-NEXT: s_or_b32 s2, s6, s2 ; GFX9-NEXT: s_lshl_b32 s6, s8, 24 -; GFX9-NEXT: s_bfe_u32 s7, s3, s12 +; GFX9-NEXT: s_bfe_u32 s7, s3, 0x80008 ; GFX9-NEXT: s_lshr_b32 s9, s3, 24 ; GFX9-NEXT: s_or_b32 s2, s2, s6 -; GFX9-NEXT: s_and_b32 s6, s3, s10 +; GFX9-NEXT: s_and_b32 s6, s3, 0xff ; GFX9-NEXT: s_lshl_b32 s7, s7, 8 -; GFX9-NEXT: s_bfe_u32 s3, s3, s13 +; GFX9-NEXT: s_bfe_u32 s3, s3, 0x80010 ; GFX9-NEXT: s_or_b32 s6, s6, s7 ; GFX9-NEXT: s_lshl_b32 s3, s3, 16 ; GFX9-NEXT: s_or_b32 s3, s6, s3 @@ -3809,9 +3759,9 @@ ; GFX9-NEXT: s_cselect_b32 s7, s3, s7 ; GFX9-NEXT: s_and_b32 s5, s5, 3 ; GFX9-NEXT: s_lshl_b32 s5, s5, 3 -; GFX9-NEXT: s_and_b32 s4, s4, s10 +; GFX9-NEXT: s_and_b32 s4, s4, 0xff ; GFX9-NEXT: s_lshl_b32 s4, s4, s5 -; GFX9-NEXT: s_lshl_b32 s5, s10, s5 +; GFX9-NEXT: s_lshl_b32 s5, 0xff, s5 ; GFX9-NEXT: s_andn2_b32 s5, s7, s5 ; GFX9-NEXT: s_or_b32 s4, s5, s4 ; GFX9-NEXT: s_cmp_eq_u32 s6, 0 @@ -3822,41 +3772,41 @@ ; GFX9-NEXT: s_cselect_b32 s2, s4, s2 ; GFX9-NEXT: s_cmp_eq_u32 s6, 3 ; GFX9-NEXT: s_cselect_b32 s3, s4, s3 -; GFX9-NEXT: s_bfe_u32 s9, s0, s12 +; GFX9-NEXT: s_bfe_u32 s9, s0, 0x80008 ; GFX9-NEXT: s_lshr_b32 s4, s0, 24 -; GFX9-NEXT: s_and_b32 s8, s0, s10 +; GFX9-NEXT: s_and_b32 s8, s0, 0xff ; GFX9-NEXT: s_lshl_b32 s9, s9, 8 -; GFX9-NEXT: s_bfe_u32 s0, s0, s13 +; GFX9-NEXT: s_bfe_u32 s0, s0, 0x80010 ; GFX9-NEXT: s_or_b32 s8, s8, s9 ; GFX9-NEXT: s_lshl_b32 s0, s0, 16 ; GFX9-NEXT: s_or_b32 s0, s8, s0 ; GFX9-NEXT: s_lshl_b32 s4, s4, 24 -; GFX9-NEXT: s_bfe_u32 s8, s1, s12 +; GFX9-NEXT: s_bfe_u32 s8, s1, 0x80008 ; GFX9-NEXT: s_lshr_b32 s5, s1, 24 ; GFX9-NEXT: s_or_b32 s0, s0, s4 -; GFX9-NEXT: s_and_b32 s4, s1, s10 +; GFX9-NEXT: s_and_b32 s4, s1, 0xff ; GFX9-NEXT: s_lshl_b32 s8, s8, 8 -; GFX9-NEXT: s_bfe_u32 s1, s1, s13 +; GFX9-NEXT: s_bfe_u32 s1, s1, 0x80010 ; GFX9-NEXT: s_or_b32 s4, s4, s8 ; GFX9-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9-NEXT: s_or_b32 s1, s4, s1 ; GFX9-NEXT: s_lshl_b32 s4, s5, 24 -; GFX9-NEXT: s_bfe_u32 s5, s2, s12 +; GFX9-NEXT: s_bfe_u32 s5, s2, 0x80008 ; GFX9-NEXT: s_lshr_b32 s6, s2, 24 ; GFX9-NEXT: s_or_b32 s1, s1, s4 -; GFX9-NEXT: s_and_b32 s4, s2, s10 +; GFX9-NEXT: s_and_b32 s4, s2, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s5, 8 -; GFX9-NEXT: s_bfe_u32 s2, s2, s13 +; GFX9-NEXT: s_bfe_u32 s2, s2, 0x80010 ; GFX9-NEXT: s_or_b32 s4, s4, s5 ; GFX9-NEXT: s_lshl_b32 s2, s2, 16 ; GFX9-NEXT: s_or_b32 s2, s4, s2 ; GFX9-NEXT: s_lshl_b32 s4, s6, 24 -; GFX9-NEXT: s_bfe_u32 s5, s3, s12 +; GFX9-NEXT: s_bfe_u32 s5, s3, 0x80008 ; GFX9-NEXT: s_lshr_b32 s7, s3, 24 ; GFX9-NEXT: s_or_b32 s2, s2, s4 -; GFX9-NEXT: s_and_b32 s4, s3, s10 +; GFX9-NEXT: s_and_b32 s4, s3, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s5, 8 -; GFX9-NEXT: s_bfe_u32 s3, s3, s13 +; GFX9-NEXT: s_bfe_u32 s3, s3, 0x80010 ; GFX9-NEXT: s_or_b32 s4, s4, s5 ; GFX9-NEXT: s_lshl_b32 s3, s3, 16 ; GFX9-NEXT: s_or_b32 s3, s4, s3 @@ -3872,47 +3822,44 @@ ; GFX8-LABEL: insertelement_s_v16i8_s_s: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GFX8-NEXT: s_mov_b32 s12, 0x80008 -; GFX8-NEXT: s_movk_i32 s10, 0xff ; GFX8-NEXT: v_mov_b32_e32 v4, 0 ; GFX8-NEXT: v_mov_b32_e32 v5, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_u32 s13, s0, s12 -; GFX8-NEXT: s_and_b32 s11, s0, s10 -; GFX8-NEXT: s_lshl_b32 s13, s13, 8 -; GFX8-NEXT: s_or_b32 s11, s11, s13 -; GFX8-NEXT: s_mov_b32 s13, 0x80010 +; GFX8-NEXT: s_bfe_u32 s11, s0, 0x80008 ; GFX8-NEXT: s_lshr_b32 s6, s0, 24 -; GFX8-NEXT: s_bfe_u32 s0, s0, s13 +; GFX8-NEXT: s_and_b32 s10, s0, 0xff +; GFX8-NEXT: s_lshl_b32 s11, s11, 8 +; GFX8-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX8-NEXT: s_or_b32 s10, s10, s11 ; GFX8-NEXT: s_lshl_b32 s0, s0, 16 -; GFX8-NEXT: s_or_b32 s0, s11, s0 +; GFX8-NEXT: s_or_b32 s0, s10, s0 ; GFX8-NEXT: s_lshl_b32 s6, s6, 24 -; GFX8-NEXT: s_bfe_u32 s11, s1, s12 +; GFX8-NEXT: s_bfe_u32 s10, s1, 0x80008 ; GFX8-NEXT: s_lshr_b32 s7, s1, 24 ; GFX8-NEXT: s_or_b32 s0, s0, s6 -; GFX8-NEXT: s_and_b32 s6, s1, s10 -; GFX8-NEXT: s_lshl_b32 s11, s11, 8 -; GFX8-NEXT: s_bfe_u32 s1, s1, s13 -; GFX8-NEXT: s_or_b32 s6, s6, s11 +; GFX8-NEXT: s_and_b32 s6, s1, 0xff +; GFX8-NEXT: s_lshl_b32 s10, s10, 8 +; GFX8-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX8-NEXT: s_or_b32 s6, s6, s10 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s1, s6, s1 ; GFX8-NEXT: s_lshl_b32 s6, s7, 24 -; GFX8-NEXT: s_bfe_u32 s7, s2, s12 +; GFX8-NEXT: s_bfe_u32 s7, s2, 0x80008 ; GFX8-NEXT: s_lshr_b32 s8, s2, 24 ; GFX8-NEXT: s_or_b32 s1, s1, s6 -; GFX8-NEXT: s_and_b32 s6, s2, s10 +; GFX8-NEXT: s_and_b32 s6, s2, 0xff ; GFX8-NEXT: s_lshl_b32 s7, s7, 8 -; GFX8-NEXT: s_bfe_u32 s2, s2, s13 +; GFX8-NEXT: s_bfe_u32 s2, s2, 0x80010 ; GFX8-NEXT: s_or_b32 s6, s6, s7 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16 ; GFX8-NEXT: s_or_b32 s2, s6, s2 ; GFX8-NEXT: s_lshl_b32 s6, s8, 24 -; GFX8-NEXT: s_bfe_u32 s7, s3, s12 +; GFX8-NEXT: s_bfe_u32 s7, s3, 0x80008 ; GFX8-NEXT: s_lshr_b32 s9, s3, 24 ; GFX8-NEXT: s_or_b32 s2, s2, s6 -; GFX8-NEXT: s_and_b32 s6, s3, s10 +; GFX8-NEXT: s_and_b32 s6, s3, 0xff ; GFX8-NEXT: s_lshl_b32 s7, s7, 8 -; GFX8-NEXT: s_bfe_u32 s3, s3, s13 +; GFX8-NEXT: s_bfe_u32 s3, s3, 0x80010 ; GFX8-NEXT: s_or_b32 s6, s6, s7 ; GFX8-NEXT: s_lshl_b32 s3, s3, 16 ; GFX8-NEXT: s_or_b32 s3, s6, s3 @@ -3927,9 +3874,9 @@ ; GFX8-NEXT: s_cselect_b32 s7, s3, s7 ; GFX8-NEXT: s_and_b32 s5, s5, 3 ; GFX8-NEXT: s_lshl_b32 s5, s5, 3 -; GFX8-NEXT: s_and_b32 s4, s4, s10 +; GFX8-NEXT: s_and_b32 s4, s4, 0xff ; GFX8-NEXT: s_lshl_b32 s4, s4, s5 -; GFX8-NEXT: s_lshl_b32 s5, s10, s5 +; GFX8-NEXT: s_lshl_b32 s5, 0xff, s5 ; GFX8-NEXT: s_andn2_b32 s5, s7, s5 ; GFX8-NEXT: s_or_b32 s4, s5, s4 ; GFX8-NEXT: s_cmp_eq_u32 s6, 0 @@ -3940,41 +3887,41 @@ ; GFX8-NEXT: s_cselect_b32 s2, s4, s2 ; GFX8-NEXT: s_cmp_eq_u32 s6, 3 ; GFX8-NEXT: s_cselect_b32 s3, s4, s3 -; GFX8-NEXT: s_bfe_u32 s9, s0, s12 +; GFX8-NEXT: s_bfe_u32 s9, s0, 0x80008 ; GFX8-NEXT: s_lshr_b32 s4, s0, 24 -; GFX8-NEXT: s_and_b32 s8, s0, s10 +; GFX8-NEXT: s_and_b32 s8, s0, 0xff ; GFX8-NEXT: s_lshl_b32 s9, s9, 8 -; GFX8-NEXT: s_bfe_u32 s0, s0, s13 +; GFX8-NEXT: s_bfe_u32 s0, s0, 0x80010 ; GFX8-NEXT: s_or_b32 s8, s8, s9 ; GFX8-NEXT: s_lshl_b32 s0, s0, 16 ; GFX8-NEXT: s_or_b32 s0, s8, s0 ; GFX8-NEXT: s_lshl_b32 s4, s4, 24 -; GFX8-NEXT: s_bfe_u32 s8, s1, s12 +; GFX8-NEXT: s_bfe_u32 s8, s1, 0x80008 ; GFX8-NEXT: s_lshr_b32 s5, s1, 24 ; GFX8-NEXT: s_or_b32 s0, s0, s4 -; GFX8-NEXT: s_and_b32 s4, s1, s10 +; GFX8-NEXT: s_and_b32 s4, s1, 0xff ; GFX8-NEXT: s_lshl_b32 s8, s8, 8 -; GFX8-NEXT: s_bfe_u32 s1, s1, s13 +; GFX8-NEXT: s_bfe_u32 s1, s1, 0x80010 ; GFX8-NEXT: s_or_b32 s4, s4, s8 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s1, s4, s1 ; GFX8-NEXT: s_lshl_b32 s4, s5, 24 -; GFX8-NEXT: s_bfe_u32 s5, s2, s12 +; GFX8-NEXT: s_bfe_u32 s5, s2, 0x80008 ; GFX8-NEXT: s_lshr_b32 s6, s2, 24 ; GFX8-NEXT: s_or_b32 s1, s1, s4 -; GFX8-NEXT: s_and_b32 s4, s2, s10 +; GFX8-NEXT: s_and_b32 s4, s2, 0xff ; GFX8-NEXT: s_lshl_b32 s5, s5, 8 -; GFX8-NEXT: s_bfe_u32 s2, s2, s13 +; GFX8-NEXT: s_bfe_u32 s2, s2, 0x80010 ; GFX8-NEXT: s_or_b32 s4, s4, s5 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16 ; GFX8-NEXT: s_or_b32 s2, s4, s2 ; GFX8-NEXT: s_lshl_b32 s4, s6, 24 -; GFX8-NEXT: s_bfe_u32 s5, s3, s12 +; GFX8-NEXT: s_bfe_u32 s5, s3, 0x80008 ; GFX8-NEXT: s_lshr_b32 s7, s3, 24 ; GFX8-NEXT: s_or_b32 s2, s2, s4 -; GFX8-NEXT: s_and_b32 s4, s3, s10 +; GFX8-NEXT: s_and_b32 s4, s3, 0xff ; GFX8-NEXT: s_lshl_b32 s5, s5, 8 -; GFX8-NEXT: s_bfe_u32 s3, s3, s13 +; GFX8-NEXT: s_bfe_u32 s3, s3, 0x80010 ; GFX8-NEXT: s_or_b32 s4, s4, s5 ; GFX8-NEXT: s_lshl_b32 s3, s3, 16 ; GFX8-NEXT: s_or_b32 s3, s4, s3 @@ -3990,45 +3937,42 @@ ; GFX7-LABEL: insertelement_s_v16i8_s_s: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GFX7-NEXT: s_mov_b32 s12, 0x80008 -; GFX7-NEXT: s_movk_i32 s10, 0xff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_bfe_u32 s13, s0, s12 -; GFX7-NEXT: s_and_b32 s11, s0, s10 -; GFX7-NEXT: s_lshl_b32 s13, s13, 8 -; GFX7-NEXT: s_or_b32 s11, s11, s13 -; GFX7-NEXT: s_mov_b32 s13, 0x80010 +; GFX7-NEXT: s_bfe_u32 s11, s0, 0x80008 ; GFX7-NEXT: s_lshr_b32 s6, s0, 24 -; GFX7-NEXT: s_bfe_u32 s0, s0, s13 +; GFX7-NEXT: s_and_b32 s10, s0, 0xff +; GFX7-NEXT: s_lshl_b32 s11, s11, 8 +; GFX7-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX7-NEXT: s_or_b32 s10, s10, s11 ; GFX7-NEXT: s_lshl_b32 s0, s0, 16 -; GFX7-NEXT: s_or_b32 s0, s11, s0 +; GFX7-NEXT: s_or_b32 s0, s10, s0 ; GFX7-NEXT: s_lshl_b32 s6, s6, 24 -; GFX7-NEXT: s_bfe_u32 s11, s1, s12 +; GFX7-NEXT: s_bfe_u32 s10, s1, 0x80008 ; GFX7-NEXT: s_lshr_b32 s7, s1, 24 ; GFX7-NEXT: s_or_b32 s0, s0, s6 -; GFX7-NEXT: s_and_b32 s6, s1, s10 -; GFX7-NEXT: s_lshl_b32 s11, s11, 8 -; GFX7-NEXT: s_bfe_u32 s1, s1, s13 -; GFX7-NEXT: s_or_b32 s6, s6, s11 +; GFX7-NEXT: s_and_b32 s6, s1, 0xff +; GFX7-NEXT: s_lshl_b32 s10, s10, 8 +; GFX7-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX7-NEXT: s_or_b32 s6, s6, s10 ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 ; GFX7-NEXT: s_or_b32 s1, s6, s1 ; GFX7-NEXT: s_lshl_b32 s6, s7, 24 -; GFX7-NEXT: s_bfe_u32 s7, s2, s12 +; GFX7-NEXT: s_bfe_u32 s7, s2, 0x80008 ; GFX7-NEXT: s_lshr_b32 s8, s2, 24 ; GFX7-NEXT: s_or_b32 s1, s1, s6 -; GFX7-NEXT: s_and_b32 s6, s2, s10 +; GFX7-NEXT: s_and_b32 s6, s2, 0xff ; GFX7-NEXT: s_lshl_b32 s7, s7, 8 -; GFX7-NEXT: s_bfe_u32 s2, s2, s13 +; GFX7-NEXT: s_bfe_u32 s2, s2, 0x80010 ; GFX7-NEXT: s_or_b32 s6, s6, s7 ; GFX7-NEXT: s_lshl_b32 s2, s2, 16 ; GFX7-NEXT: s_or_b32 s2, s6, s2 ; GFX7-NEXT: s_lshl_b32 s6, s8, 24 -; GFX7-NEXT: s_bfe_u32 s7, s3, s12 +; GFX7-NEXT: s_bfe_u32 s7, s3, 0x80008 ; GFX7-NEXT: s_lshr_b32 s9, s3, 24 ; GFX7-NEXT: s_or_b32 s2, s2, s6 -; GFX7-NEXT: s_and_b32 s6, s3, s10 +; GFX7-NEXT: s_and_b32 s6, s3, 0xff ; GFX7-NEXT: s_lshl_b32 s7, s7, 8 -; GFX7-NEXT: s_bfe_u32 s3, s3, s13 +; GFX7-NEXT: s_bfe_u32 s3, s3, 0x80010 ; GFX7-NEXT: s_or_b32 s6, s6, s7 ; GFX7-NEXT: s_lshl_b32 s3, s3, 16 ; GFX7-NEXT: s_or_b32 s3, s6, s3 @@ -4043,9 +3987,9 @@ ; GFX7-NEXT: s_cselect_b32 s7, s3, s7 ; GFX7-NEXT: s_and_b32 s5, s5, 3 ; GFX7-NEXT: s_lshl_b32 s5, s5, 3 -; GFX7-NEXT: s_and_b32 s4, s4, s10 +; GFX7-NEXT: s_and_b32 s4, s4, 0xff ; GFX7-NEXT: s_lshl_b32 s4, s4, s5 -; GFX7-NEXT: s_lshl_b32 s5, s10, s5 +; GFX7-NEXT: s_lshl_b32 s5, 0xff, s5 ; GFX7-NEXT: s_andn2_b32 s5, s7, s5 ; GFX7-NEXT: s_or_b32 s4, s5, s4 ; GFX7-NEXT: s_cmp_eq_u32 s6, 0 @@ -4056,41 +4000,41 @@ ; GFX7-NEXT: s_cselect_b32 s2, s4, s2 ; GFX7-NEXT: s_cmp_eq_u32 s6, 3 ; GFX7-NEXT: s_cselect_b32 s3, s4, s3 -; GFX7-NEXT: s_bfe_u32 s14, s5, s12 +; GFX7-NEXT: s_bfe_u32 s11, s5, 0x80008 ; GFX7-NEXT: s_lshr_b32 s4, s5, 24 -; GFX7-NEXT: s_and_b32 s11, s5, s10 -; GFX7-NEXT: s_lshl_b32 s14, s14, 8 -; GFX7-NEXT: s_bfe_u32 s5, s5, s13 -; GFX7-NEXT: s_or_b32 s11, s11, s14 +; GFX7-NEXT: s_and_b32 s10, s5, 0xff +; GFX7-NEXT: s_lshl_b32 s11, s11, 8 +; GFX7-NEXT: s_bfe_u32 s5, s5, 0x80010 +; GFX7-NEXT: s_or_b32 s10, s10, s11 ; GFX7-NEXT: s_lshl_b32 s5, s5, 16 -; GFX7-NEXT: s_or_b32 s5, s11, s5 +; GFX7-NEXT: s_or_b32 s5, s10, s5 ; GFX7-NEXT: s_lshl_b32 s4, s4, 24 -; GFX7-NEXT: s_bfe_u32 s11, s7, s12 +; GFX7-NEXT: s_bfe_u32 s10, s7, 0x80008 ; GFX7-NEXT: s_lshr_b32 s6, s7, 24 ; GFX7-NEXT: s_or_b32 s4, s5, s4 -; GFX7-NEXT: s_and_b32 s5, s7, s10 -; GFX7-NEXT: s_lshl_b32 s11, s11, 8 -; GFX7-NEXT: s_bfe_u32 s7, s7, s13 -; GFX7-NEXT: s_or_b32 s5, s5, s11 +; GFX7-NEXT: s_and_b32 s5, s7, 0xff +; GFX7-NEXT: s_lshl_b32 s10, s10, 8 +; GFX7-NEXT: s_bfe_u32 s7, s7, 0x80010 +; GFX7-NEXT: s_or_b32 s5, s5, s10 ; GFX7-NEXT: s_lshl_b32 s7, s7, 16 ; GFX7-NEXT: s_or_b32 s5, s5, s7 ; GFX7-NEXT: s_lshl_b32 s6, s6, 24 -; GFX7-NEXT: s_bfe_u32 s7, s2, s12 +; GFX7-NEXT: s_bfe_u32 s7, s2, 0x80008 ; GFX7-NEXT: s_lshr_b32 s8, s2, 24 ; GFX7-NEXT: s_or_b32 s5, s5, s6 -; GFX7-NEXT: s_and_b32 s6, s2, s10 +; GFX7-NEXT: s_and_b32 s6, s2, 0xff ; GFX7-NEXT: s_lshl_b32 s7, s7, 8 -; GFX7-NEXT: s_bfe_u32 s2, s2, s13 +; GFX7-NEXT: s_bfe_u32 s2, s2, 0x80010 ; GFX7-NEXT: s_or_b32 s6, s6, s7 ; GFX7-NEXT: s_lshl_b32 s2, s2, 16 ; GFX7-NEXT: s_or_b32 s2, s6, s2 ; GFX7-NEXT: s_lshl_b32 s6, s8, 24 -; GFX7-NEXT: s_bfe_u32 s7, s3, s12 +; GFX7-NEXT: s_bfe_u32 s7, s3, 0x80008 ; GFX7-NEXT: s_lshr_b32 s9, s3, 24 ; GFX7-NEXT: s_or_b32 s6, s2, s6 -; GFX7-NEXT: s_and_b32 s2, s3, s10 +; GFX7-NEXT: s_and_b32 s2, s3, 0xff ; GFX7-NEXT: s_lshl_b32 s7, s7, 8 -; GFX7-NEXT: s_bfe_u32 s3, s3, s13 +; GFX7-NEXT: s_bfe_u32 s3, s3, 0x80010 ; GFX7-NEXT: s_or_b32 s2, s2, s7 ; GFX7-NEXT: s_lshl_b32 s3, s3, 16 ; GFX7-NEXT: s_or_b32 s2, s2, s3 @@ -4109,112 +4053,109 @@ ; GFX10-LABEL: insertelement_s_v16i8_s_s: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GFX10-NEXT: s_mov_b32 s7, 0x80008 -; GFX10-NEXT: s_movk_i32 s6, 0xff -; GFX10-NEXT: s_mov_b32 s8, 0x80010 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_mov_b32_e32 v5, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_bfe_u32 s14, s0, s7 -; GFX10-NEXT: s_lshr_b32 s9, s0, 24 -; GFX10-NEXT: s_and_b32 s13, s0, s6 -; GFX10-NEXT: s_bfe_u32 s0, s0, s8 -; GFX10-NEXT: s_bfe_u32 s16, s1, s7 -; GFX10-NEXT: s_lshl_b32 s14, s14, 8 -; GFX10-NEXT: s_lshr_b32 s10, s1, 24 -; GFX10-NEXT: s_and_b32 s15, s1, s6 -; GFX10-NEXT: s_bfe_u32 s1, s1, s8 -; GFX10-NEXT: s_lshl_b32 s0, s0, 16 -; GFX10-NEXT: s_lshl_b32 s16, s16, 8 -; GFX10-NEXT: s_or_b32 s13, s13, s14 -; GFX10-NEXT: s_bfe_u32 s18, s2, s7 -; GFX10-NEXT: s_lshl_b32 s9, s9, 24 +; GFX10-NEXT: s_bfe_u32 s11, s0, 0x80008 +; GFX10-NEXT: s_bfe_u32 s13, s1, 0x80008 +; GFX10-NEXT: s_lshr_b32 s7, s1, 24 +; GFX10-NEXT: s_and_b32 s10, s0, 0xff +; GFX10-NEXT: s_and_b32 s12, s1, 0xff +; GFX10-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX10-NEXT: s_lshl_b32 s11, s11, 8 +; GFX10-NEXT: s_lshl_b32 s13, s13, 8 +; GFX10-NEXT: s_lshr_b32 s6, s0, 24 +; GFX10-NEXT: s_bfe_u32 s0, s0, 0x80010 ; GFX10-NEXT: s_lshl_b32 s1, s1, 16 -; GFX10-NEXT: s_or_b32 s14, s15, s16 -; GFX10-NEXT: s_or_b32 s0, s13, s0 -; GFX10-NEXT: s_lshr_b32 s11, s2, 24 -; GFX10-NEXT: s_and_b32 s17, s2, s6 -; GFX10-NEXT: s_lshl_b32 s10, s10, 24 -; GFX10-NEXT: s_or_b32 s1, s14, s1 -; GFX10-NEXT: s_or_b32 s0, s0, s9 -; GFX10-NEXT: s_lshl_b32 s9, s18, 8 -; GFX10-NEXT: s_bfe_u32 s2, s2, s8 -; GFX10-NEXT: s_or_b32 s9, s17, s9 +; GFX10-NEXT: s_or_b32 s10, s10, s11 +; GFX10-NEXT: s_or_b32 s11, s12, s13 +; GFX10-NEXT: s_bfe_u32 s15, s2, 0x80008 +; GFX10-NEXT: s_lshl_b32 s0, s0, 16 +; GFX10-NEXT: s_lshl_b32 s7, s7, 24 +; GFX10-NEXT: s_or_b32 s1, s11, s1 +; GFX10-NEXT: s_lshr_b32 s8, s2, 24 +; GFX10-NEXT: s_and_b32 s14, s2, 0xff +; GFX10-NEXT: s_bfe_u32 s2, s2, 0x80010 +; GFX10-NEXT: s_lshl_b32 s6, s6, 24 +; GFX10-NEXT: s_lshl_b32 s15, s15, 8 +; GFX10-NEXT: s_or_b32 s0, s10, s0 +; GFX10-NEXT: s_or_b32 s1, s1, s7 +; GFX10-NEXT: s_bfe_u32 s7, s3, 0x80008 +; GFX10-NEXT: s_lshr_b32 s9, s3, 24 +; GFX10-NEXT: s_or_b32 s12, s14, s15 +; GFX10-NEXT: s_or_b32 s0, s0, s6 ; GFX10-NEXT: s_lshl_b32 s2, s2, 16 -; GFX10-NEXT: s_or_b32 s1, s1, s10 -; GFX10-NEXT: s_bfe_u32 s10, s3, s7 -; GFX10-NEXT: s_lshr_b32 s12, s3, 24 -; GFX10-NEXT: s_or_b32 s2, s9, s2 -; GFX10-NEXT: s_lshl_b32 s9, s11, 24 -; GFX10-NEXT: s_and_b32 s11, s3, s6 -; GFX10-NEXT: s_lshl_b32 s10, s10, 8 -; GFX10-NEXT: s_bfe_u32 s3, s3, s8 -; GFX10-NEXT: s_or_b32 s10, s11, s10 +; GFX10-NEXT: s_lshl_b32 s6, s8, 24 +; GFX10-NEXT: s_and_b32 s8, s3, 0xff +; GFX10-NEXT: s_lshl_b32 s7, s7, 8 +; GFX10-NEXT: s_bfe_u32 s3, s3, 0x80010 +; GFX10-NEXT: s_or_b32 s2, s12, s2 +; GFX10-NEXT: s_or_b32 s7, s8, s7 ; GFX10-NEXT: s_lshl_b32 s3, s3, 16 -; GFX10-NEXT: s_or_b32 s2, s2, s9 -; GFX10-NEXT: s_or_b32 s3, s10, s3 -; GFX10-NEXT: s_lshl_b32 s9, s12, 24 -; GFX10-NEXT: s_lshr_b32 s10, s5, 2 -; GFX10-NEXT: s_or_b32 s3, s3, s9 -; GFX10-NEXT: s_cmp_eq_u32 s10, 1 -; GFX10-NEXT: s_cselect_b32 s9, s1, s0 -; GFX10-NEXT: s_cmp_eq_u32 s10, 2 -; GFX10-NEXT: s_cselect_b32 s9, s2, s9 -; GFX10-NEXT: s_cmp_eq_u32 s10, 3 -; GFX10-NEXT: s_cselect_b32 s9, s3, s9 +; GFX10-NEXT: s_or_b32 s2, s2, s6 +; GFX10-NEXT: s_or_b32 s3, s7, s3 +; GFX10-NEXT: s_lshl_b32 s6, s9, 24 +; GFX10-NEXT: s_lshr_b32 s7, s5, 2 +; GFX10-NEXT: s_or_b32 s3, s3, s6 +; GFX10-NEXT: s_cmp_eq_u32 s7, 1 +; GFX10-NEXT: s_cselect_b32 s6, s1, s0 +; GFX10-NEXT: s_cmp_eq_u32 s7, 2 +; GFX10-NEXT: s_cselect_b32 s6, s2, s6 +; GFX10-NEXT: s_cmp_eq_u32 s7, 3 +; GFX10-NEXT: s_cselect_b32 s6, s3, s6 ; GFX10-NEXT: s_and_b32 s5, s5, 3 -; GFX10-NEXT: s_and_b32 s4, s4, s6 +; GFX10-NEXT: s_and_b32 s4, s4, 0xff ; GFX10-NEXT: s_lshl_b32 s5, s5, 3 -; GFX10-NEXT: s_lshl_b32 s11, s6, s5 +; GFX10-NEXT: s_lshl_b32 s8, 0xff, s5 ; GFX10-NEXT: s_lshl_b32 s4, s4, s5 -; GFX10-NEXT: s_andn2_b32 s5, s9, s11 +; GFX10-NEXT: s_andn2_b32 s5, s6, s8 ; GFX10-NEXT: s_or_b32 s4, s5, s4 -; GFX10-NEXT: s_cmp_eq_u32 s10, 0 +; GFX10-NEXT: s_cmp_eq_u32 s7, 0 ; GFX10-NEXT: s_cselect_b32 s0, s4, s0 -; GFX10-NEXT: s_cmp_eq_u32 s10, 1 +; GFX10-NEXT: s_cmp_eq_u32 s7, 1 ; GFX10-NEXT: s_cselect_b32 s1, s4, s1 -; GFX10-NEXT: s_cmp_eq_u32 s10, 2 +; GFX10-NEXT: s_cmp_eq_u32 s7, 2 ; GFX10-NEXT: s_cselect_b32 s2, s4, s2 -; GFX10-NEXT: s_cmp_eq_u32 s10, 3 +; GFX10-NEXT: s_cmp_eq_u32 s7, 3 ; GFX10-NEXT: s_cselect_b32 s3, s4, s3 -; GFX10-NEXT: s_bfe_u32 s10, s0, s7 +; GFX10-NEXT: s_bfe_u32 s7, s0, 0x80008 ; GFX10-NEXT: s_lshr_b32 s4, s0, 24 -; GFX10-NEXT: s_and_b32 s11, s0, s6 -; GFX10-NEXT: s_lshl_b32 s10, s10, 8 -; GFX10-NEXT: s_bfe_u32 s0, s0, s8 -; GFX10-NEXT: s_or_b32 s10, s11, s10 +; GFX10-NEXT: s_and_b32 s8, s0, 0xff +; GFX10-NEXT: s_lshl_b32 s7, s7, 8 +; GFX10-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX10-NEXT: s_or_b32 s7, s8, s7 ; GFX10-NEXT: s_lshl_b32 s0, s0, 16 ; GFX10-NEXT: s_lshr_b32 s5, s1, 24 -; GFX10-NEXT: s_or_b32 s0, s10, s0 -; GFX10-NEXT: s_bfe_u32 s10, s1, s7 +; GFX10-NEXT: s_or_b32 s0, s7, s0 +; GFX10-NEXT: s_bfe_u32 s7, s1, 0x80008 ; GFX10-NEXT: s_lshl_b32 s4, s4, 24 -; GFX10-NEXT: s_and_b32 s12, s1, s6 -; GFX10-NEXT: s_lshl_b32 s10, s10, 8 -; GFX10-NEXT: s_bfe_u32 s1, s1, s8 -; GFX10-NEXT: s_or_b32 s10, s12, s10 +; GFX10-NEXT: s_and_b32 s9, s1, 0xff +; GFX10-NEXT: s_lshl_b32 s7, s7, 8 +; GFX10-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX10-NEXT: s_or_b32 s7, s9, s7 ; GFX10-NEXT: s_lshl_b32 s1, s1, 16 ; GFX10-NEXT: s_or_b32 s0, s0, s4 ; GFX10-NEXT: s_lshl_b32 s4, s5, 24 -; GFX10-NEXT: s_bfe_u32 s5, s2, s7 -; GFX10-NEXT: s_lshr_b32 s9, s2, 24 -; GFX10-NEXT: s_or_b32 s1, s10, s1 -; GFX10-NEXT: s_and_b32 s10, s2, s6 +; GFX10-NEXT: s_bfe_u32 s5, s2, 0x80008 +; GFX10-NEXT: s_lshr_b32 s6, s2, 24 +; GFX10-NEXT: s_or_b32 s1, s7, s1 +; GFX10-NEXT: s_and_b32 s7, s2, 0xff ; GFX10-NEXT: s_lshl_b32 s5, s5, 8 -; GFX10-NEXT: s_bfe_u32 s2, s2, s8 -; GFX10-NEXT: s_or_b32 s5, s10, s5 +; GFX10-NEXT: s_bfe_u32 s2, s2, 0x80010 +; GFX10-NEXT: s_or_b32 s5, s7, s5 ; GFX10-NEXT: s_lshl_b32 s2, s2, 16 ; GFX10-NEXT: s_or_b32 s1, s1, s4 -; GFX10-NEXT: s_bfe_u32 s4, s3, s7 -; GFX10-NEXT: s_lshr_b32 s11, s3, 24 +; GFX10-NEXT: s_bfe_u32 s4, s3, 0x80008 +; GFX10-NEXT: s_lshr_b32 s8, s3, 24 ; GFX10-NEXT: s_or_b32 s2, s5, s2 -; GFX10-NEXT: s_and_b32 s5, s3, s6 +; GFX10-NEXT: s_and_b32 s5, s3, 0xff ; GFX10-NEXT: s_lshl_b32 s4, s4, 8 -; GFX10-NEXT: s_bfe_u32 s3, s3, s8 +; GFX10-NEXT: s_bfe_u32 s3, s3, 0x80010 ; GFX10-NEXT: s_or_b32 s4, s5, s4 ; GFX10-NEXT: s_lshl_b32 s3, s3, 16 -; GFX10-NEXT: s_lshl_b32 s5, s9, 24 +; GFX10-NEXT: s_lshl_b32 s5, s6, 24 ; GFX10-NEXT: s_or_b32 s3, s4, s3 -; GFX10-NEXT: s_lshl_b32 s4, s11, 24 +; GFX10-NEXT: s_lshl_b32 s4, s8, 24 ; GFX10-NEXT: s_or_b32 s2, s2, s5 ; GFX10-NEXT: s_or_b32 s3, s3, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 @@ -4235,19 +4176,18 @@ ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off ; GFX9-NEXT: s_mov_b32 s0, 8 ; GFX9-NEXT: s_mov_b32 s1, 16 -; GFX9-NEXT: s_movk_i32 s6, 0xff -; GFX9-NEXT: v_mov_b32_e32 v6, 8 -; GFX9-NEXT: v_mov_b32_e32 v7, 16 -; GFX9-NEXT: s_lshr_b32 s4, s3, 2 +; GFX9-NEXT: s_movk_i32 s4, 0xff +; GFX9-NEXT: v_mov_b32_e32 v7, 8 +; GFX9-NEXT: v_mov_b32_e32 v6, 0xff +; GFX9-NEXT: v_mov_b32_e32 v8, 16 +; GFX9-NEXT: s_lshr_b32 s5, s3, 2 ; GFX9-NEXT: s_and_b32 s3, s3, 3 -; GFX9-NEXT: s_and_b32 s2, s2, s6 +; GFX9-NEXT: s_and_b32 s2, s2, 0xff ; GFX9-NEXT: s_lshl_b32 s3, s3, 3 -; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 +; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s5, 1 ; GFX9-NEXT: s_lshl_b32 s2, s2, s3 -; GFX9-NEXT: s_lshl_b32 s3, s6, s3 -; GFX9-NEXT: s_not_b32 s5, s3 -; GFX9-NEXT: v_mov_b32_e32 v8, s2 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 +; GFX9-NEXT: s_lshl_b32 s3, 0xff, s3 +; GFX9-NEXT: s_not_b32 s6, s3 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -4258,57 +4198,59 @@ ; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v14, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v16, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v17, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_and_or_b32 v0, v0, s6, v13 -; GFX9-NEXT: v_and_or_b32 v1, v1, s6, v15 +; GFX9-NEXT: v_lshlrev_b32_sdwa v17, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v13 +; GFX9-NEXT: v_and_or_b32 v1, v1, s4, v15 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 24, v9 ; GFX9-NEXT: v_lshlrev_b32_e32 v10, 24, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v12, 24, v3 -; GFX9-NEXT: v_lshlrev_b32_sdwa v18, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v19, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_and_or_b32 v2, v2, s6, v17 +; GFX9-NEXT: v_lshlrev_b32_sdwa v18, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v19, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_and_or_b32 v2, v2, v6, v17 ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 24, v11 ; GFX9-NEXT: v_or3_b32 v0, v0, v14, v9 ; GFX9-NEXT: v_or3_b32 v1, v1, v16, v10 -; GFX9-NEXT: v_and_or_b32 v13, v3, s6, v19 -; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v13, v3, v6, v19 +; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_e32 v12, 24, v12 ; GFX9-NEXT: v_or3_b32 v2, v2, v18, v11 ; GFX9-NEXT: v_cndmask_b32_e32 v9, v0, v1, vcc -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s5, 2 +; GFX9-NEXT: v_mov_b32_e32 v15, s2 ; GFX9-NEXT: v_or3_b32 v3, v13, v3, v12 ; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v2, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s5, 3 ; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v3, s[2:3] -; GFX9-NEXT: v_and_or_b32 v8, v9, s5, v8 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v8, s[2:3] -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 24, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 24, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v3 -; GFX9-NEXT: v_lshlrev_b32_sdwa v12, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v14, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v16, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v6, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v13, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v15, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v17, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v7, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v0, v0, s6, v12 -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 24, v8 -; GFX9-NEXT: v_and_or_b32 v1, v1, s6, v14 +; GFX9-NEXT: v_and_or_b32 v9, v9, s6, v15 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], s5, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[2:3] +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 24, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 24, v3 +; GFX9-NEXT: v_lshlrev_b32_sdwa v13, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v15, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v17, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v7, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v14, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v16, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v18, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v8, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v0, v0, v6, v13 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 24, v9 -; GFX9-NEXT: v_and_or_b32 v2, v2, s6, v16 +; GFX9-NEXT: v_and_or_b32 v1, v1, v6, v15 ; GFX9-NEXT: v_lshlrev_b32_e32 v10, 24, v10 -; GFX9-NEXT: v_and_or_b32 v3, v3, s6, v6 -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 24, v11 -; GFX9-NEXT: v_or3_b32 v0, v0, v13, v8 -; GFX9-NEXT: v_or3_b32 v1, v1, v15, v9 -; GFX9-NEXT: v_or3_b32 v2, v2, v17, v10 -; GFX9-NEXT: v_or3_b32 v3, v3, v7, v6 +; GFX9-NEXT: v_and_or_b32 v2, v2, v6, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; GFX9-NEXT: v_and_or_b32 v3, v3, v6, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 24, v12 +; GFX9-NEXT: v_or3_b32 v0, v0, v14, v9 +; GFX9-NEXT: v_or3_b32 v1, v1, v16, v10 +; GFX9-NEXT: v_or3_b32 v2, v2, v18, v11 +; GFX9-NEXT: v_or3_b32 v3, v3, v8, v6 ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX9-NEXT: s_endpgm ; @@ -4319,14 +4261,13 @@ ; GFX8-NEXT: v_mov_b32_e32 v7, 16 ; GFX8-NEXT: v_mov_b32_e32 v8, 8 ; GFX8-NEXT: v_mov_b32_e32 v9, 16 -; GFX8-NEXT: s_and_b32 s1, s3, 3 -; GFX8-NEXT: s_movk_i32 s0, 0xff +; GFX8-NEXT: s_and_b32 s0, s3, 3 ; GFX8-NEXT: s_lshr_b32 s4, s3, 2 -; GFX8-NEXT: s_lshl_b32 s1, s1, 3 -; GFX8-NEXT: s_and_b32 s2, s2, s0 -; GFX8-NEXT: s_lshl_b32 s0, s0, s1 +; GFX8-NEXT: s_and_b32 s1, s2, 0xff +; GFX8-NEXT: s_lshl_b32 s0, s0, 3 +; GFX8-NEXT: s_lshl_b32 s5, s1, s0 +; GFX8-NEXT: s_lshl_b32 s0, 0xff, s0 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 -; GFX8-NEXT: s_lshl_b32 s5, s2, s1 ; GFX8-NEXT: s_not_b32 s6, s0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 @@ -4408,108 +4349,109 @@ ; GFX7-NEXT: s_mov_b32 s11, 0xf000 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_movk_i32 s6, 0xff -; GFX7-NEXT: s_and_b32 s0, s3, 3 +; GFX7-NEXT: s_movk_i32 s0, 0xff +; GFX7-NEXT: v_mov_b32_e32 v4, 0xff +; GFX7-NEXT: s_and_b32 s1, s3, 3 ; GFX7-NEXT: s_lshr_b32 s4, s3, 2 -; GFX7-NEXT: s_and_b32 s1, s2, s6 -; GFX7-NEXT: s_lshl_b32 s0, s0, 3 -; GFX7-NEXT: s_lshl_b32 s5, s1, s0 -; GFX7-NEXT: s_lshl_b32 s0, s6, s0 +; GFX7-NEXT: s_and_b32 s2, s2, 0xff +; GFX7-NEXT: s_lshl_b32 s1, s1, 3 +; GFX7-NEXT: s_lshl_b32 s5, s2, s1 +; GFX7-NEXT: s_lshl_b32 s1, 0xff, s1 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 -; GFX7-NEXT: s_not_b32 s7, s0 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 +; GFX7-NEXT: s_not_b32 s6, s1 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 ; GFX7-NEXT: s_mov_b32 s10, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_bfe_u32 v9, v0, 8, 8 -; GFX7-NEXT: v_bfe_u32 v11, v1, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v1 -; GFX7-NEXT: v_and_b32_e32 v8, s6, v0 +; GFX7-NEXT: v_bfe_u32 v10, v0, 8, 8 +; GFX7-NEXT: v_bfe_u32 v12, v1, 8, 8 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v1 +; GFX7-NEXT: v_and_b32_e32 v9, s0, v0 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_and_b32_e32 v10, s6, v1 +; GFX7-NEXT: v_and_b32_e32 v11, s0, v1 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX7-NEXT: v_bfe_u32 v13, v2, 8, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; GFX7-NEXT: v_lshlrev_b32_e32 v11, 8, v11 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v2 -; GFX7-NEXT: v_and_b32_e32 v12, s6, v2 +; GFX7-NEXT: v_bfe_u32 v14, v2, 8, 8 +; GFX7-NEXT: v_lshlrev_b32_e32 v10, 8, v10 +; GFX7-NEXT: v_lshlrev_b32_e32 v12, 8, v12 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v2 +; GFX7-NEXT: v_and_b32_e32 v13, v2, v4 ; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 8 -; GFX7-NEXT: v_bfe_u32 v15, v3, 8, 8 +; GFX7-NEXT: v_bfe_u32 v16, v3, 8, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v13, 8, v13 -; GFX7-NEXT: v_or_b32_e32 v8, v8, v9 -; GFX7-NEXT: v_or_b32_e32 v9, v10, v11 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v3 -; GFX7-NEXT: v_and_b32_e32 v14, s6, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; GFX7-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX7-NEXT: v_or_b32_e32 v10, v11, v12 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 24, v3 +; GFX7-NEXT: v_and_b32_e32 v15, v3, v4 ; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 24, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v15, 8, v15 -; GFX7-NEXT: v_or_b32_e32 v10, v12, v13 -; GFX7-NEXT: v_or_b32_e32 v0, v8, v0 -; GFX7-NEXT: v_or_b32_e32 v1, v9, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 24, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_or_b32_e32 v11, v14, v15 -; GFX7-NEXT: v_or_b32_e32 v2, v10, v2 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v16, 8, v16 +; GFX7-NEXT: v_or_b32_e32 v11, v13, v14 +; GFX7-NEXT: v_or_b32_e32 v0, v9, v0 +; GFX7-NEXT: v_or_b32_e32 v1, v10, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; GFX7-NEXT: v_or_b32_e32 v3, v11, v3 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v6 -; GFX7-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc -; GFX7-NEXT: v_or_b32_e32 v3, v3, v7 -; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v2, s[0:1] -; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v3, s[2:3] -; GFX7-NEXT: v_and_b32_e32 v4, s7, v4 -; GFX7-NEXT: v_or_b32_e32 v4, s5, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v12, v15, v16 +; GFX7-NEXT: v_or_b32_e32 v2, v11, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; GFX7-NEXT: v_or_b32_e32 v3, v12, v3 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v7 +; GFX7-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v8 +; GFX7-NEXT: v_cndmask_b32_e64 v5, v5, v2, s[0:1] +; GFX7-NEXT: v_cndmask_b32_e64 v5, v5, v3, s[2:3] +; GFX7-NEXT: v_and_b32_e32 v5, s6, v5 +; GFX7-NEXT: v_or_b32_e32 v5, s5, v5 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0 -; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1] -; GFX7-NEXT: v_bfe_u32 v9, v0, 8, 8 -; GFX7-NEXT: v_bfe_u32 v11, v1, 8, 8 -; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[2:3] -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v1 -; GFX7-NEXT: v_and_b32_e32 v8, s6, v0 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] +; GFX7-NEXT: v_bfe_u32 v10, v0, 8, 8 +; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[2:3] +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v0 +; GFX7-NEXT: v_and_b32_e32 v9, v0, v4 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_and_b32_e32 v10, s6, v1 -; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX7-NEXT: v_bfe_u32 v13, v2, 8, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; GFX7-NEXT: v_lshlrev_b32_e32 v11, 8, v11 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v2 -; GFX7-NEXT: v_and_b32_e32 v12, s6, v2 +; GFX7-NEXT: v_bfe_u32 v12, v1, 8, 8 +; GFX7-NEXT: v_bfe_u32 v14, v2, 8, 8 +; GFX7-NEXT: v_lshlrev_b32_e32 v10, 8, v10 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v2 +; GFX7-NEXT: v_and_b32_e32 v11, v1, v4 +; GFX7-NEXT: v_and_b32_e32 v13, v2, v4 ; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v13, 8, v13 -; GFX7-NEXT: v_or_b32_e32 v8, v8, v9 -; GFX7-NEXT: v_or_b32_e32 v9, v10, v11 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v12, 8, v12 +; GFX7-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; GFX7-NEXT: v_or_b32_e32 v9, v9, v10 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX7-NEXT: v_or_b32_e32 v10, v11, v12 +; GFX7-NEXT: v_or_b32_e32 v11, v13, v14 +; GFX7-NEXT: v_or_b32_e32 v0, v9, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_or_b32_e32 v10, v12, v13 -; GFX7-NEXT: v_or_b32_e32 v0, v8, v0 -; GFX7-NEXT: v_or_b32_e32 v1, v9, v1 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v5 -; GFX7-NEXT: v_or_b32_e32 v2, v10, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v6 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX7-NEXT: v_or_b32_e32 v2, v11, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 24, v7 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v5 ; GFX7-NEXT: v_bfe_u32 v5, v3, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v3 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 -; GFX7-NEXT: v_and_b32_e32 v4, s6, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 24, v3 +; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 +; GFX7-NEXT: v_and_b32_e32 v4, v3, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 8 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX7-NEXT: v_or_b32_e32 v1, v10, v1 ; GFX7-NEXT: v_or_b32_e32 v3, v4, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v8 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v6 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; GFX7-NEXT: s_endpgm @@ -4520,11 +4462,10 @@ ; GFX10-NEXT: s_mov_b32 s0, 8 ; GFX10-NEXT: v_mov_b32_e32 v4, 8 ; GFX10-NEXT: s_mov_b32 s1, 16 -; GFX10-NEXT: s_movk_i32 s4, 0xff ; GFX10-NEXT: v_mov_b32_e32 v5, 16 -; GFX10-NEXT: s_lshr_b32 s5, s3, 2 -; GFX10-NEXT: s_and_b32 s2, s2, s4 -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s5, 1 +; GFX10-NEXT: s_lshr_b32 s4, s3, 2 +; GFX10-NEXT: s_and_b32 s2, s2, 0xff +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s4, 1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v1 @@ -4534,34 +4475,34 @@ ; GFX10-NEXT: v_lshlrev_b32_sdwa v11, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v13, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v14, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_and_or_b32 v0, v0, s4, v10 +; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v10 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6 -; GFX10-NEXT: v_and_or_b32 v1, v1, s4, v12 +; GFX10-NEXT: v_and_or_b32 v1, v1, 0xff, v12 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v7 ; GFX10-NEXT: v_lshrrev_b32_e32 v9, 24, v3 ; GFX10-NEXT: v_lshlrev_b32_sdwa v15, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v16, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_and_or_b32 v2, v2, s4, v14 +; GFX10-NEXT: v_and_or_b32 v2, 0xff, v2, v14 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8 ; GFX10-NEXT: v_or3_b32 v0, v0, v11, v6 ; GFX10-NEXT: v_or3_b32 v1, v1, v13, v7 ; GFX10-NEXT: v_lshlrev_b32_sdwa v10, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v3, v3, s4, v16 +; GFX10-NEXT: v_and_or_b32 v3, 0xff, v3, v16 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v9 ; GFX10-NEXT: v_or3_b32 v2, v2, v15, v8 ; GFX10-NEXT: v_cndmask_b32_e32 v7, v0, v1, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s5, 2 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s4, 2 ; GFX10-NEXT: s_and_b32 s1, s3, 3 ; GFX10-NEXT: v_or3_b32 v3, v3, v10, v6 ; GFX10-NEXT: s_lshl_b32 s3, s1, 3 -; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s5, 3 +; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s4, 3 ; GFX10-NEXT: v_cndmask_b32_e64 v6, v7, v2, s0 -; GFX10-NEXT: s_lshl_b32 s6, s4, s3 +; GFX10-NEXT: s_lshl_b32 s5, 0xff, s3 ; GFX10-NEXT: s_lshl_b32 s2, s2, s3 -; GFX10-NEXT: s_not_b32 s3, s6 +; GFX10-NEXT: s_not_b32 s3, s5 ; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v3, s1 ; GFX10-NEXT: v_and_or_b32 v6, v6, s3, s2 -; GFX10-NEXT: v_cmp_eq_u32_e64 s2, s5, 0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s2, s4, 0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v6, s2 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0 @@ -4578,13 +4519,13 @@ ; GFX10-NEXT: v_lshlrev_b32_sdwa v13, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v15, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v16, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v0, v0, s4, v10 +; GFX10-NEXT: v_and_or_b32 v0, 0xff, v0, v10 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6 -; GFX10-NEXT: v_and_or_b32 v1, v1, s4, v12 +; GFX10-NEXT: v_and_or_b32 v1, 0xff, v1, v12 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; GFX10-NEXT: v_and_or_b32 v2, v2, s4, v14 +; GFX10-NEXT: v_and_or_b32 v2, 0xff, v2, v14 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8 -; GFX10-NEXT: v_and_or_b32 v3, v3, s4, v4 +; GFX10-NEXT: v_and_or_b32 v3, 0xff, v3, v4 ; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v9 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_mov_b32_e32 v5, 0 @@ -4604,47 +4545,46 @@ ; GFX9-LABEL: insertelement_s_v16i8_v_s: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GFX9-NEXT: s_mov_b32 s13, 0x80008 -; GFX9-NEXT: s_movk_i32 s11, 0xff -; GFX9-NEXT: v_and_b32_e32 v0, s11, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX9-NEXT: s_mov_b32 s5, 8 +; GFX9-NEXT: s_mov_b32 s6, 16 +; GFX9-NEXT: v_mov_b32_e32 v4, 0xff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_u32 s14, s0, s13 -; GFX9-NEXT: s_and_b32 s12, s0, s11 -; GFX9-NEXT: s_lshl_b32 s14, s14, 8 -; GFX9-NEXT: s_or_b32 s12, s12, s14 -; GFX9-NEXT: s_mov_b32 s14, 0x80010 +; GFX9-NEXT: s_bfe_u32 s12, s0, 0x80008 ; GFX9-NEXT: s_lshr_b32 s7, s0, 24 -; GFX9-NEXT: s_bfe_u32 s0, s0, s14 +; GFX9-NEXT: s_and_b32 s11, s0, 0xff +; GFX9-NEXT: s_lshl_b32 s12, s12, 8 +; GFX9-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX9-NEXT: s_or_b32 s11, s11, s12 ; GFX9-NEXT: s_lshl_b32 s0, s0, 16 -; GFX9-NEXT: s_or_b32 s0, s12, s0 +; GFX9-NEXT: s_or_b32 s0, s11, s0 ; GFX9-NEXT: s_lshl_b32 s7, s7, 24 -; GFX9-NEXT: s_bfe_u32 s12, s1, s13 +; GFX9-NEXT: s_bfe_u32 s11, s1, 0x80008 ; GFX9-NEXT: s_lshr_b32 s8, s1, 24 ; GFX9-NEXT: s_or_b32 s0, s0, s7 -; GFX9-NEXT: s_and_b32 s7, s1, s11 -; GFX9-NEXT: s_lshl_b32 s12, s12, 8 -; GFX9-NEXT: s_bfe_u32 s1, s1, s14 -; GFX9-NEXT: s_or_b32 s7, s7, s12 +; GFX9-NEXT: s_and_b32 s7, s1, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s11, 8 +; GFX9-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX9-NEXT: s_or_b32 s7, s7, s11 ; GFX9-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9-NEXT: s_or_b32 s1, s7, s1 ; GFX9-NEXT: s_lshl_b32 s7, s8, 24 -; GFX9-NEXT: s_bfe_u32 s8, s2, s13 +; GFX9-NEXT: s_bfe_u32 s8, s2, 0x80008 ; GFX9-NEXT: s_lshr_b32 s9, s2, 24 ; GFX9-NEXT: s_or_b32 s1, s1, s7 -; GFX9-NEXT: s_and_b32 s7, s2, s11 +; GFX9-NEXT: s_and_b32 s7, s2, 0xff ; GFX9-NEXT: s_lshl_b32 s8, s8, 8 -; GFX9-NEXT: s_bfe_u32 s2, s2, s14 +; GFX9-NEXT: s_bfe_u32 s2, s2, 0x80010 ; GFX9-NEXT: s_or_b32 s7, s7, s8 ; GFX9-NEXT: s_lshl_b32 s2, s2, 16 ; GFX9-NEXT: s_or_b32 s2, s7, s2 ; GFX9-NEXT: s_lshl_b32 s7, s9, 24 -; GFX9-NEXT: s_bfe_u32 s8, s3, s13 +; GFX9-NEXT: s_bfe_u32 s8, s3, 0x80008 ; GFX9-NEXT: s_lshr_b32 s10, s3, 24 ; GFX9-NEXT: s_or_b32 s2, s2, s7 -; GFX9-NEXT: s_and_b32 s7, s3, s11 +; GFX9-NEXT: s_and_b32 s7, s3, 0xff ; GFX9-NEXT: s_lshl_b32 s8, s8, 8 -; GFX9-NEXT: s_bfe_u32 s3, s3, s14 +; GFX9-NEXT: s_bfe_u32 s3, s3, 0x80010 ; GFX9-NEXT: s_or_b32 s7, s7, s8 ; GFX9-NEXT: s_lshl_b32 s3, s3, 16 ; GFX9-NEXT: s_or_b32 s3, s7, s3 @@ -4659,48 +4599,47 @@ ; GFX9-NEXT: s_cselect_b32 s8, s3, s8 ; GFX9-NEXT: s_and_b32 s4, s4, 3 ; GFX9-NEXT: s_lshl_b32 s4, s4, 3 -; GFX9-NEXT: s_lshl_b32 s9, s11, s4 +; GFX9-NEXT: s_lshl_b32 s9, 0xff, s4 ; GFX9-NEXT: s_andn2_b32 s8, s8, s9 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 -; GFX9-NEXT: v_lshl_or_b32 v4, v0, s4, v1 +; GFX9-NEXT: v_lshl_or_b32 v5, v0, s4, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s7, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s7, 1 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s7, 2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s7, 3 -; GFX9-NEXT: s_mov_b32 s6, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v8, s5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_and_or_b32 v8, v0, s11, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v9, s5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_and_or_b32 v9, v0, v4, v9 ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v1 -; GFX9-NEXT: v_or3_b32 v0, v8, v0, v4 -; GFX9-NEXT: v_lshlrev_b32_sdwa v4, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_and_or_b32 v4, v1, s11, v4 -; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v5 -; GFX9-NEXT: v_or3_b32 v1, v4, v1, v5 -; GFX9-NEXT: v_mov_b32_e32 v4, 8 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 24, v2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_mov_b32_e32 v8, 16 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v3 -; GFX9-NEXT: v_and_or_b32 v5, v2, s11, v5 -; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 24, v1 +; GFX9-NEXT: v_or3_b32 v0, v9, v0, v5 +; GFX9-NEXT: v_lshlrev_b32_sdwa v5, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_and_or_b32 v5, v1, v4, v5 +; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_e32 v6, 24, v6 -; GFX9-NEXT: v_lshlrev_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_or3_b32 v2, v5, v2, v6 -; GFX9-NEXT: v_and_or_b32 v6, v3, s11, v4 -; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_or3_b32 v1, v5, v1, v6 +; GFX9-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v6, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_mov_b32_e32 v9, 16 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 24, v3 +; GFX9-NEXT: v_and_or_b32 v6, v2, v4, v6 +; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_or3_b32 v2, v6, v2, v7 +; GFX9-NEXT: v_and_or_b32 v6, v3, v4, v5 +; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 24, v8 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: v_or3_b32 v3, v6, v3, v7 @@ -4710,47 +4649,44 @@ ; GFX8-LABEL: insertelement_s_v16i8_v_s: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GFX8-NEXT: s_mov_b32 s11, 0x80008 -; GFX8-NEXT: s_movk_i32 s9, 0xff ; GFX8-NEXT: v_mov_b32_e32 v8, 8 ; GFX8-NEXT: v_mov_b32_e32 v10, 16 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_u32 s12, s0, s11 -; GFX8-NEXT: s_and_b32 s10, s0, s9 -; GFX8-NEXT: s_lshl_b32 s12, s12, 8 -; GFX8-NEXT: s_or_b32 s10, s10, s12 -; GFX8-NEXT: s_mov_b32 s12, 0x80010 +; GFX8-NEXT: s_bfe_u32 s10, s0, 0x80008 ; GFX8-NEXT: s_lshr_b32 s5, s0, 24 -; GFX8-NEXT: s_bfe_u32 s0, s0, s12 +; GFX8-NEXT: s_and_b32 s9, s0, 0xff +; GFX8-NEXT: s_lshl_b32 s10, s10, 8 +; GFX8-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX8-NEXT: s_or_b32 s9, s9, s10 ; GFX8-NEXT: s_lshl_b32 s0, s0, 16 -; GFX8-NEXT: s_or_b32 s0, s10, s0 +; GFX8-NEXT: s_or_b32 s0, s9, s0 ; GFX8-NEXT: s_lshl_b32 s5, s5, 24 -; GFX8-NEXT: s_bfe_u32 s10, s1, s11 +; GFX8-NEXT: s_bfe_u32 s9, s1, 0x80008 ; GFX8-NEXT: s_lshr_b32 s6, s1, 24 ; GFX8-NEXT: s_or_b32 s0, s0, s5 -; GFX8-NEXT: s_and_b32 s5, s1, s9 -; GFX8-NEXT: s_lshl_b32 s10, s10, 8 -; GFX8-NEXT: s_bfe_u32 s1, s1, s12 -; GFX8-NEXT: s_or_b32 s5, s5, s10 +; GFX8-NEXT: s_and_b32 s5, s1, 0xff +; GFX8-NEXT: s_lshl_b32 s9, s9, 8 +; GFX8-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX8-NEXT: s_or_b32 s5, s5, s9 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s1, s5, s1 ; GFX8-NEXT: s_lshl_b32 s5, s6, 24 -; GFX8-NEXT: s_bfe_u32 s6, s2, s11 +; GFX8-NEXT: s_bfe_u32 s6, s2, 0x80008 ; GFX8-NEXT: s_lshr_b32 s7, s2, 24 ; GFX8-NEXT: s_or_b32 s1, s1, s5 -; GFX8-NEXT: s_and_b32 s5, s2, s9 +; GFX8-NEXT: s_and_b32 s5, s2, 0xff ; GFX8-NEXT: s_lshl_b32 s6, s6, 8 -; GFX8-NEXT: s_bfe_u32 s2, s2, s12 +; GFX8-NEXT: s_bfe_u32 s2, s2, 0x80010 ; GFX8-NEXT: s_or_b32 s5, s5, s6 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16 ; GFX8-NEXT: s_or_b32 s2, s5, s2 ; GFX8-NEXT: s_lshl_b32 s5, s7, 24 -; GFX8-NEXT: s_bfe_u32 s6, s3, s11 +; GFX8-NEXT: s_bfe_u32 s6, s3, 0x80008 ; GFX8-NEXT: s_lshr_b32 s8, s3, 24 ; GFX8-NEXT: s_or_b32 s2, s2, s5 -; GFX8-NEXT: s_and_b32 s5, s3, s9 +; GFX8-NEXT: s_and_b32 s5, s3, 0xff ; GFX8-NEXT: s_lshl_b32 s6, s6, 8 -; GFX8-NEXT: s_bfe_u32 s3, s3, s12 +; GFX8-NEXT: s_bfe_u32 s3, s3, 0x80010 ; GFX8-NEXT: s_or_b32 s5, s5, s6 ; GFX8-NEXT: s_lshl_b32 s3, s3, 16 ; GFX8-NEXT: s_or_b32 s3, s5, s3 @@ -4766,7 +4702,7 @@ ; GFX8-NEXT: s_and_b32 s4, s4, 3 ; GFX8-NEXT: s_lshl_b32 s4, s4, 3 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 -; GFX8-NEXT: s_lshl_b32 s4, s9, s4 +; GFX8-NEXT: s_lshl_b32 s4, 0xff, s4 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: s_andn2_b32 s4, s6, s4 ; GFX8-NEXT: v_or_b32_e32 v4, s4, v0 @@ -4820,46 +4756,44 @@ ; GFX7-LABEL: insertelement_s_v16i8_v_s: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GFX7-NEXT: s_mov_b32 s11, 0x80008 -; GFX7-NEXT: s_movk_i32 s9, 0xff -; GFX7-NEXT: v_and_b32_e32 v0, s9, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX7-NEXT: v_mov_b32_e32 v4, 0xff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_bfe_u32 s12, s0, s11 -; GFX7-NEXT: s_and_b32 s10, s0, s9 -; GFX7-NEXT: s_lshl_b32 s12, s12, 8 -; GFX7-NEXT: s_or_b32 s10, s10, s12 -; GFX7-NEXT: s_mov_b32 s12, 0x80010 +; GFX7-NEXT: s_bfe_u32 s10, s0, 0x80008 ; GFX7-NEXT: s_lshr_b32 s5, s0, 24 -; GFX7-NEXT: s_bfe_u32 s0, s0, s12 +; GFX7-NEXT: s_and_b32 s9, s0, 0xff +; GFX7-NEXT: s_lshl_b32 s10, s10, 8 +; GFX7-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX7-NEXT: s_or_b32 s9, s9, s10 ; GFX7-NEXT: s_lshl_b32 s0, s0, 16 -; GFX7-NEXT: s_or_b32 s0, s10, s0 +; GFX7-NEXT: s_or_b32 s0, s9, s0 ; GFX7-NEXT: s_lshl_b32 s5, s5, 24 -; GFX7-NEXT: s_bfe_u32 s10, s1, s11 +; GFX7-NEXT: s_bfe_u32 s9, s1, 0x80008 ; GFX7-NEXT: s_lshr_b32 s6, s1, 24 ; GFX7-NEXT: s_or_b32 s0, s0, s5 -; GFX7-NEXT: s_and_b32 s5, s1, s9 -; GFX7-NEXT: s_lshl_b32 s10, s10, 8 -; GFX7-NEXT: s_bfe_u32 s1, s1, s12 -; GFX7-NEXT: s_or_b32 s5, s5, s10 +; GFX7-NEXT: s_and_b32 s5, s1, 0xff +; GFX7-NEXT: s_lshl_b32 s9, s9, 8 +; GFX7-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX7-NEXT: s_or_b32 s5, s5, s9 ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 ; GFX7-NEXT: s_or_b32 s1, s5, s1 ; GFX7-NEXT: s_lshl_b32 s5, s6, 24 -; GFX7-NEXT: s_bfe_u32 s6, s2, s11 +; GFX7-NEXT: s_bfe_u32 s6, s2, 0x80008 ; GFX7-NEXT: s_lshr_b32 s7, s2, 24 ; GFX7-NEXT: s_or_b32 s1, s1, s5 -; GFX7-NEXT: s_and_b32 s5, s2, s9 +; GFX7-NEXT: s_and_b32 s5, s2, 0xff ; GFX7-NEXT: s_lshl_b32 s6, s6, 8 -; GFX7-NEXT: s_bfe_u32 s2, s2, s12 +; GFX7-NEXT: s_bfe_u32 s2, s2, 0x80010 ; GFX7-NEXT: s_or_b32 s5, s5, s6 ; GFX7-NEXT: s_lshl_b32 s2, s2, 16 ; GFX7-NEXT: s_or_b32 s2, s5, s2 ; GFX7-NEXT: s_lshl_b32 s5, s7, 24 -; GFX7-NEXT: s_bfe_u32 s6, s3, s11 +; GFX7-NEXT: s_bfe_u32 s6, s3, 0x80008 ; GFX7-NEXT: s_lshr_b32 s8, s3, 24 ; GFX7-NEXT: s_or_b32 s2, s2, s5 -; GFX7-NEXT: s_and_b32 s5, s3, s9 +; GFX7-NEXT: s_and_b32 s5, s3, 0xff ; GFX7-NEXT: s_lshl_b32 s6, s6, 8 -; GFX7-NEXT: s_bfe_u32 s3, s3, s12 +; GFX7-NEXT: s_bfe_u32 s3, s3, 0x80010 ; GFX7-NEXT: s_or_b32 s5, s5, s6 ; GFX7-NEXT: s_lshl_b32 s3, s3, 16 ; GFX7-NEXT: s_or_b32 s3, s5, s3 @@ -4875,60 +4809,60 @@ ; GFX7-NEXT: s_and_b32 s4, s4, 3 ; GFX7-NEXT: s_lshl_b32 s4, s4, 3 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX7-NEXT: s_lshl_b32 s4, s9, s4 +; GFX7-NEXT: s_lshl_b32 s4, 0xff, s4 ; GFX7-NEXT: s_andn2_b32 s4, s6, s4 -; GFX7-NEXT: v_or_b32_e32 v4, s4, v0 +; GFX7-NEXT: v_or_b32_e32 v5, s4, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s5, 0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s5, 1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s5, 2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s5, 3 -; GFX7-NEXT: v_bfe_u32 v9, v0, 8, 8 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX7-NEXT: v_and_b32_e32 v8, s9, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX7-NEXT: v_bfe_u32 v10, v0, 8, 8 +; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v0 +; GFX7-NEXT: v_and_b32_e32 v9, v0, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v10, 8, v10 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_or_b32_e32 v8, v8, v9 +; GFX7-NEXT: v_or_b32_e32 v9, v9, v10 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v8, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX7-NEXT: v_bfe_u32 v8, v1, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v1 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX7-NEXT: v_and_b32_e32 v4, s9, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; GFX7-NEXT: v_or_b32_e32 v0, v9, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX7-NEXT: v_bfe_u32 v9, v1, 8, 8 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX7-NEXT: v_and_b32_e32 v5, v1, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX7-NEXT: v_or_b32_e32 v4, v4, v8 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v9 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_or_b32_e32 v1, v4, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v5 -; GFX7-NEXT: v_bfe_u32 v5, v2, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v2 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX7-NEXT: v_and_b32_e32 v4, s9, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX7-NEXT: v_or_b32_e32 v1, v5, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 24, v6 +; GFX7-NEXT: v_bfe_u32 v6, v2, 8, 8 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v2 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v5 +; GFX7-NEXT: v_and_b32_e32 v5, v2, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 ; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 8 -; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v6 +; GFX7-NEXT: v_or_b32_e32 v2, v5, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 24, v7 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v5 ; GFX7-NEXT: v_bfe_u32 v5, v3, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v3 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 -; GFX7-NEXT: v_and_b32_e32 v4, s9, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 24, v3 +; GFX7-NEXT: v_and_b32_e32 v4, v3, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 8 ; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_or_b32_e32 v3, v4, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v8 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX7-NEXT: s_mov_b32 s2, -1 @@ -4939,81 +4873,78 @@ ; GFX10-LABEL: insertelement_s_v16i8_v_s: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GFX10-NEXT: s_mov_b32 s6, 0x80008 -; GFX10-NEXT: s_movk_i32 s5, 0xff -; GFX10-NEXT: s_mov_b32 s7, 0x80010 -; GFX10-NEXT: v_and_b32_e32 v0, s5, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX10-NEXT: v_mov_b32_e32 v10, 8 ; GFX10-NEXT: v_mov_b32_e32 v12, 16 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_bfe_u32 s13, s0, s6 -; GFX10-NEXT: s_bfe_u32 s15, s1, s6 -; GFX10-NEXT: s_lshr_b32 s8, s0, 24 -; GFX10-NEXT: s_lshr_b32 s9, s1, 24 -; GFX10-NEXT: s_and_b32 s12, s0, s5 -; GFX10-NEXT: s_bfe_u32 s0, s0, s7 -; GFX10-NEXT: s_and_b32 s14, s1, s5 -; GFX10-NEXT: s_bfe_u32 s1, s1, s7 -; GFX10-NEXT: s_lshl_b32 s13, s13, 8 -; GFX10-NEXT: s_lshl_b32 s15, s15, 8 -; GFX10-NEXT: s_lshl_b32 s0, s0, 16 +; GFX10-NEXT: s_bfe_u32 s10, s0, 0x80008 +; GFX10-NEXT: s_bfe_u32 s12, s1, 0x80008 +; GFX10-NEXT: s_lshr_b32 s6, s1, 24 +; GFX10-NEXT: s_and_b32 s9, s0, 0xff +; GFX10-NEXT: s_and_b32 s11, s1, 0xff +; GFX10-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX10-NEXT: s_lshl_b32 s10, s10, 8 +; GFX10-NEXT: s_lshl_b32 s12, s12, 8 +; GFX10-NEXT: s_lshr_b32 s5, s0, 24 +; GFX10-NEXT: s_bfe_u32 s0, s0, 0x80010 ; GFX10-NEXT: s_lshl_b32 s1, s1, 16 -; GFX10-NEXT: s_or_b32 s12, s12, s13 -; GFX10-NEXT: s_or_b32 s13, s14, s15 -; GFX10-NEXT: s_bfe_u32 s17, s2, s6 -; GFX10-NEXT: s_lshl_b32 s8, s8, 24 -; GFX10-NEXT: s_lshl_b32 s9, s9, 24 -; GFX10-NEXT: s_or_b32 s0, s12, s0 -; GFX10-NEXT: s_or_b32 s1, s13, s1 -; GFX10-NEXT: s_bfe_u32 s6, s3, s6 -; GFX10-NEXT: s_lshr_b32 s10, s2, 24 -; GFX10-NEXT: s_lshr_b32 s11, s3, 24 -; GFX10-NEXT: s_and_b32 s16, s2, s5 -; GFX10-NEXT: s_or_b32 s0, s0, s8 -; GFX10-NEXT: s_lshl_b32 s8, s17, 8 -; GFX10-NEXT: s_bfe_u32 s2, s2, s7 -; GFX10-NEXT: s_or_b32 s1, s1, s9 -; GFX10-NEXT: s_and_b32 s9, s3, s5 -; GFX10-NEXT: s_lshl_b32 s6, s6, 8 -; GFX10-NEXT: s_bfe_u32 s3, s3, s7 -; GFX10-NEXT: s_or_b32 s8, s16, s8 +; GFX10-NEXT: s_or_b32 s9, s9, s10 +; GFX10-NEXT: s_or_b32 s10, s11, s12 +; GFX10-NEXT: s_bfe_u32 s14, s2, 0x80008 +; GFX10-NEXT: s_lshl_b32 s0, s0, 16 +; GFX10-NEXT: s_lshl_b32 s6, s6, 24 +; GFX10-NEXT: s_or_b32 s1, s10, s1 +; GFX10-NEXT: s_lshr_b32 s7, s2, 24 +; GFX10-NEXT: s_and_b32 s13, s2, 0xff +; GFX10-NEXT: s_bfe_u32 s2, s2, 0x80010 +; GFX10-NEXT: s_lshl_b32 s5, s5, 24 +; GFX10-NEXT: s_lshl_b32 s14, s14, 8 +; GFX10-NEXT: s_or_b32 s0, s9, s0 +; GFX10-NEXT: s_or_b32 s1, s1, s6 +; GFX10-NEXT: s_bfe_u32 s6, s3, 0x80008 +; GFX10-NEXT: s_lshr_b32 s8, s3, 24 +; GFX10-NEXT: s_or_b32 s11, s13, s14 +; GFX10-NEXT: s_or_b32 s0, s0, s5 ; GFX10-NEXT: s_lshl_b32 s2, s2, 16 -; GFX10-NEXT: s_or_b32 s6, s9, s6 +; GFX10-NEXT: s_lshl_b32 s5, s7, 24 +; GFX10-NEXT: s_and_b32 s7, s3, 0xff +; GFX10-NEXT: s_lshl_b32 s6, s6, 8 +; GFX10-NEXT: s_bfe_u32 s3, s3, 0x80010 +; GFX10-NEXT: s_or_b32 s2, s11, s2 +; GFX10-NEXT: s_or_b32 s6, s7, s6 ; GFX10-NEXT: s_lshl_b32 s3, s3, 16 -; GFX10-NEXT: s_or_b32 s2, s8, s2 -; GFX10-NEXT: s_lshl_b32 s8, s10, 24 +; GFX10-NEXT: s_or_b32 s2, s2, s5 ; GFX10-NEXT: s_or_b32 s3, s6, s3 -; GFX10-NEXT: s_lshl_b32 s6, s11, 24 -; GFX10-NEXT: s_lshr_b32 s7, s4, 2 -; GFX10-NEXT: s_or_b32 s2, s2, s8 -; GFX10-NEXT: s_or_b32 s3, s3, s6 -; GFX10-NEXT: s_cmp_eq_u32 s7, 1 -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s7, 0 -; GFX10-NEXT: s_cselect_b32 s6, s1, s0 -; GFX10-NEXT: s_cmp_eq_u32 s7, 2 -; GFX10-NEXT: s_cselect_b32 s6, s2, s6 -; GFX10-NEXT: s_cmp_eq_u32 s7, 3 -; GFX10-NEXT: s_cselect_b32 s6, s3, s6 +; GFX10-NEXT: s_lshl_b32 s5, s8, 24 +; GFX10-NEXT: s_lshr_b32 s6, s4, 2 +; GFX10-NEXT: s_or_b32 s3, s3, s5 +; GFX10-NEXT: s_cmp_eq_u32 s6, 1 +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s6, 0 +; GFX10-NEXT: s_cselect_b32 s5, s1, s0 +; GFX10-NEXT: s_cmp_eq_u32 s6, 2 +; GFX10-NEXT: s_cselect_b32 s5, s2, s5 +; GFX10-NEXT: s_cmp_eq_u32 s6, 3 +; GFX10-NEXT: s_cselect_b32 s5, s3, s5 ; GFX10-NEXT: s_and_b32 s4, s4, 3 ; GFX10-NEXT: s_lshl_b32 s4, s4, 3 -; GFX10-NEXT: s_lshl_b32 s8, s5, s4 -; GFX10-NEXT: s_andn2_b32 s6, s6, s8 -; GFX10-NEXT: v_lshl_or_b32 v4, v0, s4, s6 +; GFX10-NEXT: s_lshl_b32 s7, 0xff, s4 +; GFX10-NEXT: s_andn2_b32 s5, s5, s7 +; GFX10-NEXT: v_lshl_or_b32 v4, v0, s4, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-NEXT: s_mov_b32 s0, 8 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s7, 1 +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s6, 1 ; GFX10-NEXT: s_mov_b32 s1, 16 ; GFX10-NEXT: v_lshlrev_b32_sdwa v6, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s7, 2 -; GFX10-NEXT: v_and_or_b32 v6, v0, s5, v6 +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s6, 2 +; GFX10-NEXT: v_and_or_b32 v6, 0xff, v0, v6 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s7, 3 +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s6, 3 ; GFX10-NEXT: v_lshlrev_b32_sdwa v9, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v5 ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v2 @@ -5021,15 +4952,15 @@ ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v0 ; GFX10-NEXT: v_lshlrev_b32_sdwa v11, v10, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v9, v1, s5, v9 +; GFX10-NEXT: v_and_or_b32 v9, 0xff, v1, v9 ; GFX10-NEXT: v_lshrrev_b32_e32 v8, 24, v3 ; GFX10-NEXT: v_lshlrev_b32_sdwa v10, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v11, v2, s5, v11 +; GFX10-NEXT: v_and_or_b32 v11, 0xff, v2, v11 ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v12, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; GFX10-NEXT: v_and_or_b32 v10, v3, s5, v10 +; GFX10-NEXT: v_and_or_b32 v10, 0xff, v3, v10 ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v12, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8 ; GFX10-NEXT: v_or3_b32 v0, v6, v0, v4 @@ -5050,60 +4981,58 @@ ; GFX9-LABEL: insertelement_s_v16i8_s_v: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GFX9-NEXT: s_mov_b32 s13, 0x80008 -; GFX9-NEXT: s_movk_i32 s12, 0xff ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 2, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GFX9-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX9-NEXT: s_movk_i32 s12, 0xff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_u32 s14, s0, s13 -; GFX9-NEXT: s_and_b32 s8, s0, s12 -; GFX9-NEXT: s_lshl_b32 s14, s14, 8 -; GFX9-NEXT: s_or_b32 s8, s8, s14 -; GFX9-NEXT: s_mov_b32 s14, 0x80010 +; GFX9-NEXT: s_bfe_u32 s13, s0, 0x80008 ; GFX9-NEXT: s_lshr_b32 s5, s0, 24 -; GFX9-NEXT: s_bfe_u32 s0, s0, s14 +; GFX9-NEXT: s_and_b32 s8, s0, 0xff +; GFX9-NEXT: s_lshl_b32 s13, s13, 8 +; GFX9-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX9-NEXT: s_or_b32 s8, s8, s13 ; GFX9-NEXT: s_lshl_b32 s0, s0, 16 ; GFX9-NEXT: s_or_b32 s0, s8, s0 ; GFX9-NEXT: s_lshl_b32 s5, s5, 24 ; GFX9-NEXT: s_or_b32 s8, s0, s5 -; GFX9-NEXT: s_bfe_u32 s5, s1, s13 +; GFX9-NEXT: s_bfe_u32 s5, s1, 0x80008 ; GFX9-NEXT: s_lshr_b32 s9, s1, 24 -; GFX9-NEXT: s_and_b32 s0, s1, s12 +; GFX9-NEXT: s_and_b32 s0, s1, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s5, 8 -; GFX9-NEXT: s_bfe_u32 s1, s1, s14 +; GFX9-NEXT: s_bfe_u32 s1, s1, 0x80010 ; GFX9-NEXT: s_or_b32 s0, s0, s5 ; GFX9-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: s_lshl_b32 s1, s9, 24 ; GFX9-NEXT: s_or_b32 s9, s0, s1 -; GFX9-NEXT: s_bfe_u32 s1, s2, s13 -; GFX9-NEXT: s_and_b32 s0, s2, s12 +; GFX9-NEXT: s_bfe_u32 s1, s2, 0x80008 +; GFX9-NEXT: s_and_b32 s0, s2, 0xff ; GFX9-NEXT: s_lshl_b32 s1, s1, 8 ; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: s_bfe_u32 s1, s2, s14 +; GFX9-NEXT: s_bfe_u32 s1, s2, 0x80010 ; GFX9-NEXT: s_lshr_b32 s10, s2, 24 ; GFX9-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: s_lshl_b32 s1, s10, 24 ; GFX9-NEXT: s_or_b32 s10, s0, s1 -; GFX9-NEXT: s_bfe_u32 s1, s3, s13 -; GFX9-NEXT: s_and_b32 s0, s3, s12 +; GFX9-NEXT: s_bfe_u32 s1, s3, 0x80008 +; GFX9-NEXT: s_and_b32 s0, s3, 0xff ; GFX9-NEXT: s_lshl_b32 s1, s1, 8 ; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: s_bfe_u32 s1, s3, s14 +; GFX9-NEXT: s_bfe_u32 s1, s3, 0x80010 ; GFX9-NEXT: s_lshr_b32 s11, s3, 24 ; GFX9-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: s_lshl_b32 s1, s11, 24 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 ; GFX9-NEXT: v_mov_b32_e32 v2, s9 -; GFX9-NEXT: v_and_b32_e32 v0, 3, v0 ; GFX9-NEXT: s_or_b32 s11, s0, s1 ; GFX9-NEXT: v_mov_b32_e32 v3, s10 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX9-NEXT: s_and_b32 s4, s4, s12 +; GFX9-NEXT: s_and_b32 s4, s4, 0xff ; GFX9-NEXT: v_mov_b32_e32 v5, s11 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4 @@ -5111,43 +5040,44 @@ ; GFX9-NEXT: v_lshlrev_b32_e64 v0, v0, s12 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3] ; GFX9-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX9-NEXT: v_and_or_b32 v5, v1, v0, v2 +; GFX9-NEXT: v_and_or_b32 v6, v1, v0, v2 ; GFX9-NEXT: v_mov_b32_e32 v0, s8 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 ; GFX9-NEXT: s_mov_b32 s6, 8 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5] ; GFX9-NEXT: s_mov_b32 s7, 16 +; GFX9-NEXT: v_mov_b32_e32 v5, 0xff ; GFX9-NEXT: v_mov_b32_e32 v1, s9 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v8, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v9, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_mov_b32_e32 v2, s10 ; GFX9-NEXT: v_mov_b32_e32 v3, s11 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX9-NEXT: v_and_or_b32 v8, v0, s12, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; GFX9-NEXT: v_and_or_b32 v9, v0, v5, v9 ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[2:3] -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v1 -; GFX9-NEXT: v_or3_b32 v0, v8, v0, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[2:3] +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 24, v1 +; GFX9-NEXT: v_or3_b32 v0, v9, v0, v4 ; GFX9-NEXT: v_lshlrev_b32_sdwa v4, s6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_and_or_b32 v4, v1, s12, v4 +; GFX9-NEXT: v_and_or_b32 v4, v1, v5, v4 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v5 -; GFX9-NEXT: v_or3_b32 v1, v4, v1, v5 -; GFX9-NEXT: v_mov_b32_e32 v4, 8 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 24, v2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_mov_b32_e32 v8, 16 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v3 -; GFX9-NEXT: v_and_or_b32 v5, v2, s12, v5 -; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_e32 v6, 24, v6 -; GFX9-NEXT: v_lshlrev_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_or3_b32 v2, v5, v2, v6 -; GFX9-NEXT: v_and_or_b32 v6, v3, s12, v4 -; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_or3_b32 v1, v4, v1, v6 +; GFX9-NEXT: v_mov_b32_e32 v4, 8 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v6, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_mov_b32_e32 v9, 16 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 24, v3 +; GFX9-NEXT: v_and_or_b32 v6, v2, v5, v6 +; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX9-NEXT: v_lshlrev_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_or3_b32 v2, v6, v2, v7 +; GFX9-NEXT: v_and_or_b32 v6, v3, v5, v4 +; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 24, v8 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: v_or3_b32 v3, v6, v3, v7 @@ -5157,60 +5087,58 @@ ; GFX8-LABEL: insertelement_s_v16i8_s_v: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GFX8-NEXT: s_mov_b32 s13, 0x80008 -; GFX8-NEXT: s_movk_i32 s12, 0xff -; GFX8-NEXT: s_mov_b32 s14, 0x80010 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 2, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GFX8-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX8-NEXT: s_movk_i32 s12, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_u32 s9, s0, s13 +; GFX8-NEXT: s_bfe_u32 s9, s0, 0x80008 ; GFX8-NEXT: s_lshr_b32 s5, s0, 24 -; GFX8-NEXT: s_and_b32 s8, s0, s12 +; GFX8-NEXT: s_and_b32 s8, s0, 0xff ; GFX8-NEXT: s_lshl_b32 s9, s9, 8 -; GFX8-NEXT: s_bfe_u32 s0, s0, s14 +; GFX8-NEXT: s_bfe_u32 s0, s0, 0x80010 ; GFX8-NEXT: s_or_b32 s8, s8, s9 ; GFX8-NEXT: s_lshl_b32 s0, s0, 16 ; GFX8-NEXT: s_or_b32 s0, s8, s0 ; GFX8-NEXT: s_lshl_b32 s5, s5, 24 ; GFX8-NEXT: s_or_b32 s8, s0, s5 -; GFX8-NEXT: s_bfe_u32 s5, s1, s13 +; GFX8-NEXT: s_bfe_u32 s5, s1, 0x80008 ; GFX8-NEXT: s_lshr_b32 s6, s1, 24 -; GFX8-NEXT: s_and_b32 s0, s1, s12 +; GFX8-NEXT: s_and_b32 s0, s1, 0xff ; GFX8-NEXT: s_lshl_b32 s5, s5, 8 -; GFX8-NEXT: s_bfe_u32 s1, s1, s14 +; GFX8-NEXT: s_bfe_u32 s1, s1, 0x80010 ; GFX8-NEXT: s_or_b32 s0, s0, s5 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: s_lshl_b32 s1, s6, 24 ; GFX8-NEXT: s_or_b32 s9, s0, s1 -; GFX8-NEXT: s_bfe_u32 s1, s2, s13 -; GFX8-NEXT: s_and_b32 s0, s2, s12 +; GFX8-NEXT: s_bfe_u32 s1, s2, 0x80008 +; GFX8-NEXT: s_and_b32 s0, s2, 0xff ; GFX8-NEXT: s_lshl_b32 s1, s1, 8 ; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_bfe_u32 s1, s2, s14 +; GFX8-NEXT: s_bfe_u32 s1, s2, 0x80010 ; GFX8-NEXT: s_lshr_b32 s7, s2, 24 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: s_lshl_b32 s1, s7, 24 ; GFX8-NEXT: s_or_b32 s10, s0, s1 -; GFX8-NEXT: s_bfe_u32 s1, s3, s13 -; GFX8-NEXT: s_and_b32 s0, s3, s12 +; GFX8-NEXT: s_bfe_u32 s1, s3, 0x80008 +; GFX8-NEXT: s_and_b32 s0, s3, 0xff ; GFX8-NEXT: s_lshl_b32 s1, s1, 8 ; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_bfe_u32 s1, s3, s14 +; GFX8-NEXT: s_bfe_u32 s1, s3, 0x80010 ; GFX8-NEXT: s_lshr_b32 s11, s3, 24 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: s_lshl_b32 s1, s11, 24 ; GFX8-NEXT: v_mov_b32_e32 v1, s8 ; GFX8-NEXT: v_mov_b32_e32 v2, s9 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 -; GFX8-NEXT: v_and_b32_e32 v0, 3, v0 ; GFX8-NEXT: s_or_b32 s11, s0, s1 ; GFX8-NEXT: v_mov_b32_e32 v3, s10 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX8-NEXT: s_and_b32 s4, s4, s12 +; GFX8-NEXT: s_and_b32 s4, s4, 0xff ; GFX8-NEXT: v_mov_b32_e32 v5, s11 ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4 @@ -5269,117 +5197,115 @@ ; GFX7-LABEL: insertelement_s_v16i8_s_v: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GFX7-NEXT: s_mov_b32 s13, 0x80008 -; GFX7-NEXT: s_movk_i32 s12, 0xff -; GFX7-NEXT: s_mov_b32 s14, 0x80010 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 2, v0 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GFX7-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_bfe_u32 s9, s0, s13 +; GFX7-NEXT: s_bfe_u32 s9, s0, 0x80008 ; GFX7-NEXT: s_lshr_b32 s5, s0, 24 -; GFX7-NEXT: s_and_b32 s8, s0, s12 +; GFX7-NEXT: s_and_b32 s8, s0, 0xff ; GFX7-NEXT: s_lshl_b32 s9, s9, 8 -; GFX7-NEXT: s_bfe_u32 s0, s0, s14 +; GFX7-NEXT: s_bfe_u32 s0, s0, 0x80010 ; GFX7-NEXT: s_or_b32 s8, s8, s9 ; GFX7-NEXT: s_lshl_b32 s0, s0, 16 ; GFX7-NEXT: s_or_b32 s0, s8, s0 ; GFX7-NEXT: s_lshl_b32 s5, s5, 24 ; GFX7-NEXT: s_or_b32 s8, s0, s5 -; GFX7-NEXT: s_bfe_u32 s5, s1, s13 +; GFX7-NEXT: s_bfe_u32 s5, s1, 0x80008 ; GFX7-NEXT: s_lshr_b32 s6, s1, 24 -; GFX7-NEXT: s_and_b32 s0, s1, s12 +; GFX7-NEXT: s_and_b32 s0, s1, 0xff ; GFX7-NEXT: s_lshl_b32 s5, s5, 8 -; GFX7-NEXT: s_bfe_u32 s1, s1, s14 +; GFX7-NEXT: s_bfe_u32 s1, s1, 0x80010 ; GFX7-NEXT: s_or_b32 s0, s0, s5 ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 ; GFX7-NEXT: s_or_b32 s0, s0, s1 ; GFX7-NEXT: s_lshl_b32 s1, s6, 24 ; GFX7-NEXT: s_or_b32 s9, s0, s1 -; GFX7-NEXT: s_bfe_u32 s1, s2, s13 -; GFX7-NEXT: s_and_b32 s0, s2, s12 +; GFX7-NEXT: s_bfe_u32 s1, s2, 0x80008 +; GFX7-NEXT: s_and_b32 s0, s2, 0xff ; GFX7-NEXT: s_lshl_b32 s1, s1, 8 ; GFX7-NEXT: s_or_b32 s0, s0, s1 -; GFX7-NEXT: s_bfe_u32 s1, s2, s14 +; GFX7-NEXT: s_bfe_u32 s1, s2, 0x80010 ; GFX7-NEXT: s_lshr_b32 s7, s2, 24 ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 ; GFX7-NEXT: s_or_b32 s0, s0, s1 ; GFX7-NEXT: s_lshl_b32 s1, s7, 24 ; GFX7-NEXT: s_or_b32 s10, s0, s1 -; GFX7-NEXT: s_bfe_u32 s1, s3, s13 -; GFX7-NEXT: s_and_b32 s0, s3, s12 +; GFX7-NEXT: s_bfe_u32 s1, s3, 0x80008 +; GFX7-NEXT: s_and_b32 s0, s3, 0xff ; GFX7-NEXT: s_lshl_b32 s1, s1, 8 ; GFX7-NEXT: s_or_b32 s0, s0, s1 -; GFX7-NEXT: s_bfe_u32 s1, s3, s14 +; GFX7-NEXT: s_bfe_u32 s1, s3, 0x80010 ; GFX7-NEXT: s_lshr_b32 s11, s3, 24 ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 ; GFX7-NEXT: s_or_b32 s0, s0, s1 ; GFX7-NEXT: s_lshl_b32 s1, s11, 24 ; GFX7-NEXT: v_mov_b32_e32 v1, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s9 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 -; GFX7-NEXT: v_and_b32_e32 v0, 3, v0 ; GFX7-NEXT: s_or_b32 s11, s0, s1 ; GFX7-NEXT: v_mov_b32_e32 v3, s10 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX7-NEXT: s_and_b32 s4, s4, s12 +; GFX7-NEXT: s_and_b32 s4, s4, 0xff ; GFX7-NEXT: v_mov_b32_e32 v5, s11 ; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] ; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v2, s4, v0 -; GFX7-NEXT: v_lshl_b32_e32 v0, s12, v0 +; GFX7-NEXT: v_lshl_b32_e32 v0, 0xff, v0 ; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3] ; GFX7-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX7-NEXT: v_and_b32_e32 v0, v1, v0 -; GFX7-NEXT: v_or_b32_e32 v5, v0, v2 +; GFX7-NEXT: v_or_b32_e32 v6, v0, v2 ; GFX7-NEXT: v_mov_b32_e32 v0, s8 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 -; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[4:5] -; GFX7-NEXT: v_bfe_u32 v9, v0, 8, 8 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v5, 0xff +; GFX7-NEXT: v_bfe_u32 v10, v0, 8, 8 ; GFX7-NEXT: v_mov_b32_e32 v1, s9 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX7-NEXT: v_and_b32_e32 v8, s12, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX7-NEXT: v_and_b32_e32 v9, v0, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v10, 8, v10 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX7-NEXT: v_or_b32_e32 v8, v8, v9 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; GFX7-NEXT: v_or_b32_e32 v9, v9, v10 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_mov_b32_e32 v2, s10 ; GFX7-NEXT: v_mov_b32_e32 v3, s11 -; GFX7-NEXT: v_or_b32_e32 v0, v8, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v9, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX7-NEXT: v_bfe_u32 v8, v1, 8, 8 -; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] -; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[2:3] -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v1 +; GFX7-NEXT: v_bfe_u32 v9, v1, 8, 8 +; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1] +; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[2:3] +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX7-NEXT: v_and_b32_e32 v4, s12, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; GFX7-NEXT: v_and_b32_e32 v4, v1, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX7-NEXT: v_or_b32_e32 v4, v4, v8 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v9 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_or_b32_e32 v1, v4, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v5 -; GFX7-NEXT: v_bfe_u32 v5, v2, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v6 +; GFX7-NEXT: v_bfe_u32 v6, v2, 8, 8 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v2 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX7-NEXT: v_and_b32_e32 v4, s12, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX7-NEXT: v_and_b32_e32 v4, v2, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 ; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 8 -; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v6 -; GFX7-NEXT: v_bfe_u32 v5, v3, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v7 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 -; GFX7-NEXT: v_and_b32_e32 v4, s12, v3 +; GFX7-NEXT: v_and_b32_e32 v4, v3, v5 +; GFX7-NEXT: v_bfe_u32 v5, v3, 8, 8 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 24, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 8 ; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_or_b32_e32 v3, v4, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v8 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX7-NEXT: s_mov_b32 s2, -1 @@ -5390,54 +5316,51 @@ ; GFX10-LABEL: insertelement_s_v16i8_s_v: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GFX10-NEXT: s_mov_b32 s6, 0x80008 -; GFX10-NEXT: s_movk_i32 s5, 0xff -; GFX10-NEXT: s_mov_b32 s7, 0x80010 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 2, v0 ; GFX10-NEXT: v_and_b32_e32 v0, 3, v0 ; GFX10-NEXT: v_mov_b32_e32 v10, 8 ; GFX10-NEXT: v_mov_b32_e32 v12, 16 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX10-NEXT: v_lshlrev_b32_e64 v2, v0, s5 +; GFX10-NEXT: v_lshlrev_b32_e64 v2, v0, 0xff ; GFX10-NEXT: v_xor_b32_e32 v2, -1, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_bfe_u32 s13, s0, s6 -; GFX10-NEXT: s_lshr_b32 s8, s0, 24 -; GFX10-NEXT: s_and_b32 s12, s0, s5 -; GFX10-NEXT: s_bfe_u32 s0, s0, s7 -; GFX10-NEXT: s_lshl_b32 s13, s13, 8 -; GFX10-NEXT: s_bfe_u32 s15, s1, s6 +; GFX10-NEXT: s_bfe_u32 s9, s0, 0x80008 +; GFX10-NEXT: s_bfe_u32 s12, s1, 0x80008 +; GFX10-NEXT: s_lshr_b32 s5, s0, 24 +; GFX10-NEXT: s_and_b32 s8, s0, 0xff +; GFX10-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX10-NEXT: s_lshl_b32 s9, s9, 8 +; GFX10-NEXT: s_lshr_b32 s6, s1, 24 +; GFX10-NEXT: s_and_b32 s10, s1, 0xff +; GFX10-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX10-NEXT: s_lshl_b32 s12, s12, 8 +; GFX10-NEXT: s_bfe_u32 s14, s2, 0x80008 ; GFX10-NEXT: s_lshl_b32 s0, s0, 16 -; GFX10-NEXT: s_or_b32 s12, s12, s13 -; GFX10-NEXT: s_lshr_b32 s9, s1, 24 -; GFX10-NEXT: s_and_b32 s14, s1, s5 -; GFX10-NEXT: s_bfe_u32 s1, s1, s7 -; GFX10-NEXT: s_lshl_b32 s15, s15, 8 -; GFX10-NEXT: s_bfe_u32 s17, s2, s6 -; GFX10-NEXT: s_lshl_b32 s8, s8, 24 -; GFX10-NEXT: s_or_b32 s0, s12, s0 +; GFX10-NEXT: s_or_b32 s8, s8, s9 ; GFX10-NEXT: s_lshl_b32 s1, s1, 16 -; GFX10-NEXT: s_or_b32 s13, s14, s15 -; GFX10-NEXT: s_lshr_b32 s10, s2, 24 -; GFX10-NEXT: s_and_b32 s16, s2, s5 -; GFX10-NEXT: s_or_b32 s8, s0, s8 -; GFX10-NEXT: s_lshl_b32 s0, s17, 8 -; GFX10-NEXT: s_bfe_u32 s2, s2, s7 -; GFX10-NEXT: s_lshl_b32 s9, s9, 24 -; GFX10-NEXT: s_or_b32 s1, s13, s1 -; GFX10-NEXT: s_or_b32 s0, s16, s0 -; GFX10-NEXT: s_lshl_b32 s2, s2, 16 -; GFX10-NEXT: s_or_b32 s9, s1, s9 -; GFX10-NEXT: s_or_b32 s0, s0, s2 -; GFX10-NEXT: s_lshl_b32 s1, s10, 24 -; GFX10-NEXT: s_bfe_u32 s2, s3, s6 +; GFX10-NEXT: s_or_b32 s9, s10, s12 +; GFX10-NEXT: s_lshr_b32 s7, s2, 24 +; GFX10-NEXT: s_and_b32 s13, s2, 0xff +; GFX10-NEXT: s_bfe_u32 s2, s2, 0x80010 +; GFX10-NEXT: s_lshl_b32 s5, s5, 24 +; GFX10-NEXT: s_lshl_b32 s14, s14, 8 +; GFX10-NEXT: s_or_b32 s0, s8, s0 +; GFX10-NEXT: s_lshl_b32 s6, s6, 24 +; GFX10-NEXT: s_or_b32 s1, s9, s1 +; GFX10-NEXT: s_or_b32 s10, s13, s14 +; GFX10-NEXT: s_or_b32 s8, s0, s5 +; GFX10-NEXT: s_lshl_b32 s0, s2, 16 +; GFX10-NEXT: s_or_b32 s9, s1, s6 +; GFX10-NEXT: s_or_b32 s0, s10, s0 +; GFX10-NEXT: s_lshl_b32 s1, s7, 24 +; GFX10-NEXT: s_bfe_u32 s2, s3, 0x80008 ; GFX10-NEXT: v_mov_b32_e32 v1, s9 -; GFX10-NEXT: s_and_b32 s6, s3, s5 +; GFX10-NEXT: s_and_b32 s5, s3, 0xff ; GFX10-NEXT: s_lshl_b32 s2, s2, 8 ; GFX10-NEXT: s_or_b32 s10, s0, s1 -; GFX10-NEXT: s_bfe_u32 s1, s3, s7 -; GFX10-NEXT: s_or_b32 s0, s6, s2 +; GFX10-NEXT: s_bfe_u32 s1, s3, 0x80010 +; GFX10-NEXT: s_or_b32 s0, s5, s2 ; GFX10-NEXT: s_lshl_b32 s1, s1, 16 ; GFX10-NEXT: v_cndmask_b32_e32 v1, s8, v1, vcc_lo ; GFX10-NEXT: s_or_b32 s1, s0, s1 @@ -5448,7 +5371,7 @@ ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s10, s0 ; GFX10-NEXT: s_or_b32 s11, s1, s2 ; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v4 -; GFX10-NEXT: s_and_b32 s2, s4, s5 +; GFX10-NEXT: s_and_b32 s2, s4, 0xff ; GFX10-NEXT: v_lshlrev_b32_e64 v0, v0, s2 ; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v4 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s11, s1 @@ -5470,16 +5393,16 @@ ; GFX10-NEXT: v_lshrrev_b32_e32 v8, 24, v3 ; GFX10-NEXT: v_lshlrev_b32_sdwa v11, v10, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v10, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_and_or_b32 v6, v0, s5, v6 +; GFX10-NEXT: v_and_or_b32 v6, 0xff, v0, v6 ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX10-NEXT: v_and_or_b32 v9, v1, s5, v9 +; GFX10-NEXT: v_and_or_b32 v9, 0xff, v1, v9 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v5 -; GFX10-NEXT: v_and_or_b32 v11, v2, s5, v11 +; GFX10-NEXT: v_and_or_b32 v11, 0xff, v2, v11 ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v12, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; GFX10-NEXT: v_and_or_b32 v10, v3, s5, v10 +; GFX10-NEXT: v_and_or_b32 v10, 0xff, v3, v10 ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v12, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8 ; GFX10-NEXT: v_or3_b32 v0, v6, v0, v4 @@ -5500,54 +5423,52 @@ ; GFX9-LABEL: insertelement_s_v16i8_v_v: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GFX9-NEXT: s_mov_b32 s12, 0x80008 -; GFX9-NEXT: s_movk_i32 s10, 0xff ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 2, v1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GFX9-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX9-NEXT: s_movk_i32 s10, 0xff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_u32 s13, s0, s12 -; GFX9-NEXT: s_and_b32 s11, s0, s10 -; GFX9-NEXT: s_lshl_b32 s13, s13, 8 -; GFX9-NEXT: s_or_b32 s11, s11, s13 -; GFX9-NEXT: s_mov_b32 s13, 0x80010 +; GFX9-NEXT: s_bfe_u32 s12, s0, 0x80008 ; GFX9-NEXT: s_lshr_b32 s4, s0, 24 -; GFX9-NEXT: s_bfe_u32 s0, s0, s13 +; GFX9-NEXT: s_and_b32 s11, s0, 0xff +; GFX9-NEXT: s_lshl_b32 s12, s12, 8 +; GFX9-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX9-NEXT: s_or_b32 s11, s11, s12 ; GFX9-NEXT: s_lshl_b32 s0, s0, 16 ; GFX9-NEXT: s_or_b32 s0, s11, s0 ; GFX9-NEXT: s_lshl_b32 s4, s4, 24 -; GFX9-NEXT: s_bfe_u32 s11, s1, s12 +; GFX9-NEXT: s_bfe_u32 s11, s1, 0x80008 ; GFX9-NEXT: s_lshr_b32 s5, s1, 24 ; GFX9-NEXT: s_or_b32 s4, s0, s4 -; GFX9-NEXT: s_and_b32 s0, s1, s10 +; GFX9-NEXT: s_and_b32 s0, s1, 0xff ; GFX9-NEXT: s_lshl_b32 s11, s11, 8 -; GFX9-NEXT: s_bfe_u32 s1, s1, s13 +; GFX9-NEXT: s_bfe_u32 s1, s1, 0x80010 ; GFX9-NEXT: s_or_b32 s0, s0, s11 ; GFX9-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: s_lshl_b32 s1, s5, 24 ; GFX9-NEXT: s_or_b32 s5, s0, s1 -; GFX9-NEXT: s_bfe_u32 s1, s2, s12 -; GFX9-NEXT: s_and_b32 s0, s2, s10 +; GFX9-NEXT: s_bfe_u32 s1, s2, 0x80008 +; GFX9-NEXT: s_and_b32 s0, s2, 0xff ; GFX9-NEXT: s_lshl_b32 s1, s1, 8 ; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: s_bfe_u32 s1, s2, s13 +; GFX9-NEXT: s_bfe_u32 s1, s2, 0x80010 ; GFX9-NEXT: s_lshr_b32 s6, s2, 24 ; GFX9-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: s_lshl_b32 s1, s6, 24 ; GFX9-NEXT: s_or_b32 s6, s0, s1 -; GFX9-NEXT: s_bfe_u32 s1, s3, s12 -; GFX9-NEXT: s_and_b32 s0, s3, s10 +; GFX9-NEXT: s_bfe_u32 s1, s3, 0x80008 +; GFX9-NEXT: s_and_b32 s0, s3, 0xff ; GFX9-NEXT: s_lshl_b32 s1, s1, 8 ; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: s_bfe_u32 s1, s3, s13 +; GFX9-NEXT: s_bfe_u32 s1, s3, 0x80010 ; GFX9-NEXT: s_lshr_b32 s7, s3, 24 ; GFX9-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: s_lshl_b32 s1, s7, 24 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: v_and_b32_e32 v1, 3, v1 ; GFX9-NEXT: s_or_b32 s7, s0, s1 ; GFX9-NEXT: v_mov_b32_e32 v5, s6 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc @@ -5560,43 +5481,44 @@ ; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s10 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3] ; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1 -; GFX9-NEXT: v_and_or_b32 v5, v2, v1, v0 +; GFX9-NEXT: v_and_or_b32 v6, v2, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 ; GFX9-NEXT: s_mov_b32 s8, 8 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5] ; GFX9-NEXT: s_mov_b32 s9, 16 +; GFX9-NEXT: v_mov_b32_e32 v5, 0xff ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v8, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX9-NEXT: v_and_or_b32 v8, v0, s10, v8 +; GFX9-NEXT: v_lshlrev_b32_sdwa v9, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; GFX9-NEXT: v_and_or_b32 v9, v0, v5, v9 ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[2:3] -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v1 -; GFX9-NEXT: v_or3_b32 v0, v8, v0, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[2:3] +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 24, v1 +; GFX9-NEXT: v_or3_b32 v0, v9, v0, v4 ; GFX9-NEXT: v_lshlrev_b32_sdwa v4, s8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_and_or_b32 v4, v1, s10, v4 +; GFX9-NEXT: v_and_or_b32 v4, v1, v5, v4 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v5 -; GFX9-NEXT: v_or3_b32 v1, v4, v1, v5 -; GFX9-NEXT: v_mov_b32_e32 v4, 8 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 24, v2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_mov_b32_e32 v8, 16 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v3 -; GFX9-NEXT: v_and_or_b32 v5, v2, s10, v5 -; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_e32 v6, 24, v6 -; GFX9-NEXT: v_lshlrev_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_or3_b32 v2, v5, v2, v6 -; GFX9-NEXT: v_and_or_b32 v6, v3, s10, v4 -; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_or3_b32 v1, v4, v1, v6 +; GFX9-NEXT: v_mov_b32_e32 v4, 8 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v6, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_mov_b32_e32 v9, 16 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 24, v3 +; GFX9-NEXT: v_and_or_b32 v6, v2, v5, v6 +; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX9-NEXT: v_lshlrev_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_or3_b32 v2, v6, v2, v7 +; GFX9-NEXT: v_and_or_b32 v6, v3, v5, v4 +; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 24, v8 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: v_or3_b32 v3, v6, v3, v7 @@ -5606,54 +5528,52 @@ ; GFX8-LABEL: insertelement_s_v16i8_v_v: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GFX8-NEXT: s_mov_b32 s10, 0x80008 -; GFX8-NEXT: s_movk_i32 s8, 0xff ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 2, v1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GFX8-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX8-NEXT: s_movk_i32 s8, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_u32 s11, s0, s10 -; GFX8-NEXT: s_and_b32 s9, s0, s8 -; GFX8-NEXT: s_lshl_b32 s11, s11, 8 -; GFX8-NEXT: s_or_b32 s9, s9, s11 -; GFX8-NEXT: s_mov_b32 s11, 0x80010 +; GFX8-NEXT: s_bfe_u32 s10, s0, 0x80008 ; GFX8-NEXT: s_lshr_b32 s4, s0, 24 -; GFX8-NEXT: s_bfe_u32 s0, s0, s11 +; GFX8-NEXT: s_and_b32 s9, s0, 0xff +; GFX8-NEXT: s_lshl_b32 s10, s10, 8 +; GFX8-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX8-NEXT: s_or_b32 s9, s9, s10 ; GFX8-NEXT: s_lshl_b32 s0, s0, 16 ; GFX8-NEXT: s_or_b32 s0, s9, s0 ; GFX8-NEXT: s_lshl_b32 s4, s4, 24 -; GFX8-NEXT: s_bfe_u32 s9, s1, s10 +; GFX8-NEXT: s_bfe_u32 s9, s1, 0x80008 ; GFX8-NEXT: s_lshr_b32 s5, s1, 24 ; GFX8-NEXT: s_or_b32 s4, s0, s4 -; GFX8-NEXT: s_and_b32 s0, s1, s8 +; GFX8-NEXT: s_and_b32 s0, s1, 0xff ; GFX8-NEXT: s_lshl_b32 s9, s9, 8 -; GFX8-NEXT: s_bfe_u32 s1, s1, s11 +; GFX8-NEXT: s_bfe_u32 s1, s1, 0x80010 ; GFX8-NEXT: s_or_b32 s0, s0, s9 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: s_lshl_b32 s1, s5, 24 ; GFX8-NEXT: s_or_b32 s5, s0, s1 -; GFX8-NEXT: s_bfe_u32 s1, s2, s10 -; GFX8-NEXT: s_and_b32 s0, s2, s8 +; GFX8-NEXT: s_bfe_u32 s1, s2, 0x80008 +; GFX8-NEXT: s_and_b32 s0, s2, 0xff ; GFX8-NEXT: s_lshl_b32 s1, s1, 8 ; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_bfe_u32 s1, s2, s11 +; GFX8-NEXT: s_bfe_u32 s1, s2, 0x80010 ; GFX8-NEXT: s_lshr_b32 s6, s2, 24 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: s_lshl_b32 s1, s6, 24 ; GFX8-NEXT: s_or_b32 s6, s0, s1 -; GFX8-NEXT: s_bfe_u32 s1, s3, s10 -; GFX8-NEXT: s_and_b32 s0, s3, s8 +; GFX8-NEXT: s_bfe_u32 s1, s3, 0x80008 +; GFX8-NEXT: s_and_b32 s0, s3, 0xff ; GFX8-NEXT: s_lshl_b32 s1, s1, 8 ; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_bfe_u32 s1, s3, s11 +; GFX8-NEXT: s_bfe_u32 s1, s3, 0x80010 ; GFX8-NEXT: s_lshr_b32 s7, s3, 24 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: s_lshl_b32 s1, s7, 24 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: v_and_b32_e32 v1, 3, v1 ; GFX8-NEXT: s_or_b32 s7, s0, s1 ; GFX8-NEXT: v_mov_b32_e32 v5, s6 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc @@ -5717,54 +5637,52 @@ ; GFX7-LABEL: insertelement_s_v16i8_v_v: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GFX7-NEXT: s_mov_b32 s10, 0x80008 -; GFX7-NEXT: s_movk_i32 s8, 0xff ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 2, v1 +; GFX7-NEXT: s_movk_i32 s8, 0xff ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GFX7-NEXT: v_and_b32_e32 v1, 3, v1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_bfe_u32 s11, s0, s10 -; GFX7-NEXT: s_and_b32 s9, s0, s8 -; GFX7-NEXT: s_lshl_b32 s11, s11, 8 -; GFX7-NEXT: s_or_b32 s9, s9, s11 -; GFX7-NEXT: s_mov_b32 s11, 0x80010 +; GFX7-NEXT: s_bfe_u32 s10, s0, 0x80008 ; GFX7-NEXT: s_lshr_b32 s4, s0, 24 -; GFX7-NEXT: s_bfe_u32 s0, s0, s11 +; GFX7-NEXT: s_and_b32 s9, s0, 0xff +; GFX7-NEXT: s_lshl_b32 s10, s10, 8 +; GFX7-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX7-NEXT: s_or_b32 s9, s9, s10 ; GFX7-NEXT: s_lshl_b32 s0, s0, 16 ; GFX7-NEXT: s_or_b32 s0, s9, s0 ; GFX7-NEXT: s_lshl_b32 s4, s4, 24 -; GFX7-NEXT: s_bfe_u32 s9, s1, s10 +; GFX7-NEXT: s_bfe_u32 s9, s1, 0x80008 ; GFX7-NEXT: s_lshr_b32 s5, s1, 24 ; GFX7-NEXT: s_or_b32 s4, s0, s4 -; GFX7-NEXT: s_and_b32 s0, s1, s8 +; GFX7-NEXT: s_and_b32 s0, s1, 0xff ; GFX7-NEXT: s_lshl_b32 s9, s9, 8 -; GFX7-NEXT: s_bfe_u32 s1, s1, s11 +; GFX7-NEXT: s_bfe_u32 s1, s1, 0x80010 ; GFX7-NEXT: s_or_b32 s0, s0, s9 ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 ; GFX7-NEXT: s_or_b32 s0, s0, s1 ; GFX7-NEXT: s_lshl_b32 s1, s5, 24 ; GFX7-NEXT: s_or_b32 s5, s0, s1 -; GFX7-NEXT: s_bfe_u32 s1, s2, s10 -; GFX7-NEXT: s_and_b32 s0, s2, s8 +; GFX7-NEXT: s_bfe_u32 s1, s2, 0x80008 +; GFX7-NEXT: s_and_b32 s0, s2, 0xff ; GFX7-NEXT: s_lshl_b32 s1, s1, 8 ; GFX7-NEXT: s_or_b32 s0, s0, s1 -; GFX7-NEXT: s_bfe_u32 s1, s2, s11 +; GFX7-NEXT: s_bfe_u32 s1, s2, 0x80010 ; GFX7-NEXT: s_lshr_b32 s6, s2, 24 ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 ; GFX7-NEXT: s_or_b32 s0, s0, s1 ; GFX7-NEXT: s_lshl_b32 s1, s6, 24 ; GFX7-NEXT: s_or_b32 s6, s0, s1 -; GFX7-NEXT: s_bfe_u32 s1, s3, s10 -; GFX7-NEXT: s_and_b32 s0, s3, s8 +; GFX7-NEXT: s_bfe_u32 s1, s3, 0x80008 +; GFX7-NEXT: s_and_b32 s0, s3, 0xff ; GFX7-NEXT: s_lshl_b32 s1, s1, 8 ; GFX7-NEXT: s_or_b32 s0, s0, s1 -; GFX7-NEXT: s_bfe_u32 s1, s3, s11 +; GFX7-NEXT: s_bfe_u32 s1, s3, 0x80010 ; GFX7-NEXT: s_lshr_b32 s7, s3, 24 ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 ; GFX7-NEXT: s_or_b32 s0, s0, s1 ; GFX7-NEXT: s_lshl_b32 s1, s7, 24 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 -; GFX7-NEXT: v_and_b32_e32 v1, 3, v1 ; GFX7-NEXT: s_or_b32 s7, s0, s1 ; GFX7-NEXT: v_mov_b32_e32 v5, s6 ; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc @@ -5779,55 +5697,56 @@ ; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3] ; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX7-NEXT: v_and_b32_e32 v1, v2, v1 -; GFX7-NEXT: v_or_b32_e32 v5, v1, v0 +; GFX7-NEXT: v_or_b32_e32 v6, v1, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-NEXT: v_mov_b32_e32 v3, s7 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 -; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[4:5] -; GFX7-NEXT: v_bfe_u32 v9, v0, 8, 8 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v5, 0xff +; GFX7-NEXT: v_bfe_u32 v10, v0, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX7-NEXT: v_and_b32_e32 v8, s8, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX7-NEXT: v_and_b32_e32 v9, v0, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v10, 8, v10 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX7-NEXT: v_or_b32_e32 v8, v8, v9 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; GFX7-NEXT: v_or_b32_e32 v9, v9, v10 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v8, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v9, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX7-NEXT: v_bfe_u32 v8, v1, 8, 8 -; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] -; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[2:3] -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v1 +; GFX7-NEXT: v_bfe_u32 v9, v1, 8, 8 +; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1] +; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[2:3] +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX7-NEXT: v_and_b32_e32 v4, s8, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; GFX7-NEXT: v_and_b32_e32 v4, v1, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX7-NEXT: v_or_b32_e32 v4, v4, v8 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v9 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_or_b32_e32 v1, v4, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v5 -; GFX7-NEXT: v_bfe_u32 v5, v2, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v6 +; GFX7-NEXT: v_bfe_u32 v6, v2, 8, 8 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v2 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX7-NEXT: v_and_b32_e32 v4, s8, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX7-NEXT: v_and_b32_e32 v4, v2, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 ; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 8 -; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v6 -; GFX7-NEXT: v_bfe_u32 v5, v3, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v7 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 -; GFX7-NEXT: v_and_b32_e32 v4, s8, v3 +; GFX7-NEXT: v_and_b32_e32 v4, v3, v5 +; GFX7-NEXT: v_bfe_u32 v5, v3, 8, 8 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 24, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 8 ; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_or_b32_e32 v3, v4, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v8 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX7-NEXT: s_mov_b32 s2, -1 @@ -5838,65 +5757,62 @@ ; GFX10-LABEL: insertelement_s_v16i8_v_v: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GFX10-NEXT: s_mov_b32 s7, 0x80008 -; GFX10-NEXT: s_movk_i32 s8, 0xff -; GFX10-NEXT: s_mov_b32 s9, 0x80010 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 2, v1 ; GFX10-NEXT: v_and_b32_e32 v1, 3, v1 ; GFX10-NEXT: v_mov_b32_e32 v10, 8 ; GFX10-NEXT: v_mov_b32_e32 v12, 16 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v1 -; GFX10-NEXT: v_lshlrev_b32_e64 v3, v1, s8 +; GFX10-NEXT: v_lshlrev_b32_e64 v3, v1, 0xff ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: v_xor_b32_e32 v1, -1, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_bfe_u32 s12, s0, s7 +; GFX10-NEXT: s_bfe_u32 s9, s0, 0x80008 ; GFX10-NEXT: s_lshr_b32 s4, s0, 24 -; GFX10-NEXT: s_and_b32 s11, s0, s8 -; GFX10-NEXT: s_bfe_u32 s0, s0, s9 -; GFX10-NEXT: s_lshl_b32 s12, s12, 8 -; GFX10-NEXT: s_lshl_b32 s0, s0, 16 -; GFX10-NEXT: s_or_b32 s11, s11, s12 -; GFX10-NEXT: s_bfe_u32 s16, s2, s7 -; GFX10-NEXT: s_lshl_b32 s4, s4, 24 -; GFX10-NEXT: s_or_b32 s0, s11, s0 -; GFX10-NEXT: s_bfe_u32 s14, s1, s7 -; GFX10-NEXT: s_and_b32 s15, s2, s8 -; GFX10-NEXT: s_lshl_b32 s16, s16, 8 -; GFX10-NEXT: s_or_b32 s4, s0, s4 -; GFX10-NEXT: s_bfe_u32 s0, s2, s9 +; GFX10-NEXT: s_and_b32 s8, s0, 0xff +; GFX10-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX10-NEXT: s_bfe_u32 s11, s1, 0x80008 +; GFX10-NEXT: s_lshl_b32 s9, s9, 8 ; GFX10-NEXT: s_lshr_b32 s5, s1, 24 -; GFX10-NEXT: s_and_b32 s13, s1, s8 -; GFX10-NEXT: s_bfe_u32 s1, s1, s9 -; GFX10-NEXT: s_lshl_b32 s14, s14, 8 -; GFX10-NEXT: s_lshr_b32 s6, s2, 24 -; GFX10-NEXT: s_or_b32 s2, s15, s16 +; GFX10-NEXT: s_and_b32 s10, s1, 0xff +; GFX10-NEXT: s_bfe_u32 s1, s1, 0x80010 ; GFX10-NEXT: s_lshl_b32 s0, s0, 16 +; GFX10-NEXT: s_lshl_b32 s11, s11, 8 +; GFX10-NEXT: s_or_b32 s8, s8, s9 +; GFX10-NEXT: s_lshl_b32 s4, s4, 24 ; GFX10-NEXT: s_lshl_b32 s1, s1, 16 -; GFX10-NEXT: s_or_b32 s12, s13, s14 -; GFX10-NEXT: s_or_b32 s0, s2, s0 -; GFX10-NEXT: s_lshl_b32 s2, s6, 24 -; GFX10-NEXT: s_or_b32 s1, s12, s1 -; GFX10-NEXT: s_lshl_b32 s5, s5, 24 -; GFX10-NEXT: s_or_b32 s6, s0, s2 -; GFX10-NEXT: s_bfe_u32 s0, s3, s7 -; GFX10-NEXT: s_or_b32 s5, s1, s5 -; GFX10-NEXT: s_and_b32 s1, s3, s8 +; GFX10-NEXT: s_or_b32 s9, s10, s11 +; GFX10-NEXT: s_or_b32 s0, s8, s0 +; GFX10-NEXT: s_or_b32 s1, s9, s1 +; GFX10-NEXT: s_or_b32 s4, s0, s4 +; GFX10-NEXT: s_lshl_b32 s0, s5, 24 +; GFX10-NEXT: s_bfe_u32 s13, s2, 0x80008 +; GFX10-NEXT: s_or_b32 s5, s1, s0 +; GFX10-NEXT: s_bfe_u32 s0, s3, 0x80008 +; GFX10-NEXT: s_and_b32 s1, s3, 0xff ; GFX10-NEXT: s_lshl_b32 s0, s0, 8 ; GFX10-NEXT: v_mov_b32_e32 v2, s5 ; GFX10-NEXT: s_or_b32 s0, s1, s0 -; GFX10-NEXT: s_bfe_u32 s1, s3, s9 -; GFX10-NEXT: s_lshr_b32 s10, s3, 24 +; GFX10-NEXT: s_bfe_u32 s1, s3, 0x80010 +; GFX10-NEXT: s_lshr_b32 s6, s2, 24 +; GFX10-NEXT: s_and_b32 s12, s2, 0xff +; GFX10-NEXT: s_bfe_u32 s2, s2, 0x80010 +; GFX10-NEXT: s_lshl_b32 s13, s13, 8 ; GFX10-NEXT: s_lshl_b32 s1, s1, 16 +; GFX10-NEXT: s_lshl_b32 s2, s2, 16 +; GFX10-NEXT: s_or_b32 s10, s12, s13 ; GFX10-NEXT: v_cndmask_b32_e32 v2, s4, v2, vcc_lo ; GFX10-NEXT: s_or_b32 s1, s0, s1 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v4 -; GFX10-NEXT: s_lshl_b32 s2, s10, 24 +; GFX10-NEXT: s_lshr_b32 s7, s3, 24 +; GFX10-NEXT: s_or_b32 s2, s10, s2 +; GFX10-NEXT: s_lshl_b32 s6, s6, 24 ; GFX10-NEXT: s_mov_b32 s3, 8 +; GFX10-NEXT: s_or_b32 s6, s2, s6 +; GFX10-NEXT: s_lshl_b32 s2, s7, 24 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s6, s0 ; GFX10-NEXT: s_or_b32 s7, s1, s2 ; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v4 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s6, s0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v4 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s7, s1 ; GFX10-NEXT: v_and_or_b32 v5, v2, v1, v0 @@ -5917,16 +5833,16 @@ ; GFX10-NEXT: v_lshrrev_b32_e32 v8, 24, v3 ; GFX10-NEXT: v_lshlrev_b32_sdwa v11, v10, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v10, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_and_or_b32 v6, v0, s8, v6 +; GFX10-NEXT: v_and_or_b32 v6, 0xff, v0, v6 ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX10-NEXT: v_and_or_b32 v9, v1, s8, v9 +; GFX10-NEXT: v_and_or_b32 v9, 0xff, v1, v9 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v5 -; GFX10-NEXT: v_and_or_b32 v11, v2, s8, v11 +; GFX10-NEXT: v_and_or_b32 v11, 0xff, v2, v11 ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v12, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; GFX10-NEXT: v_and_or_b32 v10, v3, s8, v10 +; GFX10-NEXT: v_and_or_b32 v10, 0xff, v3, v10 ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v12, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8 ; GFX10-NEXT: v_or3_b32 v0, v6, v0, v4 @@ -5948,171 +5864,172 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dwordx4 v[3:6], v[0:1], off ; GFX9-NEXT: s_mov_b32 s0, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, 8 ; GFX9-NEXT: s_mov_b32 s1, 16 -; GFX9-NEXT: s_movk_i32 s6, 0xff -; GFX9-NEXT: v_mov_b32_e32 v0, 8 -; GFX9-NEXT: v_mov_b32_e32 v1, 16 -; GFX9-NEXT: v_mov_b32_e32 v7, 0 -; GFX9-NEXT: v_mov_b32_e32 v8, 0 +; GFX9-NEXT: s_movk_i32 s3, 0xff +; GFX9-NEXT: v_mov_b32_e32 v0, 0xff +; GFX9-NEXT: v_mov_b32_e32 v7, 16 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 24, v4 -; GFX9-NEXT: v_lshlrev_b32_sdwa v13, s0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v15, s0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v5 -; GFX9-NEXT: v_lshlrev_b32_sdwa v14, s1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v16, s1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v17, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_and_or_b32 v3, v3, s6, v13 -; GFX9-NEXT: v_and_or_b32 v4, v4, s6, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 2, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 24, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v4 +; GFX9-NEXT: v_lshlrev_b32_sdwa v12, s0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v14, s0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v16, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 24, v5 +; GFX9-NEXT: v_lshlrev_b32_sdwa v13, s1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v15, s1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v3, v3, s3, v12 +; GFX9-NEXT: v_and_or_b32 v12, v4, s3, v14 +; GFX9-NEXT: v_and_or_b32 v14, v5, v0, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 2, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 3, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 24, v8 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 24, v9 -; GFX9-NEXT: v_lshlrev_b32_e32 v10, 24, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 24, v6 -; GFX9-NEXT: v_lshlrev_b32_sdwa v18, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v19, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_and_or_b32 v5, v5, s6, v17 -; GFX9-NEXT: s_and_b32 s0, s2, s6 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v6 +; GFX9-NEXT: v_lshlrev_b32_sdwa v17, v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v18, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: s_and_b32 s0, s2, 0xff ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; GFX9-NEXT: v_or3_b32 v3, v3, v13, v8 +; GFX9-NEXT: v_or3_b32 v8, v12, v15, v9 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v16 +; GFX9-NEXT: v_lshlrev_b32_sdwa v19, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v6, v6, v0, v18 +; GFX9-NEXT: v_lshlrev_b32_e64 v18, v2, s0 ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 24, v11 -; GFX9-NEXT: v_or3_b32 v3, v3, v14, v9 -; GFX9-NEXT: v_or3_b32 v4, v4, v16, v10 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v15 -; GFX9-NEXT: v_and_or_b32 v13, v6, s6, v19 -; GFX9-NEXT: v_lshlrev_b32_sdwa v6, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_e64 v17, v2, s0 -; GFX9-NEXT: v_lshlrev_b32_e32 v12, 24, v12 -; GFX9-NEXT: v_or3_b32 v5, v5, v18, v11 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v3, v4, vcc -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v15 -; GFX9-NEXT: v_lshlrev_b32_e64 v2, v2, s6 -; GFX9-NEXT: v_or3_b32 v6, v13, v6, v12 -; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v5, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v15 +; GFX9-NEXT: v_or3_b32 v9, v14, v17, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v3, v8, vcc +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v16 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, v2, v0 +; GFX9-NEXT: v_or3_b32 v6, v6, v19, v11 +; GFX9-NEXT: v_cndmask_b32_e64 v10, v10, v9, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v16 ; GFX9-NEXT: v_xor_b32_e32 v2, -1, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v6, s[2:3] -; GFX9-NEXT: v_and_or_b32 v2, v9, v2, v17 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 +; GFX9-NEXT: v_cndmask_b32_e64 v10, v10, v6, s[2:3] +; GFX9-NEXT: v_and_or_b32 v2, v10, v2, v18 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v16 ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v2, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v2, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v8, v8, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v2, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[2:3] ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 24, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 24, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v12, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v14, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v16, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v13, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v15, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v17, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v18, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v1, v3, s6, v12 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 24, v6 -; GFX9-NEXT: v_and_or_b32 v4, v4, s6, v14 -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 24, v9 -; GFX9-NEXT: v_and_or_b32 v5, v5, s6, v16 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 24, v10 -; GFX9-NEXT: v_and_or_b32 v10, v2, s6, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 24, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 24, v2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v13, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v15, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v17, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v14, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v16, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v18, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v7, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v3, v3, v0, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX9-NEXT: v_and_or_b32 v8, v8, v0, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; GFX9-NEXT: v_and_or_b32 v9, v9, v0, v17 ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 24, v11 -; GFX9-NEXT: v_or3_b32 v0, v1, v13, v3 -; GFX9-NEXT: v_or3_b32 v1, v4, v15, v6 -; GFX9-NEXT: v_or3_b32 v2, v5, v17, v9 -; GFX9-NEXT: v_or3_b32 v3, v10, v18, v11 -; GFX9-NEXT: global_store_dwordx4 v[7:8], v[0:3], off +; GFX9-NEXT: v_and_or_b32 v13, v2, v0, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; GFX9-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-NEXT: v_or3_b32 v0, v3, v14, v6 +; GFX9-NEXT: v_or3_b32 v1, v8, v16, v10 +; GFX9-NEXT: v_or3_b32 v2, v9, v18, v11 +; GFX9-NEXT: v_or3_b32 v3, v13, v7, v12 +; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_v_v16i8_s_v: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dwordx4 v[3:6], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, 8 -; GFX8-NEXT: v_mov_b32_e32 v1, 16 -; GFX8-NEXT: v_mov_b32_e32 v9, 8 -; GFX8-NEXT: v_mov_b32_e32 v10, 16 -; GFX8-NEXT: s_movk_i32 s0, 0xff -; GFX8-NEXT: s_and_b32 s1, s2, s0 -; GFX8-NEXT: v_mov_b32_e32 v7, 0 -; GFX8-NEXT: v_mov_b32_e32 v8, 0 +; GFX8-NEXT: v_mov_b32_e32 v1, 8 +; GFX8-NEXT: v_mov_b32_e32 v8, 8 +; GFX8-NEXT: v_mov_b32_e32 v7, 16 +; GFX8-NEXT: v_mov_b32_e32 v0, 0xff +; GFX8-NEXT: v_mov_b32_e32 v9, 16 +; GFX8-NEXT: s_and_b32 s0, s2, 0xff ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_sdwa v15, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v12, 24, v4 -; GFX8-NEXT: v_lshlrev_b32_sdwa v16, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v17, v9, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_or_b32_sdwa v3, v3, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshrrev_b32_e32 v13, 24, v5 -; GFX8-NEXT: v_lshlrev_b32_sdwa v18, v10, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v19, v9, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_or_b32_sdwa v4, v5, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshrrev_b32_e32 v15, 2, v2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v14, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v16, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 24, v3 +; GFX8-NEXT: v_lshlrev_b32_sdwa v15, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_or_b32_sdwa v3, v3, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v14, v5, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v16, 2, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 3, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v11, 24, v11 -; GFX8-NEXT: v_lshlrev_b32_e32 v12, 24, v12 -; GFX8-NEXT: v_or_b32_e32 v3, v3, v16 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v14, 24, v6 -; GFX8-NEXT: v_or_b32_sdwa v5, v6, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v10, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v4 +; GFX8-NEXT: v_lshlrev_b32_sdwa v7, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v18, v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v13, 24, v13 -; GFX8-NEXT: v_or_b32_e32 v1, v4, v18 -; GFX8-NEXT: v_or_b32_e32 v3, v3, v11 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v12 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v15 -; GFX8-NEXT: v_lshlrev_b32_e64 v17, v2, s1 -; GFX8-NEXT: v_lshlrev_b32_e64 v2, v2, s0 -; GFX8-NEXT: v_lshlrev_b32_e32 v14, 24, v14 -; GFX8-NEXT: v_or_b32_e32 v4, v5, v6 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v13 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v0, vcc -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v15 -; GFX8-NEXT: v_or_b32_e32 v4, v4, v14 -; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, v1, s[0:1] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v15 -; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, v4, s[2:3] -; GFX8-NEXT: v_and_b32_e32 v2, v5, v2 -; GFX8-NEXT: v_or_b32_e32 v2, v2, v17 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 -; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v2, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v2, v4, v2, s[2:3] -; GFX8-NEXT: v_lshlrev_b32_sdwa v12, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v14, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v16, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v9, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 24, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v13, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v15, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v17, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v10, v10, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_sdwa v3, v3, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX8-NEXT: v_lshrrev_b32_e32 v12, 24, v5 +; GFX8-NEXT: v_lshrrev_b32_e32 v13, 24, v6 +; GFX8-NEXT: v_lshlrev_b32_sdwa v17, v9, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v19, v9, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_or_b32_sdwa v6, v6, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e64 v18, v2, s0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, v2, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 24, v10 +; GFX8-NEXT: v_lshlrev_b32_e32 v10, 24, v11 +; GFX8-NEXT: v_or_b32_e32 v3, v3, v15 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v7 +; GFX8-NEXT: v_lshlrev_b32_e32 v11, 24, v12 +; GFX8-NEXT: v_or_b32_e32 v7, v14, v17 +; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v10 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v16 +; GFX8-NEXT: v_lshlrev_b32_e32 v12, 24, v13 +; GFX8-NEXT: v_or_b32_e32 v6, v6, v19 +; GFX8-NEXT: v_or_b32_e32 v3, v7, v11 +; GFX8-NEXT: v_cndmask_b32_e32 v7, v2, v1, vcc +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v16 +; GFX8-NEXT: v_or_b32_e32 v6, v6, v12 +; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v3, s[0:1] +; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v16 +; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v6, s[2:3] +; GFX8-NEXT: v_and_b32_e32 v0, v7, v0 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v18 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v16 +; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v0, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v0, s[2:3] +; GFX8-NEXT: v_lshlrev_b32_sdwa v12, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v14, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v16, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 24, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v13, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v15, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v17, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v9, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_or_b32_sdwa v2, v2, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v3, v3, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v4, 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 24, v6 -; GFX8-NEXT: v_lshlrev_b32_e32 v9, 24, v11 -; GFX8-NEXT: v_or_b32_e32 v3, v3, v13 -; GFX8-NEXT: v_or_b32_e32 v11, v0, v15 -; GFX8-NEXT: v_or_b32_e32 v12, v1, v17 -; GFX8-NEXT: v_or_b32_e32 v10, v2, v10 -; GFX8-NEXT: v_or_b32_e32 v0, v3, v4 -; GFX8-NEXT: v_or_b32_e32 v1, v11, v5 -; GFX8-NEXT: v_or_b32_e32 v2, v12, v6 -; GFX8-NEXT: v_or_b32_e32 v3, v10, v9 -; GFX8-NEXT: flat_store_dwordx4 v[7:8], v[0:3] +; GFX8-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX8-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 24, v11 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v13 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v15 +; GFX8-NEXT: v_or_b32_e32 v3, v3, v17 +; GFX8-NEXT: v_or_b32_e32 v9, v0, v9 +; GFX8-NEXT: v_mov_b32_e32 v5, 0 +; GFX8-NEXT: v_or_b32_e32 v0, v2, v6 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v7 +; GFX8-NEXT: v_or_b32_e32 v2, v3, v10 +; GFX8-NEXT: v_or_b32_e32 v3, v9, v8 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_endpgm ; ; GFX7-LABEL: insertelement_v_v16i8_s_v: @@ -6121,108 +6038,109 @@ ; GFX7-NEXT: s_mov_b32 s11, 0xf000 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_movk_i32 s6, 0xff -; GFX7-NEXT: v_lshrrev_b32_e32 v17, 2, v2 +; GFX7-NEXT: s_movk_i32 s0, 0xff +; GFX7-NEXT: v_mov_b32_e32 v7, 0xff +; GFX7-NEXT: v_lshrrev_b32_e32 v18, 2, v2 ; GFX7-NEXT: v_and_b32_e32 v2, 3, v2 -; GFX7-NEXT: s_and_b32 s0, s2, s6 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v17 -; GFX7-NEXT: v_lshl_b32_e32 v18, s0, v2 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v17 -; GFX7-NEXT: v_lshl_b32_e32 v2, s6, v2 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v17 -; GFX7-NEXT: v_xor_b32_e32 v2, -1, v2 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v17 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v18 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v18 ; GFX7-NEXT: s_mov_b32 s10, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_bfe_u32 v10, v3, 8, 8 -; GFX7-NEXT: v_bfe_u32 v12, v4, 8, 8 +; GFX7-NEXT: v_bfe_u32 v11, v3, 8, 8 +; GFX7-NEXT: v_bfe_u32 v13, v4, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v4 -; GFX7-NEXT: v_and_b32_e32 v9, s6, v3 +; GFX7-NEXT: v_and_b32_e32 v10, s0, v3 ; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 8 -; GFX7-NEXT: v_and_b32_e32 v11, s6, v4 +; GFX7-NEXT: v_and_b32_e32 v12, s0, v4 ; GFX7-NEXT: v_bfe_u32 v4, v4, 16, 8 -; GFX7-NEXT: v_bfe_u32 v14, v5, 8, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v10, 8, v10 -; GFX7-NEXT: v_lshlrev_b32_e32 v12, 8, v12 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v5 -; GFX7-NEXT: v_and_b32_e32 v13, s6, v5 +; GFX7-NEXT: v_bfe_u32 v15, v5, 8, 8 +; GFX7-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX7-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 24, v5 +; GFX7-NEXT: v_and_b32_e32 v14, v5, v7 ; GFX7-NEXT: v_bfe_u32 v5, v5, 16, 8 -; GFX7-NEXT: v_bfe_u32 v16, v6, 8, 8 +; GFX7-NEXT: v_bfe_u32 v17, v6, 8, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v14, 8, v14 -; GFX7-NEXT: v_or_b32_e32 v9, v9, v10 -; GFX7-NEXT: v_or_b32_e32 v10, v11, v12 -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 24, v6 -; GFX7-NEXT: v_and_b32_e32 v15, s6, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX7-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX7-NEXT: v_or_b32_e32 v11, v12, v13 +; GFX7-NEXT: v_lshrrev_b32_e32 v9, 24, v6 +; GFX7-NEXT: v_and_b32_e32 v16, v6, v7 ; GFX7-NEXT: v_bfe_u32 v6, v6, 16, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v16, 8, v16 -; GFX7-NEXT: v_or_b32_e32 v11, v13, v14 -; GFX7-NEXT: v_or_b32_e32 v3, v9, v3 -; GFX7-NEXT: v_or_b32_e32 v4, v10, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; GFX7-NEXT: v_or_b32_e32 v12, v14, v15 +; GFX7-NEXT: v_or_b32_e32 v3, v10, v3 +; GFX7-NEXT: v_or_b32_e32 v4, v11, v4 +; GFX7-NEXT: s_and_b32 s0, s2, 0xff +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 24, v8 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v12, v15, v16 -; GFX7-NEXT: v_or_b32_e32 v5, v11, v5 +; GFX7-NEXT: v_or_b32_e32 v13, v16, v17 +; GFX7-NEXT: v_or_b32_e32 v5, v12, v5 ; GFX7-NEXT: v_or_b32_e32 v0, v3, v0 ; GFX7-NEXT: v_or_b32_e32 v1, v4, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v8, 24, v8 -; GFX7-NEXT: v_or_b32_e32 v6, v12, v6 -; GFX7-NEXT: v_or_b32_e32 v3, v5, v7 +; GFX7-NEXT: v_lshl_b32_e32 v19, s0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; GFX7-NEXT: v_or_b32_e32 v6, v13, v6 +; GFX7-NEXT: v_or_b32_e32 v3, v5, v8 ; GFX7-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc -; GFX7-NEXT: v_or_b32_e32 v4, v6, v8 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v18 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, v2, v7 +; GFX7-NEXT: v_or_b32_e32 v4, v6, v9 ; GFX7-NEXT: v_cndmask_b32_e64 v5, v5, v3, s[0:1] +; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v18 +; GFX7-NEXT: v_xor_b32_e32 v2, -1, v2 ; GFX7-NEXT: v_cndmask_b32_e64 v5, v5, v4, s[2:3] ; GFX7-NEXT: v_and_b32_e32 v2, v5, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v18 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v19 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[4:5] ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX7-NEXT: v_bfe_u32 v10, v0, 8, 8 ; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v2, s[0:1] -; GFX7-NEXT: v_bfe_u32 v9, v0, 8, 8 -; GFX7-NEXT: v_bfe_u32 v11, v1, 8, 8 ; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v2, s[2:3] ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v1 -; GFX7-NEXT: v_and_b32_e32 v8, s6, v0 +; GFX7-NEXT: v_and_b32_e32 v9, v0, v7 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_and_b32_e32 v10, s6, v1 +; GFX7-NEXT: v_bfe_u32 v12, v1, 8, 8 +; GFX7-NEXT: v_lshlrev_b32_e32 v10, 8, v10 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v1 +; GFX7-NEXT: v_and_b32_e32 v11, v1, v7 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX7-NEXT: v_bfe_u32 v13, v3, 8, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; GFX7-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX7-NEXT: v_bfe_u32 v14, v3, 8, 8 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v12, 8, v12 +; GFX7-NEXT: v_or_b32_e32 v9, v9, v10 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v3 -; GFX7-NEXT: v_and_b32_e32 v12, s6, v3 +; GFX7-NEXT: v_and_b32_e32 v13, v3, v7 ; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v13, 8, v13 -; GFX7-NEXT: v_or_b32_e32 v8, v8, v9 -; GFX7-NEXT: v_or_b32_e32 v9, v10, v11 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; GFX7-NEXT: v_or_b32_e32 v10, v11, v12 +; GFX7-NEXT: v_or_b32_e32 v0, v9, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 24, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_or_b32_e32 v10, v12, v13 -; GFX7-NEXT: v_or_b32_e32 v0, v8, v0 -; GFX7-NEXT: v_or_b32_e32 v1, v9, v1 +; GFX7-NEXT: v_or_b32_e32 v11, v13, v14 +; GFX7-NEXT: v_or_b32_e32 v1, v10, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v5 -; GFX7-NEXT: v_or_b32_e32 v2, v10, v3 +; GFX7-NEXT: v_or_b32_e32 v2, v11, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v6 ; GFX7-NEXT: v_bfe_u32 v5, v4, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 24, v4 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX7-NEXT: v_and_b32_e32 v3, s6, v4 +; GFX7-NEXT: v_and_b32_e32 v3, v4, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX7-NEXT: v_bfe_u32 v4, v4, 16, 8 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v8 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; GFX7-NEXT: s_endpgm @@ -6233,7 +6151,6 @@ ; GFX10-NEXT: s_mov_b32 s0, 8 ; GFX10-NEXT: v_mov_b32_e32 v1, 8 ; GFX10-NEXT: s_mov_b32 s1, 16 -; GFX10-NEXT: s_movk_i32 s3, 0xff ; GFX10-NEXT: v_and_b32_e32 v0, 3, v2 ; GFX10-NEXT: v_mov_b32_e32 v7, 16 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 2, v2 @@ -6248,25 +6165,25 @@ ; GFX10-NEXT: v_lshlrev_b32_sdwa v13, s1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v15, s1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v16, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_and_or_b32 v3, v3, s3, v12 +; GFX10-NEXT: v_and_or_b32 v3, v3, 0xff, v12 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8 -; GFX10-NEXT: v_and_or_b32 v4, v4, s3, v14 +; GFX10-NEXT: v_and_or_b32 v4, v4, 0xff, v14 ; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v9 ; GFX10-NEXT: v_lshrrev_b32_e32 v11, 24, v6 ; GFX10-NEXT: v_lshlrev_b32_sdwa v17, v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v18, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_and_or_b32 v5, v5, s3, v16 +; GFX10-NEXT: v_and_or_b32 v5, 0xff, v5, v16 ; GFX10-NEXT: v_lshlrev_b32_e32 v10, 24, v10 ; GFX10-NEXT: v_or3_b32 v3, v3, v13, v8 ; GFX10-NEXT: v_or3_b32 v4, v4, v15, v9 ; GFX10-NEXT: v_lshlrev_b32_sdwa v12, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v6, v6, s3, v18 +; GFX10-NEXT: v_and_or_b32 v6, 0xff, v6, v18 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v11 ; GFX10-NEXT: v_or3_b32 v5, v5, v17, v10 ; GFX10-NEXT: v_cndmask_b32_e32 v9, v3, v4, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v2 -; GFX10-NEXT: s_and_b32 s1, s2, s3 -; GFX10-NEXT: v_lshlrev_b32_e64 v10, v0, s3 +; GFX10-NEXT: s_and_b32 s1, s2, 0xff +; GFX10-NEXT: v_lshlrev_b32_e64 v10, v0, 0xff ; GFX10-NEXT: v_or3_b32 v6, v6, v12, v8 ; GFX10-NEXT: v_lshlrev_b32_e64 v0, v0, s1 ; GFX10-NEXT: v_cndmask_b32_e64 v8, v9, v5, s0 @@ -6291,13 +6208,13 @@ ; GFX10-NEXT: v_lshlrev_b32_sdwa v13, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v15, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v7, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v2, v2, s3, v10 +; GFX10-NEXT: v_and_or_b32 v2, 0xff, v2, v10 ; GFX10-NEXT: v_lshlrev_b32_e32 v10, 24, v5 -; GFX10-NEXT: v_and_or_b32 v3, v3, s3, v12 +; GFX10-NEXT: v_and_or_b32 v3, 0xff, v3, v12 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6 -; GFX10-NEXT: v_and_or_b32 v12, v4, s3, v14 +; GFX10-NEXT: v_and_or_b32 v12, 0xff, v4, v14 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8 -; GFX10-NEXT: v_and_or_b32 v14, v0, s3, v1 +; GFX10-NEXT: v_and_or_b32 v14, 0xff, v0, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v9 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_mov_b32_e32 v5, 0 @@ -6319,95 +6236,95 @@ ; GFX9-NEXT: global_load_dwordx4 v[3:6], v[0:1], off ; GFX9-NEXT: s_mov_b32 s0, 8 ; GFX9-NEXT: s_mov_b32 s1, 16 -; GFX9-NEXT: s_movk_i32 s6, 0xff -; GFX9-NEXT: v_mov_b32_e32 v0, 8 -; GFX9-NEXT: v_mov_b32_e32 v1, 16 +; GFX9-NEXT: s_movk_i32 s3, 0xff +; GFX9-NEXT: v_mov_b32_e32 v1, 8 +; GFX9-NEXT: v_mov_b32_e32 v0, 0xff +; GFX9-NEXT: v_mov_b32_e32 v7, 16 ; GFX9-NEXT: s_lshr_b32 s4, s2, 2 -; GFX9-NEXT: s_and_b32 s2, s2, 3 -; GFX9-NEXT: s_lshl_b32 s2, s2, 3 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: s_lshl_b32 s2, s6, s2 -; GFX9-NEXT: s_not_b32 s5, s2 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 -; GFX9-NEXT: v_mov_b32_e32 v7, 0 -; GFX9-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 24, v4 -; GFX9-NEXT: v_lshlrev_b32_sdwa v13, s0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v15, s0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v5 -; GFX9-NEXT: v_lshlrev_b32_sdwa v14, s1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v16, s1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v17, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_and_or_b32 v3, v3, s6, v13 -; GFX9-NEXT: v_and_or_b32 v4, v4, s6, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 24, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v4 +; GFX9-NEXT: v_lshlrev_b32_sdwa v12, s0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v14, s0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: s_and_b32 s0, s2, 3 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 24, v5 +; GFX9-NEXT: v_lshlrev_b32_sdwa v13, s1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v15, s1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v16, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_and_or_b32 v3, v3, s3, v12 +; GFX9-NEXT: v_and_or_b32 v12, v4, s3, v14 +; GFX9-NEXT: s_lshl_b32 s0, s0, 3 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 24, v8 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v6 +; GFX9-NEXT: v_lshlrev_b32_sdwa v17, v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v18, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_and_or_b32 v14, v5, v0, v16 +; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_lshl_b32 s0, 0xff, s0 ; GFX9-NEXT: v_lshlrev_b32_e32 v10, 24, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 24, v6 -; GFX9-NEXT: v_lshlrev_b32_sdwa v18, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v19, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_and_or_b32 v5, v5, s6, v17 +; GFX9-NEXT: v_or3_b32 v3, v3, v13, v8 +; GFX9-NEXT: v_or3_b32 v8, v12, v15, v9 +; GFX9-NEXT: v_lshlrev_b32_sdwa v19, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v6, v6, v0, v18 +; GFX9-NEXT: s_not_b32 s5, s0 ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 24, v11 -; GFX9-NEXT: v_or3_b32 v3, v3, v14, v9 -; GFX9-NEXT: v_or3_b32 v4, v4, v16, v10 -; GFX9-NEXT: v_and_or_b32 v13, v6, s6, v19 -; GFX9-NEXT: v_lshlrev_b32_sdwa v6, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_e32 v12, 24, v12 -; GFX9-NEXT: v_or3_b32 v5, v5, v18, v11 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v3, v4, vcc +; GFX9-NEXT: v_or3_b32 v9, v14, v17, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v3, v8, vcc ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 -; GFX9-NEXT: v_or3_b32 v6, v13, v6, v12 -; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v5, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v6, s[2:3] -; GFX9-NEXT: v_and_or_b32 v2, v9, s5, v2 +; GFX9-NEXT: v_or3_b32 v6, v6, v19, v11 +; GFX9-NEXT: v_cndmask_b32_e64 v10, v10, v9, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 +; GFX9-NEXT: v_cndmask_b32_e64 v10, v10, v6, s[2:3] +; GFX9-NEXT: v_and_or_b32 v2, v10, s5, v2 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v2, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v2, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v8, v8, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v2, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[2:3] ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 24, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 24, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v12, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v14, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v16, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v13, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v15, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v17, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v18, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v1, v3, s6, v12 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 24, v6 -; GFX9-NEXT: v_and_or_b32 v4, v4, s6, v14 -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 24, v9 -; GFX9-NEXT: v_and_or_b32 v5, v5, s6, v16 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 24, v10 -; GFX9-NEXT: v_and_or_b32 v10, v2, s6, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 24, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 24, v2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v13, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v15, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v17, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v14, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v16, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v18, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v7, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v3, v3, v0, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX9-NEXT: v_and_or_b32 v8, v8, v0, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; GFX9-NEXT: v_and_or_b32 v9, v9, v0, v17 ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 24, v11 -; GFX9-NEXT: v_or3_b32 v0, v1, v13, v3 -; GFX9-NEXT: v_or3_b32 v1, v4, v15, v6 -; GFX9-NEXT: v_or3_b32 v2, v5, v17, v9 -; GFX9-NEXT: v_or3_b32 v3, v10, v18, v11 -; GFX9-NEXT: global_store_dwordx4 v[7:8], v[0:3], off +; GFX9-NEXT: v_and_or_b32 v13, v2, v0, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; GFX9-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-NEXT: v_or3_b32 v0, v3, v14, v6 +; GFX9-NEXT: v_or3_b32 v1, v8, v16, v10 +; GFX9-NEXT: v_or3_b32 v2, v9, v18, v11 +; GFX9-NEXT: v_or3_b32 v3, v13, v7, v12 +; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_v_v16i8_v_s: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dwordx4 v[3:6], v[0:1] -; GFX8-NEXT: s_and_b32 s1, s2, 3 +; GFX8-NEXT: s_and_b32 s0, s2, 3 ; GFX8-NEXT: v_mov_b32_e32 v0, 8 -; GFX8-NEXT: s_lshl_b32 s1, s1, 3 +; GFX8-NEXT: s_lshl_b32 s0, s0, 3 ; GFX8-NEXT: v_mov_b32_e32 v1, 16 ; GFX8-NEXT: v_mov_b32_e32 v9, 8 -; GFX8-NEXT: v_mov_b32_e32 v11, s1 +; GFX8-NEXT: v_mov_b32_e32 v11, s0 ; GFX8-NEXT: v_mov_b32_e32 v10, 16 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v11, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: s_movk_i32 s0, 0xff ; GFX8-NEXT: s_lshr_b32 s4, s2, 2 -; GFX8-NEXT: s_lshl_b32 s0, s0, s1 +; GFX8-NEXT: s_lshl_b32 s0, 0xff, s0 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 ; GFX8-NEXT: s_not_b32 s5, s0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 @@ -6490,108 +6407,109 @@ ; GFX7-NEXT: s_mov_b32 s11, 0xf000 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_movk_i32 s6, 0xff -; GFX7-NEXT: v_and_b32_e32 v0, s6, v2 -; GFX7-NEXT: s_and_b32 s0, s2, 3 +; GFX7-NEXT: s_movk_i32 s0, 0xff +; GFX7-NEXT: v_mov_b32_e32 v7, 0xff ; GFX7-NEXT: s_lshr_b32 s4, s2, 2 -; GFX7-NEXT: s_lshl_b32 s0, s0, 3 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s0, v0 -; GFX7-NEXT: s_lshl_b32 s0, s6, s0 +; GFX7-NEXT: v_and_b32_e32 v2, v2, v7 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 -; GFX7-NEXT: s_not_b32 s5, s0 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 ; GFX7-NEXT: s_mov_b32 s10, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_bfe_u32 v10, v3, 8, 8 -; GFX7-NEXT: v_bfe_u32 v12, v4, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v4 -; GFX7-NEXT: v_and_b32_e32 v9, s6, v3 +; GFX7-NEXT: v_bfe_u32 v11, v3, 8, 8 +; GFX7-NEXT: v_bfe_u32 v13, v4, 8, 8 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v4 +; GFX7-NEXT: v_and_b32_e32 v10, s0, v3 ; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 8 -; GFX7-NEXT: v_and_b32_e32 v11, s6, v4 +; GFX7-NEXT: v_and_b32_e32 v12, s0, v4 ; GFX7-NEXT: v_bfe_u32 v4, v4, 16, 8 -; GFX7-NEXT: v_bfe_u32 v14, v5, 8, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v10, 8, v10 -; GFX7-NEXT: v_lshlrev_b32_e32 v12, 8, v12 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v5 -; GFX7-NEXT: v_and_b32_e32 v13, s6, v5 +; GFX7-NEXT: v_bfe_u32 v15, v5, 8, 8 +; GFX7-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX7-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 24, v5 +; GFX7-NEXT: v_and_b32_e32 v14, v5, v7 ; GFX7-NEXT: v_bfe_u32 v5, v5, 16, 8 -; GFX7-NEXT: v_bfe_u32 v16, v6, 8, 8 +; GFX7-NEXT: v_bfe_u32 v17, v6, 8, 8 +; GFX7-NEXT: s_and_b32 s0, s2, 3 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v14, 8, v14 -; GFX7-NEXT: v_or_b32_e32 v9, v9, v10 -; GFX7-NEXT: v_or_b32_e32 v10, v11, v12 -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 24, v6 -; GFX7-NEXT: v_and_b32_e32 v15, s6, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX7-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX7-NEXT: v_or_b32_e32 v11, v12, v13 +; GFX7-NEXT: v_lshrrev_b32_e32 v9, 24, v6 +; GFX7-NEXT: v_and_b32_e32 v16, v6, v7 ; GFX7-NEXT: v_bfe_u32 v6, v6, 16, 8 +; GFX7-NEXT: s_lshl_b32 s0, s0, 3 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v16, 8, v16 -; GFX7-NEXT: v_or_b32_e32 v11, v13, v14 -; GFX7-NEXT: v_or_b32_e32 v3, v9, v3 -; GFX7-NEXT: v_or_b32_e32 v4, v10, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v12, v15, v16 -; GFX7-NEXT: v_or_b32_e32 v5, v11, v5 -; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; GFX7-NEXT: v_or_b32_e32 v12, v14, v15 +; GFX7-NEXT: v_or_b32_e32 v3, v10, v3 +; GFX7-NEXT: v_or_b32_e32 v4, v11, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, s0, v2 +; GFX7-NEXT: s_lshl_b32 s0, 0xff, s0 ; GFX7-NEXT: v_lshlrev_b32_e32 v8, 24, v8 -; GFX7-NEXT: v_or_b32_e32 v6, v12, v6 -; GFX7-NEXT: v_or_b32_e32 v3, v5, v7 -; GFX7-NEXT: v_cndmask_b32_e32 v5, v1, v2, vcc -; GFX7-NEXT: v_or_b32_e32 v4, v6, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v13, v16, v17 +; GFX7-NEXT: v_or_b32_e32 v5, v12, v5 +; GFX7-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX7-NEXT: v_or_b32_e32 v1, v4, v1 +; GFX7-NEXT: s_not_b32 s5, s0 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; GFX7-NEXT: v_or_b32_e32 v6, v13, v6 +; GFX7-NEXT: v_or_b32_e32 v3, v5, v8 +; GFX7-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 +; GFX7-NEXT: v_or_b32_e32 v4, v6, v9 ; GFX7-NEXT: v_cndmask_b32_e64 v5, v5, v3, s[0:1] +; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 ; GFX7-NEXT: v_cndmask_b32_e64 v5, v5, v4, s[2:3] ; GFX7-NEXT: v_and_b32_e32 v5, s5, v5 -; GFX7-NEXT: v_or_b32_e32 v0, v5, v0 +; GFX7-NEXT: v_or_b32_e32 v2, v5, v2 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0 -; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v0, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v0, s[0:1] -; GFX7-NEXT: v_bfe_u32 v9, v1, 8, 8 -; GFX7-NEXT: v_bfe_u32 v11, v2, 8, 8 -; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v0, s[2:3] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v2 -; GFX7-NEXT: v_and_b32_e32 v8, s6, v1 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX7-NEXT: v_bfe_u32 v10, v0, 8, 8 +; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v2, s[0:1] +; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v2, s[2:3] +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v0 +; GFX7-NEXT: v_and_b32_e32 v9, v0, v7 +; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 +; GFX7-NEXT: v_bfe_u32 v12, v1, 8, 8 +; GFX7-NEXT: v_lshlrev_b32_e32 v10, 8, v10 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v1 +; GFX7-NEXT: v_and_b32_e32 v11, v1, v7 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX7-NEXT: v_and_b32_e32 v10, s6, v2 -; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 8 -; GFX7-NEXT: v_bfe_u32 v13, v3, 8, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; GFX7-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX7-NEXT: v_bfe_u32 v14, v3, 8, 8 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v12, 8, v12 +; GFX7-NEXT: v_or_b32_e32 v9, v9, v10 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v3 -; GFX7-NEXT: v_and_b32_e32 v12, s6, v3 +; GFX7-NEXT: v_and_b32_e32 v13, v3, v7 ; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 8 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v13, 8, v13 -; GFX7-NEXT: v_or_b32_e32 v8, v8, v9 -; GFX7-NEXT: v_or_b32_e32 v9, v10, v11 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; GFX7-NEXT: v_or_b32_e32 v10, v11, v12 +; GFX7-NEXT: v_or_b32_e32 v0, v9, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 24, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_or_b32_e32 v10, v12, v13 -; GFX7-NEXT: v_or_b32_e32 v1, v8, v1 -; GFX7-NEXT: v_or_b32_e32 v2, v9, v2 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX7-NEXT: v_or_b32_e32 v1, v2, v5 -; GFX7-NEXT: v_or_b32_e32 v2, v10, v3 +; GFX7-NEXT: v_or_b32_e32 v11, v13, v14 +; GFX7-NEXT: v_or_b32_e32 v1, v10, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v5 +; GFX7-NEXT: v_or_b32_e32 v2, v11, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v6 ; GFX7-NEXT: v_bfe_u32 v5, v4, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 24, v4 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX7-NEXT: v_and_b32_e32 v3, s6, v4 +; GFX7-NEXT: v_and_b32_e32 v3, v4, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX7-NEXT: v_bfe_u32 v4, v4, 16, 8 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v8 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; GFX7-NEXT: s_endpgm @@ -6602,10 +6520,9 @@ ; GFX10-NEXT: s_mov_b32 s0, 8 ; GFX10-NEXT: v_mov_b32_e32 v0, 8 ; GFX10-NEXT: s_mov_b32 s1, 16 -; GFX10-NEXT: s_movk_i32 s3, 0xff ; GFX10-NEXT: v_mov_b32_e32 v1, 16 -; GFX10-NEXT: s_lshr_b32 s4, s2, 2 -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s4, 1 +; GFX10-NEXT: s_lshr_b32 s3, s2, 2 +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s3, 1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v3 ; GFX10-NEXT: v_lshrrev_b32_e32 v8, 24, v4 @@ -6615,34 +6532,34 @@ ; GFX10-NEXT: v_lshlrev_b32_sdwa v12, s1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v14, s1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v15, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_and_or_b32 v3, v3, s3, v11 +; GFX10-NEXT: v_and_or_b32 v3, v3, 0xff, v11 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; GFX10-NEXT: v_and_or_b32 v4, v4, s3, v13 +; GFX10-NEXT: v_and_or_b32 v4, v4, 0xff, v13 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8 ; GFX10-NEXT: v_lshrrev_b32_e32 v10, 24, v6 ; GFX10-NEXT: v_lshlrev_b32_sdwa v16, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v17, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_and_or_b32 v5, v5, s3, v15 +; GFX10-NEXT: v_and_or_b32 v5, 0xff, v5, v15 ; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v9 ; GFX10-NEXT: v_or3_b32 v3, v3, v12, v7 ; GFX10-NEXT: v_or3_b32 v4, v4, v14, v8 ; GFX10-NEXT: v_lshlrev_b32_sdwa v11, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v6, v6, s3, v17 +; GFX10-NEXT: v_and_or_b32 v6, 0xff, v6, v17 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v10 ; GFX10-NEXT: v_or3_b32 v5, v5, v16, v9 ; GFX10-NEXT: v_cndmask_b32_e32 v8, v3, v4, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s4, 2 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s3, 2 ; GFX10-NEXT: s_and_b32 s1, s2, 3 ; GFX10-NEXT: v_or3_b32 v6, v6, v11, v7 ; GFX10-NEXT: s_lshl_b32 s2, s1, 3 -; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s4, 3 +; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s3, 3 ; GFX10-NEXT: v_cndmask_b32_e64 v7, v8, v5, s0 ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: s_lshl_b32 s2, s3, s2 +; GFX10-NEXT: s_lshl_b32 s2, 0xff, s2 ; GFX10-NEXT: s_not_b32 s2, s2 ; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v6, s1 ; GFX10-NEXT: v_and_or_b32 v2, v7, s2, v2 -; GFX10-NEXT: v_cmp_eq_u32_e64 s2, s4, 0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s2, s3, 0 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v2, s2 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v2, s0 @@ -6659,13 +6576,13 @@ ; GFX10-NEXT: v_lshlrev_b32_sdwa v13, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v15, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v16, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v1, v3, s3, v10 +; GFX10-NEXT: v_and_or_b32 v1, 0xff, v3, v10 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v6 -; GFX10-NEXT: v_and_or_b32 v6, v4, s3, v12 +; GFX10-NEXT: v_and_or_b32 v6, 0xff, v4, v12 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; GFX10-NEXT: v_and_or_b32 v10, v5, s3, v14 +; GFX10-NEXT: v_and_or_b32 v10, 0xff, v5, v14 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8 -; GFX10-NEXT: v_and_or_b32 v12, v2, s3, v0 +; GFX10-NEXT: v_and_or_b32 v12, 0xff, v2, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v9 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_mov_b32_e32 v5, 0 @@ -6969,50 +6886,48 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_dwordx4 v[4:7], v[0:1], off ; GFX10-NEXT: s_mov_b32 s0, 8 -; GFX10-NEXT: v_mov_b32_e32 v8, 8 +; GFX10-NEXT: v_mov_b32_e32 v1, 8 ; GFX10-NEXT: s_mov_b32 s1, 16 -; GFX10-NEXT: s_movk_i32 s2, 0xff ; GFX10-NEXT: v_and_b32_e32 v0, 3, v3 -; GFX10-NEXT: v_mov_b32_e32 v1, 0xff -; GFX10-NEXT: v_mov_b32_e32 v9, 16 +; GFX10-NEXT: v_mov_b32_e32 v8, 16 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 2, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 +; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v3 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v10, 24, v4 -; GFX10-NEXT: v_lshrrev_b32_e32 v11, 24, v5 -; GFX10-NEXT: v_lshlrev_b32_sdwa v14, s0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v16, s0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshrrev_b32_e32 v12, 24, v6 -; GFX10-NEXT: v_lshlrev_b32_sdwa v15, s1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v17, s1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v18, v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_and_or_b32 v4, v4, s2, v14 +; GFX10-NEXT: v_lshrrev_b32_e32 v9, 24, v4 +; GFX10-NEXT: v_lshrrev_b32_e32 v10, 24, v5 +; GFX10-NEXT: v_lshlrev_b32_sdwa v13, s0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v15, s0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshrrev_b32_e32 v11, 24, v6 +; GFX10-NEXT: v_lshlrev_b32_sdwa v14, s1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v16, s1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v17, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_and_or_b32 v4, v4, 0xff, v13 +; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; GFX10-NEXT: v_and_or_b32 v5, v5, 0xff, v15 ; GFX10-NEXT: v_lshlrev_b32_e32 v10, 24, v10 -; GFX10-NEXT: v_and_or_b32 v5, v5, s2, v16 +; GFX10-NEXT: v_lshrrev_b32_e32 v12, 24, v7 +; GFX10-NEXT: v_lshlrev_b32_sdwa v18, v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v19, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_and_or_b32 v6, 0xff, v6, v17 ; GFX10-NEXT: v_lshlrev_b32_e32 v11, 24, v11 -; GFX10-NEXT: v_lshrrev_b32_e32 v13, 24, v7 -; GFX10-NEXT: v_lshlrev_b32_sdwa v19, v9, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v20, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_and_or_b32 v6, v6, v1, v18 -; GFX10-NEXT: v_lshlrev_b32_e32 v12, 24, v12 -; GFX10-NEXT: v_or3_b32 v4, v4, v15, v10 -; GFX10-NEXT: v_or3_b32 v5, v5, v17, v11 -; GFX10-NEXT: v_lshlrev_b32_sdwa v14, v9, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v7, v7, v1, v20 -; GFX10-NEXT: v_lshlrev_b32_e32 v10, 24, v13 -; GFX10-NEXT: v_or3_b32 v6, v6, v19, v12 -; GFX10-NEXT: v_cndmask_b32_e32 v11, v4, v5, vcc_lo +; GFX10-NEXT: v_or3_b32 v4, v4, v14, v9 +; GFX10-NEXT: v_or3_b32 v5, v5, v16, v10 +; GFX10-NEXT: v_lshlrev_b32_sdwa v13, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_and_or_b32 v7, 0xff, v7, v19 +; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v12 +; GFX10-NEXT: v_or3_b32 v6, v6, v18, v11 +; GFX10-NEXT: v_cndmask_b32_e32 v10, v4, v5, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v12, v0, v1 -; GFX10-NEXT: v_or3_b32 v7, v7, v14, v10 +; GFX10-NEXT: v_lshlrev_b32_e64 v11, v0, 0xff +; GFX10-NEXT: v_or3_b32 v7, v7, v13, v9 ; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_cndmask_b32_e64 v10, v11, v6, s0 -; GFX10-NEXT: v_xor_b32_e32 v2, -1, v12 -; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v3 -; GFX10-NEXT: v_cndmask_b32_e64 v10, v10, v7, s1 -; GFX10-NEXT: v_and_or_b32 v0, v10, v2, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v9, v10, v6, s0 +; GFX10-NEXT: v_xor_b32_e32 v2, -1, v11 +; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v7, s1 +; GFX10-NEXT: v_and_or_b32 v0, v9, v2, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v4, v0, s2 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v0, s0 @@ -7020,29 +6935,29 @@ ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v3 ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v4 -; GFX10-NEXT: v_lshrrev_b32_e32 v10, 24, v0 -; GFX10-NEXT: v_lshlrev_b32_sdwa v11, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v13, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v15, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v12, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v14, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v16, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v9, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v2, v2, v1, v11 -; GFX10-NEXT: v_lshlrev_b32_e32 v11, 24, v5 -; GFX10-NEXT: v_and_or_b32 v3, v3, v1, v13 +; GFX10-NEXT: v_lshrrev_b32_e32 v9, 24, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v10, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v12, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v14, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v11, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v13, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v15, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_and_or_b32 v2, 0xff, v2, v10 +; GFX10-NEXT: v_lshlrev_b32_e32 v10, 24, v5 +; GFX10-NEXT: v_and_or_b32 v3, 0xff, v3, v12 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6 -; GFX10-NEXT: v_and_or_b32 v13, v4, v1, v15 +; GFX10-NEXT: v_and_or_b32 v12, 0xff, v4, v14 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; GFX10-NEXT: v_and_or_b32 v8, v0, v1, v8 -; GFX10-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; GFX10-NEXT: v_and_or_b32 v14, 0xff, v0, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v9 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_mov_b32_e32 v5, 0 -; GFX10-NEXT: v_or3_b32 v0, v2, v12, v11 -; GFX10-NEXT: v_or3_b32 v1, v3, v14, v6 -; GFX10-NEXT: v_or3_b32 v2, v13, v16, v7 -; GFX10-NEXT: v_or3_b32 v3, v8, v9, v10 +; GFX10-NEXT: v_or3_b32 v0, v2, v11, v10 +; GFX10-NEXT: v_or3_b32 v1, v3, v13, v6 +; GFX10-NEXT: v_or3_b32 v2, v12, v15, v7 +; GFX10-NEXT: v_or3_b32 v3, v14, v8, v9 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX10-NEXT: s_endpgm %vec = load <16 x i8>, <16 x i8> addrspace(1)* %ptr diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll @@ -1261,10 +1261,9 @@ ; GFX10-LABEL: test_div_scale_f32_undef_val_val: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v0, 0x41000000 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s2, v0, v0, s0 +; GFX10-NEXT: v_div_scale_f32 v0, s2, 0x41000000, 0x41000000, s0 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float undef, float 8.0, i1 false) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.a16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.a16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.a16.ll @@ -505,15 +505,14 @@ ; ; GFX10-LABEL: atomic_add_i32_3d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s6, s8 ; GFX10-NEXT: s_lshl_b32 s8, s0, 16 -; GFX10-NEXT: v_and_or_b32 v1, v1, v4, v2 -; GFX10-NEXT: v_and_or_b32 v2, v3, v4, s8 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v2 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v3, s8 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s3, s5 ; GFX10-NEXT: s_mov_b32 s5, s7 @@ -549,15 +548,14 @@ ; ; GFX10-LABEL: atomic_add_i32_cube: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s6, s8 ; GFX10-NEXT: s_lshl_b32 s8, s0, 16 -; GFX10-NEXT: v_and_or_b32 v1, v1, v4, v2 -; GFX10-NEXT: v_and_or_b32 v2, v3, v4, s8 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v2 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v3, s8 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s3, s5 ; GFX10-NEXT: s_mov_b32 s5, s7 @@ -632,15 +630,14 @@ ; ; GFX10-LABEL: atomic_add_i32_2darray: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s6, s8 ; GFX10-NEXT: s_lshl_b32 s8, s0, 16 -; GFX10-NEXT: v_and_or_b32 v1, v1, v4, v2 -; GFX10-NEXT: v_and_or_b32 v2, v3, v4, s8 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v2 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v3, s8 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s3, s5 ; GFX10-NEXT: s_mov_b32 s5, s7 @@ -676,15 +673,14 @@ ; ; GFX10-LABEL: atomic_add_i32_2dmsaa: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s6, s8 ; GFX10-NEXT: s_lshl_b32 s8, s0, 16 -; GFX10-NEXT: v_and_or_b32 v1, v1, v4, v2 -; GFX10-NEXT: v_and_or_b32 v2, v3, v4, s8 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v2 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v3, s8 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s3, s5 ; GFX10-NEXT: s_mov_b32 s5, s7 @@ -720,14 +716,13 @@ ; ; GFX10-LABEL: atomic_add_i32_2darraymsaa: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v5, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s2, s4 -; GFX10-NEXT: v_and_or_b32 v1, v1, v5, v2 -; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v4 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v2 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v3, v4 ; GFX10-NEXT: s_mov_b32 s3, s5 ; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s5, s7 @@ -1279,15 +1274,14 @@ ; ; GFX10-LABEL: atomic_add_i64_3d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v5, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s6, s8 ; GFX10-NEXT: s_lshl_b32 s8, s0, 16 -; GFX10-NEXT: v_and_or_b32 v2, v2, v5, v3 -; GFX10-NEXT: v_and_or_b32 v3, v4, v5, s8 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v2, v3 +; GFX10-NEXT: v_and_or_b32 v3, 0xffff, v4, s8 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s3, s5 ; GFX10-NEXT: s_mov_b32 s5, s7 @@ -1323,15 +1317,14 @@ ; ; GFX10-LABEL: atomic_add_i64_cube: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v5, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s6, s8 ; GFX10-NEXT: s_lshl_b32 s8, s0, 16 -; GFX10-NEXT: v_and_or_b32 v2, v2, v5, v3 -; GFX10-NEXT: v_and_or_b32 v3, v4, v5, s8 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v2, v3 +; GFX10-NEXT: v_and_or_b32 v3, 0xffff, v4, s8 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s3, s5 ; GFX10-NEXT: s_mov_b32 s5, s7 @@ -1406,15 +1399,14 @@ ; ; GFX10-LABEL: atomic_add_i64_2darray: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v5, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s6, s8 ; GFX10-NEXT: s_lshl_b32 s8, s0, 16 -; GFX10-NEXT: v_and_or_b32 v2, v2, v5, v3 -; GFX10-NEXT: v_and_or_b32 v3, v4, v5, s8 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v2, v3 +; GFX10-NEXT: v_and_or_b32 v3, 0xffff, v4, s8 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s3, s5 ; GFX10-NEXT: s_mov_b32 s5, s7 @@ -1450,15 +1442,14 @@ ; ; GFX10-LABEL: atomic_add_i64_2dmsaa: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v5, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s6, s8 ; GFX10-NEXT: s_lshl_b32 s8, s0, 16 -; GFX10-NEXT: v_and_or_b32 v2, v2, v5, v3 -; GFX10-NEXT: v_and_or_b32 v3, v4, v5, s8 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v2, v3 +; GFX10-NEXT: v_and_or_b32 v3, 0xffff, v4, s8 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s3, s5 ; GFX10-NEXT: s_mov_b32 s5, s7 @@ -1494,14 +1485,13 @@ ; ; GFX10-LABEL: atomic_add_i64_2darraymsaa: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v6, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s2, s4 -; GFX10-NEXT: v_and_or_b32 v2, v2, v6, v3 -; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v5 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v2, v3 +; GFX10-NEXT: v_and_or_b32 v3, 0xffff, v4, v5 ; GFX10-NEXT: s_mov_b32 s3, s5 ; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s5, s7 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll @@ -86,7 +86,6 @@ ; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10NSA-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX10NSA-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 ; GFX10NSA-NEXT: s_mov_b32 s4, s6 @@ -100,8 +99,8 @@ ; GFX10NSA-NEXT: s_mov_b32 s7, s9 ; GFX10NSA-NEXT: s_mov_b32 s9, s11 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 -; GFX10NSA-NEXT: v_and_or_b32 v0, v0, v3, v1 -; GFX10NSA-NEXT: v_and_or_b32 v1, v2, v3, s12 +; GFX10NSA-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX10NSA-NEXT: v_and_or_b32 v1, 0xffff, v2, s12 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_CUBE a16 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) @@ -143,7 +142,6 @@ ; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10NSA-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX10NSA-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 ; GFX10NSA-NEXT: s_mov_b32 s4, s6 @@ -157,8 +155,8 @@ ; GFX10NSA-NEXT: s_mov_b32 s7, s9 ; GFX10NSA-NEXT: s_mov_b32 s9, s11 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 -; GFX10NSA-NEXT: v_and_or_b32 v0, v0, v3, v1 -; GFX10NSA-NEXT: v_and_or_b32 v1, v2, v3, s12 +; GFX10NSA-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX10NSA-NEXT: v_and_or_b32 v1, 0xffff, v2, s12 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY a16 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) @@ -252,7 +250,6 @@ ; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10NSA-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX10NSA-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 ; GFX10NSA-NEXT: s_mov_b32 s4, s6 @@ -266,8 +263,8 @@ ; GFX10NSA-NEXT: s_mov_b32 s7, s9 ; GFX10NSA-NEXT: s_mov_b32 s9, s11 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 -; GFX10NSA-NEXT: v_and_or_b32 v0, v0, v3, v1 -; GFX10NSA-NEXT: v_and_or_b32 v1, v2, v3, s12 +; GFX10NSA-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX10NSA-NEXT: v_and_or_b32 v1, 0xffff, v2, s12 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4_cl v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) @@ -309,7 +306,6 @@ ; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10NSA-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10NSA-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 ; GFX10NSA-NEXT: s_mov_b32 s4, s6 @@ -323,8 +319,8 @@ ; GFX10NSA-NEXT: s_mov_b32 s7, s9 ; GFX10NSA-NEXT: s_mov_b32 s9, s11 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 -; GFX10NSA-NEXT: v_and_or_b32 v1, v1, v4, v2 -; GFX10NSA-NEXT: v_and_or_b32 v2, v3, v4, s12 +; GFX10NSA-NEXT: v_and_or_b32 v1, 0xffff, v1, v2 +; GFX10NSA-NEXT: v_and_or_b32 v2, 0xffff, v3, s12 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4_c_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) @@ -366,7 +362,6 @@ ; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10NSA-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX10NSA-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 ; GFX10NSA-NEXT: s_mov_b32 s4, s6 @@ -380,8 +375,8 @@ ; GFX10NSA-NEXT: s_mov_b32 s7, s9 ; GFX10NSA-NEXT: s_mov_b32 s9, s11 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 -; GFX10NSA-NEXT: v_and_or_b32 v0, v0, v3, s12 -; GFX10NSA-NEXT: v_and_or_b32 v1, v1, v3, v2 +; GFX10NSA-NEXT: v_and_or_b32 v0, 0xffff, v0, s12 +; GFX10NSA-NEXT: v_and_or_b32 v1, 0xffff, v1, v2 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4_b v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) @@ -423,7 +418,6 @@ ; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10NSA-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10NSA-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 ; GFX10NSA-NEXT: s_mov_b32 s4, s6 @@ -437,8 +431,8 @@ ; GFX10NSA-NEXT: s_mov_b32 s7, s9 ; GFX10NSA-NEXT: s_mov_b32 s9, s11 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 -; GFX10NSA-NEXT: v_and_or_b32 v0, v0, v4, s12 -; GFX10NSA-NEXT: v_and_or_b32 v2, v2, v4, v3 +; GFX10NSA-NEXT: v_and_or_b32 v0, 0xffff, v0, s12 +; GFX10NSA-NEXT: v_and_or_b32 v2, 0xffff, v2, v3 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4_c_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) @@ -481,7 +475,6 @@ ; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10NSA-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10NSA-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 ; GFX10NSA-NEXT: s_mov_b32 s4, s6 @@ -495,9 +488,9 @@ ; GFX10NSA-NEXT: s_mov_b32 s7, s9 ; GFX10NSA-NEXT: s_mov_b32 s9, s11 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 -; GFX10NSA-NEXT: v_and_or_b32 v0, v0, v4, s12 -; GFX10NSA-NEXT: v_and_or_b32 v1, v1, v4, v2 -; GFX10NSA-NEXT: v_and_or_b32 v2, v3, v4, s12 +; GFX10NSA-NEXT: v_and_or_b32 v0, 0xffff, v0, s12 +; GFX10NSA-NEXT: v_and_or_b32 v1, 0xffff, v1, v2 +; GFX10NSA-NEXT: v_and_or_b32 v2, 0xffff, v3, s12 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4_b_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) @@ -540,7 +533,6 @@ ; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10NSA-NEXT: v_mov_b32_e32 v5, 0xffff ; GFX10NSA-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 ; GFX10NSA-NEXT: s_mov_b32 s4, s6 @@ -554,9 +546,9 @@ ; GFX10NSA-NEXT: s_mov_b32 s7, s9 ; GFX10NSA-NEXT: s_mov_b32 s9, s11 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 -; GFX10NSA-NEXT: v_and_or_b32 v0, v0, v5, s12 -; GFX10NSA-NEXT: v_and_or_b32 v2, v2, v5, v3 -; GFX10NSA-NEXT: v_and_or_b32 v3, v4, v5, s12 +; GFX10NSA-NEXT: v_and_or_b32 v0, 0xffff, v0, s12 +; GFX10NSA-NEXT: v_and_or_b32 v2, 0xffff, v2, v3 +; GFX10NSA-NEXT: v_and_or_b32 v3, 0xffff, v4, s12 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4_c_b_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) @@ -592,7 +584,6 @@ ; ; GFX10NSA-LABEL: gather4_l_2d: ; GFX10NSA: ; %bb.0: ; %main_body -; GFX10NSA-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX10NSA-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10NSA-NEXT: s_mov_b32 s0, s2 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 @@ -601,8 +592,8 @@ ; GFX10NSA-NEXT: s_mov_b32 s8, s10 ; GFX10NSA-NEXT: s_mov_b32 s10, s12 ; GFX10NSA-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10NSA-NEXT: v_and_or_b32 v0, v0, v3, v1 -; GFX10NSA-NEXT: v_and_or_b32 v1, v2, v3, s12 +; GFX10NSA-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX10NSA-NEXT: v_and_or_b32 v1, 0xffff, v2, s12 ; GFX10NSA-NEXT: s_mov_b32 s1, s3 ; GFX10NSA-NEXT: s_mov_b32 s3, s5 ; GFX10NSA-NEXT: s_mov_b32 s5, s7 @@ -643,7 +634,6 @@ ; ; GFX10NSA-LABEL: gather4_c_l_2d: ; GFX10NSA: ; %bb.0: ; %main_body -; GFX10NSA-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10NSA-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10NSA-NEXT: s_mov_b32 s0, s2 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 @@ -652,8 +642,8 @@ ; GFX10NSA-NEXT: s_mov_b32 s8, s10 ; GFX10NSA-NEXT: s_mov_b32 s10, s12 ; GFX10NSA-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10NSA-NEXT: v_and_or_b32 v1, v1, v4, v2 -; GFX10NSA-NEXT: v_and_or_b32 v2, v3, v4, s12 +; GFX10NSA-NEXT: v_and_or_b32 v1, 0xffff, v1, v2 +; GFX10NSA-NEXT: v_and_or_b32 v2, 0xffff, v3, s12 ; GFX10NSA-NEXT: s_mov_b32 s1, s3 ; GFX10NSA-NEXT: s_mov_b32 s3, s5 ; GFX10NSA-NEXT: s_mov_b32 s5, s7 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll @@ -564,15 +564,14 @@ ; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: s_mov_b32 s6, s8 ; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX10-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_1D unorm d16 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_lshl_b32 s0, s0, 16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX10-NEXT: v_and_or_b32 v1, v1, v3, s0 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, s0 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX10-NEXT: v_and_or_b32 v0, v0, v3, v2 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v2 ; GFX10-NEXT: ; return to shader part epilog %v = call <3 x half> @llvm.amdgcn.image.load.1d.v3f16.i32(i32 7, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) ret <3 x half> %v diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll @@ -24,14 +24,13 @@ ; ; GFX10-LABEL: load_2darraymsaa_v4f32_xyzw: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s2, s4 -; GFX10-NEXT: v_and_or_b32 v0, v0, v4, v1 -; GFX10-NEXT: v_and_or_b32 v1, v2, v4, v3 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v2, v3 ; GFX10-NEXT: s_mov_b32 s3, s5 ; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s5, s7 @@ -79,17 +78,16 @@ ; GFX10-LABEL: load_2darraymsaa_v4f32_xyzw_tfe: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_mov_b32_e32 v5, 0 -; GFX10-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: v_mov_b32_e32 v6, v5 ; GFX10-NEXT: v_mov_b32_e32 v7, v5 ; GFX10-NEXT: v_mov_b32_e32 v8, v5 ; GFX10-NEXT: v_mov_b32_e32 v9, v5 -; GFX10-NEXT: v_and_or_b32 v10, v0, v4, v1 -; GFX10-NEXT: v_and_or_b32 v11, v2, v4, v3 -; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: v_and_or_b32 v10, 0xffff, v0, v1 +; GFX10-NEXT: v_and_or_b32 v11, 0xffff, v2, v3 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 ; GFX10-NEXT: s_mov_b32 s4, s6 @@ -148,17 +146,16 @@ ; GFX10-LABEL: load_2darraymsaa_v4f32_xyzw_tfe_lwe: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_mov_b32_e32 v5, 0 -; GFX10-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: v_mov_b32_e32 v6, v5 ; GFX10-NEXT: v_mov_b32_e32 v7, v5 ; GFX10-NEXT: v_mov_b32_e32 v8, v5 ; GFX10-NEXT: v_mov_b32_e32 v9, v5 -; GFX10-NEXT: v_and_or_b32 v10, v0, v4, v1 -; GFX10-NEXT: v_and_or_b32 v11, v2, v4, v3 -; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: v_and_or_b32 v10, 0xffff, v0, v1 +; GFX10-NEXT: v_and_or_b32 v11, 0xffff, v2, v3 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 ; GFX10-NEXT: s_mov_b32 s4, s6 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll @@ -24,15 +24,14 @@ ; ; GFX10-LABEL: load_3d_v4f32_xyzw: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_mov_b32_e32 v3, 0xffff -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s6, s8 ; GFX10-NEXT: s_lshl_b32 s8, s0, 16 -; GFX10-NEXT: v_and_or_b32 v0, v0, v3, v1 -; GFX10-NEXT: v_and_or_b32 v1, v2, v3, s8 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v3 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v2, s8 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s3, s5 ; GFX10-NEXT: s_mov_b32 s5, s7 @@ -79,7 +78,6 @@ ; GFX10-LABEL: load_3d_v4f32_xyzw_tfe: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_mov_b32_e32 v5, 0 -; GFX10-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s2, s4 @@ -90,8 +88,8 @@ ; GFX10-NEXT: v_mov_b32_e32 v7, v5 ; GFX10-NEXT: v_mov_b32_e32 v8, v5 ; GFX10-NEXT: v_mov_b32_e32 v9, v5 -; GFX10-NEXT: v_and_or_b32 v10, v0, v3, v1 -; GFX10-NEXT: v_and_or_b32 v11, v2, v3, s8 +; GFX10-NEXT: v_and_or_b32 v10, 0xffff, v0, v1 +; GFX10-NEXT: v_and_or_b32 v11, 0xffff, v2, s8 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s3, s5 ; GFX10-NEXT: s_mov_b32 s5, s7 @@ -148,7 +146,6 @@ ; GFX10-LABEL: load_3d_v4f32_xyzw_tfe_lwe: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_mov_b32_e32 v5, 0 -; GFX10-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s2, s4 @@ -159,8 +156,8 @@ ; GFX10-NEXT: v_mov_b32_e32 v7, v5 ; GFX10-NEXT: v_mov_b32_e32 v8, v5 ; GFX10-NEXT: v_mov_b32_e32 v9, v5 -; GFX10-NEXT: v_and_or_b32 v10, v0, v3, v1 -; GFX10-NEXT: v_and_or_b32 v11, v2, v3, s8 +; GFX10-NEXT: v_and_or_b32 v10, 0xffff, v0, v1 +; GFX10-NEXT: v_and_or_b32 v11, 0xffff, v2, s8 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s3, s5 ; GFX10-NEXT: s_mov_b32 s5, s7 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll @@ -4,10 +4,9 @@ define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s) { ; GFX10-LABEL: sample_d_1d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX10-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10-NEXT: v_and_or_b32 v0, v0, v3, s12 -; GFX10-NEXT: v_and_or_b32 v1, v1, v3, s12 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, s12 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, s12 ; GFX10-NEXT: image_sample_d_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -19,11 +18,10 @@ define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { ; GFX10-LABEL: sample_d_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v6, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX10-NEXT: v_and_or_b32 v0, v0, v6, v1 -; GFX10-NEXT: v_and_or_b32 v1, v2, v6, v3 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v2, v3 ; GFX10-NEXT: image_sample_d_g16 v[0:3], [v0, v1, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -37,14 +35,13 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v9, v2 ; GFX10-NEXT: v_mov_b32_e32 v10, v3 -; GFX10-NEXT: v_mov_b32_e32 v11, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX10-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10-NEXT: v_and_or_b32 v3, v9, v11, s12 -; GFX10-NEXT: v_and_or_b32 v2, v0, v11, v1 -; GFX10-NEXT: v_and_or_b32 v4, v10, v11, v4 -; GFX10-NEXT: v_and_or_b32 v5, v5, v11, s12 +; GFX10-NEXT: v_and_or_b32 v3, 0xffff, v9, s12 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v0, v1 +; GFX10-NEXT: v_and_or_b32 v4, 0xffff, v10, v4 +; GFX10-NEXT: v_and_or_b32 v5, 0xffff, v5, s12 ; GFX10-NEXT: image_sample_d_g16 v[0:3], v[2:8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -56,10 +53,9 @@ define amdgpu_ps <4 x float> @sample_c_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s) { ; GFX10-LABEL: sample_c_d_1d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10-NEXT: v_and_or_b32 v1, v1, v4, s12 -; GFX10-NEXT: v_and_or_b32 v2, v2, v4, s12 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, s12 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v2, s12 ; GFX10-NEXT: image_sample_c_d_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -71,11 +67,10 @@ define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { ; GFX10-LABEL: sample_c_d_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX10-NEXT: v_and_or_b32 v1, v1, v7, v2 -; GFX10-NEXT: v_and_or_b32 v2, v3, v7, v4 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v2 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v3, v4 ; GFX10-NEXT: image_sample_c_d_g16 v[0:3], [v0, v1, v2, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -87,10 +82,9 @@ define amdgpu_ps <4 x float> @sample_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s, float %clamp) { ; GFX10-LABEL: sample_d_cl_1d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10-NEXT: v_and_or_b32 v0, v0, v4, s12 -; GFX10-NEXT: v_and_or_b32 v1, v1, v4, s12 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, s12 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, s12 ; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -102,11 +96,10 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) { ; GFX10-LABEL: sample_d_cl_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX10-NEXT: v_and_or_b32 v0, v0, v7, v1 -; GFX10-NEXT: v_and_or_b32 v1, v2, v7, v3 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v2, v3 ; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v1, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -118,10 +111,9 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp) { ; GFX10-LABEL: sample_c_d_cl_1d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v5, 0xffff ; GFX10-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10-NEXT: v_and_or_b32 v1, v1, v5, s12 -; GFX10-NEXT: v_and_or_b32 v2, v2, v5, s12 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, s12 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v2, s12 ; GFX10-NEXT: image_sample_c_d_cl_g16 v[0:3], v[0:4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -136,11 +128,10 @@ ; GFX10-NEXT: v_mov_b32_e32 v8, v2 ; GFX10-NEXT: v_mov_b32_e32 v9, v3 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v8 -; GFX10-NEXT: v_and_or_b32 v4, v9, v0, v4 -; GFX10-NEXT: v_and_or_b32 v3, v1, v0, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v8 +; GFX10-NEXT: v_and_or_b32 v4, 0xffff, v9, v4 +; GFX10-NEXT: v_and_or_b32 v3, 0xffff, v1, v0 ; GFX10-NEXT: image_sample_c_d_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -152,10 +143,9 @@ define amdgpu_ps <4 x float> @sample_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s) { ; GFX10-LABEL: sample_cd_1d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX10-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10-NEXT: v_and_or_b32 v0, v0, v3, s12 -; GFX10-NEXT: v_and_or_b32 v1, v1, v3, s12 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, s12 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, s12 ; GFX10-NEXT: image_sample_cd_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -167,11 +157,10 @@ define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { ; GFX10-LABEL: sample_cd_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v6, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX10-NEXT: v_and_or_b32 v0, v0, v6, v1 -; GFX10-NEXT: v_and_or_b32 v1, v2, v6, v3 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v2, v3 ; GFX10-NEXT: image_sample_cd_g16 v[0:3], [v0, v1, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -183,10 +172,9 @@ define amdgpu_ps <4 x float> @sample_c_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s) { ; GFX10-LABEL: sample_c_cd_1d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10-NEXT: v_and_or_b32 v1, v1, v4, s12 -; GFX10-NEXT: v_and_or_b32 v2, v2, v4, s12 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, s12 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v2, s12 ; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -198,11 +186,10 @@ define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { ; GFX10-LABEL: sample_c_cd_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX10-NEXT: v_and_or_b32 v1, v1, v7, v2 -; GFX10-NEXT: v_and_or_b32 v2, v3, v7, v4 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v2 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v3, v4 ; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], [v0, v1, v2, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -214,10 +201,9 @@ define amdgpu_ps <4 x float> @sample_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s, float %clamp) { ; GFX10-LABEL: sample_cd_cl_1d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10-NEXT: v_and_or_b32 v0, v0, v4, s12 -; GFX10-NEXT: v_and_or_b32 v1, v1, v4, s12 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, s12 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, s12 ; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -229,11 +215,10 @@ define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) { ; GFX10-LABEL: sample_cd_cl_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX10-NEXT: v_and_or_b32 v0, v0, v7, v1 -; GFX10-NEXT: v_and_or_b32 v1, v2, v7, v3 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v2, v3 ; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v0, v1, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -245,10 +230,9 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp) { ; GFX10-LABEL: sample_c_cd_cl_1d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v5, 0xffff ; GFX10-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10-NEXT: v_and_or_b32 v1, v1, v5, s12 -; GFX10-NEXT: v_and_or_b32 v2, v2, v5, s12 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, s12 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v2, s12 ; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], v[0:4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -263,11 +247,10 @@ ; GFX10-NEXT: v_mov_b32_e32 v8, v2 ; GFX10-NEXT: v_mov_b32_e32 v9, v3 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v8 -; GFX10-NEXT: v_and_or_b32 v4, v9, v0, v4 -; GFX10-NEXT: v_and_or_b32 v3, v1, v0, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v8 +; GFX10-NEXT: v_and_or_b32 v4, 0xffff, v9, v4 +; GFX10-NEXT: v_and_or_b32 v3, 0xffff, v1, v0 ; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -284,11 +267,10 @@ ; GFX10-NEXT: v_mov_b32_e32 v11, v4 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v9 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX10-NEXT: v_and_or_b32 v4, v10, v0, v1 -; GFX10-NEXT: v_and_or_b32 v5, v11, v0, v5 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v9 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX10-NEXT: v_and_or_b32 v4, 0xffff, v10, v0 +; GFX10-NEXT: v_and_or_b32 v5, 0xffff, v11, v1 ; GFX10-NEXT: image_sample_c_d_o_g16 v0, v[2:8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -305,11 +287,10 @@ ; GFX10-NEXT: v_mov_b32_e32 v11, v4 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v9 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX10-NEXT: v_and_or_b32 v4, v10, v0, v1 -; GFX10-NEXT: v_and_or_b32 v5, v11, v0, v5 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v9 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX10-NEXT: v_and_or_b32 v4, 0xffff, v10, v0 +; GFX10-NEXT: v_and_or_b32 v5, 0xffff, v11, v1 ; GFX10-NEXT: image_sample_c_d_o_g16 v[0:1], v[2:8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll @@ -49,15 +49,14 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> inreg %tdescr) { ; GCN-LABEL: image_bvh_intersect_ray_a16: ; GCN: ; %bb.0: -; GCN-NEXT: s_mov_b32 s4, 0xffff ; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v5 -; GCN-NEXT: v_and_b32_e32 v10, s4, v7 -; GCN-NEXT: v_and_b32_e32 v8, s4, v8 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v7 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; GCN-NEXT: v_alignbit_b32 v7, v8, v7, 16 -; GCN-NEXT: v_and_or_b32 v5, v5, s4, v9 -; GCN-NEXT: v_and_or_b32 v6, v6, s4, v10 +; GCN-NEXT: v_and_or_b32 v5, v5, 0xffff, v9 +; GCN-NEXT: v_and_or_b32 v6, v6, 0xffff, v10 ; GCN-NEXT: image_bvh_intersect_ray v[0:3], v[0:7], s[0:3] a16 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog @@ -101,15 +100,14 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> inreg %tdescr) { ; GCN-LABEL: image_bvh64_intersect_ray_a16: ; GCN: ; %bb.0: -; GCN-NEXT: s_mov_b32 s4, 0xffff ; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v6 -; GCN-NEXT: v_and_b32_e32 v11, s4, v8 -; GCN-NEXT: v_and_b32_e32 v9, s4, v9 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v8 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; GCN-NEXT: v_alignbit_b32 v8, v9, v8, 16 -; GCN-NEXT: v_and_or_b32 v6, v6, s4, v10 -; GCN-NEXT: v_and_or_b32 v7, v7, s4, v11 +; GCN-NEXT: v_and_or_b32 v6, v6, 0xffff, v10 +; GCN-NEXT: v_and_or_b32 v7, v7, 0xffff, v11 ; GCN-NEXT: image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3] a16 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog @@ -202,21 +200,20 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16_vgpr_descr(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr) { ; GFX1030-LABEL: image_bvh_intersect_ray_a16_vgpr_descr: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_mov_b32 s0, 0xffff ; GFX1030-NEXT: v_mov_b32_e32 v13, v0 ; GFX1030-NEXT: v_mov_b32_e32 v14, v1 ; GFX1030-NEXT: v_lshrrev_b32_e32 v0, 16, v5 -; GFX1030-NEXT: v_and_b32_e32 v1, s0, v7 +; GFX1030-NEXT: v_and_b32_e32 v1, 0xffff, v7 ; GFX1030-NEXT: v_mov_b32_e32 v15, v2 -; GFX1030-NEXT: v_and_b32_e32 v2, s0, v8 +; GFX1030-NEXT: v_and_b32_e32 v2, 0xffff, v8 ; GFX1030-NEXT: v_mov_b32_e32 v16, v3 ; GFX1030-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX1030-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX1030-NEXT: v_mov_b32_e32 v17, v4 ; GFX1030-NEXT: v_alignbit_b32 v20, v2, v7, 16 ; GFX1030-NEXT: s_mov_b32 s1, exec_lo -; GFX1030-NEXT: v_and_or_b32 v18, v5, s0, v0 -; GFX1030-NEXT: v_and_or_b32 v19, v6, s0, v1 +; GFX1030-NEXT: v_and_or_b32 v18, v5, 0xffff, v0 +; GFX1030-NEXT: v_and_or_b32 v19, v6, 0xffff, v1 ; GFX1030-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX1030-NEXT: v_readfirstlane_b32 s4, v9 ; GFX1030-NEXT: v_readfirstlane_b32 s5, v10 @@ -246,16 +243,15 @@ ; ; GFX1013-LABEL: image_bvh_intersect_ray_a16_vgpr_descr: ; GFX1013: ; %bb.0: -; GFX1013-NEXT: s_mov_b32 s0, 0xffff ; GFX1013-NEXT: v_lshrrev_b32_e32 v13, 16, v5 -; GFX1013-NEXT: v_and_b32_e32 v14, s0, v7 -; GFX1013-NEXT: v_and_b32_e32 v8, s0, v8 +; GFX1013-NEXT: v_and_b32_e32 v14, 0xffff, v7 +; GFX1013-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; GFX1013-NEXT: s_mov_b32 s1, exec_lo ; GFX1013-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; GFX1013-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; GFX1013-NEXT: v_alignbit_b32 v7, v8, v7, 16 -; GFX1013-NEXT: v_and_or_b32 v5, v5, s0, v13 -; GFX1013-NEXT: v_and_or_b32 v6, v6, s0, v14 +; GFX1013-NEXT: v_and_or_b32 v5, v5, 0xffff, v13 +; GFX1013-NEXT: v_and_or_b32 v6, v6, 0xffff, v14 ; GFX1013-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX1013-NEXT: v_readfirstlane_b32 s4, v9 ; GFX1013-NEXT: v_readfirstlane_b32 s5, v10 @@ -371,21 +367,20 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr) { ; GFX1030-LABEL: image_bvh64_intersect_ray_a16_vgpr_descr: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_mov_b32 s0, 0xffff ; GFX1030-NEXT: v_mov_b32_e32 v14, v0 ; GFX1030-NEXT: v_mov_b32_e32 v15, v1 ; GFX1030-NEXT: v_lshrrev_b32_e32 v0, 16, v6 -; GFX1030-NEXT: v_and_b32_e32 v1, s0, v8 +; GFX1030-NEXT: v_and_b32_e32 v1, 0xffff, v8 ; GFX1030-NEXT: v_mov_b32_e32 v16, v2 -; GFX1030-NEXT: v_and_b32_e32 v2, s0, v9 +; GFX1030-NEXT: v_and_b32_e32 v2, 0xffff, v9 ; GFX1030-NEXT: v_mov_b32_e32 v17, v3 ; GFX1030-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX1030-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX1030-NEXT: v_mov_b32_e32 v18, v4 ; GFX1030-NEXT: v_mov_b32_e32 v19, v5 ; GFX1030-NEXT: v_alignbit_b32 v22, v2, v8, 16 -; GFX1030-NEXT: v_and_or_b32 v20, v6, s0, v0 -; GFX1030-NEXT: v_and_or_b32 v21, v7, s0, v1 +; GFX1030-NEXT: v_and_or_b32 v20, v6, 0xffff, v0 +; GFX1030-NEXT: v_and_or_b32 v21, v7, 0xffff, v1 ; GFX1030-NEXT: s_mov_b32 s1, exec_lo ; GFX1030-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1 ; GFX1030-NEXT: v_readfirstlane_b32 s4, v10 @@ -417,20 +412,19 @@ ; ; GFX1013-LABEL: image_bvh64_intersect_ray_a16_vgpr_descr: ; GFX1013: ; %bb.0: -; GFX1013-NEXT: s_mov_b32 s0, 0xffff ; GFX1013-NEXT: v_mov_b32_e32 v16, v10 ; GFX1013-NEXT: v_mov_b32_e32 v17, v11 ; GFX1013-NEXT: v_lshrrev_b32_e32 v10, 16, v6 -; GFX1013-NEXT: v_and_b32_e32 v11, s0, v8 -; GFX1013-NEXT: v_and_b32_e32 v9, s0, v9 +; GFX1013-NEXT: v_and_b32_e32 v11, 0xffff, v8 +; GFX1013-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; GFX1013-NEXT: v_mov_b32_e32 v18, v12 ; GFX1013-NEXT: v_mov_b32_e32 v19, v13 ; GFX1013-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; GFX1013-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; GFX1013-NEXT: v_alignbit_b32 v8, v9, v8, 16 ; GFX1013-NEXT: s_mov_b32 s1, exec_lo -; GFX1013-NEXT: v_and_or_b32 v6, v6, s0, v10 -; GFX1013-NEXT: v_and_or_b32 v7, v7, s0, v11 +; GFX1013-NEXT: v_and_or_b32 v6, v6, 0xffff, v10 +; GFX1013-NEXT: v_and_or_b32 v7, v7, 0xffff, v11 ; GFX1013-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1 ; GFX1013-NEXT: v_readfirstlane_b32 s4, v16 ; GFX1013-NEXT: v_readfirstlane_b32 s5, v17 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll @@ -752,14 +752,13 @@ ; GFX6-LABEL: bfe_8_bfe_8: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 -; GFX6-NEXT: s_mov_b32 s4, 0x80000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_bfe_i32 s3, s3, s4 -; GFX6-NEXT: s_bfe_i32 s3, s3, s4 +; GFX6-NEXT: s_bfe_i32 s3, s3, 0x80000 +; GFX6-NEXT: s_bfe_i32 s3, s3, 0x80000 ; GFX6-NEXT: v_mov_b32_e32 v0, s3 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll @@ -66,17 +66,16 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_mov_b32 s4, 8 -; GFX10-NEXT: s_movk_i32 s5, 0xff ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_and_or_b32 v0, v0, s5, v1 -; GFX10-NEXT: v_and_b32_e32 v1, s5, v2 -; GFX10-NEXT: v_and_b32_e32 v2, s5, v3 +; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v1 +; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xff, v3 ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, s4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_and_b32_e32 v5, s5, v6 -; GFX10-NEXT: v_and_b32_e32 v6, s5, v7 +; GFX10-NEXT: v_and_b32_e32 v5, 0xff, v6 +; GFX10-NEXT: v_and_b32_e32 v6, 0xff, v7 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX10-NEXT: v_and_or_b32 v3, v4, s5, v3 +; GFX10-NEXT: v_and_or_b32 v3, v4, 0xff, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v6 ; GFX10-NEXT: v_or3_b32 v0, v0, v1, v2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll @@ -66,17 +66,16 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_mov_b32 s4, 8 -; GFX10-NEXT: s_movk_i32 s5, 0xff ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_and_or_b32 v0, v0, s5, v1 -; GFX10-NEXT: v_and_b32_e32 v1, s5, v2 -; GFX10-NEXT: v_and_b32_e32 v2, s5, v3 +; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v1 +; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xff, v3 ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, s4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_and_b32_e32 v5, s5, v6 -; GFX10-NEXT: v_and_b32_e32 v6, s5, v7 +; GFX10-NEXT: v_and_b32_e32 v5, 0xff, v6 +; GFX10-NEXT: v_and_b32_e32 v6, 0xff, v7 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX10-NEXT: v_and_or_b32 v3, v4, s5, v3 +; GFX10-NEXT: v_and_or_b32 v3, v4, 0xff, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v6 ; GFX10-NEXT: v_or3_b32 v0, v0, v1, v2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll @@ -115,9 +115,8 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_mov_b32 s4, 0xffffff -; GFX10-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX10-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffffff, v1 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffffff, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = lshr i24 %value, %amount @@ -631,9 +630,8 @@ define amdgpu_ps half @lshr_i16_sv(i16 inreg %value, i16 %amount) { ; GFX6-LABEL: lshr_i16_sv: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s1, 0xffff -; GFX6-NEXT: v_and_b32_e32 v0, s1, v0 -; GFX6-NEXT: s_and_b32 s0, s0, s1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: v_lshr_b32_e32 v0, s0, v0 ; GFX6-NEXT: ; return to shader part epilog ; @@ -659,9 +657,8 @@ define amdgpu_ps half @lshr_i16_vs(i16 %value, i16 inreg %amount) { ; GFX6-LABEL: lshr_i16_vs: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s1, 0xffff -; GFX6-NEXT: s_and_b32 s0, s0, s1 -; GFX6-NEXT: v_and_b32_e32 v0, s1, v0 +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, s0, v0 ; GFX6-NEXT: ; return to shader part epilog ; @@ -757,9 +754,8 @@ define amdgpu_ps i32 @s_lshr_v2i16(<2 x i16> inreg %value, <2 x i16> inreg %amount) { ; GFX6-LABEL: s_lshr_v2i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s4, 0xffff -; GFX6-NEXT: s_and_b32 s1, s1, s4 -; GFX6-NEXT: s_and_b32 s0, s0, s4 +; GFX6-NEXT: s_and_b32 s1, s1, 0xffff +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: s_lshr_b32 s1, s1, s3 ; GFX6-NEXT: s_lshr_b32 s0, s0, s2 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 @@ -768,14 +764,13 @@ ; ; GFX8-LABEL: s_lshr_v2i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b32 s3, 0xffff ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, s3 -; GFX8-NEXT: s_lshr_b32 s4, s1, 16 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff +; GFX8-NEXT: s_lshr_b32 s3, s1, 16 ; GFX8-NEXT: s_lshr_b32 s0, s0, s1 -; GFX8-NEXT: s_lshr_b32 s1, s2, s4 +; GFX8-NEXT: s_lshr_b32 s1, s2, s3 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_and_b32 s0, s0, s3 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_or_b32 s0, s1, s0 ; GFX8-NEXT: ; return to shader part epilog ; @@ -808,10 +803,10 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_mov_b32 s2, 0xffff ; GFX6-NEXT: v_and_b32_e32 v0, s2, v0 -; GFX6-NEXT: s_and_b32 s0, s0, s2 +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: v_lshr_b32_e32 v0, s0, v0 ; GFX6-NEXT: v_and_b32_e32 v1, s2, v1 -; GFX6-NEXT: s_and_b32 s0, s1, s2 +; GFX6-NEXT: s_and_b32 s0, s1, 0xffff ; GFX6-NEXT: v_lshr_b32_e32 v1, s0, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 @@ -844,10 +839,10 @@ ; GFX6-LABEL: lshr_v2i16_vs: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_mov_b32 s2, 0xffff -; GFX6-NEXT: s_and_b32 s0, s0, s2 +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: v_and_b32_e32 v0, s2, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, s0, v0 -; GFX6-NEXT: s_and_b32 s0, s1, s2 +; GFX6-NEXT: s_and_b32 s0, s1, 0xffff ; GFX6-NEXT: v_and_b32_e32 v1, s2, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, s0, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -944,13 +939,12 @@ define amdgpu_ps <2 x i32> @s_lshr_v4i16(<4 x i16> inreg %value, <4 x i16> inreg %amount) { ; GFX6-LABEL: s_lshr_v4i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s8, 0xffff -; GFX6-NEXT: s_and_b32 s1, s1, s8 -; GFX6-NEXT: s_and_b32 s0, s0, s8 +; GFX6-NEXT: s_and_b32 s1, s1, 0xffff +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: s_lshr_b32 s1, s1, s5 -; GFX6-NEXT: s_and_b32 s3, s3, s8 +; GFX6-NEXT: s_and_b32 s3, s3, 0xffff ; GFX6-NEXT: s_lshr_b32 s0, s0, s4 -; GFX6-NEXT: s_and_b32 s2, s2, s8 +; GFX6-NEXT: s_and_b32 s2, s2, 0xffff ; GFX6-NEXT: s_lshr_b32 s3, s3, s7 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_lshr_b32 s2, s2, s6 @@ -961,36 +955,34 @@ ; ; GFX8-LABEL: s_lshr_v4i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b32 s6, 0xffff ; GFX8-NEXT: s_lshr_b32 s4, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, s6 -; GFX8-NEXT: s_lshr_b32 s7, s2, 16 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff +; GFX8-NEXT: s_lshr_b32 s6, s2, 16 ; GFX8-NEXT: s_lshr_b32 s5, s1, 16 -; GFX8-NEXT: s_and_b32 s1, s1, s6 -; GFX8-NEXT: s_lshr_b32 s8, s3, 16 +; GFX8-NEXT: s_and_b32 s1, s1, 0xffff +; GFX8-NEXT: s_lshr_b32 s7, s3, 16 ; GFX8-NEXT: s_lshr_b32 s0, s0, s2 -; GFX8-NEXT: s_lshr_b32 s2, s4, s7 +; GFX8-NEXT: s_lshr_b32 s2, s4, s6 ; GFX8-NEXT: s_lshr_b32 s1, s1, s3 -; GFX8-NEXT: s_lshr_b32 s3, s5, s8 +; GFX8-NEXT: s_lshr_b32 s3, s5, s7 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16 -; GFX8-NEXT: s_and_b32 s0, s0, s6 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_or_b32 s0, s2, s0 ; GFX8-NEXT: s_lshl_b32 s2, s3, 16 -; GFX8-NEXT: s_and_b32 s1, s1, s6 +; GFX8-NEXT: s_and_b32 s1, s1, 0xffff ; GFX8-NEXT: s_or_b32 s1, s2, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_lshr_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s5, 0xffff ; GFX9-NEXT: s_lshr_b32 s4, s0, 16 -; GFX9-NEXT: s_and_b32 s0, s0, s5 -; GFX9-NEXT: s_lshr_b32 s6, s2, 16 +; GFX9-NEXT: s_and_b32 s0, s0, 0xffff +; GFX9-NEXT: s_lshr_b32 s5, s2, 16 ; GFX9-NEXT: s_lshr_b32 s0, s0, s2 -; GFX9-NEXT: s_lshr_b32 s2, s4, s6 +; GFX9-NEXT: s_lshr_b32 s2, s4, s5 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 ; GFX9-NEXT: s_lshr_b32 s2, s1, 16 -; GFX9-NEXT: s_and_b32 s1, s1, s5 +; GFX9-NEXT: s_and_b32 s1, s1, 0xffff ; GFX9-NEXT: s_lshr_b32 s4, s3, 16 ; GFX9-NEXT: s_lshr_b32 s1, s1, s3 ; GFX9-NEXT: s_lshr_b32 s2, s2, s4 @@ -999,17 +991,16 @@ ; ; GFX10-LABEL: s_lshr_v4i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s4, 0xffff -; GFX10-NEXT: s_lshr_b32 s5, s0, 16 -; GFX10-NEXT: s_and_b32 s0, s0, s4 -; GFX10-NEXT: s_lshr_b32 s6, s2, 16 +; GFX10-NEXT: s_lshr_b32 s4, s0, 16 +; GFX10-NEXT: s_and_b32 s0, s0, 0xffff +; GFX10-NEXT: s_lshr_b32 s5, s2, 16 ; GFX10-NEXT: s_lshr_b32 s0, s0, s2 -; GFX10-NEXT: s_lshr_b32 s2, s5, s6 -; GFX10-NEXT: s_lshr_b32 s5, s1, 16 -; GFX10-NEXT: s_and_b32 s1, s1, s4 -; GFX10-NEXT: s_lshr_b32 s4, s3, 16 +; GFX10-NEXT: s_lshr_b32 s2, s4, s5 +; GFX10-NEXT: s_lshr_b32 s4, s1, 16 +; GFX10-NEXT: s_and_b32 s1, s1, 0xffff +; GFX10-NEXT: s_lshr_b32 s5, s3, 16 ; GFX10-NEXT: s_lshr_b32 s1, s1, s3 -; GFX10-NEXT: s_lshr_b32 s3, s5, s4 +; GFX10-NEXT: s_lshr_b32 s3, s4, s5 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s3 ; GFX10-NEXT: ; return to shader part epilog @@ -1124,21 +1115,20 @@ define amdgpu_ps <4 x i32> @s_lshr_v8i16(<8 x i16> inreg %value, <8 x i16> inreg %amount) { ; GFX6-LABEL: s_lshr_v8i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s16, 0xffff -; GFX6-NEXT: s_and_b32 s1, s1, s16 -; GFX6-NEXT: s_and_b32 s0, s0, s16 +; GFX6-NEXT: s_and_b32 s1, s1, 0xffff +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: s_lshr_b32 s1, s1, s9 -; GFX6-NEXT: s_and_b32 s3, s3, s16 +; GFX6-NEXT: s_and_b32 s3, s3, 0xffff ; GFX6-NEXT: s_lshr_b32 s0, s0, s8 -; GFX6-NEXT: s_and_b32 s2, s2, s16 +; GFX6-NEXT: s_and_b32 s2, s2, 0xffff ; GFX6-NEXT: s_lshr_b32 s3, s3, s11 -; GFX6-NEXT: s_and_b32 s5, s5, s16 -; GFX6-NEXT: s_and_b32 s7, s7, s16 +; GFX6-NEXT: s_and_b32 s5, s5, 0xffff +; GFX6-NEXT: s_and_b32 s7, s7, 0xffff ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_lshr_b32 s2, s2, s10 -; GFX6-NEXT: s_and_b32 s4, s4, s16 +; GFX6-NEXT: s_and_b32 s4, s4, 0xffff ; GFX6-NEXT: s_lshr_b32 s5, s5, s13 -; GFX6-NEXT: s_and_b32 s6, s6, s16 +; GFX6-NEXT: s_and_b32 s6, s6, 0xffff ; GFX6-NEXT: s_lshr_b32 s7, s7, s15 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: s_lshl_b32 s1, s3, 16 @@ -1153,64 +1143,62 @@ ; ; GFX8-LABEL: s_lshr_v8i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b32 s12, 0xffff ; GFX8-NEXT: s_lshr_b32 s8, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, s12 -; GFX8-NEXT: s_lshr_b32 s13, s4, 16 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff +; GFX8-NEXT: s_lshr_b32 s12, s4, 16 ; GFX8-NEXT: s_lshr_b32 s9, s1, 16 -; GFX8-NEXT: s_and_b32 s1, s1, s12 -; GFX8-NEXT: s_lshr_b32 s14, s5, 16 +; GFX8-NEXT: s_and_b32 s1, s1, 0xffff +; GFX8-NEXT: s_lshr_b32 s13, s5, 16 ; GFX8-NEXT: s_lshr_b32 s0, s0, s4 -; GFX8-NEXT: s_lshr_b32 s4, s8, s13 +; GFX8-NEXT: s_lshr_b32 s4, s8, s12 ; GFX8-NEXT: s_lshr_b32 s10, s2, 16 -; GFX8-NEXT: s_and_b32 s2, s2, s12 -; GFX8-NEXT: s_lshr_b32 s15, s6, 16 +; GFX8-NEXT: s_and_b32 s2, s2, 0xffff +; GFX8-NEXT: s_lshr_b32 s14, s6, 16 ; GFX8-NEXT: s_lshr_b32 s1, s1, s5 -; GFX8-NEXT: s_lshr_b32 s5, s9, s14 +; GFX8-NEXT: s_lshr_b32 s5, s9, s13 ; GFX8-NEXT: s_lshl_b32 s4, s4, 16 -; GFX8-NEXT: s_and_b32 s0, s0, s12 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_lshr_b32 s11, s3, 16 -; GFX8-NEXT: s_and_b32 s3, s3, s12 -; GFX8-NEXT: s_lshr_b32 s16, s7, 16 +; GFX8-NEXT: s_and_b32 s3, s3, 0xffff +; GFX8-NEXT: s_lshr_b32 s15, s7, 16 ; GFX8-NEXT: s_lshr_b32 s2, s2, s6 -; GFX8-NEXT: s_lshr_b32 s6, s10, s15 +; GFX8-NEXT: s_lshr_b32 s6, s10, s14 ; GFX8-NEXT: s_or_b32 s0, s4, s0 ; GFX8-NEXT: s_lshl_b32 s4, s5, 16 -; GFX8-NEXT: s_and_b32 s1, s1, s12 +; GFX8-NEXT: s_and_b32 s1, s1, 0xffff ; GFX8-NEXT: s_lshr_b32 s3, s3, s7 -; GFX8-NEXT: s_lshr_b32 s7, s11, s16 +; GFX8-NEXT: s_lshr_b32 s7, s11, s15 ; GFX8-NEXT: s_or_b32 s1, s4, s1 ; GFX8-NEXT: s_lshl_b32 s4, s6, 16 -; GFX8-NEXT: s_and_b32 s2, s2, s12 +; GFX8-NEXT: s_and_b32 s2, s2, 0xffff ; GFX8-NEXT: s_or_b32 s2, s4, s2 ; GFX8-NEXT: s_lshl_b32 s4, s7, 16 -; GFX8-NEXT: s_and_b32 s3, s3, s12 +; GFX8-NEXT: s_and_b32 s3, s3, 0xffff ; GFX8-NEXT: s_or_b32 s3, s4, s3 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_lshr_v8i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s9, 0xffff ; GFX9-NEXT: s_lshr_b32 s8, s0, 16 -; GFX9-NEXT: s_and_b32 s0, s0, s9 -; GFX9-NEXT: s_lshr_b32 s10, s4, 16 +; GFX9-NEXT: s_and_b32 s0, s0, 0xffff +; GFX9-NEXT: s_lshr_b32 s9, s4, 16 ; GFX9-NEXT: s_lshr_b32 s0, s0, s4 -; GFX9-NEXT: s_lshr_b32 s4, s8, s10 +; GFX9-NEXT: s_lshr_b32 s4, s8, s9 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s4 ; GFX9-NEXT: s_lshr_b32 s4, s1, 16 -; GFX9-NEXT: s_and_b32 s1, s1, s9 +; GFX9-NEXT: s_and_b32 s1, s1, 0xffff ; GFX9-NEXT: s_lshr_b32 s8, s5, 16 ; GFX9-NEXT: s_lshr_b32 s1, s1, s5 ; GFX9-NEXT: s_lshr_b32 s4, s4, s8 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s4 ; GFX9-NEXT: s_lshr_b32 s4, s2, 16 -; GFX9-NEXT: s_and_b32 s2, s2, s9 +; GFX9-NEXT: s_and_b32 s2, s2, 0xffff ; GFX9-NEXT: s_lshr_b32 s5, s6, 16 ; GFX9-NEXT: s_lshr_b32 s2, s2, s6 ; GFX9-NEXT: s_lshr_b32 s4, s4, s5 ; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s4 ; GFX9-NEXT: s_lshr_b32 s4, s3, 16 -; GFX9-NEXT: s_and_b32 s3, s3, s9 +; GFX9-NEXT: s_and_b32 s3, s3, 0xffff ; GFX9-NEXT: s_lshr_b32 s5, s7, 16 ; GFX9-NEXT: s_lshr_b32 s3, s3, s7 ; GFX9-NEXT: s_lshr_b32 s4, s4, s5 @@ -1219,26 +1207,25 @@ ; ; GFX10-LABEL: s_lshr_v8i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s8, 0xffff -; GFX10-NEXT: s_lshr_b32 s9, s0, 16 -; GFX10-NEXT: s_and_b32 s0, s0, s8 -; GFX10-NEXT: s_lshr_b32 s10, s4, 16 +; GFX10-NEXT: s_lshr_b32 s8, s0, 16 +; GFX10-NEXT: s_and_b32 s0, s0, 0xffff +; GFX10-NEXT: s_lshr_b32 s9, s4, 16 ; GFX10-NEXT: s_lshr_b32 s0, s0, s4 -; GFX10-NEXT: s_lshr_b32 s4, s9, s10 -; GFX10-NEXT: s_lshr_b32 s9, s1, 16 -; GFX10-NEXT: s_and_b32 s1, s1, s8 -; GFX10-NEXT: s_lshr_b32 s10, s5, 16 +; GFX10-NEXT: s_lshr_b32 s4, s8, s9 +; GFX10-NEXT: s_lshr_b32 s8, s1, 16 +; GFX10-NEXT: s_and_b32 s1, s1, 0xffff +; GFX10-NEXT: s_lshr_b32 s9, s5, 16 ; GFX10-NEXT: s_lshr_b32 s1, s1, s5 -; GFX10-NEXT: s_lshr_b32 s5, s9, s10 +; GFX10-NEXT: s_lshr_b32 s5, s8, s9 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s4 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5 ; GFX10-NEXT: s_lshr_b32 s4, s2, 16 -; GFX10-NEXT: s_and_b32 s2, s2, s8 +; GFX10-NEXT: s_and_b32 s2, s2, 0xffff ; GFX10-NEXT: s_lshr_b32 s5, s6, 16 ; GFX10-NEXT: s_lshr_b32 s2, s2, s6 ; GFX10-NEXT: s_lshr_b32 s4, s4, s5 ; GFX10-NEXT: s_lshr_b32 s5, s3, 16 -; GFX10-NEXT: s_and_b32 s3, s3, s8 +; GFX10-NEXT: s_and_b32 s3, s3, 0xffff ; GFX10-NEXT: s_lshr_b32 s6, s7, 16 ; GFX10-NEXT: s_lshr_b32 s3, s3, s7 ; GFX10-NEXT: s_lshr_b32 s5, s5, s6 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll @@ -12,25 +12,22 @@ ; ; GFX8-LABEL: s_mul_i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b32 s2, 0xffff -; GFX8-NEXT: s_and_b32 s0, s0, s2 -; GFX8-NEXT: s_and_b32 s1, s1, s2 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff +; GFX8-NEXT: s_and_b32 s1, s1, 0xffff ; GFX8-NEXT: s_mul_i32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_mul_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s2, 0xffff -; GFX9-NEXT: s_and_b32 s0, s0, s2 -; GFX9-NEXT: s_and_b32 s1, s1, s2 +; GFX9-NEXT: s_and_b32 s0, s0, 0xffff +; GFX9-NEXT: s_and_b32 s1, s1, 0xffff ; GFX9-NEXT: s_mul_i32 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_mul_i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s2, 0xffff -; GFX10-NEXT: s_and_b32 s0, s0, s2 -; GFX10-NEXT: s_and_b32 s1, s1, s2 +; GFX10-NEXT: s_and_b32 s0, s0, 0xffff +; GFX10-NEXT: s_and_b32 s1, s1, 0xffff ; GFX10-NEXT: s_mul_i32 s0, s0, s1 ; GFX10-NEXT: ; return to shader part epilog %result = mul i16 %num, %den @@ -78,29 +75,26 @@ ; ; GFX8-LABEL: s_mul_i16_zeroext: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b32 s2, 0xffff -; GFX8-NEXT: s_and_b32 s0, s0, s2 -; GFX8-NEXT: s_and_b32 s1, s1, s2 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff +; GFX8-NEXT: s_and_b32 s1, s1, 0xffff ; GFX8-NEXT: s_mul_i32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s0, s0, s2 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_mul_i16_zeroext: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s2, 0xffff -; GFX9-NEXT: s_and_b32 s0, s0, s2 -; GFX9-NEXT: s_and_b32 s1, s1, s2 +; GFX9-NEXT: s_and_b32 s0, s0, 0xffff +; GFX9-NEXT: s_and_b32 s1, s1, 0xffff ; GFX9-NEXT: s_mul_i32 s0, s0, s1 -; GFX9-NEXT: s_and_b32 s0, s0, s2 +; GFX9-NEXT: s_and_b32 s0, s0, 0xffff ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_mul_i16_zeroext: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s2, 0xffff -; GFX10-NEXT: s_and_b32 s0, s0, s2 -; GFX10-NEXT: s_and_b32 s1, s1, s2 +; GFX10-NEXT: s_and_b32 s0, s0, 0xffff +; GFX10-NEXT: s_and_b32 s1, s1, 0xffff ; GFX10-NEXT: s_mul_i32 s0, s0, s1 -; GFX10-NEXT: s_and_b32 s0, s0, s2 +; GFX10-NEXT: s_and_b32 s0, s0, 0xffff ; GFX10-NEXT: ; return to shader part epilog %result = mul i16 %num, %den ret i16 %result @@ -146,27 +140,24 @@ ; ; GFX8-LABEL: s_mul_i16_signext: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b32 s2, 0xffff -; GFX8-NEXT: s_and_b32 s0, s0, s2 -; GFX8-NEXT: s_and_b32 s1, s1, s2 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff +; GFX8-NEXT: s_and_b32 s1, s1, 0xffff ; GFX8-NEXT: s_mul_i32 s0, s0, s1 ; GFX8-NEXT: s_sext_i32_i16 s0, s0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_mul_i16_signext: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s2, 0xffff -; GFX9-NEXT: s_and_b32 s0, s0, s2 -; GFX9-NEXT: s_and_b32 s1, s1, s2 +; GFX9-NEXT: s_and_b32 s0, s0, 0xffff +; GFX9-NEXT: s_and_b32 s1, s1, 0xffff ; GFX9-NEXT: s_mul_i32 s0, s0, s1 ; GFX9-NEXT: s_sext_i32_i16 s0, s0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_mul_i16_signext: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s2, 0xffff -; GFX10-NEXT: s_and_b32 s0, s0, s2 -; GFX10-NEXT: s_and_b32 s1, s1, s2 +; GFX10-NEXT: s_and_b32 s0, s0, 0xffff +; GFX10-NEXT: s_and_b32 s1, s1, 0xffff ; GFX10-NEXT: s_mul_i32 s0, s0, s1 ; GFX10-NEXT: s_sext_i32_i16 s0, s0 ; GFX10-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll @@ -429,13 +429,12 @@ define amdgpu_ps i32 @s_orn2_v2i16(<2 x i16> inreg %src0, <2 x i16> inreg %src1) { ; GFX6-LABEL: s_orn2_v2i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s1, 0xffff ; GFX6-NEXT: s_lshl_b32 s0, s3, 16 -; GFX6-NEXT: s_and_b32 s2, s2, s1 -; GFX6-NEXT: s_or_b32 s0, s0, s2 -; GFX6-NEXT: s_lshl_b32 s2, s5, 16 -; GFX6-NEXT: s_and_b32 s1, s4, s1 -; GFX6-NEXT: s_or_b32 s1, s2, s1 +; GFX6-NEXT: s_and_b32 s1, s2, 0xffff +; GFX6-NEXT: s_or_b32 s0, s0, s1 +; GFX6-NEXT: s_lshl_b32 s1, s5, 16 +; GFX6-NEXT: s_and_b32 s2, s4, 0xffff +; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: s_xor_b32 s1, s1, -1 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: ; return to shader part epilog @@ -458,13 +457,12 @@ define amdgpu_ps i32 @s_orn2_v2i16_commute(<2 x i16> inreg %src0, <2 x i16> inreg %src1) { ; GFX6-LABEL: s_orn2_v2i16_commute: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s1, 0xffff ; GFX6-NEXT: s_lshl_b32 s0, s3, 16 -; GFX6-NEXT: s_and_b32 s2, s2, s1 -; GFX6-NEXT: s_or_b32 s0, s0, s2 -; GFX6-NEXT: s_lshl_b32 s2, s5, 16 -; GFX6-NEXT: s_and_b32 s1, s4, s1 -; GFX6-NEXT: s_or_b32 s1, s2, s1 +; GFX6-NEXT: s_and_b32 s1, s2, 0xffff +; GFX6-NEXT: s_or_b32 s0, s0, s1 +; GFX6-NEXT: s_lshl_b32 s1, s5, 16 +; GFX6-NEXT: s_and_b32 s2, s4, 0xffff +; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: s_xor_b32 s1, s1, -1 ; GFX6-NEXT: s_or_b32 s0, s1, s0 ; GFX6-NEXT: ; return to shader part epilog @@ -487,13 +485,12 @@ define amdgpu_ps { i32, i32 } @s_orn2_v2i16_multi_use(<2 x i16> inreg %src0, <2 x i16> inreg %src1) { ; GFX6-LABEL: s_orn2_v2i16_multi_use: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s1, 0xffff ; GFX6-NEXT: s_lshl_b32 s0, s3, 16 -; GFX6-NEXT: s_and_b32 s2, s2, s1 -; GFX6-NEXT: s_or_b32 s0, s0, s2 -; GFX6-NEXT: s_lshl_b32 s2, s5, 16 -; GFX6-NEXT: s_and_b32 s1, s4, s1 -; GFX6-NEXT: s_or_b32 s1, s2, s1 +; GFX6-NEXT: s_and_b32 s1, s2, 0xffff +; GFX6-NEXT: s_or_b32 s0, s0, s1 +; GFX6-NEXT: s_lshl_b32 s1, s5, 16 +; GFX6-NEXT: s_and_b32 s2, s4, 0xffff +; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: s_xor_b32 s1, s1, -1 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: ; return to shader part epilog @@ -522,19 +519,18 @@ define amdgpu_ps { i32, i32 } @s_orn2_v2i16_multi_foldable_use(<2 x i16> inreg %src0, <2 x i16> inreg %src1, <2 x i16> inreg %src2) { ; GFX6-LABEL: s_orn2_v2i16_multi_foldable_use: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s1, 0xffff ; GFX6-NEXT: s_lshl_b32 s0, s3, 16 -; GFX6-NEXT: s_and_b32 s2, s2, s1 -; GFX6-NEXT: s_or_b32 s0, s0, s2 -; GFX6-NEXT: s_lshl_b32 s2, s5, 16 -; GFX6-NEXT: s_and_b32 s3, s4, s1 -; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_lshl_b32 s3, s7, 16 -; GFX6-NEXT: s_and_b32 s1, s6, s1 -; GFX6-NEXT: s_or_b32 s1, s3, s1 -; GFX6-NEXT: s_xor_b32 s1, s1, -1 +; GFX6-NEXT: s_and_b32 s1, s2, 0xffff ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_or_b32 s1, s2, s1 +; GFX6-NEXT: s_lshl_b32 s1, s5, 16 +; GFX6-NEXT: s_and_b32 s2, s4, 0xffff +; GFX6-NEXT: s_or_b32 s1, s1, s2 +; GFX6-NEXT: s_lshl_b32 s2, s7, 16 +; GFX6-NEXT: s_and_b32 s3, s6, 0xffff +; GFX6-NEXT: s_or_b32 s2, s2, s3 +; GFX6-NEXT: s_xor_b32 s2, s2, -1 +; GFX6-NEXT: s_or_b32 s0, s0, s2 +; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_orn2_v2i16_multi_foldable_use: @@ -630,18 +626,17 @@ ; GFX6-LABEL: s_orn2_v4i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_lshl_b32 s0, s3, 16 -; GFX6-NEXT: s_mov_b32 s3, 0xffff -; GFX6-NEXT: s_and_b32 s1, s2, s3 +; GFX6-NEXT: s_and_b32 s1, s2, 0xffff ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: s_lshl_b32 s1, s5, 16 -; GFX6-NEXT: s_and_b32 s2, s4, s3 +; GFX6-NEXT: s_and_b32 s2, s4, 0xffff ; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: s_lshl_b32 s2, s7, 16 -; GFX6-NEXT: s_and_b32 s4, s6, s3 -; GFX6-NEXT: s_or_b32 s2, s2, s4 -; GFX6-NEXT: s_lshl_b32 s4, s9, 16 -; GFX6-NEXT: s_and_b32 s3, s8, s3 -; GFX6-NEXT: s_or_b32 s3, s4, s3 +; GFX6-NEXT: s_and_b32 s3, s6, 0xffff +; GFX6-NEXT: s_or_b32 s2, s2, s3 +; GFX6-NEXT: s_lshl_b32 s3, s9, 16 +; GFX6-NEXT: s_and_b32 s4, s8, 0xffff +; GFX6-NEXT: s_or_b32 s3, s3, s4 ; GFX6-NEXT: s_mov_b32 s4, -1 ; GFX6-NEXT: s_mov_b32 s5, s4 ; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5] @@ -673,18 +668,17 @@ ; GFX6-LABEL: s_orn2_v4i16_commute: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_lshl_b32 s0, s3, 16 -; GFX6-NEXT: s_mov_b32 s3, 0xffff -; GFX6-NEXT: s_and_b32 s1, s2, s3 +; GFX6-NEXT: s_and_b32 s1, s2, 0xffff ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: s_lshl_b32 s1, s5, 16 -; GFX6-NEXT: s_and_b32 s2, s4, s3 +; GFX6-NEXT: s_and_b32 s2, s4, 0xffff ; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: s_lshl_b32 s2, s7, 16 -; GFX6-NEXT: s_and_b32 s4, s6, s3 -; GFX6-NEXT: s_or_b32 s2, s2, s4 -; GFX6-NEXT: s_lshl_b32 s4, s9, 16 -; GFX6-NEXT: s_and_b32 s3, s8, s3 -; GFX6-NEXT: s_or_b32 s3, s4, s3 +; GFX6-NEXT: s_and_b32 s3, s6, 0xffff +; GFX6-NEXT: s_or_b32 s2, s2, s3 +; GFX6-NEXT: s_lshl_b32 s3, s9, 16 +; GFX6-NEXT: s_and_b32 s4, s8, 0xffff +; GFX6-NEXT: s_or_b32 s3, s3, s4 ; GFX6-NEXT: s_mov_b32 s4, -1 ; GFX6-NEXT: s_mov_b32 s5, s4 ; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5] @@ -716,18 +710,17 @@ ; GFX6-LABEL: s_orn2_v4i16_multi_use: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_lshl_b32 s0, s3, 16 -; GFX6-NEXT: s_mov_b32 s3, 0xffff -; GFX6-NEXT: s_and_b32 s1, s2, s3 +; GFX6-NEXT: s_and_b32 s1, s2, 0xffff ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: s_lshl_b32 s1, s5, 16 -; GFX6-NEXT: s_and_b32 s2, s4, s3 +; GFX6-NEXT: s_and_b32 s2, s4, 0xffff ; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: s_lshl_b32 s2, s7, 16 -; GFX6-NEXT: s_and_b32 s4, s6, s3 -; GFX6-NEXT: s_or_b32 s2, s2, s4 -; GFX6-NEXT: s_lshl_b32 s4, s9, 16 -; GFX6-NEXT: s_and_b32 s3, s8, s3 -; GFX6-NEXT: s_or_b32 s3, s4, s3 +; GFX6-NEXT: s_and_b32 s3, s6, 0xffff +; GFX6-NEXT: s_or_b32 s2, s2, s3 +; GFX6-NEXT: s_lshl_b32 s3, s9, 16 +; GFX6-NEXT: s_and_b32 s4, s8, 0xffff +; GFX6-NEXT: s_or_b32 s3, s3, s4 ; GFX6-NEXT: s_mov_b32 s4, -1 ; GFX6-NEXT: s_mov_b32 s5, s4 ; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5] @@ -766,24 +759,23 @@ define amdgpu_ps { i64, i64 } @s_orn2_v4i16_multi_foldable_use(<4 x i16> inreg %src0, <4 x i16> inreg %src1, <4 x i16> inreg %src2) { ; GFX6-LABEL: s_orn2_v4i16_multi_foldable_use: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s14, 0xffff ; GFX6-NEXT: s_lshl_b32 s0, s3, 16 -; GFX6-NEXT: s_and_b32 s1, s2, s14 +; GFX6-NEXT: s_and_b32 s1, s2, 0xffff ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: s_lshl_b32 s1, s5, 16 -; GFX6-NEXT: s_and_b32 s2, s4, s14 +; GFX6-NEXT: s_and_b32 s2, s4, 0xffff ; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: s_lshl_b32 s2, s7, 16 -; GFX6-NEXT: s_and_b32 s3, s6, s14 +; GFX6-NEXT: s_and_b32 s3, s6, 0xffff ; GFX6-NEXT: s_or_b32 s2, s2, s3 ; GFX6-NEXT: s_lshl_b32 s3, s9, 16 -; GFX6-NEXT: s_and_b32 s4, s8, s14 +; GFX6-NEXT: s_and_b32 s4, s8, 0xffff ; GFX6-NEXT: s_or_b32 s3, s3, s4 ; GFX6-NEXT: s_lshl_b32 s4, s11, 16 -; GFX6-NEXT: s_and_b32 s5, s10, s14 +; GFX6-NEXT: s_and_b32 s5, s10, 0xffff ; GFX6-NEXT: s_or_b32 s4, s4, s5 ; GFX6-NEXT: s_lshl_b32 s5, s13, 16 -; GFX6-NEXT: s_and_b32 s6, s12, s14 +; GFX6-NEXT: s_and_b32 s6, s12, 0xffff ; GFX6-NEXT: s_or_b32 s5, s5, s6 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s7, s6 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll @@ -401,13 +401,12 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX10-NEXT: v_rndne_f16_e32 v3, v0 +; GFX10-NEXT: v_rndne_f16_e32 v2, v0 ; GFX10-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-NEXT: v_rndne_f16_e32 v4, v1 +; GFX10-NEXT: v_rndne_f16_e32 v3, v1 ; GFX10-NEXT: v_rndne_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v0, v3, v2, v0 -; GFX10-NEXT: v_and_or_b32 v1, v4, v2, v1 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v2, v0 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v3, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] %roundeven = call <4 x half> @llvm.roundeven.v4f16(<4 x half> %x) ret <4 x half> %roundeven @@ -610,8 +609,8 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_brev_b32 s6, 1 -; GFX6-NEXT: s_mov_b32 s7, 0x43300000 ; GFX6-NEXT: v_and_b32_e32 v5, s6, v1 +; GFX6-NEXT: s_mov_b32 s7, 0x43300000 ; GFX6-NEXT: v_mov_b32_e32 v4, 0 ; GFX6-NEXT: v_or_b32_e32 v5, s7, v5 ; GFX6-NEXT: v_add_f64 v[6:7], v[0:1], v[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll @@ -320,12 +320,11 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_mov_b32 s4, 8 -; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX10-NEXT: v_lshrrev_b32_sdwa v3, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_lshrrev_b32_sdwa v4, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_sdwa v3, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: s_movk_i32 s4, 0xff -; GFX10-NEXT: v_and_or_b32 v0, v0, v2, v3 -; GFX10-NEXT: v_and_or_b32 v1, v1, v2, v4 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v2 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v3 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_add_i16 v0, v0, v1 clamp @@ -345,31 +344,28 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_lshr_b32 s2, s0, 8 ; GFX6-NEXT: s_lshl_b32 s0, s0, 24 -; GFX6-NEXT: s_brev_b32 s5, 1 -; GFX6-NEXT: s_min_i32 s7, s0, 0 +; GFX6-NEXT: s_min_i32 s5, s0, 0 ; GFX6-NEXT: s_lshr_b32 s3, s1, 8 ; GFX6-NEXT: s_lshl_b32 s1, s1, 24 -; GFX6-NEXT: s_brev_b32 s4, -2 -; GFX6-NEXT: s_max_i32 s6, s0, 0 -; GFX6-NEXT: s_sub_i32 s7, s5, s7 -; GFX6-NEXT: s_sub_i32 s6, s4, s6 -; GFX6-NEXT: s_max_i32 s1, s7, s1 -; GFX6-NEXT: s_min_i32 s1, s1, s6 +; GFX6-NEXT: s_max_i32 s4, s0, 0 +; GFX6-NEXT: s_sub_i32 s5, 0x80000000, s5 +; GFX6-NEXT: s_sub_i32 s4, 0x7fffffff, s4 +; GFX6-NEXT: s_max_i32 s1, s5, s1 +; GFX6-NEXT: s_min_i32 s1, s1, s4 ; GFX6-NEXT: s_add_i32 s0, s0, s1 ; GFX6-NEXT: s_lshl_b32 s1, s2, 24 +; GFX6-NEXT: s_min_i32 s4, s1, 0 ; GFX6-NEXT: s_lshl_b32 s2, s3, 24 ; GFX6-NEXT: s_max_i32 s3, s1, 0 -; GFX6-NEXT: s_sub_i32 s3, s4, s3 -; GFX6-NEXT: s_min_i32 s4, s1, 0 -; GFX6-NEXT: s_sub_i32 s4, s5, s4 +; GFX6-NEXT: s_sub_i32 s4, 0x80000000, s4 +; GFX6-NEXT: s_sub_i32 s3, 0x7fffffff, s3 ; GFX6-NEXT: s_max_i32 s2, s4, s2 ; GFX6-NEXT: s_min_i32 s2, s2, s3 ; GFX6-NEXT: s_add_i32 s1, s1, s2 ; GFX6-NEXT: s_ashr_i32 s1, s1, 24 -; GFX6-NEXT: s_movk_i32 s2, 0xff ; GFX6-NEXT: s_ashr_i32 s0, s0, 24 -; GFX6-NEXT: s_and_b32 s1, s1, s2 -; GFX6-NEXT: s_and_b32 s0, s0, s2 +; GFX6-NEXT: s_and_b32 s1, s1, 0xff +; GFX6-NEXT: s_and_b32 s0, s0, 0xff ; GFX6-NEXT: s_lshl_b32 s1, s1, 8 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: ; return to shader part epilog @@ -379,32 +375,30 @@ ; GFX8-NEXT: s_bfe_u32 s4, 8, 0x100000 ; GFX8-NEXT: s_lshr_b32 s2, s0, 8 ; GFX8-NEXT: s_lshl_b32 s0, s0, s4 -; GFX8-NEXT: s_sext_i32_i16 s7, s0 -; GFX8-NEXT: s_sext_i32_i16 s8, 0 -; GFX8-NEXT: s_movk_i32 s6, 0x8000 -; GFX8-NEXT: s_max_i32 s9, s7, s8 -; GFX8-NEXT: s_min_i32 s7, s7, s8 +; GFX8-NEXT: s_sext_i32_i16 s5, s0 +; GFX8-NEXT: s_sext_i32_i16 s6, 0 +; GFX8-NEXT: s_max_i32 s7, s5, s6 +; GFX8-NEXT: s_min_i32 s5, s5, s6 ; GFX8-NEXT: s_lshr_b32 s3, s1, 8 ; GFX8-NEXT: s_lshl_b32 s1, s1, s4 -; GFX8-NEXT: s_sub_i32 s7, s6, s7 -; GFX8-NEXT: s_movk_i32 s5, 0x7fff -; GFX8-NEXT: s_sext_i32_i16 s7, s7 +; GFX8-NEXT: s_sub_i32 s5, 0xffff8000, s5 +; GFX8-NEXT: s_sext_i32_i16 s5, s5 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_sub_i32 s9, s5, s9 -; GFX8-NEXT: s_max_i32 s1, s7, s1 +; GFX8-NEXT: s_sub_i32 s7, 0x7fff, s7 +; GFX8-NEXT: s_max_i32 s1, s5, s1 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_sext_i32_i16 s7, s9 -; GFX8-NEXT: s_min_i32 s1, s1, s7 +; GFX8-NEXT: s_sext_i32_i16 s5, s7 +; GFX8-NEXT: s_min_i32 s1, s1, s5 ; GFX8-NEXT: s_add_i32 s0, s0, s1 ; GFX8-NEXT: s_lshl_b32 s1, s2, s4 ; GFX8-NEXT: s_lshl_b32 s2, s3, s4 ; GFX8-NEXT: s_sext_i32_i16 s3, s1 -; GFX8-NEXT: s_max_i32 s7, s3, s8 -; GFX8-NEXT: s_min_i32 s3, s3, s8 -; GFX8-NEXT: s_sub_i32 s3, s6, s3 +; GFX8-NEXT: s_max_i32 s5, s3, s6 +; GFX8-NEXT: s_min_i32 s3, s3, s6 +; GFX8-NEXT: s_sub_i32 s3, 0xffff8000, s3 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 -; GFX8-NEXT: s_sub_i32 s5, s5, s7 +; GFX8-NEXT: s_sub_i32 s5, 0x7fff, s5 ; GFX8-NEXT: s_max_i32 s2, s3, s2 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 ; GFX8-NEXT: s_sext_i32_i16 s3, s5 @@ -413,10 +407,9 @@ ; GFX8-NEXT: s_sext_i32_i16 s1, s1 ; GFX8-NEXT: s_sext_i32_i16 s0, s0 ; GFX8-NEXT: s_ashr_i32 s1, s1, s4 -; GFX8-NEXT: s_movk_i32 s2, 0xff ; GFX8-NEXT: s_ashr_i32 s0, s0, s4 -; GFX8-NEXT: s_and_b32 s1, s1, s2 -; GFX8-NEXT: s_and_b32 s0, s0, s2 +; GFX8-NEXT: s_and_b32 s1, s1, 0xff +; GFX8-NEXT: s_and_b32 s0, s0, 0xff ; GFX8-NEXT: s_lshl_b32 s1, s1, s4 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog @@ -424,17 +417,16 @@ ; GFX9-LABEL: s_saddsat_v2i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_lshr_b32 s2, s0, 8 -; GFX9-NEXT: s_lshr_b32 s3, s1, 8 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX9-NEXT: s_lshr_b32 s3, s1, 8 +; GFX9-NEXT: s_lshr_b32 s2, s0, 16 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s3 -; GFX9-NEXT: s_mov_b32 s2, 0x80008 -; GFX9-NEXT: s_lshr_b32 s3, s0, 16 -; GFX9-NEXT: s_lshl_b32 s0, s0, s2 -; GFX9-NEXT: s_lshl_b32 s3, s3, 8 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s3 -; GFX9-NEXT: s_lshr_b32 s3, s1, 16 -; GFX9-NEXT: s_lshl_b32 s1, s1, s2 -; GFX9-NEXT: s_lshl_b32 s2, s3, 8 +; GFX9-NEXT: s_lshl_b32 s0, s0, 0x80008 +; GFX9-NEXT: s_lshl_b32 s2, s2, 8 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX9-NEXT: s_lshr_b32 s2, s1, 16 +; GFX9-NEXT: s_lshl_b32 s1, s1, 0x80008 +; GFX9-NEXT: s_lshl_b32 s2, s2, 8 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s2 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: v_pk_add_i16 v0, s0, v0 clamp @@ -451,15 +443,14 @@ ; GFX10-NEXT: s_lshr_b32 s3, s1, 8 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s3 -; GFX10-NEXT: s_mov_b32 s2, 0x80008 -; GFX10-NEXT: s_lshr_b32 s3, s0, 16 -; GFX10-NEXT: s_lshr_b32 s4, s1, 16 -; GFX10-NEXT: s_lshl_b32 s0, s0, s2 +; GFX10-NEXT: s_lshr_b32 s2, s0, 16 +; GFX10-NEXT: s_lshr_b32 s3, s1, 16 +; GFX10-NEXT: s_lshl_b32 s0, s0, 0x80008 +; GFX10-NEXT: s_lshl_b32 s2, s2, 8 +; GFX10-NEXT: s_lshl_b32 s1, s1, 0x80008 ; GFX10-NEXT: s_lshl_b32 s3, s3, 8 -; GFX10-NEXT: s_lshl_b32 s1, s1, s2 -; GFX10-NEXT: s_lshl_b32 s2, s4, 8 -; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s3 -; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s2 +; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s3 ; GFX10-NEXT: v_pk_add_i16 v0, s0, s1 clamp ; GFX10-NEXT: s_movk_i32 s0, 0xff ; GFX10-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] @@ -639,35 +630,33 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v1 ; GFX10-NEXT: s_mov_b32 s4, 8 -; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX10-NEXT: v_lshrrev_b32_sdwa v3, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX10-NEXT: v_lshrrev_b32_sdwa v7, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX10-NEXT: v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_sdwa v6, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX10-NEXT: v_and_or_b32 v0, v0, v2, v3 -; GFX10-NEXT: v_and_or_b32 v1, v1, v2, v7 -; GFX10-NEXT: s_movk_i32 s4, 0xff -; GFX10-NEXT: v_and_or_b32 v3, v6, v2, v4 -; GFX10-NEXT: v_and_or_b32 v2, v8, v2, v5 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v2 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v6 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v5, v3 +; GFX10-NEXT: v_and_or_b32 v3, 0xffff, v7, v4 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX10-NEXT: v_mov_b32_e32 v4, 24 -; GFX10-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1] +; GFX10-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_add_i16 v0, v0, v1 clamp -; GFX10-NEXT: v_pk_add_i16 v1, v3, v2 clamp +; GFX10-NEXT: v_pk_add_i16 v1, v2, v3 clamp ; GFX10-NEXT: v_mov_b32_e32 v2, 8 ; GFX10-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_ashrrev_i16 v1, 8, v1 op_sel_hi:[0,1] ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_b32_e32 v3, s4, v1 +; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v0, v0, s4, v2 +; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -685,59 +674,56 @@ ; GFX6-NEXT: s_lshr_b32 s3, s0, 16 ; GFX6-NEXT: s_lshr_b32 s4, s0, 24 ; GFX6-NEXT: s_lshl_b32 s0, s0, 24 -; GFX6-NEXT: s_brev_b32 s9, 1 -; GFX6-NEXT: s_min_i32 s11, s0, 0 +; GFX6-NEXT: s_min_i32 s9, s0, 0 ; GFX6-NEXT: s_lshr_b32 s5, s1, 8 ; GFX6-NEXT: s_lshr_b32 s6, s1, 16 ; GFX6-NEXT: s_lshr_b32 s7, s1, 24 ; GFX6-NEXT: s_lshl_b32 s1, s1, 24 -; GFX6-NEXT: s_brev_b32 s8, -2 -; GFX6-NEXT: s_max_i32 s10, s0, 0 -; GFX6-NEXT: s_sub_i32 s11, s9, s11 -; GFX6-NEXT: s_sub_i32 s10, s8, s10 -; GFX6-NEXT: s_max_i32 s1, s11, s1 -; GFX6-NEXT: s_min_i32 s1, s1, s10 +; GFX6-NEXT: s_max_i32 s8, s0, 0 +; GFX6-NEXT: s_sub_i32 s9, 0x80000000, s9 +; GFX6-NEXT: s_sub_i32 s8, 0x7fffffff, s8 +; GFX6-NEXT: s_max_i32 s1, s9, s1 +; GFX6-NEXT: s_min_i32 s1, s1, s8 ; GFX6-NEXT: s_add_i32 s0, s0, s1 ; GFX6-NEXT: s_lshl_b32 s1, s2, 24 -; GFX6-NEXT: s_min_i32 s10, s1, 0 +; GFX6-NEXT: s_min_i32 s8, s1, 0 ; GFX6-NEXT: s_lshl_b32 s2, s5, 24 ; GFX6-NEXT: s_max_i32 s5, s1, 0 -; GFX6-NEXT: s_sub_i32 s10, s9, s10 -; GFX6-NEXT: s_sub_i32 s5, s8, s5 -; GFX6-NEXT: s_max_i32 s2, s10, s2 +; GFX6-NEXT: s_sub_i32 s8, 0x80000000, s8 +; GFX6-NEXT: s_sub_i32 s5, 0x7fffffff, s5 +; GFX6-NEXT: s_max_i32 s2, s8, s2 ; GFX6-NEXT: s_min_i32 s2, s2, s5 ; GFX6-NEXT: s_add_i32 s1, s1, s2 ; GFX6-NEXT: s_lshl_b32 s2, s3, 24 ; GFX6-NEXT: s_lshl_b32 s3, s6, 24 ; GFX6-NEXT: s_min_i32 s6, s2, 0 ; GFX6-NEXT: s_max_i32 s5, s2, 0 -; GFX6-NEXT: s_sub_i32 s6, s9, s6 -; GFX6-NEXT: s_sub_i32 s5, s8, s5 +; GFX6-NEXT: s_sub_i32 s6, 0x80000000, s6 +; GFX6-NEXT: s_sub_i32 s5, 0x7fffffff, s5 ; GFX6-NEXT: s_max_i32 s3, s6, s3 ; GFX6-NEXT: s_min_i32 s3, s3, s5 ; GFX6-NEXT: s_add_i32 s2, s2, s3 ; GFX6-NEXT: s_lshl_b32 s3, s4, 24 ; GFX6-NEXT: s_min_i32 s6, s3, 0 +; GFX6-NEXT: s_ashr_i32 s1, s1, 24 ; GFX6-NEXT: s_lshl_b32 s4, s7, 24 ; GFX6-NEXT: s_max_i32 s5, s3, 0 -; GFX6-NEXT: s_sub_i32 s6, s9, s6 -; GFX6-NEXT: s_sub_i32 s5, s8, s5 -; GFX6-NEXT: s_max_i32 s4, s6, s4 -; GFX6-NEXT: s_min_i32 s4, s4, s5 -; GFX6-NEXT: s_ashr_i32 s1, s1, 24 -; GFX6-NEXT: s_add_i32 s3, s3, s4 -; GFX6-NEXT: s_movk_i32 s4, 0xff +; GFX6-NEXT: s_sub_i32 s6, 0x80000000, s6 ; GFX6-NEXT: s_ashr_i32 s0, s0, 24 -; GFX6-NEXT: s_and_b32 s1, s1, s4 +; GFX6-NEXT: s_sub_i32 s5, 0x7fffffff, s5 +; GFX6-NEXT: s_max_i32 s4, s6, s4 +; GFX6-NEXT: s_and_b32 s1, s1, 0xff ; GFX6-NEXT: s_ashr_i32 s2, s2, 24 -; GFX6-NEXT: s_and_b32 s0, s0, s4 +; GFX6-NEXT: s_min_i32 s4, s4, s5 +; GFX6-NEXT: s_and_b32 s0, s0, 0xff ; GFX6-NEXT: s_lshl_b32 s1, s1, 8 +; GFX6-NEXT: s_add_i32 s3, s3, s4 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s2, s4 +; GFX6-NEXT: s_and_b32 s1, s2, 0xff ; GFX6-NEXT: s_ashr_i32 s3, s3, 24 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s3, s4 +; GFX6-NEXT: s_and_b32 s1, s3, 0xff ; GFX6-NEXT: s_lshl_b32 s1, s1, 24 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: ; return to shader part epilog @@ -749,48 +735,46 @@ ; GFX8-NEXT: s_lshr_b32 s3, s0, 16 ; GFX8-NEXT: s_lshr_b32 s4, s0, 24 ; GFX8-NEXT: s_lshl_b32 s0, s0, s8 -; GFX8-NEXT: s_sext_i32_i16 s11, s0 -; GFX8-NEXT: s_sext_i32_i16 s12, 0 -; GFX8-NEXT: s_movk_i32 s10, 0x8000 -; GFX8-NEXT: s_max_i32 s13, s11, s12 -; GFX8-NEXT: s_min_i32 s11, s11, s12 +; GFX8-NEXT: s_sext_i32_i16 s9, s0 +; GFX8-NEXT: s_sext_i32_i16 s10, 0 +; GFX8-NEXT: s_max_i32 s11, s9, s10 +; GFX8-NEXT: s_min_i32 s9, s9, s10 ; GFX8-NEXT: s_lshr_b32 s5, s1, 8 ; GFX8-NEXT: s_lshr_b32 s6, s1, 16 ; GFX8-NEXT: s_lshr_b32 s7, s1, 24 ; GFX8-NEXT: s_lshl_b32 s1, s1, s8 -; GFX8-NEXT: s_sub_i32 s11, s10, s11 -; GFX8-NEXT: s_movk_i32 s9, 0x7fff -; GFX8-NEXT: s_sext_i32_i16 s11, s11 +; GFX8-NEXT: s_sub_i32 s9, 0xffff8000, s9 +; GFX8-NEXT: s_sext_i32_i16 s9, s9 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_sub_i32 s13, s9, s13 -; GFX8-NEXT: s_max_i32 s1, s11, s1 +; GFX8-NEXT: s_sub_i32 s11, 0x7fff, s11 +; GFX8-NEXT: s_max_i32 s1, s9, s1 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_sext_i32_i16 s11, s13 -; GFX8-NEXT: s_min_i32 s1, s1, s11 +; GFX8-NEXT: s_sext_i32_i16 s9, s11 +; GFX8-NEXT: s_min_i32 s1, s1, s9 ; GFX8-NEXT: s_add_i32 s0, s0, s1 ; GFX8-NEXT: s_lshl_b32 s1, s2, s8 ; GFX8-NEXT: s_lshl_b32 s2, s5, s8 ; GFX8-NEXT: s_sext_i32_i16 s5, s1 -; GFX8-NEXT: s_max_i32 s11, s5, s12 -; GFX8-NEXT: s_min_i32 s5, s5, s12 -; GFX8-NEXT: s_sub_i32 s5, s10, s5 +; GFX8-NEXT: s_max_i32 s9, s5, s10 +; GFX8-NEXT: s_min_i32 s5, s5, s10 +; GFX8-NEXT: s_sub_i32 s5, 0xffff8000, s5 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 -; GFX8-NEXT: s_sub_i32 s11, s9, s11 +; GFX8-NEXT: s_sub_i32 s9, 0x7fff, s9 ; GFX8-NEXT: s_max_i32 s2, s5, s2 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 -; GFX8-NEXT: s_sext_i32_i16 s5, s11 +; GFX8-NEXT: s_sext_i32_i16 s5, s9 ; GFX8-NEXT: s_min_i32 s2, s2, s5 ; GFX8-NEXT: s_add_i32 s1, s1, s2 ; GFX8-NEXT: s_lshl_b32 s2, s3, s8 ; GFX8-NEXT: s_sext_i32_i16 s5, s2 ; GFX8-NEXT: s_lshl_b32 s3, s6, s8 -; GFX8-NEXT: s_max_i32 s6, s5, s12 -; GFX8-NEXT: s_min_i32 s5, s5, s12 -; GFX8-NEXT: s_sub_i32 s5, s10, s5 +; GFX8-NEXT: s_max_i32 s6, s5, s10 +; GFX8-NEXT: s_min_i32 s5, s5, s10 +; GFX8-NEXT: s_sub_i32 s5, 0xffff8000, s5 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 -; GFX8-NEXT: s_sub_i32 s6, s9, s6 +; GFX8-NEXT: s_sub_i32 s6, 0x7fff, s6 ; GFX8-NEXT: s_max_i32 s3, s5, s3 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_sext_i32_i16 s5, s6 @@ -798,35 +782,34 @@ ; GFX8-NEXT: s_add_i32 s2, s2, s3 ; GFX8-NEXT: s_lshl_b32 s3, s4, s8 ; GFX8-NEXT: s_sext_i32_i16 s5, s3 -; GFX8-NEXT: s_max_i32 s6, s5, s12 -; GFX8-NEXT: s_min_i32 s5, s5, s12 +; GFX8-NEXT: s_max_i32 s6, s5, s10 +; GFX8-NEXT: s_min_i32 s5, s5, s10 ; GFX8-NEXT: s_lshl_b32 s4, s7, s8 -; GFX8-NEXT: s_sub_i32 s5, s10, s5 +; GFX8-NEXT: s_sub_i32 s5, 0xffff8000, s5 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 -; GFX8-NEXT: s_sub_i32 s6, s9, s6 -; GFX8-NEXT: s_max_i32 s4, s5, s4 -; GFX8-NEXT: s_sext_i32_i16 s4, s4 -; GFX8-NEXT: s_sext_i32_i16 s5, s6 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_min_i32 s4, s4, s5 +; GFX8-NEXT: s_sub_i32 s6, 0x7fff, s6 +; GFX8-NEXT: s_max_i32 s4, s5, s4 ; GFX8-NEXT: s_sext_i32_i16 s0, s0 ; GFX8-NEXT: s_ashr_i32 s1, s1, s8 -; GFX8-NEXT: s_add_i32 s3, s3, s4 -; GFX8-NEXT: s_movk_i32 s4, 0xff +; GFX8-NEXT: s_sext_i32_i16 s4, s4 +; GFX8-NEXT: s_sext_i32_i16 s5, s6 ; GFX8-NEXT: s_ashr_i32 s0, s0, s8 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 -; GFX8-NEXT: s_and_b32 s1, s1, s4 +; GFX8-NEXT: s_min_i32 s4, s4, s5 +; GFX8-NEXT: s_and_b32 s1, s1, 0xff ; GFX8-NEXT: s_ashr_i32 s2, s2, s8 -; GFX8-NEXT: s_and_b32 s0, s0, s4 +; GFX8-NEXT: s_add_i32 s3, s3, s4 +; GFX8-NEXT: s_and_b32 s0, s0, 0xff ; GFX8-NEXT: s_lshl_b32 s1, s1, 8 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s1, s2, s4 +; GFX8-NEXT: s_and_b32 s1, s2, 0xff ; GFX8-NEXT: s_ashr_i32 s3, s3, s8 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s1, s3, s4 +; GFX8-NEXT: s_and_b32 s1, s3, 0xff ; GFX8-NEXT: s_lshl_b32 s1, s1, 24 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog @@ -838,27 +821,26 @@ ; GFX9-NEXT: s_lshr_b32 s6, s0, 24 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s3 ; GFX9-NEXT: s_pack_ll_b32_b16 s3, s4, s6 -; GFX9-NEXT: s_mov_b32 s4, 0x80008 -; GFX9-NEXT: s_lshr_b32 s6, s0, 16 +; GFX9-NEXT: s_lshr_b32 s4, s0, 16 ; GFX9-NEXT: s_lshr_b32 s7, s1, 8 -; GFX9-NEXT: s_lshl_b32 s0, s0, s4 -; GFX9-NEXT: s_lshl_b32 s6, s6, 8 +; GFX9-NEXT: s_lshl_b32 s0, s0, 0x80008 +; GFX9-NEXT: s_lshl_b32 s4, s4, 8 ; GFX9-NEXT: s_lshr_b32 s8, s1, 16 ; GFX9-NEXT: s_lshr_b32 s9, s1, 24 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s6 -; GFX9-NEXT: s_lshr_b32 s6, s3, 16 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s4 +; GFX9-NEXT: s_lshr_b32 s4, s3, 16 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s7 -; GFX9-NEXT: s_lshl_b32 s3, s3, s4 +; GFX9-NEXT: s_lshl_b32 s3, s3, 0x80008 +; GFX9-NEXT: s_lshl_b32 s4, s4, 8 +; GFX9-NEXT: s_lshr_b32 s6, s1, 16 +; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s8, s9 +; GFX9-NEXT: s_lshl_b32 s1, s1, 0x80008 +; GFX9-NEXT: s_lshl_b32 s6, s6, 8 +; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s6 +; GFX9-NEXT: s_lshr_b32 s6, s4, 16 +; GFX9-NEXT: s_lshl_b32 s4, s4, 0x80008 ; GFX9-NEXT: s_lshl_b32 s6, s6, 8 -; GFX9-NEXT: s_lshr_b32 s7, s1, 16 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s6, s8, s9 -; GFX9-NEXT: s_lshl_b32 s1, s1, s4 -; GFX9-NEXT: s_lshl_b32 s7, s7, 8 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s7 -; GFX9-NEXT: s_lshr_b32 s7, s6, 16 -; GFX9-NEXT: s_lshl_b32 s4, s6, s4 -; GFX9-NEXT: s_lshl_b32 s6, s7, 8 ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s6 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: v_pk_add_i16 v0, s0, v0 clamp @@ -885,39 +867,37 @@ ; GFX10-NEXT: s_lshr_b32 s4, s0, 24 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 ; GFX10-NEXT: s_pack_ll_b32_b16 s2, s3, s4 -; GFX10-NEXT: s_mov_b32 s3, 0x80008 -; GFX10-NEXT: s_lshr_b32 s4, s0, 16 +; GFX10-NEXT: s_lshr_b32 s3, s0, 16 ; GFX10-NEXT: s_lshr_b32 s5, s1, 8 ; GFX10-NEXT: s_lshr_b32 s6, s1, 16 ; GFX10-NEXT: s_lshr_b32 s7, s1, 24 -; GFX10-NEXT: s_lshl_b32 s0, s0, s3 +; GFX10-NEXT: s_lshl_b32 s0, s0, 0x80008 +; GFX10-NEXT: s_lshl_b32 s3, s3, 8 +; GFX10-NEXT: s_lshr_b32 s4, s2, 16 +; GFX10-NEXT: s_lshl_b32 s2, s2, 0x80008 ; GFX10-NEXT: s_lshl_b32 s4, s4, 8 +; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s3 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5 -; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s4 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, s6, s7 -; GFX10-NEXT: s_lshr_b32 s8, s2, 16 -; GFX10-NEXT: s_lshr_b32 s5, s1, 16 -; GFX10-NEXT: s_lshr_b32 s6, s4, 16 -; GFX10-NEXT: s_lshl_b32 s2, s2, s3 -; GFX10-NEXT: s_lshl_b32 s8, s8, 8 -; GFX10-NEXT: s_lshl_b32 s1, s1, s3 +; GFX10-NEXT: s_pack_ll_b32_b16 s3, s6, s7 +; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s4 +; GFX10-NEXT: s_lshr_b32 s4, s1, 16 +; GFX10-NEXT: s_lshr_b32 s5, s3, 16 +; GFX10-NEXT: s_lshl_b32 s1, s1, 0x80008 +; GFX10-NEXT: s_lshl_b32 s4, s4, 8 +; GFX10-NEXT: s_lshl_b32 s3, s3, 0x80008 ; GFX10-NEXT: s_lshl_b32 s5, s5, 8 -; GFX10-NEXT: s_lshl_b32 s3, s4, s3 -; GFX10-NEXT: s_lshl_b32 s4, s6, 8 -; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s8 -; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5 -; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s4 +; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s5 ; GFX10-NEXT: v_pk_add_i16 v0, s0, s1 clamp ; GFX10-NEXT: v_pk_add_i16 v1, s2, s3 clamp ; GFX10-NEXT: s_mov_b32 s0, 8 -; GFX10-NEXT: s_movk_i32 s1, 0xff ; GFX10-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_ashrrev_i16 v1, 8, v1 op_sel_hi:[0,1] ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_b32_e32 v3, s1, v1 +; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v1 ; GFX10-NEXT: s_mov_b32 s0, 24 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v0, v0, s1, v2 +; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 @@ -1269,19 +1249,17 @@ define amdgpu_ps <2 x i32> @s_saddsat_v2i32(<2 x i32> inreg %lhs, <2 x i32> inreg %rhs) { ; GFX6-LABEL: s_saddsat_v2i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_brev_b32 s5, 1 -; GFX6-NEXT: s_min_i32 s7, s0, 0 -; GFX6-NEXT: s_brev_b32 s4, -2 -; GFX6-NEXT: s_max_i32 s6, s0, 0 -; GFX6-NEXT: s_sub_i32 s7, s5, s7 -; GFX6-NEXT: s_sub_i32 s6, s4, s6 -; GFX6-NEXT: s_max_i32 s2, s7, s2 -; GFX6-NEXT: s_min_i32 s2, s2, s6 +; GFX6-NEXT: s_min_i32 s5, s0, 0 +; GFX6-NEXT: s_max_i32 s4, s0, 0 +; GFX6-NEXT: s_sub_i32 s5, 0x80000000, s5 +; GFX6-NEXT: s_sub_i32 s4, 0x7fffffff, s4 +; GFX6-NEXT: s_max_i32 s2, s5, s2 +; GFX6-NEXT: s_min_i32 s2, s2, s4 +; GFX6-NEXT: s_min_i32 s4, s1, 0 ; GFX6-NEXT: s_add_i32 s0, s0, s2 ; GFX6-NEXT: s_max_i32 s2, s1, 0 -; GFX6-NEXT: s_sub_i32 s2, s4, s2 -; GFX6-NEXT: s_min_i32 s4, s1, 0 -; GFX6-NEXT: s_sub_i32 s4, s5, s4 +; GFX6-NEXT: s_sub_i32 s4, 0x80000000, s4 +; GFX6-NEXT: s_sub_i32 s2, 0x7fffffff, s2 ; GFX6-NEXT: s_max_i32 s3, s4, s3 ; GFX6-NEXT: s_min_i32 s2, s3, s2 ; GFX6-NEXT: s_add_i32 s1, s1, s2 @@ -1289,19 +1267,17 @@ ; ; GFX8-LABEL: s_saddsat_v2i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_brev_b32 s5, 1 -; GFX8-NEXT: s_min_i32 s7, s0, 0 -; GFX8-NEXT: s_brev_b32 s4, -2 -; GFX8-NEXT: s_max_i32 s6, s0, 0 -; GFX8-NEXT: s_sub_i32 s7, s5, s7 -; GFX8-NEXT: s_sub_i32 s6, s4, s6 -; GFX8-NEXT: s_max_i32 s2, s7, s2 -; GFX8-NEXT: s_min_i32 s2, s2, s6 +; GFX8-NEXT: s_min_i32 s5, s0, 0 +; GFX8-NEXT: s_max_i32 s4, s0, 0 +; GFX8-NEXT: s_sub_i32 s5, 0x80000000, s5 +; GFX8-NEXT: s_sub_i32 s4, 0x7fffffff, s4 +; GFX8-NEXT: s_max_i32 s2, s5, s2 +; GFX8-NEXT: s_min_i32 s2, s2, s4 +; GFX8-NEXT: s_min_i32 s4, s1, 0 ; GFX8-NEXT: s_add_i32 s0, s0, s2 ; GFX8-NEXT: s_max_i32 s2, s1, 0 -; GFX8-NEXT: s_sub_i32 s2, s4, s2 -; GFX8-NEXT: s_min_i32 s4, s1, 0 -; GFX8-NEXT: s_sub_i32 s4, s5, s4 +; GFX8-NEXT: s_sub_i32 s4, 0x80000000, s4 +; GFX8-NEXT: s_sub_i32 s2, 0x7fffffff, s2 ; GFX8-NEXT: s_max_i32 s3, s4, s3 ; GFX8-NEXT: s_min_i32 s2, s3, s2 ; GFX8-NEXT: s_add_i32 s1, s1, s2 @@ -1408,26 +1384,24 @@ define amdgpu_ps <3 x i32> @s_saddsat_v3i32(<3 x i32> inreg %lhs, <3 x i32> inreg %rhs) { ; GFX6-LABEL: s_saddsat_v3i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_brev_b32 s7, 1 -; GFX6-NEXT: s_min_i32 s9, s0, 0 -; GFX6-NEXT: s_brev_b32 s6, -2 -; GFX6-NEXT: s_max_i32 s8, s0, 0 -; GFX6-NEXT: s_sub_i32 s9, s7, s9 -; GFX6-NEXT: s_sub_i32 s8, s6, s8 -; GFX6-NEXT: s_max_i32 s3, s9, s3 -; GFX6-NEXT: s_min_i32 s3, s3, s8 -; GFX6-NEXT: s_min_i32 s8, s1, 0 +; GFX6-NEXT: s_min_i32 s7, s0, 0 +; GFX6-NEXT: s_max_i32 s6, s0, 0 +; GFX6-NEXT: s_sub_i32 s7, 0x80000000, s7 +; GFX6-NEXT: s_sub_i32 s6, 0x7fffffff, s6 +; GFX6-NEXT: s_max_i32 s3, s7, s3 +; GFX6-NEXT: s_min_i32 s3, s3, s6 +; GFX6-NEXT: s_min_i32 s6, s1, 0 ; GFX6-NEXT: s_add_i32 s0, s0, s3 ; GFX6-NEXT: s_max_i32 s3, s1, 0 -; GFX6-NEXT: s_sub_i32 s8, s7, s8 -; GFX6-NEXT: s_sub_i32 s3, s6, s3 -; GFX6-NEXT: s_max_i32 s4, s8, s4 +; GFX6-NEXT: s_sub_i32 s6, 0x80000000, s6 +; GFX6-NEXT: s_sub_i32 s3, 0x7fffffff, s3 +; GFX6-NEXT: s_max_i32 s4, s6, s4 ; GFX6-NEXT: s_min_i32 s3, s4, s3 ; GFX6-NEXT: s_min_i32 s4, s2, 0 ; GFX6-NEXT: s_add_i32 s1, s1, s3 ; GFX6-NEXT: s_max_i32 s3, s2, 0 -; GFX6-NEXT: s_sub_i32 s4, s7, s4 -; GFX6-NEXT: s_sub_i32 s3, s6, s3 +; GFX6-NEXT: s_sub_i32 s4, 0x80000000, s4 +; GFX6-NEXT: s_sub_i32 s3, 0x7fffffff, s3 ; GFX6-NEXT: s_max_i32 s4, s4, s5 ; GFX6-NEXT: s_min_i32 s3, s4, s3 ; GFX6-NEXT: s_add_i32 s2, s2, s3 @@ -1435,26 +1409,24 @@ ; ; GFX8-LABEL: s_saddsat_v3i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_brev_b32 s7, 1 -; GFX8-NEXT: s_min_i32 s9, s0, 0 -; GFX8-NEXT: s_brev_b32 s6, -2 -; GFX8-NEXT: s_max_i32 s8, s0, 0 -; GFX8-NEXT: s_sub_i32 s9, s7, s9 -; GFX8-NEXT: s_sub_i32 s8, s6, s8 -; GFX8-NEXT: s_max_i32 s3, s9, s3 -; GFX8-NEXT: s_min_i32 s3, s3, s8 -; GFX8-NEXT: s_min_i32 s8, s1, 0 +; GFX8-NEXT: s_min_i32 s7, s0, 0 +; GFX8-NEXT: s_max_i32 s6, s0, 0 +; GFX8-NEXT: s_sub_i32 s7, 0x80000000, s7 +; GFX8-NEXT: s_sub_i32 s6, 0x7fffffff, s6 +; GFX8-NEXT: s_max_i32 s3, s7, s3 +; GFX8-NEXT: s_min_i32 s3, s3, s6 +; GFX8-NEXT: s_min_i32 s6, s1, 0 ; GFX8-NEXT: s_add_i32 s0, s0, s3 ; GFX8-NEXT: s_max_i32 s3, s1, 0 -; GFX8-NEXT: s_sub_i32 s8, s7, s8 -; GFX8-NEXT: s_sub_i32 s3, s6, s3 -; GFX8-NEXT: s_max_i32 s4, s8, s4 +; GFX8-NEXT: s_sub_i32 s6, 0x80000000, s6 +; GFX8-NEXT: s_sub_i32 s3, 0x7fffffff, s3 +; GFX8-NEXT: s_max_i32 s4, s6, s4 ; GFX8-NEXT: s_min_i32 s3, s4, s3 ; GFX8-NEXT: s_min_i32 s4, s2, 0 ; GFX8-NEXT: s_add_i32 s1, s1, s3 ; GFX8-NEXT: s_max_i32 s3, s2, 0 -; GFX8-NEXT: s_sub_i32 s4, s7, s4 -; GFX8-NEXT: s_sub_i32 s3, s6, s3 +; GFX8-NEXT: s_sub_i32 s4, 0x80000000, s4 +; GFX8-NEXT: s_sub_i32 s3, 0x7fffffff, s3 ; GFX8-NEXT: s_max_i32 s4, s4, s5 ; GFX8-NEXT: s_min_i32 s3, s4, s3 ; GFX8-NEXT: s_add_i32 s2, s2, s3 @@ -1582,33 +1554,31 @@ define amdgpu_ps <4 x i32> @s_saddsat_v4i32(<4 x i32> inreg %lhs, <4 x i32> inreg %rhs) { ; GFX6-LABEL: s_saddsat_v4i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_brev_b32 s9, 1 -; GFX6-NEXT: s_min_i32 s11, s0, 0 -; GFX6-NEXT: s_brev_b32 s8, -2 -; GFX6-NEXT: s_max_i32 s10, s0, 0 -; GFX6-NEXT: s_sub_i32 s11, s9, s11 -; GFX6-NEXT: s_sub_i32 s10, s8, s10 -; GFX6-NEXT: s_max_i32 s4, s11, s4 -; GFX6-NEXT: s_min_i32 s4, s4, s10 -; GFX6-NEXT: s_min_i32 s10, s1, 0 +; GFX6-NEXT: s_min_i32 s9, s0, 0 +; GFX6-NEXT: s_max_i32 s8, s0, 0 +; GFX6-NEXT: s_sub_i32 s9, 0x80000000, s9 +; GFX6-NEXT: s_sub_i32 s8, 0x7fffffff, s8 +; GFX6-NEXT: s_max_i32 s4, s9, s4 +; GFX6-NEXT: s_min_i32 s4, s4, s8 +; GFX6-NEXT: s_min_i32 s8, s1, 0 ; GFX6-NEXT: s_add_i32 s0, s0, s4 ; GFX6-NEXT: s_max_i32 s4, s1, 0 -; GFX6-NEXT: s_sub_i32 s10, s9, s10 -; GFX6-NEXT: s_sub_i32 s4, s8, s4 -; GFX6-NEXT: s_max_i32 s5, s10, s5 +; GFX6-NEXT: s_sub_i32 s8, 0x80000000, s8 +; GFX6-NEXT: s_sub_i32 s4, 0x7fffffff, s4 +; GFX6-NEXT: s_max_i32 s5, s8, s5 ; GFX6-NEXT: s_min_i32 s4, s5, s4 ; GFX6-NEXT: s_min_i32 s5, s2, 0 ; GFX6-NEXT: s_add_i32 s1, s1, s4 ; GFX6-NEXT: s_max_i32 s4, s2, 0 -; GFX6-NEXT: s_sub_i32 s5, s9, s5 -; GFX6-NEXT: s_sub_i32 s4, s8, s4 +; GFX6-NEXT: s_sub_i32 s5, 0x80000000, s5 +; GFX6-NEXT: s_sub_i32 s4, 0x7fffffff, s4 ; GFX6-NEXT: s_max_i32 s5, s5, s6 ; GFX6-NEXT: s_min_i32 s4, s5, s4 ; GFX6-NEXT: s_min_i32 s5, s3, 0 ; GFX6-NEXT: s_add_i32 s2, s2, s4 ; GFX6-NEXT: s_max_i32 s4, s3, 0 -; GFX6-NEXT: s_sub_i32 s5, s9, s5 -; GFX6-NEXT: s_sub_i32 s4, s8, s4 +; GFX6-NEXT: s_sub_i32 s5, 0x80000000, s5 +; GFX6-NEXT: s_sub_i32 s4, 0x7fffffff, s4 ; GFX6-NEXT: s_max_i32 s5, s5, s7 ; GFX6-NEXT: s_min_i32 s4, s5, s4 ; GFX6-NEXT: s_add_i32 s3, s3, s4 @@ -1616,33 +1586,31 @@ ; ; GFX8-LABEL: s_saddsat_v4i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_brev_b32 s9, 1 -; GFX8-NEXT: s_min_i32 s11, s0, 0 -; GFX8-NEXT: s_brev_b32 s8, -2 -; GFX8-NEXT: s_max_i32 s10, s0, 0 -; GFX8-NEXT: s_sub_i32 s11, s9, s11 -; GFX8-NEXT: s_sub_i32 s10, s8, s10 -; GFX8-NEXT: s_max_i32 s4, s11, s4 -; GFX8-NEXT: s_min_i32 s4, s4, s10 -; GFX8-NEXT: s_min_i32 s10, s1, 0 +; GFX8-NEXT: s_min_i32 s9, s0, 0 +; GFX8-NEXT: s_max_i32 s8, s0, 0 +; GFX8-NEXT: s_sub_i32 s9, 0x80000000, s9 +; GFX8-NEXT: s_sub_i32 s8, 0x7fffffff, s8 +; GFX8-NEXT: s_max_i32 s4, s9, s4 +; GFX8-NEXT: s_min_i32 s4, s4, s8 +; GFX8-NEXT: s_min_i32 s8, s1, 0 ; GFX8-NEXT: s_add_i32 s0, s0, s4 ; GFX8-NEXT: s_max_i32 s4, s1, 0 -; GFX8-NEXT: s_sub_i32 s10, s9, s10 -; GFX8-NEXT: s_sub_i32 s4, s8, s4 -; GFX8-NEXT: s_max_i32 s5, s10, s5 +; GFX8-NEXT: s_sub_i32 s8, 0x80000000, s8 +; GFX8-NEXT: s_sub_i32 s4, 0x7fffffff, s4 +; GFX8-NEXT: s_max_i32 s5, s8, s5 ; GFX8-NEXT: s_min_i32 s4, s5, s4 ; GFX8-NEXT: s_min_i32 s5, s2, 0 ; GFX8-NEXT: s_add_i32 s1, s1, s4 ; GFX8-NEXT: s_max_i32 s4, s2, 0 -; GFX8-NEXT: s_sub_i32 s5, s9, s5 -; GFX8-NEXT: s_sub_i32 s4, s8, s4 +; GFX8-NEXT: s_sub_i32 s5, 0x80000000, s5 +; GFX8-NEXT: s_sub_i32 s4, 0x7fffffff, s4 ; GFX8-NEXT: s_max_i32 s5, s5, s6 ; GFX8-NEXT: s_min_i32 s4, s5, s4 ; GFX8-NEXT: s_min_i32 s5, s3, 0 ; GFX8-NEXT: s_add_i32 s2, s2, s4 ; GFX8-NEXT: s_max_i32 s4, s3, 0 -; GFX8-NEXT: s_sub_i32 s5, s9, s5 -; GFX8-NEXT: s_sub_i32 s4, s8, s4 +; GFX8-NEXT: s_sub_i32 s5, 0x80000000, s5 +; GFX8-NEXT: s_sub_i32 s4, 0x7fffffff, s4 ; GFX8-NEXT: s_max_i32 s5, s5, s7 ; GFX8-NEXT: s_min_i32 s4, s5, s4 ; GFX8-NEXT: s_add_i32 s3, s3, s4 @@ -1704,21 +1672,20 @@ ; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s5, v6 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s4, v5 ; GFX6-NEXT: v_max_i32_e32 v6, v6, v7 -; GFX6-NEXT: v_bfrev_b32_e32 v13, 1 ; GFX6-NEXT: v_min_i32_e32 v5, v6, v5 ; GFX6-NEXT: v_min_i32_e32 v6, 0, v3 ; GFX6-NEXT: v_bfrev_b32_e32 v11, -2 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; GFX6-NEXT: v_max_i32_e32 v5, 0, v3 -; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v13, v6 +; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 0x80000000, v6 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v11, v5 ; GFX6-NEXT: v_max_i32_e32 v6, v6, v8 ; GFX6-NEXT: v_min_i32_e32 v5, v6, v5 ; GFX6-NEXT: v_min_i32_e32 v6, 0, v4 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GFX6-NEXT: v_max_i32_e32 v5, 0, v4 -; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v13, v6 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v11, v5 +; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 0x80000000, v6 +; GFX6-NEXT: v_sub_i32_e32 v5, vcc, 0x7fffffff, v5 ; GFX6-NEXT: v_max_i32_e32 v6, v6, v9 ; GFX6-NEXT: v_min_i32_e32 v5, v6, v5 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v5 @@ -1748,21 +1715,20 @@ ; GFX8-NEXT: v_sub_u32_e32 v6, vcc, s5, v6 ; GFX8-NEXT: v_sub_u32_e32 v5, vcc, s4, v5 ; GFX8-NEXT: v_max_i32_e32 v6, v6, v7 -; GFX8-NEXT: v_bfrev_b32_e32 v13, 1 ; GFX8-NEXT: v_min_i32_e32 v5, v6, v5 ; GFX8-NEXT: v_min_i32_e32 v6, 0, v3 ; GFX8-NEXT: v_bfrev_b32_e32 v11, -2 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 ; GFX8-NEXT: v_max_i32_e32 v5, 0, v3 -; GFX8-NEXT: v_sub_u32_e32 v6, vcc, v13, v6 +; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 0x80000000, v6 ; GFX8-NEXT: v_sub_u32_e32 v5, vcc, v11, v5 ; GFX8-NEXT: v_max_i32_e32 v6, v6, v8 ; GFX8-NEXT: v_min_i32_e32 v5, v6, v5 ; GFX8-NEXT: v_min_i32_e32 v6, 0, v4 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v5 ; GFX8-NEXT: v_max_i32_e32 v5, 0, v4 -; GFX8-NEXT: v_sub_u32_e32 v6, vcc, v13, v6 -; GFX8-NEXT: v_sub_u32_e32 v5, vcc, v11, v5 +; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 0x80000000, v6 +; GFX8-NEXT: v_sub_u32_e32 v5, vcc, 0x7fffffff, v5 ; GFX8-NEXT: v_max_i32_e32 v6, v6, v9 ; GFX8-NEXT: v_min_i32_e32 v5, v6, v5 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v5 @@ -1795,40 +1761,38 @@ define amdgpu_ps <5 x i32> @s_saddsat_v5i32(<5 x i32> inreg %lhs, <5 x i32> inreg %rhs) { ; GFX6-LABEL: s_saddsat_v5i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_brev_b32 s11, 1 -; GFX6-NEXT: s_min_i32 s13, s0, 0 -; GFX6-NEXT: s_brev_b32 s10, -2 -; GFX6-NEXT: s_max_i32 s12, s0, 0 -; GFX6-NEXT: s_sub_i32 s13, s11, s13 -; GFX6-NEXT: s_sub_i32 s12, s10, s12 -; GFX6-NEXT: s_max_i32 s5, s13, s5 -; GFX6-NEXT: s_min_i32 s5, s5, s12 -; GFX6-NEXT: s_min_i32 s12, s1, 0 +; GFX6-NEXT: s_min_i32 s11, s0, 0 +; GFX6-NEXT: s_max_i32 s10, s0, 0 +; GFX6-NEXT: s_sub_i32 s11, 0x80000000, s11 +; GFX6-NEXT: s_sub_i32 s10, 0x7fffffff, s10 +; GFX6-NEXT: s_max_i32 s5, s11, s5 +; GFX6-NEXT: s_min_i32 s5, s5, s10 +; GFX6-NEXT: s_min_i32 s10, s1, 0 ; GFX6-NEXT: s_add_i32 s0, s0, s5 ; GFX6-NEXT: s_max_i32 s5, s1, 0 -; GFX6-NEXT: s_sub_i32 s12, s11, s12 -; GFX6-NEXT: s_sub_i32 s5, s10, s5 -; GFX6-NEXT: s_max_i32 s6, s12, s6 +; GFX6-NEXT: s_sub_i32 s10, 0x80000000, s10 +; GFX6-NEXT: s_sub_i32 s5, 0x7fffffff, s5 +; GFX6-NEXT: s_max_i32 s6, s10, s6 ; GFX6-NEXT: s_min_i32 s5, s6, s5 ; GFX6-NEXT: s_min_i32 s6, s2, 0 ; GFX6-NEXT: s_add_i32 s1, s1, s5 ; GFX6-NEXT: s_max_i32 s5, s2, 0 -; GFX6-NEXT: s_sub_i32 s6, s11, s6 -; GFX6-NEXT: s_sub_i32 s5, s10, s5 +; GFX6-NEXT: s_sub_i32 s6, 0x80000000, s6 +; GFX6-NEXT: s_sub_i32 s5, 0x7fffffff, s5 ; GFX6-NEXT: s_max_i32 s6, s6, s7 ; GFX6-NEXT: s_min_i32 s5, s6, s5 ; GFX6-NEXT: s_min_i32 s6, s3, 0 ; GFX6-NEXT: s_add_i32 s2, s2, s5 ; GFX6-NEXT: s_max_i32 s5, s3, 0 -; GFX6-NEXT: s_sub_i32 s6, s11, s6 -; GFX6-NEXT: s_sub_i32 s5, s10, s5 +; GFX6-NEXT: s_sub_i32 s6, 0x80000000, s6 +; GFX6-NEXT: s_sub_i32 s5, 0x7fffffff, s5 ; GFX6-NEXT: s_max_i32 s6, s6, s8 ; GFX6-NEXT: s_min_i32 s5, s6, s5 ; GFX6-NEXT: s_min_i32 s6, s4, 0 ; GFX6-NEXT: s_add_i32 s3, s3, s5 ; GFX6-NEXT: s_max_i32 s5, s4, 0 -; GFX6-NEXT: s_sub_i32 s6, s11, s6 -; GFX6-NEXT: s_sub_i32 s5, s10, s5 +; GFX6-NEXT: s_sub_i32 s6, 0x80000000, s6 +; GFX6-NEXT: s_sub_i32 s5, 0x7fffffff, s5 ; GFX6-NEXT: s_max_i32 s6, s6, s9 ; GFX6-NEXT: s_min_i32 s5, s6, s5 ; GFX6-NEXT: s_add_i32 s4, s4, s5 @@ -1836,40 +1800,38 @@ ; ; GFX8-LABEL: s_saddsat_v5i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_brev_b32 s11, 1 -; GFX8-NEXT: s_min_i32 s13, s0, 0 -; GFX8-NEXT: s_brev_b32 s10, -2 -; GFX8-NEXT: s_max_i32 s12, s0, 0 -; GFX8-NEXT: s_sub_i32 s13, s11, s13 -; GFX8-NEXT: s_sub_i32 s12, s10, s12 -; GFX8-NEXT: s_max_i32 s5, s13, s5 -; GFX8-NEXT: s_min_i32 s5, s5, s12 -; GFX8-NEXT: s_min_i32 s12, s1, 0 +; GFX8-NEXT: s_min_i32 s11, s0, 0 +; GFX8-NEXT: s_max_i32 s10, s0, 0 +; GFX8-NEXT: s_sub_i32 s11, 0x80000000, s11 +; GFX8-NEXT: s_sub_i32 s10, 0x7fffffff, s10 +; GFX8-NEXT: s_max_i32 s5, s11, s5 +; GFX8-NEXT: s_min_i32 s5, s5, s10 +; GFX8-NEXT: s_min_i32 s10, s1, 0 ; GFX8-NEXT: s_add_i32 s0, s0, s5 ; GFX8-NEXT: s_max_i32 s5, s1, 0 -; GFX8-NEXT: s_sub_i32 s12, s11, s12 -; GFX8-NEXT: s_sub_i32 s5, s10, s5 -; GFX8-NEXT: s_max_i32 s6, s12, s6 +; GFX8-NEXT: s_sub_i32 s10, 0x80000000, s10 +; GFX8-NEXT: s_sub_i32 s5, 0x7fffffff, s5 +; GFX8-NEXT: s_max_i32 s6, s10, s6 ; GFX8-NEXT: s_min_i32 s5, s6, s5 ; GFX8-NEXT: s_min_i32 s6, s2, 0 ; GFX8-NEXT: s_add_i32 s1, s1, s5 ; GFX8-NEXT: s_max_i32 s5, s2, 0 -; GFX8-NEXT: s_sub_i32 s6, s11, s6 -; GFX8-NEXT: s_sub_i32 s5, s10, s5 +; GFX8-NEXT: s_sub_i32 s6, 0x80000000, s6 +; GFX8-NEXT: s_sub_i32 s5, 0x7fffffff, s5 ; GFX8-NEXT: s_max_i32 s6, s6, s7 ; GFX8-NEXT: s_min_i32 s5, s6, s5 ; GFX8-NEXT: s_min_i32 s6, s3, 0 ; GFX8-NEXT: s_add_i32 s2, s2, s5 ; GFX8-NEXT: s_max_i32 s5, s3, 0 -; GFX8-NEXT: s_sub_i32 s6, s11, s6 -; GFX8-NEXT: s_sub_i32 s5, s10, s5 +; GFX8-NEXT: s_sub_i32 s6, 0x80000000, s6 +; GFX8-NEXT: s_sub_i32 s5, 0x7fffffff, s5 ; GFX8-NEXT: s_max_i32 s6, s6, s8 ; GFX8-NEXT: s_min_i32 s5, s6, s5 ; GFX8-NEXT: s_min_i32 s6, s4, 0 ; GFX8-NEXT: s_add_i32 s3, s3, s5 ; GFX8-NEXT: s_max_i32 s5, s4, 0 -; GFX8-NEXT: s_sub_i32 s6, s11, s6 -; GFX8-NEXT: s_sub_i32 s5, s10, s5 +; GFX8-NEXT: s_sub_i32 s6, 0x80000000, s6 +; GFX8-NEXT: s_sub_i32 s5, 0x7fffffff, s5 ; GFX8-NEXT: s_max_i32 s6, s6, s9 ; GFX8-NEXT: s_min_i32 s5, s6, s5 ; GFX8-NEXT: s_add_i32 s4, s4, s5 @@ -2211,117 +2173,115 @@ define amdgpu_ps <16 x i32> @s_saddsat_v16i32(<16 x i32> inreg %lhs, <16 x i32> inreg %rhs) { ; GFX6-LABEL: s_saddsat_v16i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_brev_b32 s33, 1 -; GFX6-NEXT: s_min_i32 s35, s0, 0 -; GFX6-NEXT: s_brev_b32 s32, -2 -; GFX6-NEXT: s_max_i32 s34, s0, 0 -; GFX6-NEXT: s_sub_i32 s35, s33, s35 -; GFX6-NEXT: s_sub_i32 s34, s32, s34 -; GFX6-NEXT: s_max_i32 s16, s35, s16 -; GFX6-NEXT: s_min_i32 s16, s16, s34 -; GFX6-NEXT: s_min_i32 s34, s1, 0 +; GFX6-NEXT: s_min_i32 s33, s0, 0 +; GFX6-NEXT: s_max_i32 s32, s0, 0 +; GFX6-NEXT: s_sub_i32 s33, 0x80000000, s33 +; GFX6-NEXT: s_sub_i32 s32, 0x7fffffff, s32 +; GFX6-NEXT: s_max_i32 s16, s33, s16 +; GFX6-NEXT: s_min_i32 s16, s16, s32 +; GFX6-NEXT: s_min_i32 s32, s1, 0 ; GFX6-NEXT: s_add_i32 s0, s0, s16 ; GFX6-NEXT: s_max_i32 s16, s1, 0 -; GFX6-NEXT: s_sub_i32 s34, s33, s34 -; GFX6-NEXT: s_sub_i32 s16, s32, s16 -; GFX6-NEXT: s_max_i32 s17, s34, s17 +; GFX6-NEXT: s_sub_i32 s32, 0x80000000, s32 +; GFX6-NEXT: s_sub_i32 s16, 0x7fffffff, s16 +; GFX6-NEXT: s_max_i32 s17, s32, s17 ; GFX6-NEXT: s_min_i32 s16, s17, s16 ; GFX6-NEXT: s_min_i32 s17, s2, 0 ; GFX6-NEXT: s_add_i32 s1, s1, s16 ; GFX6-NEXT: s_max_i32 s16, s2, 0 -; GFX6-NEXT: s_sub_i32 s17, s33, s17 -; GFX6-NEXT: s_sub_i32 s16, s32, s16 +; GFX6-NEXT: s_sub_i32 s17, 0x80000000, s17 +; GFX6-NEXT: s_sub_i32 s16, 0x7fffffff, s16 ; GFX6-NEXT: s_max_i32 s17, s17, s18 ; GFX6-NEXT: s_min_i32 s16, s17, s16 ; GFX6-NEXT: s_min_i32 s17, s3, 0 ; GFX6-NEXT: s_add_i32 s2, s2, s16 ; GFX6-NEXT: s_max_i32 s16, s3, 0 -; GFX6-NEXT: s_sub_i32 s17, s33, s17 -; GFX6-NEXT: s_sub_i32 s16, s32, s16 +; GFX6-NEXT: s_sub_i32 s17, 0x80000000, s17 +; GFX6-NEXT: s_sub_i32 s16, 0x7fffffff, s16 ; GFX6-NEXT: s_max_i32 s17, s17, s19 ; GFX6-NEXT: s_min_i32 s16, s17, s16 ; GFX6-NEXT: s_min_i32 s17, s4, 0 ; GFX6-NEXT: s_add_i32 s3, s3, s16 ; GFX6-NEXT: s_max_i32 s16, s4, 0 -; GFX6-NEXT: s_sub_i32 s17, s33, s17 -; GFX6-NEXT: s_sub_i32 s16, s32, s16 +; GFX6-NEXT: s_sub_i32 s17, 0x80000000, s17 +; GFX6-NEXT: s_sub_i32 s16, 0x7fffffff, s16 ; GFX6-NEXT: s_max_i32 s17, s17, s20 ; GFX6-NEXT: s_min_i32 s16, s17, s16 ; GFX6-NEXT: s_min_i32 s17, s5, 0 ; GFX6-NEXT: s_add_i32 s4, s4, s16 ; GFX6-NEXT: s_max_i32 s16, s5, 0 -; GFX6-NEXT: s_sub_i32 s17, s33, s17 -; GFX6-NEXT: s_sub_i32 s16, s32, s16 +; GFX6-NEXT: s_sub_i32 s17, 0x80000000, s17 +; GFX6-NEXT: s_sub_i32 s16, 0x7fffffff, s16 ; GFX6-NEXT: s_max_i32 s17, s17, s21 ; GFX6-NEXT: s_min_i32 s16, s17, s16 ; GFX6-NEXT: s_min_i32 s17, s6, 0 ; GFX6-NEXT: s_add_i32 s5, s5, s16 ; GFX6-NEXT: s_max_i32 s16, s6, 0 -; GFX6-NEXT: s_sub_i32 s17, s33, s17 -; GFX6-NEXT: s_sub_i32 s16, s32, s16 +; GFX6-NEXT: s_sub_i32 s17, 0x80000000, s17 +; GFX6-NEXT: s_sub_i32 s16, 0x7fffffff, s16 ; GFX6-NEXT: s_max_i32 s17, s17, s22 ; GFX6-NEXT: s_min_i32 s16, s17, s16 ; GFX6-NEXT: s_min_i32 s17, s7, 0 ; GFX6-NEXT: s_add_i32 s6, s6, s16 ; GFX6-NEXT: s_max_i32 s16, s7, 0 -; GFX6-NEXT: s_sub_i32 s17, s33, s17 -; GFX6-NEXT: s_sub_i32 s16, s32, s16 +; GFX6-NEXT: s_sub_i32 s17, 0x80000000, s17 +; GFX6-NEXT: s_sub_i32 s16, 0x7fffffff, s16 ; GFX6-NEXT: s_max_i32 s17, s17, s23 ; GFX6-NEXT: s_min_i32 s16, s17, s16 ; GFX6-NEXT: s_min_i32 s17, s8, 0 ; GFX6-NEXT: s_add_i32 s7, s7, s16 ; GFX6-NEXT: s_max_i32 s16, s8, 0 -; GFX6-NEXT: s_sub_i32 s17, s33, s17 -; GFX6-NEXT: s_sub_i32 s16, s32, s16 +; GFX6-NEXT: s_sub_i32 s17, 0x80000000, s17 +; GFX6-NEXT: s_sub_i32 s16, 0x7fffffff, s16 ; GFX6-NEXT: s_max_i32 s17, s17, s24 ; GFX6-NEXT: s_min_i32 s16, s17, s16 ; GFX6-NEXT: s_min_i32 s17, s9, 0 ; GFX6-NEXT: s_add_i32 s8, s8, s16 ; GFX6-NEXT: s_max_i32 s16, s9, 0 -; GFX6-NEXT: s_sub_i32 s17, s33, s17 -; GFX6-NEXT: s_sub_i32 s16, s32, s16 +; GFX6-NEXT: s_sub_i32 s17, 0x80000000, s17 +; GFX6-NEXT: s_sub_i32 s16, 0x7fffffff, s16 ; GFX6-NEXT: s_max_i32 s17, s17, s25 ; GFX6-NEXT: s_min_i32 s16, s17, s16 ; GFX6-NEXT: s_min_i32 s17, s10, 0 ; GFX6-NEXT: s_add_i32 s9, s9, s16 ; GFX6-NEXT: s_max_i32 s16, s10, 0 -; GFX6-NEXT: s_sub_i32 s17, s33, s17 -; GFX6-NEXT: s_sub_i32 s16, s32, s16 +; GFX6-NEXT: s_sub_i32 s17, 0x80000000, s17 +; GFX6-NEXT: s_sub_i32 s16, 0x7fffffff, s16 ; GFX6-NEXT: s_max_i32 s17, s17, s26 ; GFX6-NEXT: s_min_i32 s16, s17, s16 ; GFX6-NEXT: s_min_i32 s17, s11, 0 ; GFX6-NEXT: s_add_i32 s10, s10, s16 ; GFX6-NEXT: s_max_i32 s16, s11, 0 -; GFX6-NEXT: s_sub_i32 s17, s33, s17 -; GFX6-NEXT: s_sub_i32 s16, s32, s16 +; GFX6-NEXT: s_sub_i32 s17, 0x80000000, s17 +; GFX6-NEXT: s_sub_i32 s16, 0x7fffffff, s16 ; GFX6-NEXT: s_max_i32 s17, s17, s27 ; GFX6-NEXT: s_min_i32 s16, s17, s16 ; GFX6-NEXT: s_min_i32 s17, s12, 0 ; GFX6-NEXT: s_add_i32 s11, s11, s16 ; GFX6-NEXT: s_max_i32 s16, s12, 0 -; GFX6-NEXT: s_sub_i32 s17, s33, s17 -; GFX6-NEXT: s_sub_i32 s16, s32, s16 +; GFX6-NEXT: s_sub_i32 s17, 0x80000000, s17 +; GFX6-NEXT: s_sub_i32 s16, 0x7fffffff, s16 ; GFX6-NEXT: s_max_i32 s17, s17, s28 ; GFX6-NEXT: s_min_i32 s16, s17, s16 ; GFX6-NEXT: s_min_i32 s17, s13, 0 ; GFX6-NEXT: s_add_i32 s12, s12, s16 ; GFX6-NEXT: s_max_i32 s16, s13, 0 -; GFX6-NEXT: s_sub_i32 s17, s33, s17 -; GFX6-NEXT: s_sub_i32 s16, s32, s16 +; GFX6-NEXT: s_sub_i32 s17, 0x80000000, s17 +; GFX6-NEXT: s_sub_i32 s16, 0x7fffffff, s16 ; GFX6-NEXT: s_max_i32 s17, s17, s29 ; GFX6-NEXT: s_min_i32 s16, s17, s16 ; GFX6-NEXT: s_min_i32 s17, s14, 0 ; GFX6-NEXT: s_add_i32 s13, s13, s16 ; GFX6-NEXT: s_max_i32 s16, s14, 0 -; GFX6-NEXT: s_sub_i32 s17, s33, s17 -; GFX6-NEXT: s_sub_i32 s16, s32, s16 +; GFX6-NEXT: s_sub_i32 s17, 0x80000000, s17 +; GFX6-NEXT: s_sub_i32 s16, 0x7fffffff, s16 ; GFX6-NEXT: s_max_i32 s17, s17, s30 ; GFX6-NEXT: s_min_i32 s16, s17, s16 ; GFX6-NEXT: s_min_i32 s17, s15, 0 ; GFX6-NEXT: s_add_i32 s14, s14, s16 ; GFX6-NEXT: s_max_i32 s16, s15, 0 -; GFX6-NEXT: s_sub_i32 s17, s33, s17 -; GFX6-NEXT: s_sub_i32 s16, s32, s16 +; GFX6-NEXT: s_sub_i32 s17, 0x80000000, s17 +; GFX6-NEXT: s_sub_i32 s16, 0x7fffffff, s16 ; GFX6-NEXT: s_max_i32 s17, s17, s31 ; GFX6-NEXT: s_min_i32 s16, s17, s16 ; GFX6-NEXT: s_add_i32 s15, s15, s16 @@ -2329,117 +2289,115 @@ ; ; GFX8-LABEL: s_saddsat_v16i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_brev_b32 s33, 1 -; GFX8-NEXT: s_min_i32 s35, s0, 0 -; GFX8-NEXT: s_brev_b32 s32, -2 -; GFX8-NEXT: s_max_i32 s34, s0, 0 -; GFX8-NEXT: s_sub_i32 s35, s33, s35 -; GFX8-NEXT: s_sub_i32 s34, s32, s34 -; GFX8-NEXT: s_max_i32 s16, s35, s16 -; GFX8-NEXT: s_min_i32 s16, s16, s34 -; GFX8-NEXT: s_min_i32 s34, s1, 0 +; GFX8-NEXT: s_min_i32 s33, s0, 0 +; GFX8-NEXT: s_max_i32 s32, s0, 0 +; GFX8-NEXT: s_sub_i32 s33, 0x80000000, s33 +; GFX8-NEXT: s_sub_i32 s32, 0x7fffffff, s32 +; GFX8-NEXT: s_max_i32 s16, s33, s16 +; GFX8-NEXT: s_min_i32 s16, s16, s32 +; GFX8-NEXT: s_min_i32 s32, s1, 0 ; GFX8-NEXT: s_add_i32 s0, s0, s16 ; GFX8-NEXT: s_max_i32 s16, s1, 0 -; GFX8-NEXT: s_sub_i32 s34, s33, s34 -; GFX8-NEXT: s_sub_i32 s16, s32, s16 -; GFX8-NEXT: s_max_i32 s17, s34, s17 +; GFX8-NEXT: s_sub_i32 s32, 0x80000000, s32 +; GFX8-NEXT: s_sub_i32 s16, 0x7fffffff, s16 +; GFX8-NEXT: s_max_i32 s17, s32, s17 ; GFX8-NEXT: s_min_i32 s16, s17, s16 ; GFX8-NEXT: s_min_i32 s17, s2, 0 ; GFX8-NEXT: s_add_i32 s1, s1, s16 ; GFX8-NEXT: s_max_i32 s16, s2, 0 -; GFX8-NEXT: s_sub_i32 s17, s33, s17 -; GFX8-NEXT: s_sub_i32 s16, s32, s16 +; GFX8-NEXT: s_sub_i32 s17, 0x80000000, s17 +; GFX8-NEXT: s_sub_i32 s16, 0x7fffffff, s16 ; GFX8-NEXT: s_max_i32 s17, s17, s18 ; GFX8-NEXT: s_min_i32 s16, s17, s16 ; GFX8-NEXT: s_min_i32 s17, s3, 0 ; GFX8-NEXT: s_add_i32 s2, s2, s16 ; GFX8-NEXT: s_max_i32 s16, s3, 0 -; GFX8-NEXT: s_sub_i32 s17, s33, s17 -; GFX8-NEXT: s_sub_i32 s16, s32, s16 +; GFX8-NEXT: s_sub_i32 s17, 0x80000000, s17 +; GFX8-NEXT: s_sub_i32 s16, 0x7fffffff, s16 ; GFX8-NEXT: s_max_i32 s17, s17, s19 ; GFX8-NEXT: s_min_i32 s16, s17, s16 ; GFX8-NEXT: s_min_i32 s17, s4, 0 ; GFX8-NEXT: s_add_i32 s3, s3, s16 ; GFX8-NEXT: s_max_i32 s16, s4, 0 -; GFX8-NEXT: s_sub_i32 s17, s33, s17 -; GFX8-NEXT: s_sub_i32 s16, s32, s16 +; GFX8-NEXT: s_sub_i32 s17, 0x80000000, s17 +; GFX8-NEXT: s_sub_i32 s16, 0x7fffffff, s16 ; GFX8-NEXT: s_max_i32 s17, s17, s20 ; GFX8-NEXT: s_min_i32 s16, s17, s16 ; GFX8-NEXT: s_min_i32 s17, s5, 0 ; GFX8-NEXT: s_add_i32 s4, s4, s16 ; GFX8-NEXT: s_max_i32 s16, s5, 0 -; GFX8-NEXT: s_sub_i32 s17, s33, s17 -; GFX8-NEXT: s_sub_i32 s16, s32, s16 +; GFX8-NEXT: s_sub_i32 s17, 0x80000000, s17 +; GFX8-NEXT: s_sub_i32 s16, 0x7fffffff, s16 ; GFX8-NEXT: s_max_i32 s17, s17, s21 ; GFX8-NEXT: s_min_i32 s16, s17, s16 ; GFX8-NEXT: s_min_i32 s17, s6, 0 ; GFX8-NEXT: s_add_i32 s5, s5, s16 ; GFX8-NEXT: s_max_i32 s16, s6, 0 -; GFX8-NEXT: s_sub_i32 s17, s33, s17 -; GFX8-NEXT: s_sub_i32 s16, s32, s16 +; GFX8-NEXT: s_sub_i32 s17, 0x80000000, s17 +; GFX8-NEXT: s_sub_i32 s16, 0x7fffffff, s16 ; GFX8-NEXT: s_max_i32 s17, s17, s22 ; GFX8-NEXT: s_min_i32 s16, s17, s16 ; GFX8-NEXT: s_min_i32 s17, s7, 0 ; GFX8-NEXT: s_add_i32 s6, s6, s16 ; GFX8-NEXT: s_max_i32 s16, s7, 0 -; GFX8-NEXT: s_sub_i32 s17, s33, s17 -; GFX8-NEXT: s_sub_i32 s16, s32, s16 +; GFX8-NEXT: s_sub_i32 s17, 0x80000000, s17 +; GFX8-NEXT: s_sub_i32 s16, 0x7fffffff, s16 ; GFX8-NEXT: s_max_i32 s17, s17, s23 ; GFX8-NEXT: s_min_i32 s16, s17, s16 ; GFX8-NEXT: s_min_i32 s17, s8, 0 ; GFX8-NEXT: s_add_i32 s7, s7, s16 ; GFX8-NEXT: s_max_i32 s16, s8, 0 -; GFX8-NEXT: s_sub_i32 s17, s33, s17 -; GFX8-NEXT: s_sub_i32 s16, s32, s16 +; GFX8-NEXT: s_sub_i32 s17, 0x80000000, s17 +; GFX8-NEXT: s_sub_i32 s16, 0x7fffffff, s16 ; GFX8-NEXT: s_max_i32 s17, s17, s24 ; GFX8-NEXT: s_min_i32 s16, s17, s16 ; GFX8-NEXT: s_min_i32 s17, s9, 0 ; GFX8-NEXT: s_add_i32 s8, s8, s16 ; GFX8-NEXT: s_max_i32 s16, s9, 0 -; GFX8-NEXT: s_sub_i32 s17, s33, s17 -; GFX8-NEXT: s_sub_i32 s16, s32, s16 +; GFX8-NEXT: s_sub_i32 s17, 0x80000000, s17 +; GFX8-NEXT: s_sub_i32 s16, 0x7fffffff, s16 ; GFX8-NEXT: s_max_i32 s17, s17, s25 ; GFX8-NEXT: s_min_i32 s16, s17, s16 ; GFX8-NEXT: s_min_i32 s17, s10, 0 ; GFX8-NEXT: s_add_i32 s9, s9, s16 ; GFX8-NEXT: s_max_i32 s16, s10, 0 -; GFX8-NEXT: s_sub_i32 s17, s33, s17 -; GFX8-NEXT: s_sub_i32 s16, s32, s16 +; GFX8-NEXT: s_sub_i32 s17, 0x80000000, s17 +; GFX8-NEXT: s_sub_i32 s16, 0x7fffffff, s16 ; GFX8-NEXT: s_max_i32 s17, s17, s26 ; GFX8-NEXT: s_min_i32 s16, s17, s16 ; GFX8-NEXT: s_min_i32 s17, s11, 0 ; GFX8-NEXT: s_add_i32 s10, s10, s16 ; GFX8-NEXT: s_max_i32 s16, s11, 0 -; GFX8-NEXT: s_sub_i32 s17, s33, s17 -; GFX8-NEXT: s_sub_i32 s16, s32, s16 +; GFX8-NEXT: s_sub_i32 s17, 0x80000000, s17 +; GFX8-NEXT: s_sub_i32 s16, 0x7fffffff, s16 ; GFX8-NEXT: s_max_i32 s17, s17, s27 ; GFX8-NEXT: s_min_i32 s16, s17, s16 ; GFX8-NEXT: s_min_i32 s17, s12, 0 ; GFX8-NEXT: s_add_i32 s11, s11, s16 ; GFX8-NEXT: s_max_i32 s16, s12, 0 -; GFX8-NEXT: s_sub_i32 s17, s33, s17 -; GFX8-NEXT: s_sub_i32 s16, s32, s16 +; GFX8-NEXT: s_sub_i32 s17, 0x80000000, s17 +; GFX8-NEXT: s_sub_i32 s16, 0x7fffffff, s16 ; GFX8-NEXT: s_max_i32 s17, s17, s28 ; GFX8-NEXT: s_min_i32 s16, s17, s16 ; GFX8-NEXT: s_min_i32 s17, s13, 0 ; GFX8-NEXT: s_add_i32 s12, s12, s16 ; GFX8-NEXT: s_max_i32 s16, s13, 0 -; GFX8-NEXT: s_sub_i32 s17, s33, s17 -; GFX8-NEXT: s_sub_i32 s16, s32, s16 +; GFX8-NEXT: s_sub_i32 s17, 0x80000000, s17 +; GFX8-NEXT: s_sub_i32 s16, 0x7fffffff, s16 ; GFX8-NEXT: s_max_i32 s17, s17, s29 ; GFX8-NEXT: s_min_i32 s16, s17, s16 ; GFX8-NEXT: s_min_i32 s17, s14, 0 ; GFX8-NEXT: s_add_i32 s13, s13, s16 ; GFX8-NEXT: s_max_i32 s16, s14, 0 -; GFX8-NEXT: s_sub_i32 s17, s33, s17 -; GFX8-NEXT: s_sub_i32 s16, s32, s16 +; GFX8-NEXT: s_sub_i32 s17, 0x80000000, s17 +; GFX8-NEXT: s_sub_i32 s16, 0x7fffffff, s16 ; GFX8-NEXT: s_max_i32 s17, s17, s30 ; GFX8-NEXT: s_min_i32 s16, s17, s16 ; GFX8-NEXT: s_min_i32 s17, s15, 0 ; GFX8-NEXT: s_add_i32 s14, s14, s16 ; GFX8-NEXT: s_max_i32 s16, s15, 0 -; GFX8-NEXT: s_sub_i32 s17, s33, s17 -; GFX8-NEXT: s_sub_i32 s16, s32, s16 +; GFX8-NEXT: s_sub_i32 s17, 0x80000000, s17 +; GFX8-NEXT: s_sub_i32 s16, 0x7fffffff, s16 ; GFX8-NEXT: s_max_i32 s17, s17, s31 ; GFX8-NEXT: s_min_i32 s16, s17, s16 ; GFX8-NEXT: s_add_i32 s15, s15, s16 @@ -2781,60 +2739,55 @@ ; GFX6-LABEL: s_saddsat_v2i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_lshl_b32 s0, s0, 16 -; GFX6-NEXT: s_brev_b32 s5, 1 -; GFX6-NEXT: s_min_i32 s7, s0, 0 +; GFX6-NEXT: s_min_i32 s5, s0, 0 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_brev_b32 s4, -2 -; GFX6-NEXT: s_max_i32 s6, s0, 0 -; GFX6-NEXT: s_sub_i32 s7, s5, s7 -; GFX6-NEXT: s_sub_i32 s6, s4, s6 -; GFX6-NEXT: s_max_i32 s2, s7, s2 -; GFX6-NEXT: s_min_i32 s2, s2, s6 +; GFX6-NEXT: s_max_i32 s4, s0, 0 +; GFX6-NEXT: s_sub_i32 s5, 0x80000000, s5 +; GFX6-NEXT: s_sub_i32 s4, 0x7fffffff, s4 +; GFX6-NEXT: s_max_i32 s2, s5, s2 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 +; GFX6-NEXT: s_min_i32 s2, s2, s4 +; GFX6-NEXT: s_min_i32 s4, s1, 0 ; GFX6-NEXT: s_add_i32 s0, s0, s2 ; GFX6-NEXT: s_lshl_b32 s2, s3, 16 ; GFX6-NEXT: s_max_i32 s3, s1, 0 -; GFX6-NEXT: s_sub_i32 s3, s4, s3 -; GFX6-NEXT: s_min_i32 s4, s1, 0 -; GFX6-NEXT: s_sub_i32 s4, s5, s4 +; GFX6-NEXT: s_sub_i32 s4, 0x80000000, s4 +; GFX6-NEXT: s_sub_i32 s3, 0x7fffffff, s3 ; GFX6-NEXT: s_max_i32 s2, s4, s2 ; GFX6-NEXT: s_min_i32 s2, s2, s3 ; GFX6-NEXT: s_add_i32 s1, s1, s2 ; GFX6-NEXT: s_ashr_i32 s1, s1, 16 -; GFX6-NEXT: s_mov_b32 s2, 0xffff ; GFX6-NEXT: s_ashr_i32 s0, s0, 16 -; GFX6-NEXT: s_and_b32 s1, s1, s2 -; GFX6-NEXT: s_and_b32 s0, s0, s2 +; GFX6-NEXT: s_and_b32 s1, s1, 0xffff +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_saddsat_v2i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_sext_i32_i16 s6, s0 -; GFX8-NEXT: s_sext_i32_i16 s7, 0 -; GFX8-NEXT: s_movk_i32 s5, 0x8000 -; GFX8-NEXT: s_max_i32 s8, s6, s7 -; GFX8-NEXT: s_min_i32 s6, s6, s7 -; GFX8-NEXT: s_sub_i32 s6, s5, s6 +; GFX8-NEXT: s_sext_i32_i16 s4, s0 +; GFX8-NEXT: s_sext_i32_i16 s5, 0 +; GFX8-NEXT: s_max_i32 s6, s4, s5 +; GFX8-NEXT: s_min_i32 s4, s4, s5 +; GFX8-NEXT: s_sub_i32 s4, 0xffff8000, s4 ; GFX8-NEXT: s_lshr_b32 s3, s1, 16 -; GFX8-NEXT: s_movk_i32 s4, 0x7fff -; GFX8-NEXT: s_sext_i32_i16 s6, s6 +; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_sub_i32 s8, s4, s8 -; GFX8-NEXT: s_max_i32 s1, s6, s1 +; GFX8-NEXT: s_sub_i32 s6, 0x7fff, s6 +; GFX8-NEXT: s_max_i32 s1, s4, s1 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_sext_i32_i16 s6, s8 +; GFX8-NEXT: s_sext_i32_i16 s4, s6 ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 -; GFX8-NEXT: s_min_i32 s1, s1, s6 +; GFX8-NEXT: s_min_i32 s1, s1, s4 ; GFX8-NEXT: s_add_i32 s0, s0, s1 ; GFX8-NEXT: s_sext_i32_i16 s1, s2 -; GFX8-NEXT: s_max_i32 s6, s1, s7 -; GFX8-NEXT: s_min_i32 s1, s1, s7 -; GFX8-NEXT: s_sub_i32 s1, s5, s1 +; GFX8-NEXT: s_max_i32 s4, s1, s5 +; GFX8-NEXT: s_min_i32 s1, s1, s5 +; GFX8-NEXT: s_sub_i32 s1, 0xffff8000, s1 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 -; GFX8-NEXT: s_sub_i32 s4, s4, s6 +; GFX8-NEXT: s_sub_i32 s4, 0x7fff, s4 ; GFX8-NEXT: s_max_i32 s1, s1, s3 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 ; GFX8-NEXT: s_sext_i32_i16 s3, s4 @@ -2867,22 +2820,20 @@ ; GFX6-LABEL: saddsat_v2i16_sv: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_lshl_b32 s0, s0, 16 -; GFX6-NEXT: s_brev_b32 s3, 1 -; GFX6-NEXT: s_min_i32 s5, s0, 0 +; GFX6-NEXT: s_min_i32 s3, s0, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: s_brev_b32 s2, -2 -; GFX6-NEXT: s_max_i32 s4, s0, 0 -; GFX6-NEXT: s_sub_i32 s5, s3, s5 -; GFX6-NEXT: s_sub_i32 s4, s2, s4 -; GFX6-NEXT: v_max_i32_e32 v0, s5, v0 -; GFX6-NEXT: v_min_i32_e32 v0, s4, v0 +; GFX6-NEXT: s_max_i32 s2, s0, 0 +; GFX6-NEXT: s_sub_i32 s3, 0x80000000, s3 +; GFX6-NEXT: s_sub_i32 s2, 0x7fffffff, s2 +; GFX6-NEXT: v_max_i32_e32 v0, s3, v0 +; GFX6-NEXT: v_min_i32_e32 v0, s2, v0 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0 ; GFX6-NEXT: s_lshl_b32 s0, s1, 16 -; GFX6-NEXT: s_max_i32 s1, s0, 0 -; GFX6-NEXT: s_sub_i32 s1, s2, s1 ; GFX6-NEXT: s_min_i32 s2, s0, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: s_sub_i32 s2, s3, s2 +; GFX6-NEXT: s_max_i32 s1, s0, 0 +; GFX6-NEXT: s_sub_i32 s2, 0x80000000, s2 +; GFX6-NEXT: s_sub_i32 s1, 0x7fffffff, s1 ; GFX6-NEXT: v_max_i32_e32 v1, s2, v1 ; GFX6-NEXT: v_min_i32_e32 v1, s1, v1 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, s0, v1 @@ -2897,25 +2848,23 @@ ; ; GFX8-LABEL: saddsat_v2i16_sv: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_sext_i32_i16 s4, s0 -; GFX8-NEXT: s_sext_i32_i16 s5, 0 -; GFX8-NEXT: s_movk_i32 s3, 0x8000 -; GFX8-NEXT: s_max_i32 s6, s4, s5 -; GFX8-NEXT: s_min_i32 s4, s4, s5 +; GFX8-NEXT: s_sext_i32_i16 s2, s0 +; GFX8-NEXT: s_sext_i32_i16 s3, 0 +; GFX8-NEXT: s_max_i32 s4, s2, s3 +; GFX8-NEXT: s_min_i32 s2, s2, s3 ; GFX8-NEXT: s_lshr_b32 s1, s0, 16 -; GFX8-NEXT: s_movk_i32 s2, 0x7fff -; GFX8-NEXT: s_sub_i32 s4, s3, s4 -; GFX8-NEXT: s_sub_i32 s6, s2, s6 -; GFX8-NEXT: v_max_i16_e32 v1, s4, v0 -; GFX8-NEXT: s_sext_i32_i16 s4, s1 -; GFX8-NEXT: v_min_i16_e32 v1, s6, v1 -; GFX8-NEXT: s_max_i32 s6, s4, s5 -; GFX8-NEXT: s_min_i32 s4, s4, s5 -; GFX8-NEXT: s_sub_i32 s3, s3, s4 -; GFX8-NEXT: v_mov_b32_e32 v2, s3 -; GFX8-NEXT: s_sub_i32 s2, s2, s6 +; GFX8-NEXT: s_sub_i32 s2, 0xffff8000, s2 +; GFX8-NEXT: s_sub_i32 s4, 0x7fff, s4 +; GFX8-NEXT: v_max_i16_e32 v1, s2, v0 +; GFX8-NEXT: s_sext_i32_i16 s2, s1 +; GFX8-NEXT: v_min_i16_e32 v1, s4, v1 +; GFX8-NEXT: s_max_i32 s4, s2, s3 +; GFX8-NEXT: s_min_i32 s2, s2, s3 +; GFX8-NEXT: s_sub_i32 s2, 0xffff8000, s2 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: s_sub_i32 s4, 0x7fff, s4 ; GFX8-NEXT: v_max_i16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_min_i16_e32 v0, s2, v0 +; GFX8-NEXT: v_min_i16_e32 v0, s4, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, s1 ; GFX8-NEXT: v_add_u16_e32 v1, s0, v1 ; GFX8-NEXT: v_add_u16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -3137,31 +3086,29 @@ ; GFX6-LABEL: s_saddsat_v4i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_lshl_b32 s0, s0, 16 -; GFX6-NEXT: s_brev_b32 s9, 1 -; GFX6-NEXT: s_min_i32 s11, s0, 0 +; GFX6-NEXT: s_min_i32 s9, s0, 0 ; GFX6-NEXT: s_lshl_b32 s4, s4, 16 -; GFX6-NEXT: s_brev_b32 s8, -2 -; GFX6-NEXT: s_max_i32 s10, s0, 0 -; GFX6-NEXT: s_sub_i32 s11, s9, s11 -; GFX6-NEXT: s_sub_i32 s10, s8, s10 -; GFX6-NEXT: s_max_i32 s4, s11, s4 +; GFX6-NEXT: s_max_i32 s8, s0, 0 +; GFX6-NEXT: s_sub_i32 s9, 0x80000000, s9 +; GFX6-NEXT: s_sub_i32 s8, 0x7fffffff, s8 +; GFX6-NEXT: s_max_i32 s4, s9, s4 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 -; GFX6-NEXT: s_min_i32 s4, s4, s10 -; GFX6-NEXT: s_min_i32 s10, s1, 0 +; GFX6-NEXT: s_min_i32 s4, s4, s8 +; GFX6-NEXT: s_min_i32 s8, s1, 0 ; GFX6-NEXT: s_add_i32 s0, s0, s4 ; GFX6-NEXT: s_lshl_b32 s4, s5, 16 ; GFX6-NEXT: s_max_i32 s5, s1, 0 -; GFX6-NEXT: s_sub_i32 s10, s9, s10 -; GFX6-NEXT: s_sub_i32 s5, s8, s5 -; GFX6-NEXT: s_max_i32 s4, s10, s4 +; GFX6-NEXT: s_sub_i32 s8, 0x80000000, s8 +; GFX6-NEXT: s_sub_i32 s5, 0x7fffffff, s5 +; GFX6-NEXT: s_max_i32 s4, s8, s4 ; GFX6-NEXT: s_min_i32 s4, s4, s5 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 ; GFX6-NEXT: s_add_i32 s1, s1, s4 ; GFX6-NEXT: s_lshl_b32 s4, s6, 16 ; GFX6-NEXT: s_min_i32 s6, s2, 0 ; GFX6-NEXT: s_max_i32 s5, s2, 0 -; GFX6-NEXT: s_sub_i32 s6, s9, s6 -; GFX6-NEXT: s_sub_i32 s5, s8, s5 +; GFX6-NEXT: s_sub_i32 s6, 0x80000000, s6 +; GFX6-NEXT: s_sub_i32 s5, 0x7fffffff, s5 ; GFX6-NEXT: s_max_i32 s4, s6, s4 ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 ; GFX6-NEXT: s_min_i32 s4, s4, s5 @@ -3169,65 +3116,62 @@ ; GFX6-NEXT: s_add_i32 s2, s2, s4 ; GFX6-NEXT: s_lshl_b32 s4, s7, 16 ; GFX6-NEXT: s_max_i32 s5, s3, 0 -; GFX6-NEXT: s_sub_i32 s6, s9, s6 -; GFX6-NEXT: s_sub_i32 s5, s8, s5 +; GFX6-NEXT: s_sub_i32 s6, 0x80000000, s6 +; GFX6-NEXT: s_sub_i32 s5, 0x7fffffff, s5 ; GFX6-NEXT: s_max_i32 s4, s6, s4 -; GFX6-NEXT: s_min_i32 s4, s4, s5 ; GFX6-NEXT: s_ashr_i32 s1, s1, 16 -; GFX6-NEXT: s_add_i32 s3, s3, s4 -; GFX6-NEXT: s_mov_b32 s4, 0xffff +; GFX6-NEXT: s_min_i32 s4, s4, s5 ; GFX6-NEXT: s_ashr_i32 s0, s0, 16 -; GFX6-NEXT: s_and_b32 s1, s1, s4 +; GFX6-NEXT: s_add_i32 s3, s3, s4 +; GFX6-NEXT: s_and_b32 s1, s1, 0xffff ; GFX6-NEXT: s_ashr_i32 s2, s2, 16 ; GFX6-NEXT: s_ashr_i32 s3, s3, 16 -; GFX6-NEXT: s_and_b32 s0, s0, s4 +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s2, s4 -; GFX6-NEXT: s_and_b32 s2, s3, s4 +; GFX6-NEXT: s_and_b32 s1, s2, 0xffff +; GFX6-NEXT: s_and_b32 s2, s3, 0xffff ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 ; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_saddsat_v4i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_sext_i32_i16 s10, s0 -; GFX8-NEXT: s_sext_i32_i16 s11, 0 -; GFX8-NEXT: s_movk_i32 s9, 0x8000 -; GFX8-NEXT: s_max_i32 s12, s10, s11 -; GFX8-NEXT: s_min_i32 s10, s10, s11 -; GFX8-NEXT: s_sub_i32 s10, s9, s10 +; GFX8-NEXT: s_sext_i32_i16 s8, s0 +; GFX8-NEXT: s_sext_i32_i16 s9, 0 +; GFX8-NEXT: s_max_i32 s10, s8, s9 +; GFX8-NEXT: s_min_i32 s8, s8, s9 +; GFX8-NEXT: s_sub_i32 s8, 0xffff8000, s8 ; GFX8-NEXT: s_lshr_b32 s6, s2, 16 -; GFX8-NEXT: s_movk_i32 s8, 0x7fff -; GFX8-NEXT: s_sext_i32_i16 s10, s10 +; GFX8-NEXT: s_sext_i32_i16 s8, s8 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 -; GFX8-NEXT: s_sub_i32 s12, s8, s12 -; GFX8-NEXT: s_max_i32 s2, s10, s2 +; GFX8-NEXT: s_sub_i32 s10, 0x7fff, s10 +; GFX8-NEXT: s_max_i32 s2, s8, s2 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 -; GFX8-NEXT: s_sext_i32_i16 s10, s12 +; GFX8-NEXT: s_sext_i32_i16 s8, s10 ; GFX8-NEXT: s_lshr_b32 s4, s0, 16 -; GFX8-NEXT: s_min_i32 s2, s2, s10 +; GFX8-NEXT: s_min_i32 s2, s2, s8 ; GFX8-NEXT: s_add_i32 s0, s0, s2 ; GFX8-NEXT: s_sext_i32_i16 s2, s4 -; GFX8-NEXT: s_max_i32 s10, s2, s11 -; GFX8-NEXT: s_min_i32 s2, s2, s11 -; GFX8-NEXT: s_sub_i32 s2, s9, s2 +; GFX8-NEXT: s_max_i32 s8, s2, s9 +; GFX8-NEXT: s_min_i32 s2, s2, s9 +; GFX8-NEXT: s_sub_i32 s2, 0xffff8000, s2 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 -; GFX8-NEXT: s_sub_i32 s10, s8, s10 +; GFX8-NEXT: s_sub_i32 s8, 0x7fff, s8 ; GFX8-NEXT: s_max_i32 s2, s2, s6 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 -; GFX8-NEXT: s_sext_i32_i16 s6, s10 +; GFX8-NEXT: s_sext_i32_i16 s6, s8 ; GFX8-NEXT: s_min_i32 s2, s2, s6 ; GFX8-NEXT: s_add_i32 s4, s4, s2 ; GFX8-NEXT: s_sext_i32_i16 s2, s1 -; GFX8-NEXT: s_max_i32 s6, s2, s11 -; GFX8-NEXT: s_min_i32 s2, s2, s11 -; GFX8-NEXT: s_sub_i32 s2, s9, s2 +; GFX8-NEXT: s_max_i32 s6, s2, s9 +; GFX8-NEXT: s_min_i32 s2, s2, s9 +; GFX8-NEXT: s_sub_i32 s2, 0xffff8000, s2 ; GFX8-NEXT: s_lshr_b32 s7, s3, 16 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 -; GFX8-NEXT: s_sub_i32 s6, s8, s6 +; GFX8-NEXT: s_sub_i32 s6, 0x7fff, s6 ; GFX8-NEXT: s_max_i32 s2, s2, s3 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 ; GFX8-NEXT: s_sext_i32_i16 s3, s6 @@ -3235,12 +3179,12 @@ ; GFX8-NEXT: s_min_i32 s2, s2, s3 ; GFX8-NEXT: s_add_i32 s1, s1, s2 ; GFX8-NEXT: s_sext_i32_i16 s2, s5 -; GFX8-NEXT: s_max_i32 s3, s2, s11 -; GFX8-NEXT: s_min_i32 s2, s2, s11 -; GFX8-NEXT: s_sub_i32 s2, s9, s2 +; GFX8-NEXT: s_max_i32 s3, s2, s9 +; GFX8-NEXT: s_min_i32 s2, s2, s9 +; GFX8-NEXT: s_sub_i32 s2, 0xffff8000, s2 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 ; GFX8-NEXT: s_sext_i32_i16 s6, s7 -; GFX8-NEXT: s_sub_i32 s3, s8, s3 +; GFX8-NEXT: s_sub_i32 s3, 0x7fff, s3 ; GFX8-NEXT: s_max_i32 s2, s2, s6 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 @@ -3454,31 +3398,29 @@ ; GFX6-LABEL: s_saddsat_v6i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_lshl_b32 s0, s0, 16 -; GFX6-NEXT: s_brev_b32 s13, 1 -; GFX6-NEXT: s_min_i32 s15, s0, 0 +; GFX6-NEXT: s_min_i32 s13, s0, 0 ; GFX6-NEXT: s_lshl_b32 s6, s6, 16 -; GFX6-NEXT: s_brev_b32 s12, -2 -; GFX6-NEXT: s_max_i32 s14, s0, 0 -; GFX6-NEXT: s_sub_i32 s15, s13, s15 -; GFX6-NEXT: s_sub_i32 s14, s12, s14 -; GFX6-NEXT: s_max_i32 s6, s15, s6 +; GFX6-NEXT: s_max_i32 s12, s0, 0 +; GFX6-NEXT: s_sub_i32 s13, 0x80000000, s13 +; GFX6-NEXT: s_sub_i32 s12, 0x7fffffff, s12 +; GFX6-NEXT: s_max_i32 s6, s13, s6 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 -; GFX6-NEXT: s_min_i32 s6, s6, s14 -; GFX6-NEXT: s_min_i32 s14, s1, 0 +; GFX6-NEXT: s_min_i32 s6, s6, s12 +; GFX6-NEXT: s_min_i32 s12, s1, 0 ; GFX6-NEXT: s_add_i32 s0, s0, s6 ; GFX6-NEXT: s_lshl_b32 s6, s7, 16 ; GFX6-NEXT: s_max_i32 s7, s1, 0 -; GFX6-NEXT: s_sub_i32 s14, s13, s14 -; GFX6-NEXT: s_sub_i32 s7, s12, s7 -; GFX6-NEXT: s_max_i32 s6, s14, s6 +; GFX6-NEXT: s_sub_i32 s12, 0x80000000, s12 +; GFX6-NEXT: s_sub_i32 s7, 0x7fffffff, s7 +; GFX6-NEXT: s_max_i32 s6, s12, s6 ; GFX6-NEXT: s_min_i32 s6, s6, s7 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 ; GFX6-NEXT: s_add_i32 s1, s1, s6 ; GFX6-NEXT: s_lshl_b32 s6, s8, 16 ; GFX6-NEXT: s_min_i32 s8, s2, 0 ; GFX6-NEXT: s_max_i32 s7, s2, 0 -; GFX6-NEXT: s_sub_i32 s8, s13, s8 -; GFX6-NEXT: s_sub_i32 s7, s12, s7 +; GFX6-NEXT: s_sub_i32 s8, 0x80000000, s8 +; GFX6-NEXT: s_sub_i32 s7, 0x7fffffff, s7 ; GFX6-NEXT: s_max_i32 s6, s8, s6 ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 ; GFX6-NEXT: s_min_i32 s6, s6, s7 @@ -3486,8 +3428,8 @@ ; GFX6-NEXT: s_add_i32 s2, s2, s6 ; GFX6-NEXT: s_lshl_b32 s6, s9, 16 ; GFX6-NEXT: s_max_i32 s7, s3, 0 -; GFX6-NEXT: s_sub_i32 s8, s13, s8 -; GFX6-NEXT: s_sub_i32 s7, s12, s7 +; GFX6-NEXT: s_sub_i32 s8, 0x80000000, s8 +; GFX6-NEXT: s_sub_i32 s7, 0x7fffffff, s7 ; GFX6-NEXT: s_max_i32 s6, s8, s6 ; GFX6-NEXT: s_lshl_b32 s4, s4, 16 ; GFX6-NEXT: s_min_i32 s6, s6, s7 @@ -3495,8 +3437,8 @@ ; GFX6-NEXT: s_add_i32 s3, s3, s6 ; GFX6-NEXT: s_lshl_b32 s6, s10, 16 ; GFX6-NEXT: s_max_i32 s7, s4, 0 -; GFX6-NEXT: s_sub_i32 s8, s13, s8 -; GFX6-NEXT: s_sub_i32 s7, s12, s7 +; GFX6-NEXT: s_sub_i32 s8, 0x80000000, s8 +; GFX6-NEXT: s_sub_i32 s7, 0x7fffffff, s7 ; GFX6-NEXT: s_max_i32 s6, s8, s6 ; GFX6-NEXT: s_lshl_b32 s5, s5, 16 ; GFX6-NEXT: s_min_i32 s6, s6, s7 @@ -3504,71 +3446,68 @@ ; GFX6-NEXT: s_add_i32 s4, s4, s6 ; GFX6-NEXT: s_lshl_b32 s6, s11, 16 ; GFX6-NEXT: s_max_i32 s7, s5, 0 -; GFX6-NEXT: s_sub_i32 s8, s13, s8 -; GFX6-NEXT: s_sub_i32 s7, s12, s7 -; GFX6-NEXT: s_max_i32 s6, s8, s6 -; GFX6-NEXT: s_min_i32 s6, s6, s7 +; GFX6-NEXT: s_sub_i32 s8, 0x80000000, s8 ; GFX6-NEXT: s_ashr_i32 s1, s1, 16 -; GFX6-NEXT: s_add_i32 s5, s5, s6 -; GFX6-NEXT: s_mov_b32 s6, 0xffff +; GFX6-NEXT: s_sub_i32 s7, 0x7fffffff, s7 +; GFX6-NEXT: s_max_i32 s6, s8, s6 ; GFX6-NEXT: s_ashr_i32 s0, s0, 16 -; GFX6-NEXT: s_and_b32 s1, s1, s6 +; GFX6-NEXT: s_min_i32 s6, s6, s7 +; GFX6-NEXT: s_and_b32 s1, s1, 0xffff ; GFX6-NEXT: s_ashr_i32 s2, s2, 16 ; GFX6-NEXT: s_ashr_i32 s3, s3, 16 -; GFX6-NEXT: s_and_b32 s0, s0, s6 +; GFX6-NEXT: s_add_i32 s5, s5, s6 +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_ashr_i32 s5, s5, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s2, s6 -; GFX6-NEXT: s_and_b32 s2, s3, s6 +; GFX6-NEXT: s_and_b32 s1, s2, 0xffff +; GFX6-NEXT: s_and_b32 s2, s3, 0xffff ; GFX6-NEXT: s_ashr_i32 s4, s4, 16 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_and_b32 s3, s5, s6 +; GFX6-NEXT: s_and_b32 s3, s5, 0xffff ; GFX6-NEXT: s_or_b32 s1, s1, s2 -; GFX6-NEXT: s_and_b32 s2, s4, s6 +; GFX6-NEXT: s_and_b32 s2, s4, 0xffff ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 ; GFX6-NEXT: s_or_b32 s2, s2, s3 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_saddsat_v6i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_sext_i32_i16 s14, s0 -; GFX8-NEXT: s_sext_i32_i16 s15, 0 -; GFX8-NEXT: s_movk_i32 s13, 0x8000 -; GFX8-NEXT: s_max_i32 s16, s14, s15 -; GFX8-NEXT: s_min_i32 s14, s14, s15 -; GFX8-NEXT: s_sub_i32 s14, s13, s14 +; GFX8-NEXT: s_sext_i32_i16 s12, s0 +; GFX8-NEXT: s_sext_i32_i16 s13, 0 +; GFX8-NEXT: s_max_i32 s14, s12, s13 +; GFX8-NEXT: s_min_i32 s12, s12, s13 +; GFX8-NEXT: s_sub_i32 s12, 0xffff8000, s12 ; GFX8-NEXT: s_lshr_b32 s9, s3, 16 -; GFX8-NEXT: s_movk_i32 s12, 0x7fff -; GFX8-NEXT: s_sext_i32_i16 s14, s14 +; GFX8-NEXT: s_sext_i32_i16 s12, s12 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 -; GFX8-NEXT: s_sub_i32 s16, s12, s16 -; GFX8-NEXT: s_max_i32 s3, s14, s3 +; GFX8-NEXT: s_sub_i32 s14, 0x7fff, s14 +; GFX8-NEXT: s_max_i32 s3, s12, s3 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 -; GFX8-NEXT: s_sext_i32_i16 s14, s16 +; GFX8-NEXT: s_sext_i32_i16 s12, s14 ; GFX8-NEXT: s_lshr_b32 s6, s0, 16 -; GFX8-NEXT: s_min_i32 s3, s3, s14 +; GFX8-NEXT: s_min_i32 s3, s3, s12 ; GFX8-NEXT: s_add_i32 s0, s0, s3 ; GFX8-NEXT: s_sext_i32_i16 s3, s6 -; GFX8-NEXT: s_max_i32 s14, s3, s15 -; GFX8-NEXT: s_min_i32 s3, s3, s15 -; GFX8-NEXT: s_sub_i32 s3, s13, s3 +; GFX8-NEXT: s_max_i32 s12, s3, s13 +; GFX8-NEXT: s_min_i32 s3, s3, s13 +; GFX8-NEXT: s_sub_i32 s3, 0xffff8000, s3 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_sext_i32_i16 s9, s9 -; GFX8-NEXT: s_sub_i32 s14, s12, s14 +; GFX8-NEXT: s_sub_i32 s12, 0x7fff, s12 ; GFX8-NEXT: s_max_i32 s3, s3, s9 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 -; GFX8-NEXT: s_sext_i32_i16 s9, s14 +; GFX8-NEXT: s_sext_i32_i16 s9, s12 ; GFX8-NEXT: s_min_i32 s3, s3, s9 ; GFX8-NEXT: s_add_i32 s6, s6, s3 ; GFX8-NEXT: s_sext_i32_i16 s3, s1 -; GFX8-NEXT: s_max_i32 s9, s3, s15 -; GFX8-NEXT: s_min_i32 s3, s3, s15 -; GFX8-NEXT: s_sub_i32 s3, s13, s3 +; GFX8-NEXT: s_max_i32 s9, s3, s13 +; GFX8-NEXT: s_min_i32 s3, s3, s13 +; GFX8-NEXT: s_sub_i32 s3, 0xffff8000, s3 ; GFX8-NEXT: s_lshr_b32 s10, s4, 16 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 -; GFX8-NEXT: s_sub_i32 s9, s12, s9 +; GFX8-NEXT: s_sub_i32 s9, 0x7fff, s9 ; GFX8-NEXT: s_max_i32 s3, s3, s4 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_sext_i32_i16 s4, s9 @@ -3576,25 +3515,25 @@ ; GFX8-NEXT: s_min_i32 s3, s3, s4 ; GFX8-NEXT: s_add_i32 s1, s1, s3 ; GFX8-NEXT: s_sext_i32_i16 s3, s7 -; GFX8-NEXT: s_max_i32 s4, s3, s15 -; GFX8-NEXT: s_min_i32 s3, s3, s15 -; GFX8-NEXT: s_sub_i32 s3, s13, s3 +; GFX8-NEXT: s_max_i32 s4, s3, s13 +; GFX8-NEXT: s_min_i32 s3, s3, s13 +; GFX8-NEXT: s_sub_i32 s3, 0xffff8000, s3 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_sext_i32_i16 s9, s10 -; GFX8-NEXT: s_sub_i32 s4, s12, s4 +; GFX8-NEXT: s_sub_i32 s4, 0x7fff, s4 ; GFX8-NEXT: s_max_i32 s3, s3, s9 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_min_i32 s3, s3, s4 ; GFX8-NEXT: s_add_i32 s7, s7, s3 ; GFX8-NEXT: s_sext_i32_i16 s3, s2 -; GFX8-NEXT: s_max_i32 s4, s3, s15 -; GFX8-NEXT: s_min_i32 s3, s3, s15 -; GFX8-NEXT: s_sub_i32 s3, s13, s3 +; GFX8-NEXT: s_max_i32 s4, s3, s13 +; GFX8-NEXT: s_min_i32 s3, s3, s13 +; GFX8-NEXT: s_sub_i32 s3, 0xffff8000, s3 ; GFX8-NEXT: s_lshr_b32 s11, s5, 16 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 -; GFX8-NEXT: s_sub_i32 s4, s12, s4 +; GFX8-NEXT: s_sub_i32 s4, 0x7fff, s4 ; GFX8-NEXT: s_max_i32 s3, s3, s5 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 @@ -3602,12 +3541,12 @@ ; GFX8-NEXT: s_min_i32 s3, s3, s4 ; GFX8-NEXT: s_add_i32 s2, s2, s3 ; GFX8-NEXT: s_sext_i32_i16 s3, s8 -; GFX8-NEXT: s_max_i32 s4, s3, s15 -; GFX8-NEXT: s_min_i32 s3, s3, s15 -; GFX8-NEXT: s_sub_i32 s3, s13, s3 +; GFX8-NEXT: s_max_i32 s4, s3, s13 +; GFX8-NEXT: s_min_i32 s3, s3, s13 +; GFX8-NEXT: s_sub_i32 s3, 0xffff8000, s3 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_sext_i32_i16 s5, s11 -; GFX8-NEXT: s_sub_i32 s4, s12, s4 +; GFX8-NEXT: s_sub_i32 s4, 0x7fff, s4 ; GFX8-NEXT: s_max_i32 s3, s3, s5 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 @@ -3861,31 +3800,29 @@ ; GFX6-LABEL: s_saddsat_v8i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_lshl_b32 s0, s0, 16 -; GFX6-NEXT: s_brev_b32 s17, 1 -; GFX6-NEXT: s_min_i32 s19, s0, 0 +; GFX6-NEXT: s_min_i32 s17, s0, 0 ; GFX6-NEXT: s_lshl_b32 s8, s8, 16 -; GFX6-NEXT: s_brev_b32 s16, -2 -; GFX6-NEXT: s_max_i32 s18, s0, 0 -; GFX6-NEXT: s_sub_i32 s19, s17, s19 -; GFX6-NEXT: s_sub_i32 s18, s16, s18 -; GFX6-NEXT: s_max_i32 s8, s19, s8 +; GFX6-NEXT: s_max_i32 s16, s0, 0 +; GFX6-NEXT: s_sub_i32 s17, 0x80000000, s17 +; GFX6-NEXT: s_sub_i32 s16, 0x7fffffff, s16 +; GFX6-NEXT: s_max_i32 s8, s17, s8 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 -; GFX6-NEXT: s_min_i32 s8, s8, s18 -; GFX6-NEXT: s_min_i32 s18, s1, 0 +; GFX6-NEXT: s_min_i32 s8, s8, s16 +; GFX6-NEXT: s_min_i32 s16, s1, 0 ; GFX6-NEXT: s_add_i32 s0, s0, s8 ; GFX6-NEXT: s_lshl_b32 s8, s9, 16 ; GFX6-NEXT: s_max_i32 s9, s1, 0 -; GFX6-NEXT: s_sub_i32 s18, s17, s18 -; GFX6-NEXT: s_sub_i32 s9, s16, s9 -; GFX6-NEXT: s_max_i32 s8, s18, s8 +; GFX6-NEXT: s_sub_i32 s16, 0x80000000, s16 +; GFX6-NEXT: s_sub_i32 s9, 0x7fffffff, s9 +; GFX6-NEXT: s_max_i32 s8, s16, s8 ; GFX6-NEXT: s_min_i32 s8, s8, s9 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 ; GFX6-NEXT: s_add_i32 s1, s1, s8 ; GFX6-NEXT: s_lshl_b32 s8, s10, 16 ; GFX6-NEXT: s_min_i32 s10, s2, 0 ; GFX6-NEXT: s_max_i32 s9, s2, 0 -; GFX6-NEXT: s_sub_i32 s10, s17, s10 -; GFX6-NEXT: s_sub_i32 s9, s16, s9 +; GFX6-NEXT: s_sub_i32 s10, 0x80000000, s10 +; GFX6-NEXT: s_sub_i32 s9, 0x7fffffff, s9 ; GFX6-NEXT: s_max_i32 s8, s10, s8 ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 ; GFX6-NEXT: s_min_i32 s8, s8, s9 @@ -3893,8 +3830,8 @@ ; GFX6-NEXT: s_add_i32 s2, s2, s8 ; GFX6-NEXT: s_lshl_b32 s8, s11, 16 ; GFX6-NEXT: s_max_i32 s9, s3, 0 -; GFX6-NEXT: s_sub_i32 s10, s17, s10 -; GFX6-NEXT: s_sub_i32 s9, s16, s9 +; GFX6-NEXT: s_sub_i32 s10, 0x80000000, s10 +; GFX6-NEXT: s_sub_i32 s9, 0x7fffffff, s9 ; GFX6-NEXT: s_max_i32 s8, s10, s8 ; GFX6-NEXT: s_lshl_b32 s4, s4, 16 ; GFX6-NEXT: s_min_i32 s8, s8, s9 @@ -3902,8 +3839,8 @@ ; GFX6-NEXT: s_add_i32 s3, s3, s8 ; GFX6-NEXT: s_lshl_b32 s8, s12, 16 ; GFX6-NEXT: s_max_i32 s9, s4, 0 -; GFX6-NEXT: s_sub_i32 s10, s17, s10 -; GFX6-NEXT: s_sub_i32 s9, s16, s9 +; GFX6-NEXT: s_sub_i32 s10, 0x80000000, s10 +; GFX6-NEXT: s_sub_i32 s9, 0x7fffffff, s9 ; GFX6-NEXT: s_max_i32 s8, s10, s8 ; GFX6-NEXT: s_lshl_b32 s5, s5, 16 ; GFX6-NEXT: s_min_i32 s8, s8, s9 @@ -3911,8 +3848,8 @@ ; GFX6-NEXT: s_add_i32 s4, s4, s8 ; GFX6-NEXT: s_lshl_b32 s8, s13, 16 ; GFX6-NEXT: s_max_i32 s9, s5, 0 -; GFX6-NEXT: s_sub_i32 s10, s17, s10 -; GFX6-NEXT: s_sub_i32 s9, s16, s9 +; GFX6-NEXT: s_sub_i32 s10, 0x80000000, s10 +; GFX6-NEXT: s_sub_i32 s9, 0x7fffffff, s9 ; GFX6-NEXT: s_max_i32 s8, s10, s8 ; GFX6-NEXT: s_lshl_b32 s6, s6, 16 ; GFX6-NEXT: s_min_i32 s8, s8, s9 @@ -3920,86 +3857,83 @@ ; GFX6-NEXT: s_add_i32 s5, s5, s8 ; GFX6-NEXT: s_lshl_b32 s8, s14, 16 ; GFX6-NEXT: s_max_i32 s9, s6, 0 -; GFX6-NEXT: s_sub_i32 s10, s17, s10 -; GFX6-NEXT: s_sub_i32 s9, s16, s9 +; GFX6-NEXT: s_sub_i32 s10, 0x80000000, s10 +; GFX6-NEXT: s_sub_i32 s9, 0x7fffffff, s9 ; GFX6-NEXT: s_max_i32 s8, s10, s8 ; GFX6-NEXT: s_lshl_b32 s7, s7, 16 ; GFX6-NEXT: s_min_i32 s8, s8, s9 ; GFX6-NEXT: s_min_i32 s10, s7, 0 +; GFX6-NEXT: s_ashr_i32 s1, s1, 16 ; GFX6-NEXT: s_add_i32 s6, s6, s8 ; GFX6-NEXT: s_lshl_b32 s8, s15, 16 ; GFX6-NEXT: s_max_i32 s9, s7, 0 -; GFX6-NEXT: s_sub_i32 s10, s17, s10 -; GFX6-NEXT: s_sub_i32 s9, s16, s9 -; GFX6-NEXT: s_max_i32 s8, s10, s8 -; GFX6-NEXT: s_min_i32 s8, s8, s9 -; GFX6-NEXT: s_ashr_i32 s1, s1, 16 -; GFX6-NEXT: s_add_i32 s7, s7, s8 -; GFX6-NEXT: s_mov_b32 s8, 0xffff +; GFX6-NEXT: s_sub_i32 s10, 0x80000000, s10 ; GFX6-NEXT: s_ashr_i32 s0, s0, 16 -; GFX6-NEXT: s_and_b32 s1, s1, s8 +; GFX6-NEXT: s_sub_i32 s9, 0x7fffffff, s9 +; GFX6-NEXT: s_max_i32 s8, s10, s8 +; GFX6-NEXT: s_and_b32 s1, s1, 0xffff ; GFX6-NEXT: s_ashr_i32 s2, s2, 16 ; GFX6-NEXT: s_ashr_i32 s3, s3, 16 -; GFX6-NEXT: s_and_b32 s0, s0, s8 +; GFX6-NEXT: s_min_i32 s8, s8, s9 +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_ashr_i32 s5, s5, 16 +; GFX6-NEXT: s_add_i32 s7, s7, s8 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s2, s8 -; GFX6-NEXT: s_and_b32 s2, s3, s8 +; GFX6-NEXT: s_and_b32 s1, s2, 0xffff +; GFX6-NEXT: s_and_b32 s2, s3, 0xffff ; GFX6-NEXT: s_ashr_i32 s4, s4, 16 ; GFX6-NEXT: s_ashr_i32 s7, s7, 16 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_and_b32 s3, s5, s8 +; GFX6-NEXT: s_and_b32 s3, s5, 0xffff ; GFX6-NEXT: s_ashr_i32 s6, s6, 16 ; GFX6-NEXT: s_or_b32 s1, s1, s2 -; GFX6-NEXT: s_and_b32 s2, s4, s8 +; GFX6-NEXT: s_and_b32 s2, s4, 0xffff ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 -; GFX6-NEXT: s_and_b32 s4, s7, s8 +; GFX6-NEXT: s_and_b32 s4, s7, 0xffff ; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_and_b32 s3, s6, s8 +; GFX6-NEXT: s_and_b32 s3, s6, 0xffff ; GFX6-NEXT: s_lshl_b32 s4, s4, 16 ; GFX6-NEXT: s_or_b32 s3, s3, s4 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_saddsat_v8i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_sext_i32_i16 s18, s0 -; GFX8-NEXT: s_sext_i32_i16 s19, 0 -; GFX8-NEXT: s_movk_i32 s17, 0x8000 -; GFX8-NEXT: s_max_i32 s20, s18, s19 -; GFX8-NEXT: s_min_i32 s18, s18, s19 -; GFX8-NEXT: s_sub_i32 s18, s17, s18 +; GFX8-NEXT: s_sext_i32_i16 s16, s0 +; GFX8-NEXT: s_sext_i32_i16 s17, 0 +; GFX8-NEXT: s_max_i32 s18, s16, s17 +; GFX8-NEXT: s_min_i32 s16, s16, s17 +; GFX8-NEXT: s_sub_i32 s16, 0xffff8000, s16 ; GFX8-NEXT: s_lshr_b32 s12, s4, 16 -; GFX8-NEXT: s_movk_i32 s16, 0x7fff -; GFX8-NEXT: s_sext_i32_i16 s18, s18 +; GFX8-NEXT: s_sext_i32_i16 s16, s16 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 -; GFX8-NEXT: s_sub_i32 s20, s16, s20 -; GFX8-NEXT: s_max_i32 s4, s18, s4 +; GFX8-NEXT: s_sub_i32 s18, 0x7fff, s18 +; GFX8-NEXT: s_max_i32 s4, s16, s4 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 -; GFX8-NEXT: s_sext_i32_i16 s18, s20 +; GFX8-NEXT: s_sext_i32_i16 s16, s18 ; GFX8-NEXT: s_lshr_b32 s8, s0, 16 -; GFX8-NEXT: s_min_i32 s4, s4, s18 +; GFX8-NEXT: s_min_i32 s4, s4, s16 ; GFX8-NEXT: s_add_i32 s0, s0, s4 ; GFX8-NEXT: s_sext_i32_i16 s4, s8 -; GFX8-NEXT: s_max_i32 s18, s4, s19 -; GFX8-NEXT: s_min_i32 s4, s4, s19 -; GFX8-NEXT: s_sub_i32 s4, s17, s4 +; GFX8-NEXT: s_max_i32 s16, s4, s17 +; GFX8-NEXT: s_min_i32 s4, s4, s17 +; GFX8-NEXT: s_sub_i32 s4, 0xffff8000, s4 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s12, s12 -; GFX8-NEXT: s_sub_i32 s18, s16, s18 +; GFX8-NEXT: s_sub_i32 s16, 0x7fff, s16 ; GFX8-NEXT: s_max_i32 s4, s4, s12 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 -; GFX8-NEXT: s_sext_i32_i16 s12, s18 +; GFX8-NEXT: s_sext_i32_i16 s12, s16 ; GFX8-NEXT: s_min_i32 s4, s4, s12 ; GFX8-NEXT: s_add_i32 s8, s8, s4 ; GFX8-NEXT: s_sext_i32_i16 s4, s1 -; GFX8-NEXT: s_max_i32 s12, s4, s19 -; GFX8-NEXT: s_min_i32 s4, s4, s19 -; GFX8-NEXT: s_sub_i32 s4, s17, s4 +; GFX8-NEXT: s_max_i32 s12, s4, s17 +; GFX8-NEXT: s_min_i32 s4, s4, s17 +; GFX8-NEXT: s_sub_i32 s4, 0xffff8000, s4 ; GFX8-NEXT: s_lshr_b32 s13, s5, 16 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 -; GFX8-NEXT: s_sub_i32 s12, s16, s12 +; GFX8-NEXT: s_sub_i32 s12, 0x7fff, s12 ; GFX8-NEXT: s_max_i32 s4, s4, s5 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s5, s12 @@ -4007,25 +3941,25 @@ ; GFX8-NEXT: s_min_i32 s4, s4, s5 ; GFX8-NEXT: s_add_i32 s1, s1, s4 ; GFX8-NEXT: s_sext_i32_i16 s4, s9 -; GFX8-NEXT: s_max_i32 s5, s4, s19 -; GFX8-NEXT: s_min_i32 s4, s4, s19 -; GFX8-NEXT: s_sub_i32 s4, s17, s4 +; GFX8-NEXT: s_max_i32 s5, s4, s17 +; GFX8-NEXT: s_min_i32 s4, s4, s17 +; GFX8-NEXT: s_sub_i32 s4, 0xffff8000, s4 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s12, s13 -; GFX8-NEXT: s_sub_i32 s5, s16, s5 +; GFX8-NEXT: s_sub_i32 s5, 0x7fff, s5 ; GFX8-NEXT: s_max_i32 s4, s4, s12 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 ; GFX8-NEXT: s_min_i32 s4, s4, s5 ; GFX8-NEXT: s_add_i32 s9, s9, s4 ; GFX8-NEXT: s_sext_i32_i16 s4, s2 -; GFX8-NEXT: s_max_i32 s5, s4, s19 -; GFX8-NEXT: s_min_i32 s4, s4, s19 -; GFX8-NEXT: s_sub_i32 s4, s17, s4 +; GFX8-NEXT: s_max_i32 s5, s4, s17 +; GFX8-NEXT: s_min_i32 s4, s4, s17 +; GFX8-NEXT: s_sub_i32 s4, 0xffff8000, s4 ; GFX8-NEXT: s_lshr_b32 s14, s6, 16 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 -; GFX8-NEXT: s_sub_i32 s5, s16, s5 +; GFX8-NEXT: s_sub_i32 s5, 0x7fff, s5 ; GFX8-NEXT: s_max_i32 s4, s4, s6 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 @@ -4033,24 +3967,24 @@ ; GFX8-NEXT: s_min_i32 s4, s4, s5 ; GFX8-NEXT: s_add_i32 s2, s2, s4 ; GFX8-NEXT: s_sext_i32_i16 s4, s10 -; GFX8-NEXT: s_max_i32 s5, s4, s19 -; GFX8-NEXT: s_min_i32 s4, s4, s19 -; GFX8-NEXT: s_sub_i32 s4, s17, s4 +; GFX8-NEXT: s_max_i32 s5, s4, s17 +; GFX8-NEXT: s_min_i32 s4, s4, s17 +; GFX8-NEXT: s_sub_i32 s4, 0xffff8000, s4 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s6, s14 -; GFX8-NEXT: s_sub_i32 s5, s16, s5 +; GFX8-NEXT: s_sub_i32 s5, 0x7fff, s5 ; GFX8-NEXT: s_max_i32 s4, s4, s6 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 ; GFX8-NEXT: s_min_i32 s4, s4, s5 ; GFX8-NEXT: s_add_i32 s10, s10, s4 ; GFX8-NEXT: s_sext_i32_i16 s4, s3 -; GFX8-NEXT: s_max_i32 s5, s4, s19 -; GFX8-NEXT: s_min_i32 s4, s4, s19 -; GFX8-NEXT: s_sub_i32 s4, s17, s4 +; GFX8-NEXT: s_max_i32 s5, s4, s17 +; GFX8-NEXT: s_min_i32 s4, s4, s17 +; GFX8-NEXT: s_sub_i32 s4, 0xffff8000, s4 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s6, s7 -; GFX8-NEXT: s_sub_i32 s5, s16, s5 +; GFX8-NEXT: s_sub_i32 s5, 0x7fff, s5 ; GFX8-NEXT: s_max_i32 s4, s4, s6 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 @@ -4058,13 +3992,13 @@ ; GFX8-NEXT: s_min_i32 s4, s4, s5 ; GFX8-NEXT: s_add_i32 s3, s3, s4 ; GFX8-NEXT: s_sext_i32_i16 s4, s11 -; GFX8-NEXT: s_max_i32 s5, s4, s19 -; GFX8-NEXT: s_min_i32 s4, s4, s19 +; GFX8-NEXT: s_max_i32 s5, s4, s17 +; GFX8-NEXT: s_min_i32 s4, s4, s17 ; GFX8-NEXT: s_lshr_b32 s15, s7, 16 -; GFX8-NEXT: s_sub_i32 s4, s17, s4 +; GFX8-NEXT: s_sub_i32 s4, 0xffff8000, s4 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s6, s15 -; GFX8-NEXT: s_sub_i32 s5, s16, s5 +; GFX8-NEXT: s_sub_i32 s5, 0x7fff, s5 ; GFX8-NEXT: s_max_i32 s4, s4, s6 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 @@ -4553,13 +4487,12 @@ ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] ; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], s[4:5], 0 ; GFX6-NEXT: s_ashr_i32 s4, s9, 31 -; GFX6-NEXT: s_mov_b32 s10, 0 +; GFX6-NEXT: s_mov_b32 s5, 0 ; GFX6-NEXT: s_xor_b64 vcc, s[0:1], vcc -; GFX6-NEXT: s_brev_b32 s5, 1 -; GFX6-NEXT: s_cmp_lg_u32 s10, 0 +; GFX6-NEXT: s_cmp_lg_u32 s5, 0 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s8 -; GFX6-NEXT: s_addc_u32 s1, s4, s5 +; GFX6-NEXT: s_addc_u32 s1, s4, 0x80000000 ; GFX6-NEXT: v_cndmask_b32_e32 v4, v1, v0, vcc ; GFX6-NEXT: s_add_u32 s0, s2, s6 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -4571,10 +4504,10 @@ ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] ; GFX6-NEXT: v_cmp_lt_i64_e64 s[2:3], s[6:7], 0 ; GFX6-NEXT: s_ashr_i32 s4, s1, 31 -; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s5, 0 ; GFX6-NEXT: s_xor_b64 vcc, s[2:3], vcc -; GFX6-NEXT: s_cmp_lg_u32 s6, 0 -; GFX6-NEXT: s_addc_u32 s3, s4, s5 +; GFX6-NEXT: s_cmp_lg_u32 s5, 0 +; GFX6-NEXT: s_addc_u32 s3, s4, 0x80000000 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: v_mov_b32_e32 v3, s3 @@ -4596,13 +4529,12 @@ ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] ; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[4:5], 0 ; GFX8-NEXT: s_ashr_i32 s4, s9, 31 -; GFX8-NEXT: s_mov_b32 s10, 0 +; GFX8-NEXT: s_mov_b32 s5, 0 ; GFX8-NEXT: s_xor_b64 vcc, s[0:1], vcc -; GFX8-NEXT: s_brev_b32 s5, 1 -; GFX8-NEXT: s_cmp_lg_u32 s10, 0 +; GFX8-NEXT: s_cmp_lg_u32 s5, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s8 -; GFX8-NEXT: s_addc_u32 s1, s4, s5 +; GFX8-NEXT: s_addc_u32 s1, s4, 0x80000000 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v1, v0, vcc ; GFX8-NEXT: s_add_u32 s0, s2, s6 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 @@ -4614,10 +4546,10 @@ ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] ; GFX8-NEXT: v_cmp_lt_i64_e64 s[2:3], s[6:7], 0 ; GFX8-NEXT: s_ashr_i32 s4, s1, 31 -; GFX8-NEXT: s_mov_b32 s6, 0 +; GFX8-NEXT: s_mov_b32 s5, 0 ; GFX8-NEXT: s_xor_b64 vcc, s[2:3], vcc -; GFX8-NEXT: s_cmp_lg_u32 s6, 0 -; GFX8-NEXT: s_addc_u32 s3, s4, s5 +; GFX8-NEXT: s_cmp_lg_u32 s5, 0 +; GFX8-NEXT: s_addc_u32 s3, s4, 0x80000000 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s0 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 @@ -4639,13 +4571,12 @@ ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] ; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[4:5], 0 ; GFX9-NEXT: s_ashr_i32 s4, s9, 31 -; GFX9-NEXT: s_mov_b32 s10, 0 +; GFX9-NEXT: s_mov_b32 s5, 0 ; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc -; GFX9-NEXT: s_brev_b32 s5, 1 -; GFX9-NEXT: s_cmp_lg_u32 s10, 0 +; GFX9-NEXT: s_cmp_lg_u32 s5, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 -; GFX9-NEXT: s_addc_u32 s1, s4, s5 +; GFX9-NEXT: s_addc_u32 s1, s4, 0x80000000 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v1, v0, vcc ; GFX9-NEXT: s_add_u32 s0, s2, s6 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -4657,10 +4588,10 @@ ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] ; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], s[6:7], 0 ; GFX9-NEXT: s_ashr_i32 s4, s1, 31 -; GFX9-NEXT: s_mov_b32 s6, 0 +; GFX9-NEXT: s_mov_b32 s5, 0 ; GFX9-NEXT: s_xor_b64 vcc, s[2:3], vcc -; GFX9-NEXT: s_cmp_lg_u32 s6, 0 -; GFX9-NEXT: s_addc_u32 s3, s4, s5 +; GFX9-NEXT: s_cmp_lg_u32 s5, 0 +; GFX9-NEXT: s_addc_u32 s3, s4, 0x80000000 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 @@ -4679,15 +4610,14 @@ ; GFX10-NEXT: s_addc_u32 s9, s1, s5 ; GFX10-NEXT: v_cmp_lt_i64_e64 s4, s[4:5], 0 ; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[8:9], s[0:1] -; GFX10-NEXT: s_mov_b32 s11, 0 +; GFX10-NEXT: s_mov_b32 s10, 0 ; GFX10-NEXT: s_ashr_i32 s0, s9, 31 ; GFX10-NEXT: v_mov_b32_e32 v0, s8 -; GFX10-NEXT: s_brev_b32 s10, 1 ; GFX10-NEXT: v_mov_b32_e32 v1, s9 ; GFX10-NEXT: s_xor_b32 s8, s4, s1 -; GFX10-NEXT: s_cmp_lg_u32 s11, 0 +; GFX10-NEXT: s_cmp_lg_u32 s10, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s0, s8 -; GFX10-NEXT: s_addc_u32 s1, s0, s10 +; GFX10-NEXT: s_addc_u32 s1, s0, 0x80000000 ; GFX10-NEXT: s_add_u32 s4, s2, s6 ; GFX10-NEXT: s_addc_u32 s5, s3, s7 ; GFX10-NEXT: v_mov_b32_e32 v2, s4 @@ -4697,9 +4627,9 @@ ; GFX10-NEXT: v_mov_b32_e32 v3, s5 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s1, s8 ; GFX10-NEXT: s_xor_b32 s2, s3, s2 -; GFX10-NEXT: s_cmp_lg_u32 s11, 0 +; GFX10-NEXT: s_cmp_lg_u32 s10, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s0, s2 -; GFX10-NEXT: s_addc_u32 s1, s0, s10 +; GFX10-NEXT: s_addc_u32 s1, s0, 0x80000000 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s1, s2 ; GFX10-NEXT: v_readfirstlane_b32 s1, v1 @@ -5453,10 +5383,9 @@ ; GFX6-NEXT: s_cmp_lg_u32 s1, 0 ; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_addc_u32 s1, s0, 0 -; GFX6-NEXT: s_brev_b32 s10, 1 ; GFX6-NEXT: s_addc_u32 s2, s0, 0 ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX6-NEXT: s_addc_u32 s3, s0, s10 +; GFX6-NEXT: s_addc_u32 s3, s0, 0x80000000 ; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: v_mov_b32_e32 v2, s1 ; GFX6-NEXT: v_mov_b32_e32 v3, s8 @@ -5495,7 +5424,7 @@ ; GFX6-NEXT: s_addc_u32 s5, s4, 0 ; GFX6-NEXT: s_addc_u32 s6, s4, 0 ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX6-NEXT: s_addc_u32 s7, s4, s10 +; GFX6-NEXT: s_addc_u32 s7, s4, 0x80000000 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: v_mov_b32_e32 v3, s0 @@ -5550,10 +5479,9 @@ ; GFX8-NEXT: s_cmp_lg_u32 s1, 0 ; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_addc_u32 s1, s0, 0 -; GFX8-NEXT: s_brev_b32 s10, 1 ; GFX8-NEXT: s_addc_u32 s2, s0, 0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: s_addc_u32 s3, s0, s10 +; GFX8-NEXT: s_addc_u32 s3, s0, 0x80000000 ; GFX8-NEXT: v_mov_b32_e32 v1, s0 ; GFX8-NEXT: v_mov_b32_e32 v2, s1 ; GFX8-NEXT: v_mov_b32_e32 v3, s8 @@ -5598,7 +5526,7 @@ ; GFX8-NEXT: s_addc_u32 s5, s4, 0 ; GFX8-NEXT: s_addc_u32 s6, s4, 0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: s_addc_u32 s7, s4, s10 +; GFX8-NEXT: s_addc_u32 s7, s4, 0x80000000 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: v_mov_b32_e32 v2, s5 ; GFX8-NEXT: v_mov_b32_e32 v3, s0 @@ -5653,10 +5581,9 @@ ; GFX9-NEXT: s_cmp_lg_u32 s1, 0 ; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX9-NEXT: s_addc_u32 s1, s0, 0 -; GFX9-NEXT: s_brev_b32 s10, 1 ; GFX9-NEXT: s_addc_u32 s2, s0, 0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: s_addc_u32 s3, s0, s10 +; GFX9-NEXT: s_addc_u32 s3, s0, 0x80000000 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: v_mov_b32_e32 v3, s8 @@ -5701,7 +5628,7 @@ ; GFX9-NEXT: s_addc_u32 s5, s4, 0 ; GFX9-NEXT: s_addc_u32 s6, s4, 0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: s_addc_u32 s7, s4, s10 +; GFX9-NEXT: s_addc_u32 s7, s4, 0x80000000 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: v_mov_b32_e32 v3, s0 @@ -5747,14 +5674,13 @@ ; GFX10-NEXT: s_and_b32 s1, 1, s1 ; GFX10-NEXT: s_ashr_i32 s2, s17, 31 ; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s1 -; GFX10-NEXT: s_brev_b32 s11, 1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, 0, s0 ; GFX10-NEXT: s_mov_b32 s0, 0 ; GFX10-NEXT: s_cmp_lg_u32 s0, 0 ; GFX10-NEXT: s_addc_u32 s1, s2, 0 ; GFX10-NEXT: s_addc_u32 s10, s2, 0 -; GFX10-NEXT: s_addc_u32 s3, s2, s11 +; GFX10-NEXT: s_addc_u32 s3, s2, 0x80000000 ; GFX10-NEXT: s_add_u32 s12, s4, s12 ; GFX10-NEXT: s_addc_u32 s13, s5, s13 ; GFX10-NEXT: s_addc_u32 s18, s6, s14 @@ -5795,7 +5721,7 @@ ; GFX10-NEXT: v_and_b32_e32 v2, 1, v2 ; GFX10-NEXT: s_addc_u32 s1, s0, 0 ; GFX10-NEXT: s_addc_u32 s2, s0, 0 -; GFX10-NEXT: s_addc_u32 s3, s0, s11 +; GFX10-NEXT: s_addc_u32 s3, s0, 0x80000000 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, s18 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, s0, vcc_lo diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll @@ -309,10 +309,10 @@ ; CHECK-NEXT: v_lshlrev_b32_e32 v3, 12, v2 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, 1, v2 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 -; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s6, v0 -; CHECK-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; CHECK-NEXT: v_subrev_i32_e64 v3, s[4:5], s6, v0 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; CHECK-NEXT: v_cmp_le_u32_e64 s[4:5], s6, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[4:5] +; CHECK-NEXT: v_subrev_i32_e32 v3, vcc, s6, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[4:5] ; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v2 ; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s6, v0 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc @@ -408,15 +408,15 @@ ; CGP-NEXT: v_add_i32_e32 v10, vcc, 1, v4 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v7 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v9 -; CGP-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 -; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc -; CGP-NEXT: v_subrev_i32_e64 v7, s[4:5], s8, v0 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v5 -; CGP-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[4:5] -; CGP-NEXT: v_sub_i32_e64 v8, s[6:7], v1, v5 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc +; CGP-NEXT: v_cmp_le_u32_e64 s[4:5], s8, v0 +; CGP-NEXT: v_cndmask_b32_e64 v3, v3, v8, s[4:5] +; CGP-NEXT: v_subrev_i32_e32 v7, vcc, s8, v0 +; CGP-NEXT: v_cmp_ge_u32_e64 s[6:7], v1, v5 +; CGP-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[6:7] +; CGP-NEXT: v_subrev_i32_e32 v8, vcc, 0x1000, v1 +; CGP-NEXT: v_cndmask_b32_e64 v0, v0, v7, s[4:5] ; CGP-NEXT: v_add_i32_e32 v7, vcc, 1, v3 -; CGP-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[6:7] ; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v4 ; CGP-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 ; CGP-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc @@ -450,10 +450,10 @@ ; CHECK-NEXT: v_mul_lo_u32 v3, v2, s6 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, 1, v2 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 -; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s6, v0 -; CHECK-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; CHECK-NEXT: v_subrev_i32_e64 v3, s[4:5], s6, v0 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; CHECK-NEXT: v_cmp_le_u32_e64 s[4:5], s6, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[4:5] +; CHECK-NEXT: v_subrev_i32_e32 v3, vcc, s6, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[4:5] ; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v2 ; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s6, v0 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc @@ -549,15 +549,15 @@ ; CGP-NEXT: v_add_i32_e32 v10, vcc, 1, v4 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v7 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v9 -; CGP-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 -; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc -; CGP-NEXT: v_subrev_i32_e64 v7, s[4:5], s8, v0 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v5 -; CGP-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[4:5] -; CGP-NEXT: v_sub_i32_e64 v8, s[6:7], v1, v5 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc +; CGP-NEXT: v_cmp_le_u32_e64 s[4:5], s8, v0 +; CGP-NEXT: v_cndmask_b32_e64 v3, v3, v8, s[4:5] +; CGP-NEXT: v_subrev_i32_e32 v7, vcc, s8, v0 +; CGP-NEXT: v_cmp_ge_u32_e64 s[6:7], v1, v5 +; CGP-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[6:7] +; CGP-NEXT: v_subrev_i32_e32 v8, vcc, 0x12d8fb, v1 +; CGP-NEXT: v_cndmask_b32_e64 v0, v0, v7, s[4:5] ; CGP-NEXT: v_add_i32_e32 v7, vcc, 1, v3 -; CGP-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[6:7] ; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v4 ; CGP-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 ; CGP-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll @@ -1186,9 +1186,8 @@ ; GISEL-LABEL: v_sdiv_v2i64_pow2k_denom: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_movk_i32 s10, 0x1000 ; GISEL-NEXT: s_mov_b32 s6, 0 -; GISEL-NEXT: s_add_u32 s4, s10, 0 +; GISEL-NEXT: s_add_u32 s4, 0x1000, 0 ; GISEL-NEXT: s_mov_b32 s7, s6 ; GISEL-NEXT: s_addc_u32 s5, 0, 0 ; GISEL-NEXT: s_xor_b64 s[8:9], s[4:5], s[6:7] @@ -1317,7 +1316,7 @@ ; GISEL-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v8 ; GISEL-NEXT: v_addc_u32_e32 v10, vcc, 0, v9, vcc -; GISEL-NEXT: s_add_u32 s4, s10, 0 +; GISEL-NEXT: s_add_u32 s4, 0x1000, 0 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GISEL-NEXT: s_addc_u32 s5, 0, 0 ; GISEL-NEXT: v_cndmask_b32_e32 v0, v8, v1, vcc @@ -1890,9 +1889,8 @@ ; GISEL-LABEL: v_sdiv_v2i64_oddk_denom: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s10, 0x12d8fb ; GISEL-NEXT: s_mov_b32 s6, 0 -; GISEL-NEXT: s_add_u32 s4, s10, 0 +; GISEL-NEXT: s_add_u32 s4, 0x12d8fb, 0 ; GISEL-NEXT: s_mov_b32 s7, s6 ; GISEL-NEXT: s_addc_u32 s5, 0, 0 ; GISEL-NEXT: s_xor_b64 s[8:9], s[4:5], s[6:7] @@ -2021,7 +2019,7 @@ ; GISEL-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v8 ; GISEL-NEXT: v_addc_u32_e32 v10, vcc, 0, v9, vcc -; GISEL-NEXT: s_add_u32 s4, s10, 0 +; GISEL-NEXT: s_add_u32 s4, 0x12d8fb, 0 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GISEL-NEXT: s_addc_u32 s5, 0, 0 ; GISEL-NEXT: v_cndmask_b32_e32 v0, v8, v1, vcc diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll @@ -1157,121 +1157,120 @@ ; GFX10-LABEL: sdivrem_v4i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 -; GFX10-NEXT: v_mov_b32_e32 v4, 0x4f7ffffe ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_ashr_i32 s0, s12, 31 -; GFX10-NEXT: s_ashr_i32 s2, s14, 31 -; GFX10-NEXT: s_add_i32 s6, s12, s0 -; GFX10-NEXT: s_add_i32 s12, s14, s2 -; GFX10-NEXT: s_xor_b32 s14, s6, s0 ; GFX10-NEXT: s_ashr_i32 s1, s13, 31 -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s14 +; GFX10-NEXT: s_ashr_i32 s2, s14, 31 ; GFX10-NEXT: s_ashr_i32 s3, s15, 31 +; GFX10-NEXT: s_add_i32 s6, s12, s0 ; GFX10-NEXT: s_add_i32 s7, s13, s1 +; GFX10-NEXT: s_add_i32 s12, s14, s2 ; GFX10-NEXT: s_add_i32 s13, s15, s3 +; GFX10-NEXT: s_xor_b32 s14, s6, s0 ; GFX10-NEXT: s_xor_b32 s15, s7, s1 ; GFX10-NEXT: s_xor_b32 s12, s12, s2 +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s14 +; GFX10-NEXT: s_xor_b32 s13, s13, s3 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s15 ; GFX10-NEXT: v_cvt_f32_u32_e32 v2, s12 +; GFX10-NEXT: v_cvt_f32_u32_e32 v3, s13 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX10-NEXT: s_xor_b32 s13, s13, s3 ; GFX10-NEXT: s_sub_i32 s6, 0, s14 -; GFX10-NEXT: v_cvt_f32_u32_e32 v3, s13 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v3, v3 ; GFX10-NEXT: s_sub_i32 s7, 0, s15 ; GFX10-NEXT: s_sub_i32 s19, 0, s12 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v3, v3 ; GFX10-NEXT: s_ashr_i32 s16, s8, 31 -; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 -; GFX10-NEXT: s_ashr_i32 s18, s10, 31 ; GFX10-NEXT: s_ashr_i32 s17, s9, 31 +; GFX10-NEXT: s_ashr_i32 s18, s10, 31 +; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: s_xor_b32 s20, s16, s0 -; GFX10-NEXT: v_mul_f32_e32 v1, v1, v4 -; GFX10-NEXT: v_mul_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 +; GFX10-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 +; GFX10-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX10-NEXT: s_xor_b32 s21, s17, s1 -; GFX10-NEXT: v_mul_f32_e32 v3, v3, v4 ; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX10-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GFX10-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX10-NEXT: v_mul_lo_u32 v4, s6, v0 ; GFX10-NEXT: s_sub_i32 s6, 0, s13 -; GFX10-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX10-NEXT: v_mul_lo_u32 v5, s7, v1 ; GFX10-NEXT: v_mul_lo_u32 v6, s19, v2 +; GFX10-NEXT: v_mul_lo_u32 v7, s6, v3 ; GFX10-NEXT: s_ashr_i32 s19, s11, 31 +; GFX10-NEXT: s_add_i32 s6, s8, s16 ; GFX10-NEXT: s_add_i32 s7, s9, s17 -; GFX10-NEXT: v_mul_lo_u32 v7, s6, v3 ; GFX10-NEXT: v_mul_hi_u32 v4, v0, v4 -; GFX10-NEXT: s_add_i32 s6, s8, s16 ; GFX10-NEXT: s_add_i32 s8, s10, s18 ; GFX10-NEXT: v_mul_hi_u32 v5, v1, v5 ; GFX10-NEXT: v_mul_hi_u32 v6, v2, v6 -; GFX10-NEXT: s_xor_b32 s10, s6, s16 -; GFX10-NEXT: s_add_i32 s9, s11, s19 ; GFX10-NEXT: v_mul_hi_u32 v7, v3, v7 -; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v4 +; GFX10-NEXT: s_add_i32 s9, s11, s19 +; GFX10-NEXT: s_xor_b32 s10, s6, s16 ; GFX10-NEXT: s_xor_b32 s11, s7, s17 +; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v4 ; GFX10-NEXT: s_xor_b32 s8, s8, s18 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v5 ; GFX10-NEXT: v_add_nc_u32_e32 v2, v2, v6 -; GFX10-NEXT: v_mul_hi_u32 v0, s10, v0 -; GFX10-NEXT: s_xor_b32 s9, s9, s19 ; GFX10-NEXT: v_add_nc_u32_e32 v3, v3, v7 +; GFX10-NEXT: s_xor_b32 s9, s9, s19 +; GFX10-NEXT: v_mul_hi_u32 v0, s10, v0 ; GFX10-NEXT: v_mul_hi_u32 v1, s11, v1 ; GFX10-NEXT: v_mul_hi_u32 v2, s8, v2 +; GFX10-NEXT: v_mul_hi_u32 v3, s9, v3 ; GFX10-NEXT: s_xor_b32 s22, s18, s2 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 -; GFX10-NEXT: v_mul_hi_u32 v3, s9, v3 ; GFX10-NEXT: v_mul_lo_u32 v4, v0, s14 -; GFX10-NEXT: v_add_nc_u32_e32 v8, 1, v0 ; GFX10-NEXT: v_mul_lo_u32 v5, v1, s15 ; GFX10-NEXT: v_mul_lo_u32 v6, v2, s12 +; GFX10-NEXT: v_mul_lo_u32 v7, v3, s13 +; GFX10-NEXT: v_add_nc_u32_e32 v8, 1, v0 ; GFX10-NEXT: v_add_nc_u32_e32 v9, 1, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v10, 1, v2 -; GFX10-NEXT: v_mul_lo_u32 v7, v3, s13 -; GFX10-NEXT: v_sub_nc_u32_e32 v4, s10, v4 ; GFX10-NEXT: v_add_nc_u32_e32 v11, 1, v3 +; GFX10-NEXT: v_sub_nc_u32_e32 v4, s10, v4 ; GFX10-NEXT: v_sub_nc_u32_e32 v5, s11, v5 ; GFX10-NEXT: v_sub_nc_u32_e32 v6, s8, v6 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s14, v4 ; GFX10-NEXT: v_sub_nc_u32_e32 v7, s9, v7 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s14, v4 ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s15, v5 ; GFX10-NEXT: v_cmp_le_u32_e64 s1, s12, v6 +; GFX10-NEXT: v_cmp_le_u32_e64 s2, s13, v7 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo ; GFX10-NEXT: v_subrev_nc_u32_e32 v8, s14, v4 -; GFX10-NEXT: v_cmp_le_u32_e64 s2, s13, v7 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v9, s0 ; GFX10-NEXT: v_subrev_nc_u32_e32 v9, s15, v5 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v10, s1 ; GFX10-NEXT: v_subrev_nc_u32_e32 v10, s12, v6 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v11, s2 ; GFX10-NEXT: v_subrev_nc_u32_e32 v11, s13, v7 -; GFX10-NEXT: v_add_nc_u32_e32 v8, 1, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v9, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v10, s1 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s14, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v8, 1, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v11, s2 ; GFX10-NEXT: v_add_nc_u32_e32 v9, 1, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v10, 1, v2 +; GFX10-NEXT: v_add_nc_u32_e32 v11, 1, v3 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s14, v4 ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s15, v5 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo -; GFX10-NEXT: v_subrev_nc_u32_e32 v8, s14, v4 ; GFX10-NEXT: v_cmp_le_u32_e64 s1, s12, v6 -; GFX10-NEXT: v_add_nc_u32_e32 v11, 1, v3 ; GFX10-NEXT: v_cmp_le_u32_e64 s2, s13, v7 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo +; GFX10-NEXT: v_subrev_nc_u32_e32 v8, s14, v4 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v9, s0 ; GFX10-NEXT: v_subrev_nc_u32_e32 v9, s15, v5 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v10, s1 ; GFX10-NEXT: v_subrev_nc_u32_e32 v10, s12, v6 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo -; GFX10-NEXT: v_subrev_nc_u32_e32 v8, s13, v7 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v11, s2 +; GFX10-NEXT: v_subrev_nc_u32_e32 v11, s13, v7 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v9, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v10, s1 ; GFX10-NEXT: s_xor_b32 s0, s19, s3 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v8, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v11, s2 ; GFX10-NEXT: v_xor_b32_e32 v0, s20, v0 ; GFX10-NEXT: v_xor_b32_e32 v1, s21, v1 ; GFX10-NEXT: v_xor_b32_e32 v2, s22, v2 @@ -2817,7 +2816,6 @@ ; GFX8-LABEL: sdivrem_v2i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x10 -; GFX8-NEXT: s_mov_b32 s10, 0x100010 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_sext_i32_i16 s0, s3 ; GFX8-NEXT: s_ashr_i32 s8, s0, 31 @@ -2825,41 +2823,41 @@ ; GFX8-NEXT: s_xor_b32 s9, s0, s8 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s9 ; GFX8-NEXT: s_sub_i32 s6, 0, s9 -; GFX8-NEXT: s_sext_i32_i16 s0, s2 -; GFX8-NEXT: s_bfe_i32 s1, s3, s10 +; GFX8-NEXT: s_bfe_i32 s1, s3, 0x100010 +; GFX8-NEXT: s_ashr_i32 s10, s1, 31 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX8-NEXT: s_ashr_i32 s3, s0, 31 -; GFX8-NEXT: s_ashr_i32 s11, s1, 31 -; GFX8-NEXT: s_add_i32 s0, s0, s3 +; GFX8-NEXT: s_add_i32 s1, s1, s10 +; GFX8-NEXT: s_xor_b32 s11, s1, s10 +; GFX8-NEXT: s_sext_i32_i16 s0, s2 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX8-NEXT: s_add_i32 s1, s1, s11 -; GFX8-NEXT: s_xor_b32 s0, s0, s3 -; GFX8-NEXT: s_xor_b32 s12, s1, s11 +; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s11 +; GFX8-NEXT: s_ashr_i32 s3, s0, 31 +; GFX8-NEXT: s_add_i32 s0, s0, s3 ; GFX8-NEXT: v_mul_lo_u32 v1, s6, v0 -; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s12 +; GFX8-NEXT: s_xor_b32 s0, s0, s3 +; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0 -; GFX8-NEXT: v_rcp_iflag_f32_e32 v1, v2 +; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 +; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX8-NEXT: v_mul_lo_u32 v2, v0, s9 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 1, v0 -; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s0, v2 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s9, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX8-NEXT: v_subrev_u32_e64 v3, s[0:1], s9, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 1, v0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s9, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX8-NEXT: v_subrev_u32_e64 v3, s[0:1], s9, v2 -; GFX8-NEXT: s_sub_i32 s1, 0, s12 +; GFX8-NEXT: s_sub_i32 s1, 0, s11 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX8-NEXT: v_mul_lo_u32 v3, s1, v1 -; GFX8-NEXT: s_bfe_i32 s1, s2, s10 +; GFX8-NEXT: s_bfe_i32 s1, s2, 0x100010 ; GFX8-NEXT: s_ashr_i32 s2, s1, 31 ; GFX8-NEXT: s_add_i32 s1, s1, s2 ; GFX8-NEXT: v_mul_hi_u32 v3, v1, v3 @@ -2870,19 +2868,19 @@ ; GFX8-NEXT: v_mul_hi_u32 v1, s1, v1 ; GFX8-NEXT: v_xor_b32_e32 v2, s3, v2 ; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s0, v0 -; GFX8-NEXT: v_mul_lo_u32 v3, v1, s12 +; GFX8-NEXT: v_mul_lo_u32 v3, v1, s11 ; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s3, v2 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v1 ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s1, v3 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s12, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s12, v3 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s11, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v1 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s12, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s12, v3 -; GFX8-NEXT: s_xor_b32 s0, s2, s11 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s11, v3 +; GFX8-NEXT: s_xor_b32 s0, s2, s10 ; GFX8-NEXT: v_xor_b32_e32 v1, s0, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_subrev_u32_e32 v1, vcc, s0, v1 @@ -2907,45 +2905,44 @@ ; GFX9-LABEL: sdivrem_v2i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 -; GFX9-NEXT: s_mov_b32 s10, 0x100010 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_sext_i32_i16 s0, s7 ; GFX9-NEXT: s_ashr_i32 s8, s0, 31 ; GFX9-NEXT: s_add_i32 s0, s0, s8 ; GFX9-NEXT: s_xor_b32 s9, s0, s8 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s9 -; GFX9-NEXT: s_bfe_i32 s1, s7, s10 -; GFX9-NEXT: s_ashr_i32 s7, s1, 31 -; GFX9-NEXT: s_add_i32 s1, s1, s7 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_bfe_i32 s5, s7, 0x100010 +; GFX9-NEXT: s_ashr_i32 s7, s5, 31 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_xor_b32 s11, s1, s7 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s11 -; GFX9-NEXT: s_sub_i32 s1, 0, s9 +; GFX9-NEXT: s_add_i32 s5, s5, s7 +; GFX9-NEXT: s_xor_b32 s5, s5, s7 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s5 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX9-NEXT: s_sub_i32 s10, 0, s9 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX9-NEXT: s_sext_i32_i16 s0, s6 -; GFX9-NEXT: s_ashr_i32 s12, s0, 31 -; GFX9-NEXT: v_mul_lo_u32 v2, s1, v0 +; GFX9-NEXT: s_sext_i32_i16 s4, s6 +; GFX9-NEXT: v_mul_lo_u32 v2, s10, v0 +; GFX9-NEXT: s_ashr_i32 s10, s4, 31 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: s_add_i32 s0, s0, s12 ; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX9-NEXT: s_xor_b32 s13, s0, s12 -; GFX9-NEXT: s_sub_i32 s0, 0, s11 +; GFX9-NEXT: s_add_i32 s4, s4, s10 +; GFX9-NEXT: s_xor_b32 s4, s4, s10 +; GFX9-NEXT: s_sub_i32 s11, 0, s5 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v0, s13, v0 -; GFX9-NEXT: v_mul_lo_u32 v2, s0, v1 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_bfe_i32 s4, s6, s10 +; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 +; GFX9-NEXT: v_mul_lo_u32 v2, s11, v1 +; GFX9-NEXT: s_bfe_i32 s6, s6, 0x100010 +; GFX9-NEXT: s_ashr_i32 s11, s6, 31 ; GFX9-NEXT: v_mul_lo_u32 v3, v0, s9 ; GFX9-NEXT: v_mul_hi_u32 v2, v1, v2 -; GFX9-NEXT: s_ashr_i32 s5, s4, 31 -; GFX9-NEXT: s_add_i32 s4, s4, s5 -; GFX9-NEXT: v_sub_u32_e32 v3, s13, v3 +; GFX9-NEXT: s_add_i32 s6, s6, s11 ; GFX9-NEXT: v_add_u32_e32 v4, 1, v0 +; GFX9-NEXT: v_sub_u32_e32 v3, s4, v3 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v3 -; GFX9-NEXT: s_xor_b32 s4, s4, s5 +; GFX9-NEXT: s_xor_b32 s4, s6, s11 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX9-NEXT: v_subrev_u32_e32 v4, s9, v3 @@ -2956,28 +2953,28 @@ ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX9-NEXT: v_subrev_u32_e32 v4, s9, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_mul_lo_u32 v3, v1, s11 +; GFX9-NEXT: v_mul_lo_u32 v3, v1, s5 ; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 -; GFX9-NEXT: s_xor_b32 s6, s12, s8 +; GFX9-NEXT: s_xor_b32 s6, s10, s8 ; GFX9-NEXT: v_xor_b32_e32 v0, s6, v0 ; GFX9-NEXT: v_sub_u32_e32 v3, s4, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX9-NEXT: v_subrev_u32_e32 v4, s11, v3 +; GFX9-NEXT: v_subrev_u32_e32 v4, s5, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX9-NEXT: v_subrev_u32_e32 v4, s11, v3 +; GFX9-NEXT: v_subrev_u32_e32 v4, s5, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX9-NEXT: s_xor_b32 s4, s5, s7 -; GFX9-NEXT: v_xor_b32_e32 v2, s12, v2 +; GFX9-NEXT: s_xor_b32 s4, s11, s7 +; GFX9-NEXT: v_xor_b32_e32 v2, s10, v2 ; GFX9-NEXT: v_xor_b32_e32 v1, s4, v1 -; GFX9-NEXT: v_xor_b32_e32 v3, s5, v3 +; GFX9-NEXT: v_xor_b32_e32 v3, s11, v3 ; GFX9-NEXT: v_subrev_u32_e32 v0, s6, v0 -; GFX9-NEXT: v_subrev_u32_e32 v2, s12, v2 +; GFX9-NEXT: v_subrev_u32_e32 v2, s10, v2 ; GFX9-NEXT: v_sub_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_sub_u32_sdwa v3, v3, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_sub_u32_sdwa v3, v3, s11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX9-NEXT: v_and_or_b32 v0, v0, v4, v1 ; GFX9-NEXT: v_and_or_b32 v1, v2, v4, v3 @@ -2990,19 +2987,18 @@ ; GFX10-LABEL: sdivrem_v2i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 -; GFX10-NEXT: s_mov_b32 s2, 0x100010 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_sext_i32_i16 s3, s1 -; GFX10-NEXT: s_bfe_i32 s1, s1, s2 -; GFX10-NEXT: s_ashr_i32 s8, s3, 31 -; GFX10-NEXT: s_ashr_i32 s9, s1, 31 -; GFX10-NEXT: s_add_i32 s3, s3, s8 -; GFX10-NEXT: s_add_i32 s1, s1, s9 -; GFX10-NEXT: s_xor_b32 s3, s3, s8 -; GFX10-NEXT: s_xor_b32 s1, s1, s9 -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX10-NEXT: s_sext_i32_i16 s2, s1 +; GFX10-NEXT: s_bfe_i32 s1, s1, 0x100010 +; GFX10-NEXT: s_ashr_i32 s3, s2, 31 +; GFX10-NEXT: s_ashr_i32 s8, s1, 31 +; GFX10-NEXT: s_add_i32 s2, s2, s3 +; GFX10-NEXT: s_add_i32 s1, s1, s8 +; GFX10-NEXT: s_xor_b32 s2, s2, s3 +; GFX10-NEXT: s_xor_b32 s1, s1, s8 +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s1 -; GFX10-NEXT: s_sub_i32 s6, 0, s3 +; GFX10-NEXT: s_sub_i32 s6, 0, s2 ; GFX10-NEXT: s_sub_i32 s7, 0, s1 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 @@ -3013,29 +3009,29 @@ ; GFX10-NEXT: v_mul_lo_u32 v2, s6, v0 ; GFX10-NEXT: v_mul_lo_u32 v3, s7, v1 ; GFX10-NEXT: s_sext_i32_i16 s6, s0 -; GFX10-NEXT: s_bfe_i32 s0, s0, s2 -; GFX10-NEXT: s_ashr_i32 s2, s6, 31 +; GFX10-NEXT: s_bfe_i32 s0, s0, 0x100010 +; GFX10-NEXT: s_ashr_i32 s9, s6, 31 ; GFX10-NEXT: s_ashr_i32 s10, s0, 31 -; GFX10-NEXT: s_add_i32 s6, s6, s2 +; GFX10-NEXT: s_add_i32 s6, s6, s9 ; GFX10-NEXT: s_add_i32 s0, s0, s10 ; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX10-NEXT: v_mul_hi_u32 v3, v1, v3 -; GFX10-NEXT: s_xor_b32 s6, s6, s2 +; GFX10-NEXT: s_xor_b32 s6, s6, s9 ; GFX10-NEXT: s_xor_b32 s0, s0, s10 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3 ; GFX10-NEXT: v_mul_hi_u32 v0, s6, v0 ; GFX10-NEXT: v_mul_hi_u32 v1, s0, v1 -; GFX10-NEXT: v_mul_lo_u32 v2, v0, s3 +; GFX10-NEXT: v_mul_lo_u32 v2, v0, s2 ; GFX10-NEXT: v_mul_lo_u32 v3, v1, s1 ; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0 ; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1 ; GFX10-NEXT: v_sub_nc_u32_e32 v2, s6, v2 ; GFX10-NEXT: v_sub_nc_u32_e32 v3, s0, v3 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 -; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s3, v2 +; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s2, v2 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s1, v3 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s3, v2 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s2, v2 ; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s1, v3 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 @@ -3043,28 +3039,27 @@ ; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo ; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s3, v2 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s2, v2 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s1, v3 -; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s3, v2 +; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s2, v2 ; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s1, v3 -; GFX10-NEXT: s_xor_b32 s1, s2, s8 +; GFX10-NEXT: s_xor_b32 s1, s9, s3 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo -; GFX10-NEXT: s_xor_b32 s0, s10, s9 +; GFX10-NEXT: s_xor_b32 s0, s10, s8 ; GFX10-NEXT: v_xor_b32_e32 v0, s1, v0 ; GFX10-NEXT: v_xor_b32_e32 v1, s0, v1 -; GFX10-NEXT: v_xor_b32_e32 v2, s2, v2 +; GFX10-NEXT: v_xor_b32_e32 v2, s9, v2 ; GFX10-NEXT: v_xor_b32_e32 v3, s10, v3 -; GFX10-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10-NEXT: v_subrev_nc_u32_e32 v0, s1, v0 ; GFX10-NEXT: v_sub_nc_u32_sdwa v1, v1, s0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_subrev_nc_u32_e32 v2, s2, v2 +; GFX10-NEXT: v_subrev_nc_u32_e32 v2, s9, v2 ; GFX10-NEXT: v_sub_nc_u32_sdwa v3, v3, s10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_and_or_b32 v0, v0, v4, v1 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_and_or_b32 v2, v2, v4, v3 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v2, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: global_store_dword v1, v2, s[6:7] @@ -3366,9 +3361,8 @@ ; GFX10-NEXT: v_xor_b32_e32 v1, s7, v1 ; GFX10-NEXT: v_subrev_nc_u32_e32 v0, s4, v0 ; GFX10-NEXT: v_subrev_nc_u32_e32 v1, s7, v1 -; GFX10-NEXT: s_mov_b32 s4, 0x7ffffff -; GFX10-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX10-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX10-NEXT: v_and_b32_e32 v0, 0x7ffffff, v0 +; GFX10-NEXT: v_and_b32_e32 v1, 0x7ffffff, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-NEXT: global_store_dword v2, v1, s[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll @@ -439,9 +439,8 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 -; GFX10-NEXT: s_brev_b32 s4, -4 -; GFX10-NEXT: v_and_b32_e32 v2, s4, v0 -; GFX10-NEXT: v_and_b32_e32 v4, s4, v1 +; GFX10-NEXT: v_and_b32_e32 v2, 0x3fffffff, v0 +; GFX10-NEXT: v_and_b32_e32 v4, 0x3fffffff, v1 ; GFX10-NEXT: v_mov_b32_e32 v5, v3 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], 2, v[2:3] ; GFX10-NEXT: v_lshlrev_b64 v[2:3], 2, v[4:5] @@ -523,9 +522,8 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_brev_b32 s4, -8 -; GFX10-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX10-NEXT: v_and_b32_e32 v2, s4, v1 +; GFX10-NEXT: v_and_b32_e32 v0, 0x1fffffff, v0 +; GFX10-NEXT: v_and_b32_e32 v2, 0x1fffffff, v1 ; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] @@ -608,13 +606,12 @@ define amdgpu_ps <2 x i32> @s_shl_v2i32_zext_v2i16(<2 x i16> inreg %x) { ; GFX7-LABEL: s_shl_v2i32_zext_v2i16: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_mov_b32 s2, 0xffff ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 -; GFX7-NEXT: s_and_b32 s0, s0, s2 +; GFX7-NEXT: s_and_b32 s0, s0, 0xffff ; GFX7-NEXT: s_or_b32 s0, s1, s0 ; GFX7-NEXT: s_and_b32 s0, s0, 0x3fff3fff ; GFX7-NEXT: s_lshr_b32 s1, s0, 16 -; GFX7-NEXT: s_and_b32 s0, s0, s2 +; GFX7-NEXT: s_and_b32 s0, s0, 0xffff ; GFX7-NEXT: s_lshl_b32 s0, s0, 2 ; GFX7-NEXT: s_lshl_b32 s1, s1, 2 ; GFX7-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll @@ -785,25 +785,23 @@ define amdgpu_ps i32 @s_shl_v2i16(<2 x i16> inreg %value, <2 x i16> inreg %amount) { ; GFX6-LABEL: s_shl_v2i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s4, 0xffff ; GFX6-NEXT: s_lshl_b32 s1, s1, s3 ; GFX6-NEXT: s_lshl_b32 s0, s0, s2 -; GFX6-NEXT: s_and_b32 s1, s1, s4 -; GFX6-NEXT: s_and_b32 s0, s0, s4 +; GFX6-NEXT: s_and_b32 s1, s1, 0xffff +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_shl_v2i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b32 s3, 0xffff ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, s3 -; GFX8-NEXT: s_lshr_b32 s4, s1, 16 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff +; GFX8-NEXT: s_lshr_b32 s3, s1, 16 ; GFX8-NEXT: s_lshl_b32 s0, s0, s1 -; GFX8-NEXT: s_lshl_b32 s1, s2, s4 +; GFX8-NEXT: s_lshl_b32 s1, s2, s3 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_and_b32 s0, s0, s3 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_or_b32 s0, s1, s0 ; GFX8-NEXT: ; return to shader part epilog ; @@ -869,10 +867,10 @@ define amdgpu_ps float @shl_v2i16_vs(<2 x i16> %value, <2 x i16> inreg %amount) { ; GFX6-LABEL: shl_v2i16_vs: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s2, 0xffff -; GFX6-NEXT: s_and_b32 s0, s0, s2 +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: v_lshlrev_b32_e32 v0, s0, v0 -; GFX6-NEXT: s_and_b32 s0, s1, s2 +; GFX6-NEXT: s_and_b32 s0, s1, 0xffff +; GFX6-NEXT: s_mov_b32 s2, 0xffff ; GFX6-NEXT: v_lshlrev_b32_e32 v1, s0, v1 ; GFX6-NEXT: v_and_b32_e32 v1, s2, v1 ; GFX6-NEXT: v_and_b32_e32 v0, s2, v0 @@ -970,39 +968,37 @@ define amdgpu_ps <2 x i32> @s_shl_v4i16(<4 x i16> inreg %value, <4 x i16> inreg %amount) { ; GFX6-LABEL: s_shl_v4i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s8, 0xffff ; GFX6-NEXT: s_lshl_b32 s1, s1, s5 ; GFX6-NEXT: s_lshl_b32 s0, s0, s4 -; GFX6-NEXT: s_and_b32 s1, s1, s8 +; GFX6-NEXT: s_and_b32 s1, s1, 0xffff ; GFX6-NEXT: s_lshl_b32 s2, s2, s6 ; GFX6-NEXT: s_lshl_b32 s3, s3, s7 -; GFX6-NEXT: s_and_b32 s0, s0, s8 +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s2, s8 -; GFX6-NEXT: s_and_b32 s2, s3, s8 +; GFX6-NEXT: s_and_b32 s1, s2, 0xffff +; GFX6-NEXT: s_and_b32 s2, s3, 0xffff ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 ; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_shl_v4i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b32 s6, 0xffff ; GFX8-NEXT: s_lshr_b32 s4, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, s6 -; GFX8-NEXT: s_lshr_b32 s7, s2, 16 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff +; GFX8-NEXT: s_lshr_b32 s6, s2, 16 ; GFX8-NEXT: s_lshr_b32 s5, s1, 16 -; GFX8-NEXT: s_and_b32 s1, s1, s6 -; GFX8-NEXT: s_lshr_b32 s8, s3, 16 +; GFX8-NEXT: s_and_b32 s1, s1, 0xffff +; GFX8-NEXT: s_lshr_b32 s7, s3, 16 ; GFX8-NEXT: s_lshl_b32 s0, s0, s2 -; GFX8-NEXT: s_lshl_b32 s2, s4, s7 +; GFX8-NEXT: s_lshl_b32 s2, s4, s6 ; GFX8-NEXT: s_lshl_b32 s1, s1, s3 -; GFX8-NEXT: s_lshl_b32 s3, s5, s8 +; GFX8-NEXT: s_lshl_b32 s3, s5, s7 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16 -; GFX8-NEXT: s_and_b32 s0, s0, s6 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_or_b32 s0, s2, s0 ; GFX8-NEXT: s_lshl_b32 s2, s3, 16 -; GFX8-NEXT: s_and_b32 s1, s1, s6 +; GFX8-NEXT: s_and_b32 s1, s1, 0xffff ; GFX8-NEXT: s_or_b32 s1, s2, s1 ; GFX8-NEXT: ; return to shader part epilog ; @@ -1144,67 +1140,65 @@ define amdgpu_ps <4 x i32> @s_shl_v8i16(<8 x i16> inreg %value, <8 x i16> inreg %amount) { ; GFX6-LABEL: s_shl_v8i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s16, 0xffff ; GFX6-NEXT: s_lshl_b32 s1, s1, s9 ; GFX6-NEXT: s_lshl_b32 s0, s0, s8 -; GFX6-NEXT: s_and_b32 s1, s1, s16 +; GFX6-NEXT: s_and_b32 s1, s1, 0xffff ; GFX6-NEXT: s_lshl_b32 s2, s2, s10 ; GFX6-NEXT: s_lshl_b32 s3, s3, s11 -; GFX6-NEXT: s_and_b32 s0, s0, s16 +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_lshl_b32 s5, s5, s13 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s2, s16 -; GFX6-NEXT: s_and_b32 s2, s3, s16 +; GFX6-NEXT: s_and_b32 s1, s2, 0xffff +; GFX6-NEXT: s_and_b32 s2, s3, 0xffff ; GFX6-NEXT: s_lshl_b32 s4, s4, s12 ; GFX6-NEXT: s_lshl_b32 s7, s7, s15 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_and_b32 s3, s5, s16 +; GFX6-NEXT: s_and_b32 s3, s5, 0xffff ; GFX6-NEXT: s_lshl_b32 s6, s6, s14 ; GFX6-NEXT: s_or_b32 s1, s1, s2 -; GFX6-NEXT: s_and_b32 s2, s4, s16 +; GFX6-NEXT: s_and_b32 s2, s4, 0xffff ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 -; GFX6-NEXT: s_and_b32 s4, s7, s16 +; GFX6-NEXT: s_and_b32 s4, s7, 0xffff ; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_and_b32 s3, s6, s16 +; GFX6-NEXT: s_and_b32 s3, s6, 0xffff ; GFX6-NEXT: s_lshl_b32 s4, s4, 16 ; GFX6-NEXT: s_or_b32 s3, s3, s4 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_shl_v8i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b32 s12, 0xffff ; GFX8-NEXT: s_lshr_b32 s8, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, s12 -; GFX8-NEXT: s_lshr_b32 s13, s4, 16 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff +; GFX8-NEXT: s_lshr_b32 s12, s4, 16 ; GFX8-NEXT: s_lshr_b32 s9, s1, 16 -; GFX8-NEXT: s_and_b32 s1, s1, s12 -; GFX8-NEXT: s_lshr_b32 s14, s5, 16 +; GFX8-NEXT: s_and_b32 s1, s1, 0xffff +; GFX8-NEXT: s_lshr_b32 s13, s5, 16 ; GFX8-NEXT: s_lshl_b32 s0, s0, s4 -; GFX8-NEXT: s_lshl_b32 s4, s8, s13 +; GFX8-NEXT: s_lshl_b32 s4, s8, s12 ; GFX8-NEXT: s_lshr_b32 s10, s2, 16 -; GFX8-NEXT: s_and_b32 s2, s2, s12 -; GFX8-NEXT: s_lshr_b32 s15, s6, 16 +; GFX8-NEXT: s_and_b32 s2, s2, 0xffff +; GFX8-NEXT: s_lshr_b32 s14, s6, 16 ; GFX8-NEXT: s_lshl_b32 s1, s1, s5 -; GFX8-NEXT: s_lshl_b32 s5, s9, s14 +; GFX8-NEXT: s_lshl_b32 s5, s9, s13 ; GFX8-NEXT: s_lshl_b32 s4, s4, 16 -; GFX8-NEXT: s_and_b32 s0, s0, s12 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_lshr_b32 s11, s3, 16 -; GFX8-NEXT: s_and_b32 s3, s3, s12 -; GFX8-NEXT: s_lshr_b32 s16, s7, 16 +; GFX8-NEXT: s_and_b32 s3, s3, 0xffff +; GFX8-NEXT: s_lshr_b32 s15, s7, 16 ; GFX8-NEXT: s_lshl_b32 s2, s2, s6 -; GFX8-NEXT: s_lshl_b32 s6, s10, s15 +; GFX8-NEXT: s_lshl_b32 s6, s10, s14 ; GFX8-NEXT: s_or_b32 s0, s4, s0 ; GFX8-NEXT: s_lshl_b32 s4, s5, 16 -; GFX8-NEXT: s_and_b32 s1, s1, s12 +; GFX8-NEXT: s_and_b32 s1, s1, 0xffff ; GFX8-NEXT: s_lshl_b32 s3, s3, s7 -; GFX8-NEXT: s_lshl_b32 s7, s11, s16 +; GFX8-NEXT: s_lshl_b32 s7, s11, s15 ; GFX8-NEXT: s_or_b32 s1, s4, s1 ; GFX8-NEXT: s_lshl_b32 s4, s6, 16 -; GFX8-NEXT: s_and_b32 s2, s2, s12 +; GFX8-NEXT: s_and_b32 s2, s2, 0xffff ; GFX8-NEXT: s_or_b32 s2, s4, s2 ; GFX8-NEXT: s_lshl_b32 s4, s7, 16 -; GFX8-NEXT: s_and_b32 s3, s3, s12 +; GFX8-NEXT: s_and_b32 s3, s3, 0xffff ; GFX8-NEXT: s_or_b32 s3, s4, s3 ; GFX8-NEXT: ; return to shader part epilog ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll @@ -377,13 +377,13 @@ ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v4 ; CGP-NEXT: v_subrev_i32_e32 v3, vcc, s4, v0 -; CGP-NEXT: v_sub_i32_e32 v4, vcc, v1, v5 +; CGP-NEXT: v_subrev_i32_e32 v4, vcc, 0x1000, v1 ; CGP-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 ; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 ; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; CGP-NEXT: v_subrev_i32_e32 v3, vcc, s4, v0 -; CGP-NEXT: v_sub_i32_e32 v4, vcc, v1, v5 +; CGP-NEXT: v_subrev_i32_e32 v4, vcc, 0x1000, v1 ; CGP-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 ; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 @@ -508,13 +508,13 @@ ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v4 ; CGP-NEXT: v_subrev_i32_e32 v3, vcc, s4, v0 -; CGP-NEXT: v_sub_i32_e32 v4, vcc, v1, v5 +; CGP-NEXT: v_subrev_i32_e32 v4, vcc, 0x12d8fb, v1 ; CGP-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 ; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 ; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; CGP-NEXT: v_subrev_i32_e32 v3, vcc, s4, v0 -; CGP-NEXT: v_sub_i32_e32 v4, vcc, v1, v5 +; CGP-NEXT: v_subrev_i32_e32 v4, vcc, 0x12d8fb, v1 ; CGP-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 ; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll @@ -1164,9 +1164,8 @@ ; GISEL-LABEL: v_srem_v2i64_pow2k_denom: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_movk_i32 s10, 0x1000 ; GISEL-NEXT: s_mov_b32 s6, 0 -; GISEL-NEXT: s_add_u32 s4, s10, 0 +; GISEL-NEXT: s_add_u32 s4, 0x1000, 0 ; GISEL-NEXT: s_mov_b32 s7, s6 ; GISEL-NEXT: s_addc_u32 s5, 0, 0 ; GISEL-NEXT: s_xor_b64 s[8:9], s[4:5], s[6:7] @@ -1294,7 +1293,7 @@ ; GISEL-NEXT: v_subrev_i32_e32 v9, vcc, s8, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[4:5] ; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; GISEL-NEXT: s_add_u32 s4, s10, 0 +; GISEL-NEXT: s_add_u32 s4, 0x1000, 0 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; GISEL-NEXT: s_addc_u32 s5, 0, 0 ; GISEL-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc @@ -1860,9 +1859,8 @@ ; GISEL-LABEL: v_srem_v2i64_oddk_denom: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s10, 0x12d8fb ; GISEL-NEXT: s_mov_b32 s6, 0 -; GISEL-NEXT: s_add_u32 s4, s10, 0 +; GISEL-NEXT: s_add_u32 s4, 0x12d8fb, 0 ; GISEL-NEXT: s_mov_b32 s7, s6 ; GISEL-NEXT: s_addc_u32 s5, 0, 0 ; GISEL-NEXT: s_xor_b64 s[8:9], s[4:5], s[6:7] @@ -1990,7 +1988,7 @@ ; GISEL-NEXT: v_subrev_i32_e32 v9, vcc, s8, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[4:5] ; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; GISEL-NEXT: s_add_u32 s4, s10, 0 +; GISEL-NEXT: s_add_u32 s4, 0x12d8fb, 0 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; GISEL-NEXT: s_addc_u32 s5, 0, 0 ; GISEL-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll @@ -320,12 +320,11 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_mov_b32 s4, 8 -; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX10-NEXT: v_lshrrev_b32_sdwa v3, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_lshrrev_b32_sdwa v4, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_sdwa v3, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: s_movk_i32 s4, 0xff -; GFX10-NEXT: v_and_or_b32 v0, v0, v2, v3 -; GFX10-NEXT: v_and_or_b32 v1, v1, v2, v4 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v2 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v3 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_sub_i16 v0, v0, v1 clamp @@ -345,31 +344,28 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_lshr_b32 s2, s0, 8 ; GFX6-NEXT: s_lshl_b32 s0, s0, 24 -; GFX6-NEXT: s_brev_b32 s4, -2 -; GFX6-NEXT: s_max_i32 s6, s0, -1 +; GFX6-NEXT: s_max_i32 s4, s0, -1 ; GFX6-NEXT: s_lshr_b32 s3, s1, 8 ; GFX6-NEXT: s_lshl_b32 s1, s1, 24 -; GFX6-NEXT: s_brev_b32 s5, 1 -; GFX6-NEXT: s_sub_i32 s6, s6, s4 -; GFX6-NEXT: s_min_i32 s7, s0, -1 -; GFX6-NEXT: s_sub_i32 s7, s7, s5 -; GFX6-NEXT: s_max_i32 s1, s6, s1 -; GFX6-NEXT: s_min_i32 s1, s1, s7 +; GFX6-NEXT: s_sub_i32 s4, s4, 0x7fffffff +; GFX6-NEXT: s_min_i32 s5, s0, -1 +; GFX6-NEXT: s_sub_i32 s5, s5, 0x80000000 +; GFX6-NEXT: s_max_i32 s1, s4, s1 +; GFX6-NEXT: s_min_i32 s1, s1, s5 ; GFX6-NEXT: s_sub_i32 s0, s0, s1 ; GFX6-NEXT: s_lshl_b32 s1, s2, 24 ; GFX6-NEXT: s_lshl_b32 s2, s3, 24 ; GFX6-NEXT: s_max_i32 s3, s1, -1 -; GFX6-NEXT: s_sub_i32 s3, s3, s4 +; GFX6-NEXT: s_sub_i32 s3, s3, 0x7fffffff ; GFX6-NEXT: s_min_i32 s4, s1, -1 -; GFX6-NEXT: s_sub_i32 s4, s4, s5 +; GFX6-NEXT: s_sub_i32 s4, s4, 0x80000000 ; GFX6-NEXT: s_max_i32 s2, s3, s2 ; GFX6-NEXT: s_min_i32 s2, s2, s4 ; GFX6-NEXT: s_sub_i32 s1, s1, s2 ; GFX6-NEXT: s_ashr_i32 s1, s1, 24 -; GFX6-NEXT: s_movk_i32 s2, 0xff ; GFX6-NEXT: s_ashr_i32 s0, s0, 24 -; GFX6-NEXT: s_and_b32 s1, s1, s2 -; GFX6-NEXT: s_and_b32 s0, s0, s2 +; GFX6-NEXT: s_and_b32 s1, s1, 0xff +; GFX6-NEXT: s_and_b32 s0, s0, 0xff ; GFX6-NEXT: s_lshl_b32 s1, s1, 8 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: ; return to shader part epilog @@ -379,32 +375,30 @@ ; GFX8-NEXT: s_bfe_u32 s4, 8, 0x100000 ; GFX8-NEXT: s_lshr_b32 s2, s0, 8 ; GFX8-NEXT: s_lshl_b32 s0, s0, s4 -; GFX8-NEXT: s_sext_i32_i16 s7, s0 -; GFX8-NEXT: s_sext_i32_i16 s8, -1 -; GFX8-NEXT: s_movk_i32 s5, 0x7fff -; GFX8-NEXT: s_max_i32 s9, s7, s8 +; GFX8-NEXT: s_sext_i32_i16 s5, s0 +; GFX8-NEXT: s_sext_i32_i16 s6, -1 +; GFX8-NEXT: s_max_i32 s7, s5, s6 ; GFX8-NEXT: s_lshr_b32 s3, s1, 8 ; GFX8-NEXT: s_lshl_b32 s1, s1, s4 -; GFX8-NEXT: s_sub_i32 s9, s9, s5 -; GFX8-NEXT: s_movk_i32 s6, 0x8000 -; GFX8-NEXT: s_min_i32 s7, s7, s8 -; GFX8-NEXT: s_sext_i32_i16 s9, s9 +; GFX8-NEXT: s_sub_i32 s7, s7, 0x7fff +; GFX8-NEXT: s_min_i32 s5, s5, s6 +; GFX8-NEXT: s_sext_i32_i16 s7, s7 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_sub_i32 s7, s7, s6 -; GFX8-NEXT: s_max_i32 s1, s9, s1 +; GFX8-NEXT: s_sub_i32 s5, s5, 0xffff8000 +; GFX8-NEXT: s_max_i32 s1, s7, s1 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_sext_i32_i16 s7, s7 -; GFX8-NEXT: s_min_i32 s1, s1, s7 +; GFX8-NEXT: s_sext_i32_i16 s5, s5 +; GFX8-NEXT: s_min_i32 s1, s1, s5 ; GFX8-NEXT: s_sub_i32 s0, s0, s1 ; GFX8-NEXT: s_lshl_b32 s1, s2, s4 ; GFX8-NEXT: s_lshl_b32 s2, s3, s4 ; GFX8-NEXT: s_sext_i32_i16 s3, s1 -; GFX8-NEXT: s_max_i32 s7, s3, s8 -; GFX8-NEXT: s_sub_i32 s5, s7, s5 -; GFX8-NEXT: s_min_i32 s3, s3, s8 +; GFX8-NEXT: s_max_i32 s5, s3, s6 +; GFX8-NEXT: s_sub_i32 s5, s5, 0x7fff +; GFX8-NEXT: s_min_i32 s3, s3, s6 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 -; GFX8-NEXT: s_sub_i32 s3, s3, s6 +; GFX8-NEXT: s_sub_i32 s3, s3, 0xffff8000 ; GFX8-NEXT: s_max_i32 s2, s5, s2 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 @@ -413,10 +407,9 @@ ; GFX8-NEXT: s_sext_i32_i16 s1, s1 ; GFX8-NEXT: s_sext_i32_i16 s0, s0 ; GFX8-NEXT: s_ashr_i32 s1, s1, s4 -; GFX8-NEXT: s_movk_i32 s2, 0xff ; GFX8-NEXT: s_ashr_i32 s0, s0, s4 -; GFX8-NEXT: s_and_b32 s1, s1, s2 -; GFX8-NEXT: s_and_b32 s0, s0, s2 +; GFX8-NEXT: s_and_b32 s1, s1, 0xff +; GFX8-NEXT: s_and_b32 s0, s0, 0xff ; GFX8-NEXT: s_lshl_b32 s1, s1, s4 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog @@ -424,17 +417,16 @@ ; GFX9-LABEL: s_ssubsat_v2i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_lshr_b32 s2, s0, 8 -; GFX9-NEXT: s_lshr_b32 s3, s1, 8 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX9-NEXT: s_lshr_b32 s3, s1, 8 +; GFX9-NEXT: s_lshr_b32 s2, s0, 16 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s3 -; GFX9-NEXT: s_mov_b32 s2, 0x80008 -; GFX9-NEXT: s_lshr_b32 s3, s0, 16 -; GFX9-NEXT: s_lshl_b32 s0, s0, s2 -; GFX9-NEXT: s_lshl_b32 s3, s3, 8 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s3 -; GFX9-NEXT: s_lshr_b32 s3, s1, 16 -; GFX9-NEXT: s_lshl_b32 s1, s1, s2 -; GFX9-NEXT: s_lshl_b32 s2, s3, 8 +; GFX9-NEXT: s_lshl_b32 s0, s0, 0x80008 +; GFX9-NEXT: s_lshl_b32 s2, s2, 8 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX9-NEXT: s_lshr_b32 s2, s1, 16 +; GFX9-NEXT: s_lshl_b32 s1, s1, 0x80008 +; GFX9-NEXT: s_lshl_b32 s2, s2, 8 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s2 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: v_pk_sub_i16 v0, s0, v0 clamp @@ -451,15 +443,14 @@ ; GFX10-NEXT: s_lshr_b32 s3, s1, 8 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s3 -; GFX10-NEXT: s_mov_b32 s2, 0x80008 -; GFX10-NEXT: s_lshr_b32 s3, s0, 16 -; GFX10-NEXT: s_lshr_b32 s4, s1, 16 -; GFX10-NEXT: s_lshl_b32 s0, s0, s2 +; GFX10-NEXT: s_lshr_b32 s2, s0, 16 +; GFX10-NEXT: s_lshr_b32 s3, s1, 16 +; GFX10-NEXT: s_lshl_b32 s0, s0, 0x80008 +; GFX10-NEXT: s_lshl_b32 s2, s2, 8 +; GFX10-NEXT: s_lshl_b32 s1, s1, 0x80008 ; GFX10-NEXT: s_lshl_b32 s3, s3, 8 -; GFX10-NEXT: s_lshl_b32 s1, s1, s2 -; GFX10-NEXT: s_lshl_b32 s2, s4, 8 -; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s3 -; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s2 +; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s3 ; GFX10-NEXT: v_pk_sub_i16 v0, s0, s1 clamp ; GFX10-NEXT: s_movk_i32 s0, 0xff ; GFX10-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] @@ -639,35 +630,33 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v1 ; GFX10-NEXT: s_mov_b32 s4, 8 -; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX10-NEXT: v_lshrrev_b32_sdwa v3, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX10-NEXT: v_lshrrev_b32_sdwa v7, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX10-NEXT: v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_sdwa v6, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX10-NEXT: v_and_or_b32 v0, v0, v2, v3 -; GFX10-NEXT: v_and_or_b32 v1, v1, v2, v7 -; GFX10-NEXT: s_movk_i32 s4, 0xff -; GFX10-NEXT: v_and_or_b32 v3, v6, v2, v4 -; GFX10-NEXT: v_and_or_b32 v2, v8, v2, v5 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v2 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v6 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v5, v3 +; GFX10-NEXT: v_and_or_b32 v3, 0xffff, v7, v4 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX10-NEXT: v_mov_b32_e32 v4, 24 -; GFX10-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1] +; GFX10-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_sub_i16 v0, v0, v1 clamp -; GFX10-NEXT: v_pk_sub_i16 v1, v3, v2 clamp +; GFX10-NEXT: v_pk_sub_i16 v1, v2, v3 clamp ; GFX10-NEXT: v_mov_b32_e32 v2, 8 ; GFX10-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_ashrrev_i16 v1, 8, v1 op_sel_hi:[0,1] ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_b32_e32 v3, s4, v1 +; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v0, v0, s4, v2 +; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -685,59 +674,56 @@ ; GFX6-NEXT: s_lshr_b32 s3, s0, 16 ; GFX6-NEXT: s_lshr_b32 s4, s0, 24 ; GFX6-NEXT: s_lshl_b32 s0, s0, 24 -; GFX6-NEXT: s_brev_b32 s8, -2 -; GFX6-NEXT: s_max_i32 s10, s0, -1 +; GFX6-NEXT: s_max_i32 s8, s0, -1 ; GFX6-NEXT: s_lshr_b32 s5, s1, 8 ; GFX6-NEXT: s_lshr_b32 s6, s1, 16 ; GFX6-NEXT: s_lshr_b32 s7, s1, 24 ; GFX6-NEXT: s_lshl_b32 s1, s1, 24 -; GFX6-NEXT: s_brev_b32 s9, 1 -; GFX6-NEXT: s_sub_i32 s10, s10, s8 -; GFX6-NEXT: s_min_i32 s11, s0, -1 -; GFX6-NEXT: s_sub_i32 s11, s11, s9 -; GFX6-NEXT: s_max_i32 s1, s10, s1 -; GFX6-NEXT: s_min_i32 s1, s1, s11 +; GFX6-NEXT: s_sub_i32 s8, s8, 0x7fffffff +; GFX6-NEXT: s_min_i32 s9, s0, -1 +; GFX6-NEXT: s_sub_i32 s9, s9, 0x80000000 +; GFX6-NEXT: s_max_i32 s1, s8, s1 +; GFX6-NEXT: s_min_i32 s1, s1, s9 ; GFX6-NEXT: s_sub_i32 s0, s0, s1 ; GFX6-NEXT: s_lshl_b32 s1, s2, 24 ; GFX6-NEXT: s_lshl_b32 s2, s5, 24 ; GFX6-NEXT: s_max_i32 s5, s1, -1 -; GFX6-NEXT: s_sub_i32 s5, s5, s8 -; GFX6-NEXT: s_min_i32 s10, s1, -1 -; GFX6-NEXT: s_sub_i32 s10, s10, s9 +; GFX6-NEXT: s_sub_i32 s5, s5, 0x7fffffff +; GFX6-NEXT: s_min_i32 s8, s1, -1 +; GFX6-NEXT: s_sub_i32 s8, s8, 0x80000000 ; GFX6-NEXT: s_max_i32 s2, s5, s2 -; GFX6-NEXT: s_min_i32 s2, s2, s10 +; GFX6-NEXT: s_min_i32 s2, s2, s8 ; GFX6-NEXT: s_sub_i32 s1, s1, s2 ; GFX6-NEXT: s_lshl_b32 s2, s3, 24 ; GFX6-NEXT: s_max_i32 s5, s2, -1 ; GFX6-NEXT: s_lshl_b32 s3, s6, 24 -; GFX6-NEXT: s_sub_i32 s5, s5, s8 +; GFX6-NEXT: s_sub_i32 s5, s5, 0x7fffffff ; GFX6-NEXT: s_min_i32 s6, s2, -1 -; GFX6-NEXT: s_sub_i32 s6, s6, s9 +; GFX6-NEXT: s_sub_i32 s6, s6, 0x80000000 ; GFX6-NEXT: s_max_i32 s3, s5, s3 ; GFX6-NEXT: s_min_i32 s3, s3, s6 ; GFX6-NEXT: s_sub_i32 s2, s2, s3 ; GFX6-NEXT: s_lshl_b32 s3, s4, 24 ; GFX6-NEXT: s_max_i32 s5, s3, -1 +; GFX6-NEXT: s_ashr_i32 s1, s1, 24 ; GFX6-NEXT: s_lshl_b32 s4, s7, 24 -; GFX6-NEXT: s_sub_i32 s5, s5, s8 +; GFX6-NEXT: s_sub_i32 s5, s5, 0x7fffffff ; GFX6-NEXT: s_min_i32 s6, s3, -1 -; GFX6-NEXT: s_sub_i32 s6, s6, s9 -; GFX6-NEXT: s_max_i32 s4, s5, s4 -; GFX6-NEXT: s_min_i32 s4, s4, s6 -; GFX6-NEXT: s_ashr_i32 s1, s1, 24 -; GFX6-NEXT: s_sub_i32 s3, s3, s4 -; GFX6-NEXT: s_movk_i32 s4, 0xff ; GFX6-NEXT: s_ashr_i32 s0, s0, 24 -; GFX6-NEXT: s_and_b32 s1, s1, s4 +; GFX6-NEXT: s_sub_i32 s6, s6, 0x80000000 +; GFX6-NEXT: s_max_i32 s4, s5, s4 +; GFX6-NEXT: s_and_b32 s1, s1, 0xff ; GFX6-NEXT: s_ashr_i32 s2, s2, 24 -; GFX6-NEXT: s_and_b32 s0, s0, s4 +; GFX6-NEXT: s_min_i32 s4, s4, s6 +; GFX6-NEXT: s_and_b32 s0, s0, 0xff ; GFX6-NEXT: s_lshl_b32 s1, s1, 8 +; GFX6-NEXT: s_sub_i32 s3, s3, s4 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s2, s4 +; GFX6-NEXT: s_and_b32 s1, s2, 0xff ; GFX6-NEXT: s_ashr_i32 s3, s3, 24 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s3, s4 +; GFX6-NEXT: s_and_b32 s1, s3, 0xff ; GFX6-NEXT: s_lshl_b32 s1, s1, 24 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: ; return to shader part epilog @@ -749,35 +735,33 @@ ; GFX8-NEXT: s_lshr_b32 s3, s0, 16 ; GFX8-NEXT: s_lshr_b32 s4, s0, 24 ; GFX8-NEXT: s_lshl_b32 s0, s0, s8 -; GFX8-NEXT: s_sext_i32_i16 s11, s0 -; GFX8-NEXT: s_sext_i32_i16 s12, -1 -; GFX8-NEXT: s_movk_i32 s9, 0x7fff -; GFX8-NEXT: s_max_i32 s13, s11, s12 +; GFX8-NEXT: s_sext_i32_i16 s9, s0 +; GFX8-NEXT: s_sext_i32_i16 s10, -1 +; GFX8-NEXT: s_max_i32 s11, s9, s10 ; GFX8-NEXT: s_lshr_b32 s5, s1, 8 ; GFX8-NEXT: s_lshr_b32 s6, s1, 16 ; GFX8-NEXT: s_lshr_b32 s7, s1, 24 ; GFX8-NEXT: s_lshl_b32 s1, s1, s8 -; GFX8-NEXT: s_sub_i32 s13, s13, s9 -; GFX8-NEXT: s_movk_i32 s10, 0x8000 -; GFX8-NEXT: s_min_i32 s11, s11, s12 -; GFX8-NEXT: s_sext_i32_i16 s13, s13 +; GFX8-NEXT: s_sub_i32 s11, s11, 0x7fff +; GFX8-NEXT: s_min_i32 s9, s9, s10 +; GFX8-NEXT: s_sext_i32_i16 s11, s11 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_sub_i32 s11, s11, s10 -; GFX8-NEXT: s_max_i32 s1, s13, s1 +; GFX8-NEXT: s_sub_i32 s9, s9, 0xffff8000 +; GFX8-NEXT: s_max_i32 s1, s11, s1 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_sext_i32_i16 s11, s11 -; GFX8-NEXT: s_min_i32 s1, s1, s11 +; GFX8-NEXT: s_sext_i32_i16 s9, s9 +; GFX8-NEXT: s_min_i32 s1, s1, s9 ; GFX8-NEXT: s_sub_i32 s0, s0, s1 ; GFX8-NEXT: s_lshl_b32 s1, s2, s8 ; GFX8-NEXT: s_lshl_b32 s2, s5, s8 ; GFX8-NEXT: s_sext_i32_i16 s5, s1 -; GFX8-NEXT: s_max_i32 s11, s5, s12 -; GFX8-NEXT: s_sub_i32 s11, s11, s9 -; GFX8-NEXT: s_min_i32 s5, s5, s12 -; GFX8-NEXT: s_sext_i32_i16 s11, s11 +; GFX8-NEXT: s_max_i32 s9, s5, s10 +; GFX8-NEXT: s_sub_i32 s9, s9, 0x7fff +; GFX8-NEXT: s_min_i32 s5, s5, s10 +; GFX8-NEXT: s_sext_i32_i16 s9, s9 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 -; GFX8-NEXT: s_sub_i32 s5, s5, s10 -; GFX8-NEXT: s_max_i32 s2, s11, s2 +; GFX8-NEXT: s_sub_i32 s5, s5, 0xffff8000 +; GFX8-NEXT: s_max_i32 s2, s9, s2 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 ; GFX8-NEXT: s_min_i32 s2, s2, s5 @@ -785,12 +769,12 @@ ; GFX8-NEXT: s_lshl_b32 s2, s3, s8 ; GFX8-NEXT: s_sext_i32_i16 s5, s2 ; GFX8-NEXT: s_lshl_b32 s3, s6, s8 -; GFX8-NEXT: s_max_i32 s6, s5, s12 -; GFX8-NEXT: s_sub_i32 s6, s6, s9 -; GFX8-NEXT: s_min_i32 s5, s5, s12 +; GFX8-NEXT: s_max_i32 s6, s5, s10 +; GFX8-NEXT: s_sub_i32 s6, s6, 0x7fff +; GFX8-NEXT: s_min_i32 s5, s5, s10 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 -; GFX8-NEXT: s_sub_i32 s5, s5, s10 +; GFX8-NEXT: s_sub_i32 s5, s5, 0xffff8000 ; GFX8-NEXT: s_max_i32 s3, s6, s3 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 @@ -798,35 +782,34 @@ ; GFX8-NEXT: s_sub_i32 s2, s2, s3 ; GFX8-NEXT: s_lshl_b32 s3, s4, s8 ; GFX8-NEXT: s_sext_i32_i16 s5, s3 -; GFX8-NEXT: s_max_i32 s6, s5, s12 +; GFX8-NEXT: s_max_i32 s6, s5, s10 ; GFX8-NEXT: s_lshl_b32 s4, s7, s8 -; GFX8-NEXT: s_sub_i32 s6, s6, s9 -; GFX8-NEXT: s_min_i32 s5, s5, s12 +; GFX8-NEXT: s_sub_i32 s6, s6, 0x7fff +; GFX8-NEXT: s_min_i32 s5, s5, s10 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 -; GFX8-NEXT: s_sub_i32 s5, s5, s10 -; GFX8-NEXT: s_max_i32 s4, s6, s4 -; GFX8-NEXT: s_sext_i32_i16 s4, s4 -; GFX8-NEXT: s_sext_i32_i16 s5, s5 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_min_i32 s4, s4, s5 +; GFX8-NEXT: s_sub_i32 s5, s5, 0xffff8000 +; GFX8-NEXT: s_max_i32 s4, s6, s4 ; GFX8-NEXT: s_sext_i32_i16 s0, s0 ; GFX8-NEXT: s_ashr_i32 s1, s1, s8 -; GFX8-NEXT: s_sub_i32 s3, s3, s4 -; GFX8-NEXT: s_movk_i32 s4, 0xff +; GFX8-NEXT: s_sext_i32_i16 s4, s4 +; GFX8-NEXT: s_sext_i32_i16 s5, s5 ; GFX8-NEXT: s_ashr_i32 s0, s0, s8 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 -; GFX8-NEXT: s_and_b32 s1, s1, s4 +; GFX8-NEXT: s_min_i32 s4, s4, s5 +; GFX8-NEXT: s_and_b32 s1, s1, 0xff ; GFX8-NEXT: s_ashr_i32 s2, s2, s8 -; GFX8-NEXT: s_and_b32 s0, s0, s4 +; GFX8-NEXT: s_sub_i32 s3, s3, s4 +; GFX8-NEXT: s_and_b32 s0, s0, 0xff ; GFX8-NEXT: s_lshl_b32 s1, s1, 8 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s1, s2, s4 +; GFX8-NEXT: s_and_b32 s1, s2, 0xff ; GFX8-NEXT: s_ashr_i32 s3, s3, s8 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s1, s3, s4 +; GFX8-NEXT: s_and_b32 s1, s3, 0xff ; GFX8-NEXT: s_lshl_b32 s1, s1, 24 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog @@ -838,27 +821,26 @@ ; GFX9-NEXT: s_lshr_b32 s6, s0, 24 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s3 ; GFX9-NEXT: s_pack_ll_b32_b16 s3, s4, s6 -; GFX9-NEXT: s_mov_b32 s4, 0x80008 -; GFX9-NEXT: s_lshr_b32 s6, s0, 16 +; GFX9-NEXT: s_lshr_b32 s4, s0, 16 ; GFX9-NEXT: s_lshr_b32 s7, s1, 8 -; GFX9-NEXT: s_lshl_b32 s0, s0, s4 -; GFX9-NEXT: s_lshl_b32 s6, s6, 8 +; GFX9-NEXT: s_lshl_b32 s0, s0, 0x80008 +; GFX9-NEXT: s_lshl_b32 s4, s4, 8 ; GFX9-NEXT: s_lshr_b32 s8, s1, 16 ; GFX9-NEXT: s_lshr_b32 s9, s1, 24 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s6 -; GFX9-NEXT: s_lshr_b32 s6, s3, 16 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s4 +; GFX9-NEXT: s_lshr_b32 s4, s3, 16 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s7 -; GFX9-NEXT: s_lshl_b32 s3, s3, s4 +; GFX9-NEXT: s_lshl_b32 s3, s3, 0x80008 +; GFX9-NEXT: s_lshl_b32 s4, s4, 8 +; GFX9-NEXT: s_lshr_b32 s6, s1, 16 +; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s8, s9 +; GFX9-NEXT: s_lshl_b32 s1, s1, 0x80008 +; GFX9-NEXT: s_lshl_b32 s6, s6, 8 +; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s6 +; GFX9-NEXT: s_lshr_b32 s6, s4, 16 +; GFX9-NEXT: s_lshl_b32 s4, s4, 0x80008 ; GFX9-NEXT: s_lshl_b32 s6, s6, 8 -; GFX9-NEXT: s_lshr_b32 s7, s1, 16 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s6, s8, s9 -; GFX9-NEXT: s_lshl_b32 s1, s1, s4 -; GFX9-NEXT: s_lshl_b32 s7, s7, 8 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s7 -; GFX9-NEXT: s_lshr_b32 s7, s6, 16 -; GFX9-NEXT: s_lshl_b32 s4, s6, s4 -; GFX9-NEXT: s_lshl_b32 s6, s7, 8 ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s6 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: v_pk_sub_i16 v0, s0, v0 clamp @@ -885,39 +867,37 @@ ; GFX10-NEXT: s_lshr_b32 s4, s0, 24 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 ; GFX10-NEXT: s_pack_ll_b32_b16 s2, s3, s4 -; GFX10-NEXT: s_mov_b32 s3, 0x80008 -; GFX10-NEXT: s_lshr_b32 s4, s0, 16 +; GFX10-NEXT: s_lshr_b32 s3, s0, 16 ; GFX10-NEXT: s_lshr_b32 s5, s1, 8 ; GFX10-NEXT: s_lshr_b32 s6, s1, 16 ; GFX10-NEXT: s_lshr_b32 s7, s1, 24 -; GFX10-NEXT: s_lshl_b32 s0, s0, s3 +; GFX10-NEXT: s_lshl_b32 s0, s0, 0x80008 +; GFX10-NEXT: s_lshl_b32 s3, s3, 8 +; GFX10-NEXT: s_lshr_b32 s4, s2, 16 +; GFX10-NEXT: s_lshl_b32 s2, s2, 0x80008 ; GFX10-NEXT: s_lshl_b32 s4, s4, 8 +; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s3 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5 -; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s4 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, s6, s7 -; GFX10-NEXT: s_lshr_b32 s8, s2, 16 -; GFX10-NEXT: s_lshr_b32 s5, s1, 16 -; GFX10-NEXT: s_lshr_b32 s6, s4, 16 -; GFX10-NEXT: s_lshl_b32 s2, s2, s3 -; GFX10-NEXT: s_lshl_b32 s8, s8, 8 -; GFX10-NEXT: s_lshl_b32 s1, s1, s3 +; GFX10-NEXT: s_pack_ll_b32_b16 s3, s6, s7 +; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s4 +; GFX10-NEXT: s_lshr_b32 s4, s1, 16 +; GFX10-NEXT: s_lshr_b32 s5, s3, 16 +; GFX10-NEXT: s_lshl_b32 s1, s1, 0x80008 +; GFX10-NEXT: s_lshl_b32 s4, s4, 8 +; GFX10-NEXT: s_lshl_b32 s3, s3, 0x80008 ; GFX10-NEXT: s_lshl_b32 s5, s5, 8 -; GFX10-NEXT: s_lshl_b32 s3, s4, s3 -; GFX10-NEXT: s_lshl_b32 s4, s6, 8 -; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s8 -; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5 -; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s4 +; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s5 ; GFX10-NEXT: v_pk_sub_i16 v0, s0, s1 clamp ; GFX10-NEXT: v_pk_sub_i16 v1, s2, s3 clamp ; GFX10-NEXT: s_mov_b32 s0, 8 -; GFX10-NEXT: s_movk_i32 s1, 0xff ; GFX10-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_ashrrev_i16 v1, 8, v1 op_sel_hi:[0,1] ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_b32_e32 v3, s1, v1 +; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v1 ; GFX10-NEXT: s_mov_b32 s0, 24 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v0, v0, s1, v2 +; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 @@ -1255,19 +1235,17 @@ define amdgpu_ps <2 x i32> @s_ssubsat_v2i32(<2 x i32> inreg %lhs, <2 x i32> inreg %rhs) { ; GFX6-LABEL: s_ssubsat_v2i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_brev_b32 s4, -2 -; GFX6-NEXT: s_max_i32 s6, s0, -1 -; GFX6-NEXT: s_brev_b32 s5, 1 -; GFX6-NEXT: s_sub_i32 s6, s6, s4 -; GFX6-NEXT: s_min_i32 s7, s0, -1 -; GFX6-NEXT: s_sub_i32 s7, s7, s5 -; GFX6-NEXT: s_max_i32 s2, s6, s2 -; GFX6-NEXT: s_min_i32 s2, s2, s7 +; GFX6-NEXT: s_max_i32 s4, s0, -1 +; GFX6-NEXT: s_sub_i32 s4, s4, 0x7fffffff +; GFX6-NEXT: s_min_i32 s5, s0, -1 +; GFX6-NEXT: s_sub_i32 s5, s5, 0x80000000 +; GFX6-NEXT: s_max_i32 s2, s4, s2 +; GFX6-NEXT: s_min_i32 s2, s2, s5 ; GFX6-NEXT: s_sub_i32 s0, s0, s2 ; GFX6-NEXT: s_max_i32 s2, s1, -1 -; GFX6-NEXT: s_sub_i32 s2, s2, s4 +; GFX6-NEXT: s_sub_i32 s2, s2, 0x7fffffff ; GFX6-NEXT: s_min_i32 s4, s1, -1 -; GFX6-NEXT: s_sub_i32 s4, s4, s5 +; GFX6-NEXT: s_sub_i32 s4, s4, 0x80000000 ; GFX6-NEXT: s_max_i32 s2, s2, s3 ; GFX6-NEXT: s_min_i32 s2, s2, s4 ; GFX6-NEXT: s_sub_i32 s1, s1, s2 @@ -1275,19 +1253,17 @@ ; ; GFX8-LABEL: s_ssubsat_v2i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_brev_b32 s4, -2 -; GFX8-NEXT: s_max_i32 s6, s0, -1 -; GFX8-NEXT: s_brev_b32 s5, 1 -; GFX8-NEXT: s_sub_i32 s6, s6, s4 -; GFX8-NEXT: s_min_i32 s7, s0, -1 -; GFX8-NEXT: s_sub_i32 s7, s7, s5 -; GFX8-NEXT: s_max_i32 s2, s6, s2 -; GFX8-NEXT: s_min_i32 s2, s2, s7 +; GFX8-NEXT: s_max_i32 s4, s0, -1 +; GFX8-NEXT: s_sub_i32 s4, s4, 0x7fffffff +; GFX8-NEXT: s_min_i32 s5, s0, -1 +; GFX8-NEXT: s_sub_i32 s5, s5, 0x80000000 +; GFX8-NEXT: s_max_i32 s2, s4, s2 +; GFX8-NEXT: s_min_i32 s2, s2, s5 ; GFX8-NEXT: s_sub_i32 s0, s0, s2 ; GFX8-NEXT: s_max_i32 s2, s1, -1 -; GFX8-NEXT: s_sub_i32 s2, s2, s4 +; GFX8-NEXT: s_sub_i32 s2, s2, 0x7fffffff ; GFX8-NEXT: s_min_i32 s4, s1, -1 -; GFX8-NEXT: s_sub_i32 s4, s4, s5 +; GFX8-NEXT: s_sub_i32 s4, s4, 0x80000000 ; GFX8-NEXT: s_max_i32 s2, s2, s3 ; GFX8-NEXT: s_min_i32 s2, s2, s4 ; GFX8-NEXT: s_sub_i32 s1, s1, s2 @@ -1394,26 +1370,24 @@ define amdgpu_ps <3 x i32> @s_ssubsat_v3i32(<3 x i32> inreg %lhs, <3 x i32> inreg %rhs) { ; GFX6-LABEL: s_ssubsat_v3i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_brev_b32 s6, -2 -; GFX6-NEXT: s_max_i32 s8, s0, -1 -; GFX6-NEXT: s_brev_b32 s7, 1 -; GFX6-NEXT: s_sub_i32 s8, s8, s6 -; GFX6-NEXT: s_min_i32 s9, s0, -1 -; GFX6-NEXT: s_sub_i32 s9, s9, s7 -; GFX6-NEXT: s_max_i32 s3, s8, s3 -; GFX6-NEXT: s_min_i32 s3, s3, s9 +; GFX6-NEXT: s_max_i32 s6, s0, -1 +; GFX6-NEXT: s_sub_i32 s6, s6, 0x7fffffff +; GFX6-NEXT: s_min_i32 s7, s0, -1 +; GFX6-NEXT: s_sub_i32 s7, s7, 0x80000000 +; GFX6-NEXT: s_max_i32 s3, s6, s3 +; GFX6-NEXT: s_min_i32 s3, s3, s7 ; GFX6-NEXT: s_sub_i32 s0, s0, s3 ; GFX6-NEXT: s_max_i32 s3, s1, -1 -; GFX6-NEXT: s_sub_i32 s3, s3, s6 -; GFX6-NEXT: s_min_i32 s8, s1, -1 -; GFX6-NEXT: s_sub_i32 s8, s8, s7 +; GFX6-NEXT: s_sub_i32 s3, s3, 0x7fffffff +; GFX6-NEXT: s_min_i32 s6, s1, -1 +; GFX6-NEXT: s_sub_i32 s6, s6, 0x80000000 ; GFX6-NEXT: s_max_i32 s3, s3, s4 -; GFX6-NEXT: s_min_i32 s3, s3, s8 +; GFX6-NEXT: s_min_i32 s3, s3, s6 ; GFX6-NEXT: s_sub_i32 s1, s1, s3 ; GFX6-NEXT: s_max_i32 s3, s2, -1 -; GFX6-NEXT: s_sub_i32 s3, s3, s6 +; GFX6-NEXT: s_sub_i32 s3, s3, 0x7fffffff ; GFX6-NEXT: s_min_i32 s4, s2, -1 -; GFX6-NEXT: s_sub_i32 s4, s4, s7 +; GFX6-NEXT: s_sub_i32 s4, s4, 0x80000000 ; GFX6-NEXT: s_max_i32 s3, s3, s5 ; GFX6-NEXT: s_min_i32 s3, s3, s4 ; GFX6-NEXT: s_sub_i32 s2, s2, s3 @@ -1421,26 +1395,24 @@ ; ; GFX8-LABEL: s_ssubsat_v3i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_brev_b32 s6, -2 -; GFX8-NEXT: s_max_i32 s8, s0, -1 -; GFX8-NEXT: s_brev_b32 s7, 1 -; GFX8-NEXT: s_sub_i32 s8, s8, s6 -; GFX8-NEXT: s_min_i32 s9, s0, -1 -; GFX8-NEXT: s_sub_i32 s9, s9, s7 -; GFX8-NEXT: s_max_i32 s3, s8, s3 -; GFX8-NEXT: s_min_i32 s3, s3, s9 +; GFX8-NEXT: s_max_i32 s6, s0, -1 +; GFX8-NEXT: s_sub_i32 s6, s6, 0x7fffffff +; GFX8-NEXT: s_min_i32 s7, s0, -1 +; GFX8-NEXT: s_sub_i32 s7, s7, 0x80000000 +; GFX8-NEXT: s_max_i32 s3, s6, s3 +; GFX8-NEXT: s_min_i32 s3, s3, s7 ; GFX8-NEXT: s_sub_i32 s0, s0, s3 ; GFX8-NEXT: s_max_i32 s3, s1, -1 -; GFX8-NEXT: s_sub_i32 s3, s3, s6 -; GFX8-NEXT: s_min_i32 s8, s1, -1 -; GFX8-NEXT: s_sub_i32 s8, s8, s7 +; GFX8-NEXT: s_sub_i32 s3, s3, 0x7fffffff +; GFX8-NEXT: s_min_i32 s6, s1, -1 +; GFX8-NEXT: s_sub_i32 s6, s6, 0x80000000 ; GFX8-NEXT: s_max_i32 s3, s3, s4 -; GFX8-NEXT: s_min_i32 s3, s3, s8 +; GFX8-NEXT: s_min_i32 s3, s3, s6 ; GFX8-NEXT: s_sub_i32 s1, s1, s3 ; GFX8-NEXT: s_max_i32 s3, s2, -1 -; GFX8-NEXT: s_sub_i32 s3, s3, s6 +; GFX8-NEXT: s_sub_i32 s3, s3, 0x7fffffff ; GFX8-NEXT: s_min_i32 s4, s2, -1 -; GFX8-NEXT: s_sub_i32 s4, s4, s7 +; GFX8-NEXT: s_sub_i32 s4, s4, 0x80000000 ; GFX8-NEXT: s_max_i32 s3, s3, s5 ; GFX8-NEXT: s_min_i32 s3, s3, s4 ; GFX8-NEXT: s_sub_i32 s2, s2, s3 @@ -1568,33 +1540,31 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v4i32(<4 x i32> inreg %lhs, <4 x i32> inreg %rhs) { ; GFX6-LABEL: s_ssubsat_v4i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_brev_b32 s8, -2 -; GFX6-NEXT: s_max_i32 s10, s0, -1 -; GFX6-NEXT: s_brev_b32 s9, 1 -; GFX6-NEXT: s_sub_i32 s10, s10, s8 -; GFX6-NEXT: s_min_i32 s11, s0, -1 -; GFX6-NEXT: s_sub_i32 s11, s11, s9 -; GFX6-NEXT: s_max_i32 s4, s10, s4 -; GFX6-NEXT: s_min_i32 s4, s4, s11 +; GFX6-NEXT: s_max_i32 s8, s0, -1 +; GFX6-NEXT: s_sub_i32 s8, s8, 0x7fffffff +; GFX6-NEXT: s_min_i32 s9, s0, -1 +; GFX6-NEXT: s_sub_i32 s9, s9, 0x80000000 +; GFX6-NEXT: s_max_i32 s4, s8, s4 +; GFX6-NEXT: s_min_i32 s4, s4, s9 ; GFX6-NEXT: s_sub_i32 s0, s0, s4 ; GFX6-NEXT: s_max_i32 s4, s1, -1 -; GFX6-NEXT: s_sub_i32 s4, s4, s8 -; GFX6-NEXT: s_min_i32 s10, s1, -1 -; GFX6-NEXT: s_sub_i32 s10, s10, s9 +; GFX6-NEXT: s_sub_i32 s4, s4, 0x7fffffff +; GFX6-NEXT: s_min_i32 s8, s1, -1 +; GFX6-NEXT: s_sub_i32 s8, s8, 0x80000000 ; GFX6-NEXT: s_max_i32 s4, s4, s5 -; GFX6-NEXT: s_min_i32 s4, s4, s10 +; GFX6-NEXT: s_min_i32 s4, s4, s8 ; GFX6-NEXT: s_sub_i32 s1, s1, s4 ; GFX6-NEXT: s_max_i32 s4, s2, -1 -; GFX6-NEXT: s_sub_i32 s4, s4, s8 +; GFX6-NEXT: s_sub_i32 s4, s4, 0x7fffffff ; GFX6-NEXT: s_min_i32 s5, s2, -1 -; GFX6-NEXT: s_sub_i32 s5, s5, s9 +; GFX6-NEXT: s_sub_i32 s5, s5, 0x80000000 ; GFX6-NEXT: s_max_i32 s4, s4, s6 ; GFX6-NEXT: s_min_i32 s4, s4, s5 ; GFX6-NEXT: s_sub_i32 s2, s2, s4 ; GFX6-NEXT: s_max_i32 s4, s3, -1 -; GFX6-NEXT: s_sub_i32 s4, s4, s8 +; GFX6-NEXT: s_sub_i32 s4, s4, 0x7fffffff ; GFX6-NEXT: s_min_i32 s5, s3, -1 -; GFX6-NEXT: s_sub_i32 s5, s5, s9 +; GFX6-NEXT: s_sub_i32 s5, s5, 0x80000000 ; GFX6-NEXT: s_max_i32 s4, s4, s7 ; GFX6-NEXT: s_min_i32 s4, s4, s5 ; GFX6-NEXT: s_sub_i32 s3, s3, s4 @@ -1602,33 +1572,31 @@ ; ; GFX8-LABEL: s_ssubsat_v4i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_brev_b32 s8, -2 -; GFX8-NEXT: s_max_i32 s10, s0, -1 -; GFX8-NEXT: s_brev_b32 s9, 1 -; GFX8-NEXT: s_sub_i32 s10, s10, s8 -; GFX8-NEXT: s_min_i32 s11, s0, -1 -; GFX8-NEXT: s_sub_i32 s11, s11, s9 -; GFX8-NEXT: s_max_i32 s4, s10, s4 -; GFX8-NEXT: s_min_i32 s4, s4, s11 +; GFX8-NEXT: s_max_i32 s8, s0, -1 +; GFX8-NEXT: s_sub_i32 s8, s8, 0x7fffffff +; GFX8-NEXT: s_min_i32 s9, s0, -1 +; GFX8-NEXT: s_sub_i32 s9, s9, 0x80000000 +; GFX8-NEXT: s_max_i32 s4, s8, s4 +; GFX8-NEXT: s_min_i32 s4, s4, s9 ; GFX8-NEXT: s_sub_i32 s0, s0, s4 ; GFX8-NEXT: s_max_i32 s4, s1, -1 -; GFX8-NEXT: s_sub_i32 s4, s4, s8 -; GFX8-NEXT: s_min_i32 s10, s1, -1 -; GFX8-NEXT: s_sub_i32 s10, s10, s9 +; GFX8-NEXT: s_sub_i32 s4, s4, 0x7fffffff +; GFX8-NEXT: s_min_i32 s8, s1, -1 +; GFX8-NEXT: s_sub_i32 s8, s8, 0x80000000 ; GFX8-NEXT: s_max_i32 s4, s4, s5 -; GFX8-NEXT: s_min_i32 s4, s4, s10 +; GFX8-NEXT: s_min_i32 s4, s4, s8 ; GFX8-NEXT: s_sub_i32 s1, s1, s4 ; GFX8-NEXT: s_max_i32 s4, s2, -1 -; GFX8-NEXT: s_sub_i32 s4, s4, s8 +; GFX8-NEXT: s_sub_i32 s4, s4, 0x7fffffff ; GFX8-NEXT: s_min_i32 s5, s2, -1 -; GFX8-NEXT: s_sub_i32 s5, s5, s9 +; GFX8-NEXT: s_sub_i32 s5, s5, 0x80000000 ; GFX8-NEXT: s_max_i32 s4, s4, s6 ; GFX8-NEXT: s_min_i32 s4, s4, s5 ; GFX8-NEXT: s_sub_i32 s2, s2, s4 ; GFX8-NEXT: s_max_i32 s4, s3, -1 -; GFX8-NEXT: s_sub_i32 s4, s4, s8 +; GFX8-NEXT: s_sub_i32 s4, s4, 0x7fffffff ; GFX8-NEXT: s_min_i32 s5, s3, -1 -; GFX8-NEXT: s_sub_i32 s5, s5, s9 +; GFX8-NEXT: s_sub_i32 s5, s5, 0x80000000 ; GFX8-NEXT: s_max_i32 s4, s4, s7 ; GFX8-NEXT: s_min_i32 s4, s4, s5 ; GFX8-NEXT: s_sub_i32 s3, s3, s4 @@ -1694,17 +1662,16 @@ ; GFX6-NEXT: v_bfrev_b32_e32 v11, -2 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v5 ; GFX6-NEXT: v_max_i32_e32 v5, -1, v3 -; GFX6-NEXT: v_bfrev_b32_e32 v13, 1 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v11 ; GFX6-NEXT: v_min_i32_e32 v6, -1, v3 -; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v13 +; GFX6-NEXT: v_subrev_i32_e32 v6, vcc, 0x80000000, v6 ; GFX6-NEXT: v_max_i32_e32 v5, v5, v8 ; GFX6-NEXT: v_min_i32_e32 v5, v5, v6 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v5 ; GFX6-NEXT: v_max_i32_e32 v5, -1, v4 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v11 +; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, 0x7fffffff, v5 ; GFX6-NEXT: v_min_i32_e32 v6, -1, v4 -; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v13 +; GFX6-NEXT: v_subrev_i32_e32 v6, vcc, 0x80000000, v6 ; GFX6-NEXT: v_max_i32_e32 v5, v5, v9 ; GFX6-NEXT: v_min_i32_e32 v5, v5, v6 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v5 @@ -1738,17 +1705,16 @@ ; GFX8-NEXT: v_bfrev_b32_e32 v11, -2 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v5 ; GFX8-NEXT: v_max_i32_e32 v5, -1, v3 -; GFX8-NEXT: v_bfrev_b32_e32 v13, 1 ; GFX8-NEXT: v_sub_u32_e32 v5, vcc, v5, v11 ; GFX8-NEXT: v_min_i32_e32 v6, -1, v3 -; GFX8-NEXT: v_sub_u32_e32 v6, vcc, v6, v13 +; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, 0x80000000, v6 ; GFX8-NEXT: v_max_i32_e32 v5, v5, v8 ; GFX8-NEXT: v_min_i32_e32 v5, v5, v6 ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, v3, v5 ; GFX8-NEXT: v_max_i32_e32 v5, -1, v4 -; GFX8-NEXT: v_sub_u32_e32 v5, vcc, v5, v11 +; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, 0x7fffffff, v5 ; GFX8-NEXT: v_min_i32_e32 v6, -1, v4 -; GFX8-NEXT: v_sub_u32_e32 v6, vcc, v6, v13 +; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, 0x80000000, v6 ; GFX8-NEXT: v_max_i32_e32 v5, v5, v9 ; GFX8-NEXT: v_min_i32_e32 v5, v5, v6 ; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v4, v5 @@ -1781,40 +1747,38 @@ define amdgpu_ps <5 x i32> @s_ssubsat_v5i32(<5 x i32> inreg %lhs, <5 x i32> inreg %rhs) { ; GFX6-LABEL: s_ssubsat_v5i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_brev_b32 s10, -2 -; GFX6-NEXT: s_max_i32 s12, s0, -1 -; GFX6-NEXT: s_brev_b32 s11, 1 -; GFX6-NEXT: s_sub_i32 s12, s12, s10 -; GFX6-NEXT: s_min_i32 s13, s0, -1 -; GFX6-NEXT: s_sub_i32 s13, s13, s11 -; GFX6-NEXT: s_max_i32 s5, s12, s5 -; GFX6-NEXT: s_min_i32 s5, s5, s13 +; GFX6-NEXT: s_max_i32 s10, s0, -1 +; GFX6-NEXT: s_sub_i32 s10, s10, 0x7fffffff +; GFX6-NEXT: s_min_i32 s11, s0, -1 +; GFX6-NEXT: s_sub_i32 s11, s11, 0x80000000 +; GFX6-NEXT: s_max_i32 s5, s10, s5 +; GFX6-NEXT: s_min_i32 s5, s5, s11 ; GFX6-NEXT: s_sub_i32 s0, s0, s5 ; GFX6-NEXT: s_max_i32 s5, s1, -1 -; GFX6-NEXT: s_sub_i32 s5, s5, s10 -; GFX6-NEXT: s_min_i32 s12, s1, -1 -; GFX6-NEXT: s_sub_i32 s12, s12, s11 +; GFX6-NEXT: s_sub_i32 s5, s5, 0x7fffffff +; GFX6-NEXT: s_min_i32 s10, s1, -1 +; GFX6-NEXT: s_sub_i32 s10, s10, 0x80000000 ; GFX6-NEXT: s_max_i32 s5, s5, s6 -; GFX6-NEXT: s_min_i32 s5, s5, s12 +; GFX6-NEXT: s_min_i32 s5, s5, s10 ; GFX6-NEXT: s_sub_i32 s1, s1, s5 ; GFX6-NEXT: s_max_i32 s5, s2, -1 -; GFX6-NEXT: s_sub_i32 s5, s5, s10 +; GFX6-NEXT: s_sub_i32 s5, s5, 0x7fffffff ; GFX6-NEXT: s_min_i32 s6, s2, -1 -; GFX6-NEXT: s_sub_i32 s6, s6, s11 +; GFX6-NEXT: s_sub_i32 s6, s6, 0x80000000 ; GFX6-NEXT: s_max_i32 s5, s5, s7 ; GFX6-NEXT: s_min_i32 s5, s5, s6 ; GFX6-NEXT: s_sub_i32 s2, s2, s5 ; GFX6-NEXT: s_max_i32 s5, s3, -1 -; GFX6-NEXT: s_sub_i32 s5, s5, s10 +; GFX6-NEXT: s_sub_i32 s5, s5, 0x7fffffff ; GFX6-NEXT: s_min_i32 s6, s3, -1 -; GFX6-NEXT: s_sub_i32 s6, s6, s11 +; GFX6-NEXT: s_sub_i32 s6, s6, 0x80000000 ; GFX6-NEXT: s_max_i32 s5, s5, s8 ; GFX6-NEXT: s_min_i32 s5, s5, s6 ; GFX6-NEXT: s_sub_i32 s3, s3, s5 ; GFX6-NEXT: s_max_i32 s5, s4, -1 -; GFX6-NEXT: s_sub_i32 s5, s5, s10 +; GFX6-NEXT: s_sub_i32 s5, s5, 0x7fffffff ; GFX6-NEXT: s_min_i32 s6, s4, -1 -; GFX6-NEXT: s_sub_i32 s6, s6, s11 +; GFX6-NEXT: s_sub_i32 s6, s6, 0x80000000 ; GFX6-NEXT: s_max_i32 s5, s5, s9 ; GFX6-NEXT: s_min_i32 s5, s5, s6 ; GFX6-NEXT: s_sub_i32 s4, s4, s5 @@ -1822,40 +1786,38 @@ ; ; GFX8-LABEL: s_ssubsat_v5i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_brev_b32 s10, -2 -; GFX8-NEXT: s_max_i32 s12, s0, -1 -; GFX8-NEXT: s_brev_b32 s11, 1 -; GFX8-NEXT: s_sub_i32 s12, s12, s10 -; GFX8-NEXT: s_min_i32 s13, s0, -1 -; GFX8-NEXT: s_sub_i32 s13, s13, s11 -; GFX8-NEXT: s_max_i32 s5, s12, s5 -; GFX8-NEXT: s_min_i32 s5, s5, s13 +; GFX8-NEXT: s_max_i32 s10, s0, -1 +; GFX8-NEXT: s_sub_i32 s10, s10, 0x7fffffff +; GFX8-NEXT: s_min_i32 s11, s0, -1 +; GFX8-NEXT: s_sub_i32 s11, s11, 0x80000000 +; GFX8-NEXT: s_max_i32 s5, s10, s5 +; GFX8-NEXT: s_min_i32 s5, s5, s11 ; GFX8-NEXT: s_sub_i32 s0, s0, s5 ; GFX8-NEXT: s_max_i32 s5, s1, -1 -; GFX8-NEXT: s_sub_i32 s5, s5, s10 -; GFX8-NEXT: s_min_i32 s12, s1, -1 -; GFX8-NEXT: s_sub_i32 s12, s12, s11 +; GFX8-NEXT: s_sub_i32 s5, s5, 0x7fffffff +; GFX8-NEXT: s_min_i32 s10, s1, -1 +; GFX8-NEXT: s_sub_i32 s10, s10, 0x80000000 ; GFX8-NEXT: s_max_i32 s5, s5, s6 -; GFX8-NEXT: s_min_i32 s5, s5, s12 +; GFX8-NEXT: s_min_i32 s5, s5, s10 ; GFX8-NEXT: s_sub_i32 s1, s1, s5 ; GFX8-NEXT: s_max_i32 s5, s2, -1 -; GFX8-NEXT: s_sub_i32 s5, s5, s10 +; GFX8-NEXT: s_sub_i32 s5, s5, 0x7fffffff ; GFX8-NEXT: s_min_i32 s6, s2, -1 -; GFX8-NEXT: s_sub_i32 s6, s6, s11 +; GFX8-NEXT: s_sub_i32 s6, s6, 0x80000000 ; GFX8-NEXT: s_max_i32 s5, s5, s7 ; GFX8-NEXT: s_min_i32 s5, s5, s6 ; GFX8-NEXT: s_sub_i32 s2, s2, s5 ; GFX8-NEXT: s_max_i32 s5, s3, -1 -; GFX8-NEXT: s_sub_i32 s5, s5, s10 +; GFX8-NEXT: s_sub_i32 s5, s5, 0x7fffffff ; GFX8-NEXT: s_min_i32 s6, s3, -1 -; GFX8-NEXT: s_sub_i32 s6, s6, s11 +; GFX8-NEXT: s_sub_i32 s6, s6, 0x80000000 ; GFX8-NEXT: s_max_i32 s5, s5, s8 ; GFX8-NEXT: s_min_i32 s5, s5, s6 ; GFX8-NEXT: s_sub_i32 s3, s3, s5 ; GFX8-NEXT: s_max_i32 s5, s4, -1 -; GFX8-NEXT: s_sub_i32 s5, s5, s10 +; GFX8-NEXT: s_sub_i32 s5, s5, 0x7fffffff ; GFX8-NEXT: s_min_i32 s6, s4, -1 -; GFX8-NEXT: s_sub_i32 s6, s6, s11 +; GFX8-NEXT: s_sub_i32 s6, s6, 0x80000000 ; GFX8-NEXT: s_max_i32 s5, s5, s9 ; GFX8-NEXT: s_min_i32 s5, s5, s6 ; GFX8-NEXT: s_sub_i32 s4, s4, s5 @@ -2197,117 +2159,115 @@ define amdgpu_ps <16 x i32> @s_ssubsat_v16i32(<16 x i32> inreg %lhs, <16 x i32> inreg %rhs) { ; GFX6-LABEL: s_ssubsat_v16i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_brev_b32 s32, -2 -; GFX6-NEXT: s_max_i32 s34, s0, -1 -; GFX6-NEXT: s_brev_b32 s33, 1 -; GFX6-NEXT: s_sub_i32 s34, s34, s32 -; GFX6-NEXT: s_min_i32 s35, s0, -1 -; GFX6-NEXT: s_sub_i32 s35, s35, s33 -; GFX6-NEXT: s_max_i32 s16, s34, s16 -; GFX6-NEXT: s_min_i32 s16, s16, s35 +; GFX6-NEXT: s_max_i32 s32, s0, -1 +; GFX6-NEXT: s_sub_i32 s32, s32, 0x7fffffff +; GFX6-NEXT: s_min_i32 s33, s0, -1 +; GFX6-NEXT: s_sub_i32 s33, s33, 0x80000000 +; GFX6-NEXT: s_max_i32 s16, s32, s16 +; GFX6-NEXT: s_min_i32 s16, s16, s33 ; GFX6-NEXT: s_sub_i32 s0, s0, s16 ; GFX6-NEXT: s_max_i32 s16, s1, -1 -; GFX6-NEXT: s_sub_i32 s16, s16, s32 -; GFX6-NEXT: s_min_i32 s34, s1, -1 -; GFX6-NEXT: s_sub_i32 s34, s34, s33 +; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff +; GFX6-NEXT: s_min_i32 s32, s1, -1 +; GFX6-NEXT: s_sub_i32 s32, s32, 0x80000000 ; GFX6-NEXT: s_max_i32 s16, s16, s17 -; GFX6-NEXT: s_min_i32 s16, s16, s34 +; GFX6-NEXT: s_min_i32 s16, s16, s32 ; GFX6-NEXT: s_sub_i32 s1, s1, s16 ; GFX6-NEXT: s_max_i32 s16, s2, -1 -; GFX6-NEXT: s_sub_i32 s16, s16, s32 +; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff ; GFX6-NEXT: s_min_i32 s17, s2, -1 -; GFX6-NEXT: s_sub_i32 s17, s17, s33 +; GFX6-NEXT: s_sub_i32 s17, s17, 0x80000000 ; GFX6-NEXT: s_max_i32 s16, s16, s18 ; GFX6-NEXT: s_min_i32 s16, s16, s17 ; GFX6-NEXT: s_sub_i32 s2, s2, s16 ; GFX6-NEXT: s_max_i32 s16, s3, -1 -; GFX6-NEXT: s_sub_i32 s16, s16, s32 +; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff ; GFX6-NEXT: s_min_i32 s17, s3, -1 -; GFX6-NEXT: s_sub_i32 s17, s17, s33 +; GFX6-NEXT: s_sub_i32 s17, s17, 0x80000000 ; GFX6-NEXT: s_max_i32 s16, s16, s19 ; GFX6-NEXT: s_min_i32 s16, s16, s17 ; GFX6-NEXT: s_sub_i32 s3, s3, s16 ; GFX6-NEXT: s_max_i32 s16, s4, -1 -; GFX6-NEXT: s_sub_i32 s16, s16, s32 +; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff ; GFX6-NEXT: s_min_i32 s17, s4, -1 -; GFX6-NEXT: s_sub_i32 s17, s17, s33 +; GFX6-NEXT: s_sub_i32 s17, s17, 0x80000000 ; GFX6-NEXT: s_max_i32 s16, s16, s20 ; GFX6-NEXT: s_min_i32 s16, s16, s17 ; GFX6-NEXT: s_sub_i32 s4, s4, s16 ; GFX6-NEXT: s_max_i32 s16, s5, -1 -; GFX6-NEXT: s_sub_i32 s16, s16, s32 +; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff ; GFX6-NEXT: s_min_i32 s17, s5, -1 -; GFX6-NEXT: s_sub_i32 s17, s17, s33 +; GFX6-NEXT: s_sub_i32 s17, s17, 0x80000000 ; GFX6-NEXT: s_max_i32 s16, s16, s21 ; GFX6-NEXT: s_min_i32 s16, s16, s17 ; GFX6-NEXT: s_sub_i32 s5, s5, s16 ; GFX6-NEXT: s_max_i32 s16, s6, -1 -; GFX6-NEXT: s_sub_i32 s16, s16, s32 +; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff ; GFX6-NEXT: s_min_i32 s17, s6, -1 -; GFX6-NEXT: s_sub_i32 s17, s17, s33 +; GFX6-NEXT: s_sub_i32 s17, s17, 0x80000000 ; GFX6-NEXT: s_max_i32 s16, s16, s22 ; GFX6-NEXT: s_min_i32 s16, s16, s17 ; GFX6-NEXT: s_sub_i32 s6, s6, s16 ; GFX6-NEXT: s_max_i32 s16, s7, -1 -; GFX6-NEXT: s_sub_i32 s16, s16, s32 +; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff ; GFX6-NEXT: s_min_i32 s17, s7, -1 -; GFX6-NEXT: s_sub_i32 s17, s17, s33 +; GFX6-NEXT: s_sub_i32 s17, s17, 0x80000000 ; GFX6-NEXT: s_max_i32 s16, s16, s23 ; GFX6-NEXT: s_min_i32 s16, s16, s17 ; GFX6-NEXT: s_sub_i32 s7, s7, s16 ; GFX6-NEXT: s_max_i32 s16, s8, -1 -; GFX6-NEXT: s_sub_i32 s16, s16, s32 +; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff ; GFX6-NEXT: s_min_i32 s17, s8, -1 -; GFX6-NEXT: s_sub_i32 s17, s17, s33 +; GFX6-NEXT: s_sub_i32 s17, s17, 0x80000000 ; GFX6-NEXT: s_max_i32 s16, s16, s24 ; GFX6-NEXT: s_min_i32 s16, s16, s17 ; GFX6-NEXT: s_sub_i32 s8, s8, s16 ; GFX6-NEXT: s_max_i32 s16, s9, -1 -; GFX6-NEXT: s_sub_i32 s16, s16, s32 +; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff ; GFX6-NEXT: s_min_i32 s17, s9, -1 -; GFX6-NEXT: s_sub_i32 s17, s17, s33 +; GFX6-NEXT: s_sub_i32 s17, s17, 0x80000000 ; GFX6-NEXT: s_max_i32 s16, s16, s25 ; GFX6-NEXT: s_min_i32 s16, s16, s17 ; GFX6-NEXT: s_sub_i32 s9, s9, s16 ; GFX6-NEXT: s_max_i32 s16, s10, -1 -; GFX6-NEXT: s_sub_i32 s16, s16, s32 +; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff ; GFX6-NEXT: s_min_i32 s17, s10, -1 -; GFX6-NEXT: s_sub_i32 s17, s17, s33 +; GFX6-NEXT: s_sub_i32 s17, s17, 0x80000000 ; GFX6-NEXT: s_max_i32 s16, s16, s26 ; GFX6-NEXT: s_min_i32 s16, s16, s17 ; GFX6-NEXT: s_sub_i32 s10, s10, s16 ; GFX6-NEXT: s_max_i32 s16, s11, -1 -; GFX6-NEXT: s_sub_i32 s16, s16, s32 +; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff ; GFX6-NEXT: s_min_i32 s17, s11, -1 -; GFX6-NEXT: s_sub_i32 s17, s17, s33 +; GFX6-NEXT: s_sub_i32 s17, s17, 0x80000000 ; GFX6-NEXT: s_max_i32 s16, s16, s27 ; GFX6-NEXT: s_min_i32 s16, s16, s17 ; GFX6-NEXT: s_sub_i32 s11, s11, s16 ; GFX6-NEXT: s_max_i32 s16, s12, -1 -; GFX6-NEXT: s_sub_i32 s16, s16, s32 +; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff ; GFX6-NEXT: s_min_i32 s17, s12, -1 -; GFX6-NEXT: s_sub_i32 s17, s17, s33 +; GFX6-NEXT: s_sub_i32 s17, s17, 0x80000000 ; GFX6-NEXT: s_max_i32 s16, s16, s28 ; GFX6-NEXT: s_min_i32 s16, s16, s17 ; GFX6-NEXT: s_sub_i32 s12, s12, s16 ; GFX6-NEXT: s_max_i32 s16, s13, -1 -; GFX6-NEXT: s_sub_i32 s16, s16, s32 +; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff ; GFX6-NEXT: s_min_i32 s17, s13, -1 -; GFX6-NEXT: s_sub_i32 s17, s17, s33 +; GFX6-NEXT: s_sub_i32 s17, s17, 0x80000000 ; GFX6-NEXT: s_max_i32 s16, s16, s29 ; GFX6-NEXT: s_min_i32 s16, s16, s17 ; GFX6-NEXT: s_sub_i32 s13, s13, s16 ; GFX6-NEXT: s_max_i32 s16, s14, -1 -; GFX6-NEXT: s_sub_i32 s16, s16, s32 +; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff ; GFX6-NEXT: s_min_i32 s17, s14, -1 -; GFX6-NEXT: s_sub_i32 s17, s17, s33 +; GFX6-NEXT: s_sub_i32 s17, s17, 0x80000000 ; GFX6-NEXT: s_max_i32 s16, s16, s30 ; GFX6-NEXT: s_min_i32 s16, s16, s17 ; GFX6-NEXT: s_sub_i32 s14, s14, s16 ; GFX6-NEXT: s_max_i32 s16, s15, -1 -; GFX6-NEXT: s_sub_i32 s16, s16, s32 +; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff ; GFX6-NEXT: s_min_i32 s17, s15, -1 -; GFX6-NEXT: s_sub_i32 s17, s17, s33 +; GFX6-NEXT: s_sub_i32 s17, s17, 0x80000000 ; GFX6-NEXT: s_max_i32 s16, s16, s31 ; GFX6-NEXT: s_min_i32 s16, s16, s17 ; GFX6-NEXT: s_sub_i32 s15, s15, s16 @@ -2315,117 +2275,115 @@ ; ; GFX8-LABEL: s_ssubsat_v16i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_brev_b32 s32, -2 -; GFX8-NEXT: s_max_i32 s34, s0, -1 -; GFX8-NEXT: s_brev_b32 s33, 1 -; GFX8-NEXT: s_sub_i32 s34, s34, s32 -; GFX8-NEXT: s_min_i32 s35, s0, -1 -; GFX8-NEXT: s_sub_i32 s35, s35, s33 -; GFX8-NEXT: s_max_i32 s16, s34, s16 -; GFX8-NEXT: s_min_i32 s16, s16, s35 +; GFX8-NEXT: s_max_i32 s32, s0, -1 +; GFX8-NEXT: s_sub_i32 s32, s32, 0x7fffffff +; GFX8-NEXT: s_min_i32 s33, s0, -1 +; GFX8-NEXT: s_sub_i32 s33, s33, 0x80000000 +; GFX8-NEXT: s_max_i32 s16, s32, s16 +; GFX8-NEXT: s_min_i32 s16, s16, s33 ; GFX8-NEXT: s_sub_i32 s0, s0, s16 ; GFX8-NEXT: s_max_i32 s16, s1, -1 -; GFX8-NEXT: s_sub_i32 s16, s16, s32 -; GFX8-NEXT: s_min_i32 s34, s1, -1 -; GFX8-NEXT: s_sub_i32 s34, s34, s33 +; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fffffff +; GFX8-NEXT: s_min_i32 s32, s1, -1 +; GFX8-NEXT: s_sub_i32 s32, s32, 0x80000000 ; GFX8-NEXT: s_max_i32 s16, s16, s17 -; GFX8-NEXT: s_min_i32 s16, s16, s34 +; GFX8-NEXT: s_min_i32 s16, s16, s32 ; GFX8-NEXT: s_sub_i32 s1, s1, s16 ; GFX8-NEXT: s_max_i32 s16, s2, -1 -; GFX8-NEXT: s_sub_i32 s16, s16, s32 +; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fffffff ; GFX8-NEXT: s_min_i32 s17, s2, -1 -; GFX8-NEXT: s_sub_i32 s17, s17, s33 +; GFX8-NEXT: s_sub_i32 s17, s17, 0x80000000 ; GFX8-NEXT: s_max_i32 s16, s16, s18 ; GFX8-NEXT: s_min_i32 s16, s16, s17 ; GFX8-NEXT: s_sub_i32 s2, s2, s16 ; GFX8-NEXT: s_max_i32 s16, s3, -1 -; GFX8-NEXT: s_sub_i32 s16, s16, s32 +; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fffffff ; GFX8-NEXT: s_min_i32 s17, s3, -1 -; GFX8-NEXT: s_sub_i32 s17, s17, s33 +; GFX8-NEXT: s_sub_i32 s17, s17, 0x80000000 ; GFX8-NEXT: s_max_i32 s16, s16, s19 ; GFX8-NEXT: s_min_i32 s16, s16, s17 ; GFX8-NEXT: s_sub_i32 s3, s3, s16 ; GFX8-NEXT: s_max_i32 s16, s4, -1 -; GFX8-NEXT: s_sub_i32 s16, s16, s32 +; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fffffff ; GFX8-NEXT: s_min_i32 s17, s4, -1 -; GFX8-NEXT: s_sub_i32 s17, s17, s33 +; GFX8-NEXT: s_sub_i32 s17, s17, 0x80000000 ; GFX8-NEXT: s_max_i32 s16, s16, s20 ; GFX8-NEXT: s_min_i32 s16, s16, s17 ; GFX8-NEXT: s_sub_i32 s4, s4, s16 ; GFX8-NEXT: s_max_i32 s16, s5, -1 -; GFX8-NEXT: s_sub_i32 s16, s16, s32 +; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fffffff ; GFX8-NEXT: s_min_i32 s17, s5, -1 -; GFX8-NEXT: s_sub_i32 s17, s17, s33 +; GFX8-NEXT: s_sub_i32 s17, s17, 0x80000000 ; GFX8-NEXT: s_max_i32 s16, s16, s21 ; GFX8-NEXT: s_min_i32 s16, s16, s17 ; GFX8-NEXT: s_sub_i32 s5, s5, s16 ; GFX8-NEXT: s_max_i32 s16, s6, -1 -; GFX8-NEXT: s_sub_i32 s16, s16, s32 +; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fffffff ; GFX8-NEXT: s_min_i32 s17, s6, -1 -; GFX8-NEXT: s_sub_i32 s17, s17, s33 +; GFX8-NEXT: s_sub_i32 s17, s17, 0x80000000 ; GFX8-NEXT: s_max_i32 s16, s16, s22 ; GFX8-NEXT: s_min_i32 s16, s16, s17 ; GFX8-NEXT: s_sub_i32 s6, s6, s16 ; GFX8-NEXT: s_max_i32 s16, s7, -1 -; GFX8-NEXT: s_sub_i32 s16, s16, s32 +; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fffffff ; GFX8-NEXT: s_min_i32 s17, s7, -1 -; GFX8-NEXT: s_sub_i32 s17, s17, s33 +; GFX8-NEXT: s_sub_i32 s17, s17, 0x80000000 ; GFX8-NEXT: s_max_i32 s16, s16, s23 ; GFX8-NEXT: s_min_i32 s16, s16, s17 ; GFX8-NEXT: s_sub_i32 s7, s7, s16 ; GFX8-NEXT: s_max_i32 s16, s8, -1 -; GFX8-NEXT: s_sub_i32 s16, s16, s32 +; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fffffff ; GFX8-NEXT: s_min_i32 s17, s8, -1 -; GFX8-NEXT: s_sub_i32 s17, s17, s33 +; GFX8-NEXT: s_sub_i32 s17, s17, 0x80000000 ; GFX8-NEXT: s_max_i32 s16, s16, s24 ; GFX8-NEXT: s_min_i32 s16, s16, s17 ; GFX8-NEXT: s_sub_i32 s8, s8, s16 ; GFX8-NEXT: s_max_i32 s16, s9, -1 -; GFX8-NEXT: s_sub_i32 s16, s16, s32 +; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fffffff ; GFX8-NEXT: s_min_i32 s17, s9, -1 -; GFX8-NEXT: s_sub_i32 s17, s17, s33 +; GFX8-NEXT: s_sub_i32 s17, s17, 0x80000000 ; GFX8-NEXT: s_max_i32 s16, s16, s25 ; GFX8-NEXT: s_min_i32 s16, s16, s17 ; GFX8-NEXT: s_sub_i32 s9, s9, s16 ; GFX8-NEXT: s_max_i32 s16, s10, -1 -; GFX8-NEXT: s_sub_i32 s16, s16, s32 +; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fffffff ; GFX8-NEXT: s_min_i32 s17, s10, -1 -; GFX8-NEXT: s_sub_i32 s17, s17, s33 +; GFX8-NEXT: s_sub_i32 s17, s17, 0x80000000 ; GFX8-NEXT: s_max_i32 s16, s16, s26 ; GFX8-NEXT: s_min_i32 s16, s16, s17 ; GFX8-NEXT: s_sub_i32 s10, s10, s16 ; GFX8-NEXT: s_max_i32 s16, s11, -1 -; GFX8-NEXT: s_sub_i32 s16, s16, s32 +; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fffffff ; GFX8-NEXT: s_min_i32 s17, s11, -1 -; GFX8-NEXT: s_sub_i32 s17, s17, s33 +; GFX8-NEXT: s_sub_i32 s17, s17, 0x80000000 ; GFX8-NEXT: s_max_i32 s16, s16, s27 ; GFX8-NEXT: s_min_i32 s16, s16, s17 ; GFX8-NEXT: s_sub_i32 s11, s11, s16 ; GFX8-NEXT: s_max_i32 s16, s12, -1 -; GFX8-NEXT: s_sub_i32 s16, s16, s32 +; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fffffff ; GFX8-NEXT: s_min_i32 s17, s12, -1 -; GFX8-NEXT: s_sub_i32 s17, s17, s33 +; GFX8-NEXT: s_sub_i32 s17, s17, 0x80000000 ; GFX8-NEXT: s_max_i32 s16, s16, s28 ; GFX8-NEXT: s_min_i32 s16, s16, s17 ; GFX8-NEXT: s_sub_i32 s12, s12, s16 ; GFX8-NEXT: s_max_i32 s16, s13, -1 -; GFX8-NEXT: s_sub_i32 s16, s16, s32 +; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fffffff ; GFX8-NEXT: s_min_i32 s17, s13, -1 -; GFX8-NEXT: s_sub_i32 s17, s17, s33 +; GFX8-NEXT: s_sub_i32 s17, s17, 0x80000000 ; GFX8-NEXT: s_max_i32 s16, s16, s29 ; GFX8-NEXT: s_min_i32 s16, s16, s17 ; GFX8-NEXT: s_sub_i32 s13, s13, s16 ; GFX8-NEXT: s_max_i32 s16, s14, -1 -; GFX8-NEXT: s_sub_i32 s16, s16, s32 +; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fffffff ; GFX8-NEXT: s_min_i32 s17, s14, -1 -; GFX8-NEXT: s_sub_i32 s17, s17, s33 +; GFX8-NEXT: s_sub_i32 s17, s17, 0x80000000 ; GFX8-NEXT: s_max_i32 s16, s16, s30 ; GFX8-NEXT: s_min_i32 s16, s16, s17 ; GFX8-NEXT: s_sub_i32 s14, s14, s16 ; GFX8-NEXT: s_max_i32 s16, s15, -1 -; GFX8-NEXT: s_sub_i32 s16, s16, s32 +; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fffffff ; GFX8-NEXT: s_min_i32 s17, s15, -1 -; GFX8-NEXT: s_sub_i32 s17, s17, s33 +; GFX8-NEXT: s_sub_i32 s17, s17, 0x80000000 ; GFX8-NEXT: s_max_i32 s16, s16, s31 ; GFX8-NEXT: s_min_i32 s16, s16, s17 ; GFX8-NEXT: s_sub_i32 s15, s15, s16 @@ -2767,60 +2725,55 @@ ; GFX6-LABEL: s_ssubsat_v2i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_lshl_b32 s0, s0, 16 -; GFX6-NEXT: s_brev_b32 s4, -2 -; GFX6-NEXT: s_max_i32 s6, s0, -1 +; GFX6-NEXT: s_max_i32 s4, s0, -1 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_brev_b32 s5, 1 -; GFX6-NEXT: s_sub_i32 s6, s6, s4 -; GFX6-NEXT: s_min_i32 s7, s0, -1 -; GFX6-NEXT: s_sub_i32 s7, s7, s5 -; GFX6-NEXT: s_max_i32 s2, s6, s2 -; GFX6-NEXT: s_min_i32 s2, s2, s7 +; GFX6-NEXT: s_sub_i32 s4, s4, 0x7fffffff +; GFX6-NEXT: s_min_i32 s5, s0, -1 +; GFX6-NEXT: s_sub_i32 s5, s5, 0x80000000 +; GFX6-NEXT: s_max_i32 s2, s4, s2 +; GFX6-NEXT: s_min_i32 s2, s2, s5 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_sub_i32 s0, s0, s2 ; GFX6-NEXT: s_lshl_b32 s2, s3, 16 ; GFX6-NEXT: s_max_i32 s3, s1, -1 -; GFX6-NEXT: s_sub_i32 s3, s3, s4 +; GFX6-NEXT: s_sub_i32 s3, s3, 0x7fffffff ; GFX6-NEXT: s_min_i32 s4, s1, -1 -; GFX6-NEXT: s_sub_i32 s4, s4, s5 +; GFX6-NEXT: s_sub_i32 s4, s4, 0x80000000 ; GFX6-NEXT: s_max_i32 s2, s3, s2 ; GFX6-NEXT: s_min_i32 s2, s2, s4 ; GFX6-NEXT: s_sub_i32 s1, s1, s2 ; GFX6-NEXT: s_ashr_i32 s1, s1, 16 -; GFX6-NEXT: s_mov_b32 s2, 0xffff ; GFX6-NEXT: s_ashr_i32 s0, s0, 16 -; GFX6-NEXT: s_and_b32 s1, s1, s2 -; GFX6-NEXT: s_and_b32 s0, s0, s2 +; GFX6-NEXT: s_and_b32 s1, s1, 0xffff +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_ssubsat_v2i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_sext_i32_i16 s6, s0 -; GFX8-NEXT: s_sext_i32_i16 s7, -1 -; GFX8-NEXT: s_movk_i32 s4, 0x7fff -; GFX8-NEXT: s_max_i32 s8, s6, s7 -; GFX8-NEXT: s_sub_i32 s8, s8, s4 +; GFX8-NEXT: s_sext_i32_i16 s4, s0 +; GFX8-NEXT: s_sext_i32_i16 s5, -1 +; GFX8-NEXT: s_max_i32 s6, s4, s5 +; GFX8-NEXT: s_sub_i32 s6, s6, 0x7fff ; GFX8-NEXT: s_lshr_b32 s3, s1, 16 -; GFX8-NEXT: s_movk_i32 s5, 0x8000 -; GFX8-NEXT: s_min_i32 s6, s6, s7 -; GFX8-NEXT: s_sext_i32_i16 s8, s8 +; GFX8-NEXT: s_min_i32 s4, s4, s5 +; GFX8-NEXT: s_sext_i32_i16 s6, s6 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_sub_i32 s6, s6, s5 -; GFX8-NEXT: s_max_i32 s1, s8, s1 +; GFX8-NEXT: s_sub_i32 s4, s4, 0xffff8000 +; GFX8-NEXT: s_max_i32 s1, s6, s1 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_sext_i32_i16 s6, s6 +; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 -; GFX8-NEXT: s_min_i32 s1, s1, s6 +; GFX8-NEXT: s_min_i32 s1, s1, s4 ; GFX8-NEXT: s_sub_i32 s0, s0, s1 ; GFX8-NEXT: s_sext_i32_i16 s1, s2 -; GFX8-NEXT: s_max_i32 s6, s1, s7 -; GFX8-NEXT: s_sub_i32 s4, s6, s4 -; GFX8-NEXT: s_min_i32 s1, s1, s7 +; GFX8-NEXT: s_max_i32 s4, s1, s5 +; GFX8-NEXT: s_sub_i32 s4, s4, 0x7fff +; GFX8-NEXT: s_min_i32 s1, s1, s5 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 -; GFX8-NEXT: s_sub_i32 s1, s1, s5 +; GFX8-NEXT: s_sub_i32 s1, s1, 0xffff8000 ; GFX8-NEXT: s_max_i32 s3, s4, s3 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 @@ -2853,22 +2806,20 @@ ; GFX6-LABEL: ssubsat_v2i16_sv: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_lshl_b32 s0, s0, 16 -; GFX6-NEXT: s_brev_b32 s2, -2 -; GFX6-NEXT: s_max_i32 s4, s0, -1 +; GFX6-NEXT: s_max_i32 s2, s0, -1 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: s_brev_b32 s3, 1 -; GFX6-NEXT: s_sub_i32 s4, s4, s2 -; GFX6-NEXT: s_min_i32 s5, s0, -1 -; GFX6-NEXT: s_sub_i32 s5, s5, s3 -; GFX6-NEXT: v_max_i32_e32 v0, s4, v0 -; GFX6-NEXT: v_min_i32_e32 v0, s5, v0 +; GFX6-NEXT: s_sub_i32 s2, s2, 0x7fffffff +; GFX6-NEXT: s_min_i32 s3, s0, -1 +; GFX6-NEXT: s_sub_i32 s3, s3, 0x80000000 +; GFX6-NEXT: v_max_i32_e32 v0, s2, v0 +; GFX6-NEXT: v_min_i32_e32 v0, s3, v0 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 ; GFX6-NEXT: s_lshl_b32 s0, s1, 16 ; GFX6-NEXT: s_max_i32 s1, s0, -1 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: s_sub_i32 s1, s1, s2 +; GFX6-NEXT: s_sub_i32 s1, s1, 0x7fffffff ; GFX6-NEXT: s_min_i32 s2, s0, -1 -; GFX6-NEXT: s_sub_i32 s2, s2, s3 +; GFX6-NEXT: s_sub_i32 s2, s2, 0x80000000 ; GFX6-NEXT: v_max_i32_e32 v1, s1, v1 ; GFX6-NEXT: v_min_i32_e32 v1, s2, v1 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s0, v1 @@ -2883,25 +2834,23 @@ ; ; GFX8-LABEL: ssubsat_v2i16_sv: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_sext_i32_i16 s4, s0 -; GFX8-NEXT: s_sext_i32_i16 s5, -1 -; GFX8-NEXT: s_movk_i32 s2, 0x7fff -; GFX8-NEXT: s_max_i32 s6, s4, s5 -; GFX8-NEXT: s_movk_i32 s3, 0x8000 -; GFX8-NEXT: s_sub_i32 s6, s6, s2 -; GFX8-NEXT: s_min_i32 s4, s4, s5 +; GFX8-NEXT: s_sext_i32_i16 s2, s0 +; GFX8-NEXT: s_sext_i32_i16 s3, -1 +; GFX8-NEXT: s_max_i32 s4, s2, s3 +; GFX8-NEXT: s_sub_i32 s4, s4, 0x7fff +; GFX8-NEXT: s_min_i32 s2, s2, s3 ; GFX8-NEXT: s_lshr_b32 s1, s0, 16 -; GFX8-NEXT: s_sub_i32 s4, s4, s3 -; GFX8-NEXT: v_max_i16_e32 v1, s6, v0 -; GFX8-NEXT: v_min_i16_e32 v1, s4, v1 -; GFX8-NEXT: s_sext_i32_i16 s4, s1 -; GFX8-NEXT: s_max_i32 s6, s4, s5 -; GFX8-NEXT: s_sub_i32 s2, s6, s2 -; GFX8-NEXT: s_min_i32 s4, s4, s5 -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: s_sub_i32 s3, s4, s3 +; GFX8-NEXT: s_sub_i32 s2, s2, 0xffff8000 +; GFX8-NEXT: v_max_i16_e32 v1, s4, v0 +; GFX8-NEXT: v_min_i16_e32 v1, s2, v1 +; GFX8-NEXT: s_sext_i32_i16 s2, s1 +; GFX8-NEXT: s_max_i32 s4, s2, s3 +; GFX8-NEXT: s_sub_i32 s4, s4, 0x7fff +; GFX8-NEXT: s_min_i32 s2, s2, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: s_sub_i32 s2, s2, 0xffff8000 ; GFX8-NEXT: v_max_i16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_min_i16_e32 v0, s3, v0 +; GFX8-NEXT: v_min_i16_e32 v0, s2, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, s1 ; GFX8-NEXT: v_sub_u16_e32 v1, s0, v1 ; GFX8-NEXT: v_sub_u16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -3123,97 +3072,92 @@ ; GFX6-LABEL: s_ssubsat_v4i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_lshl_b32 s0, s0, 16 -; GFX6-NEXT: s_brev_b32 s8, -2 -; GFX6-NEXT: s_max_i32 s10, s0, -1 +; GFX6-NEXT: s_max_i32 s8, s0, -1 ; GFX6-NEXT: s_lshl_b32 s4, s4, 16 -; GFX6-NEXT: s_brev_b32 s9, 1 -; GFX6-NEXT: s_sub_i32 s10, s10, s8 -; GFX6-NEXT: s_min_i32 s11, s0, -1 -; GFX6-NEXT: s_sub_i32 s11, s11, s9 -; GFX6-NEXT: s_max_i32 s4, s10, s4 -; GFX6-NEXT: s_min_i32 s4, s4, s11 +; GFX6-NEXT: s_sub_i32 s8, s8, 0x7fffffff +; GFX6-NEXT: s_min_i32 s9, s0, -1 +; GFX6-NEXT: s_sub_i32 s9, s9, 0x80000000 +; GFX6-NEXT: s_max_i32 s4, s8, s4 +; GFX6-NEXT: s_min_i32 s4, s4, s9 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_sub_i32 s0, s0, s4 ; GFX6-NEXT: s_lshl_b32 s4, s5, 16 ; GFX6-NEXT: s_max_i32 s5, s1, -1 -; GFX6-NEXT: s_sub_i32 s5, s5, s8 -; GFX6-NEXT: s_min_i32 s10, s1, -1 -; GFX6-NEXT: s_sub_i32 s10, s10, s9 +; GFX6-NEXT: s_sub_i32 s5, s5, 0x7fffffff +; GFX6-NEXT: s_min_i32 s8, s1, -1 +; GFX6-NEXT: s_sub_i32 s8, s8, 0x80000000 ; GFX6-NEXT: s_max_i32 s4, s5, s4 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_min_i32 s4, s4, s10 +; GFX6-NEXT: s_min_i32 s4, s4, s8 ; GFX6-NEXT: s_max_i32 s5, s2, -1 ; GFX6-NEXT: s_sub_i32 s1, s1, s4 ; GFX6-NEXT: s_lshl_b32 s4, s6, 16 -; GFX6-NEXT: s_sub_i32 s5, s5, s8 +; GFX6-NEXT: s_sub_i32 s5, s5, 0x7fffffff ; GFX6-NEXT: s_min_i32 s6, s2, -1 -; GFX6-NEXT: s_sub_i32 s6, s6, s9 +; GFX6-NEXT: s_sub_i32 s6, s6, 0x80000000 ; GFX6-NEXT: s_max_i32 s4, s5, s4 ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 ; GFX6-NEXT: s_min_i32 s4, s4, s6 ; GFX6-NEXT: s_max_i32 s5, s3, -1 ; GFX6-NEXT: s_sub_i32 s2, s2, s4 ; GFX6-NEXT: s_lshl_b32 s4, s7, 16 -; GFX6-NEXT: s_sub_i32 s5, s5, s8 +; GFX6-NEXT: s_sub_i32 s5, s5, 0x7fffffff ; GFX6-NEXT: s_min_i32 s6, s3, -1 -; GFX6-NEXT: s_sub_i32 s6, s6, s9 +; GFX6-NEXT: s_sub_i32 s6, s6, 0x80000000 ; GFX6-NEXT: s_max_i32 s4, s5, s4 -; GFX6-NEXT: s_min_i32 s4, s4, s6 ; GFX6-NEXT: s_ashr_i32 s1, s1, 16 -; GFX6-NEXT: s_sub_i32 s3, s3, s4 -; GFX6-NEXT: s_mov_b32 s4, 0xffff +; GFX6-NEXT: s_min_i32 s4, s4, s6 ; GFX6-NEXT: s_ashr_i32 s0, s0, 16 -; GFX6-NEXT: s_and_b32 s1, s1, s4 +; GFX6-NEXT: s_sub_i32 s3, s3, s4 +; GFX6-NEXT: s_and_b32 s1, s1, 0xffff ; GFX6-NEXT: s_ashr_i32 s2, s2, 16 ; GFX6-NEXT: s_ashr_i32 s3, s3, 16 -; GFX6-NEXT: s_and_b32 s0, s0, s4 +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s2, s4 -; GFX6-NEXT: s_and_b32 s2, s3, s4 +; GFX6-NEXT: s_and_b32 s1, s2, 0xffff +; GFX6-NEXT: s_and_b32 s2, s3, 0xffff ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 ; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_ssubsat_v4i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_sext_i32_i16 s10, s0 -; GFX8-NEXT: s_sext_i32_i16 s11, -1 -; GFX8-NEXT: s_movk_i32 s8, 0x7fff -; GFX8-NEXT: s_max_i32 s12, s10, s11 -; GFX8-NEXT: s_sub_i32 s12, s12, s8 +; GFX8-NEXT: s_sext_i32_i16 s8, s0 +; GFX8-NEXT: s_sext_i32_i16 s9, -1 +; GFX8-NEXT: s_max_i32 s10, s8, s9 +; GFX8-NEXT: s_sub_i32 s10, s10, 0x7fff ; GFX8-NEXT: s_lshr_b32 s6, s2, 16 -; GFX8-NEXT: s_movk_i32 s9, 0x8000 -; GFX8-NEXT: s_min_i32 s10, s10, s11 -; GFX8-NEXT: s_sext_i32_i16 s12, s12 +; GFX8-NEXT: s_min_i32 s8, s8, s9 +; GFX8-NEXT: s_sext_i32_i16 s10, s10 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 -; GFX8-NEXT: s_sub_i32 s10, s10, s9 -; GFX8-NEXT: s_max_i32 s2, s12, s2 +; GFX8-NEXT: s_sub_i32 s8, s8, 0xffff8000 +; GFX8-NEXT: s_max_i32 s2, s10, s2 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 -; GFX8-NEXT: s_sext_i32_i16 s10, s10 +; GFX8-NEXT: s_sext_i32_i16 s8, s8 ; GFX8-NEXT: s_lshr_b32 s4, s0, 16 -; GFX8-NEXT: s_min_i32 s2, s2, s10 +; GFX8-NEXT: s_min_i32 s2, s2, s8 ; GFX8-NEXT: s_sub_i32 s0, s0, s2 ; GFX8-NEXT: s_sext_i32_i16 s2, s4 -; GFX8-NEXT: s_max_i32 s10, s2, s11 -; GFX8-NEXT: s_sub_i32 s10, s10, s8 -; GFX8-NEXT: s_min_i32 s2, s2, s11 -; GFX8-NEXT: s_sext_i32_i16 s10, s10 +; GFX8-NEXT: s_max_i32 s8, s2, s9 +; GFX8-NEXT: s_sub_i32 s8, s8, 0x7fff +; GFX8-NEXT: s_min_i32 s2, s2, s9 +; GFX8-NEXT: s_sext_i32_i16 s8, s8 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 -; GFX8-NEXT: s_sub_i32 s2, s2, s9 -; GFX8-NEXT: s_max_i32 s6, s10, s6 +; GFX8-NEXT: s_sub_i32 s2, s2, 0xffff8000 +; GFX8-NEXT: s_max_i32 s6, s8, s6 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 ; GFX8-NEXT: s_min_i32 s2, s6, s2 ; GFX8-NEXT: s_sub_i32 s2, s4, s2 ; GFX8-NEXT: s_sext_i32_i16 s4, s1 -; GFX8-NEXT: s_max_i32 s6, s4, s11 -; GFX8-NEXT: s_sub_i32 s6, s6, s8 +; GFX8-NEXT: s_max_i32 s6, s4, s9 +; GFX8-NEXT: s_sub_i32 s6, s6, 0x7fff ; GFX8-NEXT: s_lshr_b32 s7, s3, 16 -; GFX8-NEXT: s_min_i32 s4, s4, s11 +; GFX8-NEXT: s_min_i32 s4, s4, s9 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 -; GFX8-NEXT: s_sub_i32 s4, s4, s9 +; GFX8-NEXT: s_sub_i32 s4, s4, 0xffff8000 ; GFX8-NEXT: s_max_i32 s3, s6, s3 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 @@ -3221,12 +3165,12 @@ ; GFX8-NEXT: s_min_i32 s3, s3, s4 ; GFX8-NEXT: s_sub_i32 s1, s1, s3 ; GFX8-NEXT: s_sext_i32_i16 s3, s5 -; GFX8-NEXT: s_max_i32 s4, s3, s11 -; GFX8-NEXT: s_sub_i32 s4, s4, s8 -; GFX8-NEXT: s_min_i32 s3, s3, s11 +; GFX8-NEXT: s_max_i32 s4, s3, s9 +; GFX8-NEXT: s_sub_i32 s4, s4, 0x7fff +; GFX8-NEXT: s_min_i32 s3, s3, s9 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s6, s7 -; GFX8-NEXT: s_sub_i32 s3, s3, s9 +; GFX8-NEXT: s_sub_i32 s3, s3, 0xffff8000 ; GFX8-NEXT: s_max_i32 s4, s4, s6 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 @@ -3440,121 +3384,116 @@ ; GFX6-LABEL: s_ssubsat_v6i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_lshl_b32 s0, s0, 16 -; GFX6-NEXT: s_brev_b32 s12, -2 -; GFX6-NEXT: s_max_i32 s14, s0, -1 +; GFX6-NEXT: s_max_i32 s12, s0, -1 ; GFX6-NEXT: s_lshl_b32 s6, s6, 16 -; GFX6-NEXT: s_brev_b32 s13, 1 -; GFX6-NEXT: s_sub_i32 s14, s14, s12 -; GFX6-NEXT: s_min_i32 s15, s0, -1 -; GFX6-NEXT: s_sub_i32 s15, s15, s13 -; GFX6-NEXT: s_max_i32 s6, s14, s6 -; GFX6-NEXT: s_min_i32 s6, s6, s15 +; GFX6-NEXT: s_sub_i32 s12, s12, 0x7fffffff +; GFX6-NEXT: s_min_i32 s13, s0, -1 +; GFX6-NEXT: s_sub_i32 s13, s13, 0x80000000 +; GFX6-NEXT: s_max_i32 s6, s12, s6 +; GFX6-NEXT: s_min_i32 s6, s6, s13 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_sub_i32 s0, s0, s6 ; GFX6-NEXT: s_lshl_b32 s6, s7, 16 ; GFX6-NEXT: s_max_i32 s7, s1, -1 -; GFX6-NEXT: s_sub_i32 s7, s7, s12 -; GFX6-NEXT: s_min_i32 s14, s1, -1 -; GFX6-NEXT: s_sub_i32 s14, s14, s13 +; GFX6-NEXT: s_sub_i32 s7, s7, 0x7fffffff +; GFX6-NEXT: s_min_i32 s12, s1, -1 +; GFX6-NEXT: s_sub_i32 s12, s12, 0x80000000 ; GFX6-NEXT: s_max_i32 s6, s7, s6 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_min_i32 s6, s6, s14 +; GFX6-NEXT: s_min_i32 s6, s6, s12 ; GFX6-NEXT: s_max_i32 s7, s2, -1 ; GFX6-NEXT: s_sub_i32 s1, s1, s6 ; GFX6-NEXT: s_lshl_b32 s6, s8, 16 -; GFX6-NEXT: s_sub_i32 s7, s7, s12 +; GFX6-NEXT: s_sub_i32 s7, s7, 0x7fffffff ; GFX6-NEXT: s_min_i32 s8, s2, -1 -; GFX6-NEXT: s_sub_i32 s8, s8, s13 +; GFX6-NEXT: s_sub_i32 s8, s8, 0x80000000 ; GFX6-NEXT: s_max_i32 s6, s7, s6 ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 ; GFX6-NEXT: s_min_i32 s6, s6, s8 ; GFX6-NEXT: s_max_i32 s7, s3, -1 ; GFX6-NEXT: s_sub_i32 s2, s2, s6 ; GFX6-NEXT: s_lshl_b32 s6, s9, 16 -; GFX6-NEXT: s_sub_i32 s7, s7, s12 +; GFX6-NEXT: s_sub_i32 s7, s7, 0x7fffffff ; GFX6-NEXT: s_min_i32 s8, s3, -1 -; GFX6-NEXT: s_sub_i32 s8, s8, s13 +; GFX6-NEXT: s_sub_i32 s8, s8, 0x80000000 ; GFX6-NEXT: s_max_i32 s6, s7, s6 ; GFX6-NEXT: s_lshl_b32 s4, s4, 16 ; GFX6-NEXT: s_min_i32 s6, s6, s8 ; GFX6-NEXT: s_max_i32 s7, s4, -1 ; GFX6-NEXT: s_sub_i32 s3, s3, s6 ; GFX6-NEXT: s_lshl_b32 s6, s10, 16 -; GFX6-NEXT: s_sub_i32 s7, s7, s12 +; GFX6-NEXT: s_sub_i32 s7, s7, 0x7fffffff ; GFX6-NEXT: s_min_i32 s8, s4, -1 -; GFX6-NEXT: s_sub_i32 s8, s8, s13 +; GFX6-NEXT: s_sub_i32 s8, s8, 0x80000000 ; GFX6-NEXT: s_max_i32 s6, s7, s6 ; GFX6-NEXT: s_lshl_b32 s5, s5, 16 ; GFX6-NEXT: s_min_i32 s6, s6, s8 ; GFX6-NEXT: s_max_i32 s7, s5, -1 ; GFX6-NEXT: s_sub_i32 s4, s4, s6 ; GFX6-NEXT: s_lshl_b32 s6, s11, 16 -; GFX6-NEXT: s_sub_i32 s7, s7, s12 +; GFX6-NEXT: s_sub_i32 s7, s7, 0x7fffffff ; GFX6-NEXT: s_min_i32 s8, s5, -1 -; GFX6-NEXT: s_sub_i32 s8, s8, s13 -; GFX6-NEXT: s_max_i32 s6, s7, s6 -; GFX6-NEXT: s_min_i32 s6, s6, s8 ; GFX6-NEXT: s_ashr_i32 s1, s1, 16 -; GFX6-NEXT: s_sub_i32 s5, s5, s6 -; GFX6-NEXT: s_mov_b32 s6, 0xffff +; GFX6-NEXT: s_sub_i32 s8, s8, 0x80000000 +; GFX6-NEXT: s_max_i32 s6, s7, s6 ; GFX6-NEXT: s_ashr_i32 s0, s0, 16 -; GFX6-NEXT: s_and_b32 s1, s1, s6 +; GFX6-NEXT: s_min_i32 s6, s6, s8 +; GFX6-NEXT: s_and_b32 s1, s1, 0xffff ; GFX6-NEXT: s_ashr_i32 s2, s2, 16 ; GFX6-NEXT: s_ashr_i32 s3, s3, 16 -; GFX6-NEXT: s_and_b32 s0, s0, s6 +; GFX6-NEXT: s_sub_i32 s5, s5, s6 +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_ashr_i32 s5, s5, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s2, s6 -; GFX6-NEXT: s_and_b32 s2, s3, s6 +; GFX6-NEXT: s_and_b32 s1, s2, 0xffff +; GFX6-NEXT: s_and_b32 s2, s3, 0xffff ; GFX6-NEXT: s_ashr_i32 s4, s4, 16 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_and_b32 s3, s5, s6 +; GFX6-NEXT: s_and_b32 s3, s5, 0xffff ; GFX6-NEXT: s_or_b32 s1, s1, s2 -; GFX6-NEXT: s_and_b32 s2, s4, s6 +; GFX6-NEXT: s_and_b32 s2, s4, 0xffff ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 ; GFX6-NEXT: s_or_b32 s2, s2, s3 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_ssubsat_v6i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_sext_i32_i16 s14, s0 -; GFX8-NEXT: s_sext_i32_i16 s15, -1 -; GFX8-NEXT: s_movk_i32 s12, 0x7fff -; GFX8-NEXT: s_max_i32 s16, s14, s15 -; GFX8-NEXT: s_sub_i32 s16, s16, s12 +; GFX8-NEXT: s_sext_i32_i16 s12, s0 +; GFX8-NEXT: s_sext_i32_i16 s13, -1 +; GFX8-NEXT: s_max_i32 s14, s12, s13 +; GFX8-NEXT: s_sub_i32 s14, s14, 0x7fff ; GFX8-NEXT: s_lshr_b32 s9, s3, 16 -; GFX8-NEXT: s_movk_i32 s13, 0x8000 -; GFX8-NEXT: s_min_i32 s14, s14, s15 -; GFX8-NEXT: s_sext_i32_i16 s16, s16 +; GFX8-NEXT: s_min_i32 s12, s12, s13 +; GFX8-NEXT: s_sext_i32_i16 s14, s14 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 -; GFX8-NEXT: s_sub_i32 s14, s14, s13 -; GFX8-NEXT: s_max_i32 s3, s16, s3 +; GFX8-NEXT: s_sub_i32 s12, s12, 0xffff8000 +; GFX8-NEXT: s_max_i32 s3, s14, s3 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 -; GFX8-NEXT: s_sext_i32_i16 s14, s14 +; GFX8-NEXT: s_sext_i32_i16 s12, s12 ; GFX8-NEXT: s_lshr_b32 s6, s0, 16 -; GFX8-NEXT: s_min_i32 s3, s3, s14 +; GFX8-NEXT: s_min_i32 s3, s3, s12 ; GFX8-NEXT: s_sub_i32 s0, s0, s3 ; GFX8-NEXT: s_sext_i32_i16 s3, s6 -; GFX8-NEXT: s_max_i32 s14, s3, s15 -; GFX8-NEXT: s_sub_i32 s14, s14, s12 -; GFX8-NEXT: s_min_i32 s3, s3, s15 -; GFX8-NEXT: s_sext_i32_i16 s14, s14 +; GFX8-NEXT: s_max_i32 s12, s3, s13 +; GFX8-NEXT: s_sub_i32 s12, s12, 0x7fff +; GFX8-NEXT: s_min_i32 s3, s3, s13 +; GFX8-NEXT: s_sext_i32_i16 s12, s12 ; GFX8-NEXT: s_sext_i32_i16 s9, s9 -; GFX8-NEXT: s_sub_i32 s3, s3, s13 -; GFX8-NEXT: s_max_i32 s9, s14, s9 +; GFX8-NEXT: s_sub_i32 s3, s3, 0xffff8000 +; GFX8-NEXT: s_max_i32 s9, s12, s9 ; GFX8-NEXT: s_sext_i32_i16 s9, s9 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_min_i32 s3, s9, s3 ; GFX8-NEXT: s_sub_i32 s3, s6, s3 ; GFX8-NEXT: s_sext_i32_i16 s6, s1 -; GFX8-NEXT: s_max_i32 s9, s6, s15 -; GFX8-NEXT: s_sub_i32 s9, s9, s12 +; GFX8-NEXT: s_max_i32 s9, s6, s13 +; GFX8-NEXT: s_sub_i32 s9, s9, 0x7fff ; GFX8-NEXT: s_lshr_b32 s10, s4, 16 -; GFX8-NEXT: s_min_i32 s6, s6, s15 +; GFX8-NEXT: s_min_i32 s6, s6, s13 ; GFX8-NEXT: s_sext_i32_i16 s9, s9 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 -; GFX8-NEXT: s_sub_i32 s6, s6, s13 +; GFX8-NEXT: s_sub_i32 s6, s6, 0xffff8000 ; GFX8-NEXT: s_max_i32 s4, s9, s4 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 @@ -3562,25 +3501,25 @@ ; GFX8-NEXT: s_min_i32 s4, s4, s6 ; GFX8-NEXT: s_sub_i32 s1, s1, s4 ; GFX8-NEXT: s_sext_i32_i16 s4, s7 -; GFX8-NEXT: s_max_i32 s6, s4, s15 -; GFX8-NEXT: s_sub_i32 s6, s6, s12 -; GFX8-NEXT: s_min_i32 s4, s4, s15 +; GFX8-NEXT: s_max_i32 s6, s4, s13 +; GFX8-NEXT: s_sub_i32 s6, s6, 0x7fff +; GFX8-NEXT: s_min_i32 s4, s4, s13 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 ; GFX8-NEXT: s_sext_i32_i16 s9, s10 -; GFX8-NEXT: s_sub_i32 s4, s4, s13 +; GFX8-NEXT: s_sub_i32 s4, s4, 0xffff8000 ; GFX8-NEXT: s_max_i32 s6, s6, s9 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_min_i32 s4, s6, s4 ; GFX8-NEXT: s_sext_i32_i16 s6, s2 ; GFX8-NEXT: s_sub_i32 s4, s7, s4 -; GFX8-NEXT: s_max_i32 s7, s6, s15 -; GFX8-NEXT: s_sub_i32 s7, s7, s12 +; GFX8-NEXT: s_max_i32 s7, s6, s13 +; GFX8-NEXT: s_sub_i32 s7, s7, 0x7fff ; GFX8-NEXT: s_lshr_b32 s11, s5, 16 -; GFX8-NEXT: s_min_i32 s6, s6, s15 +; GFX8-NEXT: s_min_i32 s6, s6, s13 ; GFX8-NEXT: s_sext_i32_i16 s7, s7 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 -; GFX8-NEXT: s_sub_i32 s6, s6, s13 +; GFX8-NEXT: s_sub_i32 s6, s6, 0xffff8000 ; GFX8-NEXT: s_max_i32 s5, s7, s5 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 @@ -3588,12 +3527,12 @@ ; GFX8-NEXT: s_min_i32 s5, s5, s6 ; GFX8-NEXT: s_sub_i32 s2, s2, s5 ; GFX8-NEXT: s_sext_i32_i16 s5, s8 -; GFX8-NEXT: s_max_i32 s6, s5, s15 -; GFX8-NEXT: s_sub_i32 s6, s6, s12 -; GFX8-NEXT: s_min_i32 s5, s5, s15 +; GFX8-NEXT: s_max_i32 s6, s5, s13 +; GFX8-NEXT: s_sub_i32 s6, s6, 0x7fff +; GFX8-NEXT: s_min_i32 s5, s5, s13 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 ; GFX8-NEXT: s_sext_i32_i16 s7, s11 -; GFX8-NEXT: s_sub_i32 s5, s5, s13 +; GFX8-NEXT: s_sub_i32 s5, s5, 0xffff8000 ; GFX8-NEXT: s_max_i32 s6, s6, s7 ; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 @@ -3847,145 +3786,140 @@ ; GFX6-LABEL: s_ssubsat_v8i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_lshl_b32 s0, s0, 16 -; GFX6-NEXT: s_brev_b32 s16, -2 -; GFX6-NEXT: s_max_i32 s18, s0, -1 +; GFX6-NEXT: s_max_i32 s16, s0, -1 ; GFX6-NEXT: s_lshl_b32 s8, s8, 16 -; GFX6-NEXT: s_brev_b32 s17, 1 -; GFX6-NEXT: s_sub_i32 s18, s18, s16 -; GFX6-NEXT: s_min_i32 s19, s0, -1 -; GFX6-NEXT: s_sub_i32 s19, s19, s17 -; GFX6-NEXT: s_max_i32 s8, s18, s8 -; GFX6-NEXT: s_min_i32 s8, s8, s19 +; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff +; GFX6-NEXT: s_min_i32 s17, s0, -1 +; GFX6-NEXT: s_sub_i32 s17, s17, 0x80000000 +; GFX6-NEXT: s_max_i32 s8, s16, s8 +; GFX6-NEXT: s_min_i32 s8, s8, s17 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_sub_i32 s0, s0, s8 ; GFX6-NEXT: s_lshl_b32 s8, s9, 16 ; GFX6-NEXT: s_max_i32 s9, s1, -1 -; GFX6-NEXT: s_sub_i32 s9, s9, s16 -; GFX6-NEXT: s_min_i32 s18, s1, -1 -; GFX6-NEXT: s_sub_i32 s18, s18, s17 +; GFX6-NEXT: s_sub_i32 s9, s9, 0x7fffffff +; GFX6-NEXT: s_min_i32 s16, s1, -1 +; GFX6-NEXT: s_sub_i32 s16, s16, 0x80000000 ; GFX6-NEXT: s_max_i32 s8, s9, s8 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_min_i32 s8, s8, s18 +; GFX6-NEXT: s_min_i32 s8, s8, s16 ; GFX6-NEXT: s_max_i32 s9, s2, -1 ; GFX6-NEXT: s_sub_i32 s1, s1, s8 ; GFX6-NEXT: s_lshl_b32 s8, s10, 16 -; GFX6-NEXT: s_sub_i32 s9, s9, s16 +; GFX6-NEXT: s_sub_i32 s9, s9, 0x7fffffff ; GFX6-NEXT: s_min_i32 s10, s2, -1 -; GFX6-NEXT: s_sub_i32 s10, s10, s17 +; GFX6-NEXT: s_sub_i32 s10, s10, 0x80000000 ; GFX6-NEXT: s_max_i32 s8, s9, s8 ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 ; GFX6-NEXT: s_min_i32 s8, s8, s10 ; GFX6-NEXT: s_max_i32 s9, s3, -1 ; GFX6-NEXT: s_sub_i32 s2, s2, s8 ; GFX6-NEXT: s_lshl_b32 s8, s11, 16 -; GFX6-NEXT: s_sub_i32 s9, s9, s16 +; GFX6-NEXT: s_sub_i32 s9, s9, 0x7fffffff ; GFX6-NEXT: s_min_i32 s10, s3, -1 -; GFX6-NEXT: s_sub_i32 s10, s10, s17 +; GFX6-NEXT: s_sub_i32 s10, s10, 0x80000000 ; GFX6-NEXT: s_max_i32 s8, s9, s8 ; GFX6-NEXT: s_lshl_b32 s4, s4, 16 ; GFX6-NEXT: s_min_i32 s8, s8, s10 ; GFX6-NEXT: s_max_i32 s9, s4, -1 ; GFX6-NEXT: s_sub_i32 s3, s3, s8 ; GFX6-NEXT: s_lshl_b32 s8, s12, 16 -; GFX6-NEXT: s_sub_i32 s9, s9, s16 +; GFX6-NEXT: s_sub_i32 s9, s9, 0x7fffffff ; GFX6-NEXT: s_min_i32 s10, s4, -1 -; GFX6-NEXT: s_sub_i32 s10, s10, s17 +; GFX6-NEXT: s_sub_i32 s10, s10, 0x80000000 ; GFX6-NEXT: s_max_i32 s8, s9, s8 ; GFX6-NEXT: s_lshl_b32 s5, s5, 16 ; GFX6-NEXT: s_min_i32 s8, s8, s10 ; GFX6-NEXT: s_max_i32 s9, s5, -1 ; GFX6-NEXT: s_sub_i32 s4, s4, s8 ; GFX6-NEXT: s_lshl_b32 s8, s13, 16 -; GFX6-NEXT: s_sub_i32 s9, s9, s16 +; GFX6-NEXT: s_sub_i32 s9, s9, 0x7fffffff ; GFX6-NEXT: s_min_i32 s10, s5, -1 -; GFX6-NEXT: s_sub_i32 s10, s10, s17 +; GFX6-NEXT: s_sub_i32 s10, s10, 0x80000000 ; GFX6-NEXT: s_max_i32 s8, s9, s8 ; GFX6-NEXT: s_lshl_b32 s6, s6, 16 ; GFX6-NEXT: s_min_i32 s8, s8, s10 ; GFX6-NEXT: s_max_i32 s9, s6, -1 ; GFX6-NEXT: s_sub_i32 s5, s5, s8 ; GFX6-NEXT: s_lshl_b32 s8, s14, 16 -; GFX6-NEXT: s_sub_i32 s9, s9, s16 +; GFX6-NEXT: s_sub_i32 s9, s9, 0x7fffffff ; GFX6-NEXT: s_min_i32 s10, s6, -1 -; GFX6-NEXT: s_sub_i32 s10, s10, s17 +; GFX6-NEXT: s_sub_i32 s10, s10, 0x80000000 ; GFX6-NEXT: s_max_i32 s8, s9, s8 ; GFX6-NEXT: s_lshl_b32 s7, s7, 16 ; GFX6-NEXT: s_min_i32 s8, s8, s10 ; GFX6-NEXT: s_max_i32 s9, s7, -1 +; GFX6-NEXT: s_ashr_i32 s1, s1, 16 ; GFX6-NEXT: s_sub_i32 s6, s6, s8 ; GFX6-NEXT: s_lshl_b32 s8, s15, 16 -; GFX6-NEXT: s_sub_i32 s9, s9, s16 +; GFX6-NEXT: s_sub_i32 s9, s9, 0x7fffffff ; GFX6-NEXT: s_min_i32 s10, s7, -1 -; GFX6-NEXT: s_sub_i32 s10, s10, s17 -; GFX6-NEXT: s_max_i32 s8, s9, s8 -; GFX6-NEXT: s_min_i32 s8, s8, s10 -; GFX6-NEXT: s_ashr_i32 s1, s1, 16 -; GFX6-NEXT: s_sub_i32 s7, s7, s8 -; GFX6-NEXT: s_mov_b32 s8, 0xffff ; GFX6-NEXT: s_ashr_i32 s0, s0, 16 -; GFX6-NEXT: s_and_b32 s1, s1, s8 +; GFX6-NEXT: s_sub_i32 s10, s10, 0x80000000 +; GFX6-NEXT: s_max_i32 s8, s9, s8 +; GFX6-NEXT: s_and_b32 s1, s1, 0xffff ; GFX6-NEXT: s_ashr_i32 s2, s2, 16 ; GFX6-NEXT: s_ashr_i32 s3, s3, 16 -; GFX6-NEXT: s_and_b32 s0, s0, s8 +; GFX6-NEXT: s_min_i32 s8, s8, s10 +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_ashr_i32 s5, s5, 16 +; GFX6-NEXT: s_sub_i32 s7, s7, s8 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s2, s8 -; GFX6-NEXT: s_and_b32 s2, s3, s8 +; GFX6-NEXT: s_and_b32 s1, s2, 0xffff +; GFX6-NEXT: s_and_b32 s2, s3, 0xffff ; GFX6-NEXT: s_ashr_i32 s4, s4, 16 ; GFX6-NEXT: s_ashr_i32 s7, s7, 16 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_and_b32 s3, s5, s8 +; GFX6-NEXT: s_and_b32 s3, s5, 0xffff ; GFX6-NEXT: s_ashr_i32 s6, s6, 16 ; GFX6-NEXT: s_or_b32 s1, s1, s2 -; GFX6-NEXT: s_and_b32 s2, s4, s8 +; GFX6-NEXT: s_and_b32 s2, s4, 0xffff ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 -; GFX6-NEXT: s_and_b32 s4, s7, s8 +; GFX6-NEXT: s_and_b32 s4, s7, 0xffff ; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_and_b32 s3, s6, s8 +; GFX6-NEXT: s_and_b32 s3, s6, 0xffff ; GFX6-NEXT: s_lshl_b32 s4, s4, 16 ; GFX6-NEXT: s_or_b32 s3, s3, s4 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_ssubsat_v8i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_sext_i32_i16 s18, s0 -; GFX8-NEXT: s_sext_i32_i16 s19, -1 -; GFX8-NEXT: s_movk_i32 s16, 0x7fff -; GFX8-NEXT: s_max_i32 s20, s18, s19 -; GFX8-NEXT: s_sub_i32 s20, s20, s16 +; GFX8-NEXT: s_sext_i32_i16 s16, s0 +; GFX8-NEXT: s_sext_i32_i16 s17, -1 +; GFX8-NEXT: s_max_i32 s18, s16, s17 +; GFX8-NEXT: s_sub_i32 s18, s18, 0x7fff ; GFX8-NEXT: s_lshr_b32 s12, s4, 16 -; GFX8-NEXT: s_movk_i32 s17, 0x8000 -; GFX8-NEXT: s_min_i32 s18, s18, s19 -; GFX8-NEXT: s_sext_i32_i16 s20, s20 +; GFX8-NEXT: s_min_i32 s16, s16, s17 +; GFX8-NEXT: s_sext_i32_i16 s18, s18 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 -; GFX8-NEXT: s_sub_i32 s18, s18, s17 -; GFX8-NEXT: s_max_i32 s4, s20, s4 +; GFX8-NEXT: s_sub_i32 s16, s16, 0xffff8000 +; GFX8-NEXT: s_max_i32 s4, s18, s4 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 -; GFX8-NEXT: s_sext_i32_i16 s18, s18 +; GFX8-NEXT: s_sext_i32_i16 s16, s16 ; GFX8-NEXT: s_lshr_b32 s8, s0, 16 -; GFX8-NEXT: s_min_i32 s4, s4, s18 +; GFX8-NEXT: s_min_i32 s4, s4, s16 ; GFX8-NEXT: s_sub_i32 s0, s0, s4 ; GFX8-NEXT: s_sext_i32_i16 s4, s8 -; GFX8-NEXT: s_max_i32 s18, s4, s19 -; GFX8-NEXT: s_sub_i32 s18, s18, s16 -; GFX8-NEXT: s_min_i32 s4, s4, s19 -; GFX8-NEXT: s_sext_i32_i16 s18, s18 +; GFX8-NEXT: s_max_i32 s16, s4, s17 +; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fff +; GFX8-NEXT: s_min_i32 s4, s4, s17 +; GFX8-NEXT: s_sext_i32_i16 s16, s16 ; GFX8-NEXT: s_sext_i32_i16 s12, s12 -; GFX8-NEXT: s_sub_i32 s4, s4, s17 -; GFX8-NEXT: s_max_i32 s12, s18, s12 +; GFX8-NEXT: s_sub_i32 s4, s4, 0xffff8000 +; GFX8-NEXT: s_max_i32 s12, s16, s12 ; GFX8-NEXT: s_sext_i32_i16 s12, s12 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_min_i32 s4, s12, s4 ; GFX8-NEXT: s_sub_i32 s4, s8, s4 ; GFX8-NEXT: s_sext_i32_i16 s8, s1 -; GFX8-NEXT: s_max_i32 s12, s8, s19 -; GFX8-NEXT: s_sub_i32 s12, s12, s16 +; GFX8-NEXT: s_max_i32 s12, s8, s17 +; GFX8-NEXT: s_sub_i32 s12, s12, 0x7fff ; GFX8-NEXT: s_lshr_b32 s13, s5, 16 -; GFX8-NEXT: s_min_i32 s8, s8, s19 +; GFX8-NEXT: s_min_i32 s8, s8, s17 ; GFX8-NEXT: s_sext_i32_i16 s12, s12 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 -; GFX8-NEXT: s_sub_i32 s8, s8, s17 +; GFX8-NEXT: s_sub_i32 s8, s8, 0xffff8000 ; GFX8-NEXT: s_max_i32 s5, s12, s5 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 ; GFX8-NEXT: s_sext_i32_i16 s8, s8 @@ -3993,25 +3927,25 @@ ; GFX8-NEXT: s_min_i32 s5, s5, s8 ; GFX8-NEXT: s_sub_i32 s1, s1, s5 ; GFX8-NEXT: s_sext_i32_i16 s5, s9 -; GFX8-NEXT: s_max_i32 s8, s5, s19 -; GFX8-NEXT: s_sub_i32 s8, s8, s16 -; GFX8-NEXT: s_min_i32 s5, s5, s19 +; GFX8-NEXT: s_max_i32 s8, s5, s17 +; GFX8-NEXT: s_sub_i32 s8, s8, 0x7fff +; GFX8-NEXT: s_min_i32 s5, s5, s17 ; GFX8-NEXT: s_sext_i32_i16 s8, s8 ; GFX8-NEXT: s_sext_i32_i16 s12, s13 -; GFX8-NEXT: s_sub_i32 s5, s5, s17 +; GFX8-NEXT: s_sub_i32 s5, s5, 0xffff8000 ; GFX8-NEXT: s_max_i32 s8, s8, s12 ; GFX8-NEXT: s_sext_i32_i16 s8, s8 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 ; GFX8-NEXT: s_min_i32 s5, s8, s5 ; GFX8-NEXT: s_sext_i32_i16 s8, s2 ; GFX8-NEXT: s_sub_i32 s5, s9, s5 -; GFX8-NEXT: s_max_i32 s9, s8, s19 -; GFX8-NEXT: s_sub_i32 s9, s9, s16 +; GFX8-NEXT: s_max_i32 s9, s8, s17 +; GFX8-NEXT: s_sub_i32 s9, s9, 0x7fff ; GFX8-NEXT: s_lshr_b32 s14, s6, 16 -; GFX8-NEXT: s_min_i32 s8, s8, s19 +; GFX8-NEXT: s_min_i32 s8, s8, s17 ; GFX8-NEXT: s_sext_i32_i16 s9, s9 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 -; GFX8-NEXT: s_sub_i32 s8, s8, s17 +; GFX8-NEXT: s_sub_i32 s8, s8, 0xffff8000 ; GFX8-NEXT: s_max_i32 s6, s9, s6 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 ; GFX8-NEXT: s_sext_i32_i16 s8, s8 @@ -4019,24 +3953,24 @@ ; GFX8-NEXT: s_min_i32 s6, s6, s8 ; GFX8-NEXT: s_sub_i32 s2, s2, s6 ; GFX8-NEXT: s_sext_i32_i16 s6, s10 -; GFX8-NEXT: s_max_i32 s8, s6, s19 -; GFX8-NEXT: s_sub_i32 s8, s8, s16 -; GFX8-NEXT: s_min_i32 s6, s6, s19 +; GFX8-NEXT: s_max_i32 s8, s6, s17 +; GFX8-NEXT: s_sub_i32 s8, s8, 0x7fff +; GFX8-NEXT: s_min_i32 s6, s6, s17 ; GFX8-NEXT: s_sext_i32_i16 s8, s8 ; GFX8-NEXT: s_sext_i32_i16 s9, s14 -; GFX8-NEXT: s_sub_i32 s6, s6, s17 +; GFX8-NEXT: s_sub_i32 s6, s6, 0xffff8000 ; GFX8-NEXT: s_max_i32 s8, s8, s9 ; GFX8-NEXT: s_sext_i32_i16 s8, s8 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 ; GFX8-NEXT: s_min_i32 s6, s8, s6 ; GFX8-NEXT: s_sext_i32_i16 s8, s3 -; GFX8-NEXT: s_max_i32 s9, s8, s19 -; GFX8-NEXT: s_sub_i32 s9, s9, s16 +; GFX8-NEXT: s_max_i32 s9, s8, s17 +; GFX8-NEXT: s_sub_i32 s9, s9, 0x7fff ; GFX8-NEXT: s_lshr_b32 s15, s7, 16 -; GFX8-NEXT: s_min_i32 s8, s8, s19 +; GFX8-NEXT: s_min_i32 s8, s8, s17 ; GFX8-NEXT: s_sext_i32_i16 s9, s9 ; GFX8-NEXT: s_sext_i32_i16 s7, s7 -; GFX8-NEXT: s_sub_i32 s8, s8, s17 +; GFX8-NEXT: s_sub_i32 s8, s8, 0xffff8000 ; GFX8-NEXT: s_max_i32 s7, s9, s7 ; GFX8-NEXT: s_sext_i32_i16 s7, s7 ; GFX8-NEXT: s_sext_i32_i16 s8, s8 @@ -4044,15 +3978,15 @@ ; GFX8-NEXT: s_min_i32 s7, s7, s8 ; GFX8-NEXT: s_sub_i32 s3, s3, s7 ; GFX8-NEXT: s_sext_i32_i16 s7, s11 -; GFX8-NEXT: s_max_i32 s8, s7, s19 -; GFX8-NEXT: s_sub_i32 s8, s8, s16 +; GFX8-NEXT: s_max_i32 s8, s7, s17 +; GFX8-NEXT: s_sub_i32 s8, s8, 0x7fff ; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX8-NEXT: s_min_i32 s7, s7, s19 +; GFX8-NEXT: s_min_i32 s7, s7, s17 ; GFX8-NEXT: s_sext_i32_i16 s8, s8 ; GFX8-NEXT: s_sext_i32_i16 s9, s15 ; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 ; GFX8-NEXT: s_lshl_b32 s4, s4, 16 -; GFX8-NEXT: s_sub_i32 s7, s7, s17 +; GFX8-NEXT: s_sub_i32 s7, s7, 0xffff8000 ; GFX8-NEXT: s_max_i32 s8, s8, s9 ; GFX8-NEXT: s_or_b32 s0, s0, s4 ; GFX8-NEXT: s_bfe_u32 s4, s5, 0x100000 @@ -4539,13 +4473,12 @@ ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] ; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[4:5], 0 ; GFX6-NEXT: s_ashr_i32 s4, s9, 31 -; GFX6-NEXT: s_mov_b32 s10, 0 +; GFX6-NEXT: s_mov_b32 s5, 0 ; GFX6-NEXT: s_xor_b64 vcc, s[0:1], vcc -; GFX6-NEXT: s_brev_b32 s5, 1 -; GFX6-NEXT: s_cmp_lg_u32 s10, 0 +; GFX6-NEXT: s_cmp_lg_u32 s5, 0 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s8 -; GFX6-NEXT: s_addc_u32 s1, s4, s5 +; GFX6-NEXT: s_addc_u32 s1, s4, 0x80000000 ; GFX6-NEXT: v_cndmask_b32_e32 v4, v1, v0, vcc ; GFX6-NEXT: s_sub_u32 s0, s2, s6 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -4557,10 +4490,10 @@ ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] ; GFX6-NEXT: v_cmp_gt_i64_e64 s[2:3], s[6:7], 0 ; GFX6-NEXT: s_ashr_i32 s4, s1, 31 -; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s5, 0 ; GFX6-NEXT: s_xor_b64 vcc, s[2:3], vcc -; GFX6-NEXT: s_cmp_lg_u32 s6, 0 -; GFX6-NEXT: s_addc_u32 s3, s4, s5 +; GFX6-NEXT: s_cmp_lg_u32 s5, 0 +; GFX6-NEXT: s_addc_u32 s3, s4, 0x80000000 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: v_mov_b32_e32 v3, s3 @@ -4582,13 +4515,12 @@ ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] ; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[4:5], 0 ; GFX8-NEXT: s_ashr_i32 s4, s9, 31 -; GFX8-NEXT: s_mov_b32 s10, 0 +; GFX8-NEXT: s_mov_b32 s5, 0 ; GFX8-NEXT: s_xor_b64 vcc, s[0:1], vcc -; GFX8-NEXT: s_brev_b32 s5, 1 -; GFX8-NEXT: s_cmp_lg_u32 s10, 0 +; GFX8-NEXT: s_cmp_lg_u32 s5, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s8 -; GFX8-NEXT: s_addc_u32 s1, s4, s5 +; GFX8-NEXT: s_addc_u32 s1, s4, 0x80000000 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v1, v0, vcc ; GFX8-NEXT: s_sub_u32 s0, s2, s6 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 @@ -4600,10 +4532,10 @@ ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] ; GFX8-NEXT: v_cmp_gt_i64_e64 s[2:3], s[6:7], 0 ; GFX8-NEXT: s_ashr_i32 s4, s1, 31 -; GFX8-NEXT: s_mov_b32 s6, 0 +; GFX8-NEXT: s_mov_b32 s5, 0 ; GFX8-NEXT: s_xor_b64 vcc, s[2:3], vcc -; GFX8-NEXT: s_cmp_lg_u32 s6, 0 -; GFX8-NEXT: s_addc_u32 s3, s4, s5 +; GFX8-NEXT: s_cmp_lg_u32 s5, 0 +; GFX8-NEXT: s_addc_u32 s3, s4, 0x80000000 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s0 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 @@ -4625,13 +4557,12 @@ ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] ; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[4:5], 0 ; GFX9-NEXT: s_ashr_i32 s4, s9, 31 -; GFX9-NEXT: s_mov_b32 s10, 0 +; GFX9-NEXT: s_mov_b32 s5, 0 ; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc -; GFX9-NEXT: s_brev_b32 s5, 1 -; GFX9-NEXT: s_cmp_lg_u32 s10, 0 +; GFX9-NEXT: s_cmp_lg_u32 s5, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 -; GFX9-NEXT: s_addc_u32 s1, s4, s5 +; GFX9-NEXT: s_addc_u32 s1, s4, 0x80000000 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v1, v0, vcc ; GFX9-NEXT: s_sub_u32 s0, s2, s6 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -4643,10 +4574,10 @@ ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] ; GFX9-NEXT: v_cmp_gt_i64_e64 s[2:3], s[6:7], 0 ; GFX9-NEXT: s_ashr_i32 s4, s1, 31 -; GFX9-NEXT: s_mov_b32 s6, 0 +; GFX9-NEXT: s_mov_b32 s5, 0 ; GFX9-NEXT: s_xor_b64 vcc, s[2:3], vcc -; GFX9-NEXT: s_cmp_lg_u32 s6, 0 -; GFX9-NEXT: s_addc_u32 s3, s4, s5 +; GFX9-NEXT: s_cmp_lg_u32 s5, 0 +; GFX9-NEXT: s_addc_u32 s3, s4, 0x80000000 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 @@ -4665,15 +4596,14 @@ ; GFX10-NEXT: s_subb_u32 s9, s1, s5 ; GFX10-NEXT: v_cmp_gt_i64_e64 s4, s[4:5], 0 ; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[8:9], s[0:1] -; GFX10-NEXT: s_mov_b32 s11, 0 +; GFX10-NEXT: s_mov_b32 s10, 0 ; GFX10-NEXT: s_ashr_i32 s0, s9, 31 ; GFX10-NEXT: v_mov_b32_e32 v0, s8 -; GFX10-NEXT: s_brev_b32 s10, 1 ; GFX10-NEXT: v_mov_b32_e32 v1, s9 ; GFX10-NEXT: s_xor_b32 s8, s4, s1 -; GFX10-NEXT: s_cmp_lg_u32 s11, 0 +; GFX10-NEXT: s_cmp_lg_u32 s10, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s0, s8 -; GFX10-NEXT: s_addc_u32 s1, s0, s10 +; GFX10-NEXT: s_addc_u32 s1, s0, 0x80000000 ; GFX10-NEXT: s_sub_u32 s4, s2, s6 ; GFX10-NEXT: s_subb_u32 s5, s3, s7 ; GFX10-NEXT: v_mov_b32_e32 v2, s4 @@ -4683,9 +4613,9 @@ ; GFX10-NEXT: v_mov_b32_e32 v3, s5 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s1, s8 ; GFX10-NEXT: s_xor_b32 s2, s3, s2 -; GFX10-NEXT: s_cmp_lg_u32 s11, 0 +; GFX10-NEXT: s_cmp_lg_u32 s10, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s0, s2 -; GFX10-NEXT: s_addc_u32 s1, s0, s10 +; GFX10-NEXT: s_addc_u32 s1, s0, 0x80000000 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s1, s2 ; GFX10-NEXT: v_readfirstlane_b32 s1, v1 @@ -5481,10 +5411,9 @@ ; GFX6-NEXT: s_cmp_lg_u32 s1, 0 ; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_addc_u32 s1, s0, 0 -; GFX6-NEXT: s_brev_b32 s8, 1 ; GFX6-NEXT: s_addc_u32 s2, s0, 0 ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX6-NEXT: s_addc_u32 s3, s0, s8 +; GFX6-NEXT: s_addc_u32 s3, s0, 0x80000000 ; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: v_mov_b32_e32 v2, s1 ; GFX6-NEXT: v_mov_b32_e32 v3, s16 @@ -5525,7 +5454,7 @@ ; GFX6-NEXT: s_addc_u32 s5, s4, 0 ; GFX6-NEXT: s_addc_u32 s6, s4, 0 ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX6-NEXT: s_addc_u32 s7, s4, s8 +; GFX6-NEXT: s_addc_u32 s7, s4, 0x80000000 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: v_mov_b32_e32 v3, s0 @@ -5582,10 +5511,9 @@ ; GFX8-NEXT: s_cmp_lg_u32 s1, 0 ; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_addc_u32 s1, s0, 0 -; GFX8-NEXT: s_brev_b32 s8, 1 ; GFX8-NEXT: s_addc_u32 s2, s0, 0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: s_addc_u32 s3, s0, s8 +; GFX8-NEXT: s_addc_u32 s3, s0, 0x80000000 ; GFX8-NEXT: v_mov_b32_e32 v1, s0 ; GFX8-NEXT: v_mov_b32_e32 v2, s1 ; GFX8-NEXT: v_mov_b32_e32 v3, s16 @@ -5632,7 +5560,7 @@ ; GFX8-NEXT: s_addc_u32 s5, s4, 0 ; GFX8-NEXT: s_addc_u32 s6, s4, 0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: s_addc_u32 s7, s4, s8 +; GFX8-NEXT: s_addc_u32 s7, s4, 0x80000000 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: v_mov_b32_e32 v2, s5 ; GFX8-NEXT: v_mov_b32_e32 v3, s0 @@ -5689,10 +5617,9 @@ ; GFX9-NEXT: s_cmp_lg_u32 s1, 0 ; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX9-NEXT: s_addc_u32 s1, s0, 0 -; GFX9-NEXT: s_brev_b32 s8, 1 ; GFX9-NEXT: s_addc_u32 s2, s0, 0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: s_addc_u32 s3, s0, s8 +; GFX9-NEXT: s_addc_u32 s3, s0, 0x80000000 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: v_mov_b32_e32 v3, s16 @@ -5739,7 +5666,7 @@ ; GFX9-NEXT: s_addc_u32 s5, s4, 0 ; GFX9-NEXT: s_addc_u32 s6, s4, 0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: s_addc_u32 s7, s4, s8 +; GFX9-NEXT: s_addc_u32 s7, s4, 0x80000000 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: v_mov_b32_e32 v3, s0 @@ -5770,7 +5697,6 @@ ; GFX10-NEXT: s_subb_u32 s18, s2, s10 ; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[16:17], s[0:1] ; GFX10-NEXT: s_subb_u32 s19, s3, s11 -; GFX10-NEXT: s_brev_b32 s21, 1 ; GFX10-NEXT: s_cmp_eq_u64 s[18:19], s[2:3] ; GFX10-NEXT: s_cselect_b32 s20, 1, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 @@ -5792,7 +5718,7 @@ ; GFX10-NEXT: s_addc_u32 s1, s0, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2 ; GFX10-NEXT: s_addc_u32 s2, s0, 0 -; GFX10-NEXT: s_addc_u32 s3, s0, s21 +; GFX10-NEXT: s_addc_u32 s3, s0, 0x80000000 ; GFX10-NEXT: s_sub_u32 s8, s4, s12 ; GFX10-NEXT: s_subb_u32 s9, s5, s13 ; GFX10-NEXT: s_subb_u32 s10, s6, s14 @@ -5838,7 +5764,7 @@ ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, s10 ; GFX10-NEXT: s_addc_u32 s2, s0, 0 -; GFX10-NEXT: s_addc_u32 s3, s0, s21 +; GFX10-NEXT: s_addc_u32 s3, s0, 0x80000000 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, s0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, s1, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s2, vcc_lo diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll @@ -113,44 +113,43 @@ ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4 ; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX7-NEXT: s_mov_b32 s1, 0x80008 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_bfe_u32 s3, s4, s1 +; GFX7-NEXT: s_bfe_u32 s2, s4, 0x80008 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s0 -; GFX7-NEXT: s_lshr_b32 s2, s4, 16 +; GFX7-NEXT: s_lshr_b32 s1, s4, 16 ; GFX7-NEXT: ds_write_b8 v1, v0 -; GFX7-NEXT: v_mov_b32_e32 v0, s3 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:1 ; GFX7-NEXT: s_lshr_b32 s0, s4, 24 -; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s1 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:3 -; GFX7-NEXT: s_bfe_u32 s2, s5, s1 +; GFX7-NEXT: s_bfe_u32 s1, s5, 0x80008 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: s_lshr_b32 s0, s5, 16 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:4 -; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s1 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:5 -; GFX7-NEXT: s_lshr_b32 s2, s5, 24 +; GFX7-NEXT: s_lshr_b32 s1, s5, 24 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:6 -; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s1 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:7 -; GFX7-NEXT: s_bfe_u32 s2, s6, s1 +; GFX7-NEXT: s_bfe_u32 s1, s6, 0x80008 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: s_lshr_b32 s0, s6, 16 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:8 -; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s1 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:9 -; GFX7-NEXT: s_lshr_b32 s2, s6, 24 +; GFX7-NEXT: s_lshr_b32 s1, s6, 24 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:10 -; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s1 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:11 -; GFX7-NEXT: s_bfe_u32 s1, s7, s1 +; GFX7-NEXT: s_bfe_u32 s1, s7, 0x80008 ; GFX7-NEXT: v_mov_b32_e32 v0, s7 ; GFX7-NEXT: s_lshr_b32 s0, s7, 16 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:12 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll @@ -98,33 +98,32 @@ ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4 ; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX7-NEXT: s_mov_b32 s1, 0x80008 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_bfe_u32 s3, s4, s1 +; GFX7-NEXT: s_bfe_u32 s2, s4, 0x80008 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s0 -; GFX7-NEXT: s_lshr_b32 s2, s4, 16 +; GFX7-NEXT: s_lshr_b32 s1, s4, 16 ; GFX7-NEXT: ds_write_b8 v1, v0 -; GFX7-NEXT: v_mov_b32_e32 v0, s3 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:1 ; GFX7-NEXT: s_lshr_b32 s0, s4, 24 -; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s1 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:3 -; GFX7-NEXT: s_bfe_u32 s2, s5, s1 +; GFX7-NEXT: s_bfe_u32 s1, s5, 0x80008 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: s_lshr_b32 s0, s5, 16 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:4 -; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s1 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:5 -; GFX7-NEXT: s_lshr_b32 s2, s5, 24 +; GFX7-NEXT: s_lshr_b32 s1, s5, 24 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:6 -; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s1 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:7 -; GFX7-NEXT: s_bfe_u32 s1, s6, s1 +; GFX7-NEXT: s_bfe_u32 s1, s6, 0x80008 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: s_lshr_b32 s0, s6, 16 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:8 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll @@ -967,7 +967,7 @@ ; GFX7-LABEL: usubo_i16_sv: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_mov_b32 s1, 0xffff -; GFX7-NEXT: s_and_b32 s0, s0, s1 +; GFX7-NEXT: s_and_b32 s0, s0, 0xffff ; GFX7-NEXT: v_and_b32_e32 v0, s1, v0 ; GFX7-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 ; GFX7-NEXT: v_and_b32_e32 v1, s1, v0 @@ -980,7 +980,7 @@ ; GFX8-LABEL: usubo_i16_sv: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_mov_b32 s1, 0xffff -; GFX8-NEXT: s_and_b32 s0, s0, s1 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: v_and_b32_e32 v0, s1, v0 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0 ; GFX8-NEXT: v_and_b32_e32 v1, s1, v0 @@ -992,8 +992,7 @@ ; ; GFX9-LABEL: usubo_i16_sv: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s1, 0xffff -; GFX9-NEXT: s_and_b32 s0, s0, s1 +; GFX9-NEXT: s_and_b32 s0, s0, 0xffff ; GFX9-NEXT: v_sub_u32_sdwa v0, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX9-NEXT: v_cmp_ne_u32_sdwa s[0:1], v0, v0 src0_sel:DWORD src1_sel:WORD_0 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/trunc.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/trunc.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/trunc.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/trunc.ll @@ -162,23 +162,21 @@ define amdgpu_ps <2 x i32> @s_trunc_v4i32_to_v4i16(<4 x i32> inreg %src) { ; GFX7-LABEL: s_trunc_v4i32_to_v4i16: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_mov_b32 s4, 0xffff ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 -; GFX7-NEXT: s_and_b32 s0, s0, s4 +; GFX7-NEXT: s_and_b32 s0, s0, 0xffff ; GFX7-NEXT: s_or_b32 s0, s1, s0 ; GFX7-NEXT: s_lshl_b32 s1, s3, 16 -; GFX7-NEXT: s_and_b32 s2, s2, s4 +; GFX7-NEXT: s_and_b32 s2, s2, 0xffff ; GFX7-NEXT: s_or_b32 s1, s1, s2 ; GFX7-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_trunc_v4i32_to_v4i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b32 s4, 0xffff ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_and_b32 s0, s0, s4 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_or_b32 s0, s1, s0 ; GFX8-NEXT: s_lshl_b32 s1, s3, 16 -; GFX8-NEXT: s_and_b32 s2, s2, s4 +; GFX8-NEXT: s_and_b32 s2, s2, 0xffff ; GFX8-NEXT: s_or_b32 s1, s1, s2 ; GFX8-NEXT: ; return to shader part epilog %trunc = trunc <4 x i32> %src to <4 x i16> diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll @@ -242,12 +242,11 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_mov_b32 s4, 8 -; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX10-NEXT: v_lshrrev_b32_sdwa v3, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_lshrrev_b32_sdwa v4, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_sdwa v3, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: s_movk_i32 s4, 0xff -; GFX10-NEXT: v_and_or_b32 v0, v0, v2, v3 -; GFX10-NEXT: v_and_or_b32 v1, v1, v2, v4 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v2 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v3 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_add_u16 v0, v0, v1 clamp @@ -305,17 +304,16 @@ ; GFX9-LABEL: s_uaddsat_v2i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_lshr_b32 s2, s0, 8 -; GFX9-NEXT: s_lshr_b32 s3, s1, 8 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX9-NEXT: s_lshr_b32 s3, s1, 8 +; GFX9-NEXT: s_lshr_b32 s2, s0, 16 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s3 -; GFX9-NEXT: s_mov_b32 s2, 0x80008 -; GFX9-NEXT: s_lshr_b32 s3, s0, 16 -; GFX9-NEXT: s_lshl_b32 s0, s0, s2 -; GFX9-NEXT: s_lshl_b32 s3, s3, 8 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s3 -; GFX9-NEXT: s_lshr_b32 s3, s1, 16 -; GFX9-NEXT: s_lshl_b32 s1, s1, s2 -; GFX9-NEXT: s_lshl_b32 s2, s3, 8 +; GFX9-NEXT: s_lshl_b32 s0, s0, 0x80008 +; GFX9-NEXT: s_lshl_b32 s2, s2, 8 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX9-NEXT: s_lshr_b32 s2, s1, 16 +; GFX9-NEXT: s_lshl_b32 s1, s1, 0x80008 +; GFX9-NEXT: s_lshl_b32 s2, s2, 8 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s2 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: v_pk_add_u16 v0, s0, v0 clamp @@ -332,15 +330,14 @@ ; GFX10-NEXT: s_lshr_b32 s3, s1, 8 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s3 -; GFX10-NEXT: s_mov_b32 s2, 0x80008 -; GFX10-NEXT: s_lshr_b32 s3, s0, 16 -; GFX10-NEXT: s_lshr_b32 s4, s1, 16 -; GFX10-NEXT: s_lshl_b32 s0, s0, s2 +; GFX10-NEXT: s_lshr_b32 s2, s0, 16 +; GFX10-NEXT: s_lshr_b32 s3, s1, 16 +; GFX10-NEXT: s_lshl_b32 s0, s0, 0x80008 +; GFX10-NEXT: s_lshl_b32 s2, s2, 8 +; GFX10-NEXT: s_lshl_b32 s1, s1, 0x80008 ; GFX10-NEXT: s_lshl_b32 s3, s3, 8 -; GFX10-NEXT: s_lshl_b32 s1, s1, s2 -; GFX10-NEXT: s_lshl_b32 s2, s4, 8 -; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s3 -; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s2 +; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s3 ; GFX10-NEXT: v_pk_add_u16 v0, s0, s1 clamp ; GFX10-NEXT: s_movk_i32 s0, 0xff ; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] @@ -466,35 +463,33 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v1 ; GFX10-NEXT: s_mov_b32 s4, 8 -; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX10-NEXT: v_lshrrev_b32_sdwa v3, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX10-NEXT: v_lshrrev_b32_sdwa v7, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX10-NEXT: v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_sdwa v6, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX10-NEXT: v_and_or_b32 v0, v0, v2, v3 -; GFX10-NEXT: v_and_or_b32 v1, v1, v2, v7 -; GFX10-NEXT: s_movk_i32 s4, 0xff -; GFX10-NEXT: v_and_or_b32 v3, v6, v2, v4 -; GFX10-NEXT: v_and_or_b32 v2, v8, v2, v5 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v2 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v6 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v5, v3 +; GFX10-NEXT: v_and_or_b32 v3, 0xffff, v7, v4 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX10-NEXT: v_mov_b32_e32 v4, 24 -; GFX10-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1] +; GFX10-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_add_u16 v0, v0, v1 clamp -; GFX10-NEXT: v_pk_add_u16 v1, v3, v2 clamp +; GFX10-NEXT: v_pk_add_u16 v1, v2, v3 clamp ; GFX10-NEXT: v_mov_b32_e32 v2, 8 ; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_b32_e32 v3, s4, v1 +; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v0, v0, s4, v2 +; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -590,27 +585,26 @@ ; GFX9-NEXT: s_lshr_b32 s6, s0, 24 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s3 ; GFX9-NEXT: s_pack_ll_b32_b16 s3, s4, s6 -; GFX9-NEXT: s_mov_b32 s4, 0x80008 -; GFX9-NEXT: s_lshr_b32 s6, s0, 16 +; GFX9-NEXT: s_lshr_b32 s4, s0, 16 ; GFX9-NEXT: s_lshr_b32 s7, s1, 8 -; GFX9-NEXT: s_lshl_b32 s0, s0, s4 -; GFX9-NEXT: s_lshl_b32 s6, s6, 8 +; GFX9-NEXT: s_lshl_b32 s0, s0, 0x80008 +; GFX9-NEXT: s_lshl_b32 s4, s4, 8 ; GFX9-NEXT: s_lshr_b32 s8, s1, 16 ; GFX9-NEXT: s_lshr_b32 s9, s1, 24 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s6 -; GFX9-NEXT: s_lshr_b32 s6, s3, 16 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s4 +; GFX9-NEXT: s_lshr_b32 s4, s3, 16 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s7 -; GFX9-NEXT: s_lshl_b32 s3, s3, s4 +; GFX9-NEXT: s_lshl_b32 s3, s3, 0x80008 +; GFX9-NEXT: s_lshl_b32 s4, s4, 8 +; GFX9-NEXT: s_lshr_b32 s6, s1, 16 +; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s8, s9 +; GFX9-NEXT: s_lshl_b32 s1, s1, 0x80008 +; GFX9-NEXT: s_lshl_b32 s6, s6, 8 +; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s6 +; GFX9-NEXT: s_lshr_b32 s6, s4, 16 +; GFX9-NEXT: s_lshl_b32 s4, s4, 0x80008 ; GFX9-NEXT: s_lshl_b32 s6, s6, 8 -; GFX9-NEXT: s_lshr_b32 s7, s1, 16 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s6, s8, s9 -; GFX9-NEXT: s_lshl_b32 s1, s1, s4 -; GFX9-NEXT: s_lshl_b32 s7, s7, 8 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s7 -; GFX9-NEXT: s_lshr_b32 s7, s6, 16 -; GFX9-NEXT: s_lshl_b32 s4, s6, s4 -; GFX9-NEXT: s_lshl_b32 s6, s7, 8 ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s6 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: v_pk_add_u16 v0, s0, v0 clamp @@ -637,39 +631,37 @@ ; GFX10-NEXT: s_lshr_b32 s4, s0, 24 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 ; GFX10-NEXT: s_pack_ll_b32_b16 s2, s3, s4 -; GFX10-NEXT: s_mov_b32 s3, 0x80008 -; GFX10-NEXT: s_lshr_b32 s4, s0, 16 +; GFX10-NEXT: s_lshr_b32 s3, s0, 16 ; GFX10-NEXT: s_lshr_b32 s5, s1, 8 ; GFX10-NEXT: s_lshr_b32 s6, s1, 16 ; GFX10-NEXT: s_lshr_b32 s7, s1, 24 -; GFX10-NEXT: s_lshl_b32 s0, s0, s3 +; GFX10-NEXT: s_lshl_b32 s0, s0, 0x80008 +; GFX10-NEXT: s_lshl_b32 s3, s3, 8 +; GFX10-NEXT: s_lshr_b32 s4, s2, 16 +; GFX10-NEXT: s_lshl_b32 s2, s2, 0x80008 ; GFX10-NEXT: s_lshl_b32 s4, s4, 8 +; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s3 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5 -; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s4 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, s6, s7 -; GFX10-NEXT: s_lshr_b32 s8, s2, 16 -; GFX10-NEXT: s_lshr_b32 s5, s1, 16 -; GFX10-NEXT: s_lshr_b32 s6, s4, 16 -; GFX10-NEXT: s_lshl_b32 s2, s2, s3 -; GFX10-NEXT: s_lshl_b32 s8, s8, 8 -; GFX10-NEXT: s_lshl_b32 s1, s1, s3 +; GFX10-NEXT: s_pack_ll_b32_b16 s3, s6, s7 +; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s4 +; GFX10-NEXT: s_lshr_b32 s4, s1, 16 +; GFX10-NEXT: s_lshr_b32 s5, s3, 16 +; GFX10-NEXT: s_lshl_b32 s1, s1, 0x80008 +; GFX10-NEXT: s_lshl_b32 s4, s4, 8 +; GFX10-NEXT: s_lshl_b32 s3, s3, 0x80008 ; GFX10-NEXT: s_lshl_b32 s5, s5, 8 -; GFX10-NEXT: s_lshl_b32 s3, s4, s3 -; GFX10-NEXT: s_lshl_b32 s4, s6, 8 -; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s8 -; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5 -; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s4 +; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s5 ; GFX10-NEXT: v_pk_add_u16 v0, s0, s1 clamp ; GFX10-NEXT: v_pk_add_u16 v1, s2, s3 clamp ; GFX10-NEXT: s_mov_b32 s0, 8 -; GFX10-NEXT: s_movk_i32 s1, 0xff ; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_b32_e32 v3, s1, v1 +; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v1 ; GFX10-NEXT: s_mov_b32 s0, 24 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v0, v0, s1, v2 +; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll @@ -910,9 +910,8 @@ ; GFX10-LABEL: udivrem_v4i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 -; GFX10-NEXT: v_mov_b32_e32 v4, 0x4f7ffffe -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s12 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s13 @@ -926,9 +925,9 @@ ; GFX10-NEXT: s_sub_i32 s1, 0, s13 ; GFX10-NEXT: s_sub_i32 s2, 0, s14 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 -; GFX10-NEXT: v_mul_f32_e32 v1, v1, v4 -; GFX10-NEXT: v_mul_f32_e32 v2, v2, v4 -; GFX10-NEXT: v_mul_f32_e32 v3, v3, v4 +; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 +; GFX10-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 +; GFX10-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX10-NEXT: v_cvt_u32_f32_e32 v2, v2 @@ -2269,24 +2268,24 @@ ; GFX8-LABEL: udivrem_v2i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 -; GFX8-NEXT: s_mov_b32 s2, 0xffff ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX8-NEXT: s_mov_b32 s8, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s3, s1, s2 -; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX8-NEXT: s_lshr_b32 s8, s1, 16 -; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s8 -; GFX8-NEXT: s_sub_i32 s1, 0, s3 +; GFX8-NEXT: s_and_b32 s2, s1, 0xffff +; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX8-NEXT: s_lshr_b32 s3, s1, 16 +; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s3 +; GFX8-NEXT: s_sub_i32 s1, 0, s2 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: s_lshr_b32 s9, s0, 16 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX8-NEXT: s_and_b32 s0, s0, s2 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX8-NEXT: v_mul_lo_u32 v2, s1, v0 -; GFX8-NEXT: s_sub_i32 s1, 0, s8 +; GFX8-NEXT: s_sub_i32 s1, 0, s3 ; GFX8-NEXT: v_mul_lo_u32 v3, s1, v1 ; GFX8-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX8-NEXT: v_mul_hi_u32 v3, v1, v3 @@ -2294,34 +2293,34 @@ ; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 ; GFX8-NEXT: v_mul_hi_u32 v1, s9, v1 -; GFX8-NEXT: v_mul_lo_u32 v2, v0, s3 +; GFX8-NEXT: v_mul_lo_u32 v2, v0, s2 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 1, v0 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s0, v2 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s3, v2 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s2, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX8-NEXT: v_subrev_u32_e64 v3, s[0:1], s3, v2 +; GFX8-NEXT: v_subrev_u32_e64 v3, s[0:1], s2, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 1, v0 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s3, v2 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s2, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX8-NEXT: v_mul_lo_u32 v3, v1, s8 -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s3, v2 +; GFX8-NEXT: v_mul_lo_u32 v3, v1, s3 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s2, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s9, v3 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v1 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s8, v3 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v1 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s8, v3 -; GFX8-NEXT: v_and_b32_e32 v1, s2, v1 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s3, v3 +; GFX8-NEXT: v_and_b32_e32 v1, s8, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_e32 v0, s2, v3 +; GFX8-NEXT: v_and_b32_e32 v0, s8, v3 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_or_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_mov_b32_e32 v0, s4 @@ -2335,54 +2334,53 @@ ; GFX9-LABEL: udivrem_v2i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 -; GFX9-NEXT: s_mov_b32 s2, 0xffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s7, s1, s2 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 -; GFX9-NEXT: s_lshr_b32 s6, s1, 16 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 -; GFX9-NEXT: s_sub_i32 s1, 0, s7 +; GFX9-NEXT: s_and_b32 s3, s1, 0xffff +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX9-NEXT: s_lshr_b32 s2, s1, 16 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s2 +; GFX9-NEXT: s_sub_i32 s1, 0, s3 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_sub_i32 s3, 0, s6 +; GFX9-NEXT: s_sub_i32 s6, 0, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX9-NEXT: s_and_b32 s9, s0, s2 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: s_lshr_b32 s8, s0, 16 ; GFX9-NEXT: v_mul_lo_u32 v2, s1, v0 -; GFX9-NEXT: v_mul_lo_u32 v3, s3, v1 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_lshr_b32 s1, s0, 16 +; GFX9-NEXT: v_mul_lo_u32 v3, s6, v1 +; GFX9-NEXT: s_and_b32 s0, s0, 0xffff ; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v0, s9, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, s0, v0 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 -; GFX9-NEXT: v_mul_hi_u32 v1, s8, v1 -; GFX9-NEXT: v_mul_lo_u32 v2, v0, s7 +; GFX9-NEXT: v_mul_hi_u32 v1, s1, v1 +; GFX9-NEXT: v_mul_lo_u32 v2, v0, s3 ; GFX9-NEXT: v_add_u32_e32 v4, 1, v0 -; GFX9-NEXT: v_mul_lo_u32 v3, v1, s6 +; GFX9-NEXT: v_mul_lo_u32 v3, v1, s2 ; GFX9-NEXT: v_add_u32_e32 v5, 1, v1 -; GFX9-NEXT: v_sub_u32_e32 v2, s9, v2 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v2 +; GFX9-NEXT: v_sub_u32_e32 v2, s0, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX9-NEXT: v_subrev_u32_e32 v4, s7, v2 +; GFX9-NEXT: v_subrev_u32_e32 v4, s3, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX9-NEXT: v_add_u32_e32 v4, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v2 -; GFX9-NEXT: v_sub_u32_e32 v3, s8, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v2 +; GFX9-NEXT: v_sub_u32_e32 v3, s1, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX9-NEXT: v_subrev_u32_e32 v4, s7, v2 +; GFX9-NEXT: v_subrev_u32_e32 v4, s3, v2 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 -; GFX9-NEXT: v_subrev_u32_e32 v4, s6, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GFX9-NEXT: v_subrev_u32_e32 v4, s2, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1] ; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX9-NEXT: v_subrev_u32_e32 v4, s6, v3 +; GFX9-NEXT: v_subrev_u32_e32 v4, s2, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -2391,43 +2389,42 @@ ; GFX9-NEXT: v_and_or_b32 v1, v2, v4, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dword v2, v0, s[0:1] -; GFX9-NEXT: global_store_dword v2, v1, s[2:3] +; GFX9-NEXT: global_store_dword v2, v0, s[4:5] +; GFX9-NEXT: global_store_dword v2, v1, s[6:7] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: udivrem_v2i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 -; GFX10-NEXT: s_mov_b32 s3, 0xffff ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_lshr_b32 s2, s1, 16 -; GFX10-NEXT: s_and_b32 s1, s1, s3 +; GFX10-NEXT: s_and_b32 s1, s1, 0xffff ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s1 -; GFX10-NEXT: s_sub_i32 s6, 0, s2 +; GFX10-NEXT: s_sub_i32 s3, 0, s2 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX10-NEXT: v_mul_lo_u32 v2, s6, v0 -; GFX10-NEXT: s_sub_i32 s6, 0, s1 -; GFX10-NEXT: v_mul_lo_u32 v3, s6, v1 -; GFX10-NEXT: s_lshr_b32 s6, s0, 16 -; GFX10-NEXT: s_and_b32 s0, s0, s3 +; GFX10-NEXT: v_mul_lo_u32 v2, s3, v0 +; GFX10-NEXT: s_sub_i32 s3, 0, s1 +; GFX10-NEXT: v_mul_lo_u32 v3, s3, v1 +; GFX10-NEXT: s_lshr_b32 s3, s0, 16 +; GFX10-NEXT: s_and_b32 s0, s0, 0xffff ; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX10-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3 -; GFX10-NEXT: v_mul_hi_u32 v0, s6, v0 +; GFX10-NEXT: v_mul_hi_u32 v0, s3, v0 ; GFX10-NEXT: v_mul_hi_u32 v1, s0, v1 ; GFX10-NEXT: v_mul_lo_u32 v2, v0, s2 ; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0 ; GFX10-NEXT: v_mul_lo_u32 v3, v1, s1 ; GFX10-NEXT: v_add_nc_u32_e32 v6, 1, v1 -; GFX10-NEXT: v_sub_nc_u32_e32 v2, s6, v2 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX10-NEXT: v_sub_nc_u32_e32 v2, s3, v2 ; GFX10-NEXT: v_sub_nc_u32_e32 v3, s0, v3 ; GFX10-NEXT: v_subrev_nc_u32_e32 v5, s2, v2 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v2 @@ -2445,14 +2442,13 @@ ; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s1, v3 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v6, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v7, s0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX10-NEXT: v_and_or_b32 v0, v1, v4, v0 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v1, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_and_or_b32 v2, v3, v4, v2 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v3, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: global_store_dword v1, v2, s[6:7] @@ -2586,9 +2582,8 @@ ; GFX8-LABEL: udivrem_i27: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 -; GFX8-NEXT: s_mov_b32 s8, 0x7ffffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s7, s7, s8 +; GFX8-NEXT: s_and_b32 s7, s7, 0x7ffffff ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s7 ; GFX8-NEXT: s_sub_i32 s0, 0, s7 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 @@ -2596,7 +2591,8 @@ ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX8-NEXT: s_and_b32 s4, s6, s8 +; GFX8-NEXT: s_and_b32 s4, s6, 0x7ffffff +; GFX8-NEXT: s_mov_b32 s5, 0x7ffffff ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v2, s4, v0 @@ -2614,11 +2610,11 @@ ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s7, v3 -; GFX8-NEXT: v_and_b32_e32 v2, s8, v2 +; GFX8-NEXT: v_and_b32_e32 v2, s5, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_and_b32_e32 v2, s8, v3 +; GFX8-NEXT: v_and_b32_e32 v2, s5, v3 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm @@ -2626,49 +2622,48 @@ ; GFX9-LABEL: udivrem_i27: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 -; GFX9-NEXT: s_mov_b32 s6, 0x7ffffff ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s7, s1, s6 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 -; GFX9-NEXT: s_sub_i32 s1, 0, s7 -; GFX9-NEXT: s_and_b32 s8, s0, s6 +; GFX9-NEXT: s_and_b32 s6, s1, 0x7ffffff +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 +; GFX9-NEXT: s_sub_i32 s1, 0, s6 +; GFX9-NEXT: s_and_b32 s7, s0, 0x7ffffff ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_lo_u32 v1, s1, v0 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_mov_b32 s4, 0x7ffffff ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 -; GFX9-NEXT: v_mul_hi_u32 v0, s8, v0 -; GFX9-NEXT: v_mul_lo_u32 v1, v0, s7 +; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 +; GFX9-NEXT: v_mul_lo_u32 v1, v0, s6 ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_sub_u32_e32 v1, s8, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 +; GFX9-NEXT: v_sub_u32_e32 v1, s7, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s7, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s6, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s7, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s6, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX9-NEXT: v_and_b32_e32 v0, s6, v0 +; GFX9-NEXT: v_and_b32_e32 v0, s4, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v2, v0, s[0:1] -; GFX9-NEXT: v_and_b32_e32 v0, s6, v1 +; GFX9-NEXT: v_and_b32_e32 v0, s4, v1 ; GFX9-NEXT: global_store_dword v2, v0, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: udivrem_i27: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 -; GFX10-NEXT: s_mov_b32 s6, 0x7ffffff ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_and_b32 s7, s1, s6 -; GFX10-NEXT: s_and_b32 s0, s0, s6 -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s7 -; GFX10-NEXT: s_sub_i32 s1, 0, s7 +; GFX10-NEXT: s_and_b32 s6, s1, 0x7ffffff +; GFX10-NEXT: s_and_b32 s0, s0, 0x7ffffff +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s6 +; GFX10-NEXT: s_sub_i32 s1, 0, s6 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -2676,22 +2671,22 @@ ; GFX10-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1 ; GFX10-NEXT: v_mul_hi_u32 v0, s0, v0 -; GFX10-NEXT: v_mul_lo_u32 v1, v0, s7 +; GFX10-NEXT: v_mul_lo_u32 v1, v0, s6 ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, s0, v1 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s7, v1 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s7, v1 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s6, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s7, v1 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s7, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v1 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s6, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-NEXT: v_and_b32_e32 v0, s6, v0 -; GFX10-NEXT: v_and_b32_e32 v1, s6, v1 +; GFX10-NEXT: v_and_b32_e32 v0, 0x7ffffff, v0 +; GFX10-NEXT: v_and_b32_e32 v1, 0x7ffffff, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-NEXT: global_store_dword v2, v1, s[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll @@ -272,13 +272,13 @@ ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 ; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v3 ; GISEL-NEXT: v_subrev_i32_e32 v3, vcc, s4, v0 -; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v1, v2 +; GISEL-NEXT: v_subrev_i32_e32 v4, vcc, 0x12d8fb, v1 ; GISEL-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 ; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 ; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GISEL-NEXT: v_subrev_i32_e32 v3, vcc, s4, v0 -; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v1, v2 +; GISEL-NEXT: v_subrev_i32_e32 v4, vcc, 0x12d8fb, v1 ; GISEL-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 ; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll @@ -1079,9 +1079,9 @@ ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v5, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CHECK-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc ; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v5, v2 +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 +; CHECK-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[4:5] ; CHECK-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v1, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; CHECK-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc @@ -1099,67 +1099,67 @@ ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: s_mov_b32 s8, 0x12d8fb -; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v4, 0 -; GISEL-NEXT: v_cvt_f32_u32_e32 v6, s8 -; GISEL-NEXT: s_sub_u32 s6, 0, s8 -; GISEL-NEXT: v_madmk_f32 v5, v4, 0x4f800000, v6 +; GISEL-NEXT: v_mov_b32_e32 v4, 0x12d8fb +; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v7, 0 +; GISEL-NEXT: s_sub_u32 s6, 0, 0x12d8fb +; GISEL-NEXT: v_cvt_f32_u32_e32 v8, s8 ; GISEL-NEXT: s_subb_u32 s7, 0, 0 ; GISEL-NEXT: s_bfe_i32 s4, -1, 0x10000 ; GISEL-NEXT: s_bfe_i32 s5, -1, 0x10000 -; GISEL-NEXT: v_mac_f32_e32 v6, 0x4f800000, v4 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v7, v5 -; GISEL-NEXT: v_mov_b32_e32 v5, s4 -; GISEL-NEXT: v_mov_b32_e32 v4, s5 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v6, v6 -; GISEL-NEXT: s_sub_u32 s9, 0, s8 -; GISEL-NEXT: v_mul_f32_e32 v7, 0x5f7ffffc, v7 -; GISEL-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6 +; GISEL-NEXT: v_madmk_f32 v9, v7, 0x4f800000, v8 +; GISEL-NEXT: v_mov_b32_e32 v6, s4 +; GISEL-NEXT: v_mov_b32_e32 v5, s5 +; GISEL-NEXT: v_mac_f32_e32 v8, 0x4f800000, v7 +; GISEL-NEXT: s_sub_u32 s9, 0, 0x12d8fb +; GISEL-NEXT: v_rcp_iflag_f32_e32 v7, v9 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v8, v8 ; GISEL-NEXT: s_subb_u32 s10, 0, 0 -; GISEL-NEXT: s_bfe_i32 s4, -1, 0x10000 ; GISEL-NEXT: s_bfe_i32 s11, -1, 0x10000 -; GISEL-NEXT: v_mul_f32_e32 v8, 0x2f800000, v7 -; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v6 -; GISEL-NEXT: v_mov_b32_e32 v10, s4 -; GISEL-NEXT: v_trunc_f32_e32 v8, v8 +; GISEL-NEXT: s_bfe_i32 s12, -1, 0x10000 +; GISEL-NEXT: v_mul_f32_e32 v7, 0x5f7ffffc, v7 +; GISEL-NEXT: v_mul_f32_e32 v8, 0x5f7ffffc, v8 +; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v7 +; GISEL-NEXT: v_mul_f32_e32 v10, 0x2f800000, v8 ; GISEL-NEXT: v_trunc_f32_e32 v9, v9 -; GISEL-NEXT: v_mac_f32_e32 v7, 0xcf800000, v8 -; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v8 -; GISEL-NEXT: v_mac_f32_e32 v6, 0xcf800000, v9 +; GISEL-NEXT: v_trunc_f32_e32 v10, v10 +; GISEL-NEXT: v_mac_f32_e32 v7, 0xcf800000, v9 ; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v9 +; GISEL-NEXT: v_mac_f32_e32 v8, 0xcf800000, v10 +; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v10 ; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v7 -; GISEL-NEXT: v_mul_lo_u32 v11, s6, v8 -; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6 -; GISEL-NEXT: v_mul_lo_u32 v12, s9, v9 +; GISEL-NEXT: v_mul_lo_u32 v11, s6, v9 +; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v8 +; GISEL-NEXT: v_mul_lo_u32 v12, s9, v10 ; GISEL-NEXT: v_mul_lo_u32 v13, s6, v7 ; GISEL-NEXT: v_mul_lo_u32 v14, s7, v7 ; GISEL-NEXT: v_mul_hi_u32 v15, s6, v7 -; GISEL-NEXT: v_mul_lo_u32 v16, s9, v6 -; GISEL-NEXT: v_mul_lo_u32 v17, s10, v6 -; GISEL-NEXT: v_mul_hi_u32 v18, s9, v6 +; GISEL-NEXT: v_mul_lo_u32 v16, s9, v8 +; GISEL-NEXT: v_mul_lo_u32 v17, s10, v8 +; GISEL-NEXT: v_mul_hi_u32 v18, s9, v8 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v14, v11 -; GISEL-NEXT: v_mul_lo_u32 v14, v8, v13 +; GISEL-NEXT: v_mul_lo_u32 v14, v9, v13 ; GISEL-NEXT: v_mul_hi_u32 v19, v7, v13 -; GISEL-NEXT: v_mul_hi_u32 v13, v8, v13 +; GISEL-NEXT: v_mul_hi_u32 v13, v9, v13 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v17, v12 -; GISEL-NEXT: v_mul_lo_u32 v17, v9, v16 +; GISEL-NEXT: v_mul_lo_u32 v17, v10, v16 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v15 -; GISEL-NEXT: v_mul_hi_u32 v15, v6, v16 -; GISEL-NEXT: v_mul_hi_u32 v16, v9, v16 +; GISEL-NEXT: v_mul_hi_u32 v15, v8, v16 +; GISEL-NEXT: v_mul_hi_u32 v16, v10, v16 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v18 -; GISEL-NEXT: v_mul_lo_u32 v18, v6, v12 +; GISEL-NEXT: v_mul_lo_u32 v18, v8, v12 ; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v18 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v15, vcc, v17, v15 ; GISEL-NEXT: v_mul_lo_u32 v15, v7, v11 -; GISEL-NEXT: v_mul_lo_u32 v17, v8, v11 +; GISEL-NEXT: v_mul_lo_u32 v17, v9, v11 ; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v19 ; GISEL-NEXT: v_mul_hi_u32 v14, v7, v11 -; GISEL-NEXT: v_mul_hi_u32 v11, v8, v11 +; GISEL-NEXT: v_mul_hi_u32 v11, v9, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v15, v19 -; GISEL-NEXT: v_mul_lo_u32 v19, v9, v12 +; GISEL-NEXT: v_mul_lo_u32 v19, v10, v12 ; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v17, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v13, v14 @@ -1167,8 +1167,8 @@ ; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v17, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v17, vcc, v18, v17 -; GISEL-NEXT: v_mul_hi_u32 v18, v6, v12 -; GISEL-NEXT: v_mul_hi_u32 v12, v9, v12 +; GISEL-NEXT: v_mul_hi_u32 v18, v8, v12 +; GISEL-NEXT: v_mul_hi_u32 v12, v10, v12 ; GISEL-NEXT: v_add_i32_e32 v16, vcc, v19, v16 ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v18 @@ -1183,40 +1183,40 @@ ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v14 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v13 -; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v8, v11, vcc +; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v11, vcc ; GISEL-NEXT: v_mul_lo_u32 v11, s6, v7 ; GISEL-NEXT: v_mul_lo_u32 v13, s7, v7 ; GISEL-NEXT: v_mul_hi_u32 v14, s6, v7 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v16 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v12, vcc -; GISEL-NEXT: v_mul_lo_u32 v12, s9, v6 -; GISEL-NEXT: v_mul_lo_u32 v15, s10, v6 -; GISEL-NEXT: v_mul_hi_u32 v16, s9, v6 -; GISEL-NEXT: v_mul_lo_u32 v17, s6, v8 -; GISEL-NEXT: v_mul_lo_u32 v18, v8, v11 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v16 +; GISEL-NEXT: v_addc_u32_e32 v10, vcc, v10, v12, vcc +; GISEL-NEXT: v_mul_lo_u32 v12, s9, v8 +; GISEL-NEXT: v_mul_lo_u32 v15, s10, v8 +; GISEL-NEXT: v_mul_hi_u32 v16, s9, v8 +; GISEL-NEXT: v_mul_lo_u32 v17, s6, v9 +; GISEL-NEXT: v_mul_lo_u32 v18, v9, v11 ; GISEL-NEXT: v_mul_hi_u32 v19, v7, v11 -; GISEL-NEXT: v_mul_hi_u32 v11, v8, v11 +; GISEL-NEXT: v_mul_hi_u32 v11, v9, v11 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v17 -; GISEL-NEXT: v_mul_lo_u32 v17, s9, v9 +; GISEL-NEXT: v_mul_lo_u32 v17, s9, v10 ; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v17 -; GISEL-NEXT: v_mul_lo_u32 v17, v9, v12 +; GISEL-NEXT: v_mul_lo_u32 v17, v10, v12 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 -; GISEL-NEXT: v_mul_hi_u32 v14, v6, v12 -; GISEL-NEXT: v_mul_hi_u32 v12, v9, v12 +; GISEL-NEXT: v_mul_hi_u32 v14, v8, v12 +; GISEL-NEXT: v_mul_hi_u32 v12, v10, v12 ; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v16 -; GISEL-NEXT: v_mul_lo_u32 v16, v6, v15 +; GISEL-NEXT: v_mul_lo_u32 v16, v8, v15 ; GISEL-NEXT: v_add_i32_e32 v16, vcc, v17, v16 ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v16, v14 ; GISEL-NEXT: v_mul_lo_u32 v14, v7, v13 -; GISEL-NEXT: v_mul_lo_u32 v16, v8, v13 +; GISEL-NEXT: v_mul_lo_u32 v16, v9, v13 ; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v18, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v19 ; GISEL-NEXT: v_mul_hi_u32 v14, v7, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v18, s[4:5], v18, v19 -; GISEL-NEXT: v_mul_lo_u32 v19, v9, v15 +; GISEL-NEXT: v_mul_lo_u32 v19, v10, v15 ; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v16, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14 @@ -1224,125 +1224,126 @@ ; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v16, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v16, vcc, v17, v16 -; GISEL-NEXT: v_mul_hi_u32 v17, v6, v15 +; GISEL-NEXT: v_mul_hi_u32 v17, v8, v15 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v19, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v17, vcc, v19, v17 ; GISEL-NEXT: v_mov_b32_e32 v19, s11 -; GISEL-NEXT: v_mul_hi_u32 v13, v8, v13 -; GISEL-NEXT: v_mul_hi_u32 v15, v9, v15 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v18 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v18 +; GISEL-NEXT: v_mov_b32_e32 v18, s12 +; GISEL-NEXT: v_mul_hi_u32 v13, v9, v13 +; GISEL-NEXT: v_mul_hi_u32 v15, v10, v15 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v16 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v18 ; GISEL-NEXT: v_add_i32_e32 v16, vcc, v17, v16 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v16 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v11 -; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v8, v13, vcc +; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v13, vcc ; GISEL-NEXT: v_mul_lo_u32 v11, v1, v7 ; GISEL-NEXT: v_mul_hi_u32 v13, v0, v7 ; GISEL-NEXT: v_mul_hi_u32 v7, v1, v7 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v12 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v14, vcc -; GISEL-NEXT: v_mul_lo_u32 v12, v3, v6 -; GISEL-NEXT: v_mul_hi_u32 v14, v2, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, v3, v6 -; GISEL-NEXT: v_mul_lo_u32 v15, v0, v8 -; GISEL-NEXT: v_mul_lo_u32 v16, v1, v8 -; GISEL-NEXT: v_mul_hi_u32 v17, v0, v8 -; GISEL-NEXT: v_mul_hi_u32 v8, v1, v8 -; GISEL-NEXT: v_mul_lo_u32 v18, v2, v9 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v18 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14 -; GISEL-NEXT: v_mul_lo_u32 v12, v3, v9 -; GISEL-NEXT: v_mul_hi_u32 v14, v2, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v3, v9 -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v16, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v12, v6 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12 +; GISEL-NEXT: v_addc_u32_e32 v10, vcc, v10, v14, vcc +; GISEL-NEXT: v_mul_lo_u32 v12, v3, v8 +; GISEL-NEXT: v_mul_hi_u32 v14, v2, v8 +; GISEL-NEXT: v_mul_hi_u32 v8, v3, v8 +; GISEL-NEXT: v_mul_lo_u32 v15, v0, v9 +; GISEL-NEXT: v_mul_lo_u32 v16, v1, v9 +; GISEL-NEXT: v_mul_hi_u32 v17, v0, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v1, v9 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; GISEL-NEXT: v_mul_lo_u32 v11, v2, v10 +; GISEL-NEXT: v_mul_lo_u32 v13, v3, v10 +; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v12, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v7, v17 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14 +; GISEL-NEXT: v_mul_hi_u32 v11, v2, v10 +; GISEL-NEXT: v_mul_hi_u32 v10, v3, v10 +; GISEL-NEXT: v_add_i32_e64 v7, s[6:7], v16, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[6:7] +; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], v13, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v15, v11 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v16, v13 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v18, v17 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v11 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v16 +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v17 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v16 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 -; GISEL-NEXT: v_mul_lo_u32 v13, s8, v7 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; GISEL-NEXT: v_mul_lo_u32 v14, s8, v7 ; GISEL-NEXT: v_mul_lo_u32 v15, 0, v7 ; GISEL-NEXT: v_mul_hi_u32 v7, s8, v7 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14 -; GISEL-NEXT: v_mul_lo_u32 v14, s8, v6 -; GISEL-NEXT: v_mul_lo_u32 v16, 0, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, s8, v6 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v12 -; GISEL-NEXT: v_mul_lo_u32 v8, s8, v8 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_mul_lo_u32 v12, s8, v8 +; GISEL-NEXT: v_mul_lo_u32 v16, 0, v8 +; GISEL-NEXT: v_mul_hi_u32 v8, s8, v8 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v13 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 ; GISEL-NEXT: v_mul_lo_u32 v9, s8, v9 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v15, v8 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v16, v9 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6 -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v13 -; GISEL-NEXT: v_subb_u32_e64 v8, s[4:5], v1, v7, vcc +; GISEL-NEXT: v_mul_lo_u32 v10, s8, v10 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v15, v9 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v16, v10 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v10, v8 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v14 +; GISEL-NEXT: v_subb_u32_e64 v9, s[4:5], v1, v7, vcc ; GISEL-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v7 -; GISEL-NEXT: v_cmp_le_u32_e64 s[4:5], s8, v0 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] -; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v14 -; GISEL-NEXT: v_subb_u32_e64 v9, s[6:7], v3, v6, s[4:5] -; GISEL-NEXT: v_sub_i32_e64 v3, s[6:7], v3, v6 -; GISEL-NEXT: v_cmp_le_u32_e64 s[6:7], s8, v2 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[6:7] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, v7, s[6:7] +; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v12 +; GISEL-NEXT: v_subb_u32_e64 v10, s[6:7], v3, v8, s[4:5] +; GISEL-NEXT: v_sub_i32_e64 v3, s[6:7], v3, v8 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v2, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[6:7] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[6:7] ; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 -; GISEL-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v10 +; GISEL-NEXT: v_cndmask_b32_e32 v7, v19, v8, vcc ; GISEL-NEXT: v_subbrev_u32_e64 v3, vcc, 0, v3, s[4:5] -; GISEL-NEXT: v_subrev_i32_e32 v7, vcc, s8, v0 +; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v0, v4 ; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; GISEL-NEXT: v_cmp_le_u32_e32 vcc, s8, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc -; GISEL-NEXT: v_subrev_i32_e32 v11, vcc, s8, v2 +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v8, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc +; GISEL-NEXT: v_sub_i32_e32 v12, vcc, v2, v4 ; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; GISEL-NEXT: v_cmp_le_u32_e32 vcc, s8, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v12, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GISEL-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc -; GISEL-NEXT: v_subrev_i32_e32 v10, vcc, s8, v7 -; GISEL-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc +; GISEL-NEXT: v_sub_i32_e32 v11, vcc, v8, v4 +; GISEL-NEXT: v_subbrev_u32_e32 v14, vcc, 0, v1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GISEL-NEXT: v_cndmask_b32_e32 v12, v19, v12, vcc -; GISEL-NEXT: v_subrev_i32_e32 v14, vcc, s8, v11 +; GISEL-NEXT: v_cndmask_b32_e32 v13, v18, v13, vcc +; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v12, v4 ; GISEL-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v3, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GISEL-NEXT: v_cndmask_b32_e32 v4, v7, v10, vcc -; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v7, v11, v14, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v5, v8, v11, vcc +; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v14, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, v15, s[4:5] -; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v2, v2, v7, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v3, v9, v3, s[4:5] +; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v3, v10, v3, s[4:5] ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_urem_v2i64_oddk_denom: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll @@ -236,12 +236,11 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_mov_b32 s4, 8 -; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX10-NEXT: v_lshrrev_b32_sdwa v3, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_lshrrev_b32_sdwa v4, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_sdwa v3, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: s_movk_i32 s4, 0xff -; GFX10-NEXT: v_and_or_b32 v0, v0, v2, v3 -; GFX10-NEXT: v_and_or_b32 v1, v1, v2, v4 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v2 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v3 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_sub_u16 v0, v0, v1 clamp @@ -297,17 +296,16 @@ ; GFX9-LABEL: s_usubsat_v2i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_lshr_b32 s2, s0, 8 -; GFX9-NEXT: s_lshr_b32 s3, s1, 8 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX9-NEXT: s_lshr_b32 s3, s1, 8 +; GFX9-NEXT: s_lshr_b32 s2, s0, 16 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s3 -; GFX9-NEXT: s_mov_b32 s2, 0x80008 -; GFX9-NEXT: s_lshr_b32 s3, s0, 16 -; GFX9-NEXT: s_lshl_b32 s0, s0, s2 -; GFX9-NEXT: s_lshl_b32 s3, s3, 8 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s3 -; GFX9-NEXT: s_lshr_b32 s3, s1, 16 -; GFX9-NEXT: s_lshl_b32 s1, s1, s2 -; GFX9-NEXT: s_lshl_b32 s2, s3, 8 +; GFX9-NEXT: s_lshl_b32 s0, s0, 0x80008 +; GFX9-NEXT: s_lshl_b32 s2, s2, 8 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX9-NEXT: s_lshr_b32 s2, s1, 16 +; GFX9-NEXT: s_lshl_b32 s1, s1, 0x80008 +; GFX9-NEXT: s_lshl_b32 s2, s2, 8 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s2 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: v_pk_sub_u16 v0, s0, v0 clamp @@ -324,15 +322,14 @@ ; GFX10-NEXT: s_lshr_b32 s3, s1, 8 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s3 -; GFX10-NEXT: s_mov_b32 s2, 0x80008 -; GFX10-NEXT: s_lshr_b32 s3, s0, 16 -; GFX10-NEXT: s_lshr_b32 s4, s1, 16 -; GFX10-NEXT: s_lshl_b32 s0, s0, s2 +; GFX10-NEXT: s_lshr_b32 s2, s0, 16 +; GFX10-NEXT: s_lshr_b32 s3, s1, 16 +; GFX10-NEXT: s_lshl_b32 s0, s0, 0x80008 +; GFX10-NEXT: s_lshl_b32 s2, s2, 8 +; GFX10-NEXT: s_lshl_b32 s1, s1, 0x80008 ; GFX10-NEXT: s_lshl_b32 s3, s3, 8 -; GFX10-NEXT: s_lshl_b32 s1, s1, s2 -; GFX10-NEXT: s_lshl_b32 s2, s4, 8 -; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s3 -; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s2 +; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s3 ; GFX10-NEXT: v_pk_sub_u16 v0, s0, s1 clamp ; GFX10-NEXT: s_movk_i32 s0, 0xff ; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] @@ -454,35 +451,33 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v1 ; GFX10-NEXT: s_mov_b32 s4, 8 -; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX10-NEXT: v_lshrrev_b32_sdwa v3, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX10-NEXT: v_lshrrev_b32_sdwa v7, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX10-NEXT: v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_sdwa v6, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX10-NEXT: v_and_or_b32 v0, v0, v2, v3 -; GFX10-NEXT: v_and_or_b32 v1, v1, v2, v7 -; GFX10-NEXT: s_movk_i32 s4, 0xff -; GFX10-NEXT: v_and_or_b32 v3, v6, v2, v4 -; GFX10-NEXT: v_and_or_b32 v2, v8, v2, v5 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v2 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v6 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v5, v3 +; GFX10-NEXT: v_and_or_b32 v3, 0xffff, v7, v4 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX10-NEXT: v_mov_b32_e32 v4, 24 -; GFX10-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1] +; GFX10-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_sub_u16 v0, v0, v1 clamp -; GFX10-NEXT: v_pk_sub_u16 v1, v3, v2 clamp +; GFX10-NEXT: v_pk_sub_u16 v1, v2, v3 clamp ; GFX10-NEXT: v_mov_b32_e32 v2, 8 ; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_b32_e32 v3, s4, v1 +; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v0, v0, s4, v2 +; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -574,27 +569,26 @@ ; GFX9-NEXT: s_lshr_b32 s6, s0, 24 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s3 ; GFX9-NEXT: s_pack_ll_b32_b16 s3, s4, s6 -; GFX9-NEXT: s_mov_b32 s4, 0x80008 -; GFX9-NEXT: s_lshr_b32 s6, s0, 16 +; GFX9-NEXT: s_lshr_b32 s4, s0, 16 ; GFX9-NEXT: s_lshr_b32 s7, s1, 8 -; GFX9-NEXT: s_lshl_b32 s0, s0, s4 -; GFX9-NEXT: s_lshl_b32 s6, s6, 8 +; GFX9-NEXT: s_lshl_b32 s0, s0, 0x80008 +; GFX9-NEXT: s_lshl_b32 s4, s4, 8 ; GFX9-NEXT: s_lshr_b32 s8, s1, 16 ; GFX9-NEXT: s_lshr_b32 s9, s1, 24 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s6 -; GFX9-NEXT: s_lshr_b32 s6, s3, 16 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s4 +; GFX9-NEXT: s_lshr_b32 s4, s3, 16 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s7 -; GFX9-NEXT: s_lshl_b32 s3, s3, s4 +; GFX9-NEXT: s_lshl_b32 s3, s3, 0x80008 +; GFX9-NEXT: s_lshl_b32 s4, s4, 8 +; GFX9-NEXT: s_lshr_b32 s6, s1, 16 +; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s8, s9 +; GFX9-NEXT: s_lshl_b32 s1, s1, 0x80008 +; GFX9-NEXT: s_lshl_b32 s6, s6, 8 +; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s6 +; GFX9-NEXT: s_lshr_b32 s6, s4, 16 +; GFX9-NEXT: s_lshl_b32 s4, s4, 0x80008 ; GFX9-NEXT: s_lshl_b32 s6, s6, 8 -; GFX9-NEXT: s_lshr_b32 s7, s1, 16 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s6, s8, s9 -; GFX9-NEXT: s_lshl_b32 s1, s1, s4 -; GFX9-NEXT: s_lshl_b32 s7, s7, 8 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s7 -; GFX9-NEXT: s_lshr_b32 s7, s6, 16 -; GFX9-NEXT: s_lshl_b32 s4, s6, s4 -; GFX9-NEXT: s_lshl_b32 s6, s7, 8 ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s6 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: v_pk_sub_u16 v0, s0, v0 clamp @@ -621,39 +615,37 @@ ; GFX10-NEXT: s_lshr_b32 s4, s0, 24 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 ; GFX10-NEXT: s_pack_ll_b32_b16 s2, s3, s4 -; GFX10-NEXT: s_mov_b32 s3, 0x80008 -; GFX10-NEXT: s_lshr_b32 s4, s0, 16 +; GFX10-NEXT: s_lshr_b32 s3, s0, 16 ; GFX10-NEXT: s_lshr_b32 s5, s1, 8 ; GFX10-NEXT: s_lshr_b32 s6, s1, 16 ; GFX10-NEXT: s_lshr_b32 s7, s1, 24 -; GFX10-NEXT: s_lshl_b32 s0, s0, s3 +; GFX10-NEXT: s_lshl_b32 s0, s0, 0x80008 +; GFX10-NEXT: s_lshl_b32 s3, s3, 8 +; GFX10-NEXT: s_lshr_b32 s4, s2, 16 +; GFX10-NEXT: s_lshl_b32 s2, s2, 0x80008 ; GFX10-NEXT: s_lshl_b32 s4, s4, 8 +; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s3 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5 -; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s4 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, s6, s7 -; GFX10-NEXT: s_lshr_b32 s8, s2, 16 -; GFX10-NEXT: s_lshr_b32 s5, s1, 16 -; GFX10-NEXT: s_lshr_b32 s6, s4, 16 -; GFX10-NEXT: s_lshl_b32 s2, s2, s3 -; GFX10-NEXT: s_lshl_b32 s8, s8, 8 -; GFX10-NEXT: s_lshl_b32 s1, s1, s3 +; GFX10-NEXT: s_pack_ll_b32_b16 s3, s6, s7 +; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s4 +; GFX10-NEXT: s_lshr_b32 s4, s1, 16 +; GFX10-NEXT: s_lshr_b32 s5, s3, 16 +; GFX10-NEXT: s_lshl_b32 s1, s1, 0x80008 +; GFX10-NEXT: s_lshl_b32 s4, s4, 8 +; GFX10-NEXT: s_lshl_b32 s3, s3, 0x80008 ; GFX10-NEXT: s_lshl_b32 s5, s5, 8 -; GFX10-NEXT: s_lshl_b32 s3, s4, s3 -; GFX10-NEXT: s_lshl_b32 s4, s6, 8 -; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s8 -; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5 -; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s4 +; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s5 ; GFX10-NEXT: v_pk_sub_u16 v0, s0, s1 clamp ; GFX10-NEXT: v_pk_sub_u16 v1, s2, s3 clamp ; GFX10-NEXT: s_mov_b32 s0, 8 -; GFX10-NEXT: s_movk_i32 s1, 0xff ; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_b32_e32 v3, s1, v1 +; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v1 ; GFX10-NEXT: s_mov_b32 s0, 24 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v0, v0, s1, v2 +; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll @@ -25,12 +25,11 @@ define amdgpu_ps i32 @scalar_xnor_v2i16_one_use(<2 x i16> inreg %a, <2 x i16> inreg %b) { ; GFX7-LABEL: scalar_xnor_v2i16_one_use: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 s4, 0xffff ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 -; GFX7-NEXT: s_and_b32 s0, s0, s4 +; GFX7-NEXT: s_and_b32 s0, s0, 0xffff ; GFX7-NEXT: s_or_b32 s0, s1, s0 ; GFX7-NEXT: s_lshl_b32 s1, s3, 16 -; GFX7-NEXT: s_and_b32 s2, s2, s4 +; GFX7-NEXT: s_and_b32 s2, s2, 0xffff ; GFX7-NEXT: s_or_b32 s1, s1, s2 ; GFX7-NEXT: s_xor_b32 s0, s0, s1 ; GFX7-NEXT: s_xor_b32 s0, s0, -1 @@ -42,10 +41,10 @@ ; GFX8-NEXT: s_xor_b32 s0, s0, s1 ; GFX8-NEXT: s_mov_b32 s3, s2 ; GFX8-NEXT: s_lshr_b32 s1, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, s2 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_and_b32 s0, s0, s2 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_or_b32 s0, s1, s0 ; GFX8-NEXT: ; return to shader part epilog ; @@ -117,18 +116,17 @@ define amdgpu_ps i64 @scalar_xnor_v4i16_one_use(<4 x i16> inreg %a, <4 x i16> inreg %b) { ; GFX7-LABEL: scalar_xnor_v4i16_one_use: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_mov_b32 s8, 0xffff ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 -; GFX7-NEXT: s_and_b32 s0, s0, s8 +; GFX7-NEXT: s_and_b32 s0, s0, 0xffff ; GFX7-NEXT: s_or_b32 s0, s1, s0 ; GFX7-NEXT: s_lshl_b32 s1, s3, 16 -; GFX7-NEXT: s_and_b32 s2, s2, s8 +; GFX7-NEXT: s_and_b32 s2, s2, 0xffff ; GFX7-NEXT: s_or_b32 s1, s1, s2 ; GFX7-NEXT: s_lshl_b32 s2, s5, 16 -; GFX7-NEXT: s_and_b32 s3, s4, s8 +; GFX7-NEXT: s_and_b32 s3, s4, 0xffff ; GFX7-NEXT: s_or_b32 s2, s2, s3 ; GFX7-NEXT: s_lshl_b32 s3, s7, 16 -; GFX7-NEXT: s_and_b32 s4, s6, s8 +; GFX7-NEXT: s_and_b32 s4, s6, 0xffff ; GFX7-NEXT: s_or_b32 s3, s3, s4 ; GFX7-NEXT: s_mov_b32 s4, -1 ; GFX7-NEXT: s_mov_b32 s5, s4 @@ -142,16 +140,16 @@ ; GFX8-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] ; GFX8-NEXT: s_mov_b32 s5, s4 ; GFX8-NEXT: s_lshr_b32 s3, s0, 16 -; GFX8-NEXT: s_and_b32 s2, s0, s4 +; GFX8-NEXT: s_and_b32 s2, s0, 0xffff ; GFX8-NEXT: s_lshr_b32 s7, s1, 16 -; GFX8-NEXT: s_and_b32 s6, s1, s4 +; GFX8-NEXT: s_and_b32 s6, s1, 0xffff ; GFX8-NEXT: s_xor_b64 s[0:1], s[2:3], s[4:5] ; GFX8-NEXT: s_xor_b64 s[2:3], s[6:7], s[4:5] ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_and_b32 s0, s0, s4 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_or_b32 s0, s1, s0 ; GFX8-NEXT: s_lshl_b32 s1, s3, 16 -; GFX8-NEXT: s_and_b32 s2, s2, s4 +; GFX8-NEXT: s_and_b32 s2, s2, 0xffff ; GFX8-NEXT: s_or_b32 s1, s1, s2 ; GFX8-NEXT: ; return to shader part epilog ; diff --git a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll @@ -205,7 +205,8 @@ ; GFX9PLUS: global_load_dword [[B:v[0-9]+]] ; GFX9PLUS: v_pk_add_u16 [[ADD:v[0-9]+]], [[A]], [[B]] -; GFX9PLUS-DAG: v_and_b32_e32 v[[ELT0:[0-9]+]], [[MASK]], [[ADD]] +; GFX9-DAG: v_and_b32_e32 v[[ELT0:[0-9]+]], [[MASK]], [[ADD]] +; GFX10-DAG: v_and_b32_e32 v[[ELT0:[0-9]+]], 0xffff, [[ADD]] ; GFX9PLUS-DAG: v_and_b32_sdwa v{{[0-9]+}}, [[MASK]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9PLUS: buffer_store_dwordx4 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll @@ -150,7 +150,7 @@ ; GCN-NEXT: v_mul_hi_u32 v2, v2, s4 ; GCN-NEXT: v_mul_lo_u32 v3, v2, v0 ; GCN-NEXT: v_add_u32_e32 v4, vcc, 1, v2 -; GCN-NEXT: v_sub_u32_e32 v3, vcc, s4, v3 +; GCN-NEXT: v_sub_u32_e32 v3, vcc, 0xf4240, v3 ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v3, v0 ; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GCN-NEXT: v_sub_u32_e64 v4, s[4:5], v3, v0 @@ -232,7 +232,7 @@ ; GCN-NEXT: v_mul_hi_u32 v2, v2, s4 ; GCN-NEXT: v_mul_lo_u32 v3, v2, v0 ; GCN-NEXT: v_add_u32_e32 v4, vcc, 1, v2 -; GCN-NEXT: v_sub_u32_e32 v3, vcc, s4, v3 +; GCN-NEXT: v_sub_u32_e32 v3, vcc, 0xf4240, v3 ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v3, v0 ; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GCN-NEXT: v_sub_u32_e64 v4, s[4:5], v3, v0 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll @@ -2526,12 +2526,12 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX6-NEXT: s_mov_b32 s8, 0xffff ; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_and_b32 s2, s6, s8 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX6-NEXT: s_and_b32 s9, s4, s8 +; GFX6-NEXT: s_and_b32 s8, s6, 0xffff +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 +; GFX6-NEXT: s_and_b32 s9, s4, 0xffff ; GFX6-NEXT: s_lshr_b32 s6, s6, 16 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 @@ -2546,11 +2546,11 @@ ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, v4, v5 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 -; GFX6-NEXT: s_and_b32 s4, s7, s8 +; GFX6-NEXT: s_and_b32 s4, s7, 0xffff ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc ; GFX6-NEXT: v_mad_f32 v2, -v1, v3, v4 ; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s4 -; GFX6-NEXT: s_and_b32 s4, s5, s8 +; GFX6-NEXT: s_and_b32 s4, s5, 0xffff ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s4 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v4 @@ -2566,7 +2566,7 @@ ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v5 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: s_mov_b32 s8, 0xffff ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX6-NEXT: v_mul_f32_e32 v3, v6, v7 ; GFX6-NEXT: v_trunc_f32_e32 v3, v3 @@ -2576,8 +2576,8 @@ ; GFX6-NEXT: v_and_b32_e32 v1, s8, v1 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_and_b32_e32 v0, s8, v0 -; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -2586,60 +2586,59 @@ ; GFX9-LABEL: udiv_v4i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s0, 0xffff -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s8, s6, s0 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 +; GFX9-NEXT: s_and_b32 s3, s6, 0xffff +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX9-NEXT: s_and_b32 s2, s4, 0xffff ; GFX9-NEXT: s_lshr_b32 s6, s6, 16 -; GFX9-NEXT: s_and_b32 s1, s4, s0 -; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s6 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s1 +; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 -; GFX9-NEXT: s_lshr_b32 s1, s4, 16 -; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s1 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v3 -; GFX9-NEXT: v_mul_f32_e32 v4, v1, v4 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 +; GFX9-NEXT: s_lshr_b32 s4, s4, 16 +; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s4 +; GFX9-NEXT: v_mul_f32_e32 v4, v2, v4 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 -; GFX9-NEXT: v_mad_f32 v1, -v4, v0, v1 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 -; GFX9-NEXT: v_mul_f32_e32 v1, v5, v6 -; GFX9-NEXT: v_trunc_f32_e32 v1, v1 -; GFX9-NEXT: s_and_b32 s1, s7, s0 +; GFX9-NEXT: s_and_b32 s2, s7, 0xffff ; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v4 -; GFX9-NEXT: v_mad_f32 v4, -v1, v3, v5 -; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s1 -; GFX9-NEXT: s_and_b32 s0, s5, s0 +; GFX9-NEXT: v_mad_f32 v2, -v4, v0, v2 +; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s2 +; GFX9-NEXT: v_mul_f32_e32 v5, v3, v5 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 +; GFX9-NEXT: s_and_b32 s2, s5, 0xffff ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v7, vcc -; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s0 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v5 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc -; GFX9-NEXT: v_mul_f32_e32 v1, v6, v7 +; GFX9-NEXT: v_trunc_f32_e32 v2, v5 +; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s2 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v4 +; GFX9-NEXT: v_mad_f32 v3, -v2, v1, v3 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 +; GFX9-NEXT: s_lshr_b32 s2, s7, 16 +; GFX9-NEXT: v_mul_f32_e32 v1, v5, v7 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 -; GFX9-NEXT: s_lshr_b32 s0, s7, 16 -; GFX9-NEXT: v_mad_f32 v4, -v1, v5, v6 -; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s0 -; GFX9-NEXT: s_lshr_b32 s0, s5, 16 -; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s0 +; GFX9-NEXT: v_mad_f32 v3, -v1, v4, v5 +; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s2 +; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GFX9-NEXT: s_lshr_b32 s2, s5, 16 +; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s2 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v6 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v5 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 +; GFX9-NEXT: v_mul_f32_e32 v3, v7, v8 +; GFX9-NEXT: v_trunc_f32_e32 v3, v3 +; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v3 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mul_f32_e32 v4, v7, v8 -; GFX9-NEXT: v_trunc_f32_e32 v4, v4 -; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v4 -; GFX9-NEXT: v_mad_f32 v4, -v4, v6, v7 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc -; GFX9-NEXT: v_mov_b32_e32 v5, 0xffff -; GFX9-NEXT: v_and_b32_e32 v1, v5, v1 -; GFX9-NEXT: v_and_b32_e32 v0, v5, v0 -; GFX9-NEXT: v_lshl_or_b32 v1, v4, 16, v1 -; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_mad_f32 v3, -v3, v5, v7 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc +; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff +; GFX9-NEXT: v_and_b32_e32 v1, v4, v1 +; GFX9-NEXT: v_and_b32_e32 v0, v4, v0 +; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %r = udiv <4 x i16> %x, %y store <4 x i16> %r, <4 x i16> addrspace(1)* %out @@ -2743,14 +2742,15 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX6-NEXT: s_mov_b32 s8, 0xffff ; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_and_b32 s2, s6, s8 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX6-NEXT: s_and_b32 s8, s6, 0xffff +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GFX6-NEXT: v_mov_b32_e32 v4, s6 +; GFX6-NEXT: s_mov_b32 s8, 0xffff ; GFX6-NEXT: v_alignbit_b32 v4, s7, v4, 16 -; GFX6-NEXT: s_and_b32 s9, s4, s8 +; GFX6-NEXT: s_and_b32 s9, s4, 0xffff ; GFX6-NEXT: v_and_b32_e32 v5, s8, v4 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 @@ -2772,10 +2772,10 @@ ; GFX6-NEXT: v_mad_f32 v1, -v1, v5, v6 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, s6 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v5 -; GFX6-NEXT: s_and_b32 s6, s7, s8 +; GFX6-NEXT: s_and_b32 s6, s7, 0xffff ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s6 -; GFX6-NEXT: s_and_b32 s6, s5, s8 +; GFX6-NEXT: s_and_b32 s6, s5, 0xffff ; GFX6-NEXT: v_mul_lo_u32 v1, v1, v4 ; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s6 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2 @@ -2801,7 +2801,6 @@ ; GFX6-NEXT: v_mul_lo_u32 v1, v1, s7 ; GFX6-NEXT: v_mul_lo_u32 v2, v2, s4 ; GFX6-NEXT: v_and_b32_e32 v0, s8, v0 -; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s5, v1 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s6, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -2815,68 +2814,67 @@ ; GFX9-LABEL: urem_v4i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s0, 0xffff -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s8, s6, s0 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GFX9-NEXT: s_and_b32 s1, s4, s0 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s1 +; GFX9-NEXT: s_and_b32 s3, s6, 0xffff +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX9-NEXT: s_and_b32 s2, s4, 0xffff +; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s2 ; GFX9-NEXT: s_lshr_b32 s6, s6, 16 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 -; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s6 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 ; GFX9-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s4 -; GFX9-NEXT: v_mul_f32_e32 v4, v1, v4 +; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s4 +; GFX9-NEXT: v_mul_f32_e32 v4, v2, v4 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v3 ; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v4 -; GFX9-NEXT: v_mad_f32 v1, -v4, v0, v1 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 -; GFX9-NEXT: v_mul_f32_e32 v1, v5, v6 +; GFX9-NEXT: v_mad_f32 v2, -v4, v0, v2 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v7, vcc -; GFX9-NEXT: v_mul_lo_u32 v0, v0, s8 -; GFX9-NEXT: v_trunc_f32_e32 v1, v1 -; GFX9-NEXT: s_and_b32 s8, s7, s0 -; GFX9-NEXT: v_mad_f32 v4, -v1, v3, v5 +; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3 +; GFX9-NEXT: s_and_b32 s3, s7, 0xffff +; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s3 +; GFX9-NEXT: v_mul_f32_e32 v5, v3, v5 +; GFX9-NEXT: v_trunc_f32_e32 v2, v5 +; GFX9-NEXT: s_and_b32 s8, s5, 0xffff +; GFX9-NEXT: v_mad_f32 v3, -v2, v1, v3 +; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s8 -; GFX9-NEXT: s_and_b32 s0, s5, s0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s0 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v5 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mul_f32_e32 v3, v6, v7 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v4 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc +; GFX9-NEXT: v_mul_f32_e32 v2, v5, v7 ; GFX9-NEXT: v_mul_lo_u32 v1, v1, s6 -; GFX9-NEXT: v_trunc_f32_e32 v3, v3 +; GFX9-NEXT: v_trunc_f32_e32 v2, v2 ; GFX9-NEXT: s_lshr_b32 s6, s7, 16 -; GFX9-NEXT: v_mad_f32 v4, -v3, v5, v6 -; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s6 +; GFX9-NEXT: v_mad_f32 v3, -v2, v4, v5 +; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s6 ; GFX9-NEXT: s_lshr_b32 s5, s5, 16 ; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s5 -; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v6 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v5 -; GFX9-NEXT: v_sub_u32_e32 v0, s1, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: v_mul_f32_e32 v4, v7, v8 -; GFX9-NEXT: v_trunc_f32_e32 v4, v4 -; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v4 -; GFX9-NEXT: v_mad_f32 v4, -v4, v6, v7 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v6 -; GFX9-NEXT: v_mul_lo_u32 v3, v3, s8 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc -; GFX9-NEXT: v_mul_lo_u32 v4, v4, s6 -; GFX9-NEXT: v_sub_u32_e32 v5, s4, v1 -; GFX9-NEXT: v_sub_u32_e32 v1, s0, v3 -; GFX9-NEXT: v_sub_u32_e32 v3, s5, v4 -; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff -; GFX9-NEXT: v_and_b32_e32 v1, v4, v1 -; GFX9-NEXT: v_and_b32_e32 v0, v4, v0 -; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1 -; GFX9-NEXT: v_lshl_or_b32 v0, v5, 16, v0 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v5 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 +; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc +; GFX9-NEXT: v_mul_f32_e32 v3, v7, v8 +; GFX9-NEXT: v_trunc_f32_e32 v3, v3 +; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v3 +; GFX9-NEXT: v_mad_f32 v3, -v3, v5, v7 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5 +; GFX9-NEXT: v_mul_lo_u32 v2, v2, s3 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc +; GFX9-NEXT: v_mul_lo_u32 v3, v3, s6 +; GFX9-NEXT: v_sub_u32_e32 v4, s4, v1 +; GFX9-NEXT: v_sub_u32_e32 v1, s8, v2 +; GFX9-NEXT: v_sub_u32_e32 v2, s5, v3 +; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX9-NEXT: v_and_b32_e32 v1, v3, v1 +; GFX9-NEXT: v_and_b32_e32 v0, v3, v0 +; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v0, v4, 16, v0 +; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %r = urem <4 x i16> %x, %y store <4 x i16> %r, <4 x i16> addrspace(1)* %out @@ -3831,45 +3829,44 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX6-NEXT: s_mov_b32 s8, 0xffff ; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_and_b32 s2, s6, s8 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX6-NEXT: s_and_b32 s9, s4, s8 +; GFX6-NEXT: s_and_b32 s9, s6, 0xffff +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s9 ; GFX6-NEXT: s_lshr_b32 s6, s6, 16 -; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s6 +; GFX6-NEXT: s_and_b32 s8, s4, 0xffff +; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s6 +; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s8 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v0 ; GFX6-NEXT: s_lshr_b32 s4, s4, 16 ; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s4 -; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v3 -; GFX6-NEXT: v_trunc_f32_e32 v2, v2 -; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 -; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2 +; GFX6-NEXT: v_mul_f32_e32 v3, v1, v3 +; GFX6-NEXT: v_trunc_f32_e32 v3, v3 +; GFX6-NEXT: v_mad_f32 v1, -v3, v0, v1 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, v4, v5 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 -; GFX6-NEXT: s_and_b32 s4, s7, s8 -; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc -; GFX6-NEXT: v_mad_f32 v2, -v1, v3, v4 +; GFX6-NEXT: s_and_b32 s4, s7, 0xffff +; GFX6-NEXT: v_cvt_u32_f32_e32 v6, v3 +; GFX6-NEXT: v_mad_f32 v3, -v1, v2, v4 ; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s4 -; GFX6-NEXT: s_and_b32 s4, s5, s8 +; GFX6-NEXT: s_and_b32 s4, s5, 0xffff +; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v6, vcc ; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s4 -; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v4 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v3 -; GFX6-NEXT: s_mov_b32 s2, -1 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v2 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_mul_f32_e32 v2, v5, v6 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v2 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX6-NEXT: v_mad_f32 v2, -v2, v4, v5 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc -; GFX6-NEXT: v_and_b32_e32 v0, s8, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: buffer_store_short v2, off, s[0:3], 0 offset:4 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -3878,47 +3875,46 @@ ; GFX9-LABEL: udiv_v3i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s0, 0xffff -; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s8, s6, s0 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s8 +; GFX9-NEXT: s_and_b32 s3, s6, 0xffff +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX9-NEXT: s_and_b32 s2, s4, 0xffff ; GFX9-NEXT: s_lshr_b32 s6, s6, 16 -; GFX9-NEXT: s_and_b32 s1, s4, s0 -; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s6 -; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s1 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v1 -; GFX9-NEXT: s_lshr_b32 s1, s4, 16 -; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s1 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v3 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 +; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s2 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 +; GFX9-NEXT: s_lshr_b32 s4, s4, 16 +; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s4 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 ; GFX9-NEXT: v_mul_f32_e32 v4, v2, v4 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 -; GFX9-NEXT: v_mad_f32 v2, -v4, v1, v2 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v1 -; GFX9-NEXT: v_mul_f32_e32 v2, v5, v6 -; GFX9-NEXT: v_trunc_f32_e32 v2, v2 -; GFX9-NEXT: s_and_b32 s1, s7, s0 +; GFX9-NEXT: s_and_b32 s2, s7, 0xffff ; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v4 -; GFX9-NEXT: v_mad_f32 v4, -v2, v3, v5 -; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s1 -; GFX9-NEXT: s_and_b32 s0, s5, s0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v7, vcc -; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s0 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v5 +; GFX9-NEXT: v_mad_f32 v2, -v4, v0, v2 +; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s2 +; GFX9-NEXT: v_mul_f32_e32 v5, v3, v5 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 +; GFX9-NEXT: v_trunc_f32_e32 v2, v5 +; GFX9-NEXT: s_and_b32 s2, s5, 0xffff +; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v7, vcc +; GFX9-NEXT: v_mad_f32 v3, -v2, v1, v3 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v3 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: v_mul_f32_e32 v3, v6, v7 -; GFX9-NEXT: v_trunc_f32_e32 v3, v3 -; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc -; GFX9-NEXT: v_mad_f32 v3, -v3, v5, v6 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc -; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 -; GFX9-NEXT: global_store_short v0, v3, s[2:3] offset:4 -; GFX9-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s2 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v4 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc +; GFX9-NEXT: v_mul_f32_e32 v2, v5, v7 +; GFX9-NEXT: v_trunc_f32_e32 v2, v2 +; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v2 +; GFX9-NEXT: v_mad_f32 v2, -v2, v4, v5 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v3, vcc +; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX9-NEXT: global_store_short v6, v2, s[0:1] offset:4 +; GFX9-NEXT: global_store_dword v6, v0, s[0:1] ; GFX9-NEXT: s_endpgm %r = udiv <3 x i16> %x, %y store <3 x i16> %r, <3 x i16> addrspace(1)* %out @@ -4003,9 +3999,9 @@ ; GFX6-NEXT: s_mov_b32 s8, 0xffff ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_and_b32 s9, s6, s8 +; GFX6-NEXT: s_and_b32 s9, s6, 0xffff ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s9 -; GFX6-NEXT: s_and_b32 s2, s4, s8 +; GFX6-NEXT: s_and_b32 s2, s4, 0xffff ; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s2 ; GFX6-NEXT: v_mov_b32_e32 v2, s6 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v0 @@ -4025,13 +4021,13 @@ ; GFX6-NEXT: v_cvt_f32_u32_e32 v3, v3 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v5 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 -; GFX6-NEXT: s_and_b32 s4, s7, s8 +; GFX6-NEXT: s_and_b32 s4, s7, 0xffff ; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s4 ; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 ; GFX6-NEXT: v_trunc_f32_e32 v4, v4 ; GFX6-NEXT: v_mad_f32 v3, -v4, v5, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 -; GFX6-NEXT: s_and_b32 s4, s5, s8 +; GFX6-NEXT: s_and_b32 s4, s5, 0xffff ; GFX6-NEXT: v_cvt_f32_u32_e32 v7, s4 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v8, v6 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5 @@ -4057,51 +4053,51 @@ ; GFX9-LABEL: urem_v3i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX9-NEXT: s_mov_b32 s2, 0xffff -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s8, s6, s2 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GFX9-NEXT: s_and_b32 s3, s4, s2 +; GFX9-NEXT: s_and_b32 s3, s6, 0xffff +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX9-NEXT: s_and_b32 s2, s4, 0xffff +; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s2 ; GFX9-NEXT: s_lshr_b32 s6, s6, 16 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 -; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s3 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 ; GFX9-NEXT: s_lshr_b32 s4, s4, 16 ; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s4 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 ; GFX9-NEXT: v_mul_f32_e32 v4, v2, v4 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 -; GFX9-NEXT: s_and_b32 s7, s7, s2 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v4 ; GFX9-NEXT: v_mad_f32 v2, -v4, v0, v2 -; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s7 -; GFX9-NEXT: v_mul_f32_e32 v5, v3, v5 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 -; GFX9-NEXT: v_trunc_f32_e32 v2, v5 -; GFX9-NEXT: s_and_b32 s2, s5, s2 +; GFX9-NEXT: v_mul_f32_e32 v5, v3, v5 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v6, vcc -; GFX9-NEXT: v_mad_f32 v3, -v2, v1, v3 -; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s2 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v4 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc +; GFX9-NEXT: v_trunc_f32_e32 v5, v5 +; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3 +; GFX9-NEXT: s_and_b32 s3, s7, 0xffff +; GFX9-NEXT: v_mad_f32 v2, -v5, v1, v3 +; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s3 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v5 +; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s5 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v3 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v1 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v4, vcc ; GFX9-NEXT: v_mul_f32_e32 v2, v5, v6 ; GFX9-NEXT: v_trunc_f32_e32 v2, v2 -; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v2 -; GFX9-NEXT: v_mad_f32 v2, -v2, v4, v5 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4 -; GFX9-NEXT: v_mul_lo_u32 v0, v0, s8 -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v3, vcc +; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v2 +; GFX9-NEXT: v_mad_f32 v2, -v2, v3, v5 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v3 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v4, vcc ; GFX9-NEXT: v_mul_lo_u32 v1, v1, s6 -; GFX9-NEXT: v_mul_lo_u32 v2, v2, s7 -; GFX9-NEXT: v_sub_u32_e32 v0, s3, v0 +; GFX9-NEXT: v_mul_lo_u32 v2, v2, s3 +; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: v_sub_u32_e32 v1, s4, v1 -; GFX9-NEXT: v_sub_u32_e32 v2, s2, v2 +; GFX9-NEXT: v_sub_u32_e32 v2, s5, v2 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_short v3, v2, s[0:1] offset:4 ; GFX9-NEXT: global_store_dword v3, v0, s[0:1] ; GFX9-NEXT: s_endpgm @@ -4611,18 +4607,18 @@ ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s2 -; GFX6-NEXT: v_alignbit_b32 v0, s3, v0, 30 -; GFX6-NEXT: s_movk_i32 s3, 0x7fff -; GFX6-NEXT: s_and_b32 s9, s0, s3 +; GFX6-NEXT: s_and_b32 s8, s2, 0x7fff +; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s8 +; GFX6-NEXT: s_and_b32 s9, s0, 0x7fff ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9 -; GFX6-NEXT: s_and_b32 s8, s2, s3 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 ; GFX6-NEXT: s_bfe_u32 s0, s0, 0xf000f -; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s8 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v1 ; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s0 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v1 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: s_bfe_u32 s2, s2, 0xf000f +; GFX6-NEXT: v_alignbit_b32 v0, s3, v0, 30 +; GFX6-NEXT: s_movk_i32 s3, 0x7fff ; GFX6-NEXT: v_alignbit_b32 v2, s1, v2, 30 ; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 ; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s2 @@ -4671,8 +4667,8 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_alignbit_b32 v0, s3, v0, 30 -; GFX9-NEXT: s_and_b32 s3, s2, s6 -; GFX9-NEXT: s_and_b32 s7, s0, s6 +; GFX9-NEXT: s_and_b32 s3, s2, 0x7fff +; GFX9-NEXT: s_and_b32 s7, s0, 0x7fff ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7 ; GFX9-NEXT: v_mov_b32_e32 v3, s0 ; GFX9-NEXT: s_bfe_u32 s0, s0, 0xf000f @@ -4800,45 +4796,45 @@ ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s2 -; GFX6-NEXT: v_alignbit_b32 v0, s3, v0, 30 -; GFX6-NEXT: s_movk_i32 s3, 0x7fff -; GFX6-NEXT: s_and_b32 s10, s0, s3 -; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s10 -; GFX6-NEXT: s_and_b32 s9, s2, s3 +; GFX6-NEXT: s_and_b32 s9, s2, 0x7fff ; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s9 +; GFX6-NEXT: s_and_b32 s10, s0, 0x7fff +; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s10 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v1 ; GFX6-NEXT: v_alignbit_b32 v2, s1, v2, 30 ; GFX6-NEXT: s_bfe_u32 s1, s0, 0xf000f +; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v1 ; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s1 +; GFX6-NEXT: s_bfe_u32 s10, s2, 0xf000f +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 ; GFX6-NEXT: v_trunc_f32_e32 v4, v4 ; GFX6-NEXT: v_mad_f32 v3, -v4, v1, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 -; GFX6-NEXT: s_bfe_u32 s10, s2, 0xf000f ; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s10 +; GFX6-NEXT: v_alignbit_b32 v0, s3, v0, 30 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v4, vcc ; GFX6-NEXT: v_mul_lo_u32 v1, v1, s0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v5 +; GFX6-NEXT: s_movk_i32 s3, 0x7fff ; GFX6-NEXT: v_and_b32_e32 v2, s3, v2 -; GFX6-NEXT: v_and_b32_e32 v0, s3, v0 ; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s2, v1 ; GFX6-NEXT: v_mul_f32_e32 v1, v3, v4 ; GFX6-NEXT: v_cvt_f32_u32_e32 v4, v2 +; GFX6-NEXT: v_and_b32_e32 v0, s3, v0 ; GFX6-NEXT: v_cvt_f32_u32_e32 v7, v0 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 -; GFX6-NEXT: v_mad_f32 v3, -v1, v5, v3 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v8, v4 +; GFX6-NEXT: v_mad_f32 v3, -v1, v5, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5 -; GFX6-NEXT: s_lshr_b32 s0, s0, 15 ; GFX6-NEXT: v_mul_f32_e32 v3, v7, v8 ; GFX6-NEXT: v_trunc_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v5, v3 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX6-NEXT: v_mad_f32 v3, -v3, v4, v7 +; GFX6-NEXT: s_lshr_b32 s0, s0, 15 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 ; GFX6-NEXT: v_mul_lo_u32 v1, v1, s0 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc @@ -4868,8 +4864,8 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_alignbit_b32 v0, s3, v0, 30 -; GFX9-NEXT: s_and_b32 s3, s2, s6 -; GFX9-NEXT: s_and_b32 s8, s0, s6 +; GFX9-NEXT: s_and_b32 s3, s2, 0x7fff +; GFX9-NEXT: s_and_b32 s8, s0, 0x7fff ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s8 ; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s3 ; GFX9-NEXT: s_bfe_u32 s3, s0, 0xf000f @@ -5675,24 +5671,23 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb ; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 -; GFX6-NEXT: s_movk_i32 s2, 0x1000 ; GFX6-NEXT: s_mov_b32 s0, 0x4f7ffffe ; GFX6-NEXT: s_mov_b32 s11, 0xf000 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b32 s3, s2, s6 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX6-NEXT: s_lshl_b32 s2, s2, s7 -; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s2 ; GFX6-NEXT: s_mov_b32 s10, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_lshl_b32 s2, 0x1000, s6 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX6-NEXT: s_lshl_b32 s3, 0x1000, s7 +; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s3 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX6-NEXT: v_mul_f32_e32 v0, s0, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, s0, v1 -; GFX6-NEXT: s_sub_i32 s0, 0, s3 +; GFX6-NEXT: s_sub_i32 s0, 0, s2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_mul_lo_u32 v2, s0, v0 -; GFX6-NEXT: s_sub_i32 s0, 0, s2 +; GFX6-NEXT: s_sub_i32 s0, 0, s3 ; GFX6-NEXT: v_mul_lo_u32 v3, s0, v1 ; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 @@ -5700,25 +5695,25 @@ ; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v3, v1 ; GFX6-NEXT: v_mul_hi_u32 v1, s5, v1 -; GFX6-NEXT: v_mul_lo_u32 v2, v0, s3 +; GFX6-NEXT: v_mul_lo_u32 v2, v0, s2 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v0 -; GFX6-NEXT: v_mul_lo_u32 v4, v1, s2 +; GFX6-NEXT: v_mul_lo_u32 v4, v1, s3 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s4, v2 -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v2 +; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v2 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s3, v2 +; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s2, v2 ; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] ; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v0 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s3, v2 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s2, v2 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s5, v4 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v1 -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v2 +; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v2 ; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s2, v2 +; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s3, v2 ; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] ; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v1 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s2, v2 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s3, v2 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; GFX6-NEXT: s_endpgm @@ -5726,19 +5721,18 @@ ; GFX9-LABEL: udiv_v2i32_pow2_shl_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX9-NEXT: s_movk_i32 s2, 0x1000 +; GFX9-NEXT: s_mov_b32 s2, 0x4f7ffffe ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s6, s2, s6 +; GFX9-NEXT: s_lshl_b32 s6, 0x1000, s6 +; GFX9-NEXT: s_lshl_b32 s7, 0x1000, s7 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b32 s7, s2, s7 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7 -; GFX9-NEXT: s_mov_b32 s2, 0x4f7ffffe -; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: s_sub_i32 s3, 0, s7 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX9-NEXT: v_mul_f32_e32 v0, s2, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, s2, v1 +; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: s_sub_i32 s2, 0, s6 ; GFX9-NEXT: v_mul_lo_u32 v2, s2, v0 @@ -5747,26 +5741,26 @@ ; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 +; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX9-NEXT: v_mul_hi_u32 v1, s5, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mul_lo_u32 v3, v0, s6 -; GFX9-NEXT: v_add_u32_e32 v5, 1, v0 ; GFX9-NEXT: v_mul_lo_u32 v4, v1, s7 +; GFX9-NEXT: v_add_u32_e32 v5, 1, v0 ; GFX9-NEXT: v_add_u32_e32 v6, 1, v1 ; GFX9-NEXT: v_sub_u32_e32 v3, s4, v3 +; GFX9-NEXT: v_sub_u32_e32 v4, s5, v4 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc ; GFX9-NEXT: v_subrev_u32_e32 v5, s6, v3 -; GFX9-NEXT: v_sub_u32_e32 v4, s5, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v4 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 -; GFX9-NEXT: v_subrev_u32_e32 v3, s7, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[0:1] +; GFX9-NEXT: v_subrev_u32_e32 v6, s7, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc ; GFX9-NEXT: v_add_u32_e32 v5, 1, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v3, v4, v3, s[0:1] +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v3, v4, v6, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc ; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 @@ -5913,12 +5907,11 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX6-NEXT: s_movk_i32 s6, 0xfff ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_and_b32 s4, s4, s6 -; GFX6-NEXT: s_and_b32 s5, s5, s6 +; GFX6-NEXT: s_and_b32 s4, s4, 0xfff +; GFX6-NEXT: s_and_b32 s5, s5, 0xfff ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -5928,13 +5921,12 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_movk_i32 s0, 0xfff ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s1, s2, s0 -; GFX9-NEXT: s_and_b32 s0, s3, s0 -; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_and_b32 s0, s2, 0xfff +; GFX9-NEXT: s_and_b32 s1, s3, 0xfff +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm %r = urem <2 x i32> %x, @@ -6011,27 +6003,26 @@ ; GFX6-LABEL: urem_v2i32_pow2_shl_denom: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; GFX6-NEXT: s_movk_i32 s2, 0x1000 +; GFX6-NEXT: s_mov_b32 s2, 0x4f7ffffe ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b32 s6, s2, s6 +; GFX6-NEXT: s_lshl_b32 s6, 0x1000, s6 ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s6 -; GFX6-NEXT: s_lshl_b32 s7, s2, s7 +; GFX6-NEXT: s_lshl_b32 s7, 0x1000, s7 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s7 -; GFX6-NEXT: s_mov_b32 s2, 0x4f7ffffe ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX6-NEXT: v_mul_f32_e32 v0, s2, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, s2, v1 -; GFX6-NEXT: s_sub_i32 s2, 0, s6 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX6-NEXT: s_sub_i32 s2, 0, s6 ; GFX6-NEXT: v_mul_lo_u32 v2, s2, v0 ; GFX6-NEXT: s_sub_i32 s2, 0, s7 ; GFX6-NEXT: v_mul_lo_u32 v3, s2, v1 -; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 ; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 @@ -6041,12 +6032,12 @@ ; GFX6-NEXT: v_mul_lo_u32 v1, v1, s7 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s6, v0 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s5, v1 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s6, v0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s6, v0 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s6, v0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s5, v1 ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s7, v1 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc @@ -6059,22 +6050,21 @@ ; GFX9-LABEL: urem_v2i32_pow2_shl_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX9-NEXT: s_movk_i32 s2, 0x1000 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s3, s2, s7 -; GFX9-NEXT: s_lshl_b32 s2, s2, s6 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3 +; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s6 +; GFX9-NEXT: s_lshl_b32 s2, 0x1000, s7 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s2 ; GFX9-NEXT: s_mov_b32 s6, 0x4f7ffffe -; GFX9-NEXT: s_sub_i32 s7, 0, s3 +; GFX9-NEXT: s_sub_i32 s7, 0, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mul_f32_e32 v0, s6, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, s6, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: s_sub_i32 s6, 0, s2 +; GFX9-NEXT: s_sub_i32 s6, 0, s3 ; GFX9-NEXT: v_mul_lo_u32 v2, s6, v0 ; GFX9-NEXT: v_mul_lo_u32 v3, s7, v1 ; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 @@ -6084,22 +6074,23 @@ ; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX9-NEXT: v_mul_hi_u32 v1, s5, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: v_mul_lo_u32 v0, v0, s2 -; GFX9-NEXT: v_mul_lo_u32 v1, v1, s3 +; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3 +; GFX9-NEXT: v_mul_lo_u32 v1, v1, s2 ; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 ; GFX9-NEXT: v_sub_u32_e32 v1, s5, v1 -; GFX9-NEXT: v_subrev_u32_e32 v3, s2, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v0 -; GFX9-NEXT: v_subrev_u32_e32 v4, s3, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s3, v0 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v0 +; GFX9-NEXT: v_subrev_u32_e32 v4, s2, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s2, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v0 -; GFX9-NEXT: v_subrev_u32_e32 v4, s3, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s3, v0 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v0 +; GFX9-NEXT: v_subrev_u32_e32 v4, s2, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %shl.y = shl <2 x i32> , %y @@ -6478,140 +6469,138 @@ ; ; GFX6-LABEL: sdiv_v2i32_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 -; GFX6-NEXT: s_movk_i32 s2, 0x1000 -; GFX6-NEXT: s_mov_b32 s12, 0x4f7ffffe -; GFX6-NEXT: s_mov_b32 s11, 0xf000 +; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s13, 0x4f7ffffe +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b32 s3, s2, s6 -; GFX6-NEXT: s_ashr_i32 s6, s3, 31 -; GFX6-NEXT: s_add_i32 s3, s3, s6 -; GFX6-NEXT: s_xor_b32 s3, s3, s6 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX6-NEXT: s_lshl_b32 s0, s2, s7 -; GFX6-NEXT: s_sub_i32 s7, 0, s3 -; GFX6-NEXT: s_ashr_i32 s2, s0, 31 +; GFX6-NEXT: s_lshl_b32 s0, 0x1000, s10 +; GFX6-NEXT: s_ashr_i32 s1, s0, 31 +; GFX6-NEXT: s_add_i32 s0, s0, s1 +; GFX6-NEXT: s_xor_b32 s2, s0, s1 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX6-NEXT: s_lshl_b32 s0, 0x1000, s11 +; GFX6-NEXT: s_sub_i32 s11, 0, s2 +; GFX6-NEXT: s_ashr_i32 s10, s0, 31 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX6-NEXT: s_add_i32 s0, s0, s2 -; GFX6-NEXT: s_ashr_i32 s1, s4, 31 -; GFX6-NEXT: s_mov_b32 s10, -1 -; GFX6-NEXT: v_mul_f32_e32 v0, s12, v0 +; GFX6-NEXT: s_add_i32 s0, s0, s10 +; GFX6-NEXT: s_ashr_i32 s3, s8, 31 +; GFX6-NEXT: s_add_i32 s8, s8, s3 +; GFX6-NEXT: v_mul_f32_e32 v0, s13, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, s7, v0 -; GFX6-NEXT: s_xor_b32 s7, s0, s2 -; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s7 -; GFX6-NEXT: s_add_i32 s0, s4, s1 +; GFX6-NEXT: s_xor_b32 s12, s3, s1 +; GFX6-NEXT: v_mul_lo_u32 v1, s11, v0 +; GFX6-NEXT: s_xor_b32 s11, s0, s10 +; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s11 +; GFX6-NEXT: s_xor_b32 s0, s8, s3 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX6-NEXT: s_xor_b32 s0, s0, s1 +; GFX6-NEXT: s_sub_i32 s3, 0, s11 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GFX6-NEXT: s_xor_b32 s4, s1, s6 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GFX6-NEXT: v_mul_hi_u32 v0, s0, v0 -; GFX6-NEXT: v_mul_f32_e32 v1, s12, v2 +; GFX6-NEXT: v_mul_f32_e32 v1, s13, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_mul_lo_u32 v2, v0, s3 +; GFX6-NEXT: v_mul_lo_u32 v2, v0, s2 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v0 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s0, v2 -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v2 +; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v2 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s3, v2 +; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s2, v2 ; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] -; GFX6-NEXT: s_sub_i32 s0, 0, s7 -; GFX6-NEXT: v_mul_lo_u32 v3, s0, v1 -; GFX6-NEXT: s_ashr_i32 s0, s5, 31 -; GFX6-NEXT: s_add_i32 s1, s5, s0 +; GFX6-NEXT: v_mul_lo_u32 v3, s3, v1 +; GFX6-NEXT: s_ashr_i32 s0, s9, 31 +; GFX6-NEXT: s_add_i32 s1, s9, s0 ; GFX6-NEXT: s_xor_b32 s1, s1, s0 ; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 1, v0 -; GFX6-NEXT: s_xor_b32 s2, s0, s2 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v3, v1 ; GFX6-NEXT: v_mul_hi_u32 v1, s1, v1 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s3, v2 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s2, v2 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX6-NEXT: v_xor_b32_e32 v0, s4, v0 -; GFX6-NEXT: v_mul_lo_u32 v2, v1, s7 +; GFX6-NEXT: s_xor_b32 s2, s0, s10 +; GFX6-NEXT: v_mul_lo_u32 v2, v1, s11 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v1 -; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s4, v0 +; GFX6-NEXT: v_xor_b32_e32 v0, s12, v0 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s1, v2 -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v2 +; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v2 ; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s7, v2 +; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s11, v2 +; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s12, v0 ; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] ; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v1 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s7, v2 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s11, v2 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX6-NEXT: v_xor_b32_e32 v1, s2, v1 ; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s2, v1 -; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: sdiv_v2i32_pow2_shl_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_movk_i32 s0, 0x1000 -; GFX9-NEXT: s_mov_b32 s10, 0x4f7ffffe +; GFX9-NEXT: s_mov_b32 s11, 0x4f7ffffe ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s1, s0, s6 -; GFX9-NEXT: s_ashr_i32 s6, s1, 31 -; GFX9-NEXT: s_add_i32 s1, s1, s6 -; GFX9-NEXT: s_xor_b32 s1, s1, s6 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s1 -; GFX9-NEXT: s_lshl_b32 s0, s0, s7 -; GFX9-NEXT: s_ashr_i32 s8, s0, 31 -; GFX9-NEXT: s_add_i32 s0, s0, s8 +; GFX9-NEXT: s_lshl_b32 s0, 0x1000, s6 +; GFX9-NEXT: s_ashr_i32 s1, s0, 31 +; GFX9-NEXT: s_add_i32 s0, s0, s1 +; GFX9-NEXT: s_xor_b32 s0, s0, s1 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GFX9-NEXT: s_lshl_b32 s6, 0x1000, s7 +; GFX9-NEXT: s_ashr_i32 s9, s6, 31 +; GFX9-NEXT: s_add_i32 s6, s6, s9 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_xor_b32 s0, s0, s8 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s0 -; GFX9-NEXT: s_sub_i32 s9, 0, s1 -; GFX9-NEXT: v_mul_f32_e32 v0, s10, v0 +; GFX9-NEXT: s_xor_b32 s6, s6, s9 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 +; GFX9-NEXT: s_sub_i32 s10, 0, s0 +; GFX9-NEXT: v_mul_f32_e32 v0, s11, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX9-NEXT: s_ashr_i32 s7, s4, 31 ; GFX9-NEXT: s_add_i32 s4, s4, s7 -; GFX9-NEXT: v_mul_lo_u32 v3, s9, v0 -; GFX9-NEXT: v_mul_f32_e32 v1, s10, v1 +; GFX9-NEXT: v_mul_lo_u32 v3, s10, v0 +; GFX9-NEXT: v_mul_f32_e32 v1, s11, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: s_xor_b32 s4, s4, s7 ; GFX9-NEXT: v_mul_hi_u32 v3, v0, v3 -; GFX9-NEXT: s_sub_i32 s10, 0, s0 -; GFX9-NEXT: s_ashr_i32 s9, s5, 31 -; GFX9-NEXT: s_add_i32 s5, s5, s9 +; GFX9-NEXT: s_sub_i32 s10, 0, s6 +; GFX9-NEXT: s_ashr_i32 s8, s5, 31 +; GFX9-NEXT: s_add_i32 s5, s5, s8 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v3 ; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX9-NEXT: v_mul_lo_u32 v3, s10, v1 -; GFX9-NEXT: s_xor_b32 s6, s7, s6 -; GFX9-NEXT: v_mul_lo_u32 v4, v0, s1 +; GFX9-NEXT: s_xor_b32 s5, s5, s8 +; GFX9-NEXT: s_xor_b32 s1, s7, s1 +; GFX9-NEXT: v_mul_lo_u32 v4, v0, s0 ; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX9-NEXT: v_add_u32_e32 v5, 1, v0 ; GFX9-NEXT: v_sub_u32_e32 v4, s4, v4 -; GFX9-NEXT: s_xor_b32 s4, s5, s9 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s1, v4 -; GFX9-NEXT: v_mul_hi_u32 v1, s4, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s0, v4 +; GFX9-NEXT: v_mul_hi_u32 v1, s5, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc -; GFX9-NEXT: v_subrev_u32_e32 v5, s1, v4 +; GFX9-NEXT: v_subrev_u32_e32 v5, s0, v4 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s1, v4 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s0, v4 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_mul_lo_u32 v3, v1, s0 +; GFX9-NEXT: v_mul_lo_u32 v3, v1, s6 ; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 -; GFX9-NEXT: s_xor_b32 s1, s9, s8 -; GFX9-NEXT: v_xor_b32_e32 v0, s6, v0 -; GFX9-NEXT: v_sub_u32_e32 v3, s4, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s0, v3 +; GFX9-NEXT: s_xor_b32 s0, s8, s9 +; GFX9-NEXT: v_xor_b32_e32 v0, s1, v0 +; GFX9-NEXT: v_sub_u32_e32 v3, s5, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX9-NEXT: v_subrev_u32_e32 v4, s0, v3 +; GFX9-NEXT: v_subrev_u32_e32 v4, s6, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s0, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX9-NEXT: v_xor_b32_e32 v1, s1, v1 -; GFX9-NEXT: v_subrev_u32_e32 v0, s6, v0 -; GFX9-NEXT: v_subrev_u32_e32 v1, s1, v1 +; GFX9-NEXT: v_xor_b32_e32 v1, s0, v1 +; GFX9-NEXT: v_subrev_u32_e32 v0, s1, v0 +; GFX9-NEXT: v_subrev_u32_e32 v1, s0, v1 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm %shl.y = shl <2 x i32> , %y @@ -6806,19 +6795,18 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX6-NEXT: s_movk_i32 s6, 0xf000 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_ashr_i32 s7, s4, 31 -; GFX6-NEXT: s_lshr_b32 s7, s7, 20 -; GFX6-NEXT: s_add_i32 s7, s4, s7 -; GFX6-NEXT: s_and_b32 s7, s7, s6 -; GFX6-NEXT: s_sub_i32 s4, s4, s7 +; GFX6-NEXT: s_ashr_i32 s6, s4, 31 +; GFX6-NEXT: s_lshr_b32 s6, s6, 20 +; GFX6-NEXT: s_add_i32 s6, s4, s6 ; GFX6-NEXT: s_ashr_i32 s7, s5, 31 -; GFX6-NEXT: s_lshr_b32 s7, s7, 20 -; GFX6-NEXT: s_add_i32 s7, s5, s7 -; GFX6-NEXT: s_and_b32 s6, s7, s6 +; GFX6-NEXT: s_and_b32 s6, s6, 0xfffff000 +; GFX6-NEXT: s_sub_i32 s4, s4, s6 +; GFX6-NEXT: s_lshr_b32 s6, s7, 20 +; GFX6-NEXT: s_add_i32 s6, s5, s6 +; GFX6-NEXT: s_and_b32 s6, s6, 0xfffff000 ; GFX6-NEXT: s_sub_i32 s5, s5, s6 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 @@ -6829,21 +6817,20 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_movk_i32 s0, 0xf000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s1, s2, 31 +; GFX9-NEXT: s_ashr_i32 s0, s2, 31 +; GFX9-NEXT: s_ashr_i32 s1, s3, 31 +; GFX9-NEXT: s_lshr_b32 s0, s0, 20 ; GFX9-NEXT: s_lshr_b32 s1, s1, 20 -; GFX9-NEXT: s_add_i32 s1, s2, s1 -; GFX9-NEXT: s_ashr_i32 s6, s3, 31 -; GFX9-NEXT: s_and_b32 s1, s1, s0 -; GFX9-NEXT: s_sub_i32 s1, s2, s1 -; GFX9-NEXT: s_lshr_b32 s2, s6, 20 -; GFX9-NEXT: s_add_i32 s2, s3, s2 -; GFX9-NEXT: s_and_b32 s0, s2, s0 -; GFX9-NEXT: s_sub_i32 s0, s3, s0 -; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_add_i32 s0, s2, s0 +; GFX9-NEXT: s_add_i32 s1, s3, s1 +; GFX9-NEXT: s_and_b32 s0, s0, 0xfffff000 +; GFX9-NEXT: s_and_b32 s1, s1, 0xfffff000 +; GFX9-NEXT: s_sub_i32 s0, s2, s0 +; GFX9-NEXT: s_sub_i32 s1, s3, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm %r = srem <2 x i32> %x, @@ -6936,39 +6923,38 @@ ; GFX6-LABEL: srem_v2i32_pow2_shl_denom: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; GFX6-NEXT: s_movk_i32 s8, 0x1000 -; GFX6-NEXT: s_mov_b32 s11, 0x4f7ffffe ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s9, 0x4f7ffffe ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b32 s2, s8, s6 +; GFX6-NEXT: s_lshl_b32 s2, 0x1000, s6 ; GFX6-NEXT: s_ashr_i32 s3, s2, 31 ; GFX6-NEXT: s_add_i32 s2, s2, s3 ; GFX6-NEXT: s_xor_b32 s6, s2, s3 ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s6 -; GFX6-NEXT: s_lshl_b32 s7, s8, s7 -; GFX6-NEXT: s_ashr_i32 s9, s7, 31 -; GFX6-NEXT: s_add_i32 s7, s7, s9 +; GFX6-NEXT: s_lshl_b32 s2, 0x1000, s7 +; GFX6-NEXT: s_ashr_i32 s7, s2, 31 +; GFX6-NEXT: s_add_i32 s2, s2, s7 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX6-NEXT: s_sub_i32 s10, 0, s6 -; GFX6-NEXT: s_xor_b32 s7, s7, s9 -; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s7 -; GFX6-NEXT: v_mul_f32_e32 v0, s11, v0 +; GFX6-NEXT: s_xor_b32 s7, s2, s7 +; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s7 +; GFX6-NEXT: s_sub_i32 s2, 0, s6 +; GFX6-NEXT: v_mul_f32_e32 v0, s9, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX6-NEXT: s_ashr_i32 s8, s4, 31 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GFX6-NEXT: s_add_i32 s4, s4, s8 -; GFX6-NEXT: v_mul_lo_u32 v1, s10, v0 -; GFX6-NEXT: s_xor_b32 s4, s4, s8 -; GFX6-NEXT: s_sub_i32 s10, 0, s7 -; GFX6-NEXT: s_ashr_i32 s9, s5, 31 -; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; GFX6-NEXT: v_mul_f32_e32 v1, s11, v2 +; GFX6-NEXT: v_mul_lo_u32 v2, s2, v0 +; GFX6-NEXT: s_add_i32 s2, s4, s8 +; GFX6-NEXT: v_mul_f32_e32 v1, s9, v1 +; GFX6-NEXT: s_xor_b32 s4, s2, s8 +; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX6-NEXT: s_sub_i32 s2, 0, s7 +; GFX6-NEXT: s_ashr_i32 s9, s5, 31 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 ; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 -; GFX6-NEXT: v_mul_lo_u32 v2, s10, v1 +; GFX6-NEXT: v_mul_lo_u32 v2, s2, v1 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, s6 ; GFX6-NEXT: v_mul_hi_u32 v2, v1, v2 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 @@ -7000,64 +6986,63 @@ ; GFX9-LABEL: srem_v2i32_pow2_shl_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_movk_i32 s0, 0x1000 -; GFX9-NEXT: s_mov_b32 s8, 0x4f7ffffe -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_mov_b32 s9, 0x4f7ffffe +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s1, s0, s6 -; GFX9-NEXT: s_ashr_i32 s6, s1, 31 -; GFX9-NEXT: s_add_i32 s1, s1, s6 -; GFX9-NEXT: s_xor_b32 s1, s1, s6 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s1 -; GFX9-NEXT: s_lshl_b32 s0, s0, s7 -; GFX9-NEXT: s_ashr_i32 s6, s0, 31 -; GFX9-NEXT: s_add_i32 s0, s0, s6 -; GFX9-NEXT: s_xor_b32 s0, s0, s6 +; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s6 +; GFX9-NEXT: s_ashr_i32 s6, s3, 31 +; GFX9-NEXT: s_add_i32 s3, s3, s6 +; GFX9-NEXT: s_lshl_b32 s2, 0x1000, s7 +; GFX9-NEXT: s_xor_b32 s3, s3, s6 +; GFX9-NEXT: s_ashr_i32 s7, s2, 31 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX9-NEXT: s_add_i32 s2, s2, s7 +; GFX9-NEXT: s_xor_b32 s2, s2, s7 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s0 -; GFX9-NEXT: s_sub_i32 s7, 0, s1 +; GFX9-NEXT: s_sub_i32 s8, 0, s3 ; GFX9-NEXT: s_ashr_i32 s6, s4, 31 -; GFX9-NEXT: v_mul_f32_e32 v0, s8, v0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GFX9-NEXT: v_mul_f32_e32 v0, s9, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: s_add_i32 s4, s4, s6 -; GFX9-NEXT: s_xor_b32 s4, s4, s6 -; GFX9-NEXT: v_mul_f32_e32 v1, s8, v1 -; GFX9-NEXT: v_mul_lo_u32 v3, s7, v0 +; GFX9-NEXT: v_mul_f32_e32 v1, s9, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: s_sub_i32 s7, 0, s0 -; GFX9-NEXT: v_mul_hi_u32 v3, v0, v3 -; GFX9-NEXT: v_mul_lo_u32 v4, s7, v1 +; GFX9-NEXT: v_mul_lo_u32 v2, s8, v0 +; GFX9-NEXT: s_sub_i32 s8, 0, s2 +; GFX9-NEXT: s_xor_b32 s4, s4, s6 +; GFX9-NEXT: v_mul_lo_u32 v3, s8, v1 +; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX9-NEXT: s_ashr_i32 s7, s5, 31 ; GFX9-NEXT: s_add_i32 s5, s5, s7 -; GFX9-NEXT: v_add_u32_e32 v0, v0, v3 -; GFX9-NEXT: v_mul_hi_u32 v3, v1, v4 +; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 +; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 ; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX9-NEXT: s_xor_b32 s5, s5, s7 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 ; GFX9-NEXT: v_mul_hi_u32 v1, s5, v1 -; GFX9-NEXT: v_mul_lo_u32 v0, v0, s1 -; GFX9-NEXT: v_mul_lo_u32 v1, v1, s0 +; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mul_lo_u32 v1, v1, s2 ; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 -; GFX9-NEXT: v_subrev_u32_e32 v3, s1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s1, v0 +; GFX9-NEXT: v_subrev_u32_e32 v3, s3, v0 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s1, v0 +; GFX9-NEXT: v_subrev_u32_e32 v3, s3, v0 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v0 ; GFX9-NEXT: v_sub_u32_e32 v1, s5, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s0, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s0, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s2, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s0, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s0, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s2, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, s6, v0 ; GFX9-NEXT: v_xor_b32_e32 v1, s7, v1 ; GFX9-NEXT: v_subrev_u32_e32 v0, s6, v0 ; GFX9-NEXT: v_subrev_u32_e32 v1, s7, v1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %shl.y = shl <2 x i32> , %y %r = srem <2 x i32> %x, %shl.y @@ -7079,36 +7064,35 @@ ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 ; GFX6-NEXT: s_movk_i32 s4, 0xfee0 ; GFX6-NEXT: s_mov_b32 s5, 0x68958c89 -; GFX6-NEXT: v_mov_b32_e32 v7, 0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 ; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_movk_i32 s8, 0x11f +; GFX6-NEXT: s_mov_b32 s9, 0x976a7377 ; GFX6-NEXT: v_mul_lo_u32 v2, v0, s4 ; GFX6-NEXT: v_mul_hi_u32 v3, v0, s5 ; GFX6-NEXT: v_mul_lo_u32 v4, v1, s5 -; GFX6-NEXT: s_mov_b32 s9, 0x976a7377 +; GFX6-NEXT: v_mul_lo_u32 v5, v0, s5 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GFX6-NEXT: v_mul_lo_u32 v3, v0, s5 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 -; GFX6-NEXT: v_mul_lo_u32 v4, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v5, v0, v3 +; GFX6-NEXT: v_mul_lo_u32 v3, v0, v2 +; GFX6-NEXT: v_mul_hi_u32 v4, v0, v5 ; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v8, v1, v2 +; GFX6-NEXT: v_mul_hi_u32 v7, v1, v2 ; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc -; GFX6-NEXT: v_mul_lo_u32 v6, v1, v3 -; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v6, vcc +; GFX6-NEXT: v_mul_lo_u32 v6, v1, v5 +; GFX6-NEXT: v_mul_hi_u32 v5, v1, v5 ; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v8, v7, vcc +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v4, v5, vcc +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 @@ -7124,7 +7108,7 @@ ; GFX6-NEXT: v_mul_lo_u32 v4, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v5, v0, v3 ; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v8, v1, v2 +; GFX6-NEXT: v_mul_hi_u32 v7, v1, v2 ; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc @@ -7133,7 +7117,7 @@ ; GFX6-NEXT: s_mov_b32 s5, s1 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v8, v7, vcc +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 @@ -7149,13 +7133,13 @@ ; GFX6-NEXT: v_mul_hi_u32 v0, s3, v0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc -; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v5, v7, vcc +; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc ; GFX6-NEXT: v_mul_lo_u32 v2, v0, s8 ; GFX6-NEXT: v_mul_hi_u32 v3, v0, s9 ; GFX6-NEXT: v_mul_lo_u32 v4, v1, s9 -; GFX6-NEXT: v_mov_b32_e32 v5, s8 +; GFX6-NEXT: v_mov_b32_e32 v5, 0x11f ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_mul_lo_u32 v3, v0, s9 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 @@ -7201,14 +7185,13 @@ ; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: s_movk_i32 s2, 0xfee0 ; GFX9-NEXT: s_mov_b32 s3, 0x68958c89 -; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 ; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mul_lo_u32 v2, v0, s2 ; GFX9-NEXT: v_mul_hi_u32 v3, v0, s3 ; GFX9-NEXT: v_mul_lo_u32 v5, v1, s3 @@ -7217,16 +7200,16 @@ ; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 ; GFX9-NEXT: v_mul_hi_u32 v3, v0, v4 ; GFX9-NEXT: v_mul_lo_u32 v5, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v7, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v8, v1, v2 +; GFX9-NEXT: v_mul_hi_u32 v6, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v7, v1, v2 ; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc -; GFX9-NEXT: v_mul_lo_u32 v7, v1, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v6, vcc +; GFX9-NEXT: v_mul_lo_u32 v6, v1, v4 ; GFX9-NEXT: v_mul_hi_u32 v4, v1, v4 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v4, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v6, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v7, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 @@ -7240,17 +7223,17 @@ ; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 ; GFX9-NEXT: v_mul_lo_u32 v3, v0, v2 ; GFX9-NEXT: v_mul_hi_u32 v4, v0, v5 -; GFX9-NEXT: v_mul_hi_u32 v7, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v8, v1, v2 +; GFX9-NEXT: v_mul_hi_u32 v6, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v7, v1, v2 ; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v7, vcc -; GFX9-NEXT: v_mul_lo_u32 v7, v1, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v6, vcc +; GFX9-NEXT: v_mul_lo_u32 v6, v1, v5 ; GFX9-NEXT: v_mul_hi_u32 v5, v1, v5 ; GFX9-NEXT: s_mov_b32 s3, 0x976a7377 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v4, v5, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v6, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v7, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 @@ -7265,32 +7248,33 @@ ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc ; GFX9-NEXT: v_mul_lo_u32 v4, s7, v0 ; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 +; GFX9-NEXT: v_mov_b32_e32 v6, 0x11f ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v5, v6, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v5, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc ; GFX9-NEXT: v_mul_lo_u32 v2, v0, s2 ; GFX9-NEXT: v_mul_hi_u32 v3, v0, s3 ; GFX9-NEXT: v_mul_lo_u32 v4, v1, s3 -; GFX9-NEXT: v_mov_b32_e32 v5, s2 +; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 ; GFX9-NEXT: v_mul_lo_u32 v3, v0, s3 ; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 ; GFX9-NEXT: v_sub_u32_e32 v4, s7, v2 ; GFX9-NEXT: v_sub_co_u32_e32 v3, vcc, s6, v3 -; GFX9-NEXT: v_subb_co_u32_e64 v4, s[0:1], v4, v5, vcc -; GFX9-NEXT: v_subrev_co_u32_e64 v5, s[0:1], s3, v3 +; GFX9-NEXT: v_subb_co_u32_e64 v4, s[0:1], v4, v6, vcc +; GFX9-NEXT: v_subrev_co_u32_e64 v6, s[0:1], s3, v3 ; GFX9-NEXT: v_subbrev_co_u32_e64 v4, s[0:1], 0, v4, s[0:1] ; GFX9-NEXT: s_movk_i32 s3, 0x11e ; GFX9-NEXT: v_cmp_lt_u32_e64 s[0:1], s3, v4 ; GFX9-NEXT: s_mov_b32 s6, 0x976a7376 ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_lt_u32_e64 s[0:1], s6, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_lt_u32_e64 s[0:1], s6, v6 +; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v4, v7, v5, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], 2, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v4, v7, v6, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v6, s[0:1], 2, v0 ; GFX9-NEXT: v_addc_co_u32_e64 v7, s[0:1], 0, v1, s[0:1] ; GFX9-NEXT: v_add_co_u32_e64 v8, s[0:1], 1, v0 ; GFX9-NEXT: v_addc_co_u32_e64 v9, s[0:1], 0, v1, s[0:1] @@ -7305,10 +7289,10 @@ ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v8, v5, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v2, v8, v6, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[4:5] +; GFX9-NEXT: global_store_dwordx2 v5, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm %r = udiv i64 %x, 1235195949943 store i64 %r, i64 addrspace(1)* %out @@ -7454,9 +7438,9 @@ ; GFX6-NEXT: v_madak_f32 v0, 0, v0, 0x457ff000 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 ; GFX6-NEXT: s_movk_i32 s6, 0xf001 -; GFX6-NEXT: v_mov_b32_e32 v7, 0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd +; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 @@ -7467,23 +7451,22 @@ ; GFX6-NEXT: s_lshr_b64 s[8:9], s[0:1], 12 ; GFX6-NEXT: s_movk_i32 s0, 0xfff ; GFX6-NEXT: v_mul_hi_u32 v2, v0, s6 -; GFX6-NEXT: v_mul_lo_u32 v3, v1, s6 -; GFX6-NEXT: v_mul_lo_u32 v4, v0, s6 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: v_mul_lo_u32 v4, v1, s6 +; GFX6-NEXT: v_mul_lo_u32 v3, v0, s6 ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GFX6-NEXT: v_mul_lo_u32 v3, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v5, v0, v4 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GFX6-NEXT: v_mul_hi_u32 v5, v0, v3 +; GFX6-NEXT: v_mul_lo_u32 v4, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v8, v1, v2 +; GFX6-NEXT: v_mul_hi_u32 v7, v1, v2 ; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc -; GFX6-NEXT: v_mul_lo_u32 v6, v1, v4 -; GFX6-NEXT: v_mul_hi_u32 v4, v1, v4 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v8, v7, vcc +; GFX6-NEXT: v_mul_lo_u32 v6, v1, v3 +; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 @@ -7497,7 +7480,7 @@ ; GFX6-NEXT: v_mul_lo_u32 v3, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v5, v0, v4 ; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v8, v1, v2 +; GFX6-NEXT: v_mul_hi_u32 v7, v1, v2 ; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v5, v3 ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc @@ -7505,7 +7488,7 @@ ; GFX6-NEXT: v_mul_hi_u32 v4, v1, v4 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v8, v7, vcc +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 @@ -7521,7 +7504,7 @@ ; GFX6-NEXT: v_mul_hi_u32 v0, s3, v0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc -; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v5, v7, vcc +; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc ; GFX6-NEXT: v_mul_lo_u32 v4, v1, s0 @@ -7563,7 +7546,6 @@ ; GFX9-NEXT: v_madak_f32 v0, 0, v0, 0x457ff000 ; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: s_movk_i32 s2, 0xf001 -; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 @@ -7575,75 +7557,76 @@ ; GFX9-NEXT: v_mul_lo_u32 v3, v0, s2 ; GFX9-NEXT: v_sub_u32_e32 v2, v2, v0 ; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 -; GFX9-NEXT: v_mul_hi_u32 v6, v0, v3 +; GFX9-NEXT: v_mul_hi_u32 v5, v0, v3 ; GFX9-NEXT: v_mul_lo_u32 v4, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v8, v0, v2 -; GFX9-NEXT: v_mul_lo_u32 v7, v1, v3 +; GFX9-NEXT: v_mul_hi_u32 v7, v0, v2 +; GFX9-NEXT: v_mul_lo_u32 v6, v1, v3 ; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v6, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v8, vcc ; GFX9-NEXT: v_mul_hi_u32 v8, v1, v2 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v5, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc ; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v7 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v3, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v3, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v8, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc ; GFX9-NEXT: v_mul_hi_u32 v2, v0, s2 ; GFX9-NEXT: v_mul_lo_u32 v3, v1, s2 -; GFX9-NEXT: v_mul_lo_u32 v4, v0, s2 +; GFX9-NEXT: v_mul_lo_u32 v5, v0, s2 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: v_sub_u32_e32 v2, v2, v0 ; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 ; GFX9-NEXT: v_mul_lo_u32 v3, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v6, v0, v4 +; GFX9-NEXT: v_mul_hi_u32 v6, v0, v5 ; GFX9-NEXT: v_mul_hi_u32 v7, v0, v2 ; GFX9-NEXT: v_mul_hi_u32 v8, v1, v2 ; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v6, v3 ; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v7, vcc -; GFX9-NEXT: v_mul_lo_u32 v7, v1, v4 -; GFX9-NEXT: v_mul_hi_u32 v4, v1, v4 +; GFX9-NEXT: v_mul_lo_u32 v7, v1, v5 +; GFX9-NEXT: v_mul_hi_u32 v5, v1, v5 ; GFX9-NEXT: s_movk_i32 s0, 0xfff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], 12 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v4, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v5, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v5, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v8, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc ; GFX9-NEXT: v_mul_lo_u32 v2, s6, v1 ; GFX9-NEXT: v_mul_hi_u32 v3, s6, v0 -; GFX9-NEXT: v_mul_hi_u32 v4, s6, v1 +; GFX9-NEXT: v_mul_hi_u32 v5, s6, v1 ; GFX9-NEXT: v_mul_hi_u32 v6, s7, v1 ; GFX9-NEXT: v_mul_lo_u32 v1, s7, v1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc -; GFX9-NEXT: v_mul_lo_u32 v4, s7, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc +; GFX9-NEXT: v_mul_lo_u32 v5, s7, v0 ; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v6, v5, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v6, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 2, v0 -; GFX9-NEXT: v_mul_lo_u32 v4, v1, s0 +; GFX9-NEXT: v_mul_lo_u32 v5, v1, s0 ; GFX9-NEXT: v_mul_hi_u32 v6, v0, s0 ; GFX9-NEXT: v_mul_lo_u32 v9, v0, s0 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, 1, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v1, vcc -; GFX9-NEXT: v_add_u32_e32 v4, v6, v4 +; GFX9-NEXT: v_add_u32_e32 v5, v6, v5 ; GFX9-NEXT: v_mov_b32_e32 v6, s7 ; GFX9-NEXT: v_sub_co_u32_e32 v9, vcc, s6, v9 -; GFX9-NEXT: v_subb_co_u32_e32 v4, vcc, v6, v4, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v6, v5, vcc ; GFX9-NEXT: v_subrev_co_u32_e32 v6, vcc, s0, v9 -; GFX9-NEXT: v_subbrev_co_u32_e32 v10, vcc, 0, v4, vcc +; GFX9-NEXT: v_subbrev_co_u32_e32 v10, vcc, 0, v5, vcc ; GFX9-NEXT: s_movk_i32 s0, 0xffe ; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s0, v6 ; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc @@ -7652,16 +7635,16 @@ ; GFX9-NEXT: v_cmp_lt_u32_e64 s[0:1], s0, v9 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v4, -1, v6, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v5 +; GFX9-NEXT: v_cndmask_b32_e64 v5, -1, v6, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v5 ; GFX9-NEXT: v_cndmask_b32_e64 v3, v1, v3, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v2, v0, v1, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: global_store_dwordx4 v5, v[0:3], s[2:3] +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX9-NEXT: s_endpgm %r = udiv <2 x i64> %x, store <2 x i64> %r, <2 x i64> addrspace(1)* %out @@ -7736,38 +7719,37 @@ ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 ; GFX6-NEXT: s_movk_i32 s2, 0xfee0 ; GFX6-NEXT: s_mov_b32 s3, 0x689e0837 -; GFX6-NEXT: v_mov_b32_e32 v7, 0 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 ; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX6-NEXT: s_mov_b32 s12, 0x9761f7c9 -; GFX6-NEXT: v_mul_lo_u32 v2, v0, s2 -; GFX6-NEXT: v_mul_hi_u32 v3, v0, s3 -; GFX6-NEXT: v_mul_lo_u32 v4, v1, s3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s8, s4 ; GFX6-NEXT: s_movk_i32 s4, 0x11f +; GFX6-NEXT: v_mul_lo_u32 v2, v0, s2 +; GFX6-NEXT: v_mul_hi_u32 v3, v0, s3 +; GFX6-NEXT: v_mul_lo_u32 v4, v1, s3 +; GFX6-NEXT: v_mul_lo_u32 v5, v0, s3 +; GFX6-NEXT: s_mov_b32 s12, 0x9761f7c9 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GFX6-NEXT: v_mul_lo_u32 v3, v0, s3 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 -; GFX6-NEXT: v_mul_lo_u32 v4, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v5, v0, v3 +; GFX6-NEXT: v_mul_lo_u32 v3, v0, v2 +; GFX6-NEXT: v_mul_hi_u32 v4, v0, v5 ; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v8, v1, v2 +; GFX6-NEXT: v_mul_hi_u32 v7, v1, v2 ; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc -; GFX6-NEXT: v_mul_lo_u32 v6, v1, v3 -; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v6, vcc +; GFX6-NEXT: v_mul_lo_u32 v6, v1, v5 +; GFX6-NEXT: v_mul_hi_u32 v5, v1, v5 ; GFX6-NEXT: s_mov_b32 s9, s5 ; GFX6-NEXT: s_movk_i32 s5, 0x11e -; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v8, v7, vcc +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v4, v5, vcc +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 @@ -7783,7 +7765,7 @@ ; GFX6-NEXT: v_mul_lo_u32 v4, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v5, v0, v3 ; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v8, v1, v2 +; GFX6-NEXT: v_mul_hi_u32 v7, v1, v2 ; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc @@ -7791,7 +7773,7 @@ ; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v8, v7, vcc +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 @@ -7807,7 +7789,7 @@ ; GFX6-NEXT: v_mul_hi_u32 v0, s7, v0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc -; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v5, v7, vcc +; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc ; GFX6-NEXT: v_mul_lo_u32 v2, v0, s4 @@ -7817,7 +7799,7 @@ ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s7, v1 -; GFX6-NEXT: v_mov_b32_e32 v3, s4 +; GFX6-NEXT: v_mov_b32_e32 v3, 0x11f ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 ; GFX6-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, vcc ; GFX6-NEXT: v_subrev_i32_e64 v4, s[0:1], s12, v0 @@ -7857,35 +7839,34 @@ ; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: s_movk_i32 s2, 0xfee0 ; GFX9-NEXT: s_mov_b32 s3, 0x689e0837 -; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 ; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_movk_i32 s8, 0x11f +; GFX9-NEXT: s_mov_b32 s9, 0x9761f7c9 ; GFX9-NEXT: v_mul_lo_u32 v2, v0, s2 ; GFX9-NEXT: v_mul_hi_u32 v3, v0, s3 ; GFX9-NEXT: v_mul_lo_u32 v5, v1, s3 ; GFX9-NEXT: v_mul_lo_u32 v4, v0, s3 -; GFX9-NEXT: s_mov_b32 s9, 0x9761f7c9 +; GFX9-NEXT: s_mov_b32 s10, 0x9761f7c8 ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 ; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 ; GFX9-NEXT: v_mul_hi_u32 v3, v0, v4 ; GFX9-NEXT: v_mul_lo_u32 v5, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v7, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v8, v1, v2 +; GFX9-NEXT: v_mul_hi_u32 v6, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v7, v1, v2 ; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc -; GFX9-NEXT: v_mul_lo_u32 v7, v1, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v6, vcc +; GFX9-NEXT: v_mul_lo_u32 v6, v1, v4 ; GFX9-NEXT: v_mul_hi_u32 v4, v1, v4 -; GFX9-NEXT: s_mov_b32 s10, 0x9761f7c8 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v4, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v6, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v7, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 @@ -7898,16 +7879,16 @@ ; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 ; GFX9-NEXT: v_mul_lo_u32 v3, v0, v2 ; GFX9-NEXT: v_mul_hi_u32 v4, v0, v5 -; GFX9-NEXT: v_mul_hi_u32 v7, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v8, v1, v2 +; GFX9-NEXT: v_mul_hi_u32 v6, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v7, v1, v2 ; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v7, vcc -; GFX9-NEXT: v_mul_lo_u32 v7, v1, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v6, vcc +; GFX9-NEXT: v_mul_lo_u32 v6, v1, v5 ; GFX9-NEXT: v_mul_hi_u32 v5, v1, v5 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v4, v5, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v6, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v7, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 @@ -7924,46 +7905,47 @@ ; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v5, v6, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v5, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc ; GFX9-NEXT: v_mul_lo_u32 v2, v0, s8 ; GFX9-NEXT: v_mul_hi_u32 v3, v0, s9 ; GFX9-NEXT: v_mul_lo_u32 v1, v1, s9 ; GFX9-NEXT: v_mul_lo_u32 v0, v0, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 ; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 ; GFX9-NEXT: v_sub_u32_e32 v2, s7, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, 0x11f ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s6, v0 ; GFX9-NEXT: v_subb_co_u32_e64 v2, s[0:1], v2, v3, vcc -; GFX9-NEXT: v_subrev_co_u32_e64 v4, s[0:1], s9, v0 -; GFX9-NEXT: v_subbrev_co_u32_e64 v5, s[2:3], 0, v2, s[0:1] +; GFX9-NEXT: v_subrev_co_u32_e64 v5, s[0:1], s9, v0 +; GFX9-NEXT: v_subbrev_co_u32_e64 v6, s[2:3], 0, v2, s[0:1] ; GFX9-NEXT: s_movk_i32 s6, 0x11e -; GFX9-NEXT: v_cmp_lt_u32_e64 s[2:3], s6, v5 +; GFX9-NEXT: v_cmp_lt_u32_e64 s[2:3], s6, v6 ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[2:3] -; GFX9-NEXT: v_cmp_lt_u32_e64 s[2:3], s10, v4 +; GFX9-NEXT: v_cmp_lt_u32_e64 s[2:3], s10, v5 ; GFX9-NEXT: v_subb_co_u32_e64 v2, s[0:1], v2, v3, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s8, v5 -; GFX9-NEXT: v_subrev_co_u32_e64 v3, s[0:1], s9, v4 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s8, v6 +; GFX9-NEXT: v_subrev_co_u32_e64 v3, s[0:1], s9, v5 ; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[2:3] ; GFX9-NEXT: v_subbrev_co_u32_e64 v2, s[0:1], 0, v2, s[0:1] ; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v5, s7 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v5, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v6, v1, vcc ; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s6, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc ; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s10, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s8, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v4, v3, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v2, v5, v3, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[4:5] +; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm %r = urem i64 %x, 1235195393993 store i64 %r, i64 addrspace(1)* %out @@ -8066,13 +8048,12 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_movk_i32 s5, 0xfff ; GFX6-NEXT: v_mov_b32_e32 v1, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_and_b32 s4, s4, s5 -; GFX6-NEXT: s_and_b32 s5, s6, s5 ; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_and_b32 s4, s4, 0xfff +; GFX6-NEXT: s_and_b32 s5, s6, 0xfff ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: v_mov_b32_e32 v3, v1 @@ -8083,14 +8064,13 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_movk_i32 s0, 0xfff ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s1, s4, s0 -; GFX9-NEXT: s_and_b32 s0, s6, s0 -; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: s_and_b32 s0, s4, 0xfff +; GFX9-NEXT: s_and_b32 s1, s6, 0xfff +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: global_store_dwordx4 v1, v[0:3], s[2:3] ; GFX9-NEXT: s_endpgm %r = urem <2 x i64> %x, @@ -8174,8 +8154,8 @@ ; GFX6-NEXT: v_madak_f32 v0, 0, v0, 0x4996c7d8 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 ; GFX6-NEXT: s_mov_b32 s5, 0xffed2705 -; GFX6-NEXT: v_mov_b32_e32 v7, 0 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 @@ -8192,12 +8172,12 @@ ; GFX6-NEXT: s_addc_u32 s3, s3, s8 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 -; GFX6-NEXT: v_mul_lo_u32 v3, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v5, v0, v4 +; GFX6-NEXT: v_mul_hi_u32 v3, v0, v4 +; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v8, v1, v2 +; GFX6-NEXT: v_mul_hi_u32 v7, v1, v2 ; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc ; GFX6-NEXT: v_mul_lo_u32 v6, v1, v4 ; GFX6-NEXT: v_mul_hi_u32 v4, v1, v4 @@ -8205,7 +8185,7 @@ ; GFX6-NEXT: s_mov_b32 s4, s0 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v8, v7, vcc +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 @@ -8213,23 +8193,22 @@ ; GFX6-NEXT: v_mul_lo_u32 v2, v1, s5 ; GFX6-NEXT: v_mul_hi_u32 v3, v0, s5 ; GFX6-NEXT: s_mov_b32 s0, 0x12d8fb -; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_mul_lo_u32 v3, v0, s5 ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 ; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v8, v0, v3 -; GFX6-NEXT: v_mul_hi_u32 v9, v0, v2 +; GFX6-NEXT: v_mul_hi_u32 v7, v0, v3 +; GFX6-NEXT: v_mul_hi_u32 v8, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v5, v1, v3 ; GFX6-NEXT: v_mul_lo_u32 v3, v1, v3 ; GFX6-NEXT: v_mul_hi_u32 v4, v1, v2 -; GFX6-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; GFX6-NEXT: v_addc_u32_e32 v8, vcc, 0, v9, vcc +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc ; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v6, v3 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v5, vcc -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v4, v7, vcc +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 @@ -8246,7 +8225,7 @@ ; GFX6-NEXT: s_mov_b32 s5, s1 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc -; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v5, v7, vcc +; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc ; GFX6-NEXT: v_mul_lo_u32 v4, v1, s0 @@ -8291,7 +8270,6 @@ ; GFX9-NEXT: v_madak_f32 v0, 0, v0, 0x4996c7d8 ; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: s_mov_b32 s2, 0xffed2705 -; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 @@ -8306,16 +8284,16 @@ ; GFX9-NEXT: v_sub_u32_e32 v2, v2, v0 ; GFX9-NEXT: v_mul_hi_u32 v3, v0, v4 ; GFX9-NEXT: v_mul_lo_u32 v6, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v8, v0, v2 -; GFX9-NEXT: v_mul_lo_u32 v7, v1, v4 +; GFX9-NEXT: v_mul_hi_u32 v7, v0, v2 +; GFX9-NEXT: v_mul_lo_u32 v5, v1, v4 ; GFX9-NEXT: v_mul_hi_u32 v4, v1, v4 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v8, vcc ; GFX9-NEXT: v_mul_hi_u32 v8, v1, v2 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v7, vcc ; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v4, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v5, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v8, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 @@ -8328,18 +8306,18 @@ ; GFX9-NEXT: s_add_u32 s0, s6, s2 ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 ; GFX9-NEXT: v_sub_u32_e32 v2, v2, v0 -; GFX9-NEXT: v_mul_lo_u32 v7, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v8, v0, v4 -; GFX9-NEXT: v_mul_hi_u32 v9, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v6, v1, v4 +; GFX9-NEXT: v_mul_lo_u32 v6, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v7, v0, v4 +; GFX9-NEXT: v_mul_hi_u32 v8, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v5, v1, v4 ; GFX9-NEXT: v_mul_lo_u32 v4, v1, v4 ; GFX9-NEXT: v_mul_hi_u32 v3, v1, v2 -; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v8, v7 -; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v7, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v8, vcc ; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v7, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v6, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v6, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v5, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v4, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 @@ -8349,32 +8327,33 @@ ; GFX9-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] ; GFX9-NEXT: v_mul_lo_u32 v2, s0, v1 ; GFX9-NEXT: v_mul_hi_u32 v3, s0, v0 -; GFX9-NEXT: v_mul_hi_u32 v4, s0, v1 +; GFX9-NEXT: v_mul_hi_u32 v5, s0, v1 ; GFX9-NEXT: v_mul_hi_u32 v6, s1, v1 ; GFX9-NEXT: v_mul_lo_u32 v1, s1, v1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc -; GFX9-NEXT: v_mul_lo_u32 v4, s1, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc +; GFX9-NEXT: v_mul_lo_u32 v5, s1, v0 ; GFX9-NEXT: v_mul_hi_u32 v0, s1, v0 ; GFX9-NEXT: s_mov_b32 s3, 0x12d8fb -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v6, v5, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v6, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 2, v0 -; GFX9-NEXT: v_mul_lo_u32 v4, v1, s3 +; GFX9-NEXT: v_mul_lo_u32 v5, v1, s3 ; GFX9-NEXT: v_mul_hi_u32 v6, v0, s3 ; GFX9-NEXT: v_mul_lo_u32 v9, v0, s3 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, 1, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v1, vcc -; GFX9-NEXT: v_add_u32_e32 v4, v6, v4 +; GFX9-NEXT: v_add_u32_e32 v5, v6, v5 ; GFX9-NEXT: v_mov_b32_e32 v6, s1 ; GFX9-NEXT: v_sub_co_u32_e32 v9, vcc, s0, v9 -; GFX9-NEXT: v_subb_co_u32_e32 v4, vcc, v6, v4, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v6, v5, vcc ; GFX9-NEXT: v_subrev_co_u32_e32 v6, vcc, s3, v9 -; GFX9-NEXT: v_subbrev_co_u32_e32 v10, vcc, 0, v4, vcc +; GFX9-NEXT: v_subbrev_co_u32_e32 v10, vcc, 0, v5, vcc ; GFX9-NEXT: s_mov_b32 s0, 0x12d8fa ; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s0, v6 ; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc @@ -8383,9 +8362,9 @@ ; GFX9-NEXT: v_cmp_lt_u32_e64 s[0:1], s0, v9 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v4, -1, v6, s[0:1] -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v5 +; GFX9-NEXT: v_cndmask_b32_e64 v5, -1, v6, s[0:1] +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v5 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] @@ -8395,7 +8374,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s2, v0 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v2, vcc -; GFX9-NEXT: global_store_dwordx2 v5, v[0:1], s[4:5] +; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm %r = sdiv i64 %x, 1235195 store i64 %r, i64 addrspace(1)* %out @@ -8501,48 +8480,47 @@ ; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[12:13] ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc -; GFX6-NEXT: v_mov_b32_e32 v4, 0 -; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v7, v4, vcc +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc ; GFX6-NEXT: v_mul_lo_u32 v2, s4, v1 ; GFX6-NEXT: v_mul_hi_u32 v3, s4, v0 -; GFX6-NEXT: v_mul_lo_u32 v5, s5, v0 +; GFX6-NEXT: v_mul_lo_u32 v4, s5, v0 ; GFX6-NEXT: s_mov_b32 s5, s1 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_mul_lo_u32 v3, s4, v0 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; GFX6-NEXT: v_mul_lo_u32 v7, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v8, v0, v3 -; GFX6-NEXT: v_mul_hi_u32 v9, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v6, v1, v3 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2 +; GFX6-NEXT: v_mul_hi_u32 v7, v0, v3 +; GFX6-NEXT: v_mul_hi_u32 v8, v0, v2 +; GFX6-NEXT: v_mul_hi_u32 v5, v1, v3 ; GFX6-NEXT: v_mul_lo_u32 v3, v1, v3 -; GFX6-NEXT: v_mul_hi_u32 v5, v1, v2 -; GFX6-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GFX6-NEXT: v_addc_u32_e32 v8, vcc, 0, v9, vcc +; GFX6-NEXT: v_mul_hi_u32 v4, v1, v2 +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc ; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v6, vcc -; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v5, v4, vcc +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v6, v3 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc ; GFX6-NEXT: v_mul_lo_u32 v2, s2, v1 ; GFX6-NEXT: v_mul_hi_u32 v3, s2, v0 -; GFX6-NEXT: v_mul_hi_u32 v5, s2, v1 -; GFX6-NEXT: v_mul_hi_u32 v6, s3, v1 +; GFX6-NEXT: v_mul_hi_u32 v4, s2, v1 +; GFX6-NEXT: v_mul_hi_u32 v5, s3, v1 ; GFX6-NEXT: v_mul_lo_u32 v1, s3, v1 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc -; GFX6-NEXT: v_mul_lo_u32 v5, s3, v0 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GFX6-NEXT: v_mul_lo_u32 v4, s3, v0 ; GFX6-NEXT: v_mul_hi_u32 v0, s3, v0 ; GFX6-NEXT: s_mov_b32 s4, s0 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc -; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v6, v4, vcc +; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc ; GFX6-NEXT: v_mul_lo_u32 v2, s10, v1 @@ -8594,7 +8572,6 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x34 ; GFX9-NEXT: s_mov_b64 s[2:3], 0x1000 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[4:5], s[2:3], s4 ; GFX9-NEXT: s_ashr_i32 s2, s5, 31 @@ -8614,114 +8591,115 @@ ; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_mul_lo_u32 v3, s10, v1 -; GFX9-NEXT: v_mul_hi_u32 v4, s10, v0 -; GFX9-NEXT: v_mul_lo_u32 v6, s4, v0 -; GFX9-NEXT: v_mul_lo_u32 v5, s10, v0 -; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v6 -; GFX9-NEXT: v_mul_hi_u32 v4, v0, v5 -; GFX9-NEXT: v_mul_lo_u32 v6, v0, v3 -; GFX9-NEXT: v_mul_hi_u32 v8, v0, v3 -; GFX9-NEXT: v_mul_hi_u32 v7, v1, v5 -; GFX9-NEXT: v_mul_lo_u32 v5, v1, v5 -; GFX9-NEXT: v_mul_hi_u32 v9, v1, v3 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v8, vcc -; GFX9-NEXT: v_mul_lo_u32 v3, v1, v3 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v6, v7, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v9, v2, vcc -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc -; GFX9-NEXT: v_mul_lo_u32 v3, s10, v1 -; GFX9-NEXT: v_mul_hi_u32 v4, s10, v0 +; GFX9-NEXT: v_mul_lo_u32 v2, s10, v1 +; GFX9-NEXT: v_mul_hi_u32 v3, s10, v0 ; GFX9-NEXT: v_mul_lo_u32 v5, s4, v0 -; GFX9-NEXT: v_mul_lo_u32 v6, s10, v0 +; GFX9-NEXT: v_mul_lo_u32 v4, s10, v0 +; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 +; GFX9-NEXT: v_mul_hi_u32 v3, v0, v4 +; GFX9-NEXT: v_mul_lo_u32 v5, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v7, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v6, v1, v4 +; GFX9-NEXT: v_mul_lo_u32 v4, v1, v4 +; GFX9-NEXT: v_mul_hi_u32 v8, v1, v2 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc +; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v6, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v8, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX9-NEXT: v_mul_lo_u32 v2, s10, v1 +; GFX9-NEXT: v_mul_hi_u32 v3, s10, v0 +; GFX9-NEXT: v_mul_lo_u32 v4, s4, v0 +; GFX9-NEXT: v_mul_lo_u32 v5, s10, v0 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v5 -; GFX9-NEXT: v_mul_lo_u32 v7, v0, v3 -; GFX9-NEXT: v_mul_hi_u32 v8, v0, v6 -; GFX9-NEXT: v_mul_hi_u32 v9, v0, v3 -; GFX9-NEXT: v_mul_hi_u32 v5, v1, v6 -; GFX9-NEXT: v_mul_lo_u32 v6, v1, v6 -; GFX9-NEXT: v_mul_hi_u32 v4, v1, v3 -; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v8, v7 -; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v9, vcc -; GFX9-NEXT: v_mul_lo_u32 v3, v1, v3 +; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 +; GFX9-NEXT: v_mul_lo_u32 v6, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v7, v0, v5 +; GFX9-NEXT: v_mul_hi_u32 v8, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v4, v1, v5 +; GFX9-NEXT: v_mul_lo_u32 v5, v1, v5 +; GFX9-NEXT: v_mul_hi_u32 v3, v1, v2 ; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v7, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v8, v5, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v2, vcc -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v5, v3 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v8, vcc +; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v6, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v4, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v4, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s10, s7, 31 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_add_u32 s0, s6, s10 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 ; GFX9-NEXT: s_mov_b32 s11, s10 ; GFX9-NEXT: s_addc_u32 s1, s7, s10 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc ; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], s[10:11] -; GFX9-NEXT: v_mul_lo_u32 v3, s6, v1 -; GFX9-NEXT: v_mul_hi_u32 v4, s6, v0 -; GFX9-NEXT: v_mul_hi_u32 v5, s6, v1 -; GFX9-NEXT: v_mul_hi_u32 v6, s7, v1 +; GFX9-NEXT: v_mul_lo_u32 v2, s6, v1 +; GFX9-NEXT: v_mul_hi_u32 v3, s6, v0 +; GFX9-NEXT: v_mul_hi_u32 v4, s6, v1 +; GFX9-NEXT: v_mul_hi_u32 v5, s7, v1 ; GFX9-NEXT: v_mul_lo_u32 v1, s7, v1 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc -; GFX9-NEXT: v_mul_lo_u32 v5, s7, v0 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc +; GFX9-NEXT: v_mul_lo_u32 v4, s7, v0 ; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 -; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v4, v0, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v2, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc -; GFX9-NEXT: v_mul_lo_u32 v3, s8, v1 -; GFX9-NEXT: v_mul_hi_u32 v4, s8, v0 -; GFX9-NEXT: v_mul_lo_u32 v5, s9, v0 ; GFX9-NEXT: v_mov_b32_e32 v6, s9 -; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 -; GFX9-NEXT: v_mul_lo_u32 v4, s8, v0 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v5 -; GFX9-NEXT: v_sub_u32_e32 v5, s7, v3 -; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, s6, v4 -; GFX9-NEXT: v_subb_co_u32_e64 v5, s[0:1], v5, v6, vcc -; GFX9-NEXT: v_subrev_co_u32_e64 v6, s[0:1], s8, v4 -; GFX9-NEXT: v_subbrev_co_u32_e64 v5, s[0:1], 0, v5, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v5 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc +; GFX9-NEXT: v_mul_lo_u32 v2, s8, v1 +; GFX9-NEXT: v_mul_hi_u32 v3, s8, v0 +; GFX9-NEXT: v_mul_lo_u32 v4, s9, v0 +; GFX9-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-NEXT: v_mul_lo_u32 v3, s8, v0 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 +; GFX9-NEXT: v_sub_u32_e32 v4, s7, v2 +; GFX9-NEXT: v_sub_co_u32_e32 v3, vcc, s6, v3 +; GFX9-NEXT: v_subb_co_u32_e64 v4, s[0:1], v4, v6, vcc +; GFX9-NEXT: v_subrev_co_u32_e64 v6, s[0:1], s8, v3 +; GFX9-NEXT: v_subbrev_co_u32_e64 v4, s[0:1], 0, v4, s[0:1] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v6 ; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v5, v7, v6, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v4, v7, v6, s[0:1] ; GFX9-NEXT: v_add_co_u32_e64 v6, s[0:1], 2, v0 ; GFX9-NEXT: v_addc_co_u32_e64 v7, s[0:1], 0, v1, s[0:1] ; GFX9-NEXT: v_add_co_u32_e64 v8, s[0:1], 1, v0 ; GFX9-NEXT: v_addc_co_u32_e64 v9, s[0:1], 0, v1, s[0:1] -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v5, v9, v7, s[0:1] +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v4, v9, v7, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v7, s7 -; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v7, v3, vcc -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v3 +; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v7, v2, vcc +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s9, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v4, vcc -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GFX9-NEXT: v_cndmask_b32_e64 v3, v8, v6, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s9, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v2, v8, v6, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: s_xor_b64 s[0:1], s[10:11], s[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, s0, v0 ; GFX9-NEXT: v_xor_b32_e32 v1, s1, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s0, v0 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v2, vcc +; GFX9-NEXT: global_store_dwordx2 v5, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm %shl.y = shl i64 4096, %y %r = sdiv i64 %x, %shl.y @@ -8842,10 +8820,9 @@ ; GFX6-NEXT: s_add_u32 s0, s2, s10 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc -; GFX6-NEXT: v_mov_b32_e32 v4, 0 -; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v7, v4, vcc +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc ; GFX6-NEXT: v_mul_lo_u32 v2, v1, s6 @@ -8856,36 +8833,36 @@ ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_mul_lo_u32 v3, v0, s6 ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 -; GFX6-NEXT: v_mul_lo_u32 v7, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v8, v0, v3 -; GFX6-NEXT: v_mul_hi_u32 v9, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v6, v1, v3 +; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2 +; GFX6-NEXT: v_mul_hi_u32 v7, v0, v3 +; GFX6-NEXT: v_mul_hi_u32 v8, v0, v2 +; GFX6-NEXT: v_mul_hi_u32 v5, v1, v3 ; GFX6-NEXT: v_mul_lo_u32 v3, v1, v3 -; GFX6-NEXT: v_mul_hi_u32 v5, v1, v2 -; GFX6-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GFX6-NEXT: v_addc_u32_e32 v8, vcc, 0, v9, vcc +; GFX6-NEXT: v_mul_hi_u32 v4, v1, v2 +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc ; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v6, vcc -; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v5, v4, vcc +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v6, v3 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc ; GFX6-NEXT: v_mul_lo_u32 v2, s0, v1 ; GFX6-NEXT: v_mul_hi_u32 v3, s0, v0 -; GFX6-NEXT: v_mul_hi_u32 v5, s0, v1 -; GFX6-NEXT: v_mul_hi_u32 v6, s1, v1 +; GFX6-NEXT: v_mul_hi_u32 v4, s0, v1 +; GFX6-NEXT: v_mul_hi_u32 v5, s1, v1 ; GFX6-NEXT: v_mul_lo_u32 v1, s1, v1 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc -; GFX6-NEXT: v_mul_lo_u32 v5, s1, v0 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GFX6-NEXT: v_mul_lo_u32 v4, s1, v0 ; GFX6-NEXT: v_mul_hi_u32 v0, s1, v0 ; GFX6-NEXT: s_movk_i32 s2, 0xfff ; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc -; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v6, v4, vcc +; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc ; GFX6-NEXT: v_mul_lo_u32 v4, v1, s2 @@ -8933,95 +8910,95 @@ ; GFX9-NEXT: v_mac_f32_e32 v0, 0, v1 ; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: s_movk_i32 s8, 0xf001 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 ; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_ashr_i32 s0, s5, 31 +; GFX9-NEXT: s_lshr_b32 s0, s0, 20 ; GFX9-NEXT: v_mul_hi_u32 v2, v0, s8 ; GFX9-NEXT: v_mul_lo_u32 v3, v1, s8 -; GFX9-NEXT: v_mul_lo_u32 v5, v0, s8 +; GFX9-NEXT: v_mul_lo_u32 v4, v0, s8 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: s_addc_u32 s1, s5, 0 ; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 ; GFX9-NEXT: v_sub_u32_e32 v2, v2, v0 ; GFX9-NEXT: v_mul_lo_u32 v3, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v6, v0, v5 -; GFX9-NEXT: v_mul_hi_u32 v7, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v8, v1, v2 +; GFX9-NEXT: v_mul_hi_u32 v5, v0, v4 +; GFX9-NEXT: v_mul_hi_u32 v6, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v7, v1, v2 ; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v6, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v7, vcc -; GFX9-NEXT: v_mul_lo_u32 v7, v1, v5 -; GFX9-NEXT: v_mul_hi_u32 v5, v1, v5 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v5, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v8, v4, vcc +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v5, v3 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v6, vcc +; GFX9-NEXT: v_mul_lo_u32 v6, v1, v4 +; GFX9-NEXT: v_mul_hi_u32 v4, v1, v4 +; GFX9-NEXT: s_ashr_i64 s[4:5], s[0:1], 12 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v4, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v7, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc ; GFX9-NEXT: v_mul_lo_u32 v2, v1, s8 ; GFX9-NEXT: v_mul_hi_u32 v3, v0, s8 -; GFX9-NEXT: v_mul_lo_u32 v5, v0, s8 -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s2, s5, 31 +; GFX9-NEXT: v_mul_lo_u32 v4, v0, s8 +; GFX9-NEXT: s_ashr_i32 s8, s7, 31 +; GFX9-NEXT: s_add_u32 s0, s6, s8 ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 ; GFX9-NEXT: v_sub_u32_e32 v2, v2, v0 -; GFX9-NEXT: v_mul_lo_u32 v7, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v8, v0, v5 -; GFX9-NEXT: v_mul_hi_u32 v9, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v6, v1, v5 -; GFX9-NEXT: v_mul_lo_u32 v5, v1, v5 +; GFX9-NEXT: v_mul_lo_u32 v6, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v7, v0, v4 +; GFX9-NEXT: v_mul_hi_u32 v8, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v5, v1, v4 +; GFX9-NEXT: v_mul_lo_u32 v4, v1, v4 ; GFX9-NEXT: v_mul_hi_u32 v3, v1, v2 -; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v8, v7 -; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v7, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v8, vcc ; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX9-NEXT: s_lshr_b32 s2, s2, 20 -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 -; GFX9-NEXT: s_add_u32 s2, s4, s2 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v8, v6, vcc -; GFX9-NEXT: s_addc_u32 s3, s5, 0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v4, vcc -; GFX9-NEXT: s_ashr_i64 s[2:3], s[2:3], 12 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v5, v2 -; GFX9-NEXT: s_ashr_i32 s4, s7, 31 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v6, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v5, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v4, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: s_add_u32 s6, s6, s4 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: s_mov_b32 s5, s4 -; GFX9-NEXT: s_addc_u32 s7, s7, s4 +; GFX9-NEXT: s_mov_b32 s9, s8 +; GFX9-NEXT: s_addc_u32 s1, s7, s8 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5] -; GFX9-NEXT: v_mul_lo_u32 v2, s6, v1 -; GFX9-NEXT: v_mul_hi_u32 v3, s6, v0 -; GFX9-NEXT: v_mul_hi_u32 v5, s6, v1 -; GFX9-NEXT: v_mul_hi_u32 v6, s7, v1 -; GFX9-NEXT: v_mul_lo_u32 v1, s7, v1 +; GFX9-NEXT: s_xor_b64 s[0:1], s[0:1], s[8:9] +; GFX9-NEXT: v_mul_lo_u32 v2, s0, v1 +; GFX9-NEXT: v_mul_hi_u32 v3, s0, v0 +; GFX9-NEXT: v_mul_hi_u32 v5, s0, v1 +; GFX9-NEXT: v_mul_hi_u32 v6, s1, v1 +; GFX9-NEXT: v_mul_lo_u32 v1, s1, v1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc -; GFX9-NEXT: v_mul_lo_u32 v5, s7, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 -; GFX9-NEXT: s_movk_i32 s0, 0xfff +; GFX9-NEXT: v_mul_lo_u32 v5, s1, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, s1, v0 +; GFX9-NEXT: s_movk_i32 s6, 0xfff +; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v6, v4, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v6, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 2, v0 -; GFX9-NEXT: v_mul_lo_u32 v5, v1, s0 -; GFX9-NEXT: v_mul_hi_u32 v6, v0, s0 -; GFX9-NEXT: v_mul_lo_u32 v9, v0, s0 +; GFX9-NEXT: v_mul_lo_u32 v5, v1, s6 +; GFX9-NEXT: v_mul_hi_u32 v6, v0, s6 +; GFX9-NEXT: v_mul_lo_u32 v9, v0, s6 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, 1, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v1, vcc ; GFX9-NEXT: v_add_u32_e32 v5, v6, v5 -; GFX9-NEXT: v_mov_b32_e32 v6, s7 -; GFX9-NEXT: v_sub_co_u32_e32 v9, vcc, s6, v9 +; GFX9-NEXT: v_mov_b32_e32 v6, s1 +; GFX9-NEXT: v_sub_co_u32_e32 v9, vcc, s0, v9 ; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v6, v5, vcc -; GFX9-NEXT: v_subrev_co_u32_e32 v6, vcc, s0, v9 +; GFX9-NEXT: v_subrev_co_u32_e32 v6, vcc, s6, v9 ; GFX9-NEXT: v_subbrev_co_u32_e32 v10, vcc, 0, v5, vcc ; GFX9-NEXT: s_movk_i32 s0, 0xffe ; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s0, v6 @@ -9038,14 +9015,14 @@ ; GFX9-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] -; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0 -; GFX9-NEXT: v_xor_b32_e32 v1, s4, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s4, v0 +; GFX9-NEXT: v_xor_b32_e32 v0, s8, v0 +; GFX9-NEXT: v_xor_b32_e32 v1, s8, v1 +; GFX9-NEXT: v_mov_b32_e32 v3, s8 +; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s8, v0 ; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX9-NEXT: s_endpgm %r = sdiv <2 x i64> %x, store <2 x i64> %r, <2 x i64> addrspace(1)* %out @@ -9096,201 +9073,200 @@ ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 ; GFX6-NEXT: v_mac_f32_e32 v0, s21, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v0 +; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: s_mov_b32 s17, s16 ; GFX6-NEXT: s_addc_u32 s1, s5, s16 -; GFX6-NEXT: v_mul_lo_u32 v0, s10, v1 -; GFX6-NEXT: v_mul_hi_u32 v3, s10, v2 -; GFX6-NEXT: v_mul_lo_u32 v4, s11, v2 -; GFX6-NEXT: v_mul_lo_u32 v5, s10, v2 +; GFX6-NEXT: v_mul_lo_u32 v2, s10, v1 +; GFX6-NEXT: v_mul_hi_u32 v3, s10, v0 +; GFX6-NEXT: v_mul_lo_u32 v4, s11, v0 +; GFX6-NEXT: v_mul_lo_u32 v5, s10, v0 ; GFX6-NEXT: s_xor_b64 s[4:5], s[0:1], s[16:17] -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v3, v0 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v0, v4 -; GFX6-NEXT: v_mul_lo_u32 v0, v2, v3 -; GFX6-NEXT: v_mul_hi_u32 v4, v2, v5 -; GFX6-NEXT: v_mul_hi_u32 v6, v2, v3 -; GFX6-NEXT: v_mul_hi_u32 v7, v1, v3 -; GFX6-NEXT: v_mul_lo_u32 v3, v1, v3 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v4, v0 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GFX6-NEXT: v_mul_lo_u32 v3, v0, v2 +; GFX6-NEXT: v_mul_hi_u32 v4, v0, v5 +; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 +; GFX6-NEXT: v_mul_hi_u32 v7, v1, v2 +; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v6, vcc ; GFX6-NEXT: v_mul_lo_u32 v6, v1, v5 ; GFX6-NEXT: v_mul_hi_u32 v5, v1, v5 ; GFX6-NEXT: s_xor_b64 s[14:15], s[16:17], s[14:15] -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v4, v5, vcc -; GFX6-NEXT: v_mov_b32_e32 v0, 0 -; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v7, v0, vcc -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc -; GFX6-NEXT: v_mul_lo_u32 v3, s10, v1 -; GFX6-NEXT: v_mul_hi_u32 v4, s10, v2 -; GFX6-NEXT: v_mul_lo_u32 v5, s11, v2 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v4, v5, vcc +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GFX6-NEXT: v_mul_lo_u32 v2, s10, v1 +; GFX6-NEXT: v_mul_hi_u32 v3, s10, v0 +; GFX6-NEXT: v_mul_lo_u32 v4, s11, v0 ; GFX6-NEXT: s_mov_b32 s11, 0xf000 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GFX6-NEXT: v_mul_lo_u32 v4, s10, v2 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GFX6-NEXT: v_mul_lo_u32 v7, v2, v3 -; GFX6-NEXT: v_mul_hi_u32 v8, v2, v4 -; GFX6-NEXT: v_mul_hi_u32 v9, v2, v3 -; GFX6-NEXT: v_mul_hi_u32 v6, v1, v4 -; GFX6-NEXT: v_mul_lo_u32 v4, v1, v4 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GFX6-NEXT: v_mul_lo_u32 v3, s10, v0 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2 +; GFX6-NEXT: v_mul_hi_u32 v7, v0, v3 +; GFX6-NEXT: v_mul_hi_u32 v8, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v5, v1, v3 -; GFX6-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GFX6-NEXT: v_addc_u32_e32 v8, vcc, 0, v9, vcc ; GFX6-NEXT: v_mul_lo_u32 v3, v1, v3 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v8, v6, vcc -; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v5, v0, vcc -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc -; GFX6-NEXT: v_mul_lo_u32 v3, s4, v1 -; GFX6-NEXT: v_mul_hi_u32 v4, s4, v2 -; GFX6-NEXT: v_mul_hi_u32 v5, s4, v1 -; GFX6-NEXT: v_mul_hi_u32 v6, s5, v1 +; GFX6-NEXT: v_mul_hi_u32 v4, v1, v2 +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc +; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v6, v3 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GFX6-NEXT: v_mul_lo_u32 v2, s4, v1 +; GFX6-NEXT: v_mul_hi_u32 v3, s4, v0 +; GFX6-NEXT: v_mul_hi_u32 v4, s4, v1 +; GFX6-NEXT: v_mul_hi_u32 v5, s5, v1 ; GFX6-NEXT: v_mul_lo_u32 v1, s5, v1 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc -; GFX6-NEXT: v_mul_lo_u32 v5, s5, v2 -; GFX6-NEXT: v_mul_hi_u32 v2, s5, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GFX6-NEXT: v_mul_lo_u32 v4, s5, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0 ; GFX6-NEXT: s_mov_b32 s10, -1 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v4, v2, vcc -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v6, v0, vcc -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 -; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc -; GFX6-NEXT: v_mul_lo_u32 v3, s12, v2 -; GFX6-NEXT: v_mul_hi_u32 v4, s12, v1 -; GFX6-NEXT: v_mul_lo_u32 v5, s13, v1 -; GFX6-NEXT: v_mov_b32_e32 v6, s13 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GFX6-NEXT: v_mul_lo_u32 v4, s12, v1 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s5, v3 -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s4, v4 -; GFX6-NEXT: v_subb_u32_e64 v5, s[0:1], v5, v6, vcc -; GFX6-NEXT: v_subrev_i32_e64 v6, s[0:1], s12, v4 -; GFX6-NEXT: v_subbrev_u32_e64 v5, s[0:1], 0, v5, s[0:1] -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v5 -; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v6 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc +; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc +; GFX6-NEXT: v_mul_lo_u32 v2, s12, v1 +; GFX6-NEXT: v_mul_hi_u32 v3, s12, v0 +; GFX6-NEXT: v_mul_lo_u32 v4, s13, v0 +; GFX6-NEXT: v_mov_b32_e32 v5, s13 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GFX6-NEXT: v_mul_lo_u32 v3, s12, v0 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s5, v2 +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s4, v3 +; GFX6-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc +; GFX6-NEXT: v_subrev_i32_e64 v5, s[0:1], s12, v3 +; GFX6-NEXT: v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1] +; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v4 ; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] -; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v5 -; GFX6-NEXT: v_cndmask_b32_e64 v5, v7, v6, s[0:1] -; GFX6-NEXT: v_add_i32_e64 v6, s[0:1], 2, v1 -; GFX6-NEXT: v_addc_u32_e64 v7, s[0:1], 0, v2, s[0:1] -; GFX6-NEXT: v_add_i32_e64 v8, s[0:1], 1, v1 -; GFX6-NEXT: v_addc_u32_e64 v9, s[0:1], 0, v2, s[0:1] +; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v5 +; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] +; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v4 +; GFX6-NEXT: v_cndmask_b32_e64 v4, v6, v5, s[0:1] +; GFX6-NEXT: v_add_i32_e64 v5, s[0:1], 2, v0 +; GFX6-NEXT: v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1] +; GFX6-NEXT: v_add_i32_e64 v7, s[0:1], 1, v0 +; GFX6-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1] ; GFX6-NEXT: s_ashr_i32 s4, s3, 31 -; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v5 +; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 ; GFX6-NEXT: s_add_u32 s2, s2, s4 -; GFX6-NEXT: v_cndmask_b32_e64 v5, v9, v7, s[0:1] -; GFX6-NEXT: v_mov_b32_e32 v7, s5 +; GFX6-NEXT: v_cndmask_b32_e64 v4, v8, v6, s[0:1] +; GFX6-NEXT: v_mov_b32_e32 v6, s5 ; GFX6-NEXT: s_mov_b32 s5, s4 ; GFX6-NEXT: s_addc_u32 s3, s3, s4 ; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5] -; GFX6-NEXT: v_cvt_f32_u32_e32 v9, s2 -; GFX6-NEXT: v_cvt_f32_u32_e32 v10, s3 -; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v7, v3, vcc -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s13, v3 -; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s12, v4 -; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s13, v3 -; GFX6-NEXT: v_mac_f32_e32 v9, s18, v10 -; GFX6-NEXT: v_cndmask_b32_e32 v3, v7, v4, vcc -; GFX6-NEXT: v_rcp_f32_e32 v4, v9 -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v3, v8, v6, s[0:1] -; GFX6-NEXT: v_mul_f32_e32 v4, s19, v4 -; GFX6-NEXT: v_mul_f32_e32 v5, s20, v4 -; GFX6-NEXT: v_trunc_f32_e32 v5, v5 -; GFX6-NEXT: v_mac_f32_e32 v4, s21, v5 +; GFX6-NEXT: v_cvt_f32_u32_e32 v8, s2 +; GFX6-NEXT: v_cvt_f32_u32_e32 v9, s3 +; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v6, v2, vcc +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s13, v2 +; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s12, v3 +; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s13, v2 +; GFX6-NEXT: v_mac_f32_e32 v8, s18, v9 +; GFX6-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc +; GFX6-NEXT: v_rcp_f32_e32 v3, v8 +; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v2, v7, v5, s[0:1] +; GFX6-NEXT: v_mul_f32_e32 v3, s19, v3 +; GFX6-NEXT: v_mul_f32_e32 v4, s20, v3 +; GFX6-NEXT: v_trunc_f32_e32 v4, v4 +; GFX6-NEXT: v_mac_f32_e32 v3, s21, v4 +; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_u32_f32_e32 v5, v5 ; GFX6-NEXT: s_sub_u32 s0, 0, s2 -; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX6-NEXT: v_mul_hi_u32 v3, s0, v4 -; GFX6-NEXT: v_mul_lo_u32 v6, s0, v5 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX6-NEXT: v_mul_hi_u32 v2, s0, v3 +; GFX6-NEXT: v_mul_lo_u32 v5, s0, v4 ; GFX6-NEXT: s_subb_u32 s1, 0, s3 -; GFX6-NEXT: v_mul_lo_u32 v7, s1, v4 +; GFX6-NEXT: v_mul_lo_u32 v6, s1, v3 ; GFX6-NEXT: s_ashr_i32 s12, s7, 31 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GFX6-NEXT: v_mul_lo_u32 v6, s0, v4 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v7 -; GFX6-NEXT: v_mul_lo_u32 v7, v4, v3 -; GFX6-NEXT: v_mul_hi_u32 v8, v4, v6 -; GFX6-NEXT: v_mul_hi_u32 v9, v4, v3 -; GFX6-NEXT: v_mul_hi_u32 v10, v5, v3 -; GFX6-NEXT: v_mul_lo_u32 v3, v5, v3 -; GFX6-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GFX6-NEXT: v_addc_u32_e32 v8, vcc, 0, v9, vcc -; GFX6-NEXT: v_mul_lo_u32 v9, v5, v6 -; GFX6-NEXT: v_mul_hi_u32 v6, v5, v6 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; GFX6-NEXT: v_mul_lo_u32 v5, s0, v3 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; GFX6-NEXT: v_mul_lo_u32 v6, v3, v2 +; GFX6-NEXT: v_mul_hi_u32 v7, v3, v5 +; GFX6-NEXT: v_mul_hi_u32 v8, v3, v2 +; GFX6-NEXT: v_mul_hi_u32 v9, v4, v2 +; GFX6-NEXT: v_mul_lo_u32 v2, v4, v2 +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc +; GFX6-NEXT: v_mul_lo_u32 v8, v4, v5 +; GFX6-NEXT: v_mul_hi_u32 v5, v4, v5 ; GFX6-NEXT: s_mov_b32 s13, s12 -; GFX6-NEXT: v_xor_b32_e32 v1, s14, v1 -; GFX6-NEXT: v_add_i32_e32 v7, vcc, v7, v9 -; GFX6-NEXT: v_addc_u32_e32 v6, vcc, v8, v6, vcc -; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v10, v0, vcc -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v6, v3 -; GFX6-NEXT: v_addc_u32_e32 v6, vcc, 0, v7, vcc -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v5, v6, vcc -; GFX6-NEXT: v_mul_lo_u32 v5, s0, v4 -; GFX6-NEXT: v_mul_hi_u32 v6, s0, v3 -; GFX6-NEXT: v_mul_lo_u32 v7, s1, v3 -; GFX6-NEXT: v_xor_b32_e32 v2, s15, v2 -; GFX6-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GFX6-NEXT: v_mul_lo_u32 v6, s0, v3 -; GFX6-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; GFX6-NEXT: v_mul_lo_u32 v9, v3, v5 -; GFX6-NEXT: v_mul_hi_u32 v10, v3, v6 -; GFX6-NEXT: v_mul_hi_u32 v11, v3, v5 -; GFX6-NEXT: v_mul_hi_u32 v8, v4, v6 -; GFX6-NEXT: v_mul_lo_u32 v6, v4, v6 -; GFX6-NEXT: v_mul_hi_u32 v7, v4, v5 -; GFX6-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; GFX6-NEXT: v_addc_u32_e32 v10, vcc, 0, v11, vcc -; GFX6-NEXT: v_mul_lo_u32 v5, v4, v5 -; GFX6-NEXT: v_add_i32_e32 v6, vcc, v9, v6 -; GFX6-NEXT: v_addc_u32_e32 v6, vcc, v10, v8, vcc -; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v7, v0, vcc -; GFX6-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GFX6-NEXT: v_addc_u32_e32 v6, vcc, 0, v7, vcc +; GFX6-NEXT: v_xor_b32_e32 v0, s14, v0 +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v7, v5, vcc +; GFX6-NEXT: v_addc_u32_e32 v6, vcc, 0, v9, vcc +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v5, v2 +; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v4, v5, vcc +; GFX6-NEXT: v_mul_lo_u32 v4, s0, v3 +; GFX6-NEXT: v_mul_hi_u32 v5, s0, v2 +; GFX6-NEXT: v_mul_lo_u32 v6, s1, v2 +; GFX6-NEXT: v_xor_b32_e32 v1, s15, v1 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; GFX6-NEXT: v_mul_lo_u32 v5, s0, v2 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; GFX6-NEXT: v_mul_lo_u32 v8, v2, v4 +; GFX6-NEXT: v_mul_hi_u32 v9, v2, v5 +; GFX6-NEXT: v_mul_hi_u32 v10, v2, v4 +; GFX6-NEXT: v_mul_hi_u32 v7, v3, v5 +; GFX6-NEXT: v_mul_lo_u32 v5, v3, v5 +; GFX6-NEXT: v_mul_hi_u32 v6, v3, v4 +; GFX6-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GFX6-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc +; GFX6-NEXT: v_mul_lo_u32 v4, v3, v4 +; GFX6-NEXT: v_add_i32_e32 v5, vcc, v8, v5 +; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v9, v7, vcc +; GFX6-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc ; GFX6-NEXT: s_add_u32 s0, s6, s12 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: s_addc_u32 s1, s7, s12 -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc ; GFX6-NEXT: s_xor_b64 s[6:7], s[0:1], s[12:13] -; GFX6-NEXT: v_mul_lo_u32 v5, s6, v4 -; GFX6-NEXT: v_mul_hi_u32 v6, s6, v3 -; GFX6-NEXT: v_mul_hi_u32 v8, s6, v4 -; GFX6-NEXT: v_mul_hi_u32 v9, s7, v4 -; GFX6-NEXT: v_mul_lo_u32 v4, s7, v4 -; GFX6-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GFX6-NEXT: v_addc_u32_e32 v6, vcc, 0, v8, vcc -; GFX6-NEXT: v_mul_lo_u32 v8, s7, v3 -; GFX6-NEXT: v_mul_hi_u32 v3, s7, v3 -; GFX6-NEXT: v_mov_b32_e32 v7, s15 -; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v6, v3, vcc -; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v9, v0, vcc -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v0, vcc -; GFX6-NEXT: v_mul_lo_u32 v5, s2, v4 -; GFX6-NEXT: v_mul_hi_u32 v6, s2, v3 -; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s14, v1 -; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v2, v7, vcc -; GFX6-NEXT: v_mul_lo_u32 v2, s3, v3 -; GFX6-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; GFX6-NEXT: v_mul_lo_u32 v4, s6, v3 +; GFX6-NEXT: v_mul_hi_u32 v5, s6, v2 +; GFX6-NEXT: v_mul_hi_u32 v7, s6, v3 +; GFX6-NEXT: v_mul_hi_u32 v8, s7, v3 +; GFX6-NEXT: v_mul_lo_u32 v3, s7, v3 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; GFX6-NEXT: v_mul_lo_u32 v7, s7, v2 +; GFX6-NEXT: v_mul_hi_u32 v2, s7, v2 +; GFX6-NEXT: v_mov_b32_e32 v6, s15 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v5, v2, vcc +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GFX6-NEXT: v_mul_lo_u32 v4, s2, v3 +; GFX6-NEXT: v_mul_hi_u32 v5, s2, v2 +; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s14, v0 +; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc +; GFX6-NEXT: v_mul_lo_u32 v6, s3, v2 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; GFX6-NEXT: v_mul_lo_u32 v5, s2, v2 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 +; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s7, v4 ; GFX6-NEXT: v_mov_b32_e32 v7, s3 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; GFX6-NEXT: v_mul_lo_u32 v5, s2, v3 -; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s7, v2 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s6, v5 ; GFX6-NEXT: v_subb_u32_e64 v6, s[0:1], v6, v7, vcc ; GFX6-NEXT: v_subrev_i32_e64 v7, s[0:1], s2, v5 @@ -9301,30 +9277,30 @@ ; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] ; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v6 ; GFX6-NEXT: v_cndmask_b32_e64 v6, v8, v7, s[0:1] -; GFX6-NEXT: v_add_i32_e64 v7, s[0:1], 2, v3 -; GFX6-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v4, s[0:1] -; GFX6-NEXT: v_add_i32_e64 v9, s[0:1], 1, v3 -; GFX6-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v4, s[0:1] +; GFX6-NEXT: v_add_i32_e64 v7, s[0:1], 2, v2 +; GFX6-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v3, s[0:1] +; GFX6-NEXT: v_add_i32_e64 v9, s[0:1], 1, v2 +; GFX6-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v3, s[0:1] ; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 ; GFX6-NEXT: v_cndmask_b32_e64 v6, v10, v8, s[0:1] ; GFX6-NEXT: v_mov_b32_e32 v8, s7 -; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v8, v2, vcc -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s3, v2 +; GFX6-NEXT: v_subb_u32_e32 v4, vcc, v8, v4, vcc +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s3, v4 ; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s2, v5 ; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s3, v2 -; GFX6-NEXT: v_cndmask_b32_e32 v2, v8, v5, vcc -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GFX6-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s3, v4 +; GFX6-NEXT: v_cndmask_b32_e32 v4, v8, v5, vcc +; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; GFX6-NEXT: v_cndmask_b32_e64 v4, v9, v7, s[0:1] -; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX6-NEXT: s_xor_b64 s[0:1], s[12:13], s[4:5] -; GFX6-NEXT: v_xor_b32_e32 v3, s0, v3 -; GFX6-NEXT: v_xor_b32_e32 v4, s1, v2 -; GFX6-NEXT: v_mov_b32_e32 v5, s1 -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s0, v3 -; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v4, v5, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; GFX6-NEXT: v_xor_b32_e32 v2, s0, v2 +; GFX6-NEXT: v_xor_b32_e32 v3, s1, v3 +; GFX6-NEXT: v_mov_b32_e32 v4, s1 +; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s0, v2 +; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; GFX6-NEXT: s_endpgm @@ -9358,75 +9334,75 @@ ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 ; GFX9-NEXT: v_mac_f32_e32 v0, s19, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v0 -; GFX9-NEXT: v_mul_lo_u32 v0, s2, v1 -; GFX9-NEXT: v_mul_hi_u32 v3, s2, v2 -; GFX9-NEXT: v_mul_lo_u32 v5, s3, v2 -; GFX9-NEXT: v_mul_lo_u32 v4, s2, v2 -; GFX9-NEXT: v_add_u32_e32 v0, v3, v0 -; GFX9-NEXT: v_add_u32_e32 v5, v0, v5 -; GFX9-NEXT: v_mul_hi_u32 v3, v2, v4 -; GFX9-NEXT: v_mul_lo_u32 v6, v2, v5 -; GFX9-NEXT: v_mul_hi_u32 v7, v2, v5 -; GFX9-NEXT: v_mul_hi_u32 v8, v1, v5 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v7, vcc -; GFX9-NEXT: v_mul_lo_u32 v7, v1, v4 +; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX9-NEXT: v_mul_lo_u32 v2, s2, v1 +; GFX9-NEXT: v_mul_hi_u32 v3, s2, v0 +; GFX9-NEXT: v_mul_lo_u32 v5, s3, v0 +; GFX9-NEXT: v_mul_lo_u32 v4, s2, v0 +; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 +; GFX9-NEXT: v_mul_hi_u32 v3, v0, v4 +; GFX9-NEXT: v_mul_lo_u32 v5, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v6, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v7, v1, v2 +; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v6, vcc +; GFX9-NEXT: v_mul_lo_u32 v6, v1, v4 ; GFX9-NEXT: v_mul_hi_u32 v4, v1, v4 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v4, vcc -; GFX9-NEXT: v_mul_lo_u32 v4, v1, v5 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v8, v0, vcc -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc -; GFX9-NEXT: v_mul_lo_u32 v3, s2, v1 -; GFX9-NEXT: v_mul_hi_u32 v4, s2, v2 -; GFX9-NEXT: v_mul_lo_u32 v5, s3, v2 -; GFX9-NEXT: v_mul_lo_u32 v6, s2, v2 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v4, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX9-NEXT: v_mul_lo_u32 v2, s2, v1 +; GFX9-NEXT: v_mul_hi_u32 v3, s2, v0 +; GFX9-NEXT: v_mul_lo_u32 v4, s3, v0 +; GFX9-NEXT: v_mul_lo_u32 v5, s2, v0 ; GFX9-NEXT: s_add_u32 s2, s4, s14 -; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v5 -; GFX9-NEXT: v_mul_lo_u32 v7, v2, v3 -; GFX9-NEXT: v_mul_hi_u32 v8, v2, v6 -; GFX9-NEXT: v_mul_hi_u32 v9, v2, v3 -; GFX9-NEXT: v_mul_hi_u32 v5, v1, v6 -; GFX9-NEXT: v_mul_lo_u32 v6, v1, v6 -; GFX9-NEXT: v_mul_hi_u32 v4, v1, v3 -; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v8, v7 -; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v9, vcc -; GFX9-NEXT: v_mul_lo_u32 v3, v1, v3 +; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 +; GFX9-NEXT: v_mul_lo_u32 v6, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v7, v0, v5 +; GFX9-NEXT: v_mul_hi_u32 v8, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v4, v1, v5 +; GFX9-NEXT: v_mul_lo_u32 v5, v1, v5 +; GFX9-NEXT: v_mul_hi_u32 v3, v1, v2 ; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v7, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v8, v5, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v0, vcc -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v5, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v8, vcc +; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v6, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v4, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v4, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 ; GFX9-NEXT: s_addc_u32 s3, s5, s14 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc ; GFX9-NEXT: s_xor_b64 s[4:5], s[2:3], s[14:15] -; GFX9-NEXT: v_mul_lo_u32 v3, s4, v1 -; GFX9-NEXT: v_mul_hi_u32 v4, s4, v2 -; GFX9-NEXT: v_mul_hi_u32 v5, s4, v1 -; GFX9-NEXT: v_mul_hi_u32 v6, s5, v1 +; GFX9-NEXT: v_mul_lo_u32 v2, s4, v1 +; GFX9-NEXT: v_mul_hi_u32 v3, s4, v0 +; GFX9-NEXT: v_mul_hi_u32 v4, s4, v1 +; GFX9-NEXT: v_mul_hi_u32 v5, s5, v1 ; GFX9-NEXT: v_mul_lo_u32 v1, s5, v1 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc -; GFX9-NEXT: v_mul_lo_u32 v5, s5, v2 -; GFX9-NEXT: v_mul_hi_u32 v2, s5, v2 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc +; GFX9-NEXT: v_mul_lo_u32 v4, s5, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, s5, v0 +; GFX9-NEXT: v_mov_b32_e32 v6, s9 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_xor_b64 s[12:13], s[14:15], s[12:13] -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v4, v2, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v0, vcc -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v2, v1 -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v0, v1 +; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc ; GFX9-NEXT: v_mul_lo_u32 v3, s8, v2 ; GFX9-NEXT: v_mul_hi_u32 v4, s8, v1 ; GFX9-NEXT: v_mul_lo_u32 v5, s9, v1 -; GFX9-NEXT: v_mov_b32_e32 v6, s9 +; GFX9-NEXT: s_xor_b64 s[12:13], s[14:15], s[12:13] +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 ; GFX9-NEXT: v_mul_lo_u32 v4, s8, v1 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v5 @@ -9495,7 +9471,7 @@ ; GFX9-NEXT: s_mov_b32 s9, s8 ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v9 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v8, v3, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v10, v0, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v10, vcc ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 ; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v7, vcc ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 @@ -9518,7 +9494,7 @@ ; GFX9-NEXT: v_mul_lo_u32 v5, v4, v5 ; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v9, v8 ; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v10, v7, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v6, v0, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 ; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 @@ -9538,7 +9514,7 @@ ; GFX9-NEXT: v_xor_b32_e32 v2, s13, v2 ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v8 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v3, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v9, v0, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v9, vcc ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc ; GFX9-NEXT: v_mul_lo_u32 v5, s10, v4 @@ -9607,8 +9583,8 @@ ; GFX6-NEXT: v_madak_f32 v0, 0, v0, 0x4996c7d8 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 ; GFX6-NEXT: s_mov_b32 s4, 0xffed2705 -; GFX6-NEXT: v_mov_b32_e32 v5, 0 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 @@ -9626,42 +9602,41 @@ ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v3, v0, v4 -; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v7, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v8, v1, v2 +; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2 +; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 +; GFX6-NEXT: v_mul_hi_u32 v7, v1, v2 ; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GFX6-NEXT: v_addc_u32_e32 v6, vcc, 0, v7, vcc -; GFX6-NEXT: v_mul_lo_u32 v7, v1, v4 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc +; GFX6-NEXT: v_mul_lo_u32 v6, v1, v4 ; GFX6-NEXT: v_mul_hi_u32 v4, v1, v4 ; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[8:9] ; GFX6-NEXT: s_mov_b32 s5, s1 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v7 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v6, v4, vcc -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v8, v5, vcc +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc ; GFX6-NEXT: v_mul_lo_u32 v2, v1, s4 ; GFX6-NEXT: v_mul_hi_u32 v3, v0, s4 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_mul_lo_u32 v3, v0, s4 ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 -; GFX6-NEXT: v_mul_lo_u32 v7, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v8, v0, v3 -; GFX6-NEXT: v_mul_hi_u32 v9, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v6, v1, v3 +; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2 +; GFX6-NEXT: v_mul_hi_u32 v7, v0, v3 +; GFX6-NEXT: v_mul_hi_u32 v8, v0, v2 +; GFX6-NEXT: v_mul_hi_u32 v5, v1, v3 ; GFX6-NEXT: v_mul_lo_u32 v3, v1, v3 ; GFX6-NEXT: v_mul_hi_u32 v4, v1, v2 -; GFX6-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GFX6-NEXT: v_addc_u32_e32 v8, vcc, 0, v9, vcc +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc ; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v6, vcc -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v4, v5, vcc +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v6, v3 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 @@ -9669,7 +9644,7 @@ ; GFX6-NEXT: v_mul_lo_u32 v2, s2, v1 ; GFX6-NEXT: v_mul_hi_u32 v3, s2, v0 ; GFX6-NEXT: v_mul_hi_u32 v4, s2, v1 -; GFX6-NEXT: v_mul_hi_u32 v6, s3, v1 +; GFX6-NEXT: v_mul_hi_u32 v5, s3, v1 ; GFX6-NEXT: v_mul_lo_u32 v1, s3, v1 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc @@ -9679,7 +9654,7 @@ ; GFX6-NEXT: s_mov_b32 s0, 0x12d8fb ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc -; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v6, v5, vcc +; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc ; GFX6-NEXT: v_mul_lo_u32 v1, v1, s0 @@ -9722,7 +9697,6 @@ ; GFX9-NEXT: v_madak_f32 v0, 0, v0, 0x4996c7d8 ; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: s_mov_b32 s2, 0xffed2705 -; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 @@ -9737,16 +9711,16 @@ ; GFX9-NEXT: v_sub_u32_e32 v2, v2, v0 ; GFX9-NEXT: v_mul_hi_u32 v3, v0, v4 ; GFX9-NEXT: v_mul_lo_u32 v6, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v8, v0, v2 -; GFX9-NEXT: v_mul_lo_u32 v7, v1, v4 +; GFX9-NEXT: v_mul_hi_u32 v7, v0, v2 +; GFX9-NEXT: v_mul_lo_u32 v5, v1, v4 ; GFX9-NEXT: v_mul_hi_u32 v4, v1, v4 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v8, vcc ; GFX9-NEXT: v_mul_hi_u32 v8, v1, v2 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v7, vcc ; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v4, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v5, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v8, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 @@ -9759,18 +9733,18 @@ ; GFX9-NEXT: s_add_u32 s0, s6, s2 ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 ; GFX9-NEXT: v_sub_u32_e32 v2, v2, v0 -; GFX9-NEXT: v_mul_lo_u32 v7, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v8, v0, v4 -; GFX9-NEXT: v_mul_hi_u32 v9, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v6, v1, v4 +; GFX9-NEXT: v_mul_lo_u32 v6, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v7, v0, v4 +; GFX9-NEXT: v_mul_hi_u32 v8, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v5, v1, v4 ; GFX9-NEXT: v_mul_lo_u32 v4, v1, v4 ; GFX9-NEXT: v_mul_hi_u32 v3, v1, v2 -; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v8, v7 -; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v7, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v8, vcc ; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v7, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v6, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v6, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v5, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v4, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 @@ -9781,7 +9755,7 @@ ; GFX9-NEXT: v_mul_lo_u32 v2, s0, v1 ; GFX9-NEXT: v_mul_hi_u32 v3, s0, v0 ; GFX9-NEXT: v_mul_hi_u32 v4, s0, v1 -; GFX9-NEXT: v_mul_hi_u32 v6, s1, v1 +; GFX9-NEXT: v_mul_hi_u32 v5, s1, v1 ; GFX9-NEXT: v_mul_lo_u32 v1, s1, v1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc @@ -9790,41 +9764,42 @@ ; GFX9-NEXT: s_mov_b32 s3, 0x12d8fb ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v6, v5, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v5, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc ; GFX9-NEXT: v_mul_lo_u32 v1, v1, s3 ; GFX9-NEXT: v_mul_hi_u32 v2, v0, s3 ; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v0 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc ; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s3, v0 -; GFX9-NEXT: v_subbrev_co_u32_e32 v3, vcc, 0, v1, vcc -; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s3, v2 -; GFX9-NEXT: v_subbrev_co_u32_e32 v6, vcc, 0, v3, vcc +; GFX9-NEXT: v_subbrev_co_u32_e32 v4, vcc, 0, v1, vcc +; GFX9-NEXT: v_subrev_co_u32_e32 v5, vcc, s3, v2 +; GFX9-NEXT: v_subbrev_co_u32_e32 v6, vcc, 0, v4, vcc ; GFX9-NEXT: s_mov_b32 s0, 0x12d8fa ; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s0, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 ; GFX9-NEXT: v_cndmask_b32_e32 v7, -1, v7, vcc ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 ; GFX9-NEXT: v_cmp_lt_u32_e64 s[0:1], s0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v6, -1, v6, s[0:1] ; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[0:1] ; GFX9-NEXT: v_xor_b32_e32 v0, s2, v0 ; GFX9-NEXT: v_xor_b32_e32 v1, s2, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s2, v0 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v2, vcc -; GFX9-NEXT: global_store_dwordx2 v5, v[0:1], s[4:5] +; GFX9-NEXT: global_store_dwordx2 v3, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm %r = srem i64 %x, 1235195 store i64 %r, i64 addrspace(1)* %out @@ -9934,48 +9909,47 @@ ; GFX6-NEXT: s_xor_b64 s[12:13], s[2:3], s[10:11] ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc -; GFX6-NEXT: v_mov_b32_e32 v4, 0 -; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v7, v4, vcc +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc ; GFX6-NEXT: v_mul_lo_u32 v2, s4, v1 ; GFX6-NEXT: v_mul_hi_u32 v3, s4, v0 -; GFX6-NEXT: v_mul_lo_u32 v5, s5, v0 +; GFX6-NEXT: v_mul_lo_u32 v4, s5, v0 ; GFX6-NEXT: s_mov_b32 s5, s1 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_mul_lo_u32 v3, s4, v0 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; GFX6-NEXT: v_mul_lo_u32 v7, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v8, v0, v3 -; GFX6-NEXT: v_mul_hi_u32 v9, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v6, v1, v3 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2 +; GFX6-NEXT: v_mul_hi_u32 v7, v0, v3 +; GFX6-NEXT: v_mul_hi_u32 v8, v0, v2 +; GFX6-NEXT: v_mul_hi_u32 v5, v1, v3 ; GFX6-NEXT: v_mul_lo_u32 v3, v1, v3 -; GFX6-NEXT: v_mul_hi_u32 v5, v1, v2 -; GFX6-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GFX6-NEXT: v_addc_u32_e32 v8, vcc, 0, v9, vcc +; GFX6-NEXT: v_mul_hi_u32 v4, v1, v2 +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc ; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v6, vcc -; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v5, v4, vcc +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v6, v3 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc ; GFX6-NEXT: v_mul_lo_u32 v2, s12, v1 ; GFX6-NEXT: v_mul_hi_u32 v3, s12, v0 -; GFX6-NEXT: v_mul_hi_u32 v5, s12, v1 -; GFX6-NEXT: v_mul_hi_u32 v6, s13, v1 +; GFX6-NEXT: v_mul_hi_u32 v4, s12, v1 +; GFX6-NEXT: v_mul_hi_u32 v5, s13, v1 ; GFX6-NEXT: v_mul_lo_u32 v1, s13, v1 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc -; GFX6-NEXT: v_mul_lo_u32 v5, s13, v0 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GFX6-NEXT: v_mul_lo_u32 v4, s13, v0 ; GFX6-NEXT: v_mul_hi_u32 v0, s13, v0 ; GFX6-NEXT: s_mov_b32 s4, s0 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc -; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v6, v4, vcc +; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc ; GFX6-NEXT: v_mul_lo_u32 v1, s8, v1 @@ -10025,7 +9999,6 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x34 ; GFX9-NEXT: s_mov_b64 s[2:3], 0x1000 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 ; GFX9-NEXT: s_ashr_i32 s4, s3, 31 @@ -10050,89 +10023,90 @@ ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: s_add_u32 s0, s6, s10 ; GFX9-NEXT: s_mov_b32 s11, s10 -; GFX9-NEXT: v_mul_lo_u32 v3, s2, v1 -; GFX9-NEXT: v_mul_hi_u32 v4, s2, v0 -; GFX9-NEXT: v_mul_lo_u32 v6, s3, v0 -; GFX9-NEXT: v_mul_lo_u32 v5, s2, v0 -; GFX9-NEXT: s_addc_u32 s1, s7, s10 -; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v6 -; GFX9-NEXT: v_mul_hi_u32 v4, v0, v5 -; GFX9-NEXT: v_mul_lo_u32 v6, v0, v3 -; GFX9-NEXT: v_mul_hi_u32 v8, v0, v3 -; GFX9-NEXT: v_mul_hi_u32 v7, v1, v5 -; GFX9-NEXT: v_mul_lo_u32 v5, v1, v5 -; GFX9-NEXT: v_mul_hi_u32 v9, v1, v3 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v8, vcc -; GFX9-NEXT: v_mul_lo_u32 v3, v1, v3 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v6, v7, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v9, v2, vcc -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc -; GFX9-NEXT: v_mul_lo_u32 v3, s2, v1 -; GFX9-NEXT: v_mul_hi_u32 v4, s2, v0 +; GFX9-NEXT: v_mul_lo_u32 v2, s2, v1 +; GFX9-NEXT: v_mul_hi_u32 v3, s2, v0 ; GFX9-NEXT: v_mul_lo_u32 v5, s3, v0 -; GFX9-NEXT: v_mul_lo_u32 v6, s2, v0 +; GFX9-NEXT: v_mul_lo_u32 v4, s2, v0 +; GFX9-NEXT: s_addc_u32 s1, s7, s10 +; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 +; GFX9-NEXT: v_mul_hi_u32 v3, v0, v4 +; GFX9-NEXT: v_mul_lo_u32 v5, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v7, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v6, v1, v4 +; GFX9-NEXT: v_mul_lo_u32 v4, v1, v4 +; GFX9-NEXT: v_mul_hi_u32 v8, v1, v2 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc +; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v6, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v8, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX9-NEXT: v_mul_lo_u32 v2, s2, v1 +; GFX9-NEXT: v_mul_hi_u32 v3, s2, v0 +; GFX9-NEXT: v_mul_lo_u32 v4, s3, v0 +; GFX9-NEXT: v_mul_lo_u32 v5, s2, v0 ; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], s[10:11] -; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v5 -; GFX9-NEXT: v_mul_lo_u32 v7, v0, v3 -; GFX9-NEXT: v_mul_hi_u32 v8, v0, v6 -; GFX9-NEXT: v_mul_hi_u32 v9, v0, v3 -; GFX9-NEXT: v_mul_hi_u32 v5, v1, v6 -; GFX9-NEXT: v_mul_lo_u32 v6, v1, v6 -; GFX9-NEXT: v_mul_hi_u32 v4, v1, v3 -; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v8, v7 -; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v9, vcc -; GFX9-NEXT: v_mul_lo_u32 v3, v1, v3 +; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 +; GFX9-NEXT: v_mul_lo_u32 v6, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v7, v0, v5 +; GFX9-NEXT: v_mul_hi_u32 v8, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v4, v1, v5 +; GFX9-NEXT: v_mul_lo_u32 v5, v1, v5 +; GFX9-NEXT: v_mul_hi_u32 v3, v1, v2 ; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v7, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v8, v5, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v2, vcc -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v5, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc -; GFX9-NEXT: v_mul_lo_u32 v3, s6, v1 -; GFX9-NEXT: v_mul_hi_u32 v4, s6, v0 -; GFX9-NEXT: v_mul_hi_u32 v5, s6, v1 -; GFX9-NEXT: v_mul_hi_u32 v6, s7, v1 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v8, vcc +; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v6, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v4, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v4, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX9-NEXT: v_mul_lo_u32 v2, s6, v1 +; GFX9-NEXT: v_mul_hi_u32 v3, s6, v0 +; GFX9-NEXT: v_mul_hi_u32 v4, s6, v1 +; GFX9-NEXT: v_mul_hi_u32 v5, s7, v1 ; GFX9-NEXT: v_mul_lo_u32 v1, s7, v1 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc -; GFX9-NEXT: v_mul_lo_u32 v5, s7, v0 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc +; GFX9-NEXT: v_mul_lo_u32 v4, s7, v0 ; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 -; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v4, v0, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v2, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v5, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc ; GFX9-NEXT: v_mul_lo_u32 v1, s8, v1 -; GFX9-NEXT: v_mul_hi_u32 v3, s8, v0 -; GFX9-NEXT: v_mul_lo_u32 v4, s9, v0 +; GFX9-NEXT: v_mul_hi_u32 v2, s8, v0 +; GFX9-NEXT: v_mul_lo_u32 v3, s9, v0 ; GFX9-NEXT: v_mul_lo_u32 v0, s8, v0 -; GFX9-NEXT: v_add_u32_e32 v1, v3, v1 -; GFX9-NEXT: v_add_u32_e32 v1, v1, v4 -; GFX9-NEXT: v_sub_u32_e32 v3, s7, v1 -; GFX9-NEXT: v_mov_b32_e32 v4, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 +; GFX9-NEXT: v_sub_u32_e32 v2, s7, v1 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s6, v0 -; GFX9-NEXT: v_subb_co_u32_e64 v3, s[0:1], v3, v4, vcc +; GFX9-NEXT: v_subb_co_u32_e64 v2, s[0:1], v2, v3, vcc ; GFX9-NEXT: v_subrev_co_u32_e64 v5, s[0:1], s8, v0 -; GFX9-NEXT: v_subbrev_co_u32_e64 v6, s[2:3], 0, v3, s[0:1] +; GFX9-NEXT: v_subbrev_co_u32_e64 v6, s[2:3], 0, v2, s[0:1] ; GFX9-NEXT: v_cmp_le_u32_e64 s[2:3], s9, v6 ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[2:3] ; GFX9-NEXT: v_cmp_le_u32_e64 s[2:3], s8, v5 -; GFX9-NEXT: v_subb_co_u32_e64 v3, s[0:1], v3, v4, s[0:1] +; GFX9-NEXT: v_subb_co_u32_e64 v2, s[0:1], v2, v3, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[2:3] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s9, v6 -; GFX9-NEXT: v_subrev_co_u32_e64 v4, s[0:1], s8, v5 +; GFX9-NEXT: v_subrev_co_u32_e64 v3, s[0:1], s8, v5 ; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[2:3] -; GFX9-NEXT: v_subbrev_co_u32_e64 v3, s[0:1], 0, v3, s[0:1] +; GFX9-NEXT: v_subbrev_co_u32_e64 v2, s[0:1], 0, v2, s[0:1] ; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v3, v6, v3, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v6, s7 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v6, v1, vcc ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 @@ -10142,15 +10116,15 @@ ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s9, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v3, v5, v4, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v2, v5, v3, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, s10, v0 ; GFX9-NEXT: v_xor_b32_e32 v1, s10, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, s10 +; GFX9-NEXT: v_mov_b32_e32 v2, s10 ; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s10, v0 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v2, vcc +; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm %shl.y = shl i64 4096, %y %r = srem i64 %x, %shl.y @@ -10173,24 +10147,23 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX6-NEXT: s_movk_i32 s8, 0xf000 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_ashr_i32 s9, s5, 31 -; GFX6-NEXT: s_lshr_b32 s9, s9, 20 -; GFX6-NEXT: s_add_u32 s9, s4, s9 -; GFX6-NEXT: s_addc_u32 s10, s5, 0 -; GFX6-NEXT: s_and_b32 s9, s9, s8 -; GFX6-NEXT: s_sub_u32 s4, s4, s9 -; GFX6-NEXT: s_subb_u32 s5, s5, s10 -; GFX6-NEXT: s_ashr_i32 s9, s7, 31 -; GFX6-NEXT: s_lshr_b32 s9, s9, 20 -; GFX6-NEXT: s_add_u32 s9, s6, s9 -; GFX6-NEXT: s_addc_u32 s10, s7, 0 -; GFX6-NEXT: s_and_b32 s8, s9, s8 +; GFX6-NEXT: s_ashr_i32 s8, s5, 31 +; GFX6-NEXT: s_lshr_b32 s8, s8, 20 +; GFX6-NEXT: s_add_u32 s8, s4, s8 +; GFX6-NEXT: s_addc_u32 s9, s5, 0 +; GFX6-NEXT: s_and_b32 s8, s8, 0xfffff000 +; GFX6-NEXT: s_sub_u32 s4, s4, s8 +; GFX6-NEXT: s_subb_u32 s5, s5, s9 +; GFX6-NEXT: s_ashr_i32 s8, s7, 31 +; GFX6-NEXT: s_lshr_b32 s8, s8, 20 +; GFX6-NEXT: s_add_u32 s8, s6, s8 +; GFX6-NEXT: s_addc_u32 s9, s7, 0 +; GFX6-NEXT: s_and_b32 s8, s8, 0xfffff000 ; GFX6-NEXT: s_sub_u32 s6, s6, s8 -; GFX6-NEXT: s_subb_u32 s7, s7, s10 +; GFX6-NEXT: s_subb_u32 s7, s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s6 @@ -10202,26 +10175,25 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_movk_i32 s0, 0xf000 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s1, s5, 31 -; GFX9-NEXT: s_lshr_b32 s1, s1, 20 -; GFX9-NEXT: s_add_u32 s1, s4, s1 -; GFX9-NEXT: s_addc_u32 s8, s5, 0 -; GFX9-NEXT: s_and_b32 s1, s1, s0 -; GFX9-NEXT: s_sub_u32 s1, s4, s1 -; GFX9-NEXT: s_subb_u32 s4, s5, s8 -; GFX9-NEXT: s_ashr_i32 s5, s7, 31 -; GFX9-NEXT: s_lshr_b32 s5, s5, 20 -; GFX9-NEXT: s_add_u32 s5, s6, s5 -; GFX9-NEXT: s_addc_u32 s8, s7, 0 -; GFX9-NEXT: s_and_b32 s0, s5, s0 -; GFX9-NEXT: s_sub_u32 s0, s6, s0 -; GFX9-NEXT: s_subb_u32 s5, s7, s8 -; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: s_ashr_i32 s0, s5, 31 +; GFX9-NEXT: s_lshr_b32 s0, s0, 20 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: s_addc_u32 s1, s5, 0 +; GFX9-NEXT: s_and_b32 s0, s0, 0xfffff000 +; GFX9-NEXT: s_sub_u32 s0, s4, s0 +; GFX9-NEXT: s_subb_u32 s1, s5, s1 +; GFX9-NEXT: s_ashr_i32 s4, s7, 31 +; GFX9-NEXT: s_lshr_b32 s4, s4, 20 +; GFX9-NEXT: s_add_u32 s4, s6, s4 +; GFX9-NEXT: s_addc_u32 s5, s7, 0 +; GFX9-NEXT: s_and_b32 s4, s4, 0xfffff000 +; GFX9-NEXT: s_sub_u32 s4, s6, s4 +; GFX9-NEXT: s_subb_u32 s5, s7, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX9-NEXT: s_endpgm @@ -10274,202 +10246,201 @@ ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 ; GFX6-NEXT: v_mac_f32_e32 v0, s21, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v0 +; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: s_mov_b32 s13, s12 ; GFX6-NEXT: s_addc_u32 s1, s5, s12 -; GFX6-NEXT: v_mul_lo_u32 v0, s2, v1 -; GFX6-NEXT: v_mul_hi_u32 v3, s2, v2 -; GFX6-NEXT: v_mul_lo_u32 v4, s3, v2 -; GFX6-NEXT: v_mul_lo_u32 v5, s2, v2 +; GFX6-NEXT: v_mul_lo_u32 v2, s2, v1 +; GFX6-NEXT: v_mul_hi_u32 v3, s2, v0 +; GFX6-NEXT: v_mul_lo_u32 v4, s3, v0 +; GFX6-NEXT: v_mul_lo_u32 v5, s2, v0 ; GFX6-NEXT: s_xor_b64 s[4:5], s[0:1], s[12:13] -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v3, v0 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v0, v4 -; GFX6-NEXT: v_mul_lo_u32 v0, v2, v3 -; GFX6-NEXT: v_mul_hi_u32 v4, v2, v5 -; GFX6-NEXT: v_mul_hi_u32 v6, v2, v3 -; GFX6-NEXT: v_mul_hi_u32 v7, v1, v3 -; GFX6-NEXT: v_mul_lo_u32 v3, v1, v3 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v4, v0 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GFX6-NEXT: v_mul_lo_u32 v3, v0, v2 +; GFX6-NEXT: v_mul_hi_u32 v4, v0, v5 +; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 +; GFX6-NEXT: v_mul_hi_u32 v7, v1, v2 +; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v6, vcc ; GFX6-NEXT: v_mul_lo_u32 v6, v1, v5 ; GFX6-NEXT: v_mul_hi_u32 v5, v1, v5 ; GFX6-NEXT: s_mov_b32 s11, 0xf000 ; GFX6-NEXT: s_mov_b32 s10, -1 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v4, v5, vcc -; GFX6-NEXT: v_mov_b32_e32 v0, 0 -; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v7, v0, vcc -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc -; GFX6-NEXT: v_mul_lo_u32 v3, s2, v1 -; GFX6-NEXT: v_mul_hi_u32 v4, s2, v2 -; GFX6-NEXT: v_mul_lo_u32 v5, s3, v2 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GFX6-NEXT: v_mul_lo_u32 v4, s2, v2 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GFX6-NEXT: v_mul_lo_u32 v7, v2, v3 -; GFX6-NEXT: v_mul_hi_u32 v8, v2, v4 -; GFX6-NEXT: v_mul_hi_u32 v9, v2, v3 -; GFX6-NEXT: v_mul_hi_u32 v6, v1, v4 -; GFX6-NEXT: v_mul_lo_u32 v4, v1, v4 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v4, v5, vcc +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GFX6-NEXT: v_mul_lo_u32 v2, s2, v1 +; GFX6-NEXT: v_mul_hi_u32 v3, s2, v0 +; GFX6-NEXT: v_mul_lo_u32 v4, s3, v0 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GFX6-NEXT: v_mul_lo_u32 v3, s2, v0 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2 +; GFX6-NEXT: v_mul_hi_u32 v7, v0, v3 +; GFX6-NEXT: v_mul_hi_u32 v8, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v5, v1, v3 -; GFX6-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GFX6-NEXT: v_addc_u32_e32 v8, vcc, 0, v9, vcc ; GFX6-NEXT: v_mul_lo_u32 v3, v1, v3 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v8, v6, vcc -; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v5, v0, vcc -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc -; GFX6-NEXT: v_mul_lo_u32 v3, s4, v1 -; GFX6-NEXT: v_mul_hi_u32 v4, s4, v2 -; GFX6-NEXT: v_mul_hi_u32 v5, s4, v1 -; GFX6-NEXT: v_mul_hi_u32 v6, s5, v1 +; GFX6-NEXT: v_mul_hi_u32 v4, v1, v2 +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc +; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v6, v3 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GFX6-NEXT: v_mul_lo_u32 v2, s4, v1 +; GFX6-NEXT: v_mul_hi_u32 v3, s4, v0 +; GFX6-NEXT: v_mul_hi_u32 v4, s4, v1 +; GFX6-NEXT: v_mul_hi_u32 v5, s5, v1 ; GFX6-NEXT: v_mul_lo_u32 v1, s5, v1 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc -; GFX6-NEXT: v_mul_lo_u32 v5, s5, v2 -; GFX6-NEXT: v_mul_hi_u32 v2, s5, v2 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v4, v2, vcc -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v6, v0, vcc -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 -; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc -; GFX6-NEXT: v_mul_lo_u32 v2, s16, v2 -; GFX6-NEXT: v_mul_hi_u32 v3, s16, v1 -; GFX6-NEXT: v_mul_lo_u32 v4, s17, v1 -; GFX6-NEXT: v_mul_lo_u32 v1, s16, v1 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GFX6-NEXT: v_mul_lo_u32 v4, s5, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s5, v2 -; GFX6-NEXT: v_mov_b32_e32 v4, s17 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s4, v1 -; GFX6-NEXT: v_subb_u32_e64 v3, s[0:1], v3, v4, vcc -; GFX6-NEXT: v_subrev_i32_e64 v5, s[0:1], s16, v1 -; GFX6-NEXT: v_subbrev_u32_e64 v6, s[2:3], 0, v3, s[0:1] -; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s17, v6 +; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc +; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc +; GFX6-NEXT: v_mul_lo_u32 v1, s16, v1 +; GFX6-NEXT: v_mul_hi_u32 v2, s16, v0 +; GFX6-NEXT: v_mul_lo_u32 v3, s17, v0 +; GFX6-NEXT: v_mul_lo_u32 v0, s16, v0 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s5, v1 +; GFX6-NEXT: v_mov_b32_e32 v3, s17 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 +; GFX6-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, vcc +; GFX6-NEXT: v_subrev_i32_e64 v4, s[0:1], s16, v0 +; GFX6-NEXT: v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1] +; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s17, v5 +; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[2:3] +; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s16, v4 +; GFX6-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1] ; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[2:3] -; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s16, v5 -; GFX6-NEXT: v_subb_u32_e64 v3, s[0:1], v3, v4, s[0:1] -; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[2:3] -; GFX6-NEXT: v_cmp_eq_u32_e64 s[2:3], s17, v6 -; GFX6-NEXT: v_subrev_i32_e64 v4, s[0:1], s16, v5 -; GFX6-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[2:3] -; GFX6-NEXT: v_subbrev_u32_e64 v3, s[0:1], 0, v3, s[0:1] +; GFX6-NEXT: v_cmp_eq_u32_e64 s[2:3], s17, v5 +; GFX6-NEXT: v_subrev_i32_e64 v3, s[0:1], s16, v4 +; GFX6-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[2:3] +; GFX6-NEXT: v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1] ; GFX6-NEXT: s_ashr_i32 s2, s15, 31 -; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v7 +; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 ; GFX6-NEXT: s_add_u32 s4, s14, s2 -; GFX6-NEXT: v_cndmask_b32_e64 v3, v6, v3, s[0:1] -; GFX6-NEXT: v_mov_b32_e32 v6, s5 +; GFX6-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[0:1] +; GFX6-NEXT: v_mov_b32_e32 v5, s5 ; GFX6-NEXT: s_mov_b32 s3, s2 ; GFX6-NEXT: s_addc_u32 s5, s15, s2 ; GFX6-NEXT: s_xor_b64 s[4:5], s[4:5], s[2:3] -; GFX6-NEXT: v_cvt_f32_u32_e32 v7, s4 -; GFX6-NEXT: v_cvt_f32_u32_e32 v8, s5 -; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v6, v2, vcc -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s17, v2 -; GFX6-NEXT: v_mac_f32_e32 v7, s18, v8 -; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s16, v1 -; GFX6-NEXT: v_rcp_f32_e32 v7, v7 -; GFX6-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s17, v2 -; GFX6-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v3, v5, v4, s[0:1] -; GFX6-NEXT: v_mul_f32_e32 v4, s19, v7 -; GFX6-NEXT: v_mul_f32_e32 v5, s20, v4 -; GFX6-NEXT: v_trunc_f32_e32 v5, v5 -; GFX6-NEXT: v_mac_f32_e32 v4, s21, v5 +; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s4 +; GFX6-NEXT: v_cvt_f32_u32_e32 v7, s5 +; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v5, v1, vcc +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s17, v1 +; GFX6-NEXT: v_mac_f32_e32 v6, s18, v7 +; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s16, v0 +; GFX6-NEXT: v_rcp_f32_e32 v6, v6 +; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s17, v1 +; GFX6-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc +; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v2, v4, v3, s[0:1] +; GFX6-NEXT: v_mul_f32_e32 v3, s19, v6 +; GFX6-NEXT: v_mul_f32_e32 v4, s20, v3 +; GFX6-NEXT: v_trunc_f32_e32 v4, v4 +; GFX6-NEXT: v_mac_f32_e32 v3, s21, v4 +; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_u32_f32_e32 v5, v5 ; GFX6-NEXT: s_sub_u32 s0, 0, s4 -; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX6-NEXT: v_mul_hi_u32 v3, s0, v4 -; GFX6-NEXT: v_mul_lo_u32 v6, s0, v5 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX6-NEXT: v_mul_hi_u32 v2, s0, v3 +; GFX6-NEXT: v_mul_lo_u32 v5, s0, v4 ; GFX6-NEXT: s_subb_u32 s1, 0, s5 -; GFX6-NEXT: v_mul_lo_u32 v7, s1, v4 +; GFX6-NEXT: v_mul_lo_u32 v6, s1, v3 ; GFX6-NEXT: s_ashr_i32 s14, s7, 31 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GFX6-NEXT: v_mul_lo_u32 v6, s0, v4 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v7 -; GFX6-NEXT: v_mul_lo_u32 v7, v4, v3 -; GFX6-NEXT: v_mul_hi_u32 v8, v4, v6 -; GFX6-NEXT: v_mul_hi_u32 v9, v4, v3 -; GFX6-NEXT: v_mul_hi_u32 v10, v5, v3 -; GFX6-NEXT: v_mul_lo_u32 v3, v5, v3 -; GFX6-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GFX6-NEXT: v_addc_u32_e32 v8, vcc, 0, v9, vcc -; GFX6-NEXT: v_mul_lo_u32 v9, v5, v6 -; GFX6-NEXT: v_mul_hi_u32 v6, v5, v6 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; GFX6-NEXT: v_mul_lo_u32 v5, s0, v3 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; GFX6-NEXT: v_mul_lo_u32 v6, v3, v2 +; GFX6-NEXT: v_mul_hi_u32 v7, v3, v5 +; GFX6-NEXT: v_mul_hi_u32 v8, v3, v2 +; GFX6-NEXT: v_mul_hi_u32 v9, v4, v2 +; GFX6-NEXT: v_mul_lo_u32 v2, v4, v2 +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc +; GFX6-NEXT: v_mul_lo_u32 v8, v4, v5 +; GFX6-NEXT: v_mul_hi_u32 v5, v4, v5 ; GFX6-NEXT: s_mov_b32 s15, s14 +; GFX6-NEXT: v_xor_b32_e32 v0, s12, v0 +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v7, v5, vcc +; GFX6-NEXT: v_addc_u32_e32 v6, vcc, 0, v9, vcc +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v5, v2 +; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v4, v5, vcc +; GFX6-NEXT: v_mul_lo_u32 v4, s0, v3 +; GFX6-NEXT: v_mul_hi_u32 v5, s0, v2 +; GFX6-NEXT: v_mul_lo_u32 v6, s1, v2 ; GFX6-NEXT: v_xor_b32_e32 v1, s12, v1 -; GFX6-NEXT: v_add_i32_e32 v7, vcc, v7, v9 -; GFX6-NEXT: v_addc_u32_e32 v6, vcc, v8, v6, vcc -; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v10, v0, vcc -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v6, v3 -; GFX6-NEXT: v_addc_u32_e32 v6, vcc, 0, v7, vcc -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v5, v6, vcc -; GFX6-NEXT: v_mul_lo_u32 v5, s0, v4 -; GFX6-NEXT: v_mul_hi_u32 v6, s0, v3 -; GFX6-NEXT: v_mul_lo_u32 v7, s1, v3 -; GFX6-NEXT: v_xor_b32_e32 v2, s12, v2 -; GFX6-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GFX6-NEXT: v_mul_lo_u32 v6, s0, v3 -; GFX6-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; GFX6-NEXT: v_mul_lo_u32 v9, v3, v5 -; GFX6-NEXT: v_mul_hi_u32 v10, v3, v6 -; GFX6-NEXT: v_mul_hi_u32 v11, v3, v5 -; GFX6-NEXT: v_mul_hi_u32 v8, v4, v6 -; GFX6-NEXT: v_mul_lo_u32 v6, v4, v6 -; GFX6-NEXT: v_mul_hi_u32 v7, v4, v5 -; GFX6-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; GFX6-NEXT: v_addc_u32_e32 v10, vcc, 0, v11, vcc -; GFX6-NEXT: v_mul_lo_u32 v5, v4, v5 -; GFX6-NEXT: v_add_i32_e32 v6, vcc, v9, v6 -; GFX6-NEXT: v_addc_u32_e32 v6, vcc, v10, v8, vcc -; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v7, v0, vcc -; GFX6-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GFX6-NEXT: v_addc_u32_e32 v6, vcc, 0, v7, vcc +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; GFX6-NEXT: v_mul_lo_u32 v5, s0, v2 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; GFX6-NEXT: v_mul_lo_u32 v8, v2, v4 +; GFX6-NEXT: v_mul_hi_u32 v9, v2, v5 +; GFX6-NEXT: v_mul_hi_u32 v10, v2, v4 +; GFX6-NEXT: v_mul_hi_u32 v7, v3, v5 +; GFX6-NEXT: v_mul_lo_u32 v5, v3, v5 +; GFX6-NEXT: v_mul_hi_u32 v6, v3, v4 +; GFX6-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GFX6-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc +; GFX6-NEXT: v_mul_lo_u32 v4, v3, v4 +; GFX6-NEXT: v_add_i32_e32 v5, vcc, v8, v5 +; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v9, v7, vcc +; GFX6-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc ; GFX6-NEXT: s_add_u32 s0, s6, s14 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: s_addc_u32 s1, s7, s14 -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc ; GFX6-NEXT: s_xor_b64 s[6:7], s[0:1], s[14:15] -; GFX6-NEXT: v_mul_lo_u32 v5, s6, v4 -; GFX6-NEXT: v_mul_hi_u32 v6, s6, v3 -; GFX6-NEXT: v_mul_hi_u32 v8, s6, v4 -; GFX6-NEXT: v_mul_hi_u32 v9, s7, v4 -; GFX6-NEXT: v_mul_lo_u32 v4, s7, v4 -; GFX6-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GFX6-NEXT: v_addc_u32_e32 v6, vcc, 0, v8, vcc -; GFX6-NEXT: v_mul_lo_u32 v8, s7, v3 -; GFX6-NEXT: v_mul_hi_u32 v3, s7, v3 -; GFX6-NEXT: v_mov_b32_e32 v7, s12 -; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v6, v3, vcc -; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v9, v0, vcc -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v0, vcc -; GFX6-NEXT: v_mul_lo_u32 v4, s4, v0 -; GFX6-NEXT: v_mul_hi_u32 v5, s4, v3 -; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s12, v1 -; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v2, v7, vcc -; GFX6-NEXT: v_mul_lo_u32 v2, s5, v3 -; GFX6-NEXT: v_mul_lo_u32 v3, s4, v3 +; GFX6-NEXT: v_mul_lo_u32 v4, s6, v3 +; GFX6-NEXT: v_mul_hi_u32 v5, s6, v2 +; GFX6-NEXT: v_mul_hi_u32 v7, s6, v3 +; GFX6-NEXT: v_mul_hi_u32 v8, s7, v3 +; GFX6-NEXT: v_mul_lo_u32 v3, s7, v3 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s7, v2 +; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; GFX6-NEXT: v_mul_lo_u32 v7, s7, v2 +; GFX6-NEXT: v_mul_hi_u32 v2, s7, v2 +; GFX6-NEXT: v_mov_b32_e32 v6, s12 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v5, v2, vcc +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GFX6-NEXT: v_mul_lo_u32 v3, s4, v3 +; GFX6-NEXT: v_mul_hi_u32 v4, s4, v2 +; GFX6-NEXT: v_mul_lo_u32 v5, s5, v2 +; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s12, v0 +; GFX6-NEXT: v_mul_lo_u32 v2, s4, v2 +; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s7, v3 ; GFX6-NEXT: v_mov_b32_e32 v5, s5 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s6, v3 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s6, v2 ; GFX6-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc -; GFX6-NEXT: v_subrev_i32_e64 v6, s[0:1], s4, v3 +; GFX6-NEXT: v_subrev_i32_e64 v6, s[0:1], s4, v2 ; GFX6-NEXT: v_subbrev_u32_e64 v7, s[2:3], 0, v4, s[0:1] ; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s5, v7 ; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[2:3] @@ -10483,22 +10454,22 @@ ; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v8 ; GFX6-NEXT: v_cndmask_b32_e64 v4, v7, v4, s[0:1] ; GFX6-NEXT: v_mov_b32_e32 v7, s7 -; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v7, v2, vcc -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s5, v2 +; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v7, v3, vcc +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 ; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s4, v3 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s4, v2 ; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s5, v2 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s5, v3 ; GFX6-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v4, v6, v5, s[0:1] ; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v4, v6, v5, s[0:1] +; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX6-NEXT: v_xor_b32_e32 v2, s14, v2 ; GFX6-NEXT: v_xor_b32_e32 v3, s14, v3 -; GFX6-NEXT: v_xor_b32_e32 v4, s14, v2 -; GFX6-NEXT: v_mov_b32_e32 v5, s14 -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s14, v3 -; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v4, v5, vcc +; GFX6-NEXT: v_mov_b32_e32 v4, s14 +; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s14, v2 +; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; GFX6-NEXT: s_endpgm @@ -10532,74 +10503,74 @@ ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 ; GFX9-NEXT: v_mac_f32_e32 v0, s19, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v0 -; GFX9-NEXT: v_mul_lo_u32 v0, s2, v1 -; GFX9-NEXT: v_mul_hi_u32 v3, s2, v2 -; GFX9-NEXT: v_mul_lo_u32 v5, s3, v2 -; GFX9-NEXT: v_mul_lo_u32 v4, s2, v2 -; GFX9-NEXT: v_add_u32_e32 v0, v3, v0 -; GFX9-NEXT: v_add_u32_e32 v5, v0, v5 -; GFX9-NEXT: v_mul_hi_u32 v3, v2, v4 -; GFX9-NEXT: v_mul_lo_u32 v6, v2, v5 -; GFX9-NEXT: v_mul_hi_u32 v7, v2, v5 -; GFX9-NEXT: v_mul_hi_u32 v8, v1, v5 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v7, vcc -; GFX9-NEXT: v_mul_lo_u32 v7, v1, v4 +; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX9-NEXT: v_mul_lo_u32 v2, s2, v1 +; GFX9-NEXT: v_mul_hi_u32 v3, s2, v0 +; GFX9-NEXT: v_mul_lo_u32 v5, s3, v0 +; GFX9-NEXT: v_mul_lo_u32 v4, s2, v0 +; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 +; GFX9-NEXT: v_mul_hi_u32 v3, v0, v4 +; GFX9-NEXT: v_mul_lo_u32 v5, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v6, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v7, v1, v2 +; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v6, vcc +; GFX9-NEXT: v_mul_lo_u32 v6, v1, v4 ; GFX9-NEXT: v_mul_hi_u32 v4, v1, v4 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v4, vcc -; GFX9-NEXT: v_mul_lo_u32 v4, v1, v5 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v8, v0, vcc -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc -; GFX9-NEXT: v_mul_lo_u32 v3, s2, v1 -; GFX9-NEXT: v_mul_hi_u32 v4, s2, v2 -; GFX9-NEXT: v_mul_lo_u32 v5, s3, v2 -; GFX9-NEXT: v_mul_lo_u32 v6, s2, v2 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v4, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX9-NEXT: v_mul_lo_u32 v2, s2, v1 +; GFX9-NEXT: v_mul_hi_u32 v3, s2, v0 +; GFX9-NEXT: v_mul_lo_u32 v4, s3, v0 +; GFX9-NEXT: v_mul_lo_u32 v5, s2, v0 ; GFX9-NEXT: s_add_u32 s2, s4, s8 -; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v5 -; GFX9-NEXT: v_mul_lo_u32 v7, v2, v3 -; GFX9-NEXT: v_mul_hi_u32 v8, v2, v6 -; GFX9-NEXT: v_mul_hi_u32 v9, v2, v3 -; GFX9-NEXT: v_mul_hi_u32 v5, v1, v6 -; GFX9-NEXT: v_mul_lo_u32 v6, v1, v6 -; GFX9-NEXT: v_mul_hi_u32 v4, v1, v3 -; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v8, v7 -; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v9, vcc -; GFX9-NEXT: v_mul_lo_u32 v3, v1, v3 +; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 +; GFX9-NEXT: v_mul_lo_u32 v6, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v7, v0, v5 +; GFX9-NEXT: v_mul_hi_u32 v8, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v4, v1, v5 +; GFX9-NEXT: v_mul_lo_u32 v5, v1, v5 +; GFX9-NEXT: v_mul_hi_u32 v3, v1, v2 ; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v7, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v8, v5, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v0, vcc -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v5, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v8, vcc +; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v6, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v4, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v4, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 ; GFX9-NEXT: s_addc_u32 s3, s5, s8 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc ; GFX9-NEXT: s_xor_b64 s[14:15], s[2:3], s[8:9] -; GFX9-NEXT: v_mul_lo_u32 v3, s14, v1 -; GFX9-NEXT: v_mul_hi_u32 v4, s14, v2 -; GFX9-NEXT: v_mul_hi_u32 v5, s14, v1 -; GFX9-NEXT: v_mul_hi_u32 v6, s15, v1 +; GFX9-NEXT: v_mul_lo_u32 v2, s14, v1 +; GFX9-NEXT: v_mul_hi_u32 v3, s14, v0 +; GFX9-NEXT: v_mul_hi_u32 v4, s14, v1 +; GFX9-NEXT: v_mul_hi_u32 v5, s15, v1 ; GFX9-NEXT: v_mul_lo_u32 v1, s15, v1 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc -; GFX9-NEXT: v_mul_lo_u32 v5, s15, v2 -; GFX9-NEXT: v_mul_hi_u32 v2, s15, v2 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc +; GFX9-NEXT: v_mul_lo_u32 v4, s15, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, s15, v0 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v4, v2, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v0, vcc -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v2, v1 -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v3, vcc -; GFX9-NEXT: v_mul_lo_u32 v2, s12, v2 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v0, v1 +; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v2, vcc +; GFX9-NEXT: v_mul_lo_u32 v2, s12, v0 ; GFX9-NEXT: v_mul_hi_u32 v3, s12, v1 ; GFX9-NEXT: v_mul_lo_u32 v4, s13, v1 ; GFX9-NEXT: v_mul_lo_u32 v1, s12, v1 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 ; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 ; GFX9-NEXT: v_sub_u32_e32 v3, s15, v2 @@ -10667,7 +10638,7 @@ ; GFX9-NEXT: s_mov_b32 s13, s12 ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v9 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v8, v3, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v10, v0, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v10, vcc ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 ; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v7, vcc ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 @@ -10690,7 +10661,7 @@ ; GFX9-NEXT: v_mul_lo_u32 v5, v4, v5 ; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v9, v8 ; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v10, v7, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v6, v0, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 ; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 @@ -10710,7 +10681,7 @@ ; GFX9-NEXT: v_xor_b32_e32 v2, s8, v2 ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v8 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v3, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v9, v0, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v9, vcc ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc ; GFX9-NEXT: v_mul_lo_u32 v4, s10, v4 diff --git a/llvm/test/CodeGen/AMDGPU/and.ll b/llvm/test/CodeGen/AMDGPU/and.ll --- a/llvm/test/CodeGen/AMDGPU/and.ll +++ b/llvm/test/CodeGen/AMDGPU/and.ll @@ -63,9 +63,8 @@ ; Second use is a VGPR use of the constant. ; FUNC-LABEL: {{^}}s_and_multi_use_constant_i32_0: -; SI: s_mov_b32 [[K:s[0-9]+]], 0x12d687 -; SI-DAG: s_and_b32 [[AND:s[0-9]+]], s{{[0-9]+}}, [[K]] -; SI-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], [[K]] +; SI-DAG: s_and_b32 [[AND:s[0-9]+]], s{{[0-9]+}}, 0x12d687 +; SI-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x12d687 ; SI: buffer_store_dword [[VK]] define amdgpu_kernel void @s_and_multi_use_constant_i32_0(i32 addrspace(1)* %out, i32 %a, i32 %b) { %and = and i32 %a, 1234567 @@ -79,10 +78,9 @@ ; Second use is another SGPR use of the constant. ; FUNC-LABEL: {{^}}s_and_multi_use_constant_i32_1: -; SI: s_mov_b32 [[K:s[0-9]+]], 0x12d687 -; SI: s_and_b32 [[AND:s[0-9]+]], s{{[0-9]+}}, [[K]] +; SI: s_and_b32 [[AND:s[0-9]+]], s{{[0-9]+}}, 0x12d687 ; SI: s_add_i32 -; SI: s_add_i32 [[ADD:s[0-9]+]], s{{[0-9]+}}, [[K]] +; SI: s_add_i32 [[ADD:s[0-9]+]], s{{[0-9]+}}, 0x12d687 ; SI: v_mov_b32_e32 [[VADD:v[0-9]+]], [[ADD]] ; SI: buffer_store_dword [[VADD]] define amdgpu_kernel void @s_and_multi_use_constant_i32_1(i32 addrspace(1)* %out, i32 %a, i32 %b) { diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -3067,7 +3067,7 @@ ; GFX8-NEXT: s_mov_b64 exec, s[2:3] ; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v2, v1 +; GFX8-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX8-NEXT: s_not_b64 exec, exec ; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf @@ -3118,7 +3118,7 @@ ; GFX9-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX9-NEXT: s_not_b64 exec, exec ; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf @@ -3160,41 +3160,39 @@ ; ; GFX1064-LABEL: max_i32_varying: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: v_mov_b32_e32 v2, v1 +; GFX1064-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX1064-NEXT: s_not_b64 exec, exec ; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_mov_b32_e32 v3, v2 -; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 -; GFX1064-NEXT: v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v2, 31 -; GFX1064-NEXT: v_mov_b32_e32 v3, s4 -; GFX1064-NEXT: v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v2, 15 -; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1064-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: v_mov_b32_e32 v2, v1 +; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1064-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1064-NEXT: v_mov_b32_e32 v2, s4 +; GFX1064-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_readlane_b32 s5, v2, 31 -; GFX1064-NEXT: v_writelane_b32 v1, s4, 16 +; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 ; GFX1064-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_readlane_b32 s7, v2, 63 -; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 -; GFX1064-NEXT: v_writelane_b32 v1, s5, 32 +; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 +; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 +; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 ; GFX1064-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 +; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 ; GFX1064-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_mov_b32 s2, -1 @@ -3214,7 +3212,7 @@ ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1064-NEXT: v_mov_b32_e32 v0, v1 +; GFX1064-NEXT: v_mov_b32_e32 v0, v3 ; GFX1064-NEXT: v_max_i32_e32 v0, s3, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -3223,31 +3221,29 @@ ; ; GFX1032-LABEL: max_i32_varying: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: v_mov_b32_e32 v2, v1 +; GFX1032-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_mov_b32_e32 v3, v2 -; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 +; GFX1032-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032-NEXT: v_mov_b32_e32 v2, v1 +; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1032-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032-NEXT: v_readlane_b32 s3, v2, 15 -; GFX1032-NEXT: v_readlane_b32 s4, v2, 31 -; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 +; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_writelane_b32 v1, s3, 16 +; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 ; GFX1032-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_mov_b32 s2, -1 @@ -3266,7 +3262,7 @@ ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1032-NEXT: v_mov_b32_e32 v0, v1 +; GFX1032-NEXT: v_mov_b32_e32 v0, v3 ; GFX1032-NEXT: v_max_i32_e32 v0, s3, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -3484,7 +3480,7 @@ ; GFX8-NEXT: s_mov_b64 exec, s[2:3] ; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v2, v1 +; GFX8-NEXT: v_bfrev_b32_e32 v2, -2 ; GFX8-NEXT: s_not_b64 exec, exec ; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf @@ -3535,7 +3531,7 @@ ; GFX9-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-NEXT: v_bfrev_b32_e32 v2, -2 ; GFX9-NEXT: s_not_b64 exec, exec ; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf @@ -3577,41 +3573,39 @@ ; ; GFX1064-LABEL: min_i32_varying: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_bfrev_b32_e32 v1, -2 -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: v_mov_b32_e32 v2, v1 +; GFX1064-NEXT: v_bfrev_b32_e32 v1, -2 ; GFX1064-NEXT: s_not_b64 exec, exec ; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_mov_b32_e32 v3, v2 -; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 -; GFX1064-NEXT: v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v2, 31 -; GFX1064-NEXT: v_mov_b32_e32 v3, s4 -; GFX1064-NEXT: v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v2, 15 -; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: v_bfrev_b32_e32 v3, -2 +; GFX1064-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: v_mov_b32_e32 v2, v1 +; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1064-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1064-NEXT: v_mov_b32_e32 v2, s4 +; GFX1064-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_readlane_b32 s5, v2, 31 -; GFX1064-NEXT: v_writelane_b32 v1, s4, 16 +; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 ; GFX1064-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_readlane_b32 s7, v2, 63 -; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 -; GFX1064-NEXT: v_writelane_b32 v1, s5, 32 +; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 +; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 +; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 ; GFX1064-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 +; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 ; GFX1064-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_mov_b32 s2, -1 @@ -3631,7 +3625,7 @@ ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1064-NEXT: v_mov_b32_e32 v0, v1 +; GFX1064-NEXT: v_mov_b32_e32 v0, v3 ; GFX1064-NEXT: v_min_i32_e32 v0, s3, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -3640,31 +3634,29 @@ ; ; GFX1032-LABEL: min_i32_varying: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_bfrev_b32_e32 v1, -2 -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: v_mov_b32_e32 v2, v1 +; GFX1032-NEXT: v_bfrev_b32_e32 v1, -2 ; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_mov_b32_e32 v3, v2 -; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 +; GFX1032-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032-NEXT: v_mov_b32_e32 v2, v1 +; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1032-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032-NEXT: v_readlane_b32 s3, v2, 15 -; GFX1032-NEXT: v_readlane_b32 s4, v2, 31 -; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032-NEXT: v_bfrev_b32_e32 v3, -2 +; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 +; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_writelane_b32 v1, s3, 16 +; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 ; GFX1032-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_mov_b32 s2, -1 @@ -3683,7 +3675,7 @@ ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1032-NEXT: v_mov_b32_e32 v0, v1 +; GFX1032-NEXT: v_mov_b32_e32 v0, v3 ; GFX1032-NEXT: v_min_i32_e32 v0, s3, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/bypass-div.ll b/llvm/test/CodeGen/AMDGPU/bypass-div.ll --- a/llvm/test/CodeGen/AMDGPU/bypass-div.ll +++ b/llvm/test/CodeGen/AMDGPU/bypass-div.ll @@ -27,7 +27,6 @@ ; GFX9-NEXT: v_subb_co_u32_e32 v8, vcc, 0, v10, vcc ; GFX9-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 ; GFX9-NEXT: v_rcp_f32_e32 v2, v2 -; GFX9-NEXT: v_mov_b32_e32 v14, 0 ; GFX9-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; GFX9-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3 @@ -42,11 +41,11 @@ ; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v5, 0 ; GFX9-NEXT: v_add_co_u32_e32 v13, vcc, v13, v3 ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v2, 0 -; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v4, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v14, vcc, 0, v4, vcc ; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v5, 0 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v13, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v15, v3, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v14, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v14, v3, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v13, vcc, v6, v2 @@ -57,13 +56,13 @@ ; GFX9-NEXT: v_add3_u32 v5, v3, v4, v5 ; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v12, v5, 0 ; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v5, 0 -; GFX9-NEXT: v_mul_hi_u32 v15, v13, v2 +; GFX9-NEXT: v_mul_hi_u32 v14, v13, v2 ; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v2, 0 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v15, v5 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v14, v5 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v6, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v7 ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v5, v8, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v14, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v13, v2 @@ -81,7 +80,7 @@ ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v5, v3, 0 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v7, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v8, v1, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v14, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc ; GFX9-NEXT: v_mul_lo_u32 v7, v10, v2 @@ -175,7 +174,6 @@ ; GFX9-NEXT: v_subb_co_u32_e32 v11, vcc, 0, v3, vcc ; GFX9-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 ; GFX9-NEXT: v_rcp_f32_e32 v4, v4 -; GFX9-NEXT: v_mov_b32_e32 v13, 0 ; GFX9-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; GFX9-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5 @@ -190,32 +188,32 @@ ; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v7, 0 ; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v12, v5 ; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v4, 0 -; GFX9-NEXT: v_addc_co_u32_e32 v14, vcc, 0, v6, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v6, vcc ; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v8, v7, 0 ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v12, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v14, v5, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v13, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v13, v5, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc ; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v9, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v14, vcc, v8, v5, vcc -; GFX9-NEXT: v_mul_lo_u32 v6, v10, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, v8, v5, vcc +; GFX9-NEXT: v_mul_lo_u32 v6, v10, v13 ; GFX9-NEXT: v_mul_lo_u32 v7, v11, v12 ; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v12, 0 ; GFX9-NEXT: v_add3_u32 v7, v5, v6, v7 -; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v14, v7, 0 +; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v7, 0 ; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v7, 0 ; GFX9-NEXT: v_mul_hi_u32 v11, v12, v4 -; GFX9-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v14, v4, 0 +; GFX9-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v4, 0 ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v11, v7 ; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v8, vcc ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v9 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v10, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v6, v13, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v6, vcc ; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v12, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v14, v5, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v13, v5, vcc ; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v0, v7, 0 ; GFX9-NEXT: v_mul_hi_u32 v8, v0, v6 ; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v8, v4 @@ -224,7 +222,7 @@ ; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v1, v7, 0 ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v8, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v9, v5, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v13, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc ; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v4, v6 ; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v5, vcc ; GFX9-NEXT: v_mul_lo_u32 v8, v3, v6 @@ -318,7 +316,6 @@ ; GFX9-NEXT: v_subb_co_u32_e32 v8, vcc, 0, v9, vcc ; GFX9-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 ; GFX9-NEXT: v_rcp_f32_e32 v2, v2 -; GFX9-NEXT: v_mov_b32_e32 v13, 0 ; GFX9-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; GFX9-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3 @@ -333,11 +330,11 @@ ; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v5, 0 ; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v12, v3 ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v11, v2, 0 -; GFX9-NEXT: v_addc_co_u32_e32 v14, vcc, 0, v4, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v4, vcc ; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v11, v5, 0 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v12, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v14, v3, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v13, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v13, v3, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v6, v2 @@ -348,13 +345,13 @@ ; GFX9-NEXT: v_add3_u32 v5, v3, v4, v5 ; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v5, 0 ; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v5, 0 -; GFX9-NEXT: v_mul_hi_u32 v14, v12, v2 +; GFX9-NEXT: v_mul_hi_u32 v13, v12, v2 ; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v2, 0 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v14, v5 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v13, v5 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v6, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v7 ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v5, v8, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v13, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v12, v2 @@ -372,7 +369,7 @@ ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v3, 0 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v7, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v8, v1, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v13, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: v_mul_lo_u32 v2, v9, v0 @@ -462,7 +459,6 @@ ; GFX9-NEXT: v_subb_co_u32_e32 v11, vcc, 0, v3, vcc ; GFX9-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 ; GFX9-NEXT: v_rcp_f32_e32 v4, v4 -; GFX9-NEXT: v_mov_b32_e32 v13, 0 ; GFX9-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; GFX9-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5 @@ -477,32 +473,32 @@ ; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v7, 0 ; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v12, v5 ; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v4, 0 -; GFX9-NEXT: v_addc_co_u32_e32 v14, vcc, 0, v6, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v6, vcc ; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v8, v7, 0 ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v12, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v14, v5, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v13, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v13, v5, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc ; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v9, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v14, vcc, v8, v5, vcc -; GFX9-NEXT: v_mul_lo_u32 v6, v10, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, v8, v5, vcc +; GFX9-NEXT: v_mul_lo_u32 v6, v10, v13 ; GFX9-NEXT: v_mul_lo_u32 v7, v11, v12 ; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v12, 0 ; GFX9-NEXT: v_add3_u32 v7, v5, v6, v7 -; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v14, v7, 0 +; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v7, 0 ; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v7, 0 ; GFX9-NEXT: v_mul_hi_u32 v11, v12, v4 -; GFX9-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v14, v4, 0 +; GFX9-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v4, 0 ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v11, v7 ; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v8, vcc ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v9 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v10, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v6, v13, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v6, vcc ; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v12, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v14, v5, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v13, v5, vcc ; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v0, v7, 0 ; GFX9-NEXT: v_mul_hi_u32 v8, v0, v6 ; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v8, v4 @@ -511,7 +507,7 @@ ; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v1, v7, 0 ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v8, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v9, v5, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v13, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc ; GFX9-NEXT: v_mul_lo_u32 v6, v3, v4 @@ -728,7 +724,6 @@ ; GFX9-NEXT: v_subb_co_u32_e32 v8, vcc, 0, v10, vcc ; GFX9-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 ; GFX9-NEXT: v_rcp_f32_e32 v2, v2 -; GFX9-NEXT: v_mov_b32_e32 v14, 0 ; GFX9-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; GFX9-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3 @@ -743,11 +738,11 @@ ; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v5, 0 ; GFX9-NEXT: v_add_co_u32_e32 v13, vcc, v13, v3 ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v2, 0 -; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v4, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v14, vcc, 0, v4, vcc ; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v5, 0 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v13, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v15, v3, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v14, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v14, v3, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v13, vcc, v6, v2 @@ -758,13 +753,13 @@ ; GFX9-NEXT: v_add3_u32 v5, v3, v4, v5 ; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v12, v5, 0 ; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v5, 0 -; GFX9-NEXT: v_mul_hi_u32 v15, v13, v2 +; GFX9-NEXT: v_mul_hi_u32 v14, v13, v2 ; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v2, 0 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v15, v5 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v14, v5 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v6, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v7 ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v5, v8, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v14, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v13, v2 @@ -782,7 +777,7 @@ ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v3, 0 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v8, v1, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v14, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc ; GFX9-NEXT: v_mul_lo_u32 v6, v10, v2 @@ -896,7 +891,6 @@ ; GFX9-NEXT: v_subb_co_u32_e32 v11, vcc, 0, v3, vcc ; GFX9-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 ; GFX9-NEXT: v_rcp_f32_e32 v4, v4 -; GFX9-NEXT: v_mov_b32_e32 v13, 0 ; GFX9-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; GFX9-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5 @@ -911,32 +905,32 @@ ; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v7, 0 ; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v12, v5 ; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v4, 0 -; GFX9-NEXT: v_addc_co_u32_e32 v14, vcc, 0, v6, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v6, vcc ; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v8, v7, 0 ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v12, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v14, v5, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v13, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v13, v5, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc ; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v9, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v14, vcc, v8, v5, vcc -; GFX9-NEXT: v_mul_lo_u32 v6, v10, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, v8, v5, vcc +; GFX9-NEXT: v_mul_lo_u32 v6, v10, v13 ; GFX9-NEXT: v_mul_lo_u32 v7, v11, v12 ; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v12, 0 ; GFX9-NEXT: v_add3_u32 v7, v5, v6, v7 -; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v14, v7, 0 +; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v7, 0 ; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v7, 0 ; GFX9-NEXT: v_mul_hi_u32 v11, v12, v4 -; GFX9-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v14, v4, 0 +; GFX9-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v4, 0 ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v11, v7 ; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v8, vcc ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v9 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v10, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v6, v13, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v6, vcc ; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v12, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v14, v5, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v13, v5, vcc ; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v0, v7, 0 ; GFX9-NEXT: v_mul_hi_u32 v8, v0, v6 ; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v8, v4 @@ -945,7 +939,7 @@ ; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v1, v7, 0 ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v8, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v9, v5, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v13, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc ; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v4, v6 ; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v5, vcc ; GFX9-NEXT: v_mul_lo_u32 v8, v3, v6 diff --git a/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll b/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll --- a/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll +++ b/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll @@ -5,7 +5,7 @@ ; CHECK-LABEL: _Z11test_kernelPii: ; CHECK: s_mul_i32 ; CHECK: s_sub_i32 -; CHECK: s_and_b32 [[S1:s[0-9]+]], {{s[0-9]+}}, {{s[0-9]+}} +; CHECK: s_and_b32 [[S1:s[0-9]+]], {{s[0-9]+}}, 0xffff ; CHECK: s_add_i32 [[S2:s[0-9]+]], {{s[0-9]+}}, [[S1]] ; CHECK: s_or_b32 {{s[0-9]+}}, [[S2]], 0xc0 diff --git a/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll b/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll --- a/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll +++ b/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll @@ -226,7 +226,6 @@ ; GCN-LABEL: {{^}}load_sampler ; GCN: v_readfirstlane_b32 -; GCN-NEXT: v_readfirstlane_b32 ; SI: s_nop ; GCN: s_load_dwordx8 ; GCN-NEXT: s_load_dwordx4 @@ -260,7 +259,6 @@ ; GCN-LABEL: {{^}}load_sampler_nouniform ; GCN: v_readfirstlane_b32 -; GCN-NEXT: v_readfirstlane_b32 ; SI: s_nop ; GCN: s_load_dwordx8 ; GCN-NEXT: s_load_dwordx4 diff --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll --- a/llvm/test/CodeGen/AMDGPU/ctlz.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll @@ -1500,15 +1500,13 @@ ; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_ushort v1, v0, s[2:3] -; GFX10-GISEL-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-GISEL-NEXT: s_mov_b32 s2, 0xffff ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v2, v1 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX10-GISEL-NEXT: v_min_u32_e32 v2, 32, v2 ; GFX10-GISEL-NEXT: v_subrev_nc_u32_e32 v2, 16, v2 -; GFX10-GISEL-NEXT: v_and_b32_e32 v2, s2, v2 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v2, s2, vcc_lo +; GFX10-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v2, 0xffff, vcc_lo ; GFX10-GISEL-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm %val = load i16, i16 addrspace(1)* %valptr @@ -1604,19 +1602,18 @@ ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s3 -; GFX10-GISEL-NEXT: s_movk_i32 s2, 0x7f ; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0 ; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v2, v3, vcc_lo ; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX10-GISEL-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0x7f, v0 ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX10-GISEL-NEXT: v_subrev_nc_u32_e32 v1, 25, v1 ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 0x7f, vcc_lo ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-GISEL-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0x7f, v0 ; GFX10-GISEL-NEXT: global_store_byte v1, v0, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/cttz.ll b/llvm/test/CodeGen/AMDGPU/cttz.ll --- a/llvm/test/CodeGen/AMDGPU/cttz.ll +++ b/llvm/test/CodeGen/AMDGPU/cttz.ll @@ -1492,14 +1492,12 @@ ; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_ushort v1, v0, s[2:3] -; GFX10-GISEL-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-GISEL-NEXT: s_mov_b32 s2, 0xffff ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_or_b32_e32 v2, 0x10000, v1 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v2, v2 -; GFX10-GISEL-NEXT: v_and_b32_e32 v2, s2, v2 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v2, s2, vcc_lo +; GFX10-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v2, 0xffff, vcc_lo ; GFX10-GISEL-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm %val = load i16, i16 addrspace(1)* %valptr @@ -1595,18 +1593,17 @@ ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s3 -; GFX10-GISEL-NEXT: s_movk_i32 s2, 0x7f ; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0 ; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v2, v3, vcc_lo ; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_or_b32_e32 v1, 0x80, v0 -; GFX10-GISEL-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0x7f, v0 ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v1, v1 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 0x7f, vcc_lo ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-GISEL-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0x7f, v0 ; GFX10-GISEL-NEXT: global_store_byte v1, v0, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll --- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -1205,7 +1205,6 @@ ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: v_mov_b32_e32 v5, 9 -; VI-NEXT: s_movk_i32 s8, 0x900 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -1232,8 +1231,8 @@ ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v6 ; VI-NEXT: v_or_b32_sdwa v0, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_mov_b32_e32 v2, s8 -; VI-NEXT: v_add_u16_e32 v0, s8, v0 +; VI-NEXT: v_mov_b32_e32 v2, 0x900 +; VI-NEXT: v_add_u16_e32 v0, 0x900, v0 ; VI-NEXT: v_add_u16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v0, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -1250,7 +1249,6 @@ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c -; GFX10-NEXT: s_movk_i32 s0, 0x900 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX10-NEXT: v_lshrrev_b32_sdwa v1, v1, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -1261,8 +1259,8 @@ ; GFX10-NEXT: v_or_b32_sdwa v2, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 -; GFX10-NEXT: v_add_nc_u16 v1, v1, s0 -; GFX10-NEXT: v_add_nc_u16 v5, v2, s0 +; GFX10-NEXT: v_add_nc_u16 v1, v1, 0x900 +; GFX10-NEXT: v_add_nc_u16 v5, v2, 0x900 ; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v1 ; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 diff --git a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll --- a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll +++ b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll @@ -71,21 +71,20 @@ ; SI-NEXT: v_bfe_i32 v0, v3, 0, 16 ; SI-NEXT: v_bfe_i32 v1, v4, 0, 16 ; SI-NEXT: v_bfe_i32 v2, v2, 0, 16 -; SI-NEXT: s_mov_b32 s4, 0xffff -; SI-NEXT: v_mov_b32_e32 v3, 0x8000 -; SI-NEXT: v_mov_b32_e32 v4, 0xffff0000 -; SI-NEXT: v_bfrev_b32_e32 v5, 1 -; SI-NEXT: v_mov_b32_e32 v6, 0xffff8000 -; SI-NEXT: v_mov_b32_e32 v7, s4 +; SI-NEXT: v_mov_b32_e32 v3, 0xffff +; SI-NEXT: v_mov_b32_e32 v4, 0x8000 +; SI-NEXT: v_mov_b32_e32 v5, 0xffff0000 +; SI-NEXT: v_bfrev_b32_e32 v6, 1 +; SI-NEXT: v_mov_b32_e32 v7, 0xffff8000 ; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v0 -; SI-NEXT: v_cndmask_b32_e32 v0, v7, v3, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc ; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1 -; SI-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc +; SI-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc ; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v2 -; SI-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc +; SI-NEXT: v_cndmask_b32_e32 v2, -1, v7, vcc ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; SI-NEXT: v_and_b32_e32 v2, s4, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 diff --git a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll --- a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll @@ -42,9 +42,8 @@ ; CI: s_load_dwordx2 s[[[LO:[0-9]+]]:[[HI:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x2 ; GFX89: s_load_dwordx2 s[[[LO:[0-9]+]]:[[HI:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x8 -; GCN: s_mov_b32 [[MASK:s[0-9]+]], 0x7fff7fff -; GCN-DAG: s_and_b32 s{{[0-9]+}}, s[[LO]], [[MASK]] -; GCN-DAG: s_and_b32 s{{[0-9]+}}, s[[HI]], [[MASK]] +; GCN-DAG: s_and_b32 s{{[0-9]+}}, s[[LO]], 0x7fff7fff +; GCN-DAG: s_and_b32 s{{[0-9]+}}, s[[HI]], 0x7fff7fff ; GCN: {{flat|global}}_store_dwordx2 define amdgpu_kernel void @s_fabs_v4f16(<4 x half> addrspace(1)* %out, <4 x half> %in) { %fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %in) diff --git a/llvm/test/CodeGen/AMDGPU/fabs.f64.ll b/llvm/test/CodeGen/AMDGPU/fabs.f64.ll --- a/llvm/test/CodeGen/AMDGPU/fabs.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fabs.f64.ll @@ -30,8 +30,8 @@ } ; FUNC-LABEL: {{^}}fabs_v2f64: -; SI: s_and_b32 -; SI: s_and_b32 +; SI: s_bitset0_b32 +; SI: s_bitset0_b32 ; SI: s_endpgm define amdgpu_kernel void @fabs_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) { %fabs = call <2 x double> @llvm.fabs.v2f64(<2 x double> %in) @@ -40,10 +40,10 @@ } ; FUNC-LABEL: {{^}}fabs_v4f64: -; SI: s_and_b32 -; SI: s_and_b32 -; SI: s_and_b32 -; SI: s_and_b32 +; SI: s_bitset0_b32 +; SI: s_bitset0_b32 +; SI: s_bitset0_b32 +; SI: s_bitset0_b32 ; SI: s_endpgm define amdgpu_kernel void @fabs_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) { %fabs = call <4 x double> @llvm.fabs.v4f64(<4 x double> %in) diff --git a/llvm/test/CodeGen/AMDGPU/fabs.ll b/llvm/test/CodeGen/AMDGPU/fabs.ll --- a/llvm/test/CodeGen/AMDGPU/fabs.ll +++ b/llvm/test/CodeGen/AMDGPU/fabs.ll @@ -45,8 +45,8 @@ ; R600: |{{(PV|T[0-9])\.[XYZW]}}| ; R600: |{{(PV|T[0-9])\.[XYZW]}}| -; GCN: s_and_b32 -; GCN: s_and_b32 +; GCN: s_bitset0_b32 +; GCN: s_bitset0_b32 define amdgpu_kernel void @fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) { %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in) store <2 x float> %fabs, <2 x float> addrspace(1)* %out @@ -59,10 +59,10 @@ ; R600: |{{(PV|T[0-9])\.[XYZW]}}| ; R600: |{{(PV|T[0-9])\.[XYZW]}}| -; GCN: s_and_b32 -; GCN: s_and_b32 -; GCN: s_and_b32 -; GCN: s_and_b32 +; GCN: s_bitset0_b32 +; GCN: s_bitset0_b32 +; GCN: s_bitset0_b32 +; GCN: s_bitset0_b32 define amdgpu_kernel void @fabs_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) { %fabs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %in) store <4 x float> %fabs, <4 x float> addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/fexp.ll b/llvm/test/CodeGen/AMDGPU/fexp.ll --- a/llvm/test/CodeGen/AMDGPU/fexp.ll +++ b/llvm/test/CodeGen/AMDGPU/fexp.ll @@ -107,10 +107,9 @@ ; VI-LABEL: v_exp_v2f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: s_movk_i32 s4, 0x3dc5 -; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_mov_b32_e32 v1, 0x3dc5 ; VI-NEXT: v_mul_f16_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_mul_f16_e32 v0, s4, v0 +; VI-NEXT: v_mul_f16_e32 v0, 0x3dc5, v0 ; VI-NEXT: v_exp_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD ; VI-NEXT: v_exp_f16_e32 v0, v0 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -161,7 +160,7 @@ ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_movk_i32 s4, 0x3dc5 -; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_mov_b32_e32 v3, 0x3dc5 ; VI-NEXT: v_mul_f16_e32 v2, s4, v1 ; VI-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_mul_f16_e32 v4, s4, v0 diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll --- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll @@ -532,12 +532,11 @@ ; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 -; GFX10-NEXT: v_mov_b32_e32 v1, 4 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: v_mov_b32_e32 v3, 15 -; GFX10-NEXT: v_add_nc_u32_e32 v2, v1, v0 -; GFX10-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GFX10-NEXT: scratch_store_dword v2, v3, off +; GFX10-NEXT: v_mov_b32_e32 v2, 15 +; GFX10-NEXT: v_add_nc_u32_e32 v1, 4, v0 +; GFX10-NEXT: v_sub_nc_u32_e32 v0, 4, v0 +; GFX10-NEXT: scratch_store_dword v1, v2, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -585,12 +584,11 @@ ; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0 ; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 ; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 -; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 4 ; GFX10-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-PAL-NEXT: v_mov_b32_e32 v3, 15 -; GFX10-PAL-NEXT: v_add_nc_u32_e32 v2, v1, v0 -; GFX10-PAL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GFX10-PAL-NEXT: scratch_store_dword v2, v3, off +; GFX10-PAL-NEXT: v_mov_b32_e32 v2, 15 +; GFX10-PAL-NEXT: v_add_nc_u32_e32 v1, 4, v0 +; GFX10-PAL-NEXT: v_sub_nc_u32_e32 v0, 4, v0 +; GFX10-PAL-NEXT: scratch_store_dword v1, v2, off ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) @@ -639,12 +637,11 @@ ; GFX10: ; %bb.0: ; %bb ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v1, s32 -; GFX10-NEXT: v_and_b32_e32 v2, 15, v0 -; GFX10-NEXT: v_mov_b32_e32 v3, 15 -; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, v1 -; GFX10-NEXT: v_lshl_add_u32 v1, v2, 2, v1 -; GFX10-NEXT: scratch_store_dword v0, v3, off +; GFX10-NEXT: v_and_b32_e32 v1, 15, v0 +; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, s32 +; GFX10-NEXT: v_mov_b32_e32 v2, 15 +; GFX10-NEXT: v_lshl_add_u32 v1, v1, 2, s32 +; GFX10-NEXT: scratch_store_dword v0, v2, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, v1, off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -681,12 +678,11 @@ ; GFX10-PAL: ; %bb.0: ; %bb ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-PAL-NEXT: v_mov_b32_e32 v1, s32 -; GFX10-PAL-NEXT: v_and_b32_e32 v2, 15, v0 -; GFX10-PAL-NEXT: v_mov_b32_e32 v3, 15 -; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1 -; GFX10-PAL-NEXT: v_lshl_add_u32 v1, v2, 2, v1 -; GFX10-PAL-NEXT: scratch_store_dword v0, v3, off +; GFX10-PAL-NEXT: v_and_b32_e32 v1, 15, v0 +; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, s32 +; GFX10-PAL-NEXT: v_mov_b32_e32 v2, 15 +; GFX10-PAL-NEXT: v_lshl_add_u32 v1, v1, 2, s32 +; GFX10-PAL-NEXT: scratch_store_dword v0, v2, off ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-PAL-NEXT: scratch_load_dword v0, v1, off glc dlc ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) @@ -1387,14 +1383,13 @@ ; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 -; GFX10-NEXT: v_mov_b32_e32 v1, 0x104 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: v_mov_b32_e32 v3, 15 -; GFX10-NEXT: v_add_nc_u32_e32 v2, v1, v0 -; GFX10-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GFX10-NEXT: scratch_load_dword v1, off, off offset:4 glc dlc +; GFX10-NEXT: v_mov_b32_e32 v2, 15 +; GFX10-NEXT: scratch_load_dword v3, off, off offset:4 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: scratch_store_dword v2, v3, off +; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x104, v0 +; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0x104, v0 +; GFX10-NEXT: scratch_store_dword v1, v2, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -1447,15 +1442,14 @@ ; GFX1010-PAL-NEXT: s_addc_u32 s3, s3, 0 ; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 ; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 -; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, 0x104 ; GFX1010-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX1010-PAL-NEXT: v_mov_b32_e32 v3, 15 +; GFX1010-PAL-NEXT: v_mov_b32_e32 v2, 15 ; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 -; GFX1010-PAL-NEXT: v_add_nc_u32_e32 v2, v1, v0 -; GFX1010-PAL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GFX1010-PAL-NEXT: scratch_load_dword v1, off, vcc_lo offset:4 glc dlc +; GFX1010-PAL-NEXT: scratch_load_dword v3, off, vcc_lo offset:4 glc dlc ; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX1010-PAL-NEXT: scratch_store_dword v2, v3, off +; GFX1010-PAL-NEXT: v_add_nc_u32_e32 v1, 0x104, v0 +; GFX1010-PAL-NEXT: v_sub_nc_u32_e32 v0, 0x104, v0 +; GFX1010-PAL-NEXT: scratch_store_dword v1, v2, off ; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1010-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc ; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) @@ -1472,14 +1466,13 @@ ; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0 ; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 ; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 -; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, 0x104 ; GFX1030-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX1030-PAL-NEXT: v_mov_b32_e32 v3, 15 -; GFX1030-PAL-NEXT: v_add_nc_u32_e32 v2, v1, v0 -; GFX1030-PAL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GFX1030-PAL-NEXT: scratch_load_dword v1, off, off offset:4 glc dlc +; GFX1030-PAL-NEXT: v_mov_b32_e32 v2, 15 +; GFX1030-PAL-NEXT: scratch_load_dword v3, off, off offset:4 glc dlc ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX1030-PAL-NEXT: scratch_store_dword v2, v3, off +; GFX1030-PAL-NEXT: v_add_nc_u32_e32 v1, 0x104, v0 +; GFX1030-PAL-NEXT: v_sub_nc_u32_e32 v0, 0x104, v0 +; GFX1030-PAL-NEXT: scratch_store_dword v1, v2, off ; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1030-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) @@ -1524,15 +1517,15 @@ ; GFX10: ; %bb.0: ; %bb ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_and_b32_e32 v1, 15, v0 ; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x100 -; GFX10-NEXT: v_and_b32_e32 v2, 15, v0 -; GFX10-NEXT: v_mov_b32_e32 v1, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v3, 15 -; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, v1 -; GFX10-NEXT: v_lshl_add_u32 v1, v2, 2, v1 -; GFX10-NEXT: scratch_load_dword v2, off, s32 glc dlc +; GFX10-NEXT: v_mov_b32_e32 v2, 15 +; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, vcc_lo +; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x100 +; GFX10-NEXT: scratch_load_dword v3, off, s32 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: scratch_store_dword v0, v3, off +; GFX10-NEXT: v_lshl_add_u32 v1, v1, 2, vcc_lo +; GFX10-NEXT: scratch_store_dword v0, v2, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, v1, off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -1574,15 +1567,15 @@ ; GFX10-PAL: ; %bb.0: ; %bb ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-PAL-NEXT: v_and_b32_e32 v1, 15, v0 +; GFX10-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x100 +; GFX10-PAL-NEXT: v_mov_b32_e32 v2, 15 +; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, vcc_lo ; GFX10-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x100 -; GFX10-PAL-NEXT: v_and_b32_e32 v2, 15, v0 -; GFX10-PAL-NEXT: v_mov_b32_e32 v1, vcc_lo -; GFX10-PAL-NEXT: v_mov_b32_e32 v3, 15 -; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1 -; GFX10-PAL-NEXT: v_lshl_add_u32 v1, v2, 2, v1 -; GFX10-PAL-NEXT: scratch_load_dword v2, off, s32 glc dlc +; GFX10-PAL-NEXT: scratch_load_dword v3, off, s32 glc dlc ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX10-PAL-NEXT: scratch_store_dword v0, v3, off +; GFX10-PAL-NEXT: v_lshl_add_u32 v1, v1, 2, vcc_lo +; GFX10-PAL-NEXT: scratch_store_dword v0, v2, off ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-PAL-NEXT: scratch_load_dword v0, v1, off glc dlc ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) @@ -2277,14 +2270,13 @@ ; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 -; GFX10-NEXT: v_mov_b32_e32 v1, 0x4004 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: v_mov_b32_e32 v3, 15 -; GFX10-NEXT: v_add_nc_u32_e32 v2, v1, v0 -; GFX10-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GFX10-NEXT: scratch_load_dword v1, off, off offset:4 glc dlc +; GFX10-NEXT: v_mov_b32_e32 v2, 15 +; GFX10-NEXT: scratch_load_dword v3, off, off offset:4 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: scratch_store_dword v2, v3, off +; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x4004, v0 +; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0x4004, v0 +; GFX10-NEXT: scratch_store_dword v1, v2, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -2338,15 +2330,14 @@ ; GFX1010-PAL-NEXT: s_addc_u32 s3, s3, 0 ; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 ; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 -; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, 0x4004 ; GFX1010-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX1010-PAL-NEXT: v_mov_b32_e32 v3, 15 +; GFX1010-PAL-NEXT: v_mov_b32_e32 v2, 15 ; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 -; GFX1010-PAL-NEXT: v_add_nc_u32_e32 v2, v1, v0 -; GFX1010-PAL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GFX1010-PAL-NEXT: scratch_load_dword v1, off, vcc_lo offset:4 glc dlc +; GFX1010-PAL-NEXT: scratch_load_dword v3, off, vcc_lo offset:4 glc dlc ; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX1010-PAL-NEXT: scratch_store_dword v2, v3, off +; GFX1010-PAL-NEXT: v_add_nc_u32_e32 v1, 0x4004, v0 +; GFX1010-PAL-NEXT: v_sub_nc_u32_e32 v0, 0x4004, v0 +; GFX1010-PAL-NEXT: scratch_store_dword v1, v2, off ; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1010-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc ; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) @@ -2363,14 +2354,13 @@ ; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0 ; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 ; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 -; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, 0x4004 ; GFX1030-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX1030-PAL-NEXT: v_mov_b32_e32 v3, 15 -; GFX1030-PAL-NEXT: v_add_nc_u32_e32 v2, v1, v0 -; GFX1030-PAL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GFX1030-PAL-NEXT: scratch_load_dword v1, off, off offset:4 glc dlc +; GFX1030-PAL-NEXT: v_mov_b32_e32 v2, 15 +; GFX1030-PAL-NEXT: scratch_load_dword v3, off, off offset:4 glc dlc ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX1030-PAL-NEXT: scratch_store_dword v2, v3, off +; GFX1030-PAL-NEXT: v_add_nc_u32_e32 v1, 0x4004, v0 +; GFX1030-PAL-NEXT: v_sub_nc_u32_e32 v0, 0x4004, v0 +; GFX1030-PAL-NEXT: scratch_store_dword v1, v2, off ; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1030-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) @@ -2415,15 +2405,15 @@ ; GFX10: ; %bb.0: ; %bb ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_and_b32_e32 v1, 15, v0 ; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x4004 -; GFX10-NEXT: v_and_b32_e32 v2, 15, v0 -; GFX10-NEXT: v_mov_b32_e32 v1, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v3, 15 -; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, v1 -; GFX10-NEXT: v_lshl_add_u32 v1, v2, 2, v1 -; GFX10-NEXT: scratch_load_dword v2, off, s32 offset:4 glc dlc +; GFX10-NEXT: v_mov_b32_e32 v2, 15 +; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, vcc_lo +; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x4004 +; GFX10-NEXT: scratch_load_dword v3, off, s32 offset:4 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: scratch_store_dword v0, v3, off +; GFX10-NEXT: v_lshl_add_u32 v1, v1, 2, vcc_lo +; GFX10-NEXT: scratch_store_dword v0, v2, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, v1, off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -2467,15 +2457,15 @@ ; GFX10-PAL: ; %bb.0: ; %bb ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-PAL-NEXT: v_and_b32_e32 v1, 15, v0 +; GFX10-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4004 +; GFX10-PAL-NEXT: v_mov_b32_e32 v2, 15 +; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, vcc_lo ; GFX10-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4004 -; GFX10-PAL-NEXT: v_and_b32_e32 v2, 15, v0 -; GFX10-PAL-NEXT: v_mov_b32_e32 v1, vcc_lo -; GFX10-PAL-NEXT: v_mov_b32_e32 v3, 15 -; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1 -; GFX10-PAL-NEXT: v_lshl_add_u32 v1, v2, 2, v1 -; GFX10-PAL-NEXT: scratch_load_dword v2, off, s32 offset:4 glc dlc +; GFX10-PAL-NEXT: scratch_load_dword v3, off, s32 offset:4 glc dlc ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX10-PAL-NEXT: scratch_store_dword v0, v3, off +; GFX10-PAL-NEXT: v_lshl_add_u32 v1, v1, 2, vcc_lo +; GFX10-PAL-NEXT: scratch_store_dword v0, v2, off ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-PAL-NEXT: scratch_load_dword v0, v1, off glc dlc ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/fmed3.ll b/llvm/test/CodeGen/AMDGPU/fmed3.ll --- a/llvm/test/CodeGen/AMDGPU/fmed3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmed3.ll @@ -970,7 +970,7 @@ ; FIXME: Simple stores do not work as a multiple use because they are bitcasted to integer constants. ; GCN-LABEL: {{^}}one_non_inline_constant: -; GCN-DAG: s_mov_b32 [[K1:s[0-9]+]], 0x41800000 +; GCN-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0x41800000 ; GCN-DAG: v_add_f32_e32 [[ADD:v[0-9]+]], 0.5, ; GCN: v_med3_f32 v{{[0-9]+}}, [[ADD]], 1.0, [[K1]] define amdgpu_kernel void @one_non_inline_constant(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 { @@ -990,9 +990,8 @@ } ; GCN-LABEL: {{^}}two_non_inline_constant_multi_use: -; GCN-DAG: s_mov_b32 [[K1:s[0-9]+]], 0x41800000 ; GCN-DAG: s_mov_b32 [[K0:s[0-9]+]], 0x41000000 -; GCN-DAG: v_mov_b32_e32 [[VK1:v[0-9]+]], [[K1]] +; GCN-DAG: v_mov_b32_e32 [[VK1:v[0-9]+]], 0x41800000 ; GCN-DAG: v_add_f32_e32 [[ADD:v[0-9]+]], 0.5, ; GCN: v_med3_f32 v{{[0-9]+}}, [[ADD]], [[K0]], [[VK1]] define amdgpu_kernel void @two_non_inline_constant_multi_use(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 { diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll --- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll @@ -97,9 +97,8 @@ } ; GCN-LABEL: {{^}}fneg_fabs_v4f16: -; GCN: s_mov_b32 [[MASK:s[0-9]+]], 0x80008000 -; GCN: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[MASK]] -; GCN: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[MASK]] +; GCN: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80008000 +; GCN: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80008000 ; GCN: {{flat|global}}_store_dwordx2 define amdgpu_kernel void @fneg_fabs_v4f16(<4 x half> addrspace(1)* %out, <4 x half> %in) { %fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %in) diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll --- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll @@ -69,10 +69,9 @@ } ; GCN-LABEL: {{^}}fneg_fabs_v2f64: -; GCN: s_brev_b32 [[IMMREG:s[0-9]+]], 1{{$}} ; GCN-NOT: 0x80000000 -; GCN: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]] -; GCN: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]] +; GCN: s_bitset1_b32 s{{[0-9]+}}, 31 +; GCN: s_bitset1_b32 s{{[0-9]+}}, 31 define amdgpu_kernel void @fneg_fabs_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) { %fabs = call <2 x double> @llvm.fabs.v2f64(<2 x double> %in) %fsub = fsub <2 x double> , %fabs @@ -81,12 +80,11 @@ } ; GCN-LABEL: {{^}}fneg_fabs_v4f64: -; GCN: s_brev_b32 [[IMMREG:s[0-9]+]], 1{{$}} ; GCN-NOT: 0x80000000 -; GCN: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]] -; GCN: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]] -; GCN: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]] -; GCN: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]] +; GCN: s_bitset1_b32 s{{[0-9]+}}, 31 +; GCN: s_bitset1_b32 s{{[0-9]+}}, 31 +; GCN: s_bitset1_b32 s{{[0-9]+}}, 31 +; GCN: s_bitset1_b32 s{{[0-9]+}}, 31 define amdgpu_kernel void @fneg_fabs_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) { %fabs = call <4 x double> @llvm.fabs.v4f64(<4 x double> %in) %fsub = fsub <4 x double> , %fabs diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll --- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll @@ -84,9 +84,8 @@ ; R600: -PV ; FIXME: In this case two uses of the constant should be folded -; SI: s_brev_b32 [[SIGNBITK:s[0-9]+]], 1{{$}} -; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[SIGNBITK]] -; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[SIGNBITK]] +; SI: s_bitset1_b32 s{{[0-9]+}}, 31 +; SI: s_bitset1_b32 s{{[0-9]+}}, 31 define amdgpu_kernel void @fneg_fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) { %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in) %fsub = fsub <2 x float> , %fabs @@ -95,11 +94,10 @@ } ; FUNC-LABEL: {{^}}fneg_fabs_v4f32: -; SI: s_brev_b32 [[SIGNBITK:s[0-9]+]], 1{{$}} -; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[SIGNBITK]] -; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[SIGNBITK]] -; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[SIGNBITK]] -; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[SIGNBITK]] +; SI: s_bitset1_b32 s{{[0-9]+}}, 31 +; SI: s_bitset1_b32 s{{[0-9]+}}, 31 +; SI: s_bitset1_b32 s{{[0-9]+}}, 31 +; SI: s_bitset1_b32 s{{[0-9]+}}, 31 define amdgpu_kernel void @fneg_fabs_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) { %fabs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %in) %fsub = fsub <4 x float> , %fabs diff --git a/llvm/test/CodeGen/AMDGPU/fneg.ll b/llvm/test/CodeGen/AMDGPU/fneg.ll --- a/llvm/test/CodeGen/AMDGPU/fneg.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg.ll @@ -18,9 +18,8 @@ ; R600: -PV ; R600: -PV -; GCN: s_brev_b32 [[SIGNBIT:s[0-9]+]], 1 -; GCN: s_xor_b32 -; GCN: s_xor_b32 +; GCN: s_xor_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x80000000 +; GCN: s_xor_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x80000000 define amdgpu_kernel void @s_fneg_v2f32(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) { %fneg = fsub <2 x float> , %in store <2 x float> %fneg, <2 x float> addrspace(1)* %out @@ -33,10 +32,10 @@ ; R600: -PV ; R600: -PV -; GCN: s_xor_b32 -; GCN: s_xor_b32 -; GCN: s_xor_b32 -; GCN: s_xor_b32 +; GCN: s_xor_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x80000000 +; GCN: s_xor_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x80000000 +; GCN: s_xor_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x80000000 +; GCN: s_xor_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x80000000 define amdgpu_kernel void @s_fneg_v4f32(<4 x float> addrspace(1)* nocapture %out, <4 x float> %in) { %fneg = fsub <4 x float> , %in store <4 x float> %fneg, <4 x float> addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/fold-immediate-operand-shrink-with-carry.mir b/llvm/test/CodeGen/AMDGPU/fold-immediate-operand-shrink-with-carry.mir --- a/llvm/test/CodeGen/AMDGPU/fold-immediate-operand-shrink-with-carry.mir +++ b/llvm/test/CodeGen/AMDGPU/fold-immediate-operand-shrink-with-carry.mir @@ -36,9 +36,10 @@ ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 12345 ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; GCN: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; GCN: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[S_MOV_B32_]], [[DEF]], 0, implicit $exec - ; GCN: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[S_MOV_B32_]], [[DEF1]], 0, implicit $exec - ; GCN: S_ENDPGM 0, implicit [[V_ADD_CO_U32_e64_1]], implicit [[V_ADD_CO_U32_e64_2]] + ; GCN: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[S_MOV_B32_]], [[DEF]], implicit-def $vcc, implicit $exec + ; GCN: [[COPY:%[0-9]+]]:sreg_64_xexec = COPY killed $vcc + ; GCN: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[S_MOV_B32_]], [[DEF1]], implicit-def $vcc, implicit $exec + ; GCN: S_ENDPGM 0, implicit [[COPY]], implicit [[V_ADD_CO_U32_e32_1]] %0:sreg_32_xm0 = S_MOV_B32 12345 %1:vgpr_32 = IMPLICIT_DEF %2:vgpr_32 = IMPLICIT_DEF diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll --- a/llvm/test/CodeGen/AMDGPU/frem.ll +++ b/llvm/test/CodeGen/AMDGPU/frem.ll @@ -1344,16 +1344,14 @@ ; SI-NEXT: v_div_scale_f32 v4, vcc, v0, v2, v0 ; SI-NEXT: v_div_scale_f32 v5, s[4:5], v2, v2, v0 ; SI-NEXT: v_rcp_f32_e32 v6, v5 -; SI-NEXT: s_mov_b32 s6, 3 -; SI-NEXT: s_mov_b32 s7, 0 -; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; SI-NEXT: v_fma_f32 v7, -v5, v6, 1.0 ; SI-NEXT: v_fma_f32 v6, v7, v6, v6 ; SI-NEXT: v_mul_f32_e32 v7, v4, v6 ; SI-NEXT: v_fma_f32 v8, -v5, v7, v4 ; SI-NEXT: v_fma_f32 v7, v8, v6, v7 ; SI-NEXT: v_fma_f32 v4, -v5, v7, v4 -; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; SI-NEXT: v_div_fmas_f32 v4, v4, v6, v7 ; SI-NEXT: v_div_fixup_f32 v4, v4, v2, v0 ; SI-NEXT: v_trunc_f32_e32 v4, v4 @@ -1364,14 +1362,14 @@ ; SI-NEXT: v_div_scale_f32 v2, vcc, v1, v3, v1 ; SI-NEXT: v_div_scale_f32 v4, s[4:5], v3, v3, v1 ; SI-NEXT: v_rcp_f32_e32 v5, v4 -; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; SI-NEXT: v_fma_f32 v6, -v4, v5, 1.0 ; SI-NEXT: v_fma_f32 v5, v6, v5, v5 ; SI-NEXT: v_mul_f32_e32 v6, v2, v5 ; SI-NEXT: v_fma_f32 v7, -v4, v6, v2 ; SI-NEXT: v_fma_f32 v6, v7, v5, v6 ; SI-NEXT: v_fma_f32 v2, -v4, v6, v2 -; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; SI-NEXT: v_div_fmas_f32 v2, v2, v5, v6 ; SI-NEXT: v_div_fixup_f32 v2, v2, v3, v1 ; SI-NEXT: v_trunc_f32_e32 v2, v2 @@ -1398,8 +1396,6 @@ ; CI-NEXT: s_mov_b32 s11, s3 ; CI-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; CI-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:16 -; CI-NEXT: s_mov_b32 s6, 3 -; CI-NEXT: s_mov_b32 s7, 0 ; CI-NEXT: s_waitcnt vmcnt(1) ; CI-NEXT: v_cvt_f32_f16_e32 v1, v0 ; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -1411,14 +1407,14 @@ ; CI-NEXT: v_div_scale_f32 v5, s[4:5], v2, v2, v0 ; CI-NEXT: v_div_scale_f32 v4, vcc, v0, v2, v0 ; CI-NEXT: v_rcp_f32_e32 v6, v5 -; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; CI-NEXT: v_fma_f32 v7, -v5, v6, 1.0 ; CI-NEXT: v_fma_f32 v6, v7, v6, v6 ; CI-NEXT: v_mul_f32_e32 v7, v4, v6 ; CI-NEXT: v_fma_f32 v8, -v5, v7, v4 ; CI-NEXT: v_fma_f32 v7, v8, v6, v7 ; CI-NEXT: v_fma_f32 v4, -v5, v7, v4 -; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; CI-NEXT: v_div_fmas_f32 v4, v4, v6, v7 ; CI-NEXT: v_div_fixup_f32 v4, v4, v2, v0 ; CI-NEXT: v_trunc_f32_e32 v4, v4 @@ -1429,14 +1425,14 @@ ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; CI-NEXT: v_rcp_f32_e32 v5, v4 -; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; CI-NEXT: v_fma_f32 v6, -v4, v5, 1.0 ; CI-NEXT: v_fma_f32 v5, v6, v5, v5 ; CI-NEXT: v_mul_f32_e32 v6, v2, v5 ; CI-NEXT: v_fma_f32 v7, -v4, v6, v2 ; CI-NEXT: v_fma_f32 v6, v7, v5, v6 ; CI-NEXT: v_fma_f32 v2, -v4, v6, v2 -; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; CI-NEXT: v_div_fmas_f32 v2, v2, v5, v6 ; CI-NEXT: v_div_fixup_f32 v2, v2, v3, v1 ; CI-NEXT: v_trunc_f32_e32 v2, v2 @@ -1595,16 +1591,14 @@ ; SI-NEXT: v_div_scale_f32 v8, vcc, v5, v1, v5 ; SI-NEXT: v_div_scale_f32 v9, s[4:5], v1, v1, v5 ; SI-NEXT: v_rcp_f32_e32 v10, v9 -; SI-NEXT: s_mov_b32 s6, 3 -; SI-NEXT: s_mov_b32 s7, 0 -; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; SI-NEXT: v_fma_f32 v11, -v9, v10, 1.0 ; SI-NEXT: v_fma_f32 v10, v11, v10, v10 ; SI-NEXT: v_mul_f32_e32 v11, v8, v10 ; SI-NEXT: v_fma_f32 v12, -v9, v11, v8 ; SI-NEXT: v_fma_f32 v11, v12, v10, v11 ; SI-NEXT: v_fma_f32 v8, -v9, v11, v8 -; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; SI-NEXT: v_div_fmas_f32 v8, v8, v10, v11 ; SI-NEXT: v_div_fixup_f32 v8, v8, v1, v5 ; SI-NEXT: v_trunc_f32_e32 v8, v8 @@ -1615,14 +1609,14 @@ ; SI-NEXT: v_div_scale_f32 v5, vcc, v4, v7, v4 ; SI-NEXT: v_div_scale_f32 v8, s[4:5], v7, v7, v4 ; SI-NEXT: v_rcp_f32_e32 v9, v8 -; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; SI-NEXT: v_fma_f32 v10, -v8, v9, 1.0 ; SI-NEXT: v_fma_f32 v9, v10, v9, v9 ; SI-NEXT: v_mul_f32_e32 v10, v5, v9 ; SI-NEXT: v_fma_f32 v11, -v8, v10, v5 ; SI-NEXT: v_fma_f32 v10, v11, v9, v10 ; SI-NEXT: v_fma_f32 v5, -v8, v10, v5 -; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; SI-NEXT: v_div_fmas_f32 v5, v5, v9, v10 ; SI-NEXT: v_div_fixup_f32 v5, v5, v7, v4 ; SI-NEXT: v_trunc_f32_e32 v5, v5 @@ -1632,14 +1626,14 @@ ; SI-NEXT: v_div_scale_f32 v4, vcc, v3, v0, v3 ; SI-NEXT: v_div_scale_f32 v5, s[4:5], v0, v0, v3 ; SI-NEXT: v_rcp_f32_e32 v7, v5 -; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; SI-NEXT: v_fma_f32 v8, -v5, v7, 1.0 ; SI-NEXT: v_fma_f32 v7, v8, v7, v7 ; SI-NEXT: v_mul_f32_e32 v8, v4, v7 ; SI-NEXT: v_fma_f32 v9, -v5, v8, v4 ; SI-NEXT: v_fma_f32 v8, v9, v7, v8 ; SI-NEXT: v_fma_f32 v4, -v5, v8, v4 -; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; SI-NEXT: v_div_fmas_f32 v4, v4, v7, v8 ; SI-NEXT: v_div_fixup_f32 v4, v4, v0, v3 ; SI-NEXT: v_trunc_f32_e32 v4, v4 @@ -1649,14 +1643,14 @@ ; SI-NEXT: v_div_scale_f32 v3, vcc, v2, v6, v2 ; SI-NEXT: v_div_scale_f32 v4, s[4:5], v6, v6, v2 ; SI-NEXT: v_rcp_f32_e32 v5, v4 -; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; SI-NEXT: v_fma_f32 v7, -v4, v5, 1.0 ; SI-NEXT: v_fma_f32 v5, v7, v5, v5 ; SI-NEXT: v_mul_f32_e32 v7, v3, v5 ; SI-NEXT: v_fma_f32 v8, -v4, v7, v3 ; SI-NEXT: v_fma_f32 v7, v8, v5, v7 ; SI-NEXT: v_fma_f32 v3, -v4, v7, v3 -; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; SI-NEXT: v_div_fmas_f32 v3, v3, v5, v7 ; SI-NEXT: v_div_fixup_f32 v3, v3, v6, v2 ; SI-NEXT: v_trunc_f32_e32 v3, v3 @@ -1682,8 +1676,6 @@ ; CI-NEXT: s_mov_b32 s7, s3 ; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 ; CI-NEXT: s_mov_b32 s11, s3 -; CI-NEXT: s_mov_b32 s6, 3 -; CI-NEXT: s_mov_b32 s7, 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e32 v2, v0 ; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -1702,14 +1694,14 @@ ; CI-NEXT: v_div_scale_f32 v9, s[4:5], v1, v1, v5 ; CI-NEXT: v_div_scale_f32 v8, vcc, v5, v1, v5 ; CI-NEXT: v_rcp_f32_e32 v10, v9 -; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; CI-NEXT: v_fma_f32 v11, -v9, v10, 1.0 ; CI-NEXT: v_fma_f32 v10, v11, v10, v10 ; CI-NEXT: v_mul_f32_e32 v11, v8, v10 ; CI-NEXT: v_fma_f32 v12, -v9, v11, v8 ; CI-NEXT: v_fma_f32 v11, v12, v10, v11 ; CI-NEXT: v_fma_f32 v8, -v9, v11, v8 -; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; CI-NEXT: v_div_fmas_f32 v8, v8, v10, v11 ; CI-NEXT: v_div_fixup_f32 v8, v8, v1, v5 ; CI-NEXT: v_trunc_f32_e32 v8, v8 @@ -1720,14 +1712,14 @@ ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; CI-NEXT: v_rcp_f32_e32 v9, v8 -; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; CI-NEXT: v_fma_f32 v10, -v8, v9, 1.0 ; CI-NEXT: v_fma_f32 v9, v10, v9, v9 ; CI-NEXT: v_mul_f32_e32 v10, v5, v9 ; CI-NEXT: v_fma_f32 v11, -v8, v10, v5 ; CI-NEXT: v_fma_f32 v10, v11, v9, v10 ; CI-NEXT: v_fma_f32 v5, -v8, v10, v5 -; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; CI-NEXT: v_div_fmas_f32 v5, v5, v9, v10 ; CI-NEXT: v_div_fixup_f32 v5, v5, v7, v4 ; CI-NEXT: v_trunc_f32_e32 v5, v5 @@ -1737,14 +1729,14 @@ ; CI-NEXT: v_or_b32_e32 v1, v4, v1 ; CI-NEXT: v_div_scale_f32 v4, vcc, v3, v0, v3 ; CI-NEXT: v_rcp_f32_e32 v7, v5 -; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; CI-NEXT: v_fma_f32 v8, -v5, v7, 1.0 ; CI-NEXT: v_fma_f32 v7, v8, v7, v7 ; CI-NEXT: v_mul_f32_e32 v8, v4, v7 ; CI-NEXT: v_fma_f32 v9, -v5, v8, v4 ; CI-NEXT: v_fma_f32 v8, v9, v7, v8 ; CI-NEXT: v_fma_f32 v4, -v5, v8, v4 -; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; CI-NEXT: v_div_fmas_f32 v4, v4, v7, v8 ; CI-NEXT: v_div_fixup_f32 v4, v4, v0, v3 ; CI-NEXT: v_trunc_f32_e32 v4, v4 @@ -1754,14 +1746,14 @@ ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; CI-NEXT: v_rcp_f32_e32 v5, v4 -; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; CI-NEXT: v_fma_f32 v7, -v4, v5, 1.0 ; CI-NEXT: v_fma_f32 v5, v7, v5, v5 ; CI-NEXT: v_mul_f32_e32 v7, v3, v5 ; CI-NEXT: v_fma_f32 v8, -v4, v7, v3 ; CI-NEXT: v_fma_f32 v7, v8, v5, v7 ; CI-NEXT: v_fma_f32 v3, -v4, v7, v3 -; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; CI-NEXT: v_div_fmas_f32 v3, v3, v5, v7 ; CI-NEXT: v_div_fixup_f32 v3, v3, v6, v2 ; CI-NEXT: v_trunc_f32_e32 v3, v3 @@ -1965,16 +1957,14 @@ ; SI-NEXT: v_div_scale_f32 v4, vcc, v1, v3, v1 ; SI-NEXT: v_div_scale_f32 v5, s[4:5], v3, v3, v1 ; SI-NEXT: v_rcp_f32_e32 v6, v5 -; SI-NEXT: s_mov_b32 s6, 3 -; SI-NEXT: s_mov_b32 s7, 0 -; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; SI-NEXT: v_fma_f32 v7, -v5, v6, 1.0 ; SI-NEXT: v_fma_f32 v6, v7, v6, v6 ; SI-NEXT: v_mul_f32_e32 v7, v4, v6 ; SI-NEXT: v_fma_f32 v8, -v5, v7, v4 ; SI-NEXT: v_fma_f32 v7, v8, v6, v7 ; SI-NEXT: v_fma_f32 v4, -v5, v7, v4 -; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; SI-NEXT: v_div_fmas_f32 v4, v4, v6, v7 ; SI-NEXT: v_div_fixup_f32 v4, v4, v3, v1 ; SI-NEXT: v_trunc_f32_e32 v4, v4 @@ -1982,14 +1972,14 @@ ; SI-NEXT: v_div_scale_f32 v3, vcc, v0, v2, v0 ; SI-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0 ; SI-NEXT: v_rcp_f32_e32 v5, v4 -; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; SI-NEXT: v_fma_f32 v6, -v4, v5, 1.0 ; SI-NEXT: v_fma_f32 v5, v6, v5, v5 ; SI-NEXT: v_mul_f32_e32 v6, v3, v5 ; SI-NEXT: v_fma_f32 v7, -v4, v6, v3 ; SI-NEXT: v_fma_f32 v6, v7, v5, v6 ; SI-NEXT: v_fma_f32 v3, -v4, v6, v3 -; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; SI-NEXT: v_div_fmas_f32 v3, v3, v5, v6 ; SI-NEXT: v_div_fixup_f32 v3, v3, v2, v0 ; SI-NEXT: v_trunc_f32_e32 v3, v3 @@ -2014,20 +2004,18 @@ ; CI-NEXT: s_mov_b32 s11, s3 ; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 ; CI-NEXT: buffer_load_dwordx2 v[2:3], off, s[8:11], 0 offset:32 -; CI-NEXT: s_mov_b32 s6, 3 -; CI-NEXT: s_mov_b32 s7, 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_div_scale_f32 v5, s[4:5], v3, v3, v1 ; CI-NEXT: v_div_scale_f32 v4, vcc, v1, v3, v1 ; CI-NEXT: v_rcp_f32_e32 v6, v5 -; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; CI-NEXT: v_fma_f32 v7, -v5, v6, 1.0 ; CI-NEXT: v_fma_f32 v6, v7, v6, v6 ; CI-NEXT: v_mul_f32_e32 v7, v4, v6 ; CI-NEXT: v_fma_f32 v8, -v5, v7, v4 ; CI-NEXT: v_fma_f32 v7, v8, v6, v7 ; CI-NEXT: v_fma_f32 v4, -v5, v7, v4 -; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; CI-NEXT: v_div_fmas_f32 v4, v4, v6, v7 ; CI-NEXT: v_div_fixup_f32 v4, v4, v3, v1 ; CI-NEXT: v_trunc_f32_e32 v4, v4 @@ -2035,14 +2023,14 @@ ; CI-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0 ; CI-NEXT: v_div_scale_f32 v3, vcc, v0, v2, v0 ; CI-NEXT: v_rcp_f32_e32 v5, v4 -; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; CI-NEXT: v_fma_f32 v6, -v4, v5, 1.0 ; CI-NEXT: v_fma_f32 v5, v6, v5, v5 ; CI-NEXT: v_mul_f32_e32 v6, v3, v5 ; CI-NEXT: v_fma_f32 v7, -v4, v6, v3 ; CI-NEXT: v_fma_f32 v6, v7, v5, v6 ; CI-NEXT: v_fma_f32 v3, -v4, v6, v3 -; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; CI-NEXT: v_div_fmas_f32 v3, v3, v5, v6 ; CI-NEXT: v_div_fixup_f32 v3, v3, v2, v0 ; CI-NEXT: v_trunc_f32_e32 v3, v3 @@ -2054,8 +2042,6 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s2, 3 -; VI-NEXT: s_mov_b32 s3, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: s_add_u32 s0, s0, 32 @@ -2071,14 +2057,14 @@ ; VI-NEXT: v_div_scale_f32 v7, s[0:1], v5, v5, v3 ; VI-NEXT: v_div_scale_f32 v6, vcc, v3, v5, v3 ; VI-NEXT: v_rcp_f32_e32 v8, v7 -; VI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; VI-NEXT: v_fma_f32 v9, -v7, v8, 1.0 ; VI-NEXT: v_fma_f32 v8, v9, v8, v8 ; VI-NEXT: v_mul_f32_e32 v9, v6, v8 ; VI-NEXT: v_fma_f32 v10, -v7, v9, v6 ; VI-NEXT: v_fma_f32 v9, v10, v8, v9 ; VI-NEXT: v_fma_f32 v6, -v7, v9, v6 -; VI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; VI-NEXT: v_div_fmas_f32 v6, v6, v8, v9 ; VI-NEXT: v_div_fixup_f32 v6, v6, v5, v3 ; VI-NEXT: v_trunc_f32_e32 v6, v6 @@ -2086,14 +2072,14 @@ ; VI-NEXT: v_div_scale_f32 v6, s[0:1], v4, v4, v2 ; VI-NEXT: v_div_scale_f32 v5, vcc, v2, v4, v2 ; VI-NEXT: v_rcp_f32_e32 v7, v6 -; VI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; VI-NEXT: v_fma_f32 v8, -v6, v7, 1.0 ; VI-NEXT: v_fma_f32 v7, v8, v7, v7 ; VI-NEXT: v_mul_f32_e32 v8, v5, v7 ; VI-NEXT: v_fma_f32 v9, -v6, v8, v5 ; VI-NEXT: v_fma_f32 v8, v9, v7, v8 ; VI-NEXT: v_fma_f32 v5, -v6, v8, v5 -; VI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; VI-NEXT: v_div_fmas_f32 v5, v5, v7, v8 ; VI-NEXT: v_div_fixup_f32 v5, v5, v4, v2 ; VI-NEXT: v_trunc_f32_e32 v5, v5 @@ -2109,20 +2095,18 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] ; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:32 -; GFX9-NEXT: s_mov_b32 s2, 3 -; GFX9-NEXT: s_mov_b32 s3, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_div_scale_f32 v6, s[0:1], v3, v3, v1 ; GFX9-NEXT: v_div_scale_f32 v5, vcc, v1, v3, v1 ; GFX9-NEXT: v_rcp_f32_e32 v7, v6 -; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2 +; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX9-NEXT: v_fma_f32 v8, -v6, v7, 1.0 ; GFX9-NEXT: v_fma_f32 v7, v8, v7, v7 ; GFX9-NEXT: v_mul_f32_e32 v8, v5, v7 ; GFX9-NEXT: v_fma_f32 v9, -v6, v8, v5 ; GFX9-NEXT: v_fma_f32 v8, v9, v7, v8 ; GFX9-NEXT: v_fma_f32 v5, -v6, v8, v5 -; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3 +; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GFX9-NEXT: v_div_fmas_f32 v5, v5, v7, v8 ; GFX9-NEXT: v_div_fixup_f32 v5, v5, v3, v1 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5 @@ -2130,14 +2114,14 @@ ; GFX9-NEXT: v_div_scale_f32 v5, s[0:1], v2, v2, v0 ; GFX9-NEXT: v_div_scale_f32 v3, vcc, v0, v2, v0 ; GFX9-NEXT: v_rcp_f32_e32 v6, v5 -; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2 +; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX9-NEXT: v_fma_f32 v7, -v5, v6, 1.0 ; GFX9-NEXT: v_fma_f32 v6, v7, v6, v6 ; GFX9-NEXT: v_mul_f32_e32 v7, v3, v6 ; GFX9-NEXT: v_fma_f32 v8, -v5, v7, v3 ; GFX9-NEXT: v_fma_f32 v7, v8, v6, v7 ; GFX9-NEXT: v_fma_f32 v3, -v5, v7, v3 -; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3 +; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GFX9-NEXT: v_div_fmas_f32 v3, v3, v6, v7 ; GFX9-NEXT: v_div_fixup_f32 v3, v3, v2, v0 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3 @@ -2219,16 +2203,14 @@ ; SI-NEXT: v_div_scale_f32 v8, vcc, v3, v7, v3 ; SI-NEXT: v_div_scale_f32 v9, s[4:5], v7, v7, v3 ; SI-NEXT: v_rcp_f32_e32 v10, v9 -; SI-NEXT: s_mov_b32 s6, 3 -; SI-NEXT: s_mov_b32 s7, 0 -; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; SI-NEXT: v_fma_f32 v11, -v9, v10, 1.0 ; SI-NEXT: v_fma_f32 v10, v11, v10, v10 ; SI-NEXT: v_mul_f32_e32 v11, v8, v10 ; SI-NEXT: v_fma_f32 v12, -v9, v11, v8 ; SI-NEXT: v_fma_f32 v11, v12, v10, v11 ; SI-NEXT: v_fma_f32 v8, -v9, v11, v8 -; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; SI-NEXT: v_div_fmas_f32 v8, v8, v10, v11 ; SI-NEXT: v_div_fixup_f32 v8, v8, v7, v3 ; SI-NEXT: v_trunc_f32_e32 v8, v8 @@ -2236,14 +2218,14 @@ ; SI-NEXT: v_div_scale_f32 v7, vcc, v2, v6, v2 ; SI-NEXT: v_div_scale_f32 v8, s[4:5], v6, v6, v2 ; SI-NEXT: v_rcp_f32_e32 v9, v8 -; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; SI-NEXT: v_fma_f32 v10, -v8, v9, 1.0 ; SI-NEXT: v_fma_f32 v9, v10, v9, v9 ; SI-NEXT: v_mul_f32_e32 v10, v7, v9 ; SI-NEXT: v_fma_f32 v11, -v8, v10, v7 ; SI-NEXT: v_fma_f32 v10, v11, v9, v10 ; SI-NEXT: v_fma_f32 v7, -v8, v10, v7 -; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; SI-NEXT: v_div_fmas_f32 v7, v7, v9, v10 ; SI-NEXT: v_div_fixup_f32 v7, v7, v6, v2 ; SI-NEXT: v_trunc_f32_e32 v7, v7 @@ -2251,14 +2233,14 @@ ; SI-NEXT: v_div_scale_f32 v6, vcc, v1, v5, v1 ; SI-NEXT: v_div_scale_f32 v7, s[4:5], v5, v5, v1 ; SI-NEXT: v_rcp_f32_e32 v8, v7 -; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; SI-NEXT: v_fma_f32 v9, -v7, v8, 1.0 ; SI-NEXT: v_fma_f32 v8, v9, v8, v8 ; SI-NEXT: v_mul_f32_e32 v9, v6, v8 ; SI-NEXT: v_fma_f32 v10, -v7, v9, v6 ; SI-NEXT: v_fma_f32 v9, v10, v8, v9 ; SI-NEXT: v_fma_f32 v6, -v7, v9, v6 -; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; SI-NEXT: v_div_fmas_f32 v6, v6, v8, v9 ; SI-NEXT: v_div_fixup_f32 v6, v6, v5, v1 ; SI-NEXT: v_trunc_f32_e32 v6, v6 @@ -2266,14 +2248,14 @@ ; SI-NEXT: v_div_scale_f32 v5, vcc, v0, v4, v0 ; SI-NEXT: v_div_scale_f32 v6, s[4:5], v4, v4, v0 ; SI-NEXT: v_rcp_f32_e32 v7, v6 -; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; SI-NEXT: v_fma_f32 v8, -v6, v7, 1.0 ; SI-NEXT: v_fma_f32 v7, v8, v7, v7 ; SI-NEXT: v_mul_f32_e32 v8, v5, v7 ; SI-NEXT: v_fma_f32 v9, -v6, v8, v5 ; SI-NEXT: v_fma_f32 v8, v9, v7, v8 ; SI-NEXT: v_fma_f32 v5, -v6, v8, v5 -; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; SI-NEXT: v_div_fmas_f32 v5, v5, v7, v8 ; SI-NEXT: v_div_fixup_f32 v5, v5, v4, v0 ; SI-NEXT: v_trunc_f32_e32 v5, v5 @@ -2298,20 +2280,18 @@ ; CI-NEXT: s_mov_b32 s11, s3 ; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 ; CI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:64 -; CI-NEXT: s_mov_b32 s6, 3 -; CI-NEXT: s_mov_b32 s7, 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_div_scale_f32 v9, s[4:5], v7, v7, v3 ; CI-NEXT: v_div_scale_f32 v8, vcc, v3, v7, v3 ; CI-NEXT: v_rcp_f32_e32 v10, v9 -; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; CI-NEXT: v_fma_f32 v11, -v9, v10, 1.0 ; CI-NEXT: v_fma_f32 v10, v11, v10, v10 ; CI-NEXT: v_mul_f32_e32 v11, v8, v10 ; CI-NEXT: v_fma_f32 v12, -v9, v11, v8 ; CI-NEXT: v_fma_f32 v11, v12, v10, v11 ; CI-NEXT: v_fma_f32 v8, -v9, v11, v8 -; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; CI-NEXT: v_div_fmas_f32 v8, v8, v10, v11 ; CI-NEXT: v_div_fixup_f32 v8, v8, v7, v3 ; CI-NEXT: v_trunc_f32_e32 v8, v8 @@ -2319,14 +2299,14 @@ ; CI-NEXT: v_div_scale_f32 v8, s[4:5], v6, v6, v2 ; CI-NEXT: v_div_scale_f32 v7, vcc, v2, v6, v2 ; CI-NEXT: v_rcp_f32_e32 v9, v8 -; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; CI-NEXT: v_fma_f32 v10, -v8, v9, 1.0 ; CI-NEXT: v_fma_f32 v9, v10, v9, v9 ; CI-NEXT: v_mul_f32_e32 v10, v7, v9 ; CI-NEXT: v_fma_f32 v11, -v8, v10, v7 ; CI-NEXT: v_fma_f32 v10, v11, v9, v10 ; CI-NEXT: v_fma_f32 v7, -v8, v10, v7 -; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; CI-NEXT: v_div_fmas_f32 v7, v7, v9, v10 ; CI-NEXT: v_div_fixup_f32 v7, v7, v6, v2 ; CI-NEXT: v_trunc_f32_e32 v7, v7 @@ -2334,14 +2314,14 @@ ; CI-NEXT: v_div_scale_f32 v7, s[4:5], v5, v5, v1 ; CI-NEXT: v_div_scale_f32 v6, vcc, v1, v5, v1 ; CI-NEXT: v_rcp_f32_e32 v8, v7 -; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; CI-NEXT: v_fma_f32 v9, -v7, v8, 1.0 ; CI-NEXT: v_fma_f32 v8, v9, v8, v8 ; CI-NEXT: v_mul_f32_e32 v9, v6, v8 ; CI-NEXT: v_fma_f32 v10, -v7, v9, v6 ; CI-NEXT: v_fma_f32 v9, v10, v8, v9 ; CI-NEXT: v_fma_f32 v6, -v7, v9, v6 -; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; CI-NEXT: v_div_fmas_f32 v6, v6, v8, v9 ; CI-NEXT: v_div_fixup_f32 v6, v6, v5, v1 ; CI-NEXT: v_trunc_f32_e32 v6, v6 @@ -2349,14 +2329,14 @@ ; CI-NEXT: v_div_scale_f32 v6, s[4:5], v4, v4, v0 ; CI-NEXT: v_div_scale_f32 v5, vcc, v0, v4, v0 ; CI-NEXT: v_rcp_f32_e32 v7, v6 -; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; CI-NEXT: v_fma_f32 v8, -v6, v7, 1.0 ; CI-NEXT: v_fma_f32 v7, v8, v7, v7 ; CI-NEXT: v_mul_f32_e32 v8, v5, v7 ; CI-NEXT: v_fma_f32 v9, -v6, v8, v5 ; CI-NEXT: v_fma_f32 v8, v9, v7, v8 ; CI-NEXT: v_fma_f32 v5, -v6, v8, v5 -; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; CI-NEXT: v_div_fmas_f32 v5, v5, v7, v8 ; CI-NEXT: v_div_fixup_f32 v5, v5, v4, v0 ; CI-NEXT: v_trunc_f32_e32 v5, v5 @@ -2368,8 +2348,6 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s2, 3 -; VI-NEXT: s_mov_b32 s3, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_add_u32 s0, s0, 64 @@ -2385,14 +2363,14 @@ ; VI-NEXT: v_div_scale_f32 v11, s[0:1], v7, v7, v3 ; VI-NEXT: v_div_scale_f32 v10, vcc, v3, v7, v3 ; VI-NEXT: v_rcp_f32_e32 v12, v11 -; VI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; VI-NEXT: v_fma_f32 v13, -v11, v12, 1.0 ; VI-NEXT: v_fma_f32 v12, v13, v12, v12 ; VI-NEXT: v_mul_f32_e32 v13, v10, v12 ; VI-NEXT: v_fma_f32 v14, -v11, v13, v10 ; VI-NEXT: v_fma_f32 v13, v14, v12, v13 ; VI-NEXT: v_fma_f32 v10, -v11, v13, v10 -; VI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; VI-NEXT: v_div_fmas_f32 v10, v10, v12, v13 ; VI-NEXT: v_div_fixup_f32 v10, v10, v7, v3 ; VI-NEXT: v_trunc_f32_e32 v10, v10 @@ -2400,14 +2378,14 @@ ; VI-NEXT: v_div_scale_f32 v10, s[0:1], v6, v6, v2 ; VI-NEXT: v_div_scale_f32 v7, vcc, v2, v6, v2 ; VI-NEXT: v_rcp_f32_e32 v11, v10 -; VI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; VI-NEXT: v_fma_f32 v12, -v10, v11, 1.0 ; VI-NEXT: v_fma_f32 v11, v12, v11, v11 ; VI-NEXT: v_mul_f32_e32 v12, v7, v11 ; VI-NEXT: v_fma_f32 v13, -v10, v12, v7 ; VI-NEXT: v_fma_f32 v12, v13, v11, v12 ; VI-NEXT: v_fma_f32 v7, -v10, v12, v7 -; VI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; VI-NEXT: v_div_fmas_f32 v7, v7, v11, v12 ; VI-NEXT: v_div_fixup_f32 v7, v7, v6, v2 ; VI-NEXT: v_trunc_f32_e32 v7, v7 @@ -2415,14 +2393,14 @@ ; VI-NEXT: v_div_scale_f32 v7, s[0:1], v5, v5, v1 ; VI-NEXT: v_div_scale_f32 v6, vcc, v1, v5, v1 ; VI-NEXT: v_rcp_f32_e32 v10, v7 -; VI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; VI-NEXT: v_fma_f32 v11, -v7, v10, 1.0 ; VI-NEXT: v_fma_f32 v10, v11, v10, v10 ; VI-NEXT: v_mul_f32_e32 v11, v6, v10 ; VI-NEXT: v_fma_f32 v12, -v7, v11, v6 ; VI-NEXT: v_fma_f32 v11, v12, v10, v11 ; VI-NEXT: v_fma_f32 v6, -v7, v11, v6 -; VI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; VI-NEXT: v_div_fmas_f32 v6, v6, v10, v11 ; VI-NEXT: v_div_fixup_f32 v6, v6, v5, v1 ; VI-NEXT: v_trunc_f32_e32 v6, v6 @@ -2430,14 +2408,14 @@ ; VI-NEXT: v_div_scale_f32 v6, s[0:1], v4, v4, v0 ; VI-NEXT: v_div_scale_f32 v5, vcc, v0, v4, v0 ; VI-NEXT: v_rcp_f32_e32 v7, v6 -; VI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; VI-NEXT: v_fma_f32 v10, -v6, v7, 1.0 ; VI-NEXT: v_fma_f32 v7, v10, v7, v7 ; VI-NEXT: v_mul_f32_e32 v10, v5, v7 ; VI-NEXT: v_fma_f32 v11, -v6, v10, v5 ; VI-NEXT: v_fma_f32 v10, v11, v7, v10 ; VI-NEXT: v_fma_f32 v5, -v6, v10, v5 -; VI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; VI-NEXT: v_div_fmas_f32 v5, v5, v7, v10 ; VI-NEXT: v_div_fixup_f32 v5, v5, v4, v0 ; VI-NEXT: v_trunc_f32_e32 v5, v5 @@ -2453,20 +2431,18 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[6:7] ; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:64 -; GFX9-NEXT: s_mov_b32 s2, 3 -; GFX9-NEXT: s_mov_b32 s3, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_div_scale_f32 v10, s[0:1], v7, v7, v3 ; GFX9-NEXT: v_div_scale_f32 v9, vcc, v3, v7, v3 ; GFX9-NEXT: v_rcp_f32_e32 v11, v10 -; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2 +; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX9-NEXT: v_fma_f32 v12, -v10, v11, 1.0 ; GFX9-NEXT: v_fma_f32 v11, v12, v11, v11 ; GFX9-NEXT: v_mul_f32_e32 v12, v9, v11 ; GFX9-NEXT: v_fma_f32 v13, -v10, v12, v9 ; GFX9-NEXT: v_fma_f32 v12, v13, v11, v12 ; GFX9-NEXT: v_fma_f32 v9, -v10, v12, v9 -; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3 +; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GFX9-NEXT: v_div_fmas_f32 v9, v9, v11, v12 ; GFX9-NEXT: v_div_fixup_f32 v9, v9, v7, v3 ; GFX9-NEXT: v_trunc_f32_e32 v9, v9 @@ -2474,14 +2450,14 @@ ; GFX9-NEXT: v_div_scale_f32 v9, s[0:1], v6, v6, v2 ; GFX9-NEXT: v_div_scale_f32 v7, vcc, v2, v6, v2 ; GFX9-NEXT: v_rcp_f32_e32 v10, v9 -; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2 +; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX9-NEXT: v_fma_f32 v11, -v9, v10, 1.0 ; GFX9-NEXT: v_fma_f32 v10, v11, v10, v10 ; GFX9-NEXT: v_mul_f32_e32 v11, v7, v10 ; GFX9-NEXT: v_fma_f32 v12, -v9, v11, v7 ; GFX9-NEXT: v_fma_f32 v11, v12, v10, v11 ; GFX9-NEXT: v_fma_f32 v7, -v9, v11, v7 -; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3 +; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GFX9-NEXT: v_div_fmas_f32 v7, v7, v10, v11 ; GFX9-NEXT: v_div_fixup_f32 v7, v7, v6, v2 ; GFX9-NEXT: v_trunc_f32_e32 v7, v7 @@ -2489,14 +2465,14 @@ ; GFX9-NEXT: v_div_scale_f32 v7, s[0:1], v5, v5, v1 ; GFX9-NEXT: v_div_scale_f32 v6, vcc, v1, v5, v1 ; GFX9-NEXT: v_rcp_f32_e32 v9, v7 -; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2 +; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX9-NEXT: v_fma_f32 v10, -v7, v9, 1.0 ; GFX9-NEXT: v_fma_f32 v9, v10, v9, v9 ; GFX9-NEXT: v_mul_f32_e32 v10, v6, v9 ; GFX9-NEXT: v_fma_f32 v11, -v7, v10, v6 ; GFX9-NEXT: v_fma_f32 v10, v11, v9, v10 ; GFX9-NEXT: v_fma_f32 v6, -v7, v10, v6 -; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3 +; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GFX9-NEXT: v_div_fmas_f32 v6, v6, v9, v10 ; GFX9-NEXT: v_div_fixup_f32 v6, v6, v5, v1 ; GFX9-NEXT: v_trunc_f32_e32 v6, v6 @@ -2504,14 +2480,14 @@ ; GFX9-NEXT: v_div_scale_f32 v6, s[0:1], v4, v4, v0 ; GFX9-NEXT: v_div_scale_f32 v5, vcc, v0, v4, v0 ; GFX9-NEXT: v_rcp_f32_e32 v7, v6 -; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2 +; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX9-NEXT: v_fma_f32 v9, -v6, v7, 1.0 ; GFX9-NEXT: v_fma_f32 v7, v9, v7, v7 ; GFX9-NEXT: v_mul_f32_e32 v9, v5, v7 ; GFX9-NEXT: v_fma_f32 v10, -v6, v9, v5 ; GFX9-NEXT: v_fma_f32 v9, v10, v7, v9 ; GFX9-NEXT: v_fma_f32 v5, -v6, v9, v5 -; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3 +; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GFX9-NEXT: v_div_fmas_f32 v5, v5, v7, v9 ; GFX9-NEXT: v_div_fixup_f32 v5, v5, v4, v0 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5 @@ -2636,16 +2612,15 @@ ; SI-NEXT: v_div_fmas_f64 v[8:9], v[16:17], v[10:11], v[14:15] ; SI-NEXT: v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3] ; SI-NEXT: v_bfe_u32 v10, v9, 20, 11 -; SI-NEXT: s_movk_i32 s8, 0xfc01 -; SI-NEXT: v_add_i32_e32 v12, vcc, s8, v10 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0xfffffc01, v10 ; SI-NEXT: s_mov_b32 s3, 0xfffff ; SI-NEXT: v_lshr_b64 v[10:11], s[2:3], v12 ; SI-NEXT: v_not_b32_e32 v10, v10 ; SI-NEXT: v_and_b32_e32 v10, v8, v10 ; SI-NEXT: v_not_b32_e32 v11, v11 ; SI-NEXT: v_and_b32_e32 v11, v9, v11 -; SI-NEXT: s_brev_b32 s9, 1 -; SI-NEXT: v_and_b32_e32 v13, s9, v9 +; SI-NEXT: s_brev_b32 s8, 1 +; SI-NEXT: v_and_b32_e32 v13, s8, v9 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v12 ; SI-NEXT: v_cndmask_b32_e32 v11, v11, v13, vcc ; SI-NEXT: v_cmp_lt_i32_e64 s[0:1], 51, v12 @@ -2669,13 +2644,13 @@ ; SI-NEXT: v_div_fmas_f64 v[6:7], v[14:15], v[8:9], v[12:13] ; SI-NEXT: v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1] ; SI-NEXT: v_bfe_u32 v8, v7, 20, 11 -; SI-NEXT: v_add_i32_e32 v10, vcc, s8, v8 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0xfffffc01, v8 ; SI-NEXT: v_lshr_b64 v[8:9], s[2:3], v10 ; SI-NEXT: v_not_b32_e32 v8, v8 ; SI-NEXT: v_and_b32_e32 v8, v6, v8 ; SI-NEXT: v_not_b32_e32 v9, v9 ; SI-NEXT: v_and_b32_e32 v9, v7, v9 -; SI-NEXT: v_and_b32_e32 v11, s9, v7 +; SI-NEXT: v_and_b32_e32 v11, s8, v7 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v10 ; SI-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc ; SI-NEXT: v_cmp_lt_i32_e64 s[0:1], 51, v10 diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll --- a/llvm/test/CodeGen/AMDGPU/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/fshr.ll @@ -716,10 +716,9 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_xor_b32_e32 v3, -1, v2 -; GFX10-NEXT: s_mov_b32 s4, 0xf000f ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1] -; GFX10-NEXT: v_and_b32_e32 v2, s4, v2 -; GFX10-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX10-NEXT: v_and_b32_e32 v2, 0xf000f, v2 +; GFX10-NEXT: v_and_b32_e32 v3, 0xf000f, v3 ; GFX10-NEXT: v_pk_lshrrev_b16 v1, v2, v1 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, v3, v0 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 @@ -929,37 +928,36 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v3 ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v3 -; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v4 -; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v0 -; GFX10-NEXT: v_lshlrev_b16 v6, 1, v6 -; GFX10-NEXT: v_xor_b32_e32 v9, -1, v7 -; GFX10-NEXT: v_lshrrev_b16 v7, v7, v8 -; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0 -; GFX10-NEXT: v_xor_b32_e32 v8, -1, v4 +; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v4 +; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v0 ; GFX10-NEXT: v_lshlrev_b16 v1, 1, v1 -; GFX10-NEXT: v_lshlrev_b16 v6, v9, v6 -; GFX10-NEXT: v_xor_b32_e32 v9, -1, v5 -; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; GFX10-NEXT: v_lshlrev_b16 v12, 1, v12 -; GFX10-NEXT: v_xor_b32_e32 v13, -1, v11 -; GFX10-NEXT: v_lshlrev_b16 v0, v8, v0 +; GFX10-NEXT: v_xor_b32_e32 v11, -1, v5 +; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0 +; GFX10-NEXT: v_xor_b32_e32 v12, -1, v4 +; GFX10-NEXT: v_lshrrev_b16 v6, v7, v6 +; GFX10-NEXT: v_lshlrev_b16 v8, 1, v8 +; GFX10-NEXT: v_xor_b32_e32 v7, -1, v7 +; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v2 +; GFX10-NEXT: v_lshlrev_b16 v10, 1, v10 +; GFX10-NEXT: v_xor_b32_e32 v14, -1, v9 +; GFX10-NEXT: v_lshlrev_b16 v1, v11, v1 +; GFX10-NEXT: v_lshlrev_b16 v0, v12, v0 ; GFX10-NEXT: v_lshrrev_b16 v2, v4, v2 -; GFX10-NEXT: v_lshlrev_b16 v1, v9, v1 ; GFX10-NEXT: v_lshrrev_b16 v3, v5, v3 -; GFX10-NEXT: v_lshrrev_b16 v4, v11, v10 -; GFX10-NEXT: v_lshlrev_b16 v5, v13, v12 +; GFX10-NEXT: v_lshlrev_b16 v4, v7, v8 +; GFX10-NEXT: v_lshrrev_b16 v5, v9, v13 +; GFX10-NEXT: v_lshlrev_b16 v7, v14, v10 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff ; GFX10-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX10-NEXT: v_or_b32_e32 v3, v6, v7 -; GFX10-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX10-NEXT: v_and_b32_e32 v0, v2, v0 -; GFX10-NEXT: v_and_b32_e32 v1, v2, v1 -; GFX10-NEXT: v_lshl_or_b32 v0, v4, 16, v0 -; GFX10-NEXT: v_lshl_or_b32 v1, v3, 16, v1 +; GFX10-NEXT: v_or_b32_e32 v2, v4, v6 +; GFX10-NEXT: v_or_b32_e32 v3, v7, v5 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10-NEXT: v_lshl_or_b32 v0, v3, 16, v0 +; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] %ret = call <4 x i16> @llvm.fshr.v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2) ret <4 x i16> %ret @@ -1241,14 +1239,12 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_mov_b32 s4, 0xffffff +; GFX10-NEXT: v_and_b32_e32 v6, 0xffffff, v4 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffffff, v5 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GFX10-NEXT: v_and_b32_e32 v6, s4, v4 -; GFX10-NEXT: v_and_b32_e32 v7, s4, v5 -; GFX10-NEXT: s_mov_b32 s4, 0xaaaaaaab ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GFX10-NEXT: v_mul_hi_u32 v6, v6, s4 -; GFX10-NEXT: v_mul_hi_u32 v7, v7, s4 +; GFX10-NEXT: v_mul_hi_u32 v6, 0xaaaaaaab, v6 +; GFX10-NEXT: v_mul_hi_u32 v7, 0xaaaaaaab, v7 ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 4, v6 ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 4, v7 ; GFX10-NEXT: v_mul_u32_u24_e32 v6, 24, v6 diff --git a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll --- a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll +++ b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll @@ -406,18 +406,17 @@ ; GFX9-LABEL: udiv16_invariant_denom: ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX9-NEXT: s_mov_b32 s4, 0xffff ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_movk_i32 s5, 0x400 +; GFX9-NEXT: s_movk_i32 s4, 0x400 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s2, s4, s2 +; GFX9-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s2 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v2 ; GFX9-NEXT: .LBB4_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_and_b32_e32 v0, s4, v4 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 ; GFX9-NEXT: v_cvt_f32_u32_e32 v8, v0 ; GFX9-NEXT: v_lshlrev_b64 v[5:6], 1, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -429,7 +428,7 @@ ; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v0 ; GFX9-NEXT: v_add_u16_e32 v4, 1, v4 ; GFX9-NEXT: v_mad_f32 v0, -v0, v2, v8 -; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s5, v4 +; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s4, v4 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v0|, v2 ; GFX9-NEXT: v_addc_co_u32_e64 v0, s[0:1], 0, v7, s[0:1] ; GFX9-NEXT: global_store_short v[5:6], v0, off @@ -442,16 +441,15 @@ ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX10-NEXT: s_mov_b32 s1, 0xffff ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_and_b32 s0, s1, s4 +; GFX10-NEXT: s_and_b32 s0, 0xffff, s4 ; GFX10-NEXT: v_cvt_f32_u32_e32 v2, s0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v3, v2 ; GFX10-NEXT: .LBB4_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_and_b32_e32 v0, s1, v4 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v4 ; GFX10-NEXT: v_add_nc_u16 v4, v4, 1 ; GFX10-NEXT: v_cvt_f32_u32_e32 v7, v0 ; GFX10-NEXT: v_lshlrev_b64 v[5:6], 1, v[0:1] @@ -491,17 +489,16 @@ ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s6, 0xffff ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_movk_i32 s8, 0x400 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s7, s6, s2 -; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s7 +; GFX9-NEXT: s_movk_i32 s7, 0x400 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_and_b32 s6, 0xffff, s2 +; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s6 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v2 ; GFX9-NEXT: .LBB5_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_and_b32_e32 v0, s6, v4 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 ; GFX9-NEXT: v_cvt_f32_u32_e32 v8, v0 ; GFX9-NEXT: v_lshlrev_b64 v[5:6], 1, v[0:1] ; GFX9-NEXT: v_add_u16_e32 v4, 1, v4 @@ -511,9 +508,9 @@ ; GFX9-NEXT: v_cvt_u32_f32_e32 v10, v9 ; GFX9-NEXT: v_mad_f32 v8, -v9, v2, v8 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v8|, v2 -; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s8, v4 +; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s7, v4 ; GFX9-NEXT: v_addc_co_u32_e64 v8, s[2:3], 0, v10, s[2:3] -; GFX9-NEXT: v_mul_lo_u32 v8, v8, s7 +; GFX9-NEXT: v_mul_lo_u32 v8, v8, s6 ; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], s4, v5 ; GFX9-NEXT: v_addc_co_u32_e64 v6, s[0:1], v7, v6, s[0:1] ; GFX9-NEXT: v_sub_u32_e32 v0, v0, v8 @@ -527,16 +524,15 @@ ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX10-NEXT: s_mov_b32 s1, 0xffff ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_and_b32 s4, s1, s4 -; GFX10-NEXT: v_cvt_f32_u32_e32 v2, s4 +; GFX10-NEXT: s_and_b32 s1, 0xffff, s4 +; GFX10-NEXT: v_cvt_f32_u32_e32 v2, s1 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v3, v2 ; GFX10-NEXT: .LBB5_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_and_b32_e32 v0, s1, v4 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v4 ; GFX10-NEXT: v_add_nc_u16 v4, v4, 1 ; GFX10-NEXT: v_cvt_f32_u32_e32 v7, v0 ; GFX10-NEXT: v_lshlrev_b64 v[5:6], 1, v[0:1] @@ -549,7 +545,7 @@ ; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v7|, v2 ; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v8, vcc_lo ; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v4 -; GFX10-NEXT: v_mul_lo_u32 v7, v7, s4 +; GFX10-NEXT: v_mul_lo_u32 v7, v7, s1 ; GFX10-NEXT: v_sub_nc_u32_e32 v0, v0, v7 ; GFX10-NEXT: global_store_short v[5:6], v0, off ; GFX10-NEXT: s_cbranch_vccz .LBB5_1 diff --git a/llvm/test/CodeGen/AMDGPU/idot2.ll b/llvm/test/CodeGen/AMDGPU/idot2.ll --- a/llvm/test/CodeGen/AMDGPU/idot2.ll +++ b/llvm/test/CodeGen/AMDGPU/idot2.ll @@ -2104,16 +2104,15 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: s_mov_b32 s3, 0xffff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-NEXT: v_and_b32_e32 v0, s3, v1 +; GFX10-DL-NEXT: v_and_b32_e32 v0, 0xffff, v1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_and_b32_e32 v3, s3, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v3, 0xffff, v2 ; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX10-DL-NEXT: v_mul_u32_u24_e32 v2, v3, v0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll --- a/llvm/test/CodeGen/AMDGPU/idot4u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll @@ -213,6 +213,7 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX8-NEXT: v_mov_b32_e32 v5, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 @@ -226,7 +227,6 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ushort v4, v[0:1] ; GFX8-NEXT: s_movk_i32 s0, 0xff -; GFX8-NEXT: v_mov_b32_e32 v5, s0 ; GFX8-NEXT: s_waitcnt vmcnt(2) ; GFX8-NEXT: v_and_b32_e32 v6, s0, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v3 @@ -315,7 +315,7 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NEXT: s_movk_i32 s0, 0xff +; GFX10-DL-NEXT: v_mov_b32_e32 v8, 0xff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -323,17 +323,17 @@ ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-NEXT: global_load_ushort v3, v0, s[2:3] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) -; GFX10-DL-NEXT: v_and_b32_e32 v4, s0, v1 +; GFX10-DL-NEXT: v_and_b32_e32 v4, 0xff, v1 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v7, s0, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v5, s0, v5 -; GFX10-DL-NEXT: v_and_b32_e32 v6, s0, v6 +; GFX10-DL-NEXT: v_and_b32_e32 v7, 0xff, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX10-DL-NEXT: v_and_b32_e32 v6, 0xff, v6 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_mad_u16 v3, v4, v7, v3 -; GFX10-DL-NEXT: v_and_b32_sdwa v4, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-DL-NEXT: v_and_b32_sdwa v7, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-DL-NEXT: v_and_b32_sdwa v4, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-DL-NEXT: v_and_b32_sdwa v7, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 24, v1 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 24, v2 ; GFX10-DL-NEXT: v_mad_u16 v3, v5, v6, v3 @@ -1208,16 +1208,15 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: s_movk_i32 s3, 0xff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-NEXT: v_and_b32_e32 v0, s3, v1 +; GFX10-DL-NEXT: v_and_b32_e32 v0, 0xff, v1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_and_b32_e32 v3, s3, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v3, 0xff, v2 ; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 ; GFX10-DL-NEXT: v_mul_u32_u24_e32 v5, v0, v3 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) @@ -1511,6 +1510,7 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX8-NEXT: v_mov_b32_e32 v5, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 @@ -1524,7 +1524,6 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ushort v4, v[0:1] ; GFX8-NEXT: s_movk_i32 s0, 0xff -; GFX8-NEXT: v_mov_b32_e32 v5, s0 ; GFX8-NEXT: s_waitcnt vmcnt(2) ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v3 ; GFX8-NEXT: v_and_b32_e32 v8, s0, v8 @@ -1613,7 +1612,7 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NEXT: s_movk_i32 s0, 0xff +; GFX10-DL-NEXT: v_mov_b32_e32 v7, 0xff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -1625,16 +1624,16 @@ ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v2 ; GFX10-DL-NEXT: v_bfe_i32 v6, v1, 0, 8 -; GFX10-DL-NEXT: v_bfe_i32 v7, v2, 0, 8 -; GFX10-DL-NEXT: v_and_b32_e32 v4, s0, v4 -; GFX10-DL-NEXT: v_and_b32_e32 v5, s0, v5 +; GFX10-DL-NEXT: v_bfe_i32 v8, v2, 0, 8 +; GFX10-DL-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX10-DL-NEXT: v_and_b32_e32 v5, 0xff, v5 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_mad_u16 v3, v4, v5, v3 -; GFX10-DL-NEXT: v_and_b32_sdwa v4, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-DL-NEXT: v_and_b32_sdwa v5, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-DL-NEXT: v_and_b32_sdwa v4, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-DL-NEXT: v_and_b32_sdwa v5, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 24, v1 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 24, v2 -; GFX10-DL-NEXT: v_mad_u16 v3, v6, v7, v3 +; GFX10-DL-NEXT: v_mad_u16 v3, v6, v8, v3 ; GFX10-DL-NEXT: v_mad_u16 v3, v4, v5, v3 ; GFX10-DL-NEXT: v_mad_u16 v1, v1, v2, v3 ; GFX10-DL-NEXT: global_store_short v0, v1, s[2:3] @@ -1802,18 +1801,18 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: s_mov_b32 s3, 0xffff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] +; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-NEXT: v_and_b32_sdwa v0, s3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-DL-NEXT: v_and_b32_sdwa v3, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_and_b32_sdwa v3, s3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-DL-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 -; GFX10-DL-NEXT: v_mul_u32_u24_e32 v0, v0, v3 +; GFX10-DL-NEXT: v_mul_u32_u24_e32 v0, v3, v0 ; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 ; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 @@ -1901,6 +1900,7 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX8-NEXT: v_mov_b32_e32 v5, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 @@ -1914,7 +1914,6 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ushort v4, v[0:1] ; GFX8-NEXT: s_movk_i32 s0, 0xff -; GFX8-NEXT: v_mov_b32_e32 v5, s0 ; GFX8-NEXT: s_waitcnt vmcnt(2) ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v3 ; GFX8-NEXT: v_lshrrev_b16_e32 v7, 8, v3 @@ -2015,7 +2014,7 @@ ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: v_mov_b32_e32 v4, 0xffff -; GFX10-DL-NEXT: s_movk_i32 s2, 0xff +; GFX10-DL-NEXT: v_mov_b32_e32 v5, 0xff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -2023,27 +2022,27 @@ ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-NEXT: global_load_ushort v3, v0, s[0:1] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) -; GFX10-DL-NEXT: v_lshrrev_b16 v5, 8, v1 +; GFX10-DL-NEXT: v_lshrrev_b16 v6, 8, v1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-NEXT: v_lshrrev_b16 v6, 8, v2 -; GFX10-DL-NEXT: v_and_b32_sdwa v7, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-DL-NEXT: v_and_b32_sdwa v8, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-DL-NEXT: v_and_b32_sdwa v9, v1, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-DL-NEXT: v_and_b32_sdwa v10, v2, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-DL-NEXT: v_lshrrev_b16 v7, 8, v2 +; GFX10-DL-NEXT: v_and_b32_sdwa v8, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-DL-NEXT: v_and_b32_sdwa v4, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-DL-NEXT: v_and_b32_sdwa v9, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-DL-NEXT: v_and_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 24, v1 -; GFX10-DL-NEXT: v_lshl_or_b32 v6, v6, 16, v7 -; GFX10-DL-NEXT: v_lshl_or_b32 v5, v5, 16, v8 +; GFX10-DL-NEXT: v_lshl_or_b32 v7, v7, 16, v8 +; GFX10-DL-NEXT: v_lshl_or_b32 v4, v6, 16, v4 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 24, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v7, v4, v10 -; GFX10-DL-NEXT: v_and_b32_e32 v4, v4, v9 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v5, v5, v6 -; GFX10-DL-NEXT: v_lshl_or_b32 v2, v2, 16, v7 -; GFX10-DL-NEXT: v_lshl_or_b32 v1, v1, 16, v4 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 16, v5 +; GFX10-DL-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX10-DL-NEXT: v_and_b32_e32 v6, 0xffff, v9 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v7 +; GFX10-DL-NEXT: v_lshl_or_b32 v2, v2, 16, v5 +; GFX10-DL-NEXT: v_lshl_or_b32 v1, v1, 16, v6 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_add_nc_u16 v3, v5, v3 +; GFX10-DL-NEXT: v_add_nc_u16 v3, v4, v3 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 -; GFX10-DL-NEXT: v_add_nc_u16 v2, v3, v4 +; GFX10-DL-NEXT: v_add_nc_u16 v2, v3, v5 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX10-DL-NEXT: v_add_nc_u16 v1, v2, v1 ; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v3 diff --git a/llvm/test/CodeGen/AMDGPU/idot8s.ll b/llvm/test/CodeGen/AMDGPU/idot8s.ll --- a/llvm/test/CodeGen/AMDGPU/idot8s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8s.ll @@ -2519,7 +2519,6 @@ ; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10-DL-XNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s10, -1 @@ -2533,79 +2532,79 @@ ; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-XNACK-NEXT: global_load_ushort v3, v0, s[0:1] ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(2) -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v5, 4, v1 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v6, 12, v1 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v4, 4, v1 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v5, 12, v1 ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v12, 4, v2 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v13, 12, v2 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v8, 8, v1 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v15, 8, v2 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v5, 12, v5 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v6, 12, v6 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v12, 12, v12 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v13, 12, v13 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v7, 12, v1 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v14, 12, v2 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v8, 12, v8 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v15, 12, v15 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v11, 4, v2 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v12, 12, v2 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v7, 8, v1 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v14, 8, v2 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v4, 12, v4 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v5, 12, v5 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v11, 12, v11 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v12, 12, v12 -; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v13, v4, v13 -; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v6, v4, v6 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v10, 16, v1 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v17, 16, v2 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v6, 12, v1 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v13, 12, v2 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v7, 12, v7 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v14, 12, v14 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v8, 12, v8 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v15, 12, v15 -; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v12, v12, 16, v13 -; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v5, v5, 16, v6 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v9, 20, v1 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v16, 20, v2 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v10, 12, v10 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v17, 12, v17 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v4, 12, v4 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v11, 12, v11 +; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v6, 12, v6 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v13, 12, v13 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v7, 12, v7 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v6, 12, v14 -; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v13, v4, v15 -; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v8, v4, v8 -; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v5, v5, v12 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v11, 28, v1 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v1, 24, v1 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v18, 28, v2 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v2, 24, v2 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v14, 12, v14 +; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v11, v11, 16, v12 +; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v4, v4, 16, v5 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v8, 20, v1 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v15, 20, v2 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v9, 12, v9 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v16, 12, v16 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v10, 12, v10 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v14, 12, v17 -; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v6, v6, 16, v13 -; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v7, v7, 16, v8 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v8, 16, v5 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v6, 12, v6 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v5, 12, v13 +; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v12, 0xffff, v14 +; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v4, v4, v11 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v10, 28, v1 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v1, 24, v1 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v17, 28, v2 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v2, 24, v2 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v8, 12, v8 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v15, 12, v15 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v9, 12, v9 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v13, 12, v16 +; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v5, v5, 16, v12 +; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v6, v6, 16, v7 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v7, 16, v4 ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v3, v5, v3 +; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v3, v4, v3 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v1, 12, v1 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v9, 12, v9 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v12, 12, v16 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v8, 12, v8 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v11, 12, v15 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v2, 12, v2 -; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v5, v4, v14 -; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v10, v4, v10 -; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v6, v7, v6 -; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v3, v3, v8 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v11, 12, v11 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v18, 12, v18 +; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v4, 0xffff, v13 +; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v5, v6, v5 +; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v3, v3, v7 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v10, 12, v10 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v17, 12, v17 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v1, 12, v1 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v2 -; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v5, v12, 16, v5 -; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v7, v9, 16, v10 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v8, 16, v6 -; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v3, v3, v6 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v11, 12, v11 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v6, 12, v18 -; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v2, v4, v2 -; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v1, v4, v1 -; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v4, v7, v5 -; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v3, v3, v8 -; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v2, v6, 16, v2 -; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v1, v11, 16, v1 +; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v4, v11, 16, v4 +; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v6, v8, 16, v9 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v7, 16, v5 +; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v3, v3, v5 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v10, 12, v10 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v5, 12, v17 +; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v4, v6, v4 +; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v3, v3, v7 +; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v2, v5, 16, v2 +; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v1, v10, 16, v1 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v3, v3, v4 ; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v1, v1, v2 @@ -2622,7 +2621,6 @@ ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s10, -1 @@ -2635,79 +2633,79 @@ ; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-DL-NOXNACK-NEXT: global_load_ushort v3, v2, s[0:1] ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(2) -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v5, 4, v1 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v6, 12, v1 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v4, 4, v1 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v5, 12, v1 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v12, 4, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v13, 12, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v8, 8, v1 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v15, 8, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v5, 12, v5 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v6, 12, v6 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v12, 12, v12 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v13, 12, v13 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v7, 12, v1 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v14, 12, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 12, v8 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v15, 12, v15 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v11, 4, v0 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v12, 12, v0 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v7, 8, v1 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v14, 8, v0 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v4, 12, v4 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v5, 12, v5 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v11, 12, v11 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v12, 12, v12 -; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v13, v4, v13 -; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v6, v4, v6 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v10, 16, v1 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v17, 16, v0 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v6, 12, v1 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v13, 12, v0 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v7, 12, v7 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v14, 12, v14 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v8, 12, v8 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v15, 12, v15 -; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v12, v12, 16, v13 -; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v5, v5, 16, v6 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v9, 20, v1 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v16, 20, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v10, 12, v10 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v17, 12, v17 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v4, 12, v4 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v11, 12, v11 +; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v16, 16, v0 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v6, 12, v6 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v13, 12, v13 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v7, 12, v7 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v6, 12, v14 -; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v13, v4, v15 -; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v8, v4, v8 -; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v5, v5, v12 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v11, 28, v1 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v1, 24, v1 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v18, 28, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v14, 12, v14 +; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v11, v11, 16, v12 +; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v4, v4, 16, v5 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v8, 20, v1 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v15, 20, v0 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v9, 12, v9 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v16, 12, v16 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v10, 12, v10 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v14, 12, v17 -; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v6, v6, 16, v13 -; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v7, v7, 16, v8 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v8, 16, v5 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v6, 12, v6 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v5, 12, v13 +; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v12, 0xffff, v14 +; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v4, v4, v11 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v10, 28, v1 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v1, 24, v1 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v17, 28, v0 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 12, v8 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v15, 12, v15 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v9, 12, v9 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v13, 12, v16 +; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v5, v5, 16, v12 +; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v6, v6, 16, v7 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v7, 16, v4 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v3, v5, v3 +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v3, v4, v3 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v1, 12, v1 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v9, 12, v9 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v12, 12, v16 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v8, 12, v8 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v11, 12, v15 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v0, 12, v0 -; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v5, v4, v14 -; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v10, v4, v10 -; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v6, v7, v6 -; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v3, v3, v8 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v11, 12, v11 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v18, 12, v18 +; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v4, 0xffff, v13 +; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v5, v6, v5 +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v3, v3, v7 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v10, 12, v10 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v17, 12, v17 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v1 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v0, 12, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v5, v12, 16, v5 -; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v7, v9, 16, v10 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v8, 16, v6 -; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v3, v3, v6 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v11, 12, v11 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v6, 12, v18 -; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v0, v4, v0 -; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v1, v4, v1 -; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v4, v7, v5 -; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v3, v3, v8 -; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v0, v6, 16, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v1, v11, 16, v1 +; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v4, v11, 16, v4 +; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v6, v8, 16, v9 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v7, 16, v5 +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v3, v3, v5 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v10, 12, v10 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v5, 12, v17 +; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v4, v6, v4 +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v3, v3, v7 +; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v0, v5, 16, v0 +; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v1, v10, 16, v1 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v3, v3, v4 ; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v0, v1, v0 diff --git a/llvm/test/CodeGen/AMDGPU/idot8u.ll b/llvm/test/CodeGen/AMDGPU/idot8u.ll --- a/llvm/test/CodeGen/AMDGPU/idot8u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8u.ll @@ -2355,7 +2355,6 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 ; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX10-DL-NEXT: s_mov_b32 s10, -1 @@ -2369,49 +2368,49 @@ ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-NEXT: global_load_ushort v3, v0, s[0:1] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) -; GFX10-DL-NEXT: v_and_b32_e32 v7, 15, v1 +; GFX10-DL-NEXT: v_and_b32_e32 v6, 15, v1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-NEXT: v_and_b32_e32 v6, 15, v2 -; GFX10-DL-NEXT: v_bfe_u32 v5, v1, 4, 4 -; GFX10-DL-NEXT: v_bfe_u32 v10, v2, 4, 4 -; GFX10-DL-NEXT: v_bfe_u32 v9, v1, 8, 4 -; GFX10-DL-NEXT: v_and_b32_e32 v7, v4, v7 -; GFX10-DL-NEXT: v_and_b32_e32 v6, v4, v6 -; GFX10-DL-NEXT: v_bfe_u32 v13, v2, 8, 4 -; GFX10-DL-NEXT: v_bfe_u32 v8, v1, 12, 4 -; GFX10-DL-NEXT: v_and_b32_e32 v9, v4, v9 -; GFX10-DL-NEXT: v_lshl_or_b32 v5, v5, 16, v7 -; GFX10-DL-NEXT: v_lshl_or_b32 v6, v10, 16, v6 -; GFX10-DL-NEXT: v_bfe_u32 v10, v2, 12, 4 -; GFX10-DL-NEXT: v_and_b32_e32 v13, v4, v13 -; GFX10-DL-NEXT: v_bfe_u32 v12, v1, 16, 4 -; GFX10-DL-NEXT: v_lshl_or_b32 v8, v8, 16, v9 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v5, v5, v6 -; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 16, 4 -; GFX10-DL-NEXT: v_lshl_or_b32 v10, v10, 16, v13 -; GFX10-DL-NEXT: v_bfe_u32 v11, v1, 20, 4 -; GFX10-DL-NEXT: v_and_b32_e32 v12, v4, v12 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v9, 16, v5 +; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v2 +; GFX10-DL-NEXT: v_bfe_u32 v4, v1, 4, 4 +; GFX10-DL-NEXT: v_bfe_u32 v9, v2, 4, 4 +; GFX10-DL-NEXT: v_bfe_u32 v8, v1, 8, 4 +; GFX10-DL-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX10-DL-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX10-DL-NEXT: v_bfe_u32 v12, v2, 8, 4 +; GFX10-DL-NEXT: v_bfe_u32 v7, v1, 12, 4 +; GFX10-DL-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX10-DL-NEXT: v_lshl_or_b32 v4, v4, 16, v6 +; GFX10-DL-NEXT: v_lshl_or_b32 v5, v9, 16, v5 +; GFX10-DL-NEXT: v_bfe_u32 v9, v2, 12, 4 +; GFX10-DL-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX10-DL-NEXT: v_bfe_u32 v11, v1, 16, 4 +; GFX10-DL-NEXT: v_lshl_or_b32 v7, v7, 16, v8 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v5 +; GFX10-DL-NEXT: v_bfe_u32 v5, v2, 16, 4 +; GFX10-DL-NEXT: v_lshl_or_b32 v9, v9, 16, v12 +; GFX10-DL-NEXT: v_bfe_u32 v10, v1, 20, 4 +; GFX10-DL-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 16, v4 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_add_nc_u16 v3, v5, v3 -; GFX10-DL-NEXT: v_bfe_u32 v5, v2, 20, 4 -; GFX10-DL-NEXT: v_and_b32_e32 v6, v4, v6 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v8, v8, v10 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 28, v1 -; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v9 -; GFX10-DL-NEXT: v_bfe_u32 v1, v1, 24, 4 -; GFX10-DL-NEXT: v_bfe_u32 v9, v2, 24, 4 -; GFX10-DL-NEXT: v_lshl_or_b32 v5, v5, 16, v6 -; GFX10-DL-NEXT: v_lshl_or_b32 v6, v11, 16, v12 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; GFX10-DL-NEXT: v_add_nc_u16 v3, v4, v3 +; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 20, 4 +; GFX10-DL-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v7, v7, v9 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v6, 28, v1 ; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v8 +; GFX10-DL-NEXT: v_bfe_u32 v1, v1, 24, 4 +; GFX10-DL-NEXT: v_bfe_u32 v8, v2, 24, 4 +; GFX10-DL-NEXT: v_lshl_or_b32 v4, v4, 16, v5 +; GFX10-DL-NEXT: v_lshl_or_b32 v5, v10, 16, v11 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v9, 16, v7 +; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v7 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v8, v4, v9 -; GFX10-DL-NEXT: v_and_b32_e32 v1, v4, v1 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v6, v5 -; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v10 -; GFX10-DL-NEXT: v_lshl_or_b32 v2, v2, 16, v8 -; GFX10-DL-NEXT: v_lshl_or_b32 v1, v7, 16, v1 +; GFX10-DL-NEXT: v_and_b32_e32 v7, 0xffff, v8 +; GFX10-DL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v5, v4 +; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v9 +; GFX10-DL-NEXT: v_lshl_or_b32 v2, v2, 16, v7 +; GFX10-DL-NEXT: v_lshl_or_b32 v1, v6, 16, v1 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v4 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 @@ -3112,7 +3111,6 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 ; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX10-DL-NEXT: s_mov_b32 s10, -1 @@ -3126,49 +3124,49 @@ ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-NEXT: global_load_ubyte v3, v0, s[0:1] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) -; GFX10-DL-NEXT: v_and_b32_e32 v7, 15, v1 +; GFX10-DL-NEXT: v_and_b32_e32 v6, 15, v1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-NEXT: v_and_b32_e32 v6, 15, v2 -; GFX10-DL-NEXT: v_bfe_u32 v5, v1, 4, 4 -; GFX10-DL-NEXT: v_bfe_u32 v10, v2, 4, 4 -; GFX10-DL-NEXT: v_bfe_u32 v9, v1, 8, 4 -; GFX10-DL-NEXT: v_and_b32_e32 v7, v4, v7 -; GFX10-DL-NEXT: v_and_b32_e32 v6, v4, v6 -; GFX10-DL-NEXT: v_bfe_u32 v13, v2, 8, 4 -; GFX10-DL-NEXT: v_bfe_u32 v8, v1, 12, 4 -; GFX10-DL-NEXT: v_and_b32_e32 v9, v4, v9 -; GFX10-DL-NEXT: v_lshl_or_b32 v5, v5, 16, v7 -; GFX10-DL-NEXT: v_lshl_or_b32 v6, v10, 16, v6 -; GFX10-DL-NEXT: v_bfe_u32 v10, v2, 12, 4 -; GFX10-DL-NEXT: v_and_b32_e32 v13, v4, v13 -; GFX10-DL-NEXT: v_bfe_u32 v12, v1, 16, 4 -; GFX10-DL-NEXT: v_lshl_or_b32 v8, v8, 16, v9 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v5, v5, v6 -; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 16, 4 -; GFX10-DL-NEXT: v_lshl_or_b32 v10, v10, 16, v13 -; GFX10-DL-NEXT: v_bfe_u32 v11, v1, 20, 4 -; GFX10-DL-NEXT: v_and_b32_e32 v12, v4, v12 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v9, 16, v5 +; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v2 +; GFX10-DL-NEXT: v_bfe_u32 v4, v1, 4, 4 +; GFX10-DL-NEXT: v_bfe_u32 v9, v2, 4, 4 +; GFX10-DL-NEXT: v_bfe_u32 v8, v1, 8, 4 +; GFX10-DL-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX10-DL-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX10-DL-NEXT: v_bfe_u32 v12, v2, 8, 4 +; GFX10-DL-NEXT: v_bfe_u32 v7, v1, 12, 4 +; GFX10-DL-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX10-DL-NEXT: v_lshl_or_b32 v4, v4, 16, v6 +; GFX10-DL-NEXT: v_lshl_or_b32 v5, v9, 16, v5 +; GFX10-DL-NEXT: v_bfe_u32 v9, v2, 12, 4 +; GFX10-DL-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX10-DL-NEXT: v_bfe_u32 v11, v1, 16, 4 +; GFX10-DL-NEXT: v_lshl_or_b32 v7, v7, 16, v8 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v5 +; GFX10-DL-NEXT: v_bfe_u32 v5, v2, 16, 4 +; GFX10-DL-NEXT: v_lshl_or_b32 v9, v9, 16, v12 +; GFX10-DL-NEXT: v_bfe_u32 v10, v1, 20, 4 +; GFX10-DL-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 16, v4 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_add_nc_u16 v3, v5, v3 -; GFX10-DL-NEXT: v_bfe_u32 v5, v2, 20, 4 -; GFX10-DL-NEXT: v_and_b32_e32 v6, v4, v6 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v8, v8, v10 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 28, v1 -; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v9 -; GFX10-DL-NEXT: v_bfe_u32 v1, v1, 24, 4 -; GFX10-DL-NEXT: v_bfe_u32 v9, v2, 24, 4 -; GFX10-DL-NEXT: v_lshl_or_b32 v5, v5, 16, v6 -; GFX10-DL-NEXT: v_lshl_or_b32 v6, v11, 16, v12 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; GFX10-DL-NEXT: v_add_nc_u16 v3, v4, v3 +; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 20, 4 +; GFX10-DL-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v7, v7, v9 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v6, 28, v1 ; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v8 +; GFX10-DL-NEXT: v_bfe_u32 v1, v1, 24, 4 +; GFX10-DL-NEXT: v_bfe_u32 v8, v2, 24, 4 +; GFX10-DL-NEXT: v_lshl_or_b32 v4, v4, 16, v5 +; GFX10-DL-NEXT: v_lshl_or_b32 v5, v10, 16, v11 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v9, 16, v7 +; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v7 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v8, v4, v9 -; GFX10-DL-NEXT: v_and_b32_e32 v1, v4, v1 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v6, v5 -; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v10 -; GFX10-DL-NEXT: v_lshl_or_b32 v2, v2, 16, v8 -; GFX10-DL-NEXT: v_lshl_or_b32 v1, v7, 16, v1 +; GFX10-DL-NEXT: v_and_b32_e32 v7, 0xffff, v8 +; GFX10-DL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v5, v4 +; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v9 +; GFX10-DL-NEXT: v_lshl_or_b32 v2, v2, 16, v7 +; GFX10-DL-NEXT: v_lshl_or_b32 v1, v6, 16, v1 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v4 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 diff --git a/llvm/test/CodeGen/AMDGPU/immv216.ll b/llvm/test/CodeGen/AMDGPU/immv216.ll --- a/llvm/test/CodeGen/AMDGPU/immv216.ll +++ b/llvm/test/CodeGen/AMDGPU/immv216.ll @@ -341,12 +341,11 @@ ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], [[K]] op_sel_hi:[1,0] ; encoding: [0x00,0x40,0x8f,0xd3,0x00,0x01,0x00,0x08] ; GFX9: buffer_store_dword [[REG]] -; VI-DAG: s_movk_i32 [[K:s[0-9]+]], 0x6400 ; encoding ; VI-DAG: buffer_load_dword ; VI-NOT: and -; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, [[K]], v{{[0-9]+}} +; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0x6400, v{{[0-9]+}} ; gfx8 does not support sreg or imm in sdwa - this will be move then -; VI-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], [[K]] +; VI-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x6400 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[VK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; VI: buffer_store_dword diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll --- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll @@ -481,9 +481,8 @@ ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_lshl_b32 s6, s6, 3 ; GCN-NEXT: s_lshl_b64 s[4:5], s[4:5], s6 -; GCN-NEXT: s_mov_b32 s6, 0x1010101 -; GCN-NEXT: s_and_b32 s7, s5, s6 -; GCN-NEXT: s_and_b32 s6, s4, s6 +; GCN-NEXT: s_and_b32 s7, s5, 0x1010101 +; GCN-NEXT: s_and_b32 s6, s4, 0x1010101 ; GCN-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] ; GCN-NEXT: s_or_b64 s[0:1], s[6:7], s[0:1] ; GCN-NEXT: v_mov_b32_e32 v2, s2 diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll @@ -1072,9 +1072,8 @@ ; SI-NEXT: s_lshl_b32 s8, s6, 4 ; SI-NEXT: s_mov_b64 s[6:7], 0xffff ; SI-NEXT: s_lshl_b64 s[6:7], s[6:7], s8 -; SI-NEXT: s_mov_b32 s8, 0x50005 -; SI-NEXT: s_and_b32 s9, s7, s8 -; SI-NEXT: s_and_b32 s8, s6, s8 +; SI-NEXT: s_and_b32 s9, s7, 0x50005 +; SI-NEXT: s_and_b32 s8, s6, 0x50005 ; SI-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] ; SI-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] ; SI-NEXT: v_mov_b32_e32 v0, s5 @@ -1248,9 +1247,8 @@ ; SI-NEXT: s_lshl_b32 s8, s8, 3 ; SI-NEXT: s_mov_b64 s[2:3], 0xffff ; SI-NEXT: s_lshl_b64 s[2:3], s[2:3], s8 -; SI-NEXT: s_mov_b32 s8, 0x5050505 -; SI-NEXT: s_and_b32 s9, s3, s8 -; SI-NEXT: s_and_b32 s8, s2, s8 +; SI-NEXT: s_and_b32 s9, s3, 0x5050505 +; SI-NEXT: s_and_b32 s8, s2, 0x5050505 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] ; SI-NEXT: s_or_b64 s[0:1], s[8:9], s[0:1] @@ -1272,9 +1270,8 @@ ; VI-NEXT: s_lshl_b32 s8, s8, 3 ; VI-NEXT: s_mov_b64 s[2:3], 0xffff ; VI-NEXT: s_lshl_b64 s[2:3], s[2:3], s8 -; VI-NEXT: s_mov_b32 s8, 0x5050505 -; VI-NEXT: s_and_b32 s9, s3, s8 -; VI-NEXT: s_and_b32 s8, s2, s8 +; VI-NEXT: s_and_b32 s9, s3, 0x5050505 +; VI-NEXT: s_and_b32 s8, s2, 0x5050505 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] ; VI-NEXT: s_or_b64 s[0:1], s[8:9], s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -1606,7 +1606,7 @@ ; VI-NEXT: s_mov_b64 s[2:3], 0xffff ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: s_lshl_b32 s1, s4, 16 -; VI-NEXT: s_and_b32 s4, s4, s2 +; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; VI-NEXT: s_or_b32 s0, s4, s1 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc @@ -1633,7 +1633,7 @@ ; CI-NEXT: s_mov_b64 s[2:3], 0xffff ; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: s_lshl_b32 s1, s4, 16 -; CI-NEXT: s_and_b32 s4, s4, s2 +; CI-NEXT: s_and_b32 s4, s4, 0xffff ; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 ; CI-NEXT: s_or_b32 s0, s4, s1 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc @@ -1691,7 +1691,7 @@ ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: s_lshl_b32 s1, s5, 4 ; VI-NEXT: s_lshl_b32 s5, s4, 16 -; VI-NEXT: s_and_b32 s4, s4, s2 +; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], s1 ; VI-NEXT: s_or_b32 s2, s4, s5 @@ -1716,7 +1716,7 @@ ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; CI-NEXT: s_mov_b64 s[2:3], 0xffff ; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: s_and_b32 s6, s4, s2 +; CI-NEXT: s_and_b32 s6, s4, 0xffff ; CI-NEXT: s_lshl_b32 s1, s5, 4 ; CI-NEXT: s_lshl_b32 s4, s4, 16 ; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.d16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.d16.ll @@ -37,11 +37,10 @@ ; GCN-LABEL: {{^}}buffer_store_format_d16_xyzw: ; GCN-DAG: s_load_dwordx2 s[[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x10 -; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}} ; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16 -; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], [[K]] +; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], 0xffff{{$}} ; UNPACKED-DAG: s_lshr_b32 [[SHR1:s[0-9]+]], s[[S_DATA_1]], 16 -; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], [[K]] +; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], 0xffff{{$}} ; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]] ; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR1]] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll @@ -566,10 +566,9 @@ ; ; GFX10-LABEL: sample_d_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v6, 0xffff -; GFX10-NEXT: v_and_b32_e32 v4, v6, v4 -; GFX10-NEXT: v_and_b32_e32 v2, v6, v2 -; GFX10-NEXT: v_and_b32_e32 v0, v6, v0 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4 ; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2 ; GFX10-NEXT: v_lshl_or_b32 v2, v1, 16, v0 @@ -602,12 +601,11 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v12, v8 ; GFX10-NEXT: v_mov_b32_e32 v8, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v6 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_mov_b32_e32 v10, v5 -; GFX10-NEXT: v_and_b32_e32 v5, v2, v6 -; GFX10-NEXT: v_and_b32_e32 v3, v2, v3 -; GFX10-NEXT: v_and_b32_e32 v0, v2, v0 -; GFX10-NEXT: v_lshl_or_b32 v11, v7, 16, v5 +; GFX10-NEXT: v_lshl_or_b32 v11, v7, 16, v2 ; GFX10-NEXT: v_lshl_or_b32 v9, v4, 16, v3 ; GFX10-NEXT: v_lshl_or_b32 v7, v1, 16, v0 ; GFX10-NEXT: image_sample_d_g16 v[0:3], v[7:12], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D a16 @@ -653,10 +651,9 @@ ; ; GFX10-LABEL: sample_c_d_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff -; GFX10-NEXT: v_and_b32_e32 v5, v7, v5 -; GFX10-NEXT: v_and_b32_e32 v3, v7, v3 -; GFX10-NEXT: v_and_b32_e32 v1, v7, v1 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-NEXT: v_lshl_or_b32 v5, v6, 16, v5 ; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 @@ -705,10 +702,9 @@ ; ; GFX10-LABEL: sample_d_cl_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff -; GFX10-NEXT: v_and_b32_e32 v4, v7, v4 -; GFX10-NEXT: v_and_b32_e32 v2, v7, v2 -; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4 ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 @@ -759,10 +755,9 @@ ; ; GFX10-LABEL: sample_c_d_cl_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v8, 0xffff -; GFX10-NEXT: v_and_b32_e32 v5, v8, v5 -; GFX10-NEXT: v_and_b32_e32 v3, v8, v3 -; GFX10-NEXT: v_and_b32_e32 v1, v8, v1 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-NEXT: v_lshl_or_b32 v5, v6, 16, v5 ; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 @@ -807,10 +802,9 @@ ; ; GFX10-LABEL: sample_cd_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v6, 0xffff -; GFX10-NEXT: v_and_b32_e32 v4, v6, v4 -; GFX10-NEXT: v_and_b32_e32 v2, v6, v2 -; GFX10-NEXT: v_and_b32_e32 v0, v6, v0 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4 ; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2 ; GFX10-NEXT: v_lshl_or_b32 v2, v1, 16, v0 @@ -857,10 +851,9 @@ ; ; GFX10-LABEL: sample_c_cd_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff -; GFX10-NEXT: v_and_b32_e32 v5, v7, v5 -; GFX10-NEXT: v_and_b32_e32 v3, v7, v3 -; GFX10-NEXT: v_and_b32_e32 v1, v7, v1 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-NEXT: v_lshl_or_b32 v5, v6, 16, v5 ; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 @@ -909,10 +902,9 @@ ; ; GFX10-LABEL: sample_cd_cl_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff -; GFX10-NEXT: v_and_b32_e32 v4, v7, v4 -; GFX10-NEXT: v_and_b32_e32 v2, v7, v2 -; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4 ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 @@ -963,10 +955,9 @@ ; ; GFX10-LABEL: sample_c_cd_cl_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v8, 0xffff -; GFX10-NEXT: v_and_b32_e32 v5, v8, v5 -; GFX10-NEXT: v_and_b32_e32 v3, v8, v3 -; GFX10-NEXT: v_and_b32_e32 v1, v8, v1 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-NEXT: v_lshl_or_b32 v5, v6, 16, v5 ; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 @@ -1160,15 +1151,14 @@ ; GFX10-LABEL: sample_c_d_o_2darray_V1: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v13, v8 -; GFX10-NEXT: v_mov_b32_e32 v8, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX10-NEXT: v_mov_b32_e32 v9, v1 -; GFX10-NEXT: v_and_b32_e32 v1, v0, v6 -; GFX10-NEXT: v_and_b32_e32 v4, v0, v4 -; GFX10-NEXT: v_and_b32_e32 v0, v0, v2 -; GFX10-NEXT: v_lshl_or_b32 v12, v7, 16, v1 -; GFX10-NEXT: v_lshl_or_b32 v11, v5, 16, v4 -; GFX10-NEXT: v_lshl_or_b32 v10, v3, 16, v0 +; GFX10-NEXT: v_mov_b32_e32 v8, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-NEXT: v_lshl_or_b32 v12, v7, 16, v0 +; GFX10-NEXT: v_lshl_or_b32 v11, v5, 16, v1 +; GFX10-NEXT: v_lshl_or_b32 v10, v3, 16, v2 ; GFX10-NEXT: image_sample_c_d_o_g16 v0, v[8:13], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -1197,15 +1187,14 @@ ; GFX10-LABEL: sample_c_d_o_2darray_V2: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v13, v8 -; GFX10-NEXT: v_mov_b32_e32 v8, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX10-NEXT: v_mov_b32_e32 v9, v1 -; GFX10-NEXT: v_and_b32_e32 v1, v0, v6 -; GFX10-NEXT: v_and_b32_e32 v4, v0, v4 -; GFX10-NEXT: v_and_b32_e32 v0, v0, v2 -; GFX10-NEXT: v_lshl_or_b32 v12, v7, 16, v1 -; GFX10-NEXT: v_lshl_or_b32 v11, v5, 16, v4 -; GFX10-NEXT: v_lshl_or_b32 v10, v3, 16, v0 +; GFX10-NEXT: v_mov_b32_e32 v8, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-NEXT: v_lshl_or_b32 v12, v7, 16, v0 +; GFX10-NEXT: v_lshl_or_b32 v11, v5, 16, v1 +; GFX10-NEXT: v_lshl_or_b32 v10, v3, 16, v2 ; GFX10-NEXT: image_sample_c_d_o_g16 v[0:1], v[8:13], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.a16.dim.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.a16.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.a16.dim.ll @@ -62,11 +62,10 @@ ; ; GFX10GISEL-LABEL: sample_d_3d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_mov_b32_e32 v9, 0xffff ; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10GISEL-NEXT: v_and_or_b32 v6, v6, v9, v7 -; GFX10GISEL-NEXT: v_and_or_b32 v7, v8, v9, s12 +; GFX10GISEL-NEXT: v_and_or_b32 v6, 0xffff, v6, v7 +; GFX10GISEL-NEXT: v_and_or_b32 v7, 0xffff, v8, s12 ; GFX10GISEL-NEXT: image_sample_d v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D a16 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -152,11 +151,10 @@ ; ; GFX10GISEL-LABEL: sample_d_cl_2d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_mov_b32_e32 v7, 0xffff ; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10GISEL-NEXT: v_and_or_b32 v4, v4, v7, v5 -; GFX10GISEL-NEXT: v_and_or_b32 v5, v6, v7, s12 +; GFX10GISEL-NEXT: v_and_or_b32 v4, 0xffff, v4, v5 +; GFX10GISEL-NEXT: v_and_or_b32 v5, 0xffff, v6, s12 ; GFX10GISEL-NEXT: image_sample_d_cl v[0:3], v[0:5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -203,11 +201,10 @@ ; ; GFX10GISEL-LABEL: sample_c_d_cl_2d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_mov_b32_e32 v8, 0xffff ; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10GISEL-NEXT: v_and_or_b32 v5, v5, v8, v6 -; GFX10GISEL-NEXT: v_and_or_b32 v6, v7, v8, s12 +; GFX10GISEL-NEXT: v_and_or_b32 v5, 0xffff, v5, v6 +; GFX10GISEL-NEXT: v_and_or_b32 v6, 0xffff, v7, s12 ; GFX10GISEL-NEXT: image_sample_c_d_cl v[0:3], v[0:6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -333,11 +330,10 @@ ; ; GFX10GISEL-LABEL: sample_cd_cl_2d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_mov_b32_e32 v7, 0xffff ; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10GISEL-NEXT: v_and_or_b32 v4, v4, v7, v5 -; GFX10GISEL-NEXT: v_and_or_b32 v5, v6, v7, s12 +; GFX10GISEL-NEXT: v_and_or_b32 v4, 0xffff, v4, v5 +; GFX10GISEL-NEXT: v_and_or_b32 v5, 0xffff, v6, s12 ; GFX10GISEL-NEXT: image_sample_cd_cl v[0:3], v[0:5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -384,11 +380,10 @@ ; ; GFX10GISEL-LABEL: sample_c_cd_cl_2d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_mov_b32_e32 v8, 0xffff ; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10GISEL-NEXT: v_and_or_b32 v5, v5, v8, v6 -; GFX10GISEL-NEXT: v_and_or_b32 v6, v7, v8, s12 +; GFX10GISEL-NEXT: v_and_or_b32 v5, 0xffff, v5, v6 +; GFX10GISEL-NEXT: v_and_or_b32 v6, 0xffff, v7, s12 ; GFX10GISEL-NEXT: image_sample_c_cd_cl v[0:3], v[0:6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -415,11 +410,10 @@ ; ; GFX10GISEL-LABEL: sample_c_d_o_2darray_V1: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_mov_b32_e32 v9, 0xffff ; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10GISEL-NEXT: v_and_or_b32 v6, v6, v9, v7 -; GFX10GISEL-NEXT: v_and_or_b32 v7, v8, v9, s12 +; GFX10GISEL-NEXT: v_and_or_b32 v6, 0xffff, v6, v7 +; GFX10GISEL-NEXT: v_and_or_b32 v7, 0xffff, v8, s12 ; GFX10GISEL-NEXT: image_sample_c_d_o v0, v[0:7], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY a16 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -446,11 +440,10 @@ ; ; GFX10GISEL-LABEL: sample_c_d_o_2darray_V2: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_mov_b32_e32 v9, 0xffff ; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10GISEL-NEXT: v_and_or_b32 v6, v6, v9, v7 -; GFX10GISEL-NEXT: v_and_or_b32 v7, v8, v9, s12 +; GFX10GISEL-NEXT: v_and_or_b32 v6, 0xffff, v6, v7 +; GFX10GISEL-NEXT: v_and_or_b32 v7, 0xffff, v8, s12 ; GFX10GISEL-NEXT: image_sample_c_d_o v[0:1], v[0:7], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY a16 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -490,10 +483,9 @@ ; ; GFX10GISEL-LABEL: sample_g16_noa16_d_1d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10GISEL-NEXT: v_and_or_b32 v0, v0, v3, s12 -; GFX10GISEL-NEXT: v_and_or_b32 v1, v1, v3, s12 +; GFX10GISEL-NEXT: v_and_or_b32 v0, 0xffff, v0, s12 +; GFX10GISEL-NEXT: v_and_or_b32 v1, 0xffff, v1, s12 ; GFX10GISEL-NEXT: image_sample_d_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -505,9 +497,8 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { ; GFX10-LABEL: sample_g16_noa16_d_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v6, 0xffff -; GFX10-NEXT: v_and_b32_e32 v2, v6, v2 -; GFX10-NEXT: v_and_b32_e32 v0, v6, v0 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX10-NEXT: image_sample_d_g16 v[0:3], [v0, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D @@ -516,11 +507,10 @@ ; ; GFX10GISEL-LABEL: sample_g16_noa16_d_2d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_mov_b32_e32 v6, 0xffff ; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX10GISEL-NEXT: v_and_or_b32 v0, v0, v6, v1 -; GFX10GISEL-NEXT: v_and_or_b32 v1, v2, v6, v3 +; GFX10GISEL-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX10GISEL-NEXT: v_and_or_b32 v1, 0xffff, v2, v3 ; GFX10GISEL-NEXT: image_sample_d_g16 v[0:3], [v0, v1, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -534,10 +524,9 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v9, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX10-NEXT: v_and_b32_e32 v9, v2, v9 -; GFX10-NEXT: v_and_b32_e32 v0, v2, v0 -; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v9 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v9 +; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v2 ; GFX10-NEXT: v_lshl_or_b32 v2, v1, 16, v0 ; GFX10-NEXT: image_sample_d_g16 v[0:3], v[2:8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -547,14 +536,13 @@ ; GFX10GISEL: ; %bb.0: ; %main_body ; GFX10GISEL-NEXT: v_mov_b32_e32 v9, v2 ; GFX10GISEL-NEXT: v_mov_b32_e32 v10, v3 -; GFX10GISEL-NEXT: v_mov_b32_e32 v11, 0xffff ; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10GISEL-NEXT: v_and_or_b32 v3, v9, v11, s12 -; GFX10GISEL-NEXT: v_and_or_b32 v2, v0, v11, v1 -; GFX10GISEL-NEXT: v_and_or_b32 v4, v10, v11, v4 -; GFX10GISEL-NEXT: v_and_or_b32 v5, v5, v11, s12 +; GFX10GISEL-NEXT: v_and_or_b32 v3, 0xffff, v9, s12 +; GFX10GISEL-NEXT: v_and_or_b32 v2, 0xffff, v0, v1 +; GFX10GISEL-NEXT: v_and_or_b32 v4, 0xffff, v10, v4 +; GFX10GISEL-NEXT: v_and_or_b32 v5, 0xffff, v5, s12 ; GFX10GISEL-NEXT: image_sample_d_g16 v[0:3], v[2:8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -572,10 +560,9 @@ ; ; GFX10GISEL-LABEL: sample_g16_noa16_c_d_1d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10GISEL-NEXT: v_and_or_b32 v1, v1, v4, s12 -; GFX10GISEL-NEXT: v_and_or_b32 v2, v2, v4, s12 +; GFX10GISEL-NEXT: v_and_or_b32 v1, 0xffff, v1, s12 +; GFX10GISEL-NEXT: v_and_or_b32 v2, 0xffff, v2, s12 ; GFX10GISEL-NEXT: image_sample_c_d_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -587,9 +574,8 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { ; GFX10-LABEL: sample_g16_noa16_c_d_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff -; GFX10-NEXT: v_and_b32_e32 v3, v7, v3 -; GFX10-NEXT: v_and_b32_e32 v1, v7, v1 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX10-NEXT: image_sample_c_d_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D @@ -598,11 +584,10 @@ ; ; GFX10GISEL-LABEL: sample_g16_noa16_c_d_2d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_mov_b32_e32 v7, 0xffff ; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX10GISEL-NEXT: v_and_or_b32 v1, v1, v7, v2 -; GFX10GISEL-NEXT: v_and_or_b32 v2, v3, v7, v4 +; GFX10GISEL-NEXT: v_and_or_b32 v1, 0xffff, v1, v2 +; GFX10GISEL-NEXT: v_and_or_b32 v2, 0xffff, v3, v4 ; GFX10GISEL-NEXT: image_sample_c_d_g16 v[0:3], [v0, v1, v2, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -620,10 +605,9 @@ ; ; GFX10GISEL-LABEL: sample_g16_noa16_d_cl_1d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10GISEL-NEXT: v_and_or_b32 v0, v0, v4, s12 -; GFX10GISEL-NEXT: v_and_or_b32 v1, v1, v4, s12 +; GFX10GISEL-NEXT: v_and_or_b32 v0, 0xffff, v0, s12 +; GFX10GISEL-NEXT: v_and_or_b32 v1, 0xffff, v1, s12 ; GFX10GISEL-NEXT: image_sample_d_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -635,9 +619,8 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) { ; GFX10-LABEL: sample_g16_noa16_d_cl_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff -; GFX10-NEXT: v_and_b32_e32 v2, v7, v2 -; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D @@ -646,11 +629,10 @@ ; ; GFX10GISEL-LABEL: sample_g16_noa16_d_cl_2d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_mov_b32_e32 v7, 0xffff ; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX10GISEL-NEXT: v_and_or_b32 v0, v0, v7, v1 -; GFX10GISEL-NEXT: v_and_or_b32 v1, v2, v7, v3 +; GFX10GISEL-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX10GISEL-NEXT: v_and_or_b32 v1, 0xffff, v2, v3 ; GFX10GISEL-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v1, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -668,10 +650,9 @@ ; ; GFX10GISEL-LABEL: sample_g16_noa16_c_d_cl_1d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_mov_b32_e32 v5, 0xffff ; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10GISEL-NEXT: v_and_or_b32 v1, v1, v5, s12 -; GFX10GISEL-NEXT: v_and_or_b32 v2, v2, v5, s12 +; GFX10GISEL-NEXT: v_and_or_b32 v1, 0xffff, v1, s12 +; GFX10GISEL-NEXT: v_and_or_b32 v2, 0xffff, v2, s12 ; GFX10GISEL-NEXT: image_sample_c_d_cl_g16 v[0:3], v[0:4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -685,11 +666,10 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v8, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff -; GFX10-NEXT: v_and_b32_e32 v3, v0, v3 -; GFX10-NEXT: v_and_b32_e32 v0, v0, v1 -; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v3 -; GFX10-NEXT: v_lshl_or_b32 v3, v8, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v0 +; GFX10-NEXT: v_lshl_or_b32 v3, v8, 16, v1 ; GFX10-NEXT: image_sample_c_d_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -699,11 +679,10 @@ ; GFX10GISEL-NEXT: v_mov_b32_e32 v8, v2 ; GFX10GISEL-NEXT: v_mov_b32_e32 v9, v3 ; GFX10GISEL-NEXT: v_mov_b32_e32 v2, v0 -; GFX10GISEL-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v8 -; GFX10GISEL-NEXT: v_and_or_b32 v4, v9, v0, v4 -; GFX10GISEL-NEXT: v_and_or_b32 v3, v1, v0, v3 +; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v8 +; GFX10GISEL-NEXT: v_and_or_b32 v4, 0xffff, v9, v4 +; GFX10GISEL-NEXT: v_and_or_b32 v3, 0xffff, v1, v0 ; GFX10GISEL-NEXT: image_sample_c_d_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -721,10 +700,9 @@ ; ; GFX10GISEL-LABEL: sample_g16_noa16_cd_1d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10GISEL-NEXT: v_and_or_b32 v0, v0, v3, s12 -; GFX10GISEL-NEXT: v_and_or_b32 v1, v1, v3, s12 +; GFX10GISEL-NEXT: v_and_or_b32 v0, 0xffff, v0, s12 +; GFX10GISEL-NEXT: v_and_or_b32 v1, 0xffff, v1, s12 ; GFX10GISEL-NEXT: image_sample_cd_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -736,9 +714,8 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { ; GFX10-LABEL: sample_g16_noa16_cd_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v6, 0xffff -; GFX10-NEXT: v_and_b32_e32 v2, v6, v2 -; GFX10-NEXT: v_and_b32_e32 v0, v6, v0 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX10-NEXT: image_sample_cd_g16 v[0:3], [v0, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D @@ -747,11 +724,10 @@ ; ; GFX10GISEL-LABEL: sample_g16_noa16_cd_2d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_mov_b32_e32 v6, 0xffff ; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX10GISEL-NEXT: v_and_or_b32 v0, v0, v6, v1 -; GFX10GISEL-NEXT: v_and_or_b32 v1, v2, v6, v3 +; GFX10GISEL-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX10GISEL-NEXT: v_and_or_b32 v1, 0xffff, v2, v3 ; GFX10GISEL-NEXT: image_sample_cd_g16 v[0:3], [v0, v1, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -769,10 +745,9 @@ ; ; GFX10GISEL-LABEL: sample_g16_noa16_c_cd_1d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10GISEL-NEXT: v_and_or_b32 v1, v1, v4, s12 -; GFX10GISEL-NEXT: v_and_or_b32 v2, v2, v4, s12 +; GFX10GISEL-NEXT: v_and_or_b32 v1, 0xffff, v1, s12 +; GFX10GISEL-NEXT: v_and_or_b32 v2, 0xffff, v2, s12 ; GFX10GISEL-NEXT: image_sample_c_cd_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -784,9 +759,8 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { ; GFX10-LABEL: sample_g16_noa16_c_cd_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff -; GFX10-NEXT: v_and_b32_e32 v3, v7, v3 -; GFX10-NEXT: v_and_b32_e32 v1, v7, v1 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D @@ -795,11 +769,10 @@ ; ; GFX10GISEL-LABEL: sample_g16_noa16_c_cd_2d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_mov_b32_e32 v7, 0xffff ; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX10GISEL-NEXT: v_and_or_b32 v1, v1, v7, v2 -; GFX10GISEL-NEXT: v_and_or_b32 v2, v3, v7, v4 +; GFX10GISEL-NEXT: v_and_or_b32 v1, 0xffff, v1, v2 +; GFX10GISEL-NEXT: v_and_or_b32 v2, 0xffff, v3, v4 ; GFX10GISEL-NEXT: image_sample_c_cd_g16 v[0:3], [v0, v1, v2, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -817,10 +790,9 @@ ; ; GFX10GISEL-LABEL: sample_g16_noa16_cd_cl_1d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10GISEL-NEXT: v_and_or_b32 v0, v0, v4, s12 -; GFX10GISEL-NEXT: v_and_or_b32 v1, v1, v4, s12 +; GFX10GISEL-NEXT: v_and_or_b32 v0, 0xffff, v0, s12 +; GFX10GISEL-NEXT: v_and_or_b32 v1, 0xffff, v1, s12 ; GFX10GISEL-NEXT: image_sample_cd_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -832,9 +804,8 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) { ; GFX10-LABEL: sample_g16_noa16_cd_cl_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff -; GFX10-NEXT: v_and_b32_e32 v2, v7, v2 -; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v0, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D @@ -843,11 +814,10 @@ ; ; GFX10GISEL-LABEL: sample_g16_noa16_cd_cl_2d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_mov_b32_e32 v7, 0xffff ; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX10GISEL-NEXT: v_and_or_b32 v0, v0, v7, v1 -; GFX10GISEL-NEXT: v_and_or_b32 v1, v2, v7, v3 +; GFX10GISEL-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX10GISEL-NEXT: v_and_or_b32 v1, 0xffff, v2, v3 ; GFX10GISEL-NEXT: image_sample_cd_cl_g16 v[0:3], [v0, v1, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -865,10 +835,9 @@ ; ; GFX10GISEL-LABEL: sample_g16_noa16_c_cd_cl_1d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_mov_b32_e32 v5, 0xffff ; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10GISEL-NEXT: v_and_or_b32 v1, v1, v5, s12 -; GFX10GISEL-NEXT: v_and_or_b32 v2, v2, v5, s12 +; GFX10GISEL-NEXT: v_and_or_b32 v1, 0xffff, v1, s12 +; GFX10GISEL-NEXT: v_and_or_b32 v2, 0xffff, v2, s12 ; GFX10GISEL-NEXT: image_sample_c_cd_cl_g16 v[0:3], v[0:4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -882,11 +851,10 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v8, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff -; GFX10-NEXT: v_and_b32_e32 v3, v0, v3 -; GFX10-NEXT: v_and_b32_e32 v0, v0, v1 -; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v3 -; GFX10-NEXT: v_lshl_or_b32 v3, v8, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v0 +; GFX10-NEXT: v_lshl_or_b32 v3, v8, 16, v1 ; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -896,11 +864,10 @@ ; GFX10GISEL-NEXT: v_mov_b32_e32 v8, v2 ; GFX10GISEL-NEXT: v_mov_b32_e32 v9, v3 ; GFX10GISEL-NEXT: v_mov_b32_e32 v2, v0 -; GFX10GISEL-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v8 -; GFX10GISEL-NEXT: v_and_or_b32 v4, v9, v0, v4 -; GFX10GISEL-NEXT: v_and_or_b32 v3, v1, v0, v3 +; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v8 +; GFX10GISEL-NEXT: v_and_or_b32 v4, 0xffff, v9, v4 +; GFX10GISEL-NEXT: v_and_or_b32 v3, 0xffff, v1, v0 ; GFX10GISEL-NEXT: image_sample_c_cd_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -913,14 +880,13 @@ ; GFX10-LABEL: sample_g16_noa16_c_d_o_2darray_V1: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v9, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX10-NEXT: v_mov_b32_e32 v10, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: v_and_b32_e32 v1, v0, v4 -; GFX10-NEXT: v_and_b32_e32 v0, v0, v9 -; GFX10-NEXT: v_lshl_or_b32 v5, v5, 16, v1 -; GFX10-NEXT: v_lshl_or_b32 v4, v10, 16, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; GFX10-NEXT: v_lshl_or_b32 v5, v5, 16, v0 +; GFX10-NEXT: v_lshl_or_b32 v4, v10, 16, v1 ; GFX10-NEXT: image_sample_c_d_o_g16 v0, v[2:8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -932,11 +898,10 @@ ; GFX10GISEL-NEXT: v_mov_b32_e32 v11, v4 ; GFX10GISEL-NEXT: v_mov_b32_e32 v2, v0 ; GFX10GISEL-NEXT: v_mov_b32_e32 v3, v1 -; GFX10GISEL-NEXT: v_mov_b32_e32 v0, 0xffff -; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v9 -; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX10GISEL-NEXT: v_and_or_b32 v4, v10, v0, v1 -; GFX10GISEL-NEXT: v_and_or_b32 v5, v11, v0, v5 +; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v9 +; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX10GISEL-NEXT: v_and_or_b32 v4, 0xffff, v10, v0 +; GFX10GISEL-NEXT: v_and_or_b32 v5, 0xffff, v11, v1 ; GFX10GISEL-NEXT: image_sample_c_d_o_g16 v0, v[2:8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -949,14 +914,13 @@ ; GFX10-LABEL: sample_g16_noa16_c_d_o_2darray_V2: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v9, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX10-NEXT: v_mov_b32_e32 v10, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: v_and_b32_e32 v1, v0, v4 -; GFX10-NEXT: v_and_b32_e32 v0, v0, v9 -; GFX10-NEXT: v_lshl_or_b32 v5, v5, 16, v1 -; GFX10-NEXT: v_lshl_or_b32 v4, v10, 16, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; GFX10-NEXT: v_lshl_or_b32 v5, v5, 16, v0 +; GFX10-NEXT: v_lshl_or_b32 v4, v10, 16, v1 ; GFX10-NEXT: image_sample_c_d_o_g16 v[0:1], v[2:8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -968,11 +932,10 @@ ; GFX10GISEL-NEXT: v_mov_b32_e32 v11, v4 ; GFX10GISEL-NEXT: v_mov_b32_e32 v2, v0 ; GFX10GISEL-NEXT: v_mov_b32_e32 v3, v1 -; GFX10GISEL-NEXT: v_mov_b32_e32 v0, 0xffff -; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v9 -; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX10GISEL-NEXT: v_and_or_b32 v4, v10, v0, v1 -; GFX10GISEL-NEXT: v_and_or_b32 v5, v11, v0, v5 +; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v9 +; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX10GISEL-NEXT: v_and_or_b32 v4, 0xffff, v10, v0 +; GFX10GISEL-NEXT: v_and_or_b32 v5, 0xffff, v11, v1 ; GFX10GISEL-NEXT: image_sample_c_d_o_g16 v[0:1], v[2:8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll @@ -15,9 +15,8 @@ define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { ; GFX10-LABEL: sample_d_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v6, 0xffff ; encoding: [0xff,0x02,0x0c,0x7e,0xff,0xff,0x00,0x00] -; GFX10-NEXT: v_and_b32_e32 v2, v6, v2 ; encoding: [0x06,0x05,0x04,0x36] -; GFX10-NEXT: v_and_b32_e32 v0, v6, v0 ; encoding: [0x06,0x01,0x00,0x36] +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; encoding: [0xff,0x04,0x04,0x36,0xff,0xff,0x00,0x00] +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; encoding: [0xff,0x00,0x00,0x36,0xff,0xff,0x00,0x00] ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; encoding: [0x02,0x00,0x6f,0xd7,0x03,0x21,0x09,0x04] ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; encoding: [0x00,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04] ; GFX10-NEXT: image_sample_d_g16 v[0:3], [v0, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0x88,0xf0,0x00,0x00,0x40,0x00,0x02,0x04,0x05,0x00] @@ -33,10 +32,9 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v9, v3 ; encoding: [0x03,0x03,0x12,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; encoding: [0x02,0x03,0x06,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff ; encoding: [0xff,0x02,0x04,0x7e,0xff,0xff,0x00,0x00] -; GFX10-NEXT: v_and_b32_e32 v9, v2, v9 ; encoding: [0x02,0x13,0x12,0x36] -; GFX10-NEXT: v_and_b32_e32 v0, v2, v0 ; encoding: [0x02,0x01,0x00,0x36] -; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v9 ; encoding: [0x04,0x00,0x6f,0xd7,0x04,0x21,0x25,0x04] +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; encoding: [0xff,0x00,0x00,0x36,0xff,0xff,0x00,0x00] +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v9 ; encoding: [0xff,0x12,0x04,0x36,0xff,0xff,0x00,0x00] +; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v2 ; encoding: [0x04,0x00,0x6f,0xd7,0x04,0x21,0x09,0x04] ; GFX10-NEXT: v_lshl_or_b32 v2, v1, 16, v0 ; encoding: [0x02,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04] ; GFX10-NEXT: image_sample_d_g16 v[0:3], v[2:8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D ; encoding: [0x11,0x0f,0x88,0xf0,0x02,0x00,0x40,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] @@ -60,9 +58,8 @@ define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { ; GFX10-LABEL: sample_c_d_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff ; encoding: [0xff,0x02,0x0e,0x7e,0xff,0xff,0x00,0x00] -; GFX10-NEXT: v_and_b32_e32 v3, v7, v3 ; encoding: [0x07,0x07,0x06,0x36] -; GFX10-NEXT: v_and_b32_e32 v1, v7, v1 ; encoding: [0x07,0x03,0x02,0x36] +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; encoding: [0xff,0x06,0x06,0x36,0xff,0xff,0x00,0x00] +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; encoding: [0xff,0x02,0x02,0x36,0xff,0xff,0x00,0x00] ; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; encoding: [0x03,0x00,0x6f,0xd7,0x04,0x21,0x0d,0x04] ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; encoding: [0x01,0x00,0x6f,0xd7,0x02,0x21,0x05,0x04] ; GFX10-NEXT: image_sample_c_d_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0xa8,0xf0,0x00,0x00,0x40,0x00,0x01,0x03,0x05,0x06] @@ -87,9 +84,8 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) { ; GFX10-LABEL: sample_d_cl_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff ; encoding: [0xff,0x02,0x0e,0x7e,0xff,0xff,0x00,0x00] -; GFX10-NEXT: v_and_b32_e32 v2, v7, v2 ; encoding: [0x07,0x05,0x04,0x36] -; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 ; encoding: [0x07,0x01,0x00,0x36] +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; encoding: [0xff,0x04,0x04,0x36,0xff,0xff,0x00,0x00] +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; encoding: [0xff,0x00,0x00,0x36,0xff,0xff,0x00,0x00] ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; encoding: [0x02,0x00,0x6f,0xd7,0x03,0x21,0x09,0x04] ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; encoding: [0x00,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04] ; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0x8c,0xf0,0x00,0x00,0x40,0x00,0x02,0x04,0x05,0x06] @@ -116,11 +112,10 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v8, v2 ; encoding: [0x02,0x03,0x10,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff ; encoding: [0xff,0x02,0x00,0x7e,0xff,0xff,0x00,0x00] -; GFX10-NEXT: v_and_b32_e32 v3, v0, v3 ; encoding: [0x00,0x07,0x06,0x36] -; GFX10-NEXT: v_and_b32_e32 v0, v0, v1 ; encoding: [0x00,0x03,0x00,0x36] -; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v3 ; encoding: [0x04,0x00,0x6f,0xd7,0x04,0x21,0x0d,0x04] -; GFX10-NEXT: v_lshl_or_b32 v3, v8, 16, v0 ; encoding: [0x03,0x00,0x6f,0xd7,0x08,0x21,0x01,0x04] +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v3 ; encoding: [0xff,0x06,0x00,0x36,0xff,0xff,0x00,0x00] +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; encoding: [0xff,0x02,0x02,0x36,0xff,0xff,0x00,0x00] +; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v0 ; encoding: [0x04,0x00,0x6f,0xd7,0x04,0x21,0x01,0x04] +; GFX10-NEXT: v_lshl_or_b32 v3, v8, 16, v1 ; encoding: [0x03,0x00,0x6f,0xd7,0x08,0x21,0x05,0x04] ; GFX10-NEXT: image_sample_c_d_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x09,0x0f,0xac,0xf0,0x02,0x00,0x40,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog @@ -143,9 +138,8 @@ define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { ; GFX10-LABEL: sample_cd_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v6, 0xffff ; encoding: [0xff,0x02,0x0c,0x7e,0xff,0xff,0x00,0x00] -; GFX10-NEXT: v_and_b32_e32 v2, v6, v2 ; encoding: [0x06,0x05,0x04,0x36] -; GFX10-NEXT: v_and_b32_e32 v0, v6, v0 ; encoding: [0x06,0x01,0x00,0x36] +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; encoding: [0xff,0x04,0x04,0x36,0xff,0xff,0x00,0x00] +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; encoding: [0xff,0x00,0x00,0x36,0xff,0xff,0x00,0x00] ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; encoding: [0x02,0x00,0x6f,0xd7,0x03,0x21,0x09,0x04] ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; encoding: [0x00,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04] ; GFX10-NEXT: image_sample_cd_g16 v[0:3], [v0, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0xa0,0xf1,0x00,0x00,0x40,0x00,0x02,0x04,0x05,0x00] @@ -170,9 +164,8 @@ define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { ; GFX10-LABEL: sample_c_cd_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff ; encoding: [0xff,0x02,0x0e,0x7e,0xff,0xff,0x00,0x00] -; GFX10-NEXT: v_and_b32_e32 v3, v7, v3 ; encoding: [0x07,0x07,0x06,0x36] -; GFX10-NEXT: v_and_b32_e32 v1, v7, v1 ; encoding: [0x07,0x03,0x02,0x36] +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; encoding: [0xff,0x06,0x06,0x36,0xff,0xff,0x00,0x00] +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; encoding: [0xff,0x02,0x02,0x36,0xff,0xff,0x00,0x00] ; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; encoding: [0x03,0x00,0x6f,0xd7,0x04,0x21,0x0d,0x04] ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; encoding: [0x01,0x00,0x6f,0xd7,0x02,0x21,0x05,0x04] ; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0xa8,0xf1,0x00,0x00,0x40,0x00,0x01,0x03,0x05,0x06] @@ -197,9 +190,8 @@ define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) { ; GFX10-LABEL: sample_cd_cl_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff ; encoding: [0xff,0x02,0x0e,0x7e,0xff,0xff,0x00,0x00] -; GFX10-NEXT: v_and_b32_e32 v2, v7, v2 ; encoding: [0x07,0x05,0x04,0x36] -; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 ; encoding: [0x07,0x01,0x00,0x36] +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; encoding: [0xff,0x04,0x04,0x36,0xff,0xff,0x00,0x00] +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; encoding: [0xff,0x00,0x00,0x36,0xff,0xff,0x00,0x00] ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; encoding: [0x02,0x00,0x6f,0xd7,0x03,0x21,0x09,0x04] ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; encoding: [0x00,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04] ; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v0, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0xa4,0xf1,0x00,0x00,0x40,0x00,0x02,0x04,0x05,0x06] @@ -226,11 +218,10 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v8, v2 ; encoding: [0x02,0x03,0x10,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff ; encoding: [0xff,0x02,0x00,0x7e,0xff,0xff,0x00,0x00] -; GFX10-NEXT: v_and_b32_e32 v3, v0, v3 ; encoding: [0x00,0x07,0x06,0x36] -; GFX10-NEXT: v_and_b32_e32 v0, v0, v1 ; encoding: [0x00,0x03,0x00,0x36] -; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v3 ; encoding: [0x04,0x00,0x6f,0xd7,0x04,0x21,0x0d,0x04] -; GFX10-NEXT: v_lshl_or_b32 v3, v8, 16, v0 ; encoding: [0x03,0x00,0x6f,0xd7,0x08,0x21,0x01,0x04] +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v3 ; encoding: [0xff,0x06,0x00,0x36,0xff,0xff,0x00,0x00] +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; encoding: [0xff,0x02,0x02,0x36,0xff,0xff,0x00,0x00] +; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v0 ; encoding: [0x04,0x00,0x6f,0xd7,0x04,0x21,0x01,0x04] +; GFX10-NEXT: v_lshl_or_b32 v3, v8, 16, v1 ; encoding: [0x03,0x00,0x6f,0xd7,0x08,0x21,0x05,0x04] ; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x09,0x0f,0xac,0xf1,0x02,0x00,0x40,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog @@ -243,14 +234,13 @@ ; GFX10-LABEL: sample_c_d_o_2darray_V1: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v9, v2 ; encoding: [0x02,0x03,0x12,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff ; encoding: [0xff,0x02,0x00,0x7e,0xff,0xff,0x00,0x00] ; GFX10-NEXT: v_mov_b32_e32 v10, v3 ; encoding: [0x03,0x03,0x14,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; encoding: [0x01,0x03,0x06,0x7e] -; GFX10-NEXT: v_and_b32_e32 v1, v0, v4 ; encoding: [0x00,0x09,0x02,0x36] -; GFX10-NEXT: v_and_b32_e32 v0, v0, v9 ; encoding: [0x00,0x13,0x00,0x36] -; GFX10-NEXT: v_lshl_or_b32 v5, v5, 16, v1 ; encoding: [0x05,0x00,0x6f,0xd7,0x05,0x21,0x05,0x04] -; GFX10-NEXT: v_lshl_or_b32 v4, v10, 16, v0 ; encoding: [0x04,0x00,0x6f,0xd7,0x0a,0x21,0x01,0x04] +; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v4 ; encoding: [0xff,0x08,0x00,0x36,0xff,0xff,0x00,0x00] +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v9 ; encoding: [0xff,0x12,0x02,0x36,0xff,0xff,0x00,0x00] +; GFX10-NEXT: v_lshl_or_b32 v5, v5, 16, v0 ; encoding: [0x05,0x00,0x6f,0xd7,0x05,0x21,0x01,0x04] +; GFX10-NEXT: v_lshl_or_b32 v4, v10, 16, v1 ; encoding: [0x04,0x00,0x6f,0xd7,0x0a,0x21,0x05,0x04] ; GFX10-NEXT: image_sample_c_d_o_g16 v0, v[2:8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY ; encoding: [0x29,0x04,0xe8,0xf0,0x02,0x00,0x40,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog @@ -263,14 +253,13 @@ ; GFX10-LABEL: sample_c_d_o_2darray_V2: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v9, v2 ; encoding: [0x02,0x03,0x12,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff ; encoding: [0xff,0x02,0x00,0x7e,0xff,0xff,0x00,0x00] ; GFX10-NEXT: v_mov_b32_e32 v10, v3 ; encoding: [0x03,0x03,0x14,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; encoding: [0x01,0x03,0x06,0x7e] -; GFX10-NEXT: v_and_b32_e32 v1, v0, v4 ; encoding: [0x00,0x09,0x02,0x36] -; GFX10-NEXT: v_and_b32_e32 v0, v0, v9 ; encoding: [0x00,0x13,0x00,0x36] -; GFX10-NEXT: v_lshl_or_b32 v5, v5, 16, v1 ; encoding: [0x05,0x00,0x6f,0xd7,0x05,0x21,0x05,0x04] -; GFX10-NEXT: v_lshl_or_b32 v4, v10, 16, v0 ; encoding: [0x04,0x00,0x6f,0xd7,0x0a,0x21,0x01,0x04] +; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v4 ; encoding: [0xff,0x08,0x00,0x36,0xff,0xff,0x00,0x00] +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v9 ; encoding: [0xff,0x12,0x02,0x36,0xff,0xff,0x00,0x00] +; GFX10-NEXT: v_lshl_or_b32 v5, v5, 16, v0 ; encoding: [0x05,0x00,0x6f,0xd7,0x05,0x21,0x01,0x04] +; GFX10-NEXT: v_lshl_or_b32 v4, v10, 16, v1 ; encoding: [0x04,0x00,0x6f,0xd7,0x0a,0x21,0x05,0x04] ; GFX10-NEXT: image_sample_c_d_o_g16 v[0:1], v[2:8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY ; encoding: [0x29,0x06,0xe8,0xf0,0x02,0x00,0x40,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll @@ -15,9 +15,8 @@ define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { ; GFX10-LABEL: sample_d_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v6, 0xffff -; GFX10-NEXT: v_and_b32_e32 v2, v6, v2 -; GFX10-NEXT: v_and_b32_e32 v0, v6, v0 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX10-NEXT: image_sample_d_g16 v[0:3], [v0, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D @@ -33,10 +32,9 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v9, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX10-NEXT: v_and_b32_e32 v9, v2, v9 -; GFX10-NEXT: v_and_b32_e32 v0, v2, v0 -; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v9 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v9 +; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v2 ; GFX10-NEXT: v_lshl_or_b32 v2, v1, 16, v0 ; GFX10-NEXT: image_sample_d_g16 v[0:3], v[2:8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -60,9 +58,8 @@ define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { ; GFX10-LABEL: sample_c_d_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff -; GFX10-NEXT: v_and_b32_e32 v3, v7, v3 -; GFX10-NEXT: v_and_b32_e32 v1, v7, v1 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX10-NEXT: image_sample_c_d_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D @@ -87,9 +84,8 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) { ; GFX10-LABEL: sample_d_cl_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff -; GFX10-NEXT: v_and_b32_e32 v2, v7, v2 -; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D @@ -116,11 +112,10 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v8, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff -; GFX10-NEXT: v_and_b32_e32 v3, v0, v3 -; GFX10-NEXT: v_and_b32_e32 v0, v0, v1 -; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v3 -; GFX10-NEXT: v_lshl_or_b32 v3, v8, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v0 +; GFX10-NEXT: v_lshl_or_b32 v3, v8, 16, v1 ; GFX10-NEXT: image_sample_c_d_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -143,9 +138,8 @@ define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { ; GFX10-LABEL: sample_cd_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v6, 0xffff -; GFX10-NEXT: v_and_b32_e32 v2, v6, v2 -; GFX10-NEXT: v_and_b32_e32 v0, v6, v0 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX10-NEXT: image_sample_cd_g16 v[0:3], [v0, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D @@ -170,9 +164,8 @@ define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { ; GFX10-LABEL: sample_c_cd_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff -; GFX10-NEXT: v_and_b32_e32 v3, v7, v3 -; GFX10-NEXT: v_and_b32_e32 v1, v7, v1 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D @@ -197,9 +190,8 @@ define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) { ; GFX10-LABEL: sample_cd_cl_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff -; GFX10-NEXT: v_and_b32_e32 v2, v7, v2 -; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v0, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D @@ -226,11 +218,10 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v8, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff -; GFX10-NEXT: v_and_b32_e32 v3, v0, v3 -; GFX10-NEXT: v_and_b32_e32 v0, v0, v1 -; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v3 -; GFX10-NEXT: v_lshl_or_b32 v3, v8, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v0 +; GFX10-NEXT: v_lshl_or_b32 v3, v8, 16, v1 ; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -243,14 +234,13 @@ ; GFX10-LABEL: sample_c_d_o_2darray_V1: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v9, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX10-NEXT: v_mov_b32_e32 v10, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: v_and_b32_e32 v1, v0, v4 -; GFX10-NEXT: v_and_b32_e32 v0, v0, v9 -; GFX10-NEXT: v_lshl_or_b32 v5, v5, 16, v1 -; GFX10-NEXT: v_lshl_or_b32 v4, v10, 16, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; GFX10-NEXT: v_lshl_or_b32 v5, v5, 16, v0 +; GFX10-NEXT: v_lshl_or_b32 v4, v10, 16, v1 ; GFX10-NEXT: image_sample_c_d_o_g16 v0, v[2:8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -263,14 +253,13 @@ ; GFX10-LABEL: sample_c_d_o_2darray_V2: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v9, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX10-NEXT: v_mov_b32_e32 v10, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: v_and_b32_e32 v1, v0, v4 -; GFX10-NEXT: v_and_b32_e32 v0, v0, v9 -; GFX10-NEXT: v_lshl_or_b32 v5, v5, 16, v1 -; GFX10-NEXT: v_lshl_or_b32 v4, v10, 16, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; GFX10-NEXT: v_lshl_or_b32 v5, v5, 16, v0 +; GFX10-NEXT: v_lshl_or_b32 v4, v10, 16, v1 ; GFX10-NEXT: image_sample_c_d_o_g16 v[0:1], v[2:8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.format.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.format.d16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.format.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.format.d16.ll @@ -31,10 +31,9 @@ ; GCN-LABEL: {{^}}buffer_store_format_d16_xyz: ; GCN-DAG: s_load_dwordx2 s[[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x10 -; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}} ; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16 -; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], [[K]] -; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], [[K]] +; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], 0xffff{{$}} +; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], 0xffff{{$}} ; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]] ; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[MASKED1]] @@ -56,11 +55,10 @@ ; GCN-LABEL: {{^}}buffer_store_format_d16_xyzw: ; GCN-DAG: s_load_dwordx2 s[[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x10 -; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}} ; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16 -; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], [[K]] +; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], 0xffff{{$}} ; UNPACKED-DAG: s_lshr_b32 [[SHR1:s[0-9]+]], s[[S_DATA_1]], 16 -; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], [[K]] +; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], 0xffff{{$}} ; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]] ; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR1]] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll @@ -35,10 +35,9 @@ ; GCN-LABEL: {{^}}tbuffer_store_d16_xyz: ; GCN-DAG: s_load_dwordx2 s[[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, -; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}} ; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16 -; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], [[K]] -; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], [[K]] +; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], 0xffff{{$}} +; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], 0xffff{{$}} ; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]] ; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[MASKED1]] @@ -60,11 +59,10 @@ ; GCN-LABEL: {{^}}tbuffer_store_d16_xyzw: ; GCN-DAG: s_load_dwordx2 s[[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, -; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}} ; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16 -; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], [[K]] +; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], 0xffff{{$}} ; UNPACKED-DAG: s_lshr_b32 [[SHR1:s[0-9]+]], s[[S_DATA_1]], 16 -; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], [[K]] +; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], 0xffff{{$}} ; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]] ; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR1]] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.format.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.format.d16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.format.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.format.d16.ll @@ -31,10 +31,9 @@ ; GCN-LABEL: {{^}}buffer_store_format_d16_xyz: ; GCN-DAG: s_load_dwordx2 s[[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x10 -; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}} ; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16 -; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], [[K]] -; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], [[K]] +; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], 0xffff{{$}} +; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], 0xffff{{$}} ; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]] ; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[MASKED1]] @@ -56,11 +55,10 @@ ; GCN-LABEL: {{^}}buffer_store_format_d16_xyzw: ; GCN-DAG: s_load_dwordx2 s[[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x10 -; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}} ; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16 -; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], [[K]] +; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], 0xffff{{$}} ; UNPACKED-DAG: s_lshr_b32 [[SHR1:s[0-9]+]], s[[S_DATA_1]], 16 -; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], [[K]] +; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], 0xffff{{$}} ; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]] ; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR1]] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll @@ -35,10 +35,9 @@ ; GCN-LABEL: {{^}}tbuffer_store_d16_xyz: ; GCN-DAG: s_load_dwordx2 s[[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x10 -; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}} ; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16 -; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], [[K]] -; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], [[K]] +; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], 0xffff{{$}} +; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], 0xffff{{$}} ; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]] ; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[MASKED1]] @@ -59,11 +58,10 @@ ; GCN-LABEL: {{^}}tbuffer_store_d16_xyzw: ; GCN-DAG: s_load_dwordx2 s[[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x10 -; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}} ; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16 -; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], [[K]] +; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], 0xffff{{$}} ; UNPACKED-DAG: s_lshr_b32 [[SHR1:s[0-9]+]], s[[S_DATA_1]], 16 -; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], [[K]] +; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], 0xffff{{$}} ; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]] ; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR1]] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.d16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.d16.ll @@ -31,10 +31,9 @@ ; GCN-LABEL: {{^}}tbuffer_store_d16_xyz: ; GCN-DAG: s_load_dwordx2 s[[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x10 -; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}} ; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16 -; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], [[K]] -; UNPACKED-DAG: s_and_b32 [[SHR1:s[0-9]+]], s[[S_DATA_1]], [[K]] +; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], 0xffff{{$}} +; UNPACKED-DAG: s_and_b32 [[SHR1:s[0-9]+]], s[[S_DATA_1]], 0xffff{{$}} ; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]] ; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR1]] @@ -53,11 +52,10 @@ ; GCN-LABEL: {{^}}tbuffer_store_d16_xyzw: ; GCN-DAG: s_load_dwordx2 s[[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x10 -; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}} ; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16 -; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], [[K]] +; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], 0xffff{{$}} ; UNPACKED-DAG: s_lshr_b32 [[SHR1:s[0-9]+]], s[[S_DATA_1]], 16 -; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], [[K]] +; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], 0xffff{{$}} ; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]] ; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR1]] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.log.f16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.log.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log.f16.ll @@ -33,8 +33,8 @@ ; VI: flat_load_dword v[[A_F16_0:[0-9]+]] ; GFX9: global_load_dword v[[A_F16_0:[0-9]+]] ; SI: s_mov_b32 [[A_F32_2:s[0-9]+]], 0x3f317218 -; VIGFX9: s_movk_i32 [[A_F32_2:s[0-9]+]], 0x398c -; VI: v_mov_b32_e32 [[A_F32_2_V:v[0-9]+]], [[A_F32_2]] +; GFX9: s_movk_i32 [[A_F32_2:s[0-9]+]], 0x398c +; VI: v_mov_b32_e32 [[A_F32_2_V:v[0-9]+]], 0x398c ; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_F16_0]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_F16_1]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_0]] @@ -49,7 +49,8 @@ ; VI: v_log_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_F16_0]] ; VI: v_mul_f16_sdwa v[[R_F16_2:[0-9]+]], v[[R_F16_1]], [[A_F32_2_V]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9: v_mul_f16_e32 v[[R_F32_3:[0-9]+]], [[A_F32_2]], v[[R_F16_2]] -; VIGFX9: v_mul_f16_e32 v[[R_F32_2:[0-9]+]], [[A_F32_2]], v[[R_F16_0]] +; VI: v_mul_f16_e32 v[[R_F32_2:[0-9]+]], 0x398c, v[[R_F16_0]] +; GFX9: v_mul_f16_e32 v[[R_F32_2:[0-9]+]], [[A_F32_2]], v[[R_F16_0]] ; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_0]] ; SI-NOT: v_and_b32_e32 ; SI: v_or_b32_e32 v[[R_F32_5:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log10.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.log10.f16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.log10.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log10.f16.ll @@ -33,8 +33,8 @@ ; VI: flat_load_dword v[[A_F16_0:[0-9]+]] ; GFX9: global_load_dword v[[A_F16_0:[0-9]+]] ; SI: s_mov_b32 [[A_F32_2:s[0-9]+]], 0x3e9a209a -; VIGFX9: s_movk_i32 [[A_F32_2:s[0-9]+]], 0x34d1 -; VI: v_mov_b32_e32 [[A_F32_2_V:v[0-9]+]], [[A_F32_2]] +; GFX9: s_movk_i32 [[A_F32_2:s[0-9]+]], 0x34d1 +; VI: v_mov_b32_e32 [[A_F32_2_V:v[0-9]+]], 0x34d1 ; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_F16_0]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_F16_1]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_0]] @@ -48,8 +48,9 @@ ; VIGFX9: v_log_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_F16_0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI: v_log_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_F16_0]] ; VI: v_mul_f16_sdwa v[[R_F16_2:[0-9]+]], v[[R_F16_1]], [[A_F32_2_V]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI: v_mul_f16_e32 v[[R_F32_3:[0-9]+]], 0x34d1, v[[R_F16_0]] ; GFX9: v_mul_f16_e32 v[[R_F32_3:[0-9]+]], [[A_F32_2]], v[[R_F16_2]] -; VIGFX9: v_mul_f16_e32 v[[R_F32_2:[0-9]+]], [[A_F32_2]], v[[R_F16_0]] +; GFX9: v_mul_f16_e32 v[[R_F32_2:[0-9]+]], [[A_F32_2]], v[[R_F16_0]] ; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_0]] ; SI-NOT: v_and_b32_e32 ; SI: v_or_b32_e32 v[[R_F32_5:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll @@ -147,21 +147,19 @@ ; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_movk_i32 s7, 0xfc01 ; SI-NEXT: s_mov_b32 s3, 0xfffff +; SI-NEXT: s_mov_b32 s2, s6 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bfe_u32 s0, s11, 0xb0014 -; SI-NEXT: s_add_i32 s14, s0, s7 -; SI-NEXT: s_mov_b32 s2, s6 -; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s14 -; SI-NEXT: s_brev_b32 s15, 1 +; SI-NEXT: s_add_i32 s7, s0, 0xfffffc01 +; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s7 ; SI-NEXT: s_andn2_b64 s[12:13], s[10:11], s[0:1] -; SI-NEXT: s_and_b32 s0, s11, s15 -; SI-NEXT: s_cmp_lt_i32 s14, 0 +; SI-NEXT: s_and_b32 s0, s11, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s7, 0 ; SI-NEXT: v_mov_b32_e32 v0, s13 ; SI-NEXT: v_mov_b32_e32 v1, s0 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_gt_i32 s14, 51 +; SI-NEXT: s_cmp_gt_i32 s7, 51 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; SI-NEXT: v_mov_b32_e32 v1, s11 ; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 @@ -172,23 +170,23 @@ ; SI-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; SI-NEXT: v_add_f64 v[2:3], s[10:11], -v[0:1] ; SI-NEXT: s_bfe_u32 s0, s9, 0xb0014 -; SI-NEXT: s_add_i32 s7, s0, s7 -; SI-NEXT: s_brev_b32 s10, -2 +; SI-NEXT: s_add_i32 s10, s0, 0xfffffc01 +; SI-NEXT: s_brev_b32 s7, -2 ; SI-NEXT: v_mov_b32_e32 v6, 0x3ff00000 ; SI-NEXT: v_mov_b32_e32 v4, s11 ; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5 -; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s7 -; SI-NEXT: v_bfi_b32 v4, s10, v6, v4 +; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s10 +; SI-NEXT: v_bfi_b32 v4, s7, v6, v4 ; SI-NEXT: s_andn2_b64 s[2:3], s[8:9], s[0:1] -; SI-NEXT: s_and_b32 s0, s9, s15 +; SI-NEXT: s_and_b32 s0, s9, 0x80000000 ; SI-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc ; SI-NEXT: v_mov_b32_e32 v2, 0 -; SI-NEXT: s_cmp_lt_i32 s7, 0 +; SI-NEXT: s_cmp_lt_i32 s10, 0 ; SI-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] ; SI-NEXT: v_mov_b32_e32 v0, s3 ; SI-NEXT: v_mov_b32_e32 v1, s0 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_gt_i32 s7, 51 +; SI-NEXT: s_cmp_gt_i32 s10, 51 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; SI-NEXT: v_mov_b32_e32 v1, s9 ; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 @@ -200,7 +198,7 @@ ; SI-NEXT: v_add_f64 v[4:5], s[8:9], -v[0:1] ; SI-NEXT: v_mov_b32_e32 v7, s9 ; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5 -; SI-NEXT: v_bfi_b32 v6, s10, v6, v7 +; SI-NEXT: v_bfi_b32 v6, s7, v6, v7 ; SI-NEXT: v_cndmask_b32_e32 v5, 0, v6, vcc ; SI-NEXT: v_mov_b32_e32 v4, 0 ; SI-NEXT: v_add_f64 v[0:1], v[0:1], v[4:5] @@ -245,22 +243,20 @@ ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x11 ; SI-NEXT: s_mov_b32 s14, -1 -; SI-NEXT: s_movk_i32 s18, 0xfc01 ; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xfffff +; SI-NEXT: s_mov_b32 s2, s14 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bfe_u32 s0, s7, 0xb0014 -; SI-NEXT: s_add_i32 s19, s0, s18 -; SI-NEXT: s_mov_b32 s2, s14 -; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s19 -; SI-NEXT: s_brev_b32 s20, 1 +; SI-NEXT: s_add_i32 s18, s0, 0xfffffc01 +; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s18 ; SI-NEXT: s_andn2_b64 s[16:17], s[6:7], s[0:1] -; SI-NEXT: s_and_b32 s0, s7, s20 -; SI-NEXT: s_cmp_lt_i32 s19, 0 +; SI-NEXT: s_and_b32 s0, s7, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s18, 0 ; SI-NEXT: v_mov_b32_e32 v0, s17 ; SI-NEXT: v_mov_b32_e32 v1, s0 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_gt_i32 s19, 51 +; SI-NEXT: s_cmp_gt_i32 s18, 51 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; SI-NEXT: v_mov_b32_e32 v1, s7 ; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 @@ -271,7 +267,7 @@ ; SI-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; SI-NEXT: v_add_f64 v[2:3], s[6:7], -v[0:1] ; SI-NEXT: s_bfe_u32 s0, s5, 0xb0014 -; SI-NEXT: s_add_i32 s17, s0, s18 +; SI-NEXT: s_add_i32 s17, s0, 0xfffffc01 ; SI-NEXT: s_brev_b32 s16, -2 ; SI-NEXT: v_mov_b32_e32 v12, 0x3ff00000 ; SI-NEXT: v_mov_b32_e32 v4, s7 @@ -279,7 +275,7 @@ ; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s17 ; SI-NEXT: v_bfi_b32 v4, s16, v12, v4 ; SI-NEXT: s_andn2_b64 s[6:7], s[4:5], s[0:1] -; SI-NEXT: s_and_b32 s0, s5, s20 +; SI-NEXT: s_and_b32 s0, s5, 0x80000000 ; SI-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc ; SI-NEXT: v_mov_b32_e32 v2, 0 ; SI-NEXT: s_cmp_lt_i32 s17, 0 @@ -298,12 +294,12 @@ ; SI-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] ; SI-NEXT: s_bfe_u32 s0, s11, 0xb0014 ; SI-NEXT: v_add_f64 v[4:5], s[4:5], -v[0:1] -; SI-NEXT: s_add_i32 s6, s0, s18 +; SI-NEXT: s_add_i32 s6, s0, 0xfffffc01 ; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s6 ; SI-NEXT: v_mov_b32_e32 v6, s5 ; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5 ; SI-NEXT: s_andn2_b64 s[4:5], s[10:11], s[0:1] -; SI-NEXT: s_and_b32 s0, s11, s20 +; SI-NEXT: s_and_b32 s0, s11, 0x80000000 ; SI-NEXT: v_bfi_b32 v6, s16, v12, v6 ; SI-NEXT: s_cmp_lt_i32 s6, 0 ; SI-NEXT: v_cndmask_b32_e32 v9, 0, v6, vcc @@ -321,13 +317,13 @@ ; SI-NEXT: v_cndmask_b32_e64 v4, v4, v6, s[0:1] ; SI-NEXT: v_add_f64 v[6:7], s[10:11], -v[4:5] ; SI-NEXT: s_bfe_u32 s0, s9, 0xb0014 -; SI-NEXT: s_add_i32 s4, s0, s18 +; SI-NEXT: s_add_i32 s4, s0, 0xfffffc01 ; SI-NEXT: v_mov_b32_e32 v10, s11 ; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5 ; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s4 ; SI-NEXT: v_bfi_b32 v10, s16, v12, v10 ; SI-NEXT: s_andn2_b64 s[2:3], s[8:9], s[0:1] -; SI-NEXT: s_and_b32 s0, s9, s20 +; SI-NEXT: s_and_b32 s0, s9, 0x80000000 ; SI-NEXT: v_cndmask_b32_e32 v7, 0, v10, vcc ; SI-NEXT: v_mov_b32_e32 v6, 0 ; SI-NEXT: s_cmp_lt_i32 s4, 0 @@ -412,21 +408,20 @@ ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x19 ; SI-NEXT: s_mov_b32 s22, -1 -; SI-NEXT: s_movk_i32 s28, 0xfc01 ; SI-NEXT: s_mov_b32 s21, 0xfffff ; SI-NEXT: s_mov_b32 s20, s22 +; SI-NEXT: v_mov_b32_e32 v8, 0x3ff00000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bfe_u32 s2, s7, 0xb0014 -; SI-NEXT: s_add_i32 s23, s2, s28 -; SI-NEXT: s_lshr_b64 s[2:3], s[20:21], s23 -; SI-NEXT: s_brev_b32 s29, 1 +; SI-NEXT: s_add_i32 s26, s2, 0xfffffc01 +; SI-NEXT: s_lshr_b64 s[2:3], s[20:21], s26 +; SI-NEXT: s_and_b32 s23, s7, 0x80000000 ; SI-NEXT: s_andn2_b64 s[24:25], s[6:7], s[2:3] -; SI-NEXT: s_and_b32 s2, s7, s29 -; SI-NEXT: s_cmp_lt_i32 s23, 0 +; SI-NEXT: s_cmp_lt_i32 s26, 0 ; SI-NEXT: v_mov_b32_e32 v0, s25 -; SI-NEXT: v_mov_b32_e32 v1, s2 +; SI-NEXT: v_mov_b32_e32 v1, s23 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_gt_i32 s23, 51 +; SI-NEXT: s_cmp_gt_i32 s26, 51 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; SI-NEXT: v_mov_b32_e32 v1, s7 ; SI-NEXT: s_cselect_b64 s[2:3], -1, 0 @@ -437,15 +432,14 @@ ; SI-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[2:3] ; SI-NEXT: v_add_f64 v[2:3], s[6:7], -v[0:1] ; SI-NEXT: s_bfe_u32 s2, s5, 0xb0014 -; SI-NEXT: s_add_i32 s24, s2, s28 +; SI-NEXT: s_add_i32 s24, s2, 0xfffffc01 ; SI-NEXT: s_brev_b32 s23, -2 -; SI-NEXT: v_mov_b32_e32 v8, 0x3ff00000 ; SI-NEXT: v_mov_b32_e32 v4, s7 ; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5 ; SI-NEXT: s_lshr_b64 s[2:3], s[20:21], s24 ; SI-NEXT: v_bfi_b32 v4, s23, v8, v4 ; SI-NEXT: s_andn2_b64 s[6:7], s[4:5], s[2:3] -; SI-NEXT: s_and_b32 s2, s5, s29 +; SI-NEXT: s_and_b32 s2, s5, 0x80000000 ; SI-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc ; SI-NEXT: v_mov_b32_e32 v2, 0 ; SI-NEXT: s_cmp_lt_i32 s24, 0 @@ -464,13 +458,13 @@ ; SI-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[2:3] ; SI-NEXT: v_add_f64 v[4:5], s[4:5], -v[0:1] ; SI-NEXT: s_bfe_u32 s2, s11, 0xb0014 -; SI-NEXT: s_add_i32 s6, s2, s28 +; SI-NEXT: s_add_i32 s6, s2, 0xfffffc01 ; SI-NEXT: v_mov_b32_e32 v6, s5 ; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5 ; SI-NEXT: s_lshr_b64 s[2:3], s[20:21], s6 ; SI-NEXT: v_bfi_b32 v6, s23, v8, v6 ; SI-NEXT: s_andn2_b64 s[4:5], s[10:11], s[2:3] -; SI-NEXT: s_and_b32 s2, s11, s29 +; SI-NEXT: s_and_b32 s2, s11, 0x80000000 ; SI-NEXT: v_cndmask_b32_e32 v5, 0, v6, vcc ; SI-NEXT: v_mov_b32_e32 v4, 0 ; SI-NEXT: s_cmp_lt_i32 s6, 0 @@ -489,13 +483,13 @@ ; SI-NEXT: v_cndmask_b32_e64 v4, v4, v6, s[2:3] ; SI-NEXT: v_add_f64 v[6:7], s[10:11], -v[4:5] ; SI-NEXT: s_bfe_u32 s2, s9, 0xb0014 -; SI-NEXT: s_add_i32 s6, s2, s28 +; SI-NEXT: s_add_i32 s6, s2, 0xfffffc01 ; SI-NEXT: v_mov_b32_e32 v9, s11 ; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5 ; SI-NEXT: s_lshr_b64 s[2:3], s[20:21], s6 ; SI-NEXT: v_bfi_b32 v9, s23, v8, v9 ; SI-NEXT: s_andn2_b64 s[4:5], s[8:9], s[2:3] -; SI-NEXT: s_and_b32 s2, s9, s29 +; SI-NEXT: s_and_b32 s2, s9, 0x80000000 ; SI-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc ; SI-NEXT: v_mov_b32_e32 v6, 0 ; SI-NEXT: s_cmp_lt_i32 s6, 0 @@ -514,12 +508,12 @@ ; SI-NEXT: v_cndmask_b32_e64 v4, v4, v9, s[2:3] ; SI-NEXT: s_bfe_u32 s2, s15, 0xb0014 ; SI-NEXT: v_add_f64 v[9:10], s[8:9], -v[4:5] -; SI-NEXT: s_add_i32 s4, s2, s28 +; SI-NEXT: s_add_i32 s4, s2, 0xfffffc01 ; SI-NEXT: s_lshr_b64 s[2:3], s[20:21], s4 ; SI-NEXT: v_mov_b32_e32 v11, s9 ; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[9:10]|, 0.5 ; SI-NEXT: s_andn2_b64 s[24:25], s[14:15], s[2:3] -; SI-NEXT: s_and_b32 s2, s15, s29 +; SI-NEXT: s_and_b32 s2, s15, 0x80000000 ; SI-NEXT: v_bfi_b32 v11, s23, v8, v11 ; SI-NEXT: s_cmp_lt_i32 s4, 0 ; SI-NEXT: v_cndmask_b32_e32 v10, 0, v11, vcc @@ -530,10 +524,10 @@ ; SI-NEXT: v_mov_b32_e32 v10, s2 ; SI-NEXT: s_cselect_b64 s[2:3], -1, 0 ; SI-NEXT: s_bfe_u32 s4, s13, 0xb0014 -; SI-NEXT: s_add_i32 s6, s4, s28 +; SI-NEXT: s_add_i32 s6, s4, 0xfffffc01 ; SI-NEXT: s_lshr_b64 s[4:5], s[20:21], s6 ; SI-NEXT: s_andn2_b64 s[26:27], s[12:13], s[4:5] -; SI-NEXT: s_and_b32 s4, s13, s29 +; SI-NEXT: s_and_b32 s4, s13, 0x80000000 ; SI-NEXT: v_mov_b32_e32 v9, s25 ; SI-NEXT: s_cmp_lt_i32 s6, 0 ; SI-NEXT: v_cndmask_b32_e32 v15, v9, v10, vcc @@ -542,30 +536,30 @@ ; SI-NEXT: s_cmp_gt_i32 s6, 51 ; SI-NEXT: s_cselect_b64 s[6:7], -1, 0 ; SI-NEXT: s_bfe_u32 s8, s19, 0xb0014 -; SI-NEXT: s_add_i32 s25, s8, s28 -; SI-NEXT: s_lshr_b64 s[8:9], s[20:21], s25 -; SI-NEXT: s_andn2_b64 s[10:11], s[18:19], s[8:9] -; SI-NEXT: s_and_b32 s8, s19, s29 +; SI-NEXT: s_add_i32 s10, s8, 0xfffffc01 +; SI-NEXT: s_lshr_b64 s[8:9], s[20:21], s10 +; SI-NEXT: s_andn2_b64 s[28:29], s[18:19], s[8:9] +; SI-NEXT: s_and_b32 s8, s19, 0x80000000 ; SI-NEXT: v_mov_b32_e32 v9, s27 -; SI-NEXT: s_cmp_lt_i32 s25, 0 +; SI-NEXT: s_cmp_lt_i32 s10, 0 ; SI-NEXT: v_cndmask_b32_e64 v17, v9, v10, s[4:5] -; SI-NEXT: v_mov_b32_e32 v9, s11 +; SI-NEXT: v_mov_b32_e32 v9, s29 ; SI-NEXT: v_mov_b32_e32 v10, s8 ; SI-NEXT: s_cselect_b64 s[8:9], -1, 0 -; SI-NEXT: s_cmp_gt_i32 s25, 51 +; SI-NEXT: s_cmp_gt_i32 s10, 51 ; SI-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[8:9] ; SI-NEXT: v_mov_b32_e32 v10, s19 -; SI-NEXT: v_mov_b32_e32 v11, s10 ; SI-NEXT: s_cselect_b64 s[10:11], -1, 0 ; SI-NEXT: v_cndmask_b32_e64 v10, v9, v10, s[10:11] -; SI-NEXT: v_cndmask_b32_e64 v9, v11, 0, s[8:9] +; SI-NEXT: v_mov_b32_e32 v9, s28 +; SI-NEXT: v_cndmask_b32_e64 v9, v9, 0, s[8:9] ; SI-NEXT: v_mov_b32_e32 v11, s18 ; SI-NEXT: s_bfe_u32 s8, s17, 0xb0014 ; SI-NEXT: v_cndmask_b32_e64 v9, v9, v11, s[10:11] -; SI-NEXT: s_add_i32 s10, s8, s28 +; SI-NEXT: s_add_i32 s10, s8, 0xfffffc01 ; SI-NEXT: s_lshr_b64 s[8:9], s[20:21], s10 ; SI-NEXT: s_andn2_b64 s[20:21], s[16:17], s[8:9] -; SI-NEXT: s_and_b32 s8, s17, s29 +; SI-NEXT: s_and_b32 s8, s17, 0x80000000 ; SI-NEXT: s_cmp_lt_i32 s10, 0 ; SI-NEXT: v_mov_b32_e32 v11, s21 ; SI-NEXT: v_mov_b32_e32 v12, s8 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll --- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll @@ -1027,31 +1027,29 @@ ; GCN-NOHSA-SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, 0xffff ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s7, s4, 16 -; GCN-NOHSA-SI-NEXT: s_and_b32 s5, s5, s6 -; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, s6 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s6, s4, 16 +; GCN-NOHSA-SI-NEXT: s_and_b32 s5, s5, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, 0xffff ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s5 ; GCN-NOHSA-SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s7 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s6 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: constant_zextload_v3i16_to_v3i32: ; GCN-HSA: ; %bb.0: ; %entry ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-HSA-NEXT: s_mov_b32 s4, 0xffff ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s1 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_lshr_b32 s0, s2, 16 -; GCN-HSA-NEXT: s_and_b32 s1, s3, s4 -; GCN-HSA-NEXT: s_and_b32 s2, s2, s4 +; GCN-HSA-NEXT: s_and_b32 s1, s3, 0xffff +; GCN-HSA-NEXT: s_and_b32 s2, s2, 0xffff ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s1 @@ -1061,21 +1059,18 @@ ; GCN-NOHSA-VI-LABEL: constant_zextload_v3i16_to_v3i32: ; GCN-NOHSA-VI: ; %bb.0: ; %entry ; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, 0xffff -; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1 +; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_and_b32 s0, s3, s8 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s1, s2, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s2, s2, s8 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s0 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0 +; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s4, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xffff +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s6 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: constant_zextload_v3i16_to_v3i32: @@ -1202,12 +1197,11 @@ ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, 0xffff ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s6, s5, 16 ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s7, s4, 16 -; GCN-NOHSA-SI-NEXT: s_and_b32 s5, s5, s2 -; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, s2 +; GCN-NOHSA-SI-NEXT: s_and_b32 s5, s5, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, 0xffff ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s7 @@ -1219,7 +1213,6 @@ ; GCN-HSA-LABEL: constant_zextload_v4i16_to_v4i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-HSA-NEXT: s_mov_b32 s4, 0xffff ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 @@ -1227,8 +1220,8 @@ ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_lshr_b32 s0, s3, 16 ; GCN-HSA-NEXT: s_lshr_b32 s1, s2, 16 -; GCN-HSA-NEXT: s_and_b32 s3, s3, s4 -; GCN-HSA-NEXT: s_and_b32 s2, s2, s4 +; GCN-HSA-NEXT: s_and_b32 s3, s3, 0xffff +; GCN-HSA-NEXT: s_and_b32 s2, s2, 0xffff ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s3 @@ -1239,23 +1232,20 @@ ; GCN-NOHSA-VI-LABEL: constant_zextload_v4i16_to_v4i32: ; GCN-NOHSA-VI: ; %bb.0: ; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, 0xffff -; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1 +; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s0, s3, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s1, s3, s8 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s3, s2, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s2, s2, s8 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s1 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s0 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s5, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s7, s4, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xffff +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s7 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s6 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: constant_zextload_v4i16_to_v4i32: @@ -1396,26 +1386,25 @@ ; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, 0xffff ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s9, s5, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s10, s4, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s11, s7, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s12, s6, 16 -; GCN-NOHSA-SI-NEXT: s_and_b32 s5, s5, s8 -; GCN-NOHSA-SI-NEXT: s_and_b32 s7, s7, s8 -; GCN-NOHSA-SI-NEXT: s_and_b32 s6, s6, s8 -; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, s8 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s8, s5, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s9, s4, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s10, s7, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s11, s6, 16 +; GCN-NOHSA-SI-NEXT: s_and_b32 s5, s5, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s7, s7, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s6, s6, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, 0xffff ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s12 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s11 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s7 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s11 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s10 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s10 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s9 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s5 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s9 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s8 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; @@ -1424,22 +1413,21 @@ ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 -; GCN-HSA-NEXT: s_mov_b32 s2, 0xffff ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_lshr_b32 s8, s5, 16 ; GCN-HSA-NEXT: s_lshr_b32 s9, s4, 16 -; GCN-HSA-NEXT: s_lshr_b32 s3, s7, 16 -; GCN-HSA-NEXT: s_lshr_b32 s10, s6, 16 -; GCN-HSA-NEXT: s_and_b32 s5, s5, s2 -; GCN-HSA-NEXT: s_and_b32 s4, s4, s2 -; GCN-HSA-NEXT: s_and_b32 s7, s7, s2 -; GCN-HSA-NEXT: s_and_b32 s2, s6, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: s_lshr_b32 s2, s7, 16 +; GCN-HSA-NEXT: s_lshr_b32 s3, s6, 16 +; GCN-HSA-NEXT: s_and_b32 s5, s5, 0xffff +; GCN-HSA-NEXT: s_and_b32 s4, s4, 0xffff +; GCN-HSA-NEXT: s_and_b32 s7, s7, 0xffff +; GCN-HSA-NEXT: s_and_b32 s6, s6, 0xffff +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -1455,33 +1443,30 @@ ; GCN-NOHSA-VI-LABEL: constant_zextload_v8i16_to_v8i32: ; GCN-NOHSA-VI: ; %bb.0: ; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, 0xffff -; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s0, s9, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s1, s9, s2 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s3, s8, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s8, s8, s2 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s9, s11, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s11, s11, s2 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s12, s10, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s2, s10, s2 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s12 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s11 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s9 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s7, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s7, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s11, s6, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s5, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s9, s4, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xffff +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s11 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s10 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s1 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s0 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s9 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s8 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: constant_zextload_v8i16_to_v8i32: @@ -1662,46 +1647,45 @@ ; GCN-NOHSA-SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s12, 0xffff ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s13, s5, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s4, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s15, s7, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s16, s6, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s17, s9, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s8, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s19, s11, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s10, 16 -; GCN-NOHSA-SI-NEXT: s_and_b32 s5, s5, s12 -; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, s12 -; GCN-NOHSA-SI-NEXT: s_and_b32 s7, s7, s12 -; GCN-NOHSA-SI-NEXT: s_and_b32 s6, s6, s12 -; GCN-NOHSA-SI-NEXT: s_and_b32 s9, s9, s12 -; GCN-NOHSA-SI-NEXT: s_and_b32 s11, s11, s12 -; GCN-NOHSA-SI-NEXT: s_and_b32 s10, s10, s12 -; GCN-NOHSA-SI-NEXT: s_and_b32 s8, s8, s12 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s12, s5, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s13, s4, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s7, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s15, s6, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s16, s9, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s17, s8, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s11, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s19, s10, 16 +; GCN-NOHSA-SI-NEXT: s_and_b32 s5, s5, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s7, s7, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s6, s6, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s9, s9, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s11, s11, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s10, s10, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s8, s8, 0xffff ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s10 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s20 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s19 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s11 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s19 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s18 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s18 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s17 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s17 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s16 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s15 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s7 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s15 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s14 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s14 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s13 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s5 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s13 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s12 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; @@ -1710,7 +1694,6 @@ ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 -; GCN-HSA-NEXT: s_mov_b32 s2, 0xffff ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_lshr_b32 s12, s5, 16 ; GCN-HSA-NEXT: s_lshr_b32 s13, s4, 16 @@ -1718,24 +1701,24 @@ ; GCN-HSA-NEXT: s_lshr_b32 s15, s6, 16 ; GCN-HSA-NEXT: s_lshr_b32 s16, s9, 16 ; GCN-HSA-NEXT: s_lshr_b32 s17, s8, 16 -; GCN-HSA-NEXT: s_lshr_b32 s3, s11, 16 -; GCN-HSA-NEXT: s_lshr_b32 s18, s10, 16 -; GCN-HSA-NEXT: s_and_b32 s5, s5, s2 -; GCN-HSA-NEXT: s_and_b32 s4, s4, s2 -; GCN-HSA-NEXT: s_and_b32 s7, s7, s2 -; GCN-HSA-NEXT: s_and_b32 s6, s6, s2 -; GCN-HSA-NEXT: s_and_b32 s9, s9, s2 -; GCN-HSA-NEXT: s_and_b32 s8, s8, s2 -; GCN-HSA-NEXT: s_and_b32 s11, s11, s2 -; GCN-HSA-NEXT: s_and_b32 s2, s10, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: s_lshr_b32 s2, s11, 16 +; GCN-HSA-NEXT: s_lshr_b32 s3, s10, 16 +; GCN-HSA-NEXT: s_and_b32 s5, s5, 0xffff +; GCN-HSA-NEXT: s_and_b32 s4, s4, 0xffff +; GCN-HSA-NEXT: s_and_b32 s7, s7, 0xffff +; GCN-HSA-NEXT: s_and_b32 s6, s6, 0xffff +; GCN-HSA-NEXT: s_and_b32 s9, s9, 0xffff +; GCN-HSA-NEXT: s_and_b32 s8, s8, 0xffff +; GCN-HSA-NEXT: s_and_b32 s11, s11, 0xffff +; GCN-HSA-NEXT: s_and_b32 s10, s10, 0xffff +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s18 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s11 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -1766,46 +1749,43 @@ ; ; GCN-NOHSA-VI-LABEL: constant_zextload_v16i16_to_v16i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NOHSA-VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_load_dwordx8 s[4:11], s[14:15], 0x0 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s14, 0xffff -; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s12 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s13 -; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s19, s11, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s11, s11, s14 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s20, s10, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s10, s10, s14 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s17, s9, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s9, s9, s14 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s18, s8, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s8, s8, s14 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s18, s11, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s11, s11, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s19, s10, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s10, s10, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s16, s9, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s9, s9, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s17, s8, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s8, s8, 0xffff ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s20 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s19 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s11 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s19 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s15, s7, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s7, s14 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s16, s6, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, s14 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s18 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s7, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s7, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s15, s6, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, 0xffff ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s12, s5, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s18 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s17 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s9 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s17 -; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, s14 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, 0xffff ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s13, s4, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, s14 +; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xffff ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GCN-NOHSA-VI-NEXT: s_nop 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s15 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s15 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s14 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-VI-NEXT: s_nop 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4 @@ -2097,282 +2077,277 @@ ; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s18, 0xffff ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s19, s1, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s0, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s21, s3, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s2, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s23, s5, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s4, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s25, s7, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s6, 16 -; GCN-NOHSA-SI-NEXT: s_and_b32 s27, s1, s18 -; GCN-NOHSA-SI-NEXT: s_and_b32 s28, s0, s18 -; GCN-NOHSA-SI-NEXT: s_and_b32 s29, s3, s18 -; GCN-NOHSA-SI-NEXT: s_and_b32 s30, s2, s18 -; GCN-NOHSA-SI-NEXT: s_and_b32 s5, s5, s18 -; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, s18 -; GCN-NOHSA-SI-NEXT: s_and_b32 s7, s7, s18 -; GCN-NOHSA-SI-NEXT: s_and_b32 s6, s6, s18 -; GCN-NOHSA-SI-NEXT: s_and_b32 s31, s9, s18 -; GCN-NOHSA-SI-NEXT: s_and_b32 s33, s8, s18 -; GCN-NOHSA-SI-NEXT: s_and_b32 s34, s11, s18 -; GCN-NOHSA-SI-NEXT: s_and_b32 s35, s10, s18 -; GCN-NOHSA-SI-NEXT: s_and_b32 s36, s13, s18 -; GCN-NOHSA-SI-NEXT: s_and_b32 s37, s12, s18 -; GCN-NOHSA-SI-NEXT: s_and_b32 s38, s15, s18 -; GCN-NOHSA-SI-NEXT: s_and_b32 s18, s14, s18 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s9, s9, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s8, s8, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s11, s11, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s10, s10, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s13, s13, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s12, s12, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s15, s15, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s14, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s1, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s19, s0, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s3, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s21, s2, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s5, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s23, s4, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s7, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s25, s6, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s9, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s27, s8, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s11, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s29, s10, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s13, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s31, s12, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s33, s15, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s34, s14, 16 +; GCN-NOHSA-SI-NEXT: s_and_b32 s35, s1, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s36, s0, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s37, s3, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s38, s2, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s5, s5, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s7, s7, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s6, s6, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s9, s9, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s8, s8, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s11, s11, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s10, s10, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s13, s13, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s12, s12, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s15, s15, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s14, s14, 0xffff ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s16 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s17 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s18 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s14 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s38 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s15 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s14 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s34 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s15 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s33 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s37 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s12 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s36 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s13 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s12 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s31 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s13 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s30 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s35 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s10 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s34 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s11 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s10 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s29 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s11 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s28 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s33 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s31 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s9 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s27 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s9 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s26 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s26 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s25 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s7 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s25 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s24 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s23 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s5 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s23 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s22 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s30 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s22 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s29 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s21 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s38 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s21 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s37 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s20 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s28 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s20 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s27 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s19 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s36 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s19 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s35 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s18 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: constant_zextload_v32i16_to_v32i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 -; GCN-HSA-NEXT: s_mov_b32 s18, 0xffff +; GCN-HSA-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_and_b32 s19, s1, s18 -; GCN-HSA-NEXT: s_and_b32 s20, s0, s18 -; GCN-HSA-NEXT: s_and_b32 s21, s3, s18 -; GCN-HSA-NEXT: s_and_b32 s22, s2, s18 -; GCN-HSA-NEXT: s_and_b32 s23, s5, s18 -; GCN-HSA-NEXT: s_and_b32 s24, s4, s18 -; GCN-HSA-NEXT: s_and_b32 s25, s7, s18 -; GCN-HSA-NEXT: s_and_b32 s26, s6, s18 -; GCN-HSA-NEXT: s_and_b32 s27, s9, s18 -; GCN-HSA-NEXT: s_and_b32 s28, s8, s18 -; GCN-HSA-NEXT: s_and_b32 s29, s11, s18 -; GCN-HSA-NEXT: s_and_b32 s30, s10, s18 -; GCN-HSA-NEXT: s_and_b32 s31, s13, s18 -; GCN-HSA-NEXT: s_and_b32 s33, s12, s18 -; GCN-HSA-NEXT: s_and_b32 s34, s15, s18 -; GCN-HSA-NEXT: s_and_b32 s18, s14, s18 -; GCN-HSA-NEXT: s_lshr_b32 s35, s1, 16 -; GCN-HSA-NEXT: s_lshr_b32 s36, s0, 16 -; GCN-HSA-NEXT: s_lshr_b32 s3, s3, 16 -; GCN-HSA-NEXT: s_lshr_b32 s2, s2, 16 -; GCN-HSA-NEXT: s_lshr_b32 s5, s5, 16 -; GCN-HSA-NEXT: s_lshr_b32 s4, s4, 16 -; GCN-HSA-NEXT: s_lshr_b32 s7, s7, 16 -; GCN-HSA-NEXT: s_lshr_b32 s6, s6, 16 -; GCN-HSA-NEXT: s_lshr_b32 s9, s9, 16 -; GCN-HSA-NEXT: s_lshr_b32 s8, s8, 16 -; GCN-HSA-NEXT: s_lshr_b32 s11, s11, 16 -; GCN-HSA-NEXT: s_lshr_b32 s10, s10, 16 -; GCN-HSA-NEXT: s_lshr_b32 s13, s13, 16 -; GCN-HSA-NEXT: s_lshr_b32 s12, s12, 16 -; GCN-HSA-NEXT: s_lshr_b32 s15, s15, 16 -; GCN-HSA-NEXT: s_lshr_b32 s14, s14, 16 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x70 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x60 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x50 +; GCN-HSA-NEXT: s_lshr_b32 s20, s5, 16 +; GCN-HSA-NEXT: s_lshr_b32 s21, s4, 16 +; GCN-HSA-NEXT: s_lshr_b32 s22, s7, 16 +; GCN-HSA-NEXT: s_lshr_b32 s23, s6, 16 +; GCN-HSA-NEXT: s_lshr_b32 s24, s9, 16 +; GCN-HSA-NEXT: s_lshr_b32 s25, s8, 16 +; GCN-HSA-NEXT: s_lshr_b32 s26, s11, 16 +; GCN-HSA-NEXT: s_lshr_b32 s27, s10, 16 +; GCN-HSA-NEXT: s_lshr_b32 s28, s13, 16 +; GCN-HSA-NEXT: s_lshr_b32 s29, s12, 16 +; GCN-HSA-NEXT: s_lshr_b32 s30, s15, 16 +; GCN-HSA-NEXT: s_lshr_b32 s31, s14, 16 +; GCN-HSA-NEXT: s_lshr_b32 s33, s17, 16 +; GCN-HSA-NEXT: s_lshr_b32 s34, s16, 16 +; GCN-HSA-NEXT: s_lshr_b32 s35, s19, 16 +; GCN-HSA-NEXT: s_lshr_b32 s36, s18, 16 +; GCN-HSA-NEXT: s_and_b32 s5, s5, 0xffff +; GCN-HSA-NEXT: s_and_b32 s4, s4, 0xffff +; GCN-HSA-NEXT: s_and_b32 s7, s7, 0xffff +; GCN-HSA-NEXT: s_and_b32 s6, s6, 0xffff +; GCN-HSA-NEXT: s_and_b32 s9, s9, 0xffff +; GCN-HSA-NEXT: s_and_b32 s8, s8, 0xffff +; GCN-HSA-NEXT: s_and_b32 s11, s11, 0xffff +; GCN-HSA-NEXT: s_and_b32 s10, s10, 0xffff +; GCN-HSA-NEXT: s_and_b32 s13, s13, 0xffff +; GCN-HSA-NEXT: s_and_b32 s12, s12, 0xffff +; GCN-HSA-NEXT: s_and_b32 s15, s15, 0xffff +; GCN-HSA-NEXT: s_and_b32 s14, s14, 0xffff +; GCN-HSA-NEXT: s_and_b32 s17, s17, 0xffff +; GCN-HSA-NEXT: s_and_b32 s16, s16, 0xffff +; GCN-HSA-NEXT: s_and_b32 s19, s19, 0xffff +; GCN-HSA-NEXT: s_and_b32 s18, s18, 0xffff +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s18 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s14 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s34 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s15 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s33 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s12 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s31 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s36 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s19 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s35 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s34 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s17 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s33 ; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7] -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s30 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 64 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s10 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s29 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s11 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s31 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s15 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s30 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 48 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s28 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s8 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s27 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s9 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s12 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s29 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s28 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 32 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s26 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s6 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s25 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s7 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s27 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s11 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s26 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s24 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s23 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s5 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s25 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s9 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s24 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s23 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s22 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s22 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s21 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s21 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s20 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s20 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s36 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s19 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s35 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s17 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: constant_zextload_v32i16_to_v32i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[20:23], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s19, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s18, -1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[0:15], s[22:23], 0x0 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s22, 0xffff -; GCN-NOHSA-VI-NEXT: s_mov_b32 s16, s20 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s17, s21 -; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s36, s15, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s15, s15, s22 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s37, s14, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s14, s14, s22 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s34, s13, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s13, s13, s22 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s35, s12, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s12, s12, s22 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s35, s15, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s15, s15, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s36, s14, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s14, s14, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s33, s13, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s13, s13, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s34, s12, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s12, s12, 0xffff ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s14 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s37 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s36 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s15 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s36 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s31, s11, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s11, s11, s22 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s33, s10, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s10, s10, s22 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s35 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s30, s11, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s11, s11, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s31, s10, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s10, s10, 0xffff ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:112 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s29, s9, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s28, s9, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s12 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s35 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s34 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s13 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s34 -; GCN-NOHSA-VI-NEXT: s_and_b32 s9, s9, s22 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s30, s8, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s8, s8, s22 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s33 +; GCN-NOHSA-VI-NEXT: s_and_b32 s9, s9, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s29, s8, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s8, s8, 0xffff ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:96 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s27, s7, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s26, s7, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s33 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s31 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s11 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s31 -; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s7, s22 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s28, s6, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, s22 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s30 +; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s7, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s27, s6, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, 0xffff ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:80 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s25, s5, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s24, s5, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s30 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s29 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s9 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s29 -; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, s22 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s26, s4, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, s22 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s28 +; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s25, s4, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xffff ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:64 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s23, s3, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s22, s3, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s28 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s27 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s27 -; GCN-NOHSA-VI-NEXT: s_and_b32 s3, s3, s22 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s24, s2, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s2, s2, s22 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s26 +; GCN-NOHSA-VI-NEXT: s_and_b32 s3, s3, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s23, s2, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s2, s2, 0xffff ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:48 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s20, s1, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s26 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s25 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s25 -; GCN-NOHSA-VI-NEXT: s_and_b32 s1, s1, s22 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s24 +; GCN-NOHSA-VI-NEXT: s_and_b32 s1, s1, 0xffff ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s21, s0, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s0, s0, s22 +; GCN-NOHSA-VI-NEXT: s_and_b32 s0, s0, 0xffff ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:32 ; GCN-NOHSA-VI-NEXT: s_nop 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s24 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s23 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s3 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s23 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s22 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:16 ; GCN-NOHSA-VI-NEXT: s_nop 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0 @@ -2884,101 +2859,100 @@ ; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s20, 0xffff ; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[36:51], s[18:19], 0x10 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s1, 16 ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s19, s0, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s21, s3, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s2, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s23, s5, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s4, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s25, s7, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s6, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s27, s9, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s8, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s29, s11, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s10, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s31, s13, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s33, s12, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s34, s15, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s35, s14, 16 -; GCN-NOHSA-SI-NEXT: s_and_b32 s52, s1, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s53, s0, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s54, s3, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s55, s2, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s5, s5, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s7, s7, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s6, s6, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s9, s9, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s8, s8, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s11, s11, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s10, s10, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s13, s13, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s12, s12, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s15, s15, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s14, s14, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s56, s37, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s57, s36, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s58, s39, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s59, s38, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s60, s41, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s61, s40, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s62, s43, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s63, s42, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s64, s45, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s65, s44, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s66, s47, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s67, s46, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s68, s49, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s69, s48, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s70, s51, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s20, s50, s20 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s37, s37, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s36, s36, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s39, s39, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s38, s38, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s41, s41, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s40, s40, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s42, s42, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s45, s45, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s44, s44, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s47, s47, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s46, s46, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s49, s49, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s48, s48, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s51, s51, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s50, s50, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s43, s43, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s3, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s21, s2, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s5, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s23, s4, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s7, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s25, s6, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s9, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s27, s8, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s11, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s29, s10, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s13, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s31, s12, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s33, s15, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s34, s14, 16 +; GCN-NOHSA-SI-NEXT: s_and_b32 s35, s1, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s52, s0, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s53, s3, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s54, s2, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s5, s5, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s7, s7, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s6, s6, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s9, s9, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s8, s8, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s11, s11, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s10, s10, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s13, s13, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s12, s12, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s15, s15, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s14, s14, 0xffff +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s55, s37, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s56, s36, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s57, s39, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s58, s38, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s59, s41, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s60, s40, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s61, s43, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s62, s42, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s63, s45, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s64, s44, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s65, s47, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s66, s46, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s67, s49, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s68, s48, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s69, s51, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s70, s50, 16 +; GCN-NOHSA-SI-NEXT: s_and_b32 s37, s37, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s36, s36, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s39, s39, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s38, s38, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s40, s40, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s43, s43, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s42, s42, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s45, s45, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s44, s44, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s47, s47, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s46, s46, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s49, s49, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s48, s48, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s51, s51, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s50, s50, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s41, s41, 0xffff ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s16 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s17 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s20 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s50 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s70 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s51 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s69 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s48 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s68 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s49 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s67 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s46 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s66 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s47 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s65 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s44 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s64 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s45 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s63 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s42 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s62 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, s61 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s43 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, s40 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, s60 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, s41 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s50 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s70 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s51 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s69 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s48 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s68 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s49 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s67 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s46 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s66 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s47 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s65 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s44 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s64 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s45 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s63 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s42 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s62 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s43 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, s40 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s61 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, s60 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, s41 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, s59 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208 @@ -2986,63 +2960,63 @@ ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:176 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:160 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(5) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s59 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s38 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s58 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s39 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s38 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s58 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s39 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s57 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s57 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s36 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s56 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s37 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s36 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s56 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s37 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s55 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s14 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s35 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s34 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s15 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s34 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s33 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s12 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s33 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s31 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s13 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s31 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s30 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s10 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s30 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s29 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s11 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s29 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s28 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s28 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s27 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s27 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s26 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s26 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s25 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s7 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s25 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s24 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s23 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s5 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s23 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s22 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s55 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s22 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s54 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s21 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s54 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s21 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s53 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s20 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s53 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s52 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s19 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s52 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s35 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s18 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm @@ -3050,7 +3024,6 @@ ; GCN-HSA-LABEL: constant_zextload_v64i16_to_v64i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x0 -; GCN-HSA-NEXT: s_mov_b32 s53, 0xffff ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-HSA-NEXT: s_load_dwordx16 s[36:51], s[18:19], 0x10 @@ -3071,54 +3044,54 @@ ; GCN-HSA-NEXT: s_lshr_b32 s34, s12, 16 ; GCN-HSA-NEXT: s_lshr_b32 s35, s15, 16 ; GCN-HSA-NEXT: s_lshr_b32 s52, s14, 16 -; GCN-HSA-NEXT: s_and_b32 s1, s1, s53 -; GCN-HSA-NEXT: s_and_b32 s0, s0, s53 -; GCN-HSA-NEXT: s_and_b32 s3, s3, s53 -; GCN-HSA-NEXT: s_and_b32 s2, s2, s53 -; GCN-HSA-NEXT: s_and_b32 s5, s5, s53 -; GCN-HSA-NEXT: s_and_b32 s4, s4, s53 -; GCN-HSA-NEXT: s_and_b32 s54, s7, s53 -; GCN-HSA-NEXT: s_and_b32 s55, s6, s53 -; GCN-HSA-NEXT: s_and_b32 s9, s9, s53 -; GCN-HSA-NEXT: s_and_b32 s8, s8, s53 -; GCN-HSA-NEXT: s_and_b32 s11, s11, s53 -; GCN-HSA-NEXT: s_and_b32 s10, s10, s53 -; GCN-HSA-NEXT: s_and_b32 s13, s13, s53 -; GCN-HSA-NEXT: s_and_b32 s12, s12, s53 -; GCN-HSA-NEXT: s_and_b32 s15, s15, s53 -; GCN-HSA-NEXT: s_and_b32 s14, s14, s53 -; GCN-HSA-NEXT: s_and_b32 s18, s37, s53 -; GCN-HSA-NEXT: s_and_b32 s19, s36, s53 -; GCN-HSA-NEXT: s_and_b32 s56, s39, s53 -; GCN-HSA-NEXT: s_and_b32 s57, s38, s53 -; GCN-HSA-NEXT: s_and_b32 s58, s41, s53 -; GCN-HSA-NEXT: s_and_b32 s59, s40, s53 -; GCN-HSA-NEXT: s_and_b32 s60, s43, s53 -; GCN-HSA-NEXT: s_and_b32 s61, s42, s53 -; GCN-HSA-NEXT: s_and_b32 s62, s45, s53 -; GCN-HSA-NEXT: s_and_b32 s63, s44, s53 -; GCN-HSA-NEXT: s_and_b32 s64, s47, s53 -; GCN-HSA-NEXT: s_and_b32 s65, s46, s53 -; GCN-HSA-NEXT: s_and_b32 s66, s49, s53 -; GCN-HSA-NEXT: s_and_b32 s67, s48, s53 -; GCN-HSA-NEXT: s_and_b32 s68, s51, s53 -; GCN-HSA-NEXT: s_and_b32 s53, s50, s53 -; GCN-HSA-NEXT: s_lshr_b32 s37, s37, 16 -; GCN-HSA-NEXT: s_lshr_b32 s36, s36, 16 -; GCN-HSA-NEXT: s_lshr_b32 s39, s39, 16 -; GCN-HSA-NEXT: s_lshr_b32 s38, s38, 16 -; GCN-HSA-NEXT: s_lshr_b32 s41, s41, 16 -; GCN-HSA-NEXT: s_lshr_b32 s40, s40, 16 -; GCN-HSA-NEXT: s_lshr_b32 s43, s43, 16 -; GCN-HSA-NEXT: s_lshr_b32 s42, s42, 16 -; GCN-HSA-NEXT: s_lshr_b32 s45, s45, 16 -; GCN-HSA-NEXT: s_lshr_b32 s44, s44, 16 -; GCN-HSA-NEXT: s_lshr_b32 s47, s47, 16 -; GCN-HSA-NEXT: s_lshr_b32 s46, s46, 16 -; GCN-HSA-NEXT: s_lshr_b32 s49, s49, 16 -; GCN-HSA-NEXT: s_lshr_b32 s48, s48, 16 -; GCN-HSA-NEXT: s_lshr_b32 s51, s51, 16 -; GCN-HSA-NEXT: s_lshr_b32 s50, s50, 16 +; GCN-HSA-NEXT: s_and_b32 s1, s1, 0xffff +; GCN-HSA-NEXT: s_and_b32 s0, s0, 0xffff +; GCN-HSA-NEXT: s_and_b32 s3, s3, 0xffff +; GCN-HSA-NEXT: s_and_b32 s2, s2, 0xffff +; GCN-HSA-NEXT: s_and_b32 s5, s5, 0xffff +; GCN-HSA-NEXT: s_and_b32 s4, s4, 0xffff +; GCN-HSA-NEXT: s_and_b32 s53, s7, 0xffff +; GCN-HSA-NEXT: s_and_b32 s54, s6, 0xffff +; GCN-HSA-NEXT: s_and_b32 s9, s9, 0xffff +; GCN-HSA-NEXT: s_and_b32 s8, s8, 0xffff +; GCN-HSA-NEXT: s_and_b32 s11, s11, 0xffff +; GCN-HSA-NEXT: s_and_b32 s10, s10, 0xffff +; GCN-HSA-NEXT: s_and_b32 s13, s13, 0xffff +; GCN-HSA-NEXT: s_and_b32 s12, s12, 0xffff +; GCN-HSA-NEXT: s_and_b32 s15, s15, 0xffff +; GCN-HSA-NEXT: s_and_b32 s14, s14, 0xffff +; GCN-HSA-NEXT: s_lshr_b32 s18, s37, 16 +; GCN-HSA-NEXT: s_lshr_b32 s19, s36, 16 +; GCN-HSA-NEXT: s_lshr_b32 s55, s39, 16 +; GCN-HSA-NEXT: s_lshr_b32 s56, s38, 16 +; GCN-HSA-NEXT: s_lshr_b32 s57, s41, 16 +; GCN-HSA-NEXT: s_lshr_b32 s58, s40, 16 +; GCN-HSA-NEXT: s_lshr_b32 s59, s43, 16 +; GCN-HSA-NEXT: s_lshr_b32 s60, s42, 16 +; GCN-HSA-NEXT: s_lshr_b32 s61, s45, 16 +; GCN-HSA-NEXT: s_lshr_b32 s62, s44, 16 +; GCN-HSA-NEXT: s_lshr_b32 s63, s47, 16 +; GCN-HSA-NEXT: s_lshr_b32 s64, s46, 16 +; GCN-HSA-NEXT: s_lshr_b32 s65, s49, 16 +; GCN-HSA-NEXT: s_lshr_b32 s66, s48, 16 +; GCN-HSA-NEXT: s_lshr_b32 s67, s51, 16 +; GCN-HSA-NEXT: s_lshr_b32 s68, s50, 16 +; GCN-HSA-NEXT: s_and_b32 s37, s37, 0xffff +; GCN-HSA-NEXT: s_and_b32 s36, s36, 0xffff +; GCN-HSA-NEXT: s_and_b32 s39, s39, 0xffff +; GCN-HSA-NEXT: s_and_b32 s38, s38, 0xffff +; GCN-HSA-NEXT: s_and_b32 s41, s41, 0xffff +; GCN-HSA-NEXT: s_and_b32 s40, s40, 0xffff +; GCN-HSA-NEXT: s_and_b32 s43, s43, 0xffff +; GCN-HSA-NEXT: s_and_b32 s42, s42, 0xffff +; GCN-HSA-NEXT: s_and_b32 s45, s45, 0xffff +; GCN-HSA-NEXT: s_and_b32 s44, s44, 0xffff +; GCN-HSA-NEXT: s_and_b32 s47, s47, 0xffff +; GCN-HSA-NEXT: s_and_b32 s46, s46, 0xffff +; GCN-HSA-NEXT: s_and_b32 s49, s49, 0xffff +; GCN-HSA-NEXT: s_and_b32 s48, s48, 0xffff +; GCN-HSA-NEXT: s_and_b32 s51, s51, 0xffff +; GCN-HSA-NEXT: s_and_b32 s50, s50, 0xffff ; GCN-HSA-NEXT: s_add_u32 s6, s16, 0xf0 ; GCN-HSA-NEXT: s_addc_u32 s7, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v22, s7 @@ -3144,10 +3117,10 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v33, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v32, s6 ; GCN-HSA-NEXT: s_add_u32 s6, s16, 0x90 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s67 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s48 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s66 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s49 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s48 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s66 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s49 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s65 ; GCN-HSA-NEXT: s_addc_u32 s7, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] ; GCN-HSA-NEXT: v_mov_b32_e32 v25, s7 @@ -3157,47 +3130,47 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v35, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v34, s6 ; GCN-HSA-NEXT: s_add_u32 s6, s16, 0x70 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s61 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s42 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s60 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s43 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s42 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s60 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s43 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s59 ; GCN-HSA-NEXT: s_addc_u32 s7, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19] -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s53 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s50 ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s6 ; GCN-HSA-NEXT: s_add_u32 s6, s16, 0x60 ; GCN-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s50 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s68 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s51 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s68 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s51 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s67 ; GCN-HSA-NEXT: v_mov_b32_e32 v19, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s65 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s46 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s64 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s47 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s63 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s44 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s62 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s45 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s59 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s46 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s64 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s47 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s63 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s44 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s62 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s45 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s61 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s40 ; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s40 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s57 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s58 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s41 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s38 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s56 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s19 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s39 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s36 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s58 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s38 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s41 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s57 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s56 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s39 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s36 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s55 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s19 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s6 ; GCN-HSA-NEXT: s_add_u32 s6, s16, 0x50 ; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s18 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s37 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s14 ; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15] -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s37 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s18 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s12 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s52 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s15 @@ -3230,9 +3203,9 @@ ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_addc_u32 s7, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s55 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s54 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s27 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s54 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s53 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s26 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s7 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -3268,160 +3241,147 @@ ; GCN-NOHSA-VI-LABEL: constant_zextload_v64i16_to_v64i32: ; GCN-NOHSA-VI: ; %bb.0: ; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[36:39], s[0:1], 0x24 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s40, 0xffff ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x0 ; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x40 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s33, s17, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s17, s17, s40 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s34, s16, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s16, s16, s40 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s35, s19, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s19, s19, s40 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s38, s18, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s18, s18, s40 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s39, s21, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s21, s21, s40 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s41, s20, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s20, s20, s40 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s42, s23, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s23, s23, s40 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s43, s22, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s22, s22, s40 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s44, s25, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s25, s25, s40 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s45, s24, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s24, s24, s40 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s46, s27, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s27, s27, s40 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s47, s26, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s26, s26, s40 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s48, s29, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s29, s29, s40 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s49, s28, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s28, s28, s40 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s50, s31, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s31, s31, s40 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s51, s30, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s30, s30, s40 -; GCN-NOHSA-VI-NEXT: s_and_b32 s53, s1, s40 -; GCN-NOHSA-VI-NEXT: s_and_b32 s55, s0, s40 -; GCN-NOHSA-VI-NEXT: s_and_b32 s57, s3, s40 -; GCN-NOHSA-VI-NEXT: s_and_b32 s59, s2, s40 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s60, s5, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, s40 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s61, s4, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, s40 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s62, s7, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s7, s40 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s63, s6, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, s40 -; GCN-NOHSA-VI-NEXT: s_and_b32 s64, s9, s40 -; GCN-NOHSA-VI-NEXT: s_and_b32 s65, s8, s40 -; GCN-NOHSA-VI-NEXT: s_and_b32 s66, s11, s40 -; GCN-NOHSA-VI-NEXT: s_and_b32 s67, s10, s40 -; GCN-NOHSA-VI-NEXT: s_and_b32 s68, s13, s40 -; GCN-NOHSA-VI-NEXT: s_and_b32 s69, s12, s40 -; GCN-NOHSA-VI-NEXT: s_and_b32 s70, s15, s40 -; GCN-NOHSA-VI-NEXT: s_and_b32 s40, s14, s40 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s15, s15, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s14, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s52, s1, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s54, s0, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s56, s3, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s58, s2, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s13, s13, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s12, s12, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s49, s31, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s69, s15, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s15, s15, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s70, s14, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s14, s14, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s51, s1, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s52, s1, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s53, s0, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s54, s0, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s55, s3, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s56, s3, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s57, s2, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s58, s2, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s67, s13, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s13, s13, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s68, s12, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s12, s12, 0xffff ; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s36 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s37 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s40 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s14 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s70 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s15 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s11, s11, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s10, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s14 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s70 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s15 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s69 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s65, s11, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s11, s11, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s66, s10, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s10, s10, 0xffff ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s9, s9, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s69 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s12 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s68 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s13 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s8, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s63, s9, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s12 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s68 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s13 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s67 +; GCN-NOHSA-VI-NEXT: s_and_b32 s9, s9, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s64, s8, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s8, s8, 0xffff ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 -; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s67 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s10 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s66 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s11 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s61, s7, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s66 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s11 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s65 +; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s7, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s62, s6, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, 0xffff ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208 -; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s65 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s8 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s64 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s9 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s59, s5, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s64 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s9 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s63 +; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s60, s4, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xffff ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192 -; GCN-NOHSA-VI-NEXT: s_nop 0 +; GCN-NOHSA-VI-NEXT: s_and_b32 s31, s31, 0xffff ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s63 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s62 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s62 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s61 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176 -; GCN-NOHSA-VI-NEXT: s_nop 0 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s50, s30, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s61 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s60 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s60 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s59 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 -; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s59 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s58 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s57 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s56 +; GCN-NOHSA-VI-NEXT: s_and_b32 s30, s30, 0xffff +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s58 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s57 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s56 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s55 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 -; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s55 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s54 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s53 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s52 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s47, s29, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s54 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s53 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s52 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s51 +; GCN-NOHSA-VI-NEXT: s_and_b32 s29, s29, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s48, s28, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s28, s28, 0xffff ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 -; GCN-NOHSA-VI-NEXT: s_nop 0 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s45, s27, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s30 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s51 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s50 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s31 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s50 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s49 +; GCN-NOHSA-VI-NEXT: s_and_b32 s27, s27, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s46, s26, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s26, s26, 0xffff ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 -; GCN-NOHSA-VI-NEXT: s_nop 0 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s43, s25, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s28 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s49 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s48 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s29 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s48 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s47 +; GCN-NOHSA-VI-NEXT: s_and_b32 s25, s25, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s44, s24, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s24, s24, 0xffff ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 -; GCN-NOHSA-VI-NEXT: s_nop 0 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s41, s23, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s26 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s47 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s46 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s27 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s46 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s45 +; GCN-NOHSA-VI-NEXT: s_and_b32 s23, s23, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s42, s22, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s22, s22, 0xffff ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 -; GCN-NOHSA-VI-NEXT: s_nop 0 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s39, s21, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s24 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s45 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s44 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s25 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s44 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s43 +; GCN-NOHSA-VI-NEXT: s_and_b32 s21, s21, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s40, s20, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s20, s20, 0xffff ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 -; GCN-NOHSA-VI-NEXT: s_nop 0 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s35, s19, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s22 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s43 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s42 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s23 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s42 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s41 +; GCN-NOHSA-VI-NEXT: s_and_b32 s19, s19, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s38, s18, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s18, s18, 0xffff ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: s_nop 0 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s33, s17, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s20 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s41 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s40 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s21 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s39 +; GCN-NOHSA-VI-NEXT: s_and_b32 s17, s17, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s34, s16, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s16, s16, 0xffff ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GCN-NOHSA-VI-NEXT: s_nop 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s18 @@ -4847,52 +4807,48 @@ ; GCN-NOHSA-SI-LABEL: constant_zextload_v4i16_to_v4i64: ; GCN-NOHSA-SI: ; %bb.0: ; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 +; GCN-NOHSA-SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, 0xffff +; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, v1 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s0, s3, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s1, s2, 16 -; GCN-NOHSA-SI-NEXT: s_and_b32 s3, s3, s8 -; GCN-NOHSA-SI-NEXT: s_and_b32 s2, s2, s8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s3 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s0 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s6, s5, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s7, s4, 16 +; GCN-NOHSA-SI-NEXT: s_and_b32 s5, s5, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, 0xffff +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s6 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s1 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s7 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: constant_zextload_v4i16_to_v4i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-HSA-NEXT: s_mov_b32 s4, 0xffff ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_lshr_b32 s5, s3, 16 -; GCN-HSA-NEXT: s_lshr_b32 s6, s2, 16 -; GCN-HSA-NEXT: s_and_b32 s7, s2, s4 -; GCN-HSA-NEXT: s_and_b32 s2, s3, s4 +; GCN-HSA-NEXT: s_lshr_b32 s4, s3, 16 +; GCN-HSA-NEXT: s_lshr_b32 s5, s2, 16 +; GCN-HSA-NEXT: s_and_b32 s6, s2, 0xffff +; GCN-HSA-NEXT: s_and_b32 s2, s3, 0xffff ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_endpgm @@ -4900,19 +4856,18 @@ ; GCN-NOHSA-VI-LABEL: constant_zextload_v4i16_to_v4i64: ; GCN-NOHSA-VI: ; %bb.0: ; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, 0xffff ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_and_b32 s0, s2, s8 +; GCN-NOHSA-VI-NEXT: s_and_b32 s0, s2, 0xffff ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s1, s2, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s2, s3, s8 +; GCN-NOHSA-VI-NEXT: s_and_b32 s2, s3, 0xffff ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s3, s3, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s3 @@ -5090,39 +5045,36 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(4)* %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_zextload_v8i16_to_v8i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, 0xffff +; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, v1 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s4, s9, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s5, s11, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s7, s10, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s12, s8, 16 -; GCN-NOHSA-SI-NEXT: s_and_b32 s8, s8, s6 -; GCN-NOHSA-SI-NEXT: s_and_b32 s10, s10, s6 -; GCN-NOHSA-SI-NEXT: s_and_b32 s11, s11, s6 -; GCN-NOHSA-SI-NEXT: s_and_b32 s6, s9, s6 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s11 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s5 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s8, s5, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s9, s7, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s10, s6, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s11, s4, 16 +; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s6, s6, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s7, s7, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s5, s5, 0xffff +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s7 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s9 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s8 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s10 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s7 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s10 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s12 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s11 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; @@ -5133,19 +5085,18 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 -; GCN-HSA-NEXT: s_mov_b32 s2, 0xffff ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_lshr_b32 s8, s5, 16 -; GCN-HSA-NEXT: s_lshr_b32 s3, s7, 16 +; GCN-HSA-NEXT: s_lshr_b32 s2, s7, 16 ; GCN-HSA-NEXT: s_lshr_b32 s9, s6, 16 ; GCN-HSA-NEXT: s_lshr_b32 s10, s4, 16 -; GCN-HSA-NEXT: s_and_b32 s4, s4, s2 -; GCN-HSA-NEXT: s_and_b32 s6, s6, s2 -; GCN-HSA-NEXT: s_and_b32 s5, s5, s2 -; GCN-HSA-NEXT: s_and_b32 s2, s7, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: s_and_b32 s4, s4, 0xffff +; GCN-HSA-NEXT: s_and_b32 s6, s6, 0xffff +; GCN-HSA-NEXT: s_and_b32 s5, s5, 0xffff +; GCN-HSA-NEXT: s_and_b32 s3, s7, 0xffff +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 @@ -5173,39 +5124,35 @@ ; ; GCN-NOHSA-VI-LABEL: constant_zextload_v8i16_to_v8i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, 0xffff -; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s8, s6 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s5, s8, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s9, s6 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s9, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s9, s10, s6 -; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s11, s6 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s11, s11, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s10, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s11 +; GCN-NOHSA-VI-NEXT: s_and_b32 s11, s7, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s7, s7, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s10, s6, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s6, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s11 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7 +; GCN-NOHSA-VI-NEXT: s_and_b32 s9, s5, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s5, s5, 16 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s9 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s10 +; GCN-NOHSA-VI-NEXT: s_and_b32 s8, s4, 0xffff +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s6 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s4, s4, 16 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s7 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s8 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s9 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s4 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; @@ -5464,39 +5411,36 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(4)* %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_zextload_v16i16_to_v16i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x9 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_load_dwordx8 s[4:11], s[14:15], 0x0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 +; GCN-NOHSA-SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s14, 0xffff +; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, v1 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s12 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s13 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s12, s5, 16 ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s13, s7, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s15, s11, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s16, s9, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s17, s8, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s10, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s19, s6, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s4, 16 -; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, s14 -; GCN-NOHSA-SI-NEXT: s_and_b32 s6, s6, s14 -; GCN-NOHSA-SI-NEXT: s_and_b32 s10, s10, s14 -; GCN-NOHSA-SI-NEXT: s_and_b32 s8, s8, s14 -; GCN-NOHSA-SI-NEXT: s_and_b32 s5, s5, s14 -; GCN-NOHSA-SI-NEXT: s_and_b32 s7, s7, s14 -; GCN-NOHSA-SI-NEXT: s_and_b32 s9, s9, s14 -; GCN-NOHSA-SI-NEXT: s_and_b32 s11, s11, s14 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s11, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s15, s9, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s16, s8, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s17, s10, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s6, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s19, s4, 16 +; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s6, s6, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s10, s10, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s8, s8, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s5, s5, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s7, s7, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s9, s9, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s11, s11, 0xffff ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s15 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s11 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s15 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s14 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s7 @@ -5508,19 +5452,19 @@ ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s17 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s16 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s10 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s18 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s17 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s19 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s18 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s20 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s19 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; @@ -5531,27 +5475,26 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 -; GCN-HSA-NEXT: s_mov_b32 s2, 0xffff ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_lshr_b32 s12, s5, 16 ; GCN-HSA-NEXT: s_lshr_b32 s13, s7, 16 ; GCN-HSA-NEXT: s_lshr_b32 s14, s11, 16 -; GCN-HSA-NEXT: s_lshr_b32 s3, s9, 16 +; GCN-HSA-NEXT: s_lshr_b32 s2, s9, 16 ; GCN-HSA-NEXT: s_lshr_b32 s15, s8, 16 ; GCN-HSA-NEXT: s_lshr_b32 s16, s10, 16 ; GCN-HSA-NEXT: s_lshr_b32 s17, s6, 16 ; GCN-HSA-NEXT: s_lshr_b32 s18, s4, 16 -; GCN-HSA-NEXT: s_and_b32 s4, s4, s2 -; GCN-HSA-NEXT: s_and_b32 s6, s6, s2 -; GCN-HSA-NEXT: s_and_b32 s10, s10, s2 -; GCN-HSA-NEXT: s_and_b32 s8, s8, s2 -; GCN-HSA-NEXT: s_and_b32 s5, s5, s2 -; GCN-HSA-NEXT: s_and_b32 s7, s7, s2 -; GCN-HSA-NEXT: s_and_b32 s11, s11, s2 -; GCN-HSA-NEXT: s_and_b32 s2, s9, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: s_and_b32 s4, s4, 0xffff +; GCN-HSA-NEXT: s_and_b32 s6, s6, 0xffff +; GCN-HSA-NEXT: s_and_b32 s10, s10, 0xffff +; GCN-HSA-NEXT: s_and_b32 s8, s8, 0xffff +; GCN-HSA-NEXT: s_and_b32 s5, s5, 0xffff +; GCN-HSA-NEXT: s_and_b32 s7, s7, 0xffff +; GCN-HSA-NEXT: s_and_b32 s11, s11, 0xffff +; GCN-HSA-NEXT: s_and_b32 s3, s9, 0xffff +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 @@ -5607,50 +5550,47 @@ ; ; GCN-NOHSA-VI-LABEL: constant_zextload_v16i16_to_v16i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_load_dwordx8 s[4:11], s[14:15], 0x0 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s14, 0xffff -; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s12 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s13 +; GCN-NOHSA-VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_and_b32 s12, s4, s14 -; GCN-NOHSA-VI-NEXT: s_and_b32 s13, s5, s14 -; GCN-NOHSA-VI-NEXT: s_and_b32 s15, s6, s14 -; GCN-NOHSA-VI-NEXT: s_and_b32 s16, s7, s14 -; GCN-NOHSA-VI-NEXT: s_and_b32 s17, s8, s14 -; GCN-NOHSA-VI-NEXT: s_and_b32 s18, s9, s14 -; GCN-NOHSA-VI-NEXT: s_and_b32 s19, s10, s14 -; GCN-NOHSA-VI-NEXT: s_and_b32 s14, s11, s14 +; GCN-NOHSA-VI-NEXT: s_and_b32 s19, s11, 0xffff ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s11, s11, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s18, s10, 0xffff ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s10, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s14 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s19 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s11 +; GCN-NOHSA-VI-NEXT: s_and_b32 s17, s9, 0xffff ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s9, s9, 16 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s8, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s19 +; GCN-NOHSA-VI-NEXT: s_and_b32 s16, s8, 0xffff +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s18 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s10 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s8, 16 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s7, s7, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s18 +; GCN-NOHSA-VI-NEXT: s_and_b32 s15, s7, 0xffff +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s17 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s9 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s7, s7, 16 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s6, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s17 +; GCN-NOHSA-VI-NEXT: s_and_b32 s14, s6, 0xffff +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s8 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s6, 16 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s5, s5, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s13, s5, 0xffff +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s15 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s5, s5, 16 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s4, s4, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s15 +; GCN-NOHSA-VI-NEXT: s_and_b32 s12, s4, 0xffff +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s14 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s6 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s4, s4, 16 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GCN-NOHSA-VI-NEXT: s_nop 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s13 @@ -6109,353 +6049,346 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(4)* %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_zextload_v32i16_to_v32i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s18, 0xffff +; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s19, s1, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s3, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s21, s5, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s7, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s23, s9, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s11, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s25, s13, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s15, 16 -; GCN-NOHSA-SI-NEXT: s_and_b32 s27, s0, s18 -; GCN-NOHSA-SI-NEXT: s_and_b32 s28, s2, s18 -; GCN-NOHSA-SI-NEXT: s_and_b32 s29, s4, s18 -; GCN-NOHSA-SI-NEXT: s_and_b32 s30, s6, s18 -; GCN-NOHSA-SI-NEXT: s_and_b32 s31, s8, s18 -; GCN-NOHSA-SI-NEXT: s_and_b32 s33, s10, s18 -; GCN-NOHSA-SI-NEXT: s_and_b32 s34, s12, s18 -; GCN-NOHSA-SI-NEXT: s_and_b32 s35, s14, s18 -; GCN-NOHSA-SI-NEXT: s_and_b32 s36, s1, s18 -; GCN-NOHSA-SI-NEXT: s_and_b32 s37, s3, s18 -; GCN-NOHSA-SI-NEXT: s_and_b32 s5, s5, s18 -; GCN-NOHSA-SI-NEXT: s_and_b32 s7, s7, s18 -; GCN-NOHSA-SI-NEXT: s_and_b32 s9, s9, s18 -; GCN-NOHSA-SI-NEXT: s_and_b32 s11, s11, s18 -; GCN-NOHSA-SI-NEXT: s_and_b32 s13, s13, s18 -; GCN-NOHSA-SI-NEXT: s_and_b32 s15, s15, s18 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s14, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s12, s12, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s10, s10, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s8, s8, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s6, s6, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s4, s4, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s2, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s38, s0, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s5, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s21, s7, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s9, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s23, s11, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s13, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s25, s15, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s17, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s27, s19, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s18, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s29, s16, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s14, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s31, s12, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s33, s10, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s34, s8, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s35, s6, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s36, s4, 16 +; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s6, s6, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s8, s8, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s10, s10, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s12, s12, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s14, s14, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s16, s16, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s18, s18, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s5, s5, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s7, s7, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s9, s9, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s11, s11, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s13, s13, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s15, s15, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s17, s17, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s19, s19, 0xffff ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, v1 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s16 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s17 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s15 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s26 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s19 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s27 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s13 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s25 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s17 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s26 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s11 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s15 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s25 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s23 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s13 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s24 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s7 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s22 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s11 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s23 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s5 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s21 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s9 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s22 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s37 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s20 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s7 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s21 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s36 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s19 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s20 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s35 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s14 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s18 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s28 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s34 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s12 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s29 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s33 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s10 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s14 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s30 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s31 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s12 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s31 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s30 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s6 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s10 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s33 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s29 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s34 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s28 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s18 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s35 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s27 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s38 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s36 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: constant_zextload_v32i16_to_v32i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 -; GCN-HSA-NEXT: s_mov_b32 s18, 0xffff +; GCN-HSA-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_and_b32 s19, s0, s18 -; GCN-HSA-NEXT: s_and_b32 s20, s2, s18 -; GCN-HSA-NEXT: s_and_b32 s21, s4, s18 -; GCN-HSA-NEXT: s_and_b32 s22, s6, s18 -; GCN-HSA-NEXT: s_and_b32 s23, s8, s18 -; GCN-HSA-NEXT: s_and_b32 s24, s10, s18 -; GCN-HSA-NEXT: s_and_b32 s25, s12, s18 -; GCN-HSA-NEXT: s_and_b32 s26, s14, s18 -; GCN-HSA-NEXT: s_and_b32 s27, s1, s18 -; GCN-HSA-NEXT: s_and_b32 s28, s3, s18 -; GCN-HSA-NEXT: s_and_b32 s29, s5, s18 -; GCN-HSA-NEXT: s_and_b32 s30, s7, s18 -; GCN-HSA-NEXT: s_and_b32 s31, s9, s18 -; GCN-HSA-NEXT: s_and_b32 s33, s11, s18 -; GCN-HSA-NEXT: s_and_b32 s34, s13, s18 -; GCN-HSA-NEXT: s_and_b32 s18, s15, s18 -; GCN-HSA-NEXT: s_lshr_b32 s35, s1, 16 -; GCN-HSA-NEXT: s_lshr_b32 s3, s3, 16 -; GCN-HSA-NEXT: s_lshr_b32 s5, s5, 16 -; GCN-HSA-NEXT: s_lshr_b32 s7, s7, 16 -; GCN-HSA-NEXT: s_lshr_b32 s9, s9, 16 -; GCN-HSA-NEXT: s_lshr_b32 s11, s11, 16 -; GCN-HSA-NEXT: s_lshr_b32 s13, s13, 16 -; GCN-HSA-NEXT: s_lshr_b32 s15, s15, 16 -; GCN-HSA-NEXT: s_lshr_b32 s14, s14, 16 -; GCN-HSA-NEXT: s_lshr_b32 s12, s12, 16 -; GCN-HSA-NEXT: s_lshr_b32 s10, s10, 16 -; GCN-HSA-NEXT: s_lshr_b32 s8, s8, 16 -; GCN-HSA-NEXT: s_lshr_b32 s6, s6, 16 -; GCN-HSA-NEXT: s_lshr_b32 s4, s4, 16 -; GCN-HSA-NEXT: s_lshr_b32 s2, s2, 16 -; GCN-HSA-NEXT: s_lshr_b32 s36, s0, 16 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xf0 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xd0 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xb0 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x90 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s18 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s15 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x70 +; GCN-HSA-NEXT: s_lshr_b32 s3, s5, 16 +; GCN-HSA-NEXT: s_lshr_b32 s20, s7, 16 +; GCN-HSA-NEXT: s_lshr_b32 s21, s9, 16 +; GCN-HSA-NEXT: s_lshr_b32 s22, s11, 16 +; GCN-HSA-NEXT: s_lshr_b32 s23, s13, 16 +; GCN-HSA-NEXT: s_lshr_b32 s24, s15, 16 +; GCN-HSA-NEXT: s_lshr_b32 s25, s17, 16 +; GCN-HSA-NEXT: s_lshr_b32 s26, s19, 16 +; GCN-HSA-NEXT: s_lshr_b32 s27, s18, 16 +; GCN-HSA-NEXT: s_lshr_b32 s28, s16, 16 +; GCN-HSA-NEXT: s_lshr_b32 s29, s14, 16 +; GCN-HSA-NEXT: s_lshr_b32 s30, s12, 16 +; GCN-HSA-NEXT: s_lshr_b32 s31, s10, 16 +; GCN-HSA-NEXT: s_lshr_b32 s33, s8, 16 +; GCN-HSA-NEXT: s_lshr_b32 s34, s6, 16 +; GCN-HSA-NEXT: s_lshr_b32 s2, s4, 16 +; GCN-HSA-NEXT: s_and_b32 s35, s4, 0xffff +; GCN-HSA-NEXT: s_and_b32 s6, s6, 0xffff +; GCN-HSA-NEXT: s_and_b32 s8, s8, 0xffff +; GCN-HSA-NEXT: s_and_b32 s10, s10, 0xffff +; GCN-HSA-NEXT: s_and_b32 s12, s12, 0xffff +; GCN-HSA-NEXT: s_and_b32 s14, s14, 0xffff +; GCN-HSA-NEXT: s_and_b32 s16, s16, 0xffff +; GCN-HSA-NEXT: s_and_b32 s18, s18, 0xffff +; GCN-HSA-NEXT: s_and_b32 s36, s5, 0xffff +; GCN-HSA-NEXT: s_and_b32 s7, s7, 0xffff +; GCN-HSA-NEXT: s_and_b32 s9, s9, 0xffff +; GCN-HSA-NEXT: s_and_b32 s11, s11, 0xffff +; GCN-HSA-NEXT: s_and_b32 s13, s13, 0xffff +; GCN-HSA-NEXT: s_and_b32 s15, s15, 0xffff +; GCN-HSA-NEXT: s_and_b32 s17, s17, 0xffff +; GCN-HSA-NEXT: s_and_b32 s19, s19, 0xffff +; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xf0 +; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xd0 +; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s4 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xb0 +; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s4 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x90 +; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s19 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s26 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s5 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s34 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s17 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s25 ; GCN-HSA-NEXT: flat_store_dwordx4 v[6:7], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s33 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s11 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x70 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s15 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s24 ; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s31 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s9 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x50 +; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s23 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3] -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s30 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s11 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s22 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x50 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 48 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s29 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s5 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s9 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s21 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 48 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s28 +; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s20 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 16 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s36 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s3 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xe0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xe0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s27 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s35 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s18 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s27 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xc0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xc0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s26 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s14 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s16 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s28 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xa0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xa0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s25 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s12 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s29 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x80 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x80 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s24 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s10 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s12 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s30 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x60 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x60 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s23 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s8 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s31 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 64 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 64 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s22 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s6 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s33 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 32 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 32 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s21 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s4 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s34 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s20 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s35 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s19 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s36 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s17 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: constant_zextload_v32i16_to_v32i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[20:23], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s19, 0xf000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s18, -1 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[0:15], s[22:23], 0x0 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s22, 0xffff -; GCN-NOHSA-VI-NEXT: s_mov_b32 s16, s20 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s17, s21 +; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s19, 0xf000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s18, -1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_and_b32 s20, s0, s22 -; GCN-NOHSA-VI-NEXT: s_and_b32 s21, s1, s22 -; GCN-NOHSA-VI-NEXT: s_and_b32 s23, s2, s22 -; GCN-NOHSA-VI-NEXT: s_and_b32 s24, s3, s22 -; GCN-NOHSA-VI-NEXT: s_and_b32 s25, s4, s22 -; GCN-NOHSA-VI-NEXT: s_and_b32 s26, s5, s22 -; GCN-NOHSA-VI-NEXT: s_and_b32 s27, s6, s22 -; GCN-NOHSA-VI-NEXT: s_and_b32 s28, s7, s22 -; GCN-NOHSA-VI-NEXT: s_and_b32 s29, s8, s22 -; GCN-NOHSA-VI-NEXT: s_and_b32 s30, s9, s22 -; GCN-NOHSA-VI-NEXT: s_and_b32 s31, s10, s22 -; GCN-NOHSA-VI-NEXT: s_and_b32 s33, s11, s22 -; GCN-NOHSA-VI-NEXT: s_and_b32 s34, s12, s22 -; GCN-NOHSA-VI-NEXT: s_and_b32 s35, s13, s22 -; GCN-NOHSA-VI-NEXT: s_and_b32 s36, s14, s22 -; GCN-NOHSA-VI-NEXT: s_and_b32 s22, s15, s22 +; GCN-NOHSA-VI-NEXT: s_and_b32 s36, s15, 0xffff ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s15, s15, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s35, s14, 0xffff ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s14, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s22 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s36 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s15 +; GCN-NOHSA-VI-NEXT: s_and_b32 s34, s13, 0xffff ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s13, s13, 16 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:240 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s12, s12, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s36 +; GCN-NOHSA-VI-NEXT: s_and_b32 s33, s12, 0xffff +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s35 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s14 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s12, s12, 16 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:224 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s11, s11, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s35 +; GCN-NOHSA-VI-NEXT: s_and_b32 s31, s11, 0xffff +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s34 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s13 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s11, s11, 16 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:208 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s10, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s34 +; GCN-NOHSA-VI-NEXT: s_and_b32 s30, s10, 0xffff +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s33 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s12 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s10, 16 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:192 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s9, s9, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s33 +; GCN-NOHSA-VI-NEXT: s_and_b32 s29, s9, 0xffff +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s31 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s11 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s9, s9, 16 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:176 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s8, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s31 +; GCN-NOHSA-VI-NEXT: s_and_b32 s28, s8, 0xffff +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s30 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s10 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s8, 16 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:160 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s7, s7, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s30 +; GCN-NOHSA-VI-NEXT: s_and_b32 s27, s7, 0xffff +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s29 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s9 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s7, s7, 16 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:144 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s6, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s29 +; GCN-NOHSA-VI-NEXT: s_and_b32 s26, s6, 0xffff +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s28 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s8 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s6, 16 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:128 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s5, s5, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s28 +; GCN-NOHSA-VI-NEXT: s_and_b32 s25, s5, 0xffff +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s27 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s5, s5, 16 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:112 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s4, s4, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s27 +; GCN-NOHSA-VI-NEXT: s_and_b32 s24, s4, 0xffff +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s26 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s6 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s4, s4, 16 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:96 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s3, s3, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s26 +; GCN-NOHSA-VI-NEXT: s_and_b32 s23, s3, 0xffff +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s25 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s3, s3, 16 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:80 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s2, s2, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s25 +; GCN-NOHSA-VI-NEXT: s_and_b32 s22, s2, 0xffff +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s24 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s2, s2, 16 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:64 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s1, s1, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s24 +; GCN-NOHSA-VI-NEXT: s_and_b32 s21, s1, 0xffff +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s23 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s3 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s1, s1, 16 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:48 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s0, s0, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s23 +; GCN-NOHSA-VI-NEXT: s_and_b32 s20, s0, 0xffff +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s22 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s0, s0, 16 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:32 ; GCN-NOHSA-VI-NEXT: s_nop 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s21 diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll --- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll @@ -3632,24 +3632,21 @@ ; GCN-HSA-LABEL: global_zextload_v64i16_to_v64i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-HSA-NEXT: s_movk_i32 s12, 0x50 -; GCN-HSA-NEXT: s_movk_i32 s13, 0x60 -; GCN-HSA-NEXT: s_movk_i32 s14, 0x70 -; GCN-HSA-NEXT: s_mov_b32 s15, 0xffff +; GCN-HSA-NEXT: s_mov_b32 s12, 0xffff ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; GCN-HSA-NEXT: s_add_u32 s4, s2, s12 +; GCN-HSA-NEXT: s_add_u32 s4, s2, 0x50 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] -; GCN-HSA-NEXT: s_add_u32 s4, s2, s13 +; GCN-HSA-NEXT: s_add_u32 s4, s2, 0x60 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s4 -; GCN-HSA-NEXT: s_add_u32 s4, s2, s14 +; GCN-HSA-NEXT: s_add_u32 s4, s2, 0x70 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s4 @@ -3657,12 +3654,12 @@ ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: s_add_u32 s6, s2, 32 ; GCN-HSA-NEXT: s_addc_u32 s7, s3, 0 -; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9] ; GCN-HSA-NEXT: s_add_u32 s8, s2, 48 +; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9] ; GCN-HSA-NEXT: s_addc_u32 s9, s3, 0 ; GCN-HSA-NEXT: s_add_u32 s2, s2, 64 -; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[12:13] ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 +; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[12:13] ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 ; GCN-HSA-NEXT: flat_load_dwordx4 v[16:19], v[16:17] @@ -3679,8 +3676,8 @@ ; GCN-HSA-NEXT: s_waitcnt vmcnt(6) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v27, 16, v1 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v25, 16, v0 -; GCN-HSA-NEXT: v_and_b32_e32 v26, s15, v1 -; GCN-HSA-NEXT: v_and_b32_e32 v24, s15, v0 +; GCN-HSA-NEXT: v_and_b32_e32 v26, s12, v1 +; GCN-HSA-NEXT: v_and_b32_e32 v24, s12, v0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s7 ; GCN-HSA-NEXT: flat_load_dwordx4 v[32:35], v[0:1] @@ -3698,16 +3695,16 @@ ; GCN-HSA-NEXT: s_add_u32 s10, s0, 0xa0 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v27, 16, v3 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v25, 16, v2 -; GCN-HSA-NEXT: v_and_b32_e32 v26, s15, v3 -; GCN-HSA-NEXT: v_and_b32_e32 v24, s15, v2 +; GCN-HSA-NEXT: v_and_b32_e32 v26, s12, v3 +; GCN-HSA-NEXT: v_and_b32_e32 v24, s12, v2 ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[24:27] ; GCN-HSA-NEXT: s_addc_u32 s11, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 ; GCN-HSA-NEXT: s_waitcnt vmcnt(8) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v5 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v4 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s15, v5 -; GCN-HSA-NEXT: v_and_b32_e32 v0, s15, v4 +; GCN-HSA-NEXT: v_and_b32_e32 v2, s12, v5 +; GCN-HSA-NEXT: v_and_b32_e32 v0, s12, v4 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xb0 @@ -3717,62 +3714,62 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v7 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s15, v7 -; GCN-HSA-NEXT: v_and_b32_e32 v0, s15, v6 +; GCN-HSA-NEXT: v_and_b32_e32 v2, s12, v7 +; GCN-HSA-NEXT: v_and_b32_e32 v0, s12, v6 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s7 ; GCN-HSA-NEXT: s_waitcnt vmcnt(9) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v9 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v8 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s15, v9 -; GCN-HSA-NEXT: v_and_b32_e32 v0, s15, v8 +; GCN-HSA-NEXT: v_and_b32_e32 v2, s12, v9 +; GCN-HSA-NEXT: v_and_b32_e32 v0, s12, v8 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v11 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v10 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s15, v11 -; GCN-HSA-NEXT: v_and_b32_e32 v0, s15, v10 +; GCN-HSA-NEXT: v_and_b32_e32 v2, s12, v11 +; GCN-HSA-NEXT: v_and_b32_e32 v0, s12, v10 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s9 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s5 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_waitcnt vmcnt(10) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v14 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v13 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v12 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s15, v13 -; GCN-HSA-NEXT: v_and_b32_e32 v0, s15, v12 -; GCN-HSA-NEXT: v_and_b32_e32 v4, s15, v14 +; GCN-HSA-NEXT: v_and_b32_e32 v2, s12, v13 +; GCN-HSA-NEXT: v_and_b32_e32 v0, s12, v12 +; GCN-HSA-NEXT: v_and_b32_e32 v4, s12, v14 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s4 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v15 -; GCN-HSA-NEXT: v_and_b32_e32 v6, s15, v15 +; GCN-HSA-NEXT: v_and_b32_e32 v6, s12, v15 ; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[0:3] ; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] ; GCN-HSA-NEXT: s_waitcnt vmcnt(11) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v17 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v16 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s15, v17 -; GCN-HSA-NEXT: v_and_b32_e32 v0, s15, v16 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_and_b32_e32 v2, s12, v17 +; GCN-HSA-NEXT: v_and_b32_e32 v0, s12, v16 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, s13 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 +; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3] +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v19 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v18 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s15, v19 -; GCN-HSA-NEXT: v_and_b32_e32 v0, s15, v18 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_and_b32_e32 v2, s12, v19 +; GCN-HSA-NEXT: v_and_b32_e32 v0, s12, v18 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, s14 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 @@ -3780,47 +3777,47 @@ ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, s12 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 ; GCN-HSA-NEXT: s_waitcnt vmcnt(10) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v15, 16, v33 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v32 -; GCN-HSA-NEXT: v_and_b32_e32 v14, s15, v33 -; GCN-HSA-NEXT: v_and_b32_e32 v12, s15, v32 +; GCN-HSA-NEXT: v_and_b32_e32 v14, s12, v33 +; GCN-HSA-NEXT: v_and_b32_e32 v12, s12, v32 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v21 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v20 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s15, v21 -; GCN-HSA-NEXT: v_and_b32_e32 v0, s15, v20 +; GCN-HSA-NEXT: v_and_b32_e32 v2, s12, v21 +; GCN-HSA-NEXT: v_and_b32_e32 v0, s12, v20 ; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15] ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v23 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v22 -; GCN-HSA-NEXT: v_and_b32_e32 v6, s15, v23 -; GCN-HSA-NEXT: v_and_b32_e32 v4, s15, v22 +; GCN-HSA-NEXT: v_and_b32_e32 v6, s12, v23 +; GCN-HSA-NEXT: v_and_b32_e32 v4, s12, v22 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 ; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v35 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v34 -; GCN-HSA-NEXT: v_and_b32_e32 v10, s15, v35 -; GCN-HSA-NEXT: v_and_b32_e32 v8, s15, v34 +; GCN-HSA-NEXT: v_and_b32_e32 v10, s12, v35 +; GCN-HSA-NEXT: v_and_b32_e32 v8, s12, v34 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; GCN-HSA-NEXT: s_add_u32 s0, s0, 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v29 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v28 -; GCN-HSA-NEXT: v_and_b32_e32 v6, s15, v29 -; GCN-HSA-NEXT: v_and_b32_e32 v4, s15, v28 +; GCN-HSA-NEXT: v_and_b32_e32 v6, s12, v29 +; GCN-HSA-NEXT: v_and_b32_e32 v4, s12, v28 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v31 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v30 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s15, v31 -; GCN-HSA-NEXT: v_and_b32_e32 v0, s15, v30 +; GCN-HSA-NEXT: v_and_b32_e32 v2, s12, v31 +; GCN-HSA-NEXT: v_and_b32_e32 v0, s12, v30 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_endpgm @@ -4415,44 +4412,41 @@ ; GCN-HSA-LABEL: global_sextload_v64i16_to_v64i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-HSA-NEXT: s_movk_i32 s7, 0x70 -; GCN-HSA-NEXT: s_movk_i32 s8, 0x60 -; GCN-HSA-NEXT: s_movk_i32 s6, 0x50 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1] -; GCN-HSA-NEXT: s_add_u32 s4, s2, s7 +; GCN-HSA-NEXT: s_add_u32 s4, s2, 0x70 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[0:1] -; GCN-HSA-NEXT: s_add_u32 s4, s2, s8 +; GCN-HSA-NEXT: s_add_u32 s4, s2, 0x60 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 -; GCN-HSA-NEXT: s_add_u32 s4, s2, s6 ; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] +; GCN-HSA-NEXT: s_add_u32 s4, s2, 0x50 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 -; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s4, s2, 64 +; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s4 ; GCN-HSA-NEXT: flat_load_dwordx4 v[16:19], v[16:17] ; GCN-HSA-NEXT: s_add_u32 s4, s2, 48 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCN-HSA-NEXT: s_add_u32 s10, s2, 32 -; GCN-HSA-NEXT: s_addc_u32 s11, s3, 0 +; GCN-HSA-NEXT: s_add_u32 s6, s2, 32 +; GCN-HSA-NEXT: s_addc_u32 s7, s3, 0 ; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s5 ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v20, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v33, s11 +; GCN-HSA-NEXT: v_mov_b32_e32 v33, s7 ; GCN-HSA-NEXT: flat_load_dwordx4 v[20:23], v[20:21] -; GCN-HSA-NEXT: v_mov_b32_e32 v32, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v32, s6 ; GCN-HSA-NEXT: v_mov_b32_e32 v37, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v36, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(5) @@ -4505,37 +4499,37 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xa0 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xb0 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 16, v5 ; GCN-HSA-NEXT: v_bfe_i32 v10, v5, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v8, v4, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[8:11] -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xb0 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 16, v7 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 16, v6 ; GCN-HSA-NEXT: v_bfe_i32 v9, v7, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v7, v6, 0, 16 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 ; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[7:10] ; GCN-HSA-NEXT: s_waitcnt vmcnt(10) ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 16, v0 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 16, v1 ; GCN-HSA-NEXT: v_bfe_i32 v6, v1, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v4, v0, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[4:7] ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 16, v3 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 16, v2 ; GCN-HSA-NEXT: v_bfe_i32 v10, v3, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v8, v2, 0, 16 +; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[4:7] +; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[8:11] +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; GCN-HSA-NEXT: s_waitcnt vmcnt(11) ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v17 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v16 @@ -4545,7 +4539,7 @@ ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, s8 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v19 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v18 ; GCN-HSA-NEXT: v_bfe_i32 v2, v19, 0, 16 @@ -4554,7 +4548,7 @@ ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, s7 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 @@ -4567,7 +4561,7 @@ ; GCN-HSA-NEXT: v_bfe_i32 v10, v23, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v8, v22, 0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, s6 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 ; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; GCN-HSA-NEXT: s_waitcnt vmcnt(11) ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 16, v33 @@ -6590,8 +6584,8 @@ ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 +; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 @@ -6604,8 +6598,7 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s4 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x50 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, 0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(1) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v1 ; GCN-HSA-NEXT: v_and_b32_e32 v9, s6, v1 @@ -6613,50 +6606,53 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s4 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x70 -; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(1) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v5 ; GCN-HSA-NEXT: v_and_b32_e32 v9, s6, v5 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 +; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[13:14], v[9:12] ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s5 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s4 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 32 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v7 ; GCN-HSA-NEXT: v_and_b32_e32 v9, s6, v7 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s3 +; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[13:14], v[9:12] -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s2 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v3 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v2 -; GCN-HSA-NEXT: v_and_b32_e32 v11, s6, v2 -; GCN-HSA-NEXT: v_and_b32_e32 v7, s6, v3 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s5 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v2 +; GCN-HSA-NEXT: v_and_b32_e32 v9, s6, v2 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[15:16], v[7:10] -; GCN-HSA-NEXT: flat_store_dwordx4 v[17:18], v[11:14] -; GCN-HSA-NEXT: v_mov_b32_e32 v1, v8 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, v8 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v6 -; GCN-HSA-NEXT: v_and_b32_e32 v11, s6, v6 -; GCN-HSA-NEXT: v_and_b32_e32 v6, s6, v4 +; GCN-HSA-NEXT: flat_store_dwordx4 v[13:14], v[9:12] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 16, v4 +; GCN-HSA-NEXT: v_and_b32_e32 v12, s6, v4 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 0x60 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v3 +; GCN-HSA-NEXT: v_and_b32_e32 v7, s6, v3 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, v14 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s3 +; GCN-HSA-NEXT: flat_store_dwordx4 v[1:2], v[7:10] +; GCN-HSA-NEXT: v_mov_b32_e32 v13, v8 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, v8 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, v8 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, 0 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v10, 16, v6 +; GCN-HSA-NEXT: v_and_b32_e32 v8, s6, v6 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s3 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GCN-HSA-NEXT: v_and_b32_e32 v0, s6, v0 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, v14 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: flat_store_dwordx4 v[15:16], v[6:9] -; GCN-HSA-NEXT: flat_store_dwordx4 v[17:18], v[11:14] +; GCN-HSA-NEXT: flat_store_dwordx4 v[6:7], v[12:15] +; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_zextload_v16i16_to_v16i64: @@ -7523,70 +7519,74 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v20, s10 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[5:8] -; GCN-HSA-NEXT: v_mov_b32_e32 v10, 0 +; GCN-HSA-NEXT: v_and_b32_e32 v3, s18, v16 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v15 +; GCN-HSA-NEXT: v_and_b32_e32 v5, s18, v15 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2 +; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[5:8] +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xe0 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v13 ; GCN-HSA-NEXT: v_and_b32_e32 v5, s18, v13 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s1 -; GCN-HSA-NEXT: s_add_u32 s6, s0, 0xe0 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, v10 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s0 -; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[5:8] -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s6 -; GCN-HSA-NEXT: s_add_u32 s6, s0, 0xc0 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xc0 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, 0 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v19 ; GCN-HSA-NEXT: v_and_b32_e32 v5, s18, v19 -; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[5:8] -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s6 -; GCN-HSA-NEXT: s_add_u32 s6, s0, 0xa0 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xa0 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v17 ; GCN-HSA-NEXT: v_and_b32_e32 v5, s18, v17 -; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, 0 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[5:8] -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v0 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v9 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v11 ; GCN-HSA-NEXT: v_and_b32_e32 v5, s18, v11 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s6 -; GCN-HSA-NEXT: s_add_u32 s6, s0, 0x80 -; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[11:12], v[5:8] -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s7 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v9 -; GCN-HSA-NEXT: v_and_b32_e32 v5, s18, v9 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s6 -; GCN-HSA-NEXT: flat_store_dwordx4 v[11:12], v[5:8] -; GCN-HSA-NEXT: v_and_b32_e32 v11, s18, v0 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s2 +; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[5:8] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v10, 16, v0 +; GCN-HSA-NEXT: v_and_b32_e32 v8, s18, v0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v16 -; GCN-HSA-NEXT: v_and_b32_e32 v3, s18, v16 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[3:6] +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v15 -; GCN-HSA-NEXT: v_and_b32_e32 v7, s18, v15 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, v4 +; GCN-HSA-NEXT: v_and_b32_e32 v17, s18, v9 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, v4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[7:10] +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[17:20] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v2 -; GCN-HSA-NEXT: v_and_b32_e32 v17, s18, v2 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, v4 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, v10 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 16, v2 +; GCN-HSA-NEXT: v_and_b32_e32 v12, s18, v2 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, v4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 64 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[17:20] +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[12:15] ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, v4 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, v10 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, v4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[11:14] +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_zextload_v32i16_to_v32i64: diff --git a/llvm/test/CodeGen/AMDGPU/madak.ll b/llvm/test/CodeGen/AMDGPU/madak.ll --- a/llvm/test/CodeGen/AMDGPU/madak.ll +++ b/llvm/test/CodeGen/AMDGPU/madak.ll @@ -41,7 +41,6 @@ ; GCN-LABEL: {{^}}madak_2_use_f32: ; GFX9: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000 -; GFX10: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000 ; GFX6-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}} ; GFX6-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; GFX6-DAG: buffer_load_dword [[VC:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 @@ -54,7 +53,8 @@ ; GFX10-MAD-DAG:v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000 ; FMA-DAG: v_fmaak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000 ; MAD-DAG: v_mac_f32_e32 [[VK]], [[VA]], [[VC]] -; FMA-DAG: v_fmac_f32_e32 [[VK]], [[VA]], [[VC]] +; GFX10-FMA-DAG:v_fmaak_f32 {{v[0-9]+}}, [[VA]], [[VC]], 0x41200000 +; GFX940-FMA-DAG:v_fmac_f32_e32 [[VK]], [[VA]], [[VC]] ; GCN: s_endpgm define amdgpu_kernel void @madak_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #0 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone diff --git a/llvm/test/CodeGen/AMDGPU/max.ll b/llvm/test/CodeGen/AMDGPU/max.ll --- a/llvm/test/CodeGen/AMDGPU/max.ll +++ b/llvm/test/CodeGen/AMDGPU/max.ll @@ -228,7 +228,9 @@ ; FUNC-LABEL: {{^}}simplify_demanded_bits_test_umax_ugt_i16: ; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0x13 ; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0x1c -; SI: s_max_u32 [[MAX:s[0-9]+]], [[A]], [[B]] +; SI-DAG: s_and_b32 [[A16:s[0-9]+]], [[A]], 0xffff +; SI-DAG: s_and_b32 [[B16:s[0-9]+]], [[B]], 0xffff +; SI: s_max_u32 [[MAX:s[0-9]+]], [[A16]], [[B16]] ; SI: v_mov_b32_e32 [[VMAX:v[0-9]+]], [[MAX]] ; SI: buffer_store_dword [[VMAX]] diff --git a/llvm/test/CodeGen/AMDGPU/mul.ll b/llvm/test/CodeGen/AMDGPU/mul.ll --- a/llvm/test/CodeGen/AMDGPU/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/mul.ll @@ -73,7 +73,7 @@ ; FUNC-LABEL: {{^}}mul64_sext_c: ; EG-DAG: MULLO_INT ; EG-DAG: MULHI_INT -; SI-DAG: s_mul_i32 +; SI-DAG: s_mulk_i32 ; SI-DAG: v_mul_hi_i32 ; VI: v_mad_i64_i32 define amdgpu_kernel void @mul64_sext_c(i64 addrspace(1)* %out, i32 %in) { diff --git a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll --- a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll +++ b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll @@ -12,11 +12,10 @@ ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0xffffff ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s4, s4, s2 -; SI-NEXT: s_and_b32 s2, s5, s2 -; SI-NEXT: s_mul_i32 s4, s4, s2 +; SI-NEXT: s_and_b32 s2, s4, 0xffffff +; SI-NEXT: s_and_b32 s4, s5, 0xffffff +; SI-NEXT: s_mul_i32 s4, s2, s4 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -26,12 +25,11 @@ ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s6, 0xffffff ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s4, s4, s6 -; VI-NEXT: s_and_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffffff +; VI-NEXT: s_and_b32 s5, s5, 0xffffff ; VI-NEXT: s_mul_i32 s4, s4, s5 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -41,13 +39,12 @@ ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s0, 0xffffff ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s1, s2, s0 -; GFX9-NEXT: s_and_b32 s0, s3, s0 -; GFX9-NEXT: s_mul_i32 s0, s1, s0 +; GFX9-NEXT: s_and_b32 s0, s2, 0xffffff +; GFX9-NEXT: s_and_b32 s1, s3, 0xffffff +; GFX9-NEXT: s_mul_i32 s0, s0, s1 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm @@ -405,13 +402,12 @@ ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s0, 0xffffff ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s1, s2, s0 -; GFX9-NEXT: s_and_b32 s0, s3, s0 -; GFX9-NEXT: s_mul_hi_u32 s0, s1, s0 +; GFX9-NEXT: s_and_b32 s0, s2, 0xffffff +; GFX9-NEXT: s_and_b32 s1, s3, 0xffffff +; GFX9-NEXT: s_mul_hi_u32 s0, s0, s1 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm @@ -466,12 +462,11 @@ ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s4, 0xffffff ; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: s_and_b32 s5, s6, s4 +; GFX9-NEXT: s_and_b32 s4, s6, 0xffffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s4, s7, s4 -; GFX9-NEXT: s_mul_hi_u32 s4, s5, s4 +; GFX9-NEXT: s_and_b32 s5, s7, 0xffffff +; GFX9-NEXT: s_mul_hi_u32 s4, s4, s5 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm @@ -494,12 +489,11 @@ ; SI-NEXT: s_load_dword s7, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s8, 0xffffff ; SI-NEXT: s_mov_b32 s0, s4 ; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_and_b32 s4, s6, s8 +; SI-NEXT: s_and_b32 s4, s6, 0xffffff ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s5, s7, s8 +; SI-NEXT: s_and_b32 s5, s7, 0xffffff ; SI-NEXT: v_mov_b32_e32 v0, s7 ; SI-NEXT: s_mul_i32 s4, s4, s5 ; SI-NEXT: v_mul_hi_u32_u24_e32 v1, s6, v0 @@ -531,14 +525,13 @@ ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s4, 0xffffff ; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: s_and_b32 s5, s6, s4 +; GFX9-NEXT: s_and_b32 s4, s6, 0xffffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s4, s7, s4 -; GFX9-NEXT: s_mul_hi_u32 s6, s5, s4 -; GFX9-NEXT: s_mul_i32 s5, s5, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: s_and_b32 s5, s7, 0xffffff +; GFX9-NEXT: s_mul_hi_u32 s6, s4, s5 +; GFX9-NEXT: s_mul_i32 s4, s4, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm @@ -638,12 +631,11 @@ ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0xffff ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s4, s4, s2 -; SI-NEXT: s_and_b32 s2, s5, s2 -; SI-NEXT: s_mul_i32 s4, s4, s2 -; SI-NEXT: s_lshr_b32 s4, s4, 16 +; SI-NEXT: s_and_b32 s2, s4, 0xffff +; SI-NEXT: s_and_b32 s4, s5, 0xffff +; SI-NEXT: s_mul_i32 s2, s2, s4 +; SI-NEXT: s_lshr_b32 s4, s2, 16 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 @@ -653,12 +645,11 @@ ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s6, 0xffff ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s4, s4, s6 -; VI-NEXT: s_and_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_mul_i32 s4, s4, s5 ; VI-NEXT: s_lshr_b32 s4, s4, 16 ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -669,12 +660,11 @@ ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s0, 0xffff ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s1, s2, s0 -; GFX9-NEXT: s_and_b32 s0, s3, s0 -; GFX9-NEXT: s_mul_i32 s0, s1, s0 +; GFX9-NEXT: s_and_b32 s0, s2, 0xffff +; GFX9-NEXT: s_and_b32 s1, s3, 0xffff +; GFX9-NEXT: s_mul_i32 s0, s0, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_store_short_d16_hi v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm @@ -696,15 +686,14 @@ ; SI-NEXT: s_load_dword s0, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s1, 0xffffff ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s3, s2, s1 -; SI-NEXT: s_and_b32 s1, s0, s1 +; SI-NEXT: s_and_b32 s1, s2, 0xffffff +; SI-NEXT: s_and_b32 s3, s0, 0xffffff ; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: v_mul_hi_u32_u24_e32 v0, s2, v0 -; SI-NEXT: s_mul_i32 s3, s3, s1 +; SI-NEXT: s_mul_i32 s1, s1, s3 ; SI-NEXT: v_and_b32_e32 v1, 1, v0 -; SI-NEXT: v_mov_b32_e32 v0, s3 +; SI-NEXT: v_mov_b32_e32 v0, s1 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -725,17 +714,16 @@ ; ; GFX9-LABEL: test_umul24_i33: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX9-NEXT: s_load_dword s3, s[0:1], 0x34 -; GFX9-NEXT: s_mov_b32 s0, 0xffffff +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s1, s2, s0 -; GFX9-NEXT: s_and_b32 s0, s3, s0 -; GFX9-NEXT: s_mul_i32 s2, s1, s0 -; GFX9-NEXT: s_mul_hi_u32 s0, s1, s0 +; GFX9-NEXT: s_and_b32 s0, s2, 0xffffff +; GFX9-NEXT: s_and_b32 s1, s3, 0xffffff +; GFX9-NEXT: s_mul_i32 s2, s0, s1 +; GFX9-NEXT: s_mul_hi_u32 s0, s0, s1 ; GFX9-NEXT: s_and_b32 s0, s0, 1 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 @@ -783,16 +771,15 @@ ; ; GFX9-LABEL: test_umulhi24_i33: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX9-NEXT: s_load_dword s3, s[0:1], 0x34 -; GFX9-NEXT: s_mov_b32 s0, 0xffffff +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s1, s2, s0 -; GFX9-NEXT: s_and_b32 s0, s3, s0 -; GFX9-NEXT: s_mul_hi_u32 s0, s1, s0 +; GFX9-NEXT: s_and_b32 s0, s2, 0xffffff +; GFX9-NEXT: s_and_b32 s1, s3, 0xffffff +; GFX9-NEXT: s_mul_hi_u32 s0, s0, s1 ; GFX9-NEXT: s_and_b32 s0, s0, 1 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 diff --git a/llvm/test/CodeGen/AMDGPU/or.ll b/llvm/test/CodeGen/AMDGPU/or.ll --- a/llvm/test/CodeGen/AMDGPU/or.ll +++ b/llvm/test/CodeGen/AMDGPU/or.ll @@ -80,8 +80,8 @@ ; SI-DAG: s_movk_i32 s[[K_LO:[0-9]+]], 0x3039 ; SI: s_or_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s[[[K_LO]]:[[K_HI]]] -; SI: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, s[[K_LO]] -; SI: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, s[[K_HI]] +; SI: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x3039 +; SI: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0xf237b define amdgpu_kernel void @scalar_or_literal_multi_use_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a, [8 x i32], i64 %b) { %or = or i64 %a, 4261135838621753 store i64 %or, i64 addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll --- a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll +++ b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll @@ -565,8 +565,7 @@ } ; GCN-LABEL: {{^}}fneg_v2f32_scalar: -; GCN: s_brev_b32 [[SIGN:s[0-9]+]], 1 -; GCN-COUNT-2: s_xor_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[SIGN]] +; GCN-COUNT-2: s_xor_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000 define amdgpu_kernel void @fneg_v2f32_scalar(<2 x float> addrspace(1)* %a, <2 x float> %x) { %fneg = fsub <2 x float> , %x store <2 x float> %fneg, <2 x float> addrspace(1)* %a, align 8 diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll --- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll @@ -1715,22 +1715,28 @@ ; GFX8-NEXT: v_mov_b32_e32 v3, s39 ; GFX8-NEXT: v_add_u32_e32 v12, vcc, s38, v2 ; GFX8-NEXT: v_addc_u32_e32 v13, vcc, 0, v3, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x1000, v0 +; GFX8-NEXT: s_movk_i32 s0, 0x1000 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v0 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x1800, v0 +; GFX8-NEXT: s_movk_i32 s0, 0x1800 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, s0, v0 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x2000, v0 +; GFX8-NEXT: s_movk_i32 s0, 0x2000 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, s0, v0 ; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x2800, v12 +; GFX8-NEXT: s_movk_i32 s0, 0x2800 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, s0, v12 ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[2:3] ; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[4:5] ; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[6:7] ; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v13, vcc -; GFX8-NEXT: v_add_u32_e32 v10, vcc, 0x3000, v12 +; GFX8-NEXT: s_movk_i32 s0, 0x3000 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, s0, v12 ; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v13, vcc +; GFX8-NEXT: s_movk_i32 s0, 0x3800 ; GFX8-NEXT: flat_load_dwordx2 v[8:9], v[8:9] ; GFX8-NEXT: flat_load_dwordx2 v[10:11], v[10:11] -; GFX8-NEXT: v_add_u32_e32 v12, vcc, 0x3800, v12 +; GFX8-NEXT: v_add_u32_e32 v12, vcc, s0, v12 ; GFX8-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc ; GFX8-NEXT: flat_load_dwordx2 v[12:13], v[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(4) @@ -1780,16 +1786,14 @@ ; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 0x1000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc -; GFX9-NEXT: s_movk_i32 s0, 0x2000 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s0, v0 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 0x2000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX9-NEXT: global_load_dwordx2 v[6:7], v[2:3], off ; GFX9-NEXT: global_load_dwordx2 v[8:9], v[2:3], off offset:2048 ; GFX9-NEXT: global_load_dwordx2 v[10:11], v[4:5], off -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v12 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 0x2000, v12 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v13, vcc -; GFX9-NEXT: s_movk_i32 s0, 0x3000 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s0, v12 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 0x3000, v12 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v13, vcc ; GFX9-NEXT: global_load_dwordx2 v[12:13], v[2:3], off offset:2048 ; GFX9-NEXT: global_load_dwordx2 v[14:15], v[4:5], off diff --git a/llvm/test/CodeGen/AMDGPU/s_addk_i32.ll b/llvm/test/CodeGen/AMDGPU/s_addk_i32.ll --- a/llvm/test/CodeGen/AMDGPU/s_addk_i32.ll +++ b/llvm/test/CodeGen/AMDGPU/s_addk_i32.ll @@ -16,11 +16,9 @@ ret void } -; FIXME: This should be folded with any number of uses. ; SI-LABEL: {{^}}s_addk_i32_k0_x2: -; SI: s_movk_i32 [[K:s[0-9]+]], 0x41 -; SI-DAG: s_add_i32 {{s[0-9]+}}, {{s[0-9]+}}, [[K]] -; SI-DAG: s_add_i32 {{s[0-9]+}}, {{s[0-9]+}}, [[K]] +; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x41 +; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x41 ; SI: s_endpgm define amdgpu_kernel void @s_addk_i32_k0_x2(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %a, i32 %b) { %add0 = add i32 %a, 65 diff --git a/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll b/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll --- a/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll +++ b/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll @@ -483,10 +483,9 @@ } ; GCN-LABEL: {{^}}phi_imm_in_sgprs -; GCN: s_movk_i32 [[A:s[0-9]+]], 0x400 ; GCN: s_movk_i32 [[B:s[0-9]+]], 0x400 ; GCN: [[LOOP_LABEL:.L[0-9a-zA-Z_]+]]: -; GCN: s_xor_b32 [[B]], [[B]], [[A]] +; GCN: s_xor_b32 [[B]], [[B]], 0x400 ; GCN: s_cbranch_scc{{[01]}} [[LOOP_LABEL]] define amdgpu_kernel void @phi_imm_in_sgprs(i32 addrspace(3)* %out, i32 %cond) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/scratch-buffer.ll b/llvm/test/CodeGen/AMDGPU/scratch-buffer.ll --- a/llvm/test/CodeGen/AMDGPU/scratch-buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/scratch-buffer.ll @@ -48,9 +48,8 @@ ; GCN-LABEL: {{^}}legal_offset_fi_offset: ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offen{{$}} -; This constant isn't folded, because it has multiple uses. ; GCN-DAG: v_mov_b32_e32 [[K8000:v[0-9]+]], 0x8004 -; GCN-DAG: v_add_{{[iu]}}32_e32 [[OFFSET:v[0-9]+]], vcc, [[K8000]] +; GCN-DAG: v_add_{{[iu]}}32_e32 [[OFFSET:v[0-9]+]], vcc, 0x8004 ; GCN: buffer_store_dword v{{[0-9]+}}, [[OFFSET]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offen{{$}} define amdgpu_kernel void @legal_offset_fi_offset(i32 addrspace(1)* %out, i32 %cond, i32 addrspace(1)* %offsets, i32 %if_offset, i32 %else_offset) { diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll --- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll @@ -21,79 +21,78 @@ ; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_ashr_i32 s12, s3, 31 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; GCN-NEXT: v_mul_f32_e32 v2, 0x2f800000, v0 -; GCN-NEXT: v_trunc_f32_e32 v2, v2 -; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v2 -; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; GCN-NEXT: v_trunc_f32_e32 v1, v1 +; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 +; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: s_add_u32 s2, s2, s12 ; GCN-NEXT: s_mov_b32 s13, s12 -; GCN-NEXT: v_mul_lo_u32 v3, s4, v2 -; GCN-NEXT: v_mul_hi_u32 v4, s4, v0 -; GCN-NEXT: v_mul_lo_u32 v6, s5, v0 -; GCN-NEXT: v_mul_lo_u32 v5, s4, v0 +; GCN-NEXT: v_mul_lo_u32 v2, s4, v1 +; GCN-NEXT: v_mul_hi_u32 v3, s4, v0 +; GCN-NEXT: v_mul_lo_u32 v5, s5, v0 +; GCN-NEXT: v_mul_lo_u32 v4, s4, v0 ; GCN-NEXT: s_addc_u32 s3, s3, s12 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; GCN-NEXT: v_mul_hi_u32 v3, v0, v4 +; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v7, v0, v2 +; GCN-NEXT: v_mul_lo_u32 v6, v1, v4 +; GCN-NEXT: v_mul_hi_u32 v4, v1, v4 +; GCN-NEXT: v_mul_hi_u32 v8, v1, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GCN-NEXT: v_mul_hi_u32 v4, v0, v5 -; GCN-NEXT: v_mul_lo_u32 v6, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v8, v0, v3 -; GCN-NEXT: v_mul_lo_u32 v7, v2, v5 -; GCN-NEXT: v_mul_hi_u32 v5, v2, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v8, vcc -; GCN-NEXT: v_mul_hi_u32 v8, v2, v3 -; GCN-NEXT: v_mul_lo_u32 v3, v2, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v6, v5, vcc -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v8, v1, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc -; GCN-NEXT: v_mul_lo_u32 v3, s4, v2 -; GCN-NEXT: v_mul_hi_u32 v4, s4, v0 -; GCN-NEXT: v_mul_lo_u32 v5, s5, v0 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc +; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GCN-NEXT: v_mul_lo_u32 v2, s4, v1 +; GCN-NEXT: v_mul_hi_u32 v3, s4, v0 +; GCN-NEXT: v_mul_lo_u32 v4, s5, v0 ; GCN-NEXT: s_xor_b64 s[2:3], s[2:3], s[12:13] ; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_mul_lo_u32 v4, s4, v0 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GCN-NEXT: v_mul_lo_u32 v7, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v8, v0, v4 -; GCN-NEXT: v_mul_hi_u32 v9, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v6, v2, v4 -; GCN-NEXT: v_mul_lo_u32 v4, v2, v4 -; GCN-NEXT: v_mul_hi_u32 v5, v2, v3 -; GCN-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v9, vcc -; GCN-NEXT: v_mul_lo_u32 v3, v2, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v8, v6, vcc -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v5, v1, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc -; GCN-NEXT: v_mul_lo_u32 v3, s2, v2 -; GCN-NEXT: v_mul_hi_u32 v4, s2, v0 -; GCN-NEXT: v_mul_hi_u32 v5, s2, v2 -; GCN-NEXT: v_mul_hi_u32 v6, s3, v2 -; GCN-NEXT: v_mul_lo_u32 v2, s3, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc -; GCN-NEXT: v_mul_lo_u32 v5, s3, v0 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_mul_lo_u32 v3, s4, v0 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GCN-NEXT: v_mul_lo_u32 v6, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v7, v0, v3 +; GCN-NEXT: v_mul_hi_u32 v8, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v5, v1, v3 +; GCN-NEXT: v_mul_lo_u32 v3, v1, v3 +; GCN-NEXT: v_mul_hi_u32 v4, v1, v2 +; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc +; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v6, v3 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc +; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GCN-NEXT: v_mul_lo_u32 v2, s2, v1 +; GCN-NEXT: v_mul_hi_u32 v3, s2, v0 +; GCN-NEXT: v_mul_hi_u32 v4, s2, v1 +; GCN-NEXT: v_mul_hi_u32 v5, s3, v1 +; GCN-NEXT: v_mul_lo_u32 v1, s3, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GCN-NEXT: v_mul_lo_u32 v4, s3, v0 ; GCN-NEXT: v_mul_hi_u32 v0, s3, v0 ; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GCN-NEXT: v_addc_u32_e32 v0, vcc, v4, v0, vcc -; GCN-NEXT: v_addc_u32_e32 v1, vcc, v6, v1, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc +; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc ; GCN-NEXT: v_mul_lo_u32 v2, s10, v1 ; GCN-NEXT: v_mul_hi_u32 v3, s10, v0 ; GCN-NEXT: v_mul_lo_u32 v4, s11, v0 @@ -186,12 +185,11 @@ ; GCN-IR-NEXT: s_cbranch_vccz .LBB0_4 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: s_lshr_b64 s[16:17], s[12:13], s16 -; GCN-IR-NEXT: s_add_u32 s20, s6, -1 -; GCN-IR-NEXT: s_addc_u32 s21, s7, -1 +; GCN-IR-NEXT: s_add_u32 s19, s6, -1 +; GCN-IR-NEXT: s_addc_u32 s20, s7, -1 ; GCN-IR-NEXT: s_not_b64 s[8:9], s[14:15] ; GCN-IR-NEXT: s_add_u32 s12, s8, s18 -; GCN-IR-NEXT: s_mov_b32 s19, s15 -; GCN-IR-NEXT: s_addc_u32 s13, s9, s15 +; GCN-IR-NEXT: s_addc_u32 s13, s9, 0 ; GCN-IR-NEXT: s_mov_b64 s[14:15], 0 ; GCN-IR-NEXT: s_mov_b32 s9, 0 ; GCN-IR-NEXT: .LBB0_3: ; %udiv-do-while @@ -201,8 +199,8 @@ ; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[10:11], 1 ; GCN-IR-NEXT: s_or_b64 s[16:17], s[16:17], s[8:9] ; GCN-IR-NEXT: s_or_b64 s[10:11], s[14:15], s[10:11] -; GCN-IR-NEXT: s_sub_u32 s8, s20, s16 -; GCN-IR-NEXT: s_subb_u32 s8, s21, s17 +; GCN-IR-NEXT: s_sub_u32 s8, s19, s16 +; GCN-IR-NEXT: s_subb_u32 s8, s20, s17 ; GCN-IR-NEXT: s_ashr_i32 s14, s8, 31 ; GCN-IR-NEXT: s_mov_b32 s15, s14 ; GCN-IR-NEXT: s_and_b32 s8, s14, 1 @@ -211,9 +209,9 @@ ; GCN-IR-NEXT: s_subb_u32 s17, s17, s15 ; GCN-IR-NEXT: s_add_u32 s12, s12, 1 ; GCN-IR-NEXT: s_addc_u32 s13, s13, 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[18:19], s[12:13], 0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[22:23], s[12:13], 0 ; GCN-IR-NEXT: s_mov_b64 s[14:15], s[8:9] -; GCN-IR-NEXT: s_and_b64 vcc, exec, s[18:19] +; GCN-IR-NEXT: s_and_b64 vcc, exec, s[22:23] ; GCN-IR-NEXT: s_cbranch_vccz .LBB0_3 ; GCN-IR-NEXT: .LBB0_4: ; %Flow6 ; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[10:11], 1 @@ -257,7 +255,6 @@ ; GCN-NEXT: v_subb_u32_e32 v8, vcc, 0, v3, vcc ; GCN-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 ; GCN-NEXT: v_rcp_f32_e32 v5, v5 -; GCN-NEXT: v_mov_b32_e32 v14, 0 ; GCN-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 ; GCN-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5 ; GCN-NEXT: v_trunc_f32_e32 v6, v6 @@ -273,7 +270,7 @@ ; GCN-NEXT: v_mul_lo_u32 v11, v5, v9 ; GCN-NEXT: v_mul_hi_u32 v12, v5, v10 ; GCN-NEXT: v_mul_hi_u32 v13, v5, v9 -; GCN-NEXT: v_mul_hi_u32 v15, v6, v9 +; GCN-NEXT: v_mul_hi_u32 v14, v6, v9 ; GCN-NEXT: v_mul_lo_u32 v9, v6, v9 ; GCN-NEXT: v_add_i32_e32 v11, vcc, v12, v11 ; GCN-NEXT: v_addc_u32_e32 v12, vcc, 0, v13, vcc @@ -281,7 +278,7 @@ ; GCN-NEXT: v_mul_hi_u32 v10, v6, v10 ; GCN-NEXT: v_add_i32_e32 v11, vcc, v11, v13 ; GCN-NEXT: v_addc_u32_e32 v10, vcc, v12, v10, vcc -; GCN-NEXT: v_addc_u32_e32 v11, vcc, v15, v14, vcc +; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v14, vcc ; GCN-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v11, vcc ; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v9 @@ -303,7 +300,7 @@ ; GCN-NEXT: v_mul_lo_u32 v8, v6, v8 ; GCN-NEXT: v_add_i32_e32 v7, vcc, v11, v7 ; GCN-NEXT: v_addc_u32_e32 v7, vcc, v12, v10, vcc -; GCN-NEXT: v_addc_u32_e32 v9, vcc, v9, v14, vcc +; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc ; GCN-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v9, vcc ; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v7 @@ -324,7 +321,7 @@ ; GCN-NEXT: v_mul_lo_u32 v6, v1, v6 ; GCN-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; GCN-NEXT: v_addc_u32_e32 v5, vcc, v9, v5, vcc -; GCN-NEXT: v_addc_u32_e32 v8, vcc, v11, v14, vcc +; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v11, vcc ; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v8, vcc ; GCN-NEXT: v_mul_lo_u32 v8, v2, v6 @@ -398,12 +395,10 @@ ; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[4:5], 63, v[7:8] ; GCN-IR-NEXT: s_or_b64 s[6:7], s[6:7], vcc ; GCN-IR-NEXT: s_xor_b64 s[8:9], s[6:7], -1 -; GCN-IR-NEXT: v_mov_b32_e32 v17, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v6, v4 ; GCN-IR-NEXT: v_mov_b32_e32 v1, v5 ; GCN-IR-NEXT: v_cndmask_b32_e64 v10, v12, 0, s[6:7] ; GCN-IR-NEXT: s_and_b64 s[4:5], s[8:9], s[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v16, v17 ; GCN-IR-NEXT: v_cndmask_b32_e64 v9, v11, 0, s[6:7] ; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GCN-IR-NEXT: s_cbranch_execz .LBB1_6 @@ -423,10 +418,10 @@ ; GCN-IR-NEXT: v_addc_u32_e32 v19, vcc, -1, v3, vcc ; GCN-IR-NEXT: v_not_b32_e32 v0, v0 ; GCN-IR-NEXT: v_lshr_b64 v[14:15], v[11:12], v14 -; GCN-IR-NEXT: v_not_b32_e32 v9, v17 +; GCN-IR-NEXT: v_not_b32_e32 v9, 0 ; GCN-IR-NEXT: v_add_i32_e32 v11, vcc, v0, v13 -; GCN-IR-NEXT: v_addc_u32_e32 v12, vcc, v9, v16, vcc ; GCN-IR-NEXT: v_mov_b32_e32 v16, 0 +; GCN-IR-NEXT: v_addc_u32_e32 v12, vcc, 0, v9, vcc ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: v_mov_b32_e32 v17, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 @@ -1034,12 +1029,11 @@ ; GCN-IR-NEXT: s_cbranch_vccz .LBB9_4 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: s_lshr_b64 s[16:17], s[12:13], s16 -; GCN-IR-NEXT: s_add_u32 s20, s6, -1 -; GCN-IR-NEXT: s_addc_u32 s21, s7, -1 +; GCN-IR-NEXT: s_add_u32 s19, s6, -1 +; GCN-IR-NEXT: s_addc_u32 s20, s7, -1 ; GCN-IR-NEXT: s_not_b64 s[8:9], s[14:15] ; GCN-IR-NEXT: s_add_u32 s12, s8, s18 -; GCN-IR-NEXT: s_mov_b32 s19, s15 -; GCN-IR-NEXT: s_addc_u32 s13, s9, s15 +; GCN-IR-NEXT: s_addc_u32 s13, s9, 0 ; GCN-IR-NEXT: s_mov_b64 s[14:15], 0 ; GCN-IR-NEXT: s_mov_b32 s9, 0 ; GCN-IR-NEXT: .LBB9_3: ; %udiv-do-while @@ -1049,8 +1043,8 @@ ; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[10:11], 1 ; GCN-IR-NEXT: s_or_b64 s[16:17], s[16:17], s[8:9] ; GCN-IR-NEXT: s_or_b64 s[10:11], s[14:15], s[10:11] -; GCN-IR-NEXT: s_sub_u32 s8, s20, s16 -; GCN-IR-NEXT: s_subb_u32 s8, s21, s17 +; GCN-IR-NEXT: s_sub_u32 s8, s19, s16 +; GCN-IR-NEXT: s_subb_u32 s8, s20, s17 ; GCN-IR-NEXT: s_ashr_i32 s14, s8, 31 ; GCN-IR-NEXT: s_mov_b32 s15, s14 ; GCN-IR-NEXT: s_and_b32 s8, s14, 1 @@ -1059,9 +1053,9 @@ ; GCN-IR-NEXT: s_subb_u32 s17, s17, s15 ; GCN-IR-NEXT: s_add_u32 s12, s12, 1 ; GCN-IR-NEXT: s_addc_u32 s13, s13, 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[18:19], s[12:13], 0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[22:23], s[12:13], 0 ; GCN-IR-NEXT: s_mov_b64 s[14:15], s[8:9] -; GCN-IR-NEXT: s_and_b64 vcc, exec, s[18:19] +; GCN-IR-NEXT: s_and_b64 vcc, exec, s[22:23] ; GCN-IR-NEXT: s_cbranch_vccz .LBB9_3 ; GCN-IR-NEXT: .LBB9_4: ; %Flow3 ; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[10:11], 1 @@ -1111,58 +1105,57 @@ ; GCN-NEXT: s_subb_u32 s5, 0, s3 ; GCN-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; GCN-NEXT: v_mul_f32_e32 v2, 0x2f800000, v0 -; GCN-NEXT: v_trunc_f32_e32 v2, v2 -; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v2 -; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; GCN-NEXT: v_trunc_f32_e32 v1, v1 +; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 +; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_lo_u32 v3, s4, v2 -; GCN-NEXT: v_mul_hi_u32 v4, s4, v0 -; GCN-NEXT: v_mul_lo_u32 v6, s5, v0 -; GCN-NEXT: v_mul_lo_u32 v5, s4, v0 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GCN-NEXT: v_mul_hi_u32 v4, v0, v5 -; GCN-NEXT: v_mul_lo_u32 v6, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v8, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v7, v2, v5 -; GCN-NEXT: v_mul_lo_u32 v5, v2, v5 -; GCN-NEXT: v_mul_hi_u32 v9, v2, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v8, vcc -; GCN-NEXT: v_mul_lo_u32 v3, v2, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v6, v7, vcc -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v9, v1, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc -; GCN-NEXT: v_mul_lo_u32 v3, s4, v2 -; GCN-NEXT: v_mul_hi_u32 v4, s4, v0 +; GCN-NEXT: v_mul_lo_u32 v2, s4, v1 +; GCN-NEXT: v_mul_hi_u32 v3, s4, v0 ; GCN-NEXT: v_mul_lo_u32 v5, s5, v0 -; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; GCN-NEXT: v_mul_lo_u32 v4, s4, v0 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GCN-NEXT: v_mul_lo_u32 v7, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v8, v0, v4 -; GCN-NEXT: v_mul_hi_u32 v9, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v6, v2, v4 -; GCN-NEXT: v_mul_lo_u32 v4, v2, v4 -; GCN-NEXT: v_mul_hi_u32 v5, v2, v3 -; GCN-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v9, vcc -; GCN-NEXT: v_mul_lo_u32 v3, v2, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v8, v6, vcc -; GCN-NEXT: v_addc_u32_e32 v1, vcc, v5, v1, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; GCN-NEXT: v_mul_hi_u32 v3, v0, v4 +; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v7, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v6, v1, v4 +; GCN-NEXT: v_mul_lo_u32 v4, v1, v4 +; GCN-NEXT: v_mul_hi_u32 v8, v1, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v6, vcc +; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GCN-NEXT: v_mul_lo_u32 v2, s4, v1 +; GCN-NEXT: v_mul_hi_u32 v3, s4, v0 +; GCN-NEXT: v_mul_lo_u32 v4, s5, v0 +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_mul_lo_u32 v3, s4, v0 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GCN-NEXT: v_mul_lo_u32 v6, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v7, v0, v3 +; GCN-NEXT: v_mul_hi_u32 v8, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v5, v1, v3 +; GCN-NEXT: v_mul_lo_u32 v3, v1, v3 +; GCN-NEXT: v_mul_hi_u32 v4, v1, v2 +; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc +; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v6, v3 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc +; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc ; GCN-NEXT: v_mul_lo_u32 v2, v1, 24 ; GCN-NEXT: v_mul_hi_u32 v0, v0, 24 ; GCN-NEXT: v_mul_hi_u32 v1, v1, 24 @@ -1310,7 +1303,6 @@ ; GCN-NEXT: v_subb_u32_e32 v6, vcc, 0, v1, vcc ; GCN-NEXT: v_mac_f32_e32 v3, 0x4f800000, v4 ; GCN-NEXT: v_rcp_f32_e32 v3, v3 -; GCN-NEXT: v_mov_b32_e32 v12, 0 ; GCN-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 ; GCN-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 ; GCN-NEXT: v_trunc_f32_e32 v4, v4 @@ -1326,7 +1318,7 @@ ; GCN-NEXT: v_mul_lo_u32 v9, v3, v7 ; GCN-NEXT: v_mul_hi_u32 v10, v3, v8 ; GCN-NEXT: v_mul_hi_u32 v11, v3, v7 -; GCN-NEXT: v_mul_hi_u32 v13, v4, v7 +; GCN-NEXT: v_mul_hi_u32 v12, v4, v7 ; GCN-NEXT: v_mul_lo_u32 v7, v4, v7 ; GCN-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v11, vcc @@ -1334,7 +1326,7 @@ ; GCN-NEXT: v_mul_hi_u32 v8, v4, v8 ; GCN-NEXT: v_add_i32_e32 v9, vcc, v9, v11 ; GCN-NEXT: v_addc_u32_e32 v8, vcc, v10, v8, vcc -; GCN-NEXT: v_addc_u32_e32 v9, vcc, v13, v12, vcc +; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v12, vcc ; GCN-NEXT: v_add_i32_e32 v7, vcc, v8, v7 ; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v9, vcc ; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v7 @@ -1356,7 +1348,7 @@ ; GCN-NEXT: v_mul_lo_u32 v6, v4, v6 ; GCN-NEXT: v_add_i32_e32 v5, vcc, v9, v5 ; GCN-NEXT: v_addc_u32_e32 v5, vcc, v10, v8, vcc -; GCN-NEXT: v_addc_u32_e32 v7, vcc, v7, v12, vcc +; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc ; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v7, vcc ; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 @@ -1417,25 +1409,24 @@ ; GCN-IR-NEXT: v_ffbh_u32_e32 v5, v1 ; GCN-IR-NEXT: v_min_u32_e32 v8, v4, v5 ; GCN-IR-NEXT: s_movk_i32 s6, 0xffc5 -; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, s6, v8 -; GCN-IR-NEXT: v_addc_u32_e64 v5, s[6:7], 0, -1, vcc +; GCN-IR-NEXT: v_add_i32_e32 v5, vcc, s6, v8 +; GCN-IR-NEXT: v_addc_u32_e64 v6, s[6:7], 0, -1, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[5:6] +; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[6:7], 63, v[5:6] ; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[4:5] -; GCN-IR-NEXT: v_cndmask_b32_e64 v6, 24, 0, s[4:5] +; GCN-IR-NEXT: v_cndmask_b32_e64 v7, 24, 0, s[4:5] ; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1 ; GCN-IR-NEXT: v_mov_b32_e32 v3, v2 -; GCN-IR-NEXT: v_mov_b32_e32 v7, v9 -; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], vcc +; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 +; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GCN-IR-NEXT: s_cbranch_execz .LBB11_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, 1, v4 -; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, 0, v5, vcc -; GCN-IR-NEXT: v_sub_i32_e64 v4, s[4:5], 63, v4 -; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] +; GCN-IR-NEXT: v_add_i32_e32 v9, vcc, 1, v5 +; GCN-IR-NEXT: v_addc_u32_e32 v10, vcc, 0, v6, vcc +; GCN-IR-NEXT: v_sub_i32_e64 v4, s[4:5], 63, v5 +; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[9:10] ; GCN-IR-NEXT: v_lshl_b64 v[4:5], 24, v4 ; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 @@ -1445,10 +1436,10 @@ ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, -1, v0 ; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, -1, v1, vcc -; GCN-IR-NEXT: v_lshr_b64 v[10:11], 24, v10 +; GCN-IR-NEXT: v_lshr_b64 v[10:11], 24, v9 ; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, 58, v8 ; GCN-IR-NEXT: v_mov_b32_e32 v12, 0 -; GCN-IR-NEXT: v_subb_u32_e32 v9, vcc, 0, v9, vcc +; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], 0, 0, vcc ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: v_mov_b32_e32 v13, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 @@ -1481,12 +1472,12 @@ ; GCN-IR-NEXT: .LBB11_5: ; %Flow3 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[4:5], 1 -; GCN-IR-NEXT: v_or_b32_e32 v7, v7, v1 -; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v0 +; GCN-IR-NEXT: v_or_b32_e32 v4, v7, v1 +; GCN-IR-NEXT: v_or_b32_e32 v7, v6, v0 ; GCN-IR-NEXT: .LBB11_6: ; %Flow4 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] -; GCN-IR-NEXT: v_xor_b32_e32 v0, v6, v2 -; GCN-IR-NEXT: v_xor_b32_e32 v1, v7, v3 +; GCN-IR-NEXT: v_xor_b32_e32 v0, v7, v2 +; GCN-IR-NEXT: v_xor_b32_e32 v1, v4, v3 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc ; GCN-IR-NEXT: s_setpc_b64 s[30:31] @@ -1509,7 +1500,6 @@ ; GCN-NEXT: v_subb_u32_e32 v6, vcc, 0, v1, vcc ; GCN-NEXT: v_mac_f32_e32 v3, 0x4f800000, v4 ; GCN-NEXT: v_rcp_f32_e32 v3, v3 -; GCN-NEXT: v_mov_b32_e32 v12, 0 ; GCN-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 ; GCN-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 ; GCN-NEXT: v_trunc_f32_e32 v4, v4 @@ -1525,7 +1515,7 @@ ; GCN-NEXT: v_mul_lo_u32 v9, v3, v7 ; GCN-NEXT: v_mul_hi_u32 v10, v3, v8 ; GCN-NEXT: v_mul_hi_u32 v11, v3, v7 -; GCN-NEXT: v_mul_hi_u32 v13, v4, v7 +; GCN-NEXT: v_mul_hi_u32 v12, v4, v7 ; GCN-NEXT: v_mul_lo_u32 v7, v4, v7 ; GCN-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v11, vcc @@ -1533,7 +1523,7 @@ ; GCN-NEXT: v_mul_hi_u32 v8, v4, v8 ; GCN-NEXT: v_add_i32_e32 v9, vcc, v9, v11 ; GCN-NEXT: v_addc_u32_e32 v8, vcc, v10, v8, vcc -; GCN-NEXT: v_addc_u32_e32 v9, vcc, v13, v12, vcc +; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v12, vcc ; GCN-NEXT: v_add_i32_e32 v7, vcc, v8, v7 ; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v9, vcc ; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v7 @@ -1555,7 +1545,7 @@ ; GCN-NEXT: v_mul_lo_u32 v6, v4, v6 ; GCN-NEXT: v_add_i32_e32 v5, vcc, v9, v5 ; GCN-NEXT: v_addc_u32_e32 v5, vcc, v10, v8, vcc -; GCN-NEXT: v_addc_u32_e32 v7, vcc, v7, v12, vcc +; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc ; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v7, vcc ; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 @@ -1612,27 +1602,26 @@ ; GCN-IR-NEXT: v_ffbh_u32_e32 v5, v1 ; GCN-IR-NEXT: v_min_u32_e32 v8, v4, v5 ; GCN-IR-NEXT: s_movk_i32 s6, 0xffd0 -; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, s6, v8 -; GCN-IR-NEXT: v_addc_u32_e64 v5, s[6:7], 0, -1, vcc +; GCN-IR-NEXT: v_add_i32_e32 v5, vcc, s6, v8 +; GCN-IR-NEXT: v_addc_u32_e64 v6, s[6:7], 0, -1, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[4:5] -; GCN-IR-NEXT: s_mov_b64 s[8:9], 0x8000 -; GCN-IR-NEXT: v_mov_b32_e32 v6, s8 +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[5:6] +; GCN-IR-NEXT: v_mov_b32_e32 v7, 0x8000 ; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 -; GCN-IR-NEXT: v_cndmask_b32_e64 v6, v6, 0, s[4:5] +; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[5:6] +; GCN-IR-NEXT: v_cndmask_b32_e64 v7, v7, 0, s[4:5] ; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1 ; GCN-IR-NEXT: v_mov_b32_e32 v3, v2 -; GCN-IR-NEXT: v_mov_b32_e32 v7, v9 +; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 +; GCN-IR-NEXT: s_mov_b64 s[8:9], 0x8000 ; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], vcc ; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GCN-IR-NEXT: s_cbranch_execz .LBB12_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, 1, v4 -; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, 0, v5, vcc -; GCN-IR-NEXT: v_sub_i32_e64 v4, s[4:5], 63, v4 -; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] +; GCN-IR-NEXT: v_add_i32_e32 v9, vcc, 1, v5 +; GCN-IR-NEXT: v_addc_u32_e32 v10, vcc, 0, v6, vcc +; GCN-IR-NEXT: v_sub_i32_e64 v4, s[4:5], 63, v5 +; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[9:10] ; GCN-IR-NEXT: v_lshl_b64 v[4:5], s[8:9], v4 ; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 @@ -1643,10 +1632,10 @@ ; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, -1, v0 ; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000 ; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, -1, v1, vcc -; GCN-IR-NEXT: v_lshr_b64 v[10:11], s[4:5], v10 +; GCN-IR-NEXT: v_lshr_b64 v[10:11], s[4:5], v9 ; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, 47, v8 ; GCN-IR-NEXT: v_mov_b32_e32 v12, 0 -; GCN-IR-NEXT: v_subb_u32_e32 v9, vcc, 0, v9, vcc +; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], 0, 0, vcc ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: v_mov_b32_e32 v13, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 @@ -1679,12 +1668,12 @@ ; GCN-IR-NEXT: .LBB12_5: ; %Flow3 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[4:5], 1 -; GCN-IR-NEXT: v_or_b32_e32 v7, v7, v1 -; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v0 +; GCN-IR-NEXT: v_or_b32_e32 v4, v7, v1 +; GCN-IR-NEXT: v_or_b32_e32 v7, v6, v0 ; GCN-IR-NEXT: .LBB12_6: ; %Flow4 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] -; GCN-IR-NEXT: v_xor_b32_e32 v0, v6, v2 -; GCN-IR-NEXT: v_xor_b32_e32 v1, v7, v3 +; GCN-IR-NEXT: v_xor_b32_e32 v0, v7, v2 +; GCN-IR-NEXT: v_xor_b32_e32 v1, v4, v3 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc ; GCN-IR-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll @@ -516,13 +516,15 @@ ; NOSDWA-NOT: v_or_b32_sdwa ; VI-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9_10-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX89-DAG: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}} ; ; GFX10-DAG: v_lshrrev_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; ; VI-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9_10-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX89-DAG: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}} ; ; GFX10-DAG: v_lshrrev_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD diff --git a/llvm/test/CodeGen/AMDGPU/setcc-opt.ll b/llvm/test/CodeGen/AMDGPU/setcc-opt.ll --- a/llvm/test/CodeGen/AMDGPU/setcc-opt.ll +++ b/llvm/test/CodeGen/AMDGPU/setcc-opt.ll @@ -156,12 +156,12 @@ ; FUNC-LABEL: {{^}}cmp_zext_k_i8max: ; SI: s_load_dword [[VALUE:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb ; VI: s_load_dword [[VALUE:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c -; GCN: s_movk_i32 [[K255:s[0-9]+]], 0xff -; SI-DAG: s_and_b32 [[B:s[0-9]+]], [[VALUE]], [[K255]] -; SI: s_cmp_lg_u32 [[B]], [[K255]] +; SI-DAG: s_and_b32 [[B:s[0-9]+]], [[VALUE]], 0xff +; SI: s_cmpk_lg_i32 [[B]], 0xff ; SI: s_cselect_b64 [[CC:[^,]+]], -1, 0 -; VI: v_mov_b32_e32 [[VK255:v[0-9]+]], [[K255]] +; VI: v_mov_b32_e32 [[VK255:v[0-9]+]], 0xff +; VI: s_movk_i32 [[K255:s[0-9]+]], 0xff ; VI: v_and_b32_e32 [[B:v[0-9]+]], [[VALUE]], [[VK255]] ; VI: v_cmp_ne_u16_e32 vcc, [[K255]], [[B]] @@ -208,9 +208,8 @@ ; FUNC-LABEL: {{^}}cmp_sext_k_neg1_i8_arg: ; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb ; VI: s_load_dword [[VAL:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c -; GCN: s_movk_i32 [[K:s[0-9]+]], 0xff -; GCN-DAG: s_and_b32 [[B:s[0-9]+]], [[VAL]], [[K]] -; GCN: s_cmp_lg_u32 [[B]], [[K]]{{$}} +; GCN-DAG: s_and_b32 [[B:s[0-9]+]], [[VAL]], 0xff +; GCN: s_cmpk_lg_i32 [[B]], 0xff{{$}} ; GCN: s_cselect_b64 [[CC:[^,]+]], -1, 0 ; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CC]] ; GCN: buffer_store_byte [[RESULT]] diff --git a/llvm/test/CodeGen/AMDGPU/shift-i128.ll b/llvm/test/CodeGen/AMDGPU/shift-i128.ll --- a/llvm/test/CodeGen/AMDGPU/shift-i128.ll +++ b/llvm/test/CodeGen/AMDGPU/shift-i128.ll @@ -149,9 +149,9 @@ ; GCN-NEXT: s_mov_b64 s[4:5], 0x41 ; GCN-NEXT: v_lshr_b64 v[1:2], s[4:5], v0 ; GCN-NEXT: v_cmp_gt_u32_e32 vcc, 64, v0 -; GCN-NEXT: v_mov_b32_e32 v3, s4 ; GCN-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0 ; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v3, 0x41 ; GCN-NEXT: s_and_b64 vcc, s[4:5], vcc ; GCN-NEXT: v_cndmask_b32_e64 v0, v3, v1, s[4:5] ; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc diff --git a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll @@ -21,17 +21,16 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s6, 0xffff ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s7, s4, s6 +; VI-NEXT: s_and_b32 s6, s4, 0xffff ; VI-NEXT: s_lshr_b32 s4, s4, 16 -; VI-NEXT: s_lshr_b32 s8, s5, 16 -; VI-NEXT: s_lshl_b32 s4, s4, s8 -; VI-NEXT: s_lshl_b32 s5, s7, s5 +; VI-NEXT: s_lshr_b32 s7, s5, 16 +; VI-NEXT: s_lshl_b32 s4, s4, s7 +; VI-NEXT: s_lshl_b32 s5, s6, s5 ; VI-NEXT: s_lshl_b32 s4, s4, 16 -; VI-NEXT: s_and_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s5, s4 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 diff --git a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll --- a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll +++ b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll @@ -1822,18 +1822,17 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: v_mov_b32_e32 v4, 0xffffc400 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: s_movk_i32 s2, 0xc400 -; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u16_e32 v2, s2, v3 +; VI-NEXT: v_add_u16_e32 v2, 0xc400, v3 ; VI-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v2, v2, v3 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -1895,18 +1894,17 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: v_mov_b32_e32 v4, 0x4400 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: s_movk_i32 s2, 0x4400 -; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u16_e32 v2, s2, v3 +; VI-NEXT: v_add_u16_e32 v2, 0x4400, v3 ; VI-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v2, v2, v3 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -1968,18 +1966,17 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: v_mov_b32_e32 v4, 0x4000 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: s_movk_i32 s2, 0x4000 -; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u16_e32 v2, s2, v3 +; VI-NEXT: v_add_u16_e32 v2, 0x4000, v3 ; VI-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v2, v2, v3 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -2041,18 +2038,17 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: v_mov_b32_e32 v4, 0xffffc000 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: s_movk_i32 s2, 0xc000 -; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u16_e32 v2, s2, v3 +; VI-NEXT: v_add_u16_e32 v2, 0xc000, v3 ; VI-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v2, v2, v3 ; VI-NEXT: flat_store_dword v[0:1], v2 diff --git a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll --- a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll +++ b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll @@ -31,14 +31,13 @@ ; CHECK-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr10 ; CHECK-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr8 ; CHECK-NEXT: undef %71.sub0_sub1:sgpr_128 = S_LOAD_DWORDX2_IMM %56, 232, 0 :: (load (s64) from %ir.40, addrspace 4) - ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 ; CHECK-NEXT: [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY4]], 4, implicit-def dead $scc ; CHECK-NEXT: [[S_LSHL_B32_1:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY3]], 4, implicit-def dead $scc ; CHECK-NEXT: [[S_LSHL_B32_2:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY2]], 4, implicit-def dead $scc ; CHECK-NEXT: [[S_ASHR_I32_:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_]], 31, implicit-def dead $scc ; CHECK-NEXT: [[S_ASHR_I32_1:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_1]], 31, implicit-def dead $scc - ; CHECK-NEXT: %71.sub1:sgpr_128 = S_AND_B32 %71.sub1, [[S_MOV_B32_]], implicit-def dead $scc ; CHECK-NEXT: [[S_ASHR_I32_2:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_2]], 31, implicit-def dead $scc + ; CHECK-NEXT: %71.sub1:sgpr_128 = S_AND_B32 %71.sub1, 65535, implicit-def dead $scc ; CHECK-NEXT: undef %130.sub0:sreg_64 = S_ADD_U32 [[COPY5]], [[S_LSHL_B32_2]], implicit-def $scc ; CHECK-NEXT: %130.sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %130, 16, 0 :: (load (s128) from %ir.84, addrspace 4) @@ -195,7 +194,7 @@ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM20:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %335, 0, 0 :: (load (s128) from %ir.230, addrspace 4) ; CHECK-NEXT: [[COPY14:%[0-9]+]]:sgpr_128 = COPY %71 ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM21:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %343, 0, 0 :: (load (s128) from %ir.236, addrspace 4) - ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_LOAD_DWORDX2_IMM]].sub1, [[S_MOV_B32_]], implicit-def dead $scc + ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_LOAD_DWORDX2_IMM]].sub1, 65535, implicit-def dead $scc ; CHECK-NEXT: [[COPY14]].sub0:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]].sub0 ; CHECK-NEXT: [[COPY14]].sub1:sgpr_128 = COPY [[S_AND_B32_]] ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM5:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY14]], 0, 0 :: (dereferenceable invariant load (s32)) @@ -211,7 +210,7 @@ ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN20:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM22]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from custom "BufferResource", align 1, addrspace 4) ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM %468, 0, 0 :: (load (s64) from %ir.320, addrspace 4) ; CHECK-NEXT: [[COPY15:%[0-9]+]]:sgpr_128 = COPY %71 - ; CHECK-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_LOAD_DWORDX2_IMM1]].sub1, [[S_MOV_B32_]], implicit-def dead $scc + ; CHECK-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_LOAD_DWORDX2_IMM1]].sub1, 65535, implicit-def dead $scc ; CHECK-NEXT: [[COPY15]].sub0:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM1]].sub0 ; CHECK-NEXT: [[COPY15]].sub1:sgpr_128 = COPY [[S_AND_B32_1]] ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY15]], 0, 0 :: (dereferenceable invariant load (s32)) @@ -231,7 +230,7 @@ ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN22:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM24]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from custom "BufferResource", align 1, addrspace 4) ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM24]] ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM23]] - ; CHECK-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_LOAD_DWORD_IMM1]], [[S_MOV_B32_]], implicit-def dead $scc + ; CHECK-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_LOAD_DWORD_IMM1]], 65535, implicit-def dead $scc ; CHECK-NEXT: [[COPY16:%[0-9]+]]:sgpr_128 = COPY %71 ; CHECK-NEXT: [[COPY16]].sub1:sgpr_128 = COPY [[S_AND_B32_2]] ; CHECK-NEXT: [[COPY16]].sub0:sgpr_128 = COPY [[S_LOAD_DWORD_IMM2]] diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll --- a/llvm/test/CodeGen/AMDGPU/srem64.ll +++ b/llvm/test/CodeGen/AMDGPU/srem64.ll @@ -17,72 +17,71 @@ ; GCN-NEXT: s_mov_b32 s4, s8 ; GCN-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_mov_b32 s5, s9 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; GCN-NEXT: v_mul_f32_e32 v2, 0x2f800000, v0 -; GCN-NEXT: v_trunc_f32_e32 v2, v2 -; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v2 -; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; GCN-NEXT: v_trunc_f32_e32 v1, v1 +; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 +; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_lo_u32 v3, s0, v2 -; GCN-NEXT: v_mul_hi_u32 v4, s0, v0 -; GCN-NEXT: v_mul_lo_u32 v6, s1, v0 -; GCN-NEXT: v_mul_lo_u32 v5, s0, v0 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GCN-NEXT: v_mul_hi_u32 v4, v0, v5 -; GCN-NEXT: v_mul_lo_u32 v6, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v8, v0, v3 -; GCN-NEXT: v_mul_lo_u32 v7, v2, v5 -; GCN-NEXT: v_mul_hi_u32 v5, v2, v5 -; GCN-NEXT: v_mul_hi_u32 v9, v2, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v8, vcc -; GCN-NEXT: v_mul_lo_u32 v3, v2, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v6, v5, vcc -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v9, v1, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc -; GCN-NEXT: v_mul_lo_u32 v3, s0, v2 -; GCN-NEXT: v_mul_hi_u32 v4, s0, v0 +; GCN-NEXT: v_mul_lo_u32 v2, s0, v1 +; GCN-NEXT: v_mul_hi_u32 v3, s0, v0 ; GCN-NEXT: v_mul_lo_u32 v5, s1, v0 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; GCN-NEXT: v_mul_lo_u32 v4, s0, v0 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GCN-NEXT: v_mul_lo_u32 v7, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v8, v0, v4 -; GCN-NEXT: v_mul_hi_u32 v9, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v6, v2, v4 -; GCN-NEXT: v_mul_lo_u32 v4, v2, v4 -; GCN-NEXT: v_mul_hi_u32 v5, v2, v3 -; GCN-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v9, vcc -; GCN-NEXT: v_mul_lo_u32 v3, v2, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v8, v6, vcc -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v5, v1, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc -; GCN-NEXT: v_mul_lo_u32 v3, s10, v2 -; GCN-NEXT: v_mul_hi_u32 v4, s10, v0 -; GCN-NEXT: v_mul_hi_u32 v5, s10, v2 -; GCN-NEXT: v_mul_hi_u32 v6, s11, v2 -; GCN-NEXT: v_mul_lo_u32 v2, s11, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc -; GCN-NEXT: v_mul_lo_u32 v5, s11, v0 -; GCN-NEXT: v_mul_hi_u32 v0, s11, v0 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; GCN-NEXT: v_mul_hi_u32 v3, v0, v4 +; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v7, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v6, v1, v4 +; GCN-NEXT: v_mul_lo_u32 v4, v1, v4 +; GCN-NEXT: v_mul_hi_u32 v8, v1, v2 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GCN-NEXT: v_addc_u32_e32 v0, vcc, v4, v0, vcc -; GCN-NEXT: v_addc_u32_e32 v1, vcc, v6, v1, vcc +; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v6, vcc +; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GCN-NEXT: v_mul_lo_u32 v2, s0, v1 +; GCN-NEXT: v_mul_hi_u32 v3, s0, v0 +; GCN-NEXT: v_mul_lo_u32 v4, s1, v0 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_mul_lo_u32 v3, s0, v0 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GCN-NEXT: v_mul_lo_u32 v6, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v7, v0, v3 +; GCN-NEXT: v_mul_hi_u32 v8, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v5, v1, v3 +; GCN-NEXT: v_mul_lo_u32 v3, v1, v3 +; GCN-NEXT: v_mul_hi_u32 v4, v1, v2 +; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc +; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v6, v3 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc +; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GCN-NEXT: v_mul_lo_u32 v2, s10, v1 +; GCN-NEXT: v_mul_hi_u32 v3, s10, v0 +; GCN-NEXT: v_mul_hi_u32 v4, s10, v1 +; GCN-NEXT: v_mul_hi_u32 v5, s11, v1 +; GCN-NEXT: v_mul_lo_u32 v1, s11, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GCN-NEXT: v_mul_lo_u32 v4, s11, v0 +; GCN-NEXT: v_mul_hi_u32 v0, s11, v0 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc +; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc ; GCN-NEXT: v_mul_lo_u32 v1, s12, v1 ; GCN-NEXT: v_mul_hi_u32 v2, s12, v0 ; GCN-NEXT: v_mul_lo_u32 v3, s13, v0 @@ -161,9 +160,8 @@ ; GCN-IR-NEXT: s_add_u32 s16, s4, -1 ; GCN-IR-NEXT: s_addc_u32 s17, s5, -1 ; GCN-IR-NEXT: s_not_b64 s[6:7], s[10:11] -; GCN-IR-NEXT: s_mov_b32 s15, s11 ; GCN-IR-NEXT: s_add_u32 s10, s6, s14 -; GCN-IR-NEXT: s_addc_u32 s11, s7, s11 +; GCN-IR-NEXT: s_addc_u32 s11, s7, 0 ; GCN-IR-NEXT: s_mov_b64 s[14:15], 0 ; GCN-IR-NEXT: s_mov_b32 s7, 0 ; GCN-IR-NEXT: .LBB0_3: ; %udiv-do-while @@ -234,7 +232,6 @@ ; GCN-NEXT: v_subb_u32_e32 v7, vcc, 0, v3, vcc ; GCN-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 ; GCN-NEXT: v_rcp_f32_e32 v4, v4 -; GCN-NEXT: v_mov_b32_e32 v13, 0 ; GCN-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; GCN-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 ; GCN-NEXT: v_trunc_f32_e32 v5, v5 @@ -250,7 +247,7 @@ ; GCN-NEXT: v_mul_lo_u32 v10, v4, v8 ; GCN-NEXT: v_mul_hi_u32 v11, v4, v9 ; GCN-NEXT: v_mul_hi_u32 v12, v4, v8 -; GCN-NEXT: v_mul_hi_u32 v14, v5, v8 +; GCN-NEXT: v_mul_hi_u32 v13, v5, v8 ; GCN-NEXT: v_mul_lo_u32 v8, v5, v8 ; GCN-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v12, vcc @@ -258,7 +255,7 @@ ; GCN-NEXT: v_mul_hi_u32 v9, v5, v9 ; GCN-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; GCN-NEXT: v_addc_u32_e32 v9, vcc, v11, v9, vcc -; GCN-NEXT: v_addc_u32_e32 v10, vcc, v14, v13, vcc +; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v13, vcc ; GCN-NEXT: v_add_i32_e32 v8, vcc, v9, v8 ; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc ; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v8 @@ -280,7 +277,7 @@ ; GCN-NEXT: v_mul_lo_u32 v7, v5, v7 ; GCN-NEXT: v_add_i32_e32 v6, vcc, v10, v6 ; GCN-NEXT: v_addc_u32_e32 v6, vcc, v11, v9, vcc -; GCN-NEXT: v_addc_u32_e32 v8, vcc, v8, v13, vcc +; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc ; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v7 ; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc ; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v6 @@ -301,7 +298,7 @@ ; GCN-NEXT: v_mul_lo_u32 v5, v1, v5 ; GCN-NEXT: v_add_i32_e32 v7, vcc, v7, v9 ; GCN-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc -; GCN-NEXT: v_addc_u32_e32 v7, vcc, v10, v13, vcc +; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v10, vcc ; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc ; GCN-NEXT: v_mul_lo_u32 v5, v2, v5 @@ -354,101 +351,99 @@ ; GCN-IR-NEXT: v_xor_b32_e32 v2, v2, v6 ; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc ; GCN-IR-NEXT: v_xor_b32_e32 v3, v3, v6 -; GCN-IR-NEXT: v_sub_i32_e32 v5, vcc, v2, v6 -; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, v3, v6, vcc -; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[5:6] +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, v2, v6 +; GCN-IR-NEXT: v_subb_u32_e32 v3, vcc, v3, v6, vcc +; GCN-IR-NEXT: v_ffbh_u32_e32 v6, v2 +; GCN-IR-NEXT: v_add_i32_e64 v6, s[6:7], 32, v6 +; GCN-IR-NEXT: v_ffbh_u32_e32 v7, v3 +; GCN-IR-NEXT: v_min_u32_e32 v10, v6, v7 +; GCN-IR-NEXT: v_ffbh_u32_e32 v6, v0 +; GCN-IR-NEXT: v_add_i32_e64 v6, s[6:7], 32, v6 +; GCN-IR-NEXT: v_ffbh_u32_e32 v7, v1 +; GCN-IR-NEXT: v_min_u32_e32 v11, v6, v7 +; GCN-IR-NEXT: v_sub_i32_e64 v7, s[6:7], v10, v11 +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] -; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v5 -; GCN-IR-NEXT: s_or_b64 s[6:7], vcc, s[4:5] -; GCN-IR-NEXT: v_add_i32_e32 v3, vcc, 32, v3 -; GCN-IR-NEXT: v_ffbh_u32_e32 v7, v6 -; GCN-IR-NEXT: v_min_u32_e32 v3, v3, v7 -; GCN-IR-NEXT: v_ffbh_u32_e32 v7, v0 -; GCN-IR-NEXT: v_add_i32_e32 v7, vcc, 32, v7 -; GCN-IR-NEXT: v_ffbh_u32_e32 v8, v1 -; GCN-IR-NEXT: v_min_u32_e32 v12, v7, v8 -; GCN-IR-NEXT: v_sub_i32_e32 v7, vcc, v3, v12 -; GCN-IR-NEXT: v_subb_u32_e64 v8, s[4:5], 0, 0, vcc -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[7:8] -; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[4:5], 63, v[7:8] -; GCN-IR-NEXT: s_or_b64 s[6:7], s[6:7], vcc -; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 -; GCN-IR-NEXT: s_xor_b64 s[8:9], s[6:7], -1 -; GCN-IR-NEXT: v_mov_b32_e32 v2, v4 -; GCN-IR-NEXT: v_mov_b32_e32 v13, v11 -; GCN-IR-NEXT: v_cndmask_b32_e64 v10, v1, 0, s[6:7] -; GCN-IR-NEXT: s_and_b64 s[4:5], s[8:9], s[4:5] -; GCN-IR-NEXT: v_cndmask_b32_e64 v9, v0, 0, s[6:7] +; GCN-IR-NEXT: v_subb_u32_e64 v8, s[6:7], 0, 0, s[6:7] +; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[6:7], 63, v[7:8] +; GCN-IR-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] +; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[7:8] +; GCN-IR-NEXT: s_xor_b64 s[6:7], s[4:5], -1 +; GCN-IR-NEXT: v_mov_b32_e32 v5, v4 +; GCN-IR-NEXT: v_cndmask_b32_e64 v9, v1, 0, s[4:5] +; GCN-IR-NEXT: v_cndmask_b32_e64 v6, v0, 0, s[4:5] +; GCN-IR-NEXT: s_and_b64 s[4:5], s[6:7], vcc ; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GCN-IR-NEXT: s_cbranch_execz .LBB1_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, 1, v7 -; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, 0, v8, vcc -; GCN-IR-NEXT: v_sub_i32_e64 v7, s[4:5], 63, v7 -; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[14:15] -; GCN-IR-NEXT: v_lshl_b64 v[7:8], v[0:1], v7 +; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, 1, v7 +; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, 0, v8, vcc +; GCN-IR-NEXT: v_sub_i32_e64 v6, s[4:5], 63, v7 +; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[12:13] +; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[0:1], v6 +; GCN-IR-NEXT: v_mov_b32_e32 v8, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 -; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 ; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; GCN-IR-NEXT: s_cbranch_execz .LBB1_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: v_add_i32_e32 v18, vcc, -1, v5 -; GCN-IR-NEXT: v_addc_u32_e32 v19, vcc, -1, v6, vcc -; GCN-IR-NEXT: v_not_b32_e32 v3, v3 -; GCN-IR-NEXT: v_lshr_b64 v[14:15], v[0:1], v14 -; GCN-IR-NEXT: v_not_b32_e32 v9, v11 -; GCN-IR-NEXT: v_add_i32_e32 v11, vcc, v3, v12 -; GCN-IR-NEXT: v_mov_b32_e32 v16, 0 -; GCN-IR-NEXT: v_addc_u32_e32 v12, vcc, v9, v13, vcc +; GCN-IR-NEXT: v_add_i32_e32 v16, vcc, -1, v2 +; GCN-IR-NEXT: v_addc_u32_e32 v17, vcc, -1, v3, vcc +; GCN-IR-NEXT: v_not_b32_e32 v9, v10 +; GCN-IR-NEXT: v_lshr_b64 v[12:13], v[0:1], v12 +; GCN-IR-NEXT: v_not_b32_e32 v8, 0 +; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, v9, v11 +; GCN-IR-NEXT: v_mov_b32_e32 v14, 0 +; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, 0, v8, vcc ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 -; GCN-IR-NEXT: v_mov_b32_e32 v17, 0 -; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v15, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 ; GCN-IR-NEXT: .LBB1_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: v_lshl_b64 v[14:15], v[14:15], 1 -; GCN-IR-NEXT: v_lshrrev_b32_e32 v3, 31, v8 -; GCN-IR-NEXT: v_or_b32_e32 v3, v14, v3 -; GCN-IR-NEXT: v_sub_i32_e32 v9, vcc, v18, v3 -; GCN-IR-NEXT: v_lshl_b64 v[7:8], v[7:8], 1 -; GCN-IR-NEXT: v_subb_u32_e32 v9, vcc, v19, v15, vcc -; GCN-IR-NEXT: v_ashrrev_i32_e32 v13, 31, v9 -; GCN-IR-NEXT: v_add_i32_e32 v11, vcc, 1, v11 -; GCN-IR-NEXT: v_or_b32_e32 v7, v16, v7 -; GCN-IR-NEXT: v_and_b32_e32 v9, 1, v13 -; GCN-IR-NEXT: v_and_b32_e32 v16, v13, v6 -; GCN-IR-NEXT: v_and_b32_e32 v13, v13, v5 -; GCN-IR-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc -; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[11:12] -; GCN-IR-NEXT: v_sub_i32_e64 v14, s[4:5], v3, v13 -; GCN-IR-NEXT: v_or_b32_e32 v8, v17, v8 -; GCN-IR-NEXT: v_subb_u32_e64 v15, s[4:5], v15, v16, s[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v17, v10 +; GCN-IR-NEXT: v_lshl_b64 v[12:13], v[12:13], 1 +; GCN-IR-NEXT: v_lshrrev_b32_e32 v8, 31, v7 +; GCN-IR-NEXT: v_or_b32_e32 v12, v12, v8 +; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 +; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, v16, v12 +; GCN-IR-NEXT: v_subb_u32_e32 v8, vcc, v17, v13, vcc +; GCN-IR-NEXT: v_or_b32_e32 v6, v14, v6 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v14, 31, v8 +; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, 1, v10 +; GCN-IR-NEXT: v_or_b32_e32 v7, v15, v7 +; GCN-IR-NEXT: v_and_b32_e32 v8, 1, v14 +; GCN-IR-NEXT: v_and_b32_e32 v15, v14, v3 +; GCN-IR-NEXT: v_and_b32_e32 v14, v14, v2 +; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] +; GCN-IR-NEXT: v_sub_i32_e64 v12, s[4:5], v12, v14 +; GCN-IR-NEXT: v_subb_u32_e64 v13, s[4:5], v13, v15, s[4:5] +; GCN-IR-NEXT: v_mov_b32_e32 v15, v9 ; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; GCN-IR-NEXT: v_mov_b32_e32 v16, v9 +; GCN-IR-NEXT: v_mov_b32_e32 v14, v8 ; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] ; GCN-IR-NEXT: s_cbranch_execnz .LBB1_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow ; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] ; GCN-IR-NEXT: .LBB1_5: ; %Flow3 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] -; GCN-IR-NEXT: v_lshl_b64 v[7:8], v[7:8], 1 -; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v8 +; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 ; GCN-IR-NEXT: v_or_b32_e32 v9, v9, v7 +; GCN-IR-NEXT: v_or_b32_e32 v6, v8, v6 ; GCN-IR-NEXT: .LBB1_6: ; %Flow4 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] -; GCN-IR-NEXT: v_mul_lo_u32 v3, v5, v10 -; GCN-IR-NEXT: v_mul_hi_u32 v7, v5, v9 -; GCN-IR-NEXT: v_mul_lo_u32 v6, v6, v9 -; GCN-IR-NEXT: v_mul_lo_u32 v5, v5, v9 +; GCN-IR-NEXT: v_mul_lo_u32 v7, v2, v9 +; GCN-IR-NEXT: v_mul_hi_u32 v8, v2, v6 +; GCN-IR-NEXT: v_mul_lo_u32 v3, v3, v6 +; GCN-IR-NEXT: v_mul_lo_u32 v2, v2, v6 +; GCN-IR-NEXT: v_add_i32_e32 v7, vcc, v8, v7 ; GCN-IR-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; GCN-IR-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc ; GCN-IR-NEXT: v_xor_b32_e32 v0, v0, v4 -; GCN-IR-NEXT: v_xor_b32_e32 v1, v1, v2 +; GCN-IR-NEXT: v_xor_b32_e32 v1, v1, v5 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 -; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc +; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc ; GCN-IR-NEXT: s_setpc_b64 s[30:31] %result = srem i64 %x, %y ret i64 %result @@ -862,29 +857,28 @@ define amdgpu_kernel void @s_test_srem33_64(i64 addrspace(1)* %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_srem33_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; GCN-NEXT: v_mov_b32_e32 v6, 0 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_mov_b32 s11, 0xf000 +; GCN-NEXT: s_mov_b32 s10, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i64 s[2:3], s[10:11], 31 -; GCN-NEXT: s_ashr_i64 s[4:5], s[0:1], 31 +; GCN-NEXT: s_ashr_i64 s[2:3], s[6:7], 31 +; GCN-NEXT: s_ashr_i64 s[8:9], s[0:1], 31 ; GCN-NEXT: s_ashr_i32 s0, s1, 31 -; GCN-NEXT: s_add_u32 s4, s4, s0 +; GCN-NEXT: s_add_u32 s8, s8, s0 ; GCN-NEXT: s_mov_b32 s1, s0 -; GCN-NEXT: s_addc_u32 s5, s5, s0 -; GCN-NEXT: s_xor_b64 s[12:13], s[4:5], s[0:1] +; GCN-NEXT: s_addc_u32 s9, s9, s0 +; GCN-NEXT: s_xor_b64 s[12:13], s[8:9], s[0:1] ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s12 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s13 ; GCN-NEXT: s_sub_u32 s0, 0, s12 ; GCN-NEXT: s_subb_u32 s1, 0, s13 -; GCN-NEXT: s_ashr_i32 s10, s11, 31 +; GCN-NEXT: s_ashr_i32 s6, s7, 31 ; GCN-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-NEXT: s_mov_b32 s11, s10 -; GCN-NEXT: s_mov_b32 s4, s8 -; GCN-NEXT: s_mov_b32 s5, s9 +; GCN-NEXT: s_mov_b32 s7, s6 +; GCN-NEXT: s_mov_b32 s8, s4 +; GCN-NEXT: s_mov_b32 s9, s5 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 @@ -899,16 +893,16 @@ ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; GCN-NEXT: v_mul_hi_u32 v3, v0, v4 ; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 -; GCN-NEXT: v_mul_hi_u32 v7, v0, v2 -; GCN-NEXT: v_mul_hi_u32 v8, v1, v2 +; GCN-NEXT: v_mul_hi_u32 v6, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v7, v1, v2 ; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc -; GCN-NEXT: v_mul_lo_u32 v7, v1, v4 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc +; GCN-NEXT: v_mul_lo_u32 v6, v1, v4 ; GCN-NEXT: v_mul_hi_u32 v4, v1, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v7 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v8, v6, vcc +; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc ; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 @@ -919,25 +913,25 @@ ; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GCN-NEXT: v_mul_lo_u32 v3, s0, v0 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 -; GCN-NEXT: v_mul_lo_u32 v7, v0, v2 -; GCN-NEXT: v_mul_hi_u32 v8, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v9, v0, v2 +; GCN-NEXT: v_mul_lo_u32 v6, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v7, v0, v3 +; GCN-NEXT: v_mul_hi_u32 v8, v0, v2 ; GCN-NEXT: v_mul_hi_u32 v5, v1, v3 ; GCN-NEXT: v_mul_lo_u32 v3, v1, v3 ; GCN-NEXT: v_mul_hi_u32 v4, v1, v2 -; GCN-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v9, vcc +; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc ; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, v8, v5, vcc -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GCN-NEXT: v_add_i32_e32 v3, vcc, v6, v3 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc +; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc ; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; GCN-NEXT: s_add_u32 s0, s2, s10 +; GCN-NEXT: s_add_u32 s0, s2, s6 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-NEXT: s_addc_u32 s1, s3, s10 +; GCN-NEXT: s_addc_u32 s1, s3, s6 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GCN-NEXT: s_xor_b64 s[14:15], s[0:1], s[10:11] +; GCN-NEXT: s_xor_b64 s[14:15], s[0:1], s[6:7] ; GCN-NEXT: v_mul_lo_u32 v2, s14, v1 ; GCN-NEXT: v_mul_hi_u32 v3, s14, v0 ; GCN-NEXT: v_mul_hi_u32 v4, s14, v1 @@ -949,7 +943,7 @@ ; GCN-NEXT: v_mul_hi_u32 v0, s15, v0 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc -; GCN-NEXT: v_addc_u32_e32 v2, vcc, v5, v6, vcc +; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc ; GCN-NEXT: v_mul_lo_u32 v1, s12, v1 @@ -987,12 +981,12 @@ ; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GCN-NEXT: v_cndmask_b32_e64 v2, v4, v3, s[0:1] ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GCN-NEXT: v_xor_b32_e32 v0, s10, v0 -; GCN-NEXT: v_xor_b32_e32 v1, s10, v1 -; GCN-NEXT: v_mov_b32_e32 v2, s10 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s10, v0 +; GCN-NEXT: v_xor_b32_e32 v0, s6, v0 +; GCN-NEXT: v_xor_b32_e32 v1, s6, v1 +; GCN-NEXT: v_mov_b32_e32 v2, s6 +; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s6, v0 ; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_srem33_64: @@ -1047,9 +1041,8 @@ ; GCN-IR-NEXT: s_add_u32 s18, s8, -1 ; GCN-IR-NEXT: s_addc_u32 s19, s9, -1 ; GCN-IR-NEXT: s_not_b64 s[6:7], s[12:13] -; GCN-IR-NEXT: s_mov_b32 s17, s13 ; GCN-IR-NEXT: s_add_u32 s12, s6, s16 -; GCN-IR-NEXT: s_addc_u32 s13, s7, s13 +; GCN-IR-NEXT: s_addc_u32 s13, s7, 0 ; GCN-IR-NEXT: s_mov_b64 s[16:17], 0 ; GCN-IR-NEXT: s_mov_b32 s7, 0 ; GCN-IR-NEXT: .LBB8_3: ; %udiv-do-while @@ -1205,9 +1198,8 @@ ; GCN-IR-NEXT: s_add_u32 s18, s6, -1 ; GCN-IR-NEXT: s_addc_u32 s19, s7, -1 ; GCN-IR-NEXT: s_not_b64 s[8:9], s[12:13] -; GCN-IR-NEXT: s_mov_b32 s17, s13 ; GCN-IR-NEXT: s_add_u32 s12, s8, s16 -; GCN-IR-NEXT: s_addc_u32 s13, s9, s13 +; GCN-IR-NEXT: s_addc_u32 s13, s9, 0 ; GCN-IR-NEXT: s_mov_b64 s[16:17], 0 ; GCN-IR-NEXT: s_mov_b32 s9, 0 ; GCN-IR-NEXT: .LBB9_3: ; %udiv-do-while @@ -1288,58 +1280,57 @@ ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; GCN-NEXT: v_mul_f32_e32 v2, 0x2f800000, v0 -; GCN-NEXT: v_trunc_f32_e32 v2, v2 -; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v2 -; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; GCN-NEXT: v_trunc_f32_e32 v1, v1 +; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 +; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_lo_u32 v3, s2, v2 -; GCN-NEXT: v_mul_hi_u32 v4, s2, v0 -; GCN-NEXT: v_mul_lo_u32 v6, s3, v0 -; GCN-NEXT: v_mul_lo_u32 v5, s2, v0 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GCN-NEXT: v_mul_hi_u32 v4, v0, v5 -; GCN-NEXT: v_mul_lo_u32 v6, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v8, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v7, v2, v5 -; GCN-NEXT: v_mul_lo_u32 v5, v2, v5 -; GCN-NEXT: v_mul_hi_u32 v9, v2, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v8, vcc -; GCN-NEXT: v_mul_lo_u32 v3, v2, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v6, v7, vcc -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v9, v1, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc -; GCN-NEXT: v_mul_lo_u32 v3, s2, v2 -; GCN-NEXT: v_mul_hi_u32 v4, s2, v0 +; GCN-NEXT: v_mul_lo_u32 v2, s2, v1 +; GCN-NEXT: v_mul_hi_u32 v3, s2, v0 ; GCN-NEXT: v_mul_lo_u32 v5, s3, v0 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; GCN-NEXT: v_mul_lo_u32 v4, s2, v0 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GCN-NEXT: v_mul_lo_u32 v7, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v8, v0, v4 -; GCN-NEXT: v_mul_hi_u32 v9, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v6, v2, v4 -; GCN-NEXT: v_mul_lo_u32 v4, v2, v4 -; GCN-NEXT: v_mul_hi_u32 v5, v2, v3 -; GCN-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v9, vcc -; GCN-NEXT: v_mul_lo_u32 v3, v2, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v8, v6, vcc -; GCN-NEXT: v_addc_u32_e32 v1, vcc, v5, v1, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; GCN-NEXT: v_mul_hi_u32 v3, v0, v4 +; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v7, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v6, v1, v4 +; GCN-NEXT: v_mul_lo_u32 v4, v1, v4 +; GCN-NEXT: v_mul_hi_u32 v8, v1, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v6, vcc +; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GCN-NEXT: v_mul_lo_u32 v2, s2, v1 +; GCN-NEXT: v_mul_hi_u32 v3, s2, v0 +; GCN-NEXT: v_mul_lo_u32 v4, s3, v0 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_mul_lo_u32 v3, s2, v0 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GCN-NEXT: v_mul_lo_u32 v6, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v7, v0, v3 +; GCN-NEXT: v_mul_hi_u32 v8, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v5, v1, v3 +; GCN-NEXT: v_mul_lo_u32 v3, v1, v3 +; GCN-NEXT: v_mul_hi_u32 v4, v1, v2 +; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc +; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v6, v3 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc +; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc ; GCN-NEXT: v_mul_lo_u32 v2, v1, 24 ; GCN-NEXT: v_mul_hi_u32 v0, v0, 24 ; GCN-NEXT: v_mul_hi_u32 v1, v1, 24 @@ -1483,7 +1474,6 @@ ; GCN-NEXT: v_subb_u32_e32 v5, vcc, 0, v1, vcc ; GCN-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 ; GCN-NEXT: v_rcp_f32_e32 v2, v2 -; GCN-NEXT: v_mov_b32_e32 v11, 0 ; GCN-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; GCN-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; GCN-NEXT: v_trunc_f32_e32 v3, v3 @@ -1499,7 +1489,7 @@ ; GCN-NEXT: v_mul_lo_u32 v8, v2, v6 ; GCN-NEXT: v_mul_hi_u32 v9, v2, v7 ; GCN-NEXT: v_mul_hi_u32 v10, v2, v6 -; GCN-NEXT: v_mul_hi_u32 v12, v3, v6 +; GCN-NEXT: v_mul_hi_u32 v11, v3, v6 ; GCN-NEXT: v_mul_lo_u32 v6, v3, v6 ; GCN-NEXT: v_add_i32_e32 v8, vcc, v9, v8 ; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc @@ -1507,7 +1497,7 @@ ; GCN-NEXT: v_mul_hi_u32 v7, v3, v7 ; GCN-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; GCN-NEXT: v_addc_u32_e32 v7, vcc, v9, v7, vcc -; GCN-NEXT: v_addc_u32_e32 v8, vcc, v12, v11, vcc +; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v11, vcc ; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6 ; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v6 @@ -1529,7 +1519,7 @@ ; GCN-NEXT: v_mul_lo_u32 v5, v3, v5 ; GCN-NEXT: v_add_i32_e32 v4, vcc, v8, v4 ; GCN-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc -; GCN-NEXT: v_addc_u32_e32 v6, vcc, v6, v11, vcc +; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc ; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 @@ -1585,24 +1575,23 @@ ; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1 ; GCN-IR-NEXT: v_min_u32_e32 v6, v2, v3 ; GCN-IR-NEXT: s_movk_i32 s6, 0xffc5 -; GCN-IR-NEXT: v_add_i32_e32 v3, vcc, s6, v6 -; GCN-IR-NEXT: v_addc_u32_e64 v4, s[6:7], 0, -1, vcc +; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, s6, v6 +; GCN-IR-NEXT: v_addc_u32_e64 v5, s[6:7], 0, -1, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[3:4] -; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[4:5] +; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[6:7], 63, v[4:5] ; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[3:4] -; GCN-IR-NEXT: v_cndmask_b32_e64 v2, 24, 0, s[4:5] +; GCN-IR-NEXT: v_cndmask_b32_e64 v3, 24, 0, s[4:5] ; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1 -; GCN-IR-NEXT: v_mov_b32_e32 v5, v7 -; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], vcc +; GCN-IR-NEXT: v_mov_b32_e32 v2, 0 +; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GCN-IR-NEXT: s_cbranch_execz .LBB11_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v3 -; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v4, vcc -; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v3 -; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] +; GCN-IR-NEXT: v_add_i32_e32 v7, vcc, 1, v4 +; GCN-IR-NEXT: v_addc_u32_e32 v8, vcc, 0, v5, vcc +; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v4 +; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[7:8] ; GCN-IR-NEXT: v_lshl_b64 v[2:3], 24, v2 ; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 @@ -1612,10 +1601,10 @@ ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, -1, v0 ; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, -1, v1, vcc -; GCN-IR-NEXT: v_lshr_b64 v[8:9], 24, v8 +; GCN-IR-NEXT: v_lshr_b64 v[8:9], 24, v7 ; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 58, v6 ; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 -; GCN-IR-NEXT: v_subb_u32_e32 v7, vcc, 0, v7, vcc +; GCN-IR-NEXT: v_subb_u32_e64 v7, s[4:5], 0, 0, vcc ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 @@ -1647,17 +1636,17 @@ ; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] ; GCN-IR-NEXT: .LBB11_5: ; %Flow3 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] -; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; GCN-IR-NEXT: v_or_b32_e32 v5, v5, v3 -; GCN-IR-NEXT: v_or_b32_e32 v2, v4, v2 +; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[2:3], 1 +; GCN-IR-NEXT: v_or_b32_e32 v2, v5, v7 +; GCN-IR-NEXT: v_or_b32_e32 v3, v4, v6 ; GCN-IR-NEXT: .LBB11_6: ; %Flow4 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] -; GCN-IR-NEXT: v_mul_lo_u32 v3, v0, v5 -; GCN-IR-NEXT: v_mul_hi_u32 v4, v0, v2 -; GCN-IR-NEXT: v_mul_lo_u32 v1, v1, v2 -; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, v2 -; GCN-IR-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; GCN-IR-NEXT: v_mul_lo_u32 v2, v0, v2 +; GCN-IR-NEXT: v_mul_hi_u32 v4, v0, v3 +; GCN-IR-NEXT: v_mul_lo_u32 v1, v1, v3 +; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, v3 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, 24, v0 ; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, 0, v1, vcc ; GCN-IR-NEXT: s_setpc_b64 s[30:31] @@ -1680,7 +1669,6 @@ ; GCN-NEXT: v_subb_u32_e32 v5, vcc, 0, v1, vcc ; GCN-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 ; GCN-NEXT: v_rcp_f32_e32 v2, v2 -; GCN-NEXT: v_mov_b32_e32 v11, 0 ; GCN-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; GCN-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; GCN-NEXT: v_trunc_f32_e32 v3, v3 @@ -1696,7 +1684,7 @@ ; GCN-NEXT: v_mul_lo_u32 v8, v2, v6 ; GCN-NEXT: v_mul_hi_u32 v9, v2, v7 ; GCN-NEXT: v_mul_hi_u32 v10, v2, v6 -; GCN-NEXT: v_mul_hi_u32 v12, v3, v6 +; GCN-NEXT: v_mul_hi_u32 v11, v3, v6 ; GCN-NEXT: v_mul_lo_u32 v6, v3, v6 ; GCN-NEXT: v_add_i32_e32 v8, vcc, v9, v8 ; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc @@ -1704,7 +1692,7 @@ ; GCN-NEXT: v_mul_hi_u32 v7, v3, v7 ; GCN-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; GCN-NEXT: v_addc_u32_e32 v7, vcc, v9, v7, vcc -; GCN-NEXT: v_addc_u32_e32 v8, vcc, v12, v11, vcc +; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v11, vcc ; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6 ; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v6 @@ -1726,7 +1714,7 @@ ; GCN-NEXT: v_mul_lo_u32 v5, v3, v5 ; GCN-NEXT: v_add_i32_e32 v4, vcc, v8, v4 ; GCN-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc -; GCN-NEXT: v_addc_u32_e32 v6, vcc, v6, v11, vcc +; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc ; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 @@ -1778,26 +1766,25 @@ ; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1 ; GCN-IR-NEXT: v_min_u32_e32 v6, v2, v3 ; GCN-IR-NEXT: s_movk_i32 s6, 0xffd0 -; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, s6, v6 -; GCN-IR-NEXT: v_addc_u32_e64 v3, s[6:7], 0, -1, vcc +; GCN-IR-NEXT: v_add_i32_e32 v3, vcc, s6, v6 +; GCN-IR-NEXT: v_addc_u32_e64 v4, s[6:7], 0, -1, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[2:3] -; GCN-IR-NEXT: s_mov_b64 s[8:9], 0x8000 -; GCN-IR-NEXT: v_mov_b32_e32 v4, s8 +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[3:4] +; GCN-IR-NEXT: v_mov_b32_e32 v5, 0x8000 ; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[2:3] -; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 -; GCN-IR-NEXT: v_cndmask_b32_e64 v4, v4, 0, s[4:5] +; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[3:4] +; GCN-IR-NEXT: v_cndmask_b32_e64 v5, v5, 0, s[4:5] ; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1 -; GCN-IR-NEXT: v_mov_b32_e32 v5, v7 +; GCN-IR-NEXT: v_mov_b32_e32 v2, 0 +; GCN-IR-NEXT: s_mov_b64 s[8:9], 0x8000 ; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], vcc ; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GCN-IR-NEXT: s_cbranch_execz .LBB12_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v2 -; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v3, vcc -; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2 -; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] +; GCN-IR-NEXT: v_add_i32_e32 v7, vcc, 1, v3 +; GCN-IR-NEXT: v_addc_u32_e32 v8, vcc, 0, v4, vcc +; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v3 +; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[7:8] ; GCN-IR-NEXT: v_lshl_b64 v[2:3], s[8:9], v2 ; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 @@ -1808,10 +1795,10 @@ ; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, -1, v0 ; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000 ; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, -1, v1, vcc -; GCN-IR-NEXT: v_lshr_b64 v[8:9], s[4:5], v8 +; GCN-IR-NEXT: v_lshr_b64 v[8:9], s[4:5], v7 ; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 47, v6 ; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 -; GCN-IR-NEXT: v_subb_u32_e32 v7, vcc, 0, v7, vcc +; GCN-IR-NEXT: v_subb_u32_e64 v7, s[4:5], 0, 0, vcc ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 @@ -1843,15 +1830,15 @@ ; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] ; GCN-IR-NEXT: .LBB12_5: ; %Flow3 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] -; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; GCN-IR-NEXT: v_or_b32_e32 v5, v5, v3 -; GCN-IR-NEXT: v_or_b32_e32 v4, v4, v2 +; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[2:3], 1 +; GCN-IR-NEXT: v_or_b32_e32 v2, v5, v7 +; GCN-IR-NEXT: v_or_b32_e32 v5, v4, v6 ; GCN-IR-NEXT: .LBB12_6: ; %Flow4 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] -; GCN-IR-NEXT: v_mul_lo_u32 v2, v0, v5 -; GCN-IR-NEXT: v_mul_hi_u32 v3, v0, v4 -; GCN-IR-NEXT: v_mul_lo_u32 v1, v1, v4 -; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, v4 +; GCN-IR-NEXT: v_mul_lo_u32 v2, v0, v2 +; GCN-IR-NEXT: v_mul_hi_u32 v3, v0, v5 +; GCN-IR-NEXT: v_mul_lo_u32 v1, v1, v5 +; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, v5 ; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, 0x8000, v0 diff --git a/llvm/test/CodeGen/AMDGPU/strict_fadd.f16.ll b/llvm/test/CodeGen/AMDGPU/strict_fadd.f16.ll --- a/llvm/test/CodeGen/AMDGPU/strict_fadd.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/strict_fadd.f16.ll @@ -186,13 +186,12 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v4, 0xffff -; GFX10-NEXT: v_add_f16_e32 v5, v0, v2 -; GFX10-NEXT: v_add_f16_e32 v6, v1, v3 +; GFX10-NEXT: v_add_f16_e32 v4, v0, v2 +; GFX10-NEXT: v_add_f16_e32 v5, v1, v3 ; GFX10-NEXT: v_add_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX10-NEXT: v_add_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_and_b32_e32 v2, v4, v5 -; GFX10-NEXT: v_and_b32_e32 v3, v4, v6 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v4 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v5 ; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v2 ; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll b/llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll --- a/llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll @@ -132,14 +132,13 @@ ; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v0 ; GFX10-NEXT: v_fmac_f16_e32 v4, v0, v2 -; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX10-NEXT: v_fmac_f16_e32 v5, v1, v3 ; GFX10-NEXT: v_fmac_f16_e32 v6, v8, v7 ; GFX10-NEXT: v_fmac_f16_e32 v9, v11, v10 -; GFX10-NEXT: v_and_b32_e32 v1, v0, v4 -; GFX10-NEXT: v_and_b32_e32 v2, v0, v5 -; GFX10-NEXT: v_lshl_or_b32 v0, v9, 16, v1 -; GFX10-NEXT: v_lshl_or_b32 v1, v6, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; GFX10-NEXT: v_lshl_or_b32 v0, v9, 16, v0 +; GFX10-NEXT: v_lshl_or_b32 v1, v6, 16, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> %x, <4 x half> %y, <4 x half> %z, metadata !"round.tonearest", metadata !"fpexcept.strict") ret <4 x half> %val diff --git a/llvm/test/CodeGen/AMDGPU/strict_fmul.f16.ll b/llvm/test/CodeGen/AMDGPU/strict_fmul.f16.ll --- a/llvm/test/CodeGen/AMDGPU/strict_fmul.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/strict_fmul.f16.ll @@ -186,13 +186,12 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v4, 0xffff -; GFX10-NEXT: v_mul_f16_e32 v5, v0, v2 -; GFX10-NEXT: v_mul_f16_e32 v6, v1, v3 +; GFX10-NEXT: v_mul_f16_e32 v4, v0, v2 +; GFX10-NEXT: v_mul_f16_e32 v5, v1, v3 ; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_and_b32_e32 v2, v4, v5 -; GFX10-NEXT: v_and_b32_e32 v3, v4, v6 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v4 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v5 ; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v2 ; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll b/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll --- a/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll @@ -206,13 +206,12 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v4, 0xffff -; GFX10-NEXT: v_sub_f16_e32 v5, v0, v2 -; GFX10-NEXT: v_sub_f16_e32 v6, v1, v3 +; GFX10-NEXT: v_sub_f16_e32 v4, v0, v2 +; GFX10-NEXT: v_sub_f16_e32 v5, v1, v3 ; GFX10-NEXT: v_sub_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX10-NEXT: v_sub_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_and_b32_e32 v2, v4, v5 -; GFX10-NEXT: v_and_b32_e32 v3, v4, v6 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v4 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v5 ; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v2 ; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll @@ -626,7 +626,7 @@ ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: v_pk_sub_i16 v2, v1, v2 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_and_b32_e32 v0, v3, v2 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v2 ; GFX10-NEXT: v_and_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; GFX10-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 diff --git a/llvm/test/CodeGen/AMDGPU/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/uaddsat.ll --- a/llvm/test/CodeGen/AMDGPU/uaddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/uaddsat.ll @@ -33,11 +33,10 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_movk_i32 s4, 0xff -; GFX10-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX10-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX10-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX10-NEXT: v_add_nc_u16 v0, v0, v1 -; GFX10-NEXT: v_min_u16 v0, v0, s4 +; GFX10-NEXT: v_min_u16 v0, 0xff, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call i8 @llvm.uadd.sat.i8(i8 %lhs, i8 %rhs) ret i8 %result diff --git a/llvm/test/CodeGen/AMDGPU/udiv.ll b/llvm/test/CodeGen/AMDGPU/udiv.ll --- a/llvm/test/CodeGen/AMDGPU/udiv.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv.ll @@ -500,7 +500,6 @@ ; GFX1030: ; %bb.0: ; GFX1030-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX1030-NEXT: v_mov_b32_e32 v4, 0 -; GFX1030-NEXT: s_mov_b32 s0, 0x4f7ffffe ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7] ; GFX1030-NEXT: s_waitcnt vmcnt(0) @@ -510,8 +509,8 @@ ; GFX1030-NEXT: v_sub_nc_u32_e32 v8, 0, v3 ; GFX1030-NEXT: v_rcp_iflag_f32_e32 v5, v5 ; GFX1030-NEXT: v_rcp_iflag_f32_e32 v6, v6 -; GFX1030-NEXT: v_mul_f32_e32 v5, s0, v5 -; GFX1030-NEXT: v_mul_f32_e32 v6, s0, v6 +; GFX1030-NEXT: v_mul_f32_e32 v5, 0x4f7ffffe, v5 +; GFX1030-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 ; GFX1030-NEXT: v_cvt_u32_f32_e32 v5, v5 ; GFX1030-NEXT: v_cvt_u32_f32_e32 v6, v6 ; GFX1030-NEXT: v_mul_lo_u32 v7, v7, v5 @@ -893,7 +892,6 @@ ; GFX1030: ; %bb.0: ; GFX1030-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX1030-NEXT: v_mov_b32_e32 v8, 0 -; GFX1030-NEXT: s_mov_b32 s0, 0x4f7ffffe ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: s_clause 0x1 ; GFX1030-NEXT: global_load_dwordx4 v[0:3], v8, s[6:7] offset:16 @@ -911,10 +909,10 @@ ; GFX1030-NEXT: v_sub_nc_u32_e32 v14, 0, v1 ; GFX1030-NEXT: v_sub_nc_u32_e32 v15, 0, v2 ; GFX1030-NEXT: v_sub_nc_u32_e32 v16, 0, v3 -; GFX1030-NEXT: v_mul_f32_e32 v9, s0, v9 -; GFX1030-NEXT: v_mul_f32_e32 v10, s0, v10 -; GFX1030-NEXT: v_mul_f32_e32 v11, s0, v11 -; GFX1030-NEXT: v_mul_f32_e32 v12, s0, v12 +; GFX1030-NEXT: v_mul_f32_e32 v9, 0x4f7ffffe, v9 +; GFX1030-NEXT: v_mul_f32_e32 v10, 0x4f7ffffe, v10 +; GFX1030-NEXT: v_mul_f32_e32 v11, 0x4f7ffffe, v11 +; GFX1030-NEXT: v_mul_f32_e32 v12, 0x4f7ffffe, v12 ; GFX1030-NEXT: v_cvt_u32_f32_e32 v9, v9 ; GFX1030-NEXT: v_cvt_u32_f32_e32 v10, v10 ; GFX1030-NEXT: v_cvt_u32_f32_e32 v11, v11 @@ -2154,16 +2152,15 @@ ; GFX1030-NEXT: v_mov_b32_e32 v4, 0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1] -; GFX1030-NEXT: s_mov_b32 s0, 0x1389c755 ; GFX1030-NEXT: s_waitcnt vmcnt(0) ; GFX1030-NEXT: v_lshrrev_b32_e32 v0, 2, v0 ; GFX1030-NEXT: v_lshrrev_b32_e32 v1, 2, v1 ; GFX1030-NEXT: v_lshrrev_b32_e32 v2, 2, v2 ; GFX1030-NEXT: v_lshrrev_b32_e32 v3, 2, v3 -; GFX1030-NEXT: v_mul_hi_u32 v0, v0, s0 -; GFX1030-NEXT: v_mul_hi_u32 v1, v1, s0 -; GFX1030-NEXT: v_mul_hi_u32 v2, v2, s0 -; GFX1030-NEXT: v_mul_hi_u32 v3, v3, s0 +; GFX1030-NEXT: v_mul_hi_u32 v0, 0x1389c755, v0 +; GFX1030-NEXT: v_mul_hi_u32 v1, 0x1389c755, v1 +; GFX1030-NEXT: v_mul_hi_u32 v2, 0x1389c755, v2 +; GFX1030-NEXT: v_mul_hi_u32 v3, 0x1389c755, v3 ; GFX1030-NEXT: v_lshrrev_b32_e32 v0, 10, v0 ; GFX1030-NEXT: v_lshrrev_b32_e32 v1, 10, v1 ; GFX1030-NEXT: v_lshrrev_b32_e32 v2, 10, v2 @@ -2499,7 +2496,6 @@ ; SI-NEXT: v_madak_f32 v2, 0, v2, 0x47c35000 ; SI-NEXT: v_rcp_f32_e32 v2, v2 ; SI-NEXT: s_mov_b32 s4, 0xfffe7960 -; SI-NEXT: v_mov_b32_e32 v8, 0 ; SI-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; SI-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; SI-NEXT: v_trunc_f32_e32 v3, v3 @@ -2513,16 +2509,16 @@ ; SI-NEXT: v_add_i32_e32 v4, vcc, v6, v4 ; SI-NEXT: v_mul_hi_u32 v7, v2, v5 ; SI-NEXT: v_mul_lo_u32 v6, v2, v4 -; SI-NEXT: v_mul_hi_u32 v9, v2, v4 -; SI-NEXT: v_mul_hi_u32 v10, v3, v4 +; SI-NEXT: v_mul_hi_u32 v8, v2, v4 +; SI-NEXT: v_mul_hi_u32 v9, v3, v4 ; SI-NEXT: v_mul_lo_u32 v4, v3, v4 ; SI-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v9, vcc -; SI-NEXT: v_mul_lo_u32 v9, v3, v5 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc +; SI-NEXT: v_mul_lo_u32 v8, v3, v5 ; SI-NEXT: v_mul_hi_u32 v5, v3, v5 -; SI-NEXT: v_add_i32_e32 v6, vcc, v6, v9 +; SI-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; SI-NEXT: v_addc_u32_e32 v5, vcc, v7, v5, vcc -; SI-NEXT: v_addc_u32_e32 v6, vcc, v10, v8, vcc +; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v9, vcc ; SI-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc ; SI-NEXT: v_add_i32_e32 v2, vcc, v2, v4 @@ -2535,16 +2531,16 @@ ; SI-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; SI-NEXT: v_mul_lo_u32 v5, v2, v4 ; SI-NEXT: v_mul_hi_u32 v7, v2, v6 -; SI-NEXT: v_mul_hi_u32 v9, v2, v4 -; SI-NEXT: v_mul_hi_u32 v10, v3, v4 +; SI-NEXT: v_mul_hi_u32 v8, v2, v4 +; SI-NEXT: v_mul_hi_u32 v9, v3, v4 ; SI-NEXT: v_mul_lo_u32 v4, v3, v4 ; SI-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v9, vcc -; SI-NEXT: v_mul_lo_u32 v9, v3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc +; SI-NEXT: v_mul_lo_u32 v8, v3, v6 ; SI-NEXT: v_mul_hi_u32 v6, v3, v6 -; SI-NEXT: v_add_i32_e32 v5, vcc, v5, v9 +; SI-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; SI-NEXT: v_addc_u32_e32 v5, vcc, v7, v6, vcc -; SI-NEXT: v_addc_u32_e32 v6, vcc, v10, v8, vcc +; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v9, vcc ; SI-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc ; SI-NEXT: v_add_i32_e32 v2, vcc, v2, v4 @@ -2560,18 +2556,18 @@ ; SI-NEXT: v_mul_hi_u32 v2, v1, v2 ; SI-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; SI-NEXT: v_addc_u32_e32 v2, vcc, v5, v2, vcc -; SI-NEXT: v_addc_u32_e32 v4, vcc, v7, v8, vcc +; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc ; SI-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; SI-NEXT: v_mul_lo_u32 v4, v3, s4 ; SI-NEXT: v_mul_hi_u32 v5, v2, s4 ; SI-NEXT: v_mul_lo_u32 v6, v2, s4 +; SI-NEXT: s_mov_b32 s4, 0x1869f ; SI-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; SI-NEXT: v_sub_i32_e32 v0, vcc, v0, v6 ; SI-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc -; SI-NEXT: v_subrev_i32_e32 v4, vcc, s4, v0 +; SI-NEXT: v_subrev_i32_e32 v4, vcc, 0x186a0, v0 ; SI-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v1, vcc -; SI-NEXT: s_mov_b32 s4, 0x1869f ; SI-NEXT: v_cmp_lt_u32_e32 vcc, s4, v4 ; SI-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 @@ -2599,7 +2595,6 @@ ; VI-NEXT: v_madak_f32 v2, 0, v2, 0x47c35000 ; VI-NEXT: v_rcp_f32_e32 v2, v2 ; VI-NEXT: s_mov_b32 s6, 0xfffe7960 -; VI-NEXT: v_mov_b32_e32 v9, 0 ; VI-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; VI-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; VI-NEXT: v_trunc_f32_e32 v3, v3 @@ -2612,31 +2607,30 @@ ; VI-NEXT: v_add_u32_e32 v8, vcc, v4, v3 ; VI-NEXT: v_mul_hi_u32 v5, v6, v2 ; VI-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v8, 0 -; VI-NEXT: v_add_u32_e32 v10, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v9, vcc, v5, v3 ; VI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v2, 0 -; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v4, vcc +; VI-NEXT: v_addc_u32_e32 v10, vcc, 0, v4, vcc ; VI-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v8, 0 -; VI-NEXT: v_add_u32_e32 v2, vcc, v10, v2 -; VI-NEXT: v_addc_u32_e32 v2, vcc, v11, v3, vcc -; VI-NEXT: v_addc_u32_e32 v3, vcc, v5, v9, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, v9, v2 +; VI-NEXT: v_addc_u32_e32 v2, vcc, v10, v3, vcc +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v2 ; VI-NEXT: v_addc_u32_e32 v7, vcc, v7, v3, vcc ; VI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, s6, 0 ; VI-NEXT: v_mul_lo_u32 v4, v7, s6 -; VI-NEXT: s_mov_b32 s6, 0x186a0 ; VI-NEXT: v_subrev_u32_e32 v3, vcc, v6, v3 ; VI-NEXT: v_add_u32_e32 v5, vcc, v3, v4 ; VI-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v5, 0 ; VI-NEXT: v_mul_hi_u32 v8, v6, v2 ; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v3 ; VI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v2, 0 -; VI-NEXT: v_addc_u32_e32 v10, vcc, 0, v4, vcc +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v4, vcc ; VI-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v5, 0 ; VI-NEXT: v_add_u32_e32 v2, vcc, v8, v2 -; VI-NEXT: v_addc_u32_e32 v2, vcc, v10, v3, vcc -; VI-NEXT: v_addc_u32_e32 v3, vcc, v5, v9, vcc +; VI-NEXT: v_addc_u32_e32 v2, vcc, v9, v3, vcc +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: v_add_u32_e32 v4, vcc, v6, v2 @@ -2649,16 +2643,17 @@ ; VI-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v5, 0 ; VI-NEXT: v_add_u32_e32 v2, vcc, v6, v2 ; VI-NEXT: v_addc_u32_e32 v2, vcc, v7, v3, vcc -; VI-NEXT: v_addc_u32_e32 v3, vcc, v5, v9, vcc +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc ; VI-NEXT: v_add_u32_e32 v4, vcc, v2, v4 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v3, vcc -; VI-NEXT: v_mul_lo_u32 v6, v5, s6 -; VI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, s6, 0 +; VI-NEXT: s_mov_b32 s4, 0x186a0 +; VI-NEXT: v_mul_lo_u32 v6, v5, s4 +; VI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, s4, 0 ; VI-NEXT: s_mov_b32 s4, 0x1869f ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v6 ; VI-NEXT: v_sub_u32_e32 v0, vcc, v0, v2 ; VI-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc -; VI-NEXT: v_subrev_u32_e32 v2, vcc, s6, v0 +; VI-NEXT: v_subrev_u32_e32 v2, vcc, 0x186a0, v0 ; VI-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v1, vcc ; VI-NEXT: v_cmp_lt_u32_e32 vcc, s4, v2 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc @@ -2687,7 +2682,6 @@ ; GCN-NEXT: v_madak_f32 v2, 0, v2, 0x47c35000 ; GCN-NEXT: v_rcp_f32_e32 v2, v2 ; GCN-NEXT: s_mov_b32 s6, 0xfffe7960 -; GCN-NEXT: v_mov_b32_e32 v9, 0 ; GCN-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; GCN-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; GCN-NEXT: v_trunc_f32_e32 v3, v3 @@ -2700,31 +2694,30 @@ ; GCN-NEXT: v_add_u32_e32 v8, vcc, v4, v3 ; GCN-NEXT: v_mul_hi_u32 v5, v6, v2 ; GCN-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v8, 0 -; GCN-NEXT: v_add_u32_e32 v10, vcc, v5, v3 +; GCN-NEXT: v_add_u32_e32 v9, vcc, v5, v3 ; GCN-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v2, 0 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v4, vcc +; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v4, vcc ; GCN-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v8, 0 -; GCN-NEXT: v_add_u32_e32 v2, vcc, v10, v2 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, v11, v3, vcc -; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v9, vcc +; GCN-NEXT: v_add_u32_e32 v2, vcc, v9, v2 +; GCN-NEXT: v_addc_u32_e32 v2, vcc, v10, v3, vcc +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc ; GCN-NEXT: v_add_u32_e32 v2, vcc, v2, v4 ; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GCN-NEXT: v_add_u32_e32 v6, vcc, v6, v2 ; GCN-NEXT: v_addc_u32_e32 v7, vcc, v7, v3, vcc ; GCN-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, s6, 0 ; GCN-NEXT: v_mul_lo_u32 v4, v7, s6 -; GCN-NEXT: s_mov_b32 s6, 0x186a0 ; GCN-NEXT: v_subrev_u32_e32 v3, vcc, v6, v3 ; GCN-NEXT: v_add_u32_e32 v5, vcc, v3, v4 ; GCN-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v5, 0 ; GCN-NEXT: v_mul_hi_u32 v8, v6, v2 ; GCN-NEXT: v_add_u32_e32 v8, vcc, v8, v3 ; GCN-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v2, 0 -; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v4, vcc +; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v4, vcc ; GCN-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v5, 0 ; GCN-NEXT: v_add_u32_e32 v2, vcc, v8, v2 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, v10, v3, vcc -; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v9, vcc +; GCN-NEXT: v_addc_u32_e32 v2, vcc, v9, v3, vcc +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc ; GCN-NEXT: v_add_u32_e32 v2, vcc, v2, v4 ; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GCN-NEXT: v_add_u32_e32 v4, vcc, v6, v2 @@ -2737,16 +2730,17 @@ ; GCN-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v5, 0 ; GCN-NEXT: v_add_u32_e32 v2, vcc, v6, v2 ; GCN-NEXT: v_addc_u32_e32 v2, vcc, v7, v3, vcc -; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v9, vcc +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc ; GCN-NEXT: v_add_u32_e32 v4, vcc, v2, v4 ; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v3, vcc -; GCN-NEXT: v_mul_lo_u32 v6, v5, s6 -; GCN-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, s6, 0 +; GCN-NEXT: s_mov_b32 s4, 0x186a0 +; GCN-NEXT: v_mul_lo_u32 v6, v5, s4 +; GCN-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, s4, 0 ; GCN-NEXT: s_mov_b32 s4, 0x1869f ; GCN-NEXT: v_add_u32_e32 v3, vcc, v3, v6 ; GCN-NEXT: v_sub_u32_e32 v0, vcc, v0, v2 ; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc -; GCN-NEXT: v_subrev_u32_e32 v2, vcc, s6, v0 +; GCN-NEXT: v_subrev_u32_e32 v2, vcc, 0x186a0, v0 ; GCN-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v1, vcc ; GCN-NEXT: v_cmp_lt_u32_e32 vcc, s4, v2 ; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc @@ -2773,15 +2767,14 @@ ; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1030-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1030-NEXT: s_mov_b32 s4, 0x346d900 -; GFX1030-NEXT: s_mov_b32 s5, 0xfffe7960 ; GFX1030-NEXT: s_add_u32 s4, 0x4237, s4 -; GFX1030-NEXT: s_addc_u32 s6, 0, 0 +; GFX1030-NEXT: s_addc_u32 s5, 0, 0 ; GFX1030-NEXT: v_add_co_u32 v2, s4, 0xa9000000, s4 ; GFX1030-NEXT: s_cmpk_lg_u32 s4, 0x0 -; GFX1030-NEXT: s_addc_u32 s4, s6, 0xa7c5 -; GFX1030-NEXT: v_mul_hi_u32 v3, v2, s5 -; GFX1030-NEXT: v_mul_lo_u32 v4, v2, s5 -; GFX1030-NEXT: s_mul_i32 s5, s4, s5 +; GFX1030-NEXT: s_addc_u32 s4, s5, 0xa7c5 +; GFX1030-NEXT: v_mul_hi_u32 v3, 0xfffe7960, v2 +; GFX1030-NEXT: v_mul_lo_u32 v4, 0xfffe7960, v2 +; GFX1030-NEXT: s_mul_i32 s5, s4, 0xfffe7960 ; GFX1030-NEXT: v_sub_nc_u32_e32 v3, v3, v2 ; GFX1030-NEXT: v_mul_hi_u32 v5, v2, v4 ; GFX1030-NEXT: v_mul_hi_u32 v8, s4, v4 @@ -2804,7 +2797,6 @@ ; GFX1030-NEXT: v_mad_u64_u32 v[4:5], s4, v1, v5, 0 ; GFX1030-NEXT: v_mad_u64_u32 v[2:3], s4, v0, v6, 0 ; GFX1030-NEXT: v_mad_u64_u32 v[6:7], s4, v1, v6, 0 -; GFX1030-NEXT: s_mov_b32 s4, 0x186a0 ; GFX1030-NEXT: v_add_co_u32 v2, vcc_lo, v8, v2 ; GFX1030-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo ; GFX1030-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 @@ -2812,20 +2804,19 @@ ; GFX1030-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v7, vcc_lo ; GFX1030-NEXT: v_add_co_u32 v4, vcc_lo, v2, v6 ; GFX1030-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v3, vcc_lo -; GFX1030-NEXT: v_mad_u64_u32 v[2:3], s5, v4, s4, 0 -; GFX1030-NEXT: v_mul_lo_u32 v6, v5, s4 +; GFX1030-NEXT: v_mad_u64_u32 v[2:3], s4, 0x186a0, v4, 0 +; GFX1030-NEXT: v_mul_lo_u32 v6, 0x186a0, v5 ; GFX1030-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v2 ; GFX1030-NEXT: v_add_nc_u32_e32 v3, v3, v6 ; GFX1030-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo -; GFX1030-NEXT: v_sub_co_u32 v2, vcc_lo, v0, s4 -; GFX1030-NEXT: s_mov_b32 s4, 0x1869f +; GFX1030-NEXT: v_subrev_co_u32 v2, vcc_lo, 0x186a0, v0 ; GFX1030-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo -; GFX1030-NEXT: v_cmp_lt_u32_e32 vcc_lo, s4, v2 +; GFX1030-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x1869f, v2 +; GFX1030-NEXT: v_cmp_eq_u32_e64 s4, 0, v1 ; GFX1030-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc_lo ; GFX1030-NEXT: v_add_co_u32 v6, vcc_lo, v4, 2 ; GFX1030-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v5, vcc_lo -; GFX1030-NEXT: v_cmp_lt_u32_e32 vcc_lo, s4, v0 -; GFX1030-NEXT: v_cmp_eq_u32_e64 s4, 0, v1 +; GFX1030-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x1869f, v0 ; GFX1030-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo ; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1030-NEXT: v_cndmask_b32_e64 v0, -1, v0, s4 diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll --- a/llvm/test/CodeGen/AMDGPU/udiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll @@ -16,73 +16,72 @@ ; GCN-NEXT: s_subb_u32 s5, 0, s9 ; GCN-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; GCN-NEXT: v_mul_f32_e32 v2, 0x2f800000, v0 -; GCN-NEXT: v_trunc_f32_e32 v2, v2 -; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v2 -; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; GCN-NEXT: v_trunc_f32_e32 v1, v1 +; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 +; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_lo_u32 v3, s4, v2 -; GCN-NEXT: v_mul_hi_u32 v4, s4, v0 -; GCN-NEXT: v_mul_lo_u32 v6, s5, v0 -; GCN-NEXT: v_mul_lo_u32 v5, s4, v0 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GCN-NEXT: v_mul_hi_u32 v4, v0, v5 -; GCN-NEXT: v_mul_lo_u32 v6, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v8, v0, v3 -; GCN-NEXT: v_mul_lo_u32 v7, v2, v5 -; GCN-NEXT: v_mul_hi_u32 v5, v2, v5 -; GCN-NEXT: v_mul_hi_u32 v9, v2, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v8, vcc -; GCN-NEXT: v_mul_lo_u32 v3, v2, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v6, v5, vcc -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v9, v1, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc -; GCN-NEXT: v_mul_lo_u32 v3, s4, v2 -; GCN-NEXT: v_mul_hi_u32 v4, s4, v0 +; GCN-NEXT: v_mul_lo_u32 v2, s4, v1 +; GCN-NEXT: v_mul_hi_u32 v3, s4, v0 ; GCN-NEXT: v_mul_lo_u32 v5, s5, v0 -; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; GCN-NEXT: v_mul_lo_u32 v4, s4, v0 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GCN-NEXT: v_mul_lo_u32 v7, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v8, v0, v4 -; GCN-NEXT: v_mul_hi_u32 v9, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v6, v2, v4 -; GCN-NEXT: v_mul_lo_u32 v4, v2, v4 -; GCN-NEXT: v_mul_hi_u32 v5, v2, v3 -; GCN-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v9, vcc -; GCN-NEXT: v_mul_lo_u32 v3, v2, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v8, v6, vcc -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v5, v1, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc -; GCN-NEXT: v_mul_lo_u32 v3, s2, v2 -; GCN-NEXT: v_mul_hi_u32 v4, s2, v0 -; GCN-NEXT: v_mul_hi_u32 v5, s2, v2 -; GCN-NEXT: v_mul_hi_u32 v6, s3, v2 -; GCN-NEXT: v_mul_lo_u32 v2, s3, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc -; GCN-NEXT: v_mul_lo_u32 v5, s3, v0 -; GCN-NEXT: v_mul_hi_u32 v0, s3, v0 -; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; GCN-NEXT: v_mul_hi_u32 v3, v0, v4 +; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v7, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v6, v1, v4 +; GCN-NEXT: v_mul_lo_u32 v4, v1, v4 +; GCN-NEXT: v_mul_hi_u32 v8, v1, v2 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GCN-NEXT: v_addc_u32_e32 v0, vcc, v4, v0, vcc -; GCN-NEXT: v_addc_u32_e32 v1, vcc, v6, v1, vcc +; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v6, vcc +; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GCN-NEXT: v_mul_lo_u32 v2, s4, v1 +; GCN-NEXT: v_mul_hi_u32 v3, s4, v0 +; GCN-NEXT: v_mul_lo_u32 v4, s5, v0 +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_mul_lo_u32 v3, s4, v0 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GCN-NEXT: v_mul_lo_u32 v6, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v7, v0, v3 +; GCN-NEXT: v_mul_hi_u32 v8, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v5, v1, v3 +; GCN-NEXT: v_mul_lo_u32 v3, v1, v3 +; GCN-NEXT: v_mul_hi_u32 v4, v1, v2 +; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc +; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v6, v3 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc +; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GCN-NEXT: v_mul_lo_u32 v2, s2, v1 +; GCN-NEXT: v_mul_hi_u32 v3, s2, v0 +; GCN-NEXT: v_mul_hi_u32 v4, s2, v1 +; GCN-NEXT: v_mul_hi_u32 v5, s3, v1 +; GCN-NEXT: v_mul_lo_u32 v1, s3, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GCN-NEXT: v_mul_lo_u32 v4, s3, v0 +; GCN-NEXT: v_mul_hi_u32 v0, s3, v0 +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc +; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc ; GCN-NEXT: v_mul_lo_u32 v2, s8, v1 ; GCN-NEXT: v_mul_hi_u32 v3, s8, v0 ; GCN-NEXT: v_mul_lo_u32 v4, s9, v0 @@ -159,12 +158,11 @@ ; GCN-IR-NEXT: s_cbranch_vccz .LBB0_4 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: s_lshr_b64 s[12:13], s[2:3], s12 -; GCN-IR-NEXT: s_add_u32 s16, s4, -1 -; GCN-IR-NEXT: s_addc_u32 s17, s5, -1 +; GCN-IR-NEXT: s_add_u32 s15, s4, -1 +; GCN-IR-NEXT: s_addc_u32 s16, s5, -1 ; GCN-IR-NEXT: s_not_b64 s[2:3], s[10:11] ; GCN-IR-NEXT: s_add_u32 s2, s2, s14 -; GCN-IR-NEXT: s_mov_b32 s15, s11 -; GCN-IR-NEXT: s_addc_u32 s3, s3, s11 +; GCN-IR-NEXT: s_addc_u32 s3, s3, 0 ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: s_mov_b32 s7, 0 ; GCN-IR-NEXT: .LBB0_3: ; %udiv-do-while @@ -174,8 +172,8 @@ ; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[8:9], 1 ; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[6:7] ; GCN-IR-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9] -; GCN-IR-NEXT: s_sub_u32 s6, s16, s12 -; GCN-IR-NEXT: s_subb_u32 s6, s17, s13 +; GCN-IR-NEXT: s_sub_u32 s6, s15, s12 +; GCN-IR-NEXT: s_subb_u32 s6, s16, s13 ; GCN-IR-NEXT: s_ashr_i32 s10, s6, 31 ; GCN-IR-NEXT: s_mov_b32 s11, s10 ; GCN-IR-NEXT: s_and_b32 s6, s10, 1 @@ -184,9 +182,9 @@ ; GCN-IR-NEXT: s_subb_u32 s13, s13, s11 ; GCN-IR-NEXT: s_add_u32 s2, s2, 1 ; GCN-IR-NEXT: s_addc_u32 s3, s3, 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[14:15], s[2:3], 0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[18:19], s[2:3], 0 ; GCN-IR-NEXT: s_mov_b64 s[10:11], s[6:7] -; GCN-IR-NEXT: s_and_b64 vcc, exec, s[14:15] +; GCN-IR-NEXT: s_and_b64 vcc, exec, s[18:19] ; GCN-IR-NEXT: s_cbranch_vccz .LBB0_3 ; GCN-IR-NEXT: .LBB0_4: ; %Flow6 ; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[8:9], 1 @@ -219,7 +217,6 @@ ; GCN-NEXT: v_subb_u32_e32 v7, vcc, 0, v3, vcc ; GCN-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 ; GCN-NEXT: v_rcp_f32_e32 v4, v4 -; GCN-NEXT: v_mov_b32_e32 v13, 0 ; GCN-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; GCN-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 ; GCN-NEXT: v_trunc_f32_e32 v5, v5 @@ -235,7 +232,7 @@ ; GCN-NEXT: v_mul_lo_u32 v10, v4, v8 ; GCN-NEXT: v_mul_hi_u32 v11, v4, v9 ; GCN-NEXT: v_mul_hi_u32 v12, v4, v8 -; GCN-NEXT: v_mul_hi_u32 v14, v5, v8 +; GCN-NEXT: v_mul_hi_u32 v13, v5, v8 ; GCN-NEXT: v_mul_lo_u32 v8, v5, v8 ; GCN-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v12, vcc @@ -243,7 +240,7 @@ ; GCN-NEXT: v_mul_hi_u32 v9, v5, v9 ; GCN-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; GCN-NEXT: v_addc_u32_e32 v9, vcc, v11, v9, vcc -; GCN-NEXT: v_addc_u32_e32 v10, vcc, v14, v13, vcc +; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v13, vcc ; GCN-NEXT: v_add_i32_e32 v8, vcc, v9, v8 ; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc ; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v8 @@ -265,7 +262,7 @@ ; GCN-NEXT: v_mul_lo_u32 v7, v5, v7 ; GCN-NEXT: v_add_i32_e32 v6, vcc, v10, v6 ; GCN-NEXT: v_addc_u32_e32 v6, vcc, v11, v9, vcc -; GCN-NEXT: v_addc_u32_e32 v8, vcc, v8, v13, vcc +; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc ; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v7 ; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc ; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v6 @@ -281,7 +278,7 @@ ; GCN-NEXT: v_mul_hi_u32 v4, v1, v4 ; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; GCN-NEXT: v_addc_u32_e32 v4, vcc, v7, v4, vcc -; GCN-NEXT: v_addc_u32_e32 v6, vcc, v9, v13, vcc +; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v9, vcc ; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc ; GCN-NEXT: v_mul_lo_u32 v6, v2, v5 @@ -323,35 +320,33 @@ ; GCN-IR-LABEL: v_test_udiv_i64: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases ; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] ; GCN-IR-NEXT: v_ffbh_u32_e32 v4, v2 -; GCN-IR-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 32, v4 +; GCN-IR-NEXT: v_add_i32_e64 v4, s[6:7], 32, v4 ; GCN-IR-NEXT: v_ffbh_u32_e32 v5, v3 ; GCN-IR-NEXT: v_min_u32_e32 v8, v4, v5 ; GCN-IR-NEXT: v_ffbh_u32_e32 v4, v0 -; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 32, v4 +; GCN-IR-NEXT: v_add_i32_e64 v4, s[6:7], 32, v4 ; GCN-IR-NEXT: v_ffbh_u32_e32 v5, v1 -; GCN-IR-NEXT: v_min_u32_e32 v10, v4, v5 -; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v8, v10 -; GCN-IR-NEXT: v_subb_u32_e64 v7, s[6:7], 0, 0, vcc -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[6:7] -; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 -; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc +; GCN-IR-NEXT: v_min_u32_e32 v9, v4, v5 +; GCN-IR-NEXT: v_sub_i32_e64 v6, s[6:7], v8, v9 +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] +; GCN-IR-NEXT: v_subb_u32_e64 v7, s[6:7], 0, 0, s[6:7] +; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[6:7], 63, v[6:7] +; GCN-IR-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] ; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[6:7] ; GCN-IR-NEXT: s_xor_b64 s[6:7], s[4:5], -1 -; GCN-IR-NEXT: v_mov_b32_e32 v11, v9 ; GCN-IR-NEXT: v_cndmask_b32_e64 v4, v1, 0, s[4:5] ; GCN-IR-NEXT: v_cndmask_b32_e64 v5, v0, 0, s[4:5] ; GCN-IR-NEXT: s_and_b64 s[4:5], s[6:7], vcc ; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GCN-IR-NEXT: s_cbranch_execz .LBB1_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, 1, v6 -; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, 0, v7, vcc +; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, 1, v6 +; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, 0, v7, vcc ; GCN-IR-NEXT: v_sub_i32_e64 v4, s[4:5], 63, v6 -; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[12:13] +; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] ; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[0:1], v4 ; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 @@ -359,25 +354,25 @@ ; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; GCN-IR-NEXT: s_cbranch_execz .LBB1_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, -1, v2 -; GCN-IR-NEXT: v_lshr_b64 v[12:13], v[0:1], v12 -; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, -1, v3, vcc +; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, -1, v2 +; GCN-IR-NEXT: v_lshr_b64 v[10:11], v[0:1], v10 +; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, -1, v3, vcc ; GCN-IR-NEXT: v_not_b32_e32 v0, v8 -; GCN-IR-NEXT: v_not_b32_e32 v1, v9 -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v10 +; GCN-IR-NEXT: v_not_b32_e32 v1, 0 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v9 ; GCN-IR-NEXT: v_mov_b32_e32 v8, 0 -; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, v1, v11, vcc +; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 ; GCN-IR-NEXT: .LBB1_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[12:13], 1 +; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 ; GCN-IR-NEXT: v_lshrrev_b32_e32 v6, 31, v5 ; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v6 ; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 -; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v14, v10 -; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, v15, v11, vcc +; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v12, v10 +; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, v13, v11, vcc ; GCN-IR-NEXT: v_or_b32_e32 v4, v8, v4 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v8, 31, v6 ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 1, v0 @@ -387,8 +382,8 @@ ; GCN-IR-NEXT: v_and_b32_e32 v8, v8, v2 ; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN-IR-NEXT: v_sub_i32_e64 v12, s[4:5], v10, v8 -; GCN-IR-NEXT: v_subb_u32_e64 v13, s[4:5], v11, v9, s[4:5] +; GCN-IR-NEXT: v_sub_i32_e64 v10, s[4:5], v10, v8 +; GCN-IR-NEXT: v_subb_u32_e64 v11, s[4:5], v11, v9, s[4:5] ; GCN-IR-NEXT: v_mov_b32_e32 v9, v7 ; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] ; GCN-IR-NEXT: v_mov_b32_e32 v8, v6 @@ -679,52 +674,50 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dword s2, s[0:1], 0xe ; GCN-NEXT: s_load_dword s4, s[0:1], 0xd -; GCN-NEXT: s_mov_b32 s6, 0xffff -; GCN-NEXT: s_mov_b32 s7, 0xff000000 -; GCN-NEXT: v_cvt_f32_ubyte3_e32 v2, s6 +; GCN-NEXT: s_load_dword s6, s[0:1], 0xc +; GCN-NEXT: v_cvt_f32_ubyte3_e32 v2, 0xffff +; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_and_b32 s3, s2, s6 -; GCN-NEXT: s_and_b32 s2, s4, s7 +; GCN-NEXT: s_and_b32 s3, s2, 0xffff +; GCN-NEXT: s_and_b32 s2, s4, 0xff000000 ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_alignbit_b32 v0, s3, v0, 24 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, v0 -; GCN-NEXT: s_load_dword s8, s[0:1], 0xb -; GCN-NEXT: s_load_dword s9, s[0:1], 0xc ; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN-NEXT: s_lshr_b64 s[0:1], s[2:3], 24 +; GCN-NEXT: s_load_dword s0, s[0:1], 0xb +; GCN-NEXT: s_and_b32 s8, s6, 0xffff +; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: v_mac_f32_e32 v1, 0x4f800000, v2 ; GCN-NEXT: v_rcp_f32_e32 v1, v1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_and_b32 s6, s9, s6 -; GCN-NEXT: s_and_b32 s8, s8, s7 +; GCN-NEXT: s_and_b32 s9, s0, 0xff000000 +; GCN-NEXT: s_lshr_b64 s[0:1], s[2:3], 24 ; GCN-NEXT: s_sub_u32 s0, 0, s0 ; GCN-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 ; GCN-NEXT: v_mul_f32_e32 v2, 0x2f800000, v1 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_mac_f32_e32 v1, 0xcf800000, v2 -; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: s_subb_u32 s1, 0, s1 -; GCN-NEXT: v_mov_b32_e32 v8, 0 -; GCN-NEXT: v_mul_hi_u32 v3, s0, v1 -; GCN-NEXT: v_mul_lo_u32 v4, s0, v2 +; GCN-NEXT: v_mul_lo_u32 v3, s0, v2 +; GCN-NEXT: v_mul_hi_u32 v4, s0, v1 ; GCN-NEXT: v_mul_lo_u32 v5, s1, v1 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; GCN-NEXT: v_mul_lo_u32 v4, s0, v1 +; GCN-NEXT: v_mul_lo_u32 v6, s0, v1 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GCN-NEXT: v_mul_lo_u32 v5, v1, v3 -; GCN-NEXT: v_mul_hi_u32 v6, v1, v4 +; GCN-NEXT: v_mul_lo_u32 v4, v1, v3 +; GCN-NEXT: v_mul_hi_u32 v5, v1, v6 ; GCN-NEXT: v_mul_hi_u32 v7, v1, v3 -; GCN-NEXT: v_mul_hi_u32 v9, v2, v3 +; GCN-NEXT: v_mul_hi_u32 v8, v2, v3 ; GCN-NEXT: v_mul_lo_u32 v3, v2, v3 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v7, vcc -; GCN-NEXT: v_mul_lo_u32 v7, v2, v4 -; GCN-NEXT: v_mul_hi_u32 v4, v2, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v6, v4, vcc -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v9, v8, vcc +; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; GCN-NEXT: v_mul_lo_u32 v7, v2, v6 +; GCN-NEXT: v_mul_hi_u32 v6, v2, v6 +; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; GCN-NEXT: v_addc_u32_e32 v4, vcc, v5, v6, vcc +; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v8, vcc ; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc ; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 @@ -736,32 +729,32 @@ ; GCN-NEXT: v_mul_lo_u32 v4, s0, v1 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v5, v3 ; GCN-NEXT: v_mul_lo_u32 v7, v1, v3 -; GCN-NEXT: v_mul_hi_u32 v9, v1, v4 -; GCN-NEXT: v_mul_hi_u32 v10, v1, v3 +; GCN-NEXT: v_mul_hi_u32 v8, v1, v4 +; GCN-NEXT: v_mul_hi_u32 v9, v1, v3 ; GCN-NEXT: v_mul_hi_u32 v6, v2, v4 ; GCN-NEXT: v_mul_lo_u32 v4, v2, v4 ; GCN-NEXT: v_mul_hi_u32 v5, v2, v3 -; GCN-NEXT: v_add_i32_e32 v7, vcc, v9, v7 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc +; GCN-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v9, vcc ; GCN-NEXT: v_mul_lo_u32 v3, v2, v3 ; GCN-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v9, v6, vcc -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v5, v8, vcc +; GCN-NEXT: v_addc_u32_e32 v4, vcc, v8, v6, vcc +; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc ; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; GCN-NEXT: v_mov_b32_e32 v3, s8 +; GCN-NEXT: v_mov_b32_e32 v3, s9 ; GCN-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc -; GCN-NEXT: v_alignbit_b32 v3, s6, v3, 24 +; GCN-NEXT: v_alignbit_b32 v3, s8, v3, 24 ; GCN-NEXT: v_mul_lo_u32 v4, v3, v2 ; GCN-NEXT: v_mul_hi_u32 v1, v3, v1 ; GCN-NEXT: v_mul_hi_u32 v2, v3, v2 -; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: v_mov_b32_e32 v5, 0 ; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v4 ; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc ; GCN-NEXT: v_add_i32_e32 v1, vcc, 0, v1 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc -; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v8, vcc +; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc ; GCN-NEXT: v_add_i32_e32 v1, vcc, 0, v1 ; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc ; GCN-NEXT: v_mul_lo_u32 v6, v0, v2 @@ -797,26 +790,24 @@ ; GCN-IR-LABEL: s_test_udiv24_i48: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases ; GCN-IR-NEXT: s_load_dword s2, s[0:1], 0xc -; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN-IR-NEXT: s_load_dword s3, s[0:1], 0xb +; GCN-IR-NEXT: s_load_dword s4, s[0:1], 0xb +; GCN-IR-NEXT: s_load_dword s5, s[0:1], 0xe ; GCN-IR-NEXT: s_load_dword s6, s[0:1], 0xd -; GCN-IR-NEXT: s_load_dword s7, s[0:1], 0xe -; GCN-IR-NEXT: s_mov_b32 s8, 0xffff +; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_and_b32 s1, s2, s8 -; GCN-IR-NEXT: s_mov_b32 s2, 0xff000000 -; GCN-IR-NEXT: s_and_b32 s0, s3, s2 -; GCN-IR-NEXT: s_and_b32 s3, s7, s8 -; GCN-IR-NEXT: s_and_b32 s2, s6, s2 -; GCN-IR-NEXT: s_lshr_b64 s[8:9], s[0:1], 24 -; GCN-IR-NEXT: s_lshr_b64 s[2:3], s[2:3], 24 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[2:3], 0 +; GCN-IR-NEXT: s_and_b32 s3, s2, 0xffff +; GCN-IR-NEXT: s_and_b32 s2, s4, 0xff000000 +; GCN-IR-NEXT: s_and_b32 s5, s5, 0xffff +; GCN-IR-NEXT: s_and_b32 s4, s6, 0xff000000 +; GCN-IR-NEXT: s_lshr_b64 s[8:9], s[2:3], 24 +; GCN-IR-NEXT: s_lshr_b64 s[4:5], s[4:5], 24 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[4:5], 0 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[8:9], 0 -; GCN-IR-NEXT: s_mov_b64 s[0:1], 0 +; GCN-IR-NEXT: s_mov_b64 s[2:3], 0 ; GCN-IR-NEXT: s_or_b64 s[12:13], s[6:7], s[10:11] -; GCN-IR-NEXT: s_flbit_i32_b32 s6, s2 +; GCN-IR-NEXT: s_flbit_i32_b32 s6, s4 ; GCN-IR-NEXT: s_add_i32 s6, s6, 32 -; GCN-IR-NEXT: s_flbit_i32_b32 s7, s3 +; GCN-IR-NEXT: s_flbit_i32_b32 s7, s5 ; GCN-IR-NEXT: s_min_u32 s10, s6, s7 ; GCN-IR-NEXT: s_flbit_i32_b32 s6, s8 ; GCN-IR-NEXT: s_add_i32 s6, s6, 32 @@ -842,40 +833,39 @@ ; GCN-IR-NEXT: s_cbranch_vccz .LBB7_4 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: s_lshr_b64 s[12:13], s[8:9], s12 -; GCN-IR-NEXT: s_add_u32 s16, s2, -1 -; GCN-IR-NEXT: s_addc_u32 s17, s3, -1 -; GCN-IR-NEXT: s_not_b64 s[0:1], s[10:11] -; GCN-IR-NEXT: s_add_u32 s8, s0, s14 -; GCN-IR-NEXT: s_mov_b32 s15, s11 -; GCN-IR-NEXT: s_addc_u32 s9, s1, s11 +; GCN-IR-NEXT: s_add_u32 s15, s4, -1 +; GCN-IR-NEXT: s_addc_u32 s16, s5, -1 +; GCN-IR-NEXT: s_not_b64 s[2:3], s[10:11] +; GCN-IR-NEXT: s_add_u32 s8, s2, s14 +; GCN-IR-NEXT: s_addc_u32 s9, s3, 0 ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 -; GCN-IR-NEXT: s_mov_b32 s1, 0 +; GCN-IR-NEXT: s_mov_b32 s3, 0 ; GCN-IR-NEXT: .LBB7_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: s_lshl_b64 s[12:13], s[12:13], 1 -; GCN-IR-NEXT: s_lshr_b32 s0, s7, 31 +; GCN-IR-NEXT: s_lshr_b32 s2, s7, 31 ; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[6:7], 1 -; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[0:1] +; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[2:3] ; GCN-IR-NEXT: s_or_b64 s[6:7], s[10:11], s[6:7] -; GCN-IR-NEXT: s_sub_u32 s0, s16, s12 -; GCN-IR-NEXT: s_subb_u32 s0, s17, s13 -; GCN-IR-NEXT: s_ashr_i32 s10, s0, 31 +; GCN-IR-NEXT: s_sub_u32 s2, s15, s12 +; GCN-IR-NEXT: s_subb_u32 s2, s16, s13 +; GCN-IR-NEXT: s_ashr_i32 s10, s2, 31 ; GCN-IR-NEXT: s_mov_b32 s11, s10 -; GCN-IR-NEXT: s_and_b32 s0, s10, 1 -; GCN-IR-NEXT: s_and_b64 s[10:11], s[10:11], s[2:3] +; GCN-IR-NEXT: s_and_b32 s2, s10, 1 +; GCN-IR-NEXT: s_and_b64 s[10:11], s[10:11], s[4:5] ; GCN-IR-NEXT: s_sub_u32 s12, s12, s10 ; GCN-IR-NEXT: s_subb_u32 s13, s13, s11 ; GCN-IR-NEXT: s_add_u32 s8, s8, 1 ; GCN-IR-NEXT: s_addc_u32 s9, s9, 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[14:15], s[8:9], 0 -; GCN-IR-NEXT: s_mov_b64 s[10:11], s[0:1] -; GCN-IR-NEXT: s_and_b64 vcc, exec, s[14:15] +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[18:19], s[8:9], 0 +; GCN-IR-NEXT: s_mov_b64 s[10:11], s[2:3] +; GCN-IR-NEXT: s_and_b64 vcc, exec, s[18:19] ; GCN-IR-NEXT: s_cbranch_vccz .LBB7_3 ; GCN-IR-NEXT: .LBB7_4: ; %Flow3 -; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[6:7], 1 -; GCN-IR-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GCN-IR-NEXT: v_mov_b32_e32 v0, s0 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s1 +; GCN-IR-NEXT: s_lshl_b64 s[4:5], s[6:7], 1 +; GCN-IR-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] +; GCN-IR-NEXT: v_mov_b32_e32 v0, s2 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s3 ; GCN-IR-NEXT: s_branch .LBB7_6 ; GCN-IR-NEXT: .LBB7_5: ; GCN-IR-NEXT: v_mov_b32_e32 v0, s9 @@ -883,10 +873,10 @@ ; GCN-IR-NEXT: v_mov_b32_e32 v0, s8 ; GCN-IR-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[12:13] ; GCN-IR-NEXT: .LBB7_6: ; %udiv-end -; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 -; GCN-IR-NEXT: s_mov_b32 s6, -1 -; GCN-IR-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4 -; GCN-IR-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4 +; GCN-IR-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-IR-NEXT: s_endpgm %1 = lshr i48 %x, 24 %2 = lshr i48 %y, 24 @@ -908,58 +898,57 @@ ; GCN-NEXT: s_subb_u32 s5, 0, s3 ; GCN-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; GCN-NEXT: v_mul_f32_e32 v2, 0x2f800000, v0 -; GCN-NEXT: v_trunc_f32_e32 v2, v2 -; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v2 -; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; GCN-NEXT: v_trunc_f32_e32 v1, v1 +; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 +; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_lo_u32 v3, s4, v2 -; GCN-NEXT: v_mul_hi_u32 v4, s4, v0 -; GCN-NEXT: v_mul_lo_u32 v6, s5, v0 -; GCN-NEXT: v_mul_lo_u32 v5, s4, v0 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GCN-NEXT: v_mul_hi_u32 v4, v0, v5 -; GCN-NEXT: v_mul_lo_u32 v6, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v8, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v7, v2, v5 -; GCN-NEXT: v_mul_lo_u32 v5, v2, v5 -; GCN-NEXT: v_mul_hi_u32 v9, v2, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v8, vcc -; GCN-NEXT: v_mul_lo_u32 v3, v2, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v6, v7, vcc -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v9, v1, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc -; GCN-NEXT: v_mul_lo_u32 v3, s4, v2 -; GCN-NEXT: v_mul_hi_u32 v4, s4, v0 +; GCN-NEXT: v_mul_lo_u32 v2, s4, v1 +; GCN-NEXT: v_mul_hi_u32 v3, s4, v0 ; GCN-NEXT: v_mul_lo_u32 v5, s5, v0 -; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; GCN-NEXT: v_mul_lo_u32 v4, s4, v0 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GCN-NEXT: v_mul_lo_u32 v7, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v8, v0, v4 -; GCN-NEXT: v_mul_hi_u32 v9, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v6, v2, v4 -; GCN-NEXT: v_mul_lo_u32 v4, v2, v4 -; GCN-NEXT: v_mul_hi_u32 v5, v2, v3 -; GCN-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v9, vcc -; GCN-NEXT: v_mul_lo_u32 v3, v2, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v8, v6, vcc -; GCN-NEXT: v_addc_u32_e32 v1, vcc, v5, v1, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; GCN-NEXT: v_mul_hi_u32 v3, v0, v4 +; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v7, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v6, v1, v4 +; GCN-NEXT: v_mul_lo_u32 v4, v1, v4 +; GCN-NEXT: v_mul_hi_u32 v8, v1, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v6, vcc +; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GCN-NEXT: v_mul_lo_u32 v2, s4, v1 +; GCN-NEXT: v_mul_hi_u32 v3, s4, v0 +; GCN-NEXT: v_mul_lo_u32 v4, s5, v0 +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_mul_lo_u32 v3, s4, v0 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GCN-NEXT: v_mul_lo_u32 v6, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v7, v0, v3 +; GCN-NEXT: v_mul_hi_u32 v8, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v5, v1, v3 +; GCN-NEXT: v_mul_lo_u32 v3, v1, v3 +; GCN-NEXT: v_mul_hi_u32 v4, v1, v2 +; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc +; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v6, v3 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc +; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc ; GCN-NEXT: v_mul_lo_u32 v2, v1, 24 ; GCN-NEXT: v_mul_hi_u32 v0, v0, 24 ; GCN-NEXT: v_mul_hi_u32 v1, v1, 24 @@ -1092,7 +1081,6 @@ ; GCN-NEXT: v_subb_u32_e32 v5, vcc, 0, v1, vcc ; GCN-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 ; GCN-NEXT: v_rcp_f32_e32 v2, v2 -; GCN-NEXT: v_mov_b32_e32 v11, 0 ; GCN-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; GCN-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; GCN-NEXT: v_trunc_f32_e32 v3, v3 @@ -1105,18 +1093,18 @@ ; GCN-NEXT: v_mul_lo_u32 v9, v4, v2 ; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6 ; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v8 -; GCN-NEXT: v_mul_lo_u32 v7, v2, v6 -; GCN-NEXT: v_mul_hi_u32 v8, v2, v9 +; GCN-NEXT: v_mul_hi_u32 v7, v2, v9 +; GCN-NEXT: v_mul_lo_u32 v8, v2, v6 ; GCN-NEXT: v_mul_hi_u32 v10, v2, v6 -; GCN-NEXT: v_mul_hi_u32 v12, v3, v6 +; GCN-NEXT: v_mul_hi_u32 v11, v3, v6 ; GCN-NEXT: v_mul_lo_u32 v6, v3, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; GCN-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v10, vcc ; GCN-NEXT: v_mul_lo_u32 v10, v3, v9 ; GCN-NEXT: v_mul_hi_u32 v9, v3, v9 ; GCN-NEXT: v_add_i32_e32 v7, vcc, v7, v10 ; GCN-NEXT: v_addc_u32_e32 v7, vcc, v8, v9, vcc -; GCN-NEXT: v_addc_u32_e32 v8, vcc, v12, v11, vcc +; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v11, vcc ; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6 ; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v6 @@ -1138,7 +1126,7 @@ ; GCN-NEXT: v_mul_lo_u32 v5, v3, v5 ; GCN-NEXT: v_add_i32_e32 v4, vcc, v8, v4 ; GCN-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc -; GCN-NEXT: v_addc_u32_e32 v6, vcc, v6, v11, vcc +; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc ; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 @@ -1189,22 +1177,21 @@ ; GCN-IR-NEXT: v_addc_u32_e64 v5, s[6:7], 0, -1, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[4:5] -; GCN-IR-NEXT: s_mov_b64 s[8:9], 0x8000 -; GCN-IR-NEXT: v_mov_b32_e32 v2, s8 +; GCN-IR-NEXT: v_mov_b32_e32 v3, 0x8000 ; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc ; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 -; GCN-IR-NEXT: v_cndmask_b32_e64 v2, v2, 0, s[4:5] +; GCN-IR-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[4:5] ; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1 -; GCN-IR-NEXT: v_mov_b32_e32 v3, v7 +; GCN-IR-NEXT: v_mov_b32_e32 v2, 0 +; GCN-IR-NEXT: s_mov_b64 s[8:9], 0x8000 ; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], vcc ; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GCN-IR-NEXT: s_cbranch_execz .LBB9_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v4 -; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v5, vcc +; GCN-IR-NEXT: v_add_i32_e32 v7, vcc, 1, v4 +; GCN-IR-NEXT: v_addc_u32_e32 v8, vcc, 0, v5, vcc ; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v4 -; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] +; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[7:8] ; GCN-IR-NEXT: v_lshl_b64 v[2:3], s[8:9], v2 ; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 @@ -1215,10 +1202,10 @@ ; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, -1, v0 ; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000 ; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, -1, v1, vcc -; GCN-IR-NEXT: v_lshr_b64 v[8:9], s[4:5], v8 +; GCN-IR-NEXT: v_lshr_b64 v[8:9], s[4:5], v7 ; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 47, v6 ; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 -; GCN-IR-NEXT: v_subb_u32_e32 v7, vcc, 0, v7, vcc +; GCN-IR-NEXT: v_subb_u32_e64 v7, s[4:5], 0, 0, vcc ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 @@ -1251,12 +1238,12 @@ ; GCN-IR-NEXT: .LBB9_5: ; %Flow3 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[2:3], 1 -; GCN-IR-NEXT: v_or_b32_e32 v3, v5, v1 -; GCN-IR-NEXT: v_or_b32_e32 v2, v4, v0 +; GCN-IR-NEXT: v_or_b32_e32 v2, v5, v1 +; GCN-IR-NEXT: v_or_b32_e32 v3, v4, v0 ; GCN-IR-NEXT: .LBB9_6: ; %Flow4 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] -; GCN-IR-NEXT: v_mov_b32_e32 v0, v2 -; GCN-IR-NEXT: v_mov_b32_e32 v1, v3 +; GCN-IR-NEXT: v_mov_b32_e32 v0, v3 +; GCN-IR-NEXT: v_mov_b32_e32 v1, v2 ; GCN-IR-NEXT: s_setpc_b64 s[30:31] %result = udiv i64 32768, %x ret i64 %result @@ -1355,8 +1342,8 @@ ; GCN-NEXT: v_madak_f32 v0, 0, v0, 0x41c00000 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: s_movk_i32 s4, 0xffe8 -; GCN-NEXT: v_mov_b32_e32 v7, 0 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 @@ -1365,25 +1352,24 @@ ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: v_mul_hi_u32 v2, v0, s4 -; GCN-NEXT: v_mul_lo_u32 v3, v1, s4 -; GCN-NEXT: v_mul_lo_u32 v4, v0, s4 ; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: v_mul_hi_u32 v2, v0, s4 +; GCN-NEXT: v_mul_lo_u32 v4, v1, s4 +; GCN-NEXT: v_mul_lo_u32 v3, v0, s4 ; GCN-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_mul_lo_u32 v3, v0, v2 -; GCN-NEXT: v_mul_hi_u32 v5, v0, v4 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GCN-NEXT: v_mul_hi_u32 v5, v0, v3 +; GCN-NEXT: v_mul_lo_u32 v4, v0, v2 ; GCN-NEXT: v_mul_hi_u32 v6, v0, v2 -; GCN-NEXT: v_mul_hi_u32 v8, v1, v2 +; GCN-NEXT: v_mul_hi_u32 v7, v1, v2 ; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc -; GCN-NEXT: v_mul_lo_u32 v6, v1, v4 -; GCN-NEXT: v_mul_hi_u32 v4, v1, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v8, v7, vcc +; GCN-NEXT: v_mul_lo_u32 v6, v1, v3 +; GCN-NEXT: v_mul_hi_u32 v3, v1, v3 +; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v6 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc +; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc ; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 @@ -1397,7 +1383,7 @@ ; GCN-NEXT: v_mul_lo_u32 v3, v0, v2 ; GCN-NEXT: v_mul_hi_u32 v5, v0, v4 ; GCN-NEXT: v_mul_hi_u32 v6, v0, v2 -; GCN-NEXT: v_mul_hi_u32 v8, v1, v2 +; GCN-NEXT: v_mul_hi_u32 v7, v1, v2 ; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v5, v3 ; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc @@ -1405,7 +1391,7 @@ ; GCN-NEXT: v_mul_hi_u32 v4, v1, v4 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v8, v7, vcc +; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc ; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 @@ -1421,7 +1407,7 @@ ; GCN-NEXT: v_mul_hi_u32 v0, s3, v0 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc -; GCN-NEXT: v_addc_u32_e32 v2, vcc, v5, v7, vcc +; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc ; GCN-NEXT: v_mul_lo_u32 v4, v1, 24 @@ -1536,7 +1522,6 @@ ; GCN-NEXT: v_madak_f32 v2, 0, v2, 0x41c00000 ; GCN-NEXT: v_rcp_f32_e32 v2, v2 ; GCN-NEXT: s_movk_i32 s4, 0xffe8 -; GCN-NEXT: v_mov_b32_e32 v8, 0 ; GCN-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; GCN-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; GCN-NEXT: v_trunc_f32_e32 v3, v3 @@ -1550,16 +1535,16 @@ ; GCN-NEXT: v_add_i32_e32 v4, vcc, v6, v4 ; GCN-NEXT: v_mul_hi_u32 v7, v2, v5 ; GCN-NEXT: v_mul_lo_u32 v6, v2, v4 -; GCN-NEXT: v_mul_hi_u32 v9, v2, v4 -; GCN-NEXT: v_mul_hi_u32 v10, v3, v4 +; GCN-NEXT: v_mul_hi_u32 v8, v2, v4 +; GCN-NEXT: v_mul_hi_u32 v9, v3, v4 ; GCN-NEXT: v_mul_lo_u32 v4, v3, v4 ; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v9, vcc -; GCN-NEXT: v_mul_lo_u32 v9, v3, v5 +; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc +; GCN-NEXT: v_mul_lo_u32 v8, v3, v5 ; GCN-NEXT: v_mul_hi_u32 v5, v3, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v9 +; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; GCN-NEXT: v_addc_u32_e32 v5, vcc, v7, v5, vcc -; GCN-NEXT: v_addc_u32_e32 v6, vcc, v10, v8, vcc +; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v9, vcc ; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 @@ -1571,16 +1556,16 @@ ; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; GCN-NEXT: v_mul_lo_u32 v5, v2, v4 ; GCN-NEXT: v_mul_hi_u32 v7, v2, v6 -; GCN-NEXT: v_mul_hi_u32 v9, v2, v4 -; GCN-NEXT: v_mul_hi_u32 v10, v3, v4 +; GCN-NEXT: v_mul_hi_u32 v8, v2, v4 +; GCN-NEXT: v_mul_hi_u32 v9, v3, v4 ; GCN-NEXT: v_mul_lo_u32 v4, v3, v4 ; GCN-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v9, vcc -; GCN-NEXT: v_mul_lo_u32 v9, v3, v6 +; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc +; GCN-NEXT: v_mul_lo_u32 v8, v3, v6 ; GCN-NEXT: v_mul_hi_u32 v6, v3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v9 +; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; GCN-NEXT: v_addc_u32_e32 v5, vcc, v7, v6, vcc -; GCN-NEXT: v_addc_u32_e32 v6, vcc, v10, v8, vcc +; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v9, vcc ; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 @@ -1596,7 +1581,7 @@ ; GCN-NEXT: v_mul_hi_u32 v2, v1, v2 ; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; GCN-NEXT: v_addc_u32_e32 v2, vcc, v5, v2, vcc -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v7, v8, vcc +; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GCN-NEXT: v_mul_lo_u32 v4, v3, 24 diff --git a/llvm/test/CodeGen/AMDGPU/udivrem24.ll b/llvm/test/CodeGen/AMDGPU/udivrem24.ll --- a/llvm/test/CodeGen/AMDGPU/udivrem24.ll +++ b/llvm/test/CodeGen/AMDGPU/udivrem24.ll @@ -346,9 +346,8 @@ } ; FUNC-LABEL: {{^}}test_udiv24_u16_u23_i32: -; SI-DAG: v_rcp_iflag_f32 -; SI-DAG: s_mov_b32 [[MASK:s[0-9]+]], 0x7fffff{{$}} -; SI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], +; SI: v_rcp_iflag_f32 +; SI: v_and_b32_e32 v{{[0-9]+}}, 0x7fffff, ; EG: RECIP_IEEE define amdgpu_kernel void @test_udiv24_u16_u23_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { @@ -365,9 +364,8 @@ } ; FUNC-LABEL: {{^}}test_udiv24_u23_u16_i32: -; SI-DAG: v_rcp_iflag_f32 -; SI-DAG: s_mov_b32 [[MASK:s[0-9]+]], 0x7fffff{{$}} -; SI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], +; SI: v_rcp_iflag_f32 +; SI: v_and_b32_e32 v{{[0-9]+}}, 0x7fffff, ; EG: RECIP_IEEE define amdgpu_kernel void @test_udiv24_u23_u16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll --- a/llvm/test/CodeGen/AMDGPU/urem64.ll +++ b/llvm/test/CodeGen/AMDGPU/urem64.ll @@ -17,72 +17,71 @@ ; GCN-NEXT: s_mov_b32 s4, s8 ; GCN-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_mov_b32 s5, s9 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; GCN-NEXT: v_mul_f32_e32 v2, 0x2f800000, v0 -; GCN-NEXT: v_trunc_f32_e32 v2, v2 -; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v2 -; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; GCN-NEXT: v_trunc_f32_e32 v1, v1 +; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 +; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_lo_u32 v3, s0, v2 -; GCN-NEXT: v_mul_hi_u32 v4, s0, v0 -; GCN-NEXT: v_mul_lo_u32 v6, s1, v0 -; GCN-NEXT: v_mul_lo_u32 v5, s0, v0 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GCN-NEXT: v_mul_hi_u32 v4, v0, v5 -; GCN-NEXT: v_mul_lo_u32 v6, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v8, v0, v3 -; GCN-NEXT: v_mul_lo_u32 v7, v2, v5 -; GCN-NEXT: v_mul_hi_u32 v5, v2, v5 -; GCN-NEXT: v_mul_hi_u32 v9, v2, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v8, vcc -; GCN-NEXT: v_mul_lo_u32 v3, v2, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v6, v5, vcc -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v9, v1, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc -; GCN-NEXT: v_mul_lo_u32 v3, s0, v2 -; GCN-NEXT: v_mul_hi_u32 v4, s0, v0 +; GCN-NEXT: v_mul_lo_u32 v2, s0, v1 +; GCN-NEXT: v_mul_hi_u32 v3, s0, v0 ; GCN-NEXT: v_mul_lo_u32 v5, s1, v0 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; GCN-NEXT: v_mul_lo_u32 v4, s0, v0 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GCN-NEXT: v_mul_lo_u32 v7, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v8, v0, v4 -; GCN-NEXT: v_mul_hi_u32 v9, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v6, v2, v4 -; GCN-NEXT: v_mul_lo_u32 v4, v2, v4 -; GCN-NEXT: v_mul_hi_u32 v5, v2, v3 -; GCN-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v9, vcc -; GCN-NEXT: v_mul_lo_u32 v3, v2, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v8, v6, vcc -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v5, v1, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc -; GCN-NEXT: v_mul_lo_u32 v3, s10, v2 -; GCN-NEXT: v_mul_hi_u32 v4, s10, v0 -; GCN-NEXT: v_mul_hi_u32 v5, s10, v2 -; GCN-NEXT: v_mul_hi_u32 v6, s11, v2 -; GCN-NEXT: v_mul_lo_u32 v2, s11, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc -; GCN-NEXT: v_mul_lo_u32 v5, s11, v0 -; GCN-NEXT: v_mul_hi_u32 v0, s11, v0 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; GCN-NEXT: v_mul_hi_u32 v3, v0, v4 +; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v7, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v6, v1, v4 +; GCN-NEXT: v_mul_lo_u32 v4, v1, v4 +; GCN-NEXT: v_mul_hi_u32 v8, v1, v2 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GCN-NEXT: v_addc_u32_e32 v0, vcc, v4, v0, vcc -; GCN-NEXT: v_addc_u32_e32 v1, vcc, v6, v1, vcc +; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v6, vcc +; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GCN-NEXT: v_mul_lo_u32 v2, s0, v1 +; GCN-NEXT: v_mul_hi_u32 v3, s0, v0 +; GCN-NEXT: v_mul_lo_u32 v4, s1, v0 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_mul_lo_u32 v3, s0, v0 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GCN-NEXT: v_mul_lo_u32 v6, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v7, v0, v3 +; GCN-NEXT: v_mul_hi_u32 v8, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v5, v1, v3 +; GCN-NEXT: v_mul_lo_u32 v3, v1, v3 +; GCN-NEXT: v_mul_hi_u32 v4, v1, v2 +; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc +; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v6, v3 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc +; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GCN-NEXT: v_mul_lo_u32 v2, s10, v1 +; GCN-NEXT: v_mul_hi_u32 v3, s10, v0 +; GCN-NEXT: v_mul_hi_u32 v4, s10, v1 +; GCN-NEXT: v_mul_hi_u32 v5, s11, v1 +; GCN-NEXT: v_mul_lo_u32 v1, s11, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GCN-NEXT: v_mul_lo_u32 v4, s11, v0 +; GCN-NEXT: v_mul_hi_u32 v0, s11, v0 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc +; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc ; GCN-NEXT: v_mul_lo_u32 v1, s12, v1 ; GCN-NEXT: v_mul_hi_u32 v2, s12, v0 ; GCN-NEXT: v_mul_lo_u32 v3, s13, v0 @@ -161,9 +160,8 @@ ; GCN-IR-NEXT: s_add_u32 s16, s4, -1 ; GCN-IR-NEXT: s_addc_u32 s17, s5, -1 ; GCN-IR-NEXT: s_not_b64 s[6:7], s[10:11] -; GCN-IR-NEXT: s_mov_b32 s15, s11 ; GCN-IR-NEXT: s_add_u32 s10, s6, s14 -; GCN-IR-NEXT: s_addc_u32 s11, s7, s11 +; GCN-IR-NEXT: s_addc_u32 s11, s7, 0 ; GCN-IR-NEXT: s_mov_b64 s[14:15], 0 ; GCN-IR-NEXT: s_mov_b32 s7, 0 ; GCN-IR-NEXT: .LBB0_3: ; %udiv-do-while @@ -229,7 +227,6 @@ ; GCN-NEXT: v_subb_u32_e32 v7, vcc, 0, v3, vcc ; GCN-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 ; GCN-NEXT: v_rcp_f32_e32 v4, v4 -; GCN-NEXT: v_mov_b32_e32 v13, 0 ; GCN-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; GCN-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 ; GCN-NEXT: v_trunc_f32_e32 v5, v5 @@ -245,7 +242,7 @@ ; GCN-NEXT: v_mul_lo_u32 v10, v4, v8 ; GCN-NEXT: v_mul_hi_u32 v11, v4, v9 ; GCN-NEXT: v_mul_hi_u32 v12, v4, v8 -; GCN-NEXT: v_mul_hi_u32 v14, v5, v8 +; GCN-NEXT: v_mul_hi_u32 v13, v5, v8 ; GCN-NEXT: v_mul_lo_u32 v8, v5, v8 ; GCN-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v12, vcc @@ -253,7 +250,7 @@ ; GCN-NEXT: v_mul_hi_u32 v9, v5, v9 ; GCN-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; GCN-NEXT: v_addc_u32_e32 v9, vcc, v11, v9, vcc -; GCN-NEXT: v_addc_u32_e32 v10, vcc, v14, v13, vcc +; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v13, vcc ; GCN-NEXT: v_add_i32_e32 v8, vcc, v9, v8 ; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc ; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v8 @@ -275,7 +272,7 @@ ; GCN-NEXT: v_mul_lo_u32 v7, v5, v7 ; GCN-NEXT: v_add_i32_e32 v6, vcc, v10, v6 ; GCN-NEXT: v_addc_u32_e32 v6, vcc, v11, v9, vcc -; GCN-NEXT: v_addc_u32_e32 v8, vcc, v8, v13, vcc +; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc ; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v7 ; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc ; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v6 @@ -291,7 +288,7 @@ ; GCN-NEXT: v_mul_hi_u32 v4, v1, v4 ; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; GCN-NEXT: v_addc_u32_e32 v4, vcc, v7, v4, vcc -; GCN-NEXT: v_addc_u32_e32 v6, vcc, v9, v13, vcc +; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v9, vcc ; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc ; GCN-NEXT: v_mul_lo_u32 v5, v2, v5 @@ -332,35 +329,33 @@ ; GCN-IR-LABEL: v_test_urem_i64: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases ; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] ; GCN-IR-NEXT: v_ffbh_u32_e32 v4, v2 -; GCN-IR-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 32, v4 +; GCN-IR-NEXT: v_add_i32_e64 v4, s[6:7], 32, v4 ; GCN-IR-NEXT: v_ffbh_u32_e32 v5, v3 ; GCN-IR-NEXT: v_min_u32_e32 v8, v4, v5 ; GCN-IR-NEXT: v_ffbh_u32_e32 v4, v0 -; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 32, v4 +; GCN-IR-NEXT: v_add_i32_e64 v4, s[6:7], 32, v4 ; GCN-IR-NEXT: v_ffbh_u32_e32 v5, v1 -; GCN-IR-NEXT: v_min_u32_e32 v10, v4, v5 -; GCN-IR-NEXT: v_sub_i32_e32 v5, vcc, v8, v10 -; GCN-IR-NEXT: v_subb_u32_e64 v6, s[6:7], 0, 0, vcc -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[5:6] -; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 -; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc +; GCN-IR-NEXT: v_min_u32_e32 v9, v4, v5 +; GCN-IR-NEXT: v_sub_i32_e64 v5, s[6:7], v8, v9 +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] +; GCN-IR-NEXT: v_subb_u32_e64 v6, s[6:7], 0, 0, s[6:7] +; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[6:7], 63, v[5:6] +; GCN-IR-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] ; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[5:6] ; GCN-IR-NEXT: s_xor_b64 s[6:7], s[4:5], -1 -; GCN-IR-NEXT: v_mov_b32_e32 v11, v9 ; GCN-IR-NEXT: v_cndmask_b32_e64 v7, v1, 0, s[4:5] ; GCN-IR-NEXT: v_cndmask_b32_e64 v4, v0, 0, s[4:5] ; GCN-IR-NEXT: s_and_b64 s[4:5], s[6:7], vcc ; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GCN-IR-NEXT: s_cbranch_execz .LBB1_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, 1, v5 -; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, 0, v6, vcc +; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, 1, v5 +; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, 0, v6, vcc ; GCN-IR-NEXT: v_sub_i32_e64 v4, s[4:5], 63, v5 -; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[12:13] +; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] ; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[0:1], v4 ; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 @@ -371,36 +366,36 @@ ; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, -1, v2 ; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, -1, v3, vcc ; GCN-IR-NEXT: v_not_b32_e32 v7, v8 -; GCN-IR-NEXT: v_not_b32_e32 v6, v9 -; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, v7, v10 -; GCN-IR-NEXT: v_lshr_b64 v[12:13], v[0:1], v12 -; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, v6, v11, vcc -; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 +; GCN-IR-NEXT: v_lshr_b64 v[10:11], v[0:1], v10 +; GCN-IR-NEXT: v_not_b32_e32 v6, 0 +; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, v7, v9 +; GCN-IR-NEXT: v_mov_b32_e32 v12, 0 +; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v6, vcc ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 -; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v13, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 ; GCN-IR-NEXT: .LBB1_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: v_lshl_b64 v[12:13], v[12:13], 1 +; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 ; GCN-IR-NEXT: v_lshrrev_b32_e32 v6, 31, v5 -; GCN-IR-NEXT: v_or_b32_e32 v12, v12, v6 +; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v6 ; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 -; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v14, v12 -; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, v15, v13, vcc -; GCN-IR-NEXT: v_or_b32_e32 v4, v10, v4 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v6 +; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v14, v10 +; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, v15, v11, vcc +; GCN-IR-NEXT: v_or_b32_e32 v4, v12, v4 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v6 ; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v8 -; GCN-IR-NEXT: v_or_b32_e32 v5, v11, v5 -; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v10 -; GCN-IR-NEXT: v_and_b32_e32 v11, v10, v3 -; GCN-IR-NEXT: v_and_b32_e32 v10, v10, v2 +; GCN-IR-NEXT: v_or_b32_e32 v5, v13, v5 +; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v12 +; GCN-IR-NEXT: v_and_b32_e32 v13, v12, v3 +; GCN-IR-NEXT: v_and_b32_e32 v12, v12, v2 ; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] -; GCN-IR-NEXT: v_sub_i32_e64 v12, s[4:5], v12, v10 -; GCN-IR-NEXT: v_subb_u32_e64 v13, s[4:5], v13, v11, s[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v11, v7 +; GCN-IR-NEXT: v_sub_i32_e64 v10, s[4:5], v10, v12 +; GCN-IR-NEXT: v_subb_u32_e64 v11, s[4:5], v11, v13, s[4:5] +; GCN-IR-NEXT: v_mov_b32_e32 v13, v7 ; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; GCN-IR-NEXT: v_mov_b32_e32 v10, v6 +; GCN-IR-NEXT: v_mov_b32_e32 v12, v6 ; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] ; GCN-IR-NEXT: s_cbranch_execnz .LBB1_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow @@ -731,58 +726,57 @@ ; GCN-NEXT: s_mov_b32 s8, s4 ; GCN-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_mov_b32 s9, s5 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; GCN-NEXT: v_mul_f32_e32 v2, 0x2f800000, v0 -; GCN-NEXT: v_trunc_f32_e32 v2, v2 -; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v2 -; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; GCN-NEXT: v_trunc_f32_e32 v1, v1 +; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 +; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_lo_u32 v3, s0, v2 -; GCN-NEXT: v_mul_hi_u32 v4, s0, v0 -; GCN-NEXT: v_mul_lo_u32 v6, s1, v0 -; GCN-NEXT: v_mul_lo_u32 v5, s0, v0 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GCN-NEXT: v_mul_hi_u32 v4, v0, v5 -; GCN-NEXT: v_mul_lo_u32 v6, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v8, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v7, v2, v5 -; GCN-NEXT: v_mul_lo_u32 v5, v2, v5 -; GCN-NEXT: v_mul_hi_u32 v9, v2, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v8, vcc -; GCN-NEXT: v_mul_lo_u32 v3, v2, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v6, v7, vcc -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v9, v1, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc -; GCN-NEXT: v_mul_lo_u32 v3, s0, v2 -; GCN-NEXT: v_mul_hi_u32 v4, s0, v0 +; GCN-NEXT: v_mul_lo_u32 v2, s0, v1 +; GCN-NEXT: v_mul_hi_u32 v3, s0, v0 ; GCN-NEXT: v_mul_lo_u32 v5, s1, v0 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; GCN-NEXT: v_mul_lo_u32 v4, s0, v0 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GCN-NEXT: v_mul_lo_u32 v7, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v8, v0, v4 -; GCN-NEXT: v_mul_hi_u32 v9, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v6, v2, v4 -; GCN-NEXT: v_mul_lo_u32 v4, v2, v4 -; GCN-NEXT: v_mul_hi_u32 v5, v2, v3 -; GCN-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v9, vcc -; GCN-NEXT: v_mul_lo_u32 v3, v2, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v8, v6, vcc -; GCN-NEXT: v_addc_u32_e32 v1, vcc, v5, v1, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; GCN-NEXT: v_mul_hi_u32 v3, v0, v4 +; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v7, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v6, v1, v4 +; GCN-NEXT: v_mul_lo_u32 v4, v1, v4 +; GCN-NEXT: v_mul_hi_u32 v8, v1, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v6, vcc +; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GCN-NEXT: v_mul_lo_u32 v2, s0, v1 +; GCN-NEXT: v_mul_hi_u32 v3, s0, v0 +; GCN-NEXT: v_mul_lo_u32 v4, s1, v0 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_mul_lo_u32 v3, s0, v0 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GCN-NEXT: v_mul_lo_u32 v6, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v7, v0, v3 +; GCN-NEXT: v_mul_hi_u32 v8, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v5, v1, v3 +; GCN-NEXT: v_mul_lo_u32 v3, v1, v3 +; GCN-NEXT: v_mul_hi_u32 v4, v1, v2 +; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc +; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v6, v3 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc +; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc ; GCN-NEXT: v_mul_lo_u32 v2, v1, 24 ; GCN-NEXT: v_mul_hi_u32 v0, v0, 24 ; GCN-NEXT: v_mul_hi_u32 v1, v1, 24 @@ -915,8 +909,8 @@ ; GCN-NEXT: v_madak_f32 v0, 0, v0, 0x41c00000 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: s_movk_i32 s4, 0xffe8 -; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 @@ -925,63 +919,62 @@ ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: v_mul_hi_u32 v3, v0, s4 -; GCN-NEXT: v_mul_lo_u32 v5, v1, s4 -; GCN-NEXT: v_mul_lo_u32 v4, v0, s4 ; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: v_subrev_i32_e32 v3, vcc, v0, v3 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GCN-NEXT: v_mul_hi_u32 v6, v0, v4 -; GCN-NEXT: v_mul_lo_u32 v5, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v7, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v8, v1, v3 -; GCN-NEXT: v_mul_lo_u32 v3, v1, v3 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v7, vcc -; GCN-NEXT: v_mul_lo_u32 v7, v1, v4 -; GCN-NEXT: v_mul_hi_u32 v4, v1, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v6, v4, vcc -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v8, v2, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc -; GCN-NEXT: v_mul_hi_u32 v3, v0, s4 +; GCN-NEXT: v_mul_hi_u32 v2, v0, s4 ; GCN-NEXT: v_mul_lo_u32 v4, v1, s4 -; GCN-NEXT: v_mul_lo_u32 v5, v0, s4 +; GCN-NEXT: v_mul_lo_u32 v3, v0, s4 +; GCN-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GCN-NEXT: v_mul_hi_u32 v5, v0, v3 +; GCN-NEXT: v_mul_lo_u32 v4, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v6, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v7, v1, v2 +; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 +; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc +; GCN-NEXT: v_mul_lo_u32 v6, v1, v3 +; GCN-NEXT: v_mul_hi_u32 v3, v1, v3 +; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v6 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc +; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GCN-NEXT: v_mul_hi_u32 v2, v0, s4 +; GCN-NEXT: v_mul_lo_u32 v3, v1, s4 +; GCN-NEXT: v_mul_lo_u32 v4, v0, s4 ; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: v_subrev_i32_e32 v3, vcc, v0, v3 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; GCN-NEXT: v_mul_lo_u32 v4, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v6, v0, v5 -; GCN-NEXT: v_mul_hi_u32 v7, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v8, v1, v3 -; GCN-NEXT: v_mul_lo_u32 v3, v1, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v7, vcc -; GCN-NEXT: v_mul_lo_u32 v7, v1, v5 -; GCN-NEXT: v_mul_hi_u32 v5, v1, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v6, v5, vcc -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v8, v2, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc -; GCN-NEXT: v_mul_lo_u32 v3, s2, v1 -; GCN-NEXT: v_mul_hi_u32 v4, s2, v0 -; GCN-NEXT: v_mul_hi_u32 v5, s2, v1 -; GCN-NEXT: v_mul_hi_u32 v6, s3, v1 +; GCN-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GCN-NEXT: v_mul_lo_u32 v3, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v5, v0, v4 +; GCN-NEXT: v_mul_hi_u32 v6, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v7, v1, v2 +; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc +; GCN-NEXT: v_mul_lo_u32 v6, v1, v4 +; GCN-NEXT: v_mul_hi_u32 v4, v1, v4 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc +; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GCN-NEXT: v_mul_lo_u32 v2, s2, v1 +; GCN-NEXT: v_mul_hi_u32 v3, s2, v0 +; GCN-NEXT: v_mul_hi_u32 v4, s2, v1 +; GCN-NEXT: v_mul_hi_u32 v5, s3, v1 ; GCN-NEXT: v_mul_lo_u32 v1, s3, v1 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc -; GCN-NEXT: v_mul_lo_u32 v5, s3, v0 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GCN-NEXT: v_mul_lo_u32 v4, s3, v0 ; GCN-NEXT: v_mul_hi_u32 v0, s3, v0 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GCN-NEXT: v_addc_u32_e32 v0, vcc, v4, v0, vcc -; GCN-NEXT: v_addc_u32_e32 v2, vcc, v6, v2, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc +; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc ; GCN-NEXT: v_mul_lo_u32 v1, v1, 24 @@ -1111,7 +1104,6 @@ ; GCN-NEXT: v_subb_u32_e32 v5, vcc, 0, v1, vcc ; GCN-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 ; GCN-NEXT: v_rcp_f32_e32 v2, v2 -; GCN-NEXT: v_mov_b32_e32 v11, 0 ; GCN-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; GCN-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; GCN-NEXT: v_trunc_f32_e32 v3, v3 @@ -1124,18 +1116,18 @@ ; GCN-NEXT: v_mul_lo_u32 v9, v4, v2 ; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6 ; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v8 -; GCN-NEXT: v_mul_lo_u32 v7, v2, v6 -; GCN-NEXT: v_mul_hi_u32 v8, v2, v9 +; GCN-NEXT: v_mul_hi_u32 v7, v2, v9 +; GCN-NEXT: v_mul_lo_u32 v8, v2, v6 ; GCN-NEXT: v_mul_hi_u32 v10, v2, v6 -; GCN-NEXT: v_mul_hi_u32 v12, v3, v6 +; GCN-NEXT: v_mul_hi_u32 v11, v3, v6 ; GCN-NEXT: v_mul_lo_u32 v6, v3, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; GCN-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v10, vcc ; GCN-NEXT: v_mul_lo_u32 v10, v3, v9 ; GCN-NEXT: v_mul_hi_u32 v9, v3, v9 ; GCN-NEXT: v_add_i32_e32 v7, vcc, v7, v10 ; GCN-NEXT: v_addc_u32_e32 v7, vcc, v8, v9, vcc -; GCN-NEXT: v_addc_u32_e32 v8, vcc, v12, v11, vcc +; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v11, vcc ; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6 ; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v6 @@ -1157,7 +1149,7 @@ ; GCN-NEXT: v_mul_lo_u32 v5, v3, v5 ; GCN-NEXT: v_add_i32_e32 v4, vcc, v8, v4 ; GCN-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc -; GCN-NEXT: v_addc_u32_e32 v6, vcc, v6, v11, vcc +; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc ; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 @@ -1203,26 +1195,25 @@ ; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 32, v2 ; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1 ; GCN-IR-NEXT: v_min_u32_e32 v6, v2, v3 -; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 0xffffffd0, v6 -; GCN-IR-NEXT: v_addc_u32_e64 v3, s[6:7], 0, -1, vcc +; GCN-IR-NEXT: v_add_i32_e32 v3, vcc, 0xffffffd0, v6 +; GCN-IR-NEXT: v_addc_u32_e64 v4, s[6:7], 0, -1, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[2:3] -; GCN-IR-NEXT: s_mov_b64 s[8:9], 0x8000 -; GCN-IR-NEXT: v_mov_b32_e32 v4, s8 +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[3:4] +; GCN-IR-NEXT: v_mov_b32_e32 v5, 0x8000 ; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[2:3] -; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 -; GCN-IR-NEXT: v_cndmask_b32_e64 v4, v4, 0, s[4:5] +; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[3:4] +; GCN-IR-NEXT: v_cndmask_b32_e64 v5, v5, 0, s[4:5] ; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1 -; GCN-IR-NEXT: v_mov_b32_e32 v5, v7 +; GCN-IR-NEXT: v_mov_b32_e32 v2, 0 +; GCN-IR-NEXT: s_mov_b64 s[8:9], 0x8000 ; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], vcc ; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GCN-IR-NEXT: s_cbranch_execz .LBB8_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v2 -; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v3, vcc -; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2 -; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] +; GCN-IR-NEXT: v_add_i32_e32 v7, vcc, 1, v3 +; GCN-IR-NEXT: v_addc_u32_e32 v8, vcc, 0, v4, vcc +; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v3 +; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[7:8] ; GCN-IR-NEXT: v_lshl_b64 v[2:3], s[8:9], v2 ; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 @@ -1233,10 +1224,10 @@ ; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, -1, v0 ; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000 ; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, -1, v1, vcc -; GCN-IR-NEXT: v_lshr_b64 v[8:9], s[4:5], v8 +; GCN-IR-NEXT: v_lshr_b64 v[8:9], s[4:5], v7 ; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 47, v6 ; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 -; GCN-IR-NEXT: v_subb_u32_e32 v7, vcc, 0, v7, vcc +; GCN-IR-NEXT: v_subb_u32_e64 v7, s[4:5], 0, 0, vcc ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 @@ -1268,15 +1259,15 @@ ; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] ; GCN-IR-NEXT: .LBB8_5: ; %Flow3 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] -; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; GCN-IR-NEXT: v_or_b32_e32 v5, v5, v3 -; GCN-IR-NEXT: v_or_b32_e32 v4, v4, v2 +; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[2:3], 1 +; GCN-IR-NEXT: v_or_b32_e32 v2, v5, v7 +; GCN-IR-NEXT: v_or_b32_e32 v5, v4, v6 ; GCN-IR-NEXT: .LBB8_6: ; %Flow4 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] -; GCN-IR-NEXT: v_mul_lo_u32 v2, v0, v5 -; GCN-IR-NEXT: v_mul_hi_u32 v3, v0, v4 -; GCN-IR-NEXT: v_mul_lo_u32 v1, v1, v4 -; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, v4 +; GCN-IR-NEXT: v_mul_lo_u32 v2, v0, v2 +; GCN-IR-NEXT: v_mul_hi_u32 v3, v0, v5 +; GCN-IR-NEXT: v_mul_lo_u32 v1, v1, v5 +; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, v5 ; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, 0x8000, v0 diff --git a/llvm/test/CodeGen/AMDGPU/usubsat.ll b/llvm/test/CodeGen/AMDGPU/usubsat.ll --- a/llvm/test/CodeGen/AMDGPU/usubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/usubsat.ll @@ -31,9 +31,8 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_movk_i32 s4, 0xff -; GFX10-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX10-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX10-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX10-NEXT: v_sub_nc_u16 v0, v0, v1 clamp ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call i8 @llvm.usub.sat.i8(i8 %lhs, i8 %rhs) diff --git a/llvm/test/CodeGen/AMDGPU/v_pack.ll b/llvm/test/CodeGen/AMDGPU/v_pack.ll --- a/llvm/test/CodeGen/AMDGPU/v_pack.ll +++ b/llvm/test/CodeGen/AMDGPU/v_pack.ll @@ -173,13 +173,12 @@ ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_waitcnt_depctr 0xffe3 -; GISEL-NEXT: s_movk_i32 s0, 0x7fff -; GISEL-NEXT: v_add_f16_e32 v0, 2.0, v1 -; GISEL-NEXT: v_add_f16_e32 v1, 2.0, v2 -; GISEL-NEXT: v_and_b32_e32 v0, s0, v0 -; GISEL-NEXT: v_and_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GISEL-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GISEL-NEXT: v_mov_b32_e32 v0, 0x7fff +; GISEL-NEXT: v_add_f16_e32 v1, 2.0, v1 +; GISEL-NEXT: v_add_f16_e32 v2, 2.0, v2 +; GISEL-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; GISEL-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GISEL-NEXT: v_and_or_b32 v0, 0xffff, v1, v0 ; GISEL-NEXT: ;;#ASMSTART ; GISEL-NEXT: ; use v0 ; GISEL-NEXT: ;;#ASMEND @@ -232,7 +231,7 @@ ; GISEL-NEXT: s_mov_b32 s0, 0x8000 ; GISEL-NEXT: v_add_f16_e32 v0, 2.0, v1 ; GISEL-NEXT: v_add_f16_e32 v1, 2.0, v2 -; GISEL-NEXT: v_add_f16_e64 v0, s0, -v0 +; GISEL-NEXT: v_add_f16_e64 v0, 0x8000, -v0 ; GISEL-NEXT: v_add_f16_sdwa v1, s0, -v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GISEL-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 ; GISEL-NEXT: ;;#ASMSTART diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll --- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll +++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll @@ -967,13 +967,12 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_load_dwordx2 v[1:2], v[0:1], off -; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v4, v0, v1 -; GFX10-NEXT: v_and_b32_e32 v3, v0, v2 -; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v4 -; GFX10-NEXT: v_lshl_or_b32 v0, v2, 16, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v1 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v0 +; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v3 +; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -1384,7 +1383,7 @@ ; GFX10-NEXT: ; kill: killed $vgpr0 killed $vgpr1 ; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX10-NEXT: ; kill: killed $vgpr2 killed $vgpr3 -; GFX10-NEXT: v_and_b32_e32 v1, v0, v4 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v4 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_and_b32_sdwa v2, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-NEXT: v_lshl_or_b32 v0, v5, 16, v1 diff --git a/llvm/test/CodeGen/AMDGPU/xor.ll b/llvm/test/CodeGen/AMDGPU/xor.ll --- a/llvm/test/CodeGen/AMDGPU/xor.ll +++ b/llvm/test/CodeGen/AMDGPU/xor.ll @@ -190,8 +190,8 @@ ; SI-DAG: s_movk_i32 s[[K_LO:[0-9]+]], 0x3039 ; SI: s_xor_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s[[[K_LO]]:[[K_HI]]] -; SI: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, s[[K_LO]] -; SI: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, s[[K_HI]] +; SI: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x3039 +; SI: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0xf237b define amdgpu_kernel void @scalar_xor_literal_multi_use_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a, i64 %b) { %or = xor i64 %a, 4261135838621753 store i64 %or, i64 addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/zero_extend.ll b/llvm/test/CodeGen/AMDGPU/zero_extend.ll --- a/llvm/test/CodeGen/AMDGPU/zero_extend.ll +++ b/llvm/test/CodeGen/AMDGPU/zero_extend.ll @@ -51,9 +51,8 @@ ; GCN: s_load_dword [[A:s[0-9]+]] ; GCN: s_load_dword [[B:s[0-9]+]] -; GCN: s_mov_b32 [[MASK:s[0-9]+]], 0xffff{{$}} -; GCN-DAG: s_and_b32 [[MASK_A:s[0-9]+]], [[A]], [[MASK]] -; GCN-DAG: s_and_b32 [[MASK_B:s[0-9]+]], [[B]], [[MASK]] +; GCN-DAG: s_and_b32 [[MASK_A:s[0-9]+]], [[A]], 0xffff{{$}} +; GCN-DAG: s_and_b32 [[MASK_B:s[0-9]+]], [[B]], 0xffff{{$}} ; GCN: s_cmp_eq_u32 [[MASK_A]], [[B]] ; GCN: s_cselect_b64 [[CC:s\[[0-9:]+\]]], -1, 0 ; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CC]]