diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -146,30 +146,6 @@ return AMDGPU::INSTRUCTION_LIST_END; } -// Wrapper around isInlineConstant that understands special cases when -// instruction types are replaced during operand folding. -static bool isInlineConstantIfFolded(const SIInstrInfo *TII, - const MachineInstr &UseMI, - unsigned OpNo, - const MachineOperand &OpToFold) { - if (TII->isInlineConstant(UseMI, OpNo, OpToFold)) - return true; - - unsigned Opc = UseMI.getOpcode(); - unsigned NewOpc = macToMad(Opc); - if (NewOpc != AMDGPU::INSTRUCTION_LIST_END) { - // Special case for mac. Since this is replaced with mad when folded into - // src2, we need to check the legality for the final instruction. - int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2); - if (static_cast<int>(OpNo) == Src2Idx) { - const MCInstrDesc &MadDesc = TII->get(NewOpc); - return TII->isInlineConstant(OpToFold, MadDesc.OpInfo[OpNo].OperandType); - } - } - - return false; -} - // TODO: Add heuristic that the frame index might not fit in the addressing mode // immediate offset to avoid materializing in loops. static bool frameIndexMayFold(const SIInstrInfo *TII, @@ -1235,59 +1211,13 @@ } } - bool FoldingImm = OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal(); - if (FoldingImm) { - unsigned NumLiteralUses = 0; - MachineOperand *NonInlineUse = nullptr; - int NonInlineUseOpNo = -1; - - for (auto &Use : - make_early_inc_range(MRI->use_nodbg_operands(Dst.getReg()))) { - MachineInstr *UseMI = Use.getParent(); - unsigned OpNo = UseMI->getOperandNo(&Use); - - // Try to fold any inline immediate uses, and then only fold other - // constants if they have one use. 
- // - // The legality of the inline immediate must be checked based on the use - // operand, not the defining instruction, because 32-bit instructions - // with 32-bit inline immediate sources may be used to materialize - // constants used in 16-bit operands. - // - // e.g. it is unsafe to fold: - // s_mov_b32 s0, 1.0 // materializes 0x3f800000 - // v_add_f16 v0, v1, s0 // 1.0 f16 inline immediate sees 0x00003c00 - - // Folding immediates with more than one use will increase program size. - // FIXME: This will also reduce register usage, which may be better - // in some cases. A better heuristic is needed. - if (isInlineConstantIfFolded(TII, *UseMI, OpNo, OpToFold)) { - foldOperand(OpToFold, UseMI, OpNo, FoldList, CopiesToReplace); - } else if (frameIndexMayFold(TII, *UseMI, OpNo, OpToFold)) { - foldOperand(OpToFold, UseMI, OpNo, FoldList, CopiesToReplace); - } else { - if (++NumLiteralUses == 1) { - NonInlineUse = &Use; - NonInlineUseOpNo = OpNo; - } - } - } - - if (NumLiteralUses == 1) { - MachineInstr *UseMI = NonInlineUse->getParent(); - foldOperand(OpToFold, UseMI, NonInlineUseOpNo, FoldList, CopiesToReplace); - } - } else { - // Folding register. 
- SmallVector<MachineOperand *, 4> UsesToProcess; - for (auto &Use : MRI->use_nodbg_operands(Dst.getReg())) - UsesToProcess.push_back(&Use); - for (auto U : UsesToProcess) { - MachineInstr *UseMI = U->getParent(); - - foldOperand(OpToFold, UseMI, UseMI->getOperandNo(U), - FoldList, CopiesToReplace); - } + SmallVector<MachineOperand *, 4> UsesToProcess; + for (auto &Use : MRI->use_nodbg_operands(Dst.getReg())) + UsesToProcess.push_back(&Use); + for (auto U : UsesToProcess) { + MachineInstr *UseMI = U->getParent(); + foldOperand(OpToFold, UseMI, UseMI->getOperandNo(U), FoldList, + CopiesToReplace); } MachineFunction *MF = MI.getParent()->getParent(); diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll @@ -127,9 +127,8 @@ ; GFX8-LABEL: v_add_v2i16_neg_inline_imm_splat: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_movk_i32 s4, 0xffc0 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_add_u16_e32 v1, s4, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, 0xffffffc0 +; GFX8-NEXT: v_add_u16_e32 v1, 0xffc0, v0 ; GFX8-NEXT: v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -209,14 +208,12 @@ ; ; GFX8-LABEL: s_add_v2i16_neg_inline_imm_splat: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b32 s3, 0xffff -; GFX8-NEXT: s_mov_b32 s1, 0xffc0 -; GFX8-NEXT: s_lshr_b32 s2, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, s3 -; GFX8-NEXT: s_add_i32 s0, s0, s1 -; GFX8-NEXT: s_add_i32 s2, s2, s1 -; GFX8-NEXT: s_lshl_b32 s1, s2, 16 -; GFX8-NEXT: s_and_b32 s0, s0, s3 +; GFX8-NEXT: s_lshr_b32 s1, s0, 16 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff +; GFX8-NEXT: s_add_i32 s0, s0, 0xffc0 +; GFX8-NEXT: s_add_i32 s1, s1, 0xffc0 +; GFX8-NEXT: s_lshl_b32 s1, s1, 16 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_or_b32 s0, s1, s0 ; GFX8-NEXT: ; 
return to shader part epilog ; @@ -243,13 +240,12 @@ ; ; GFX8-LABEL: s_add_v2i16_neg_inline_imm_lo: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b32 s2, 0xffff ; GFX8-NEXT: s_lshr_b32 s1, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, s2 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_add_i32 s0, s0, 0xffc0 ; GFX8-NEXT: s_add_i32 s1, s1, 4 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_and_b32 s0, s0, s2 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_or_b32 s0, s1, s0 ; GFX8-NEXT: ; return to shader part epilog ; @@ -276,13 +272,12 @@ ; ; GFX8-LABEL: s_add_v2i16_neg_inline_imm_hi: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b32 s2, 0xffff ; GFX8-NEXT: s_lshr_b32 s1, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, s2 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_add_i32 s0, s0, 4 ; GFX8-NEXT: s_add_i32 s1, s1, 0xffc0 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_and_b32 s0, s0, s2 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_or_b32 s0, s1, s0 ; GFX8-NEXT: ; return to shader part epilog ; @@ -310,15 +305,14 @@ ; ; GFX8-LABEL: s_add_v2i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b32 s3, 0xffff ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, s3 -; GFX8-NEXT: s_lshr_b32 s4, s1, 16 -; GFX8-NEXT: s_and_b32 s1, s1, s3 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff +; GFX8-NEXT: s_lshr_b32 s3, s1, 16 +; GFX8-NEXT: s_and_b32 s1, s1, 0xffff ; GFX8-NEXT: s_add_i32 s0, s0, s1 -; GFX8-NEXT: s_add_i32 s2, s2, s4 +; GFX8-NEXT: s_add_i32 s2, s2, s3 ; GFX8-NEXT: s_lshl_b32 s1, s2, 16 -; GFX8-NEXT: s_and_b32 s0, s0, s3 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_or_b32 s0, s1, s0 ; GFX8-NEXT: ; return to shader part epilog ; @@ -349,15 +343,14 @@ ; GFX8-LABEL: s_add_v2i16_fneg_lhs: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_xor_b32 s0, s0, 0x80008000 -; GFX8-NEXT: s_mov_b32 s3, 0xffff ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, s3 -; GFX8-NEXT: s_lshr_b32 s4, s1, 16 -; GFX8-NEXT: s_and_b32 s1, s1, s3 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff +; 
GFX8-NEXT: s_lshr_b32 s3, s1, 16 +; GFX8-NEXT: s_and_b32 s1, s1, 0xffff ; GFX8-NEXT: s_add_i32 s0, s0, s1 -; GFX8-NEXT: s_add_i32 s2, s2, s4 +; GFX8-NEXT: s_add_i32 s2, s2, s3 ; GFX8-NEXT: s_lshl_b32 s1, s2, 16 -; GFX8-NEXT: s_and_b32 s0, s0, s3 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_or_b32 s0, s1, s0 ; GFX8-NEXT: ; return to shader part epilog ; @@ -391,15 +384,14 @@ ; GFX8-LABEL: s_add_v2i16_fneg_rhs: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_xor_b32 s1, s1, 0x80008000 -; GFX8-NEXT: s_mov_b32 s3, 0xffff ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, s3 -; GFX8-NEXT: s_lshr_b32 s4, s1, 16 -; GFX8-NEXT: s_and_b32 s1, s1, s3 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff +; GFX8-NEXT: s_lshr_b32 s3, s1, 16 +; GFX8-NEXT: s_and_b32 s1, s1, 0xffff ; GFX8-NEXT: s_add_i32 s0, s0, s1 -; GFX8-NEXT: s_add_i32 s2, s2, s4 +; GFX8-NEXT: s_add_i32 s2, s2, s3 ; GFX8-NEXT: s_lshl_b32 s1, s2, 16 -; GFX8-NEXT: s_and_b32 s0, s0, s3 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_or_b32 s0, s1, s0 ; GFX8-NEXT: ; return to shader part epilog ; @@ -422,9 +414,8 @@ define amdgpu_ps i32 @s_add_v2i16_fneg_lhs_fneg_rhs(<2 x half> inreg %a, <2 x half> inreg %b) { ; GFX9-LABEL: s_add_v2i16_fneg_lhs_fneg_rhs: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s2, 0x80008000 -; GFX9-NEXT: s_xor_b32 s0, s0, s2 -; GFX9-NEXT: s_xor_b32 s1, s1, s2 +; GFX9-NEXT: s_xor_b32 s0, s0, 0x80008000 +; GFX9-NEXT: s_xor_b32 s1, s1, 0x80008000 ; GFX9-NEXT: s_lshr_b32 s2, s0, 16 ; GFX9-NEXT: s_lshr_b32 s3, s1, 16 ; GFX9-NEXT: s_add_i32 s0, s0, s1 @@ -434,26 +425,23 @@ ; ; GFX8-LABEL: s_add_v2i16_fneg_lhs_fneg_rhs: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b32 s2, 0x80008000 -; GFX8-NEXT: s_xor_b32 s0, s0, s2 -; GFX8-NEXT: s_xor_b32 s1, s1, s2 -; GFX8-NEXT: s_mov_b32 s3, 0xffff +; GFX8-NEXT: s_xor_b32 s0, s0, 0x80008000 +; GFX8-NEXT: s_xor_b32 s1, s1, 0x80008000 ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, s3 -; GFX8-NEXT: s_lshr_b32 s4, s1, 16 -; GFX8-NEXT: s_and_b32 s1, s1, s3 +; 
GFX8-NEXT: s_and_b32 s0, s0, 0xffff +; GFX8-NEXT: s_lshr_b32 s3, s1, 16 +; GFX8-NEXT: s_and_b32 s1, s1, 0xffff ; GFX8-NEXT: s_add_i32 s0, s0, s1 -; GFX8-NEXT: s_add_i32 s2, s2, s4 +; GFX8-NEXT: s_add_i32 s2, s2, s3 ; GFX8-NEXT: s_lshl_b32 s1, s2, 16 -; GFX8-NEXT: s_and_b32 s0, s0, s3 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_or_b32 s0, s1, s0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_add_v2i16_fneg_lhs_fneg_rhs: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s2, 0x80008000 -; GFX10-NEXT: s_xor_b32 s0, s0, s2 -; GFX10-NEXT: s_xor_b32 s1, s1, s2 +; GFX10-NEXT: s_xor_b32 s0, s0, 0x80008000 +; GFX10-NEXT: s_xor_b32 s1, s1, 0x80008000 ; GFX10-NEXT: s_lshr_b32 s2, s0, 16 ; GFX10-NEXT: s_lshr_b32 s3, s1, 16 ; GFX10-NEXT: s_add_i32 s0, s0, s1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll @@ -429,13 +429,12 @@ define amdgpu_ps i32 @s_andn2_v2i16(<2 x i16> inreg %src0, <2 x i16> inreg %src1) { ; GFX6-LABEL: s_andn2_v2i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s1, 0xffff ; GFX6-NEXT: s_lshl_b32 s0, s3, 16 -; GFX6-NEXT: s_and_b32 s2, s2, s1 -; GFX6-NEXT: s_or_b32 s0, s0, s2 -; GFX6-NEXT: s_lshl_b32 s2, s5, 16 -; GFX6-NEXT: s_and_b32 s1, s4, s1 -; GFX6-NEXT: s_or_b32 s1, s2, s1 +; GFX6-NEXT: s_and_b32 s1, s2, 0xffff +; GFX6-NEXT: s_or_b32 s0, s0, s1 +; GFX6-NEXT: s_lshl_b32 s1, s5, 16 +; GFX6-NEXT: s_and_b32 s2, s4, 0xffff +; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: s_xor_b32 s1, s1, -1 ; GFX6-NEXT: s_and_b32 s0, s0, s1 ; GFX6-NEXT: ; return to shader part epilog @@ -458,13 +457,12 @@ define amdgpu_ps i32 @s_andn2_v2i16_commute(<2 x i16> inreg %src0, <2 x i16> inreg %src1) { ; GFX6-LABEL: s_andn2_v2i16_commute: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s1, 0xffff ; GFX6-NEXT: s_lshl_b32 s0, s3, 16 -; GFX6-NEXT: s_and_b32 s2, s2, s1 -; GFX6-NEXT: s_or_b32 s0, s0, s2 -; GFX6-NEXT: 
s_lshl_b32 s2, s5, 16 -; GFX6-NEXT: s_and_b32 s1, s4, s1 -; GFX6-NEXT: s_or_b32 s1, s2, s1 +; GFX6-NEXT: s_and_b32 s1, s2, 0xffff +; GFX6-NEXT: s_or_b32 s0, s0, s1 +; GFX6-NEXT: s_lshl_b32 s1, s5, 16 +; GFX6-NEXT: s_and_b32 s2, s4, 0xffff +; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: s_xor_b32 s1, s1, -1 ; GFX6-NEXT: s_and_b32 s0, s1, s0 ; GFX6-NEXT: ; return to shader part epilog @@ -487,13 +485,12 @@ define amdgpu_ps { i32, i32 } @s_andn2_v2i16_multi_use(<2 x i16> inreg %src0, <2 x i16> inreg %src1) { ; GFX6-LABEL: s_andn2_v2i16_multi_use: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s1, 0xffff ; GFX6-NEXT: s_lshl_b32 s0, s3, 16 -; GFX6-NEXT: s_and_b32 s2, s2, s1 -; GFX6-NEXT: s_or_b32 s0, s0, s2 -; GFX6-NEXT: s_lshl_b32 s2, s5, 16 -; GFX6-NEXT: s_and_b32 s1, s4, s1 -; GFX6-NEXT: s_or_b32 s1, s2, s1 +; GFX6-NEXT: s_and_b32 s1, s2, 0xffff +; GFX6-NEXT: s_or_b32 s0, s0, s1 +; GFX6-NEXT: s_lshl_b32 s1, s5, 16 +; GFX6-NEXT: s_and_b32 s2, s4, 0xffff +; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: s_xor_b32 s1, s1, -1 ; GFX6-NEXT: s_and_b32 s0, s0, s1 ; GFX6-NEXT: ; return to shader part epilog @@ -522,19 +519,18 @@ define amdgpu_ps { i32, i32 } @s_andn2_v2i16_multi_foldable_use(<2 x i16> inreg %src0, <2 x i16> inreg %src1, <2 x i16> inreg %src2) { ; GFX6-LABEL: s_andn2_v2i16_multi_foldable_use: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s1, 0xffff ; GFX6-NEXT: s_lshl_b32 s0, s3, 16 -; GFX6-NEXT: s_and_b32 s2, s2, s1 -; GFX6-NEXT: s_or_b32 s0, s0, s2 -; GFX6-NEXT: s_lshl_b32 s2, s5, 16 -; GFX6-NEXT: s_and_b32 s3, s4, s1 +; GFX6-NEXT: s_and_b32 s1, s2, 0xffff +; GFX6-NEXT: s_or_b32 s0, s0, s1 +; GFX6-NEXT: s_lshl_b32 s1, s5, 16 +; GFX6-NEXT: s_and_b32 s2, s4, 0xffff +; GFX6-NEXT: s_or_b32 s1, s1, s2 +; GFX6-NEXT: s_lshl_b32 s2, s7, 16 +; GFX6-NEXT: s_and_b32 s3, s6, 0xffff ; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_lshl_b32 s3, s7, 16 -; GFX6-NEXT: s_and_b32 s1, s6, s1 -; GFX6-NEXT: s_or_b32 s1, s3, s1 -; GFX6-NEXT: s_xor_b32 s1, s1, -1 -; GFX6-NEXT: s_and_b32 s0, s0, 
s1 -; GFX6-NEXT: s_and_b32 s1, s2, s1 +; GFX6-NEXT: s_xor_b32 s2, s2, -1 +; GFX6-NEXT: s_and_b32 s0, s0, s2 +; GFX6-NEXT: s_and_b32 s1, s1, s2 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_andn2_v2i16_multi_foldable_use: @@ -630,18 +626,17 @@ ; GFX6-LABEL: s_andn2_v4i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_lshl_b32 s0, s3, 16 -; GFX6-NEXT: s_mov_b32 s3, 0xffff -; GFX6-NEXT: s_and_b32 s1, s2, s3 +; GFX6-NEXT: s_and_b32 s1, s2, 0xffff ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: s_lshl_b32 s1, s5, 16 -; GFX6-NEXT: s_and_b32 s2, s4, s3 +; GFX6-NEXT: s_and_b32 s2, s4, 0xffff ; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: s_lshl_b32 s2, s7, 16 -; GFX6-NEXT: s_and_b32 s4, s6, s3 -; GFX6-NEXT: s_or_b32 s2, s2, s4 -; GFX6-NEXT: s_lshl_b32 s4, s9, 16 -; GFX6-NEXT: s_and_b32 s3, s8, s3 -; GFX6-NEXT: s_or_b32 s3, s4, s3 +; GFX6-NEXT: s_and_b32 s3, s6, 0xffff +; GFX6-NEXT: s_or_b32 s2, s2, s3 +; GFX6-NEXT: s_lshl_b32 s3, s9, 16 +; GFX6-NEXT: s_and_b32 s4, s8, 0xffff +; GFX6-NEXT: s_or_b32 s3, s3, s4 ; GFX6-NEXT: s_mov_b32 s4, -1 ; GFX6-NEXT: s_mov_b32 s5, s4 ; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5] @@ -673,18 +668,17 @@ ; GFX6-LABEL: s_andn2_v4i16_commute: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_lshl_b32 s0, s3, 16 -; GFX6-NEXT: s_mov_b32 s3, 0xffff -; GFX6-NEXT: s_and_b32 s1, s2, s3 +; GFX6-NEXT: s_and_b32 s1, s2, 0xffff ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: s_lshl_b32 s1, s5, 16 -; GFX6-NEXT: s_and_b32 s2, s4, s3 +; GFX6-NEXT: s_and_b32 s2, s4, 0xffff ; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: s_lshl_b32 s2, s7, 16 -; GFX6-NEXT: s_and_b32 s4, s6, s3 -; GFX6-NEXT: s_or_b32 s2, s2, s4 -; GFX6-NEXT: s_lshl_b32 s4, s9, 16 -; GFX6-NEXT: s_and_b32 s3, s8, s3 -; GFX6-NEXT: s_or_b32 s3, s4, s3 +; GFX6-NEXT: s_and_b32 s3, s6, 0xffff +; GFX6-NEXT: s_or_b32 s2, s2, s3 +; GFX6-NEXT: s_lshl_b32 s3, s9, 16 +; GFX6-NEXT: s_and_b32 s4, s8, 0xffff +; GFX6-NEXT: s_or_b32 s3, s3, s4 ; GFX6-NEXT: s_mov_b32 s4, -1 ; GFX6-NEXT: s_mov_b32 s5, s4 ; GFX6-NEXT: s_xor_b64 s[2:3], 
s[2:3], s[4:5] @@ -716,18 +710,17 @@ ; GFX6-LABEL: s_andn2_v4i16_multi_use: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_lshl_b32 s0, s3, 16 -; GFX6-NEXT: s_mov_b32 s3, 0xffff -; GFX6-NEXT: s_and_b32 s1, s2, s3 +; GFX6-NEXT: s_and_b32 s1, s2, 0xffff ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: s_lshl_b32 s1, s5, 16 -; GFX6-NEXT: s_and_b32 s2, s4, s3 +; GFX6-NEXT: s_and_b32 s2, s4, 0xffff ; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: s_lshl_b32 s2, s7, 16 -; GFX6-NEXT: s_and_b32 s4, s6, s3 -; GFX6-NEXT: s_or_b32 s2, s2, s4 -; GFX6-NEXT: s_lshl_b32 s4, s9, 16 -; GFX6-NEXT: s_and_b32 s3, s8, s3 -; GFX6-NEXT: s_or_b32 s3, s4, s3 +; GFX6-NEXT: s_and_b32 s3, s6, 0xffff +; GFX6-NEXT: s_or_b32 s2, s2, s3 +; GFX6-NEXT: s_lshl_b32 s3, s9, 16 +; GFX6-NEXT: s_and_b32 s4, s8, 0xffff +; GFX6-NEXT: s_or_b32 s3, s3, s4 ; GFX6-NEXT: s_mov_b32 s4, -1 ; GFX6-NEXT: s_mov_b32 s5, s4 ; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5] @@ -766,24 +759,23 @@ define amdgpu_ps { i64, i64 } @s_andn2_v4i16_multi_foldable_use(<4 x i16> inreg %src0, <4 x i16> inreg %src1, <4 x i16> inreg %src2) { ; GFX6-LABEL: s_andn2_v4i16_multi_foldable_use: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s14, 0xffff ; GFX6-NEXT: s_lshl_b32 s0, s3, 16 -; GFX6-NEXT: s_and_b32 s1, s2, s14 +; GFX6-NEXT: s_and_b32 s1, s2, 0xffff ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: s_lshl_b32 s1, s5, 16 -; GFX6-NEXT: s_and_b32 s2, s4, s14 +; GFX6-NEXT: s_and_b32 s2, s4, 0xffff ; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: s_lshl_b32 s2, s7, 16 -; GFX6-NEXT: s_and_b32 s3, s6, s14 +; GFX6-NEXT: s_and_b32 s3, s6, 0xffff ; GFX6-NEXT: s_or_b32 s2, s2, s3 ; GFX6-NEXT: s_lshl_b32 s3, s9, 16 -; GFX6-NEXT: s_and_b32 s4, s8, s14 +; GFX6-NEXT: s_and_b32 s4, s8, 0xffff ; GFX6-NEXT: s_or_b32 s3, s3, s4 ; GFX6-NEXT: s_lshl_b32 s4, s11, 16 -; GFX6-NEXT: s_and_b32 s5, s10, s14 +; GFX6-NEXT: s_and_b32 s5, s10, 0xffff ; GFX6-NEXT: s_or_b32 s4, s4, s5 ; GFX6-NEXT: s_lshl_b32 s5, s13, 16 -; GFX6-NEXT: s_and_b32 s6, s12, s14 +; GFX6-NEXT: s_and_b32 s6, s12, 0xffff ; 
GFX6-NEXT: s_or_b32 s5, s5, s6 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s7, s6 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll @@ -790,24 +790,22 @@ ; GFX6-LABEL: s_ashr_v2i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_sext_i32_i16 s1, s1 -; GFX6-NEXT: s_mov_b32 s4, 0xffff ; GFX6-NEXT: s_sext_i32_i16 s0, s0 ; GFX6-NEXT: s_ashr_i32 s1, s1, s3 ; GFX6-NEXT: s_ashr_i32 s0, s0, s2 -; GFX6-NEXT: s_and_b32 s1, s1, s4 -; GFX6-NEXT: s_and_b32 s0, s0, s4 +; GFX6-NEXT: s_and_b32 s1, s1, 0xffff +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_ashr_v2i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b32 s3, 0x100010 ; GFX8-NEXT: s_sext_i32_i16 s2, s0 -; GFX8-NEXT: s_bfe_i32 s0, s0, s3 -; GFX8-NEXT: s_sext_i32_i16 s4, s1 -; GFX8-NEXT: s_bfe_i32 s1, s1, s3 -; GFX8-NEXT: s_ashr_i32 s2, s2, s4 +; GFX8-NEXT: s_bfe_i32 s0, s0, 0x100010 +; GFX8-NEXT: s_sext_i32_i16 s3, s1 +; GFX8-NEXT: s_bfe_i32 s1, s1, 0x100010 +; GFX8-NEXT: s_ashr_i32 s2, s2, s3 ; GFX8-NEXT: s_ashr_i32 s0, s0, s1 ; GFX8-NEXT: s_lshl_b32 s0, s0, 16 ; GFX8-NEXT: s_and_b32 s1, s2, 0xffff @@ -882,12 +880,12 @@ define amdgpu_ps float @ashr_v2i16_vs(<2 x i16> %value, <2 x i16> inreg %amount) { ; GFX6-LABEL: ashr_v2i16_vs: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s2, 0xffff -; GFX6-NEXT: s_and_b32 s0, s0, s2 +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, s0, v0 -; GFX6-NEXT: s_and_b32 s0, s1, s2 +; GFX6-NEXT: s_and_b32 s0, s1, 0xffff ; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 +; GFX6-NEXT: s_mov_b32 s2, 0xffff ; GFX6-NEXT: v_ashrrev_i32_e32 v1, s0, v1 ; GFX6-NEXT: v_and_b32_e32 v1, s2, v1 ; GFX6-NEXT: v_and_b32_e32 v0, s2, v0 @@ -990,45 +988,42 @@ ; GFX6-LABEL: s_ashr_v4i16: ; GFX6: ; %bb.0: ; 
GFX6-NEXT: s_sext_i32_i16 s1, s1 -; GFX6-NEXT: s_mov_b32 s8, 0xffff ; GFX6-NEXT: s_sext_i32_i16 s0, s0 ; GFX6-NEXT: s_ashr_i32 s1, s1, s5 ; GFX6-NEXT: s_ashr_i32 s0, s0, s4 ; GFX6-NEXT: s_sext_i32_i16 s2, s2 ; GFX6-NEXT: s_sext_i32_i16 s3, s3 -; GFX6-NEXT: s_and_b32 s1, s1, s8 +; GFX6-NEXT: s_and_b32 s1, s1, 0xffff ; GFX6-NEXT: s_ashr_i32 s2, s2, s6 ; GFX6-NEXT: s_ashr_i32 s3, s3, s7 -; GFX6-NEXT: s_and_b32 s0, s0, s8 +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s2, s8 -; GFX6-NEXT: s_and_b32 s2, s3, s8 +; GFX6-NEXT: s_and_b32 s1, s2, 0xffff +; GFX6-NEXT: s_and_b32 s2, s3, 0xffff ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 ; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_ashr_v4i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b32 s5, 0x100010 ; GFX8-NEXT: s_sext_i32_i16 s4, s0 -; GFX8-NEXT: s_bfe_i32 s0, s0, s5 -; GFX8-NEXT: s_sext_i32_i16 s6, s1 -; GFX8-NEXT: s_bfe_i32 s1, s1, s5 -; GFX8-NEXT: s_sext_i32_i16 s7, s2 -; GFX8-NEXT: s_bfe_i32 s2, s2, s5 -; GFX8-NEXT: s_sext_i32_i16 s8, s3 -; GFX8-NEXT: s_bfe_i32 s3, s3, s5 -; GFX8-NEXT: s_ashr_i32 s4, s4, s7 +; GFX8-NEXT: s_bfe_i32 s0, s0, 0x100010 +; GFX8-NEXT: s_sext_i32_i16 s5, s1 +; GFX8-NEXT: s_bfe_i32 s1, s1, 0x100010 +; GFX8-NEXT: s_sext_i32_i16 s6, s2 +; GFX8-NEXT: s_bfe_i32 s2, s2, 0x100010 +; GFX8-NEXT: s_sext_i32_i16 s7, s3 +; GFX8-NEXT: s_bfe_i32 s3, s3, 0x100010 +; GFX8-NEXT: s_ashr_i32 s4, s4, s6 ; GFX8-NEXT: s_ashr_i32 s0, s0, s2 -; GFX8-NEXT: s_ashr_i32 s2, s6, s8 +; GFX8-NEXT: s_ashr_i32 s2, s5, s7 ; GFX8-NEXT: s_ashr_i32 s1, s1, s3 -; GFX8-NEXT: s_mov_b32 s3, 0xffff ; GFX8-NEXT: s_lshl_b32 s0, s0, 16 -; GFX8-NEXT: s_and_b32 s4, s4, s3 +; GFX8-NEXT: s_and_b32 s3, s4, 0xffff ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_and_b32 s2, s2, s3 -; GFX8-NEXT: s_or_b32 s0, s0, s4 +; GFX8-NEXT: s_and_b32 s2, s2, 0xffff +; GFX8-NEXT: s_or_b32 s0, s0, s3 ; GFX8-NEXT: s_or_b32 s1, s1, s2 ; 
GFX8-NEXT: ; return to shader part epilog ; @@ -1187,79 +1182,76 @@ ; GFX6-LABEL: s_ashr_v8i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_sext_i32_i16 s1, s1 -; GFX6-NEXT: s_mov_b32 s16, 0xffff ; GFX6-NEXT: s_sext_i32_i16 s0, s0 ; GFX6-NEXT: s_ashr_i32 s1, s1, s9 ; GFX6-NEXT: s_ashr_i32 s0, s0, s8 ; GFX6-NEXT: s_sext_i32_i16 s2, s2 ; GFX6-NEXT: s_sext_i32_i16 s3, s3 -; GFX6-NEXT: s_and_b32 s1, s1, s16 +; GFX6-NEXT: s_and_b32 s1, s1, 0xffff ; GFX6-NEXT: s_ashr_i32 s2, s2, s10 ; GFX6-NEXT: s_ashr_i32 s3, s3, s11 ; GFX6-NEXT: s_sext_i32_i16 s5, s5 -; GFX6-NEXT: s_and_b32 s0, s0, s16 +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_sext_i32_i16 s4, s4 ; GFX6-NEXT: s_ashr_i32 s5, s5, s13 ; GFX6-NEXT: s_sext_i32_i16 s7, s7 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s2, s16 -; GFX6-NEXT: s_and_b32 s2, s3, s16 +; GFX6-NEXT: s_and_b32 s1, s2, 0xffff +; GFX6-NEXT: s_and_b32 s2, s3, 0xffff ; GFX6-NEXT: s_ashr_i32 s4, s4, s12 ; GFX6-NEXT: s_sext_i32_i16 s6, s6 ; GFX6-NEXT: s_ashr_i32 s7, s7, s15 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_and_b32 s3, s5, s16 +; GFX6-NEXT: s_and_b32 s3, s5, 0xffff ; GFX6-NEXT: s_ashr_i32 s6, s6, s14 ; GFX6-NEXT: s_or_b32 s1, s1, s2 -; GFX6-NEXT: s_and_b32 s2, s4, s16 +; GFX6-NEXT: s_and_b32 s2, s4, 0xffff ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 -; GFX6-NEXT: s_and_b32 s4, s7, s16 +; GFX6-NEXT: s_and_b32 s4, s7, 0xffff ; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_and_b32 s3, s6, s16 +; GFX6-NEXT: s_and_b32 s3, s6, 0xffff ; GFX6-NEXT: s_lshl_b32 s4, s4, 16 ; GFX6-NEXT: s_or_b32 s3, s3, s4 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_ashr_v8i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b32 s9, 0x100010 ; GFX8-NEXT: s_sext_i32_i16 s8, s0 -; GFX8-NEXT: s_bfe_i32 s0, s0, s9 -; GFX8-NEXT: s_sext_i32_i16 s10, s1 -; GFX8-NEXT: s_bfe_i32 s1, s1, s9 -; GFX8-NEXT: s_sext_i32_i16 s12, s3 -; GFX8-NEXT: s_bfe_i32 s3, s3, s9 -; GFX8-NEXT: s_sext_i32_i16 s13, s4 -; GFX8-NEXT: s_bfe_i32 s4, s4, 
s9 -; GFX8-NEXT: s_sext_i32_i16 s14, s5 -; GFX8-NEXT: s_bfe_i32 s5, s5, s9 -; GFX8-NEXT: s_sext_i32_i16 s16, s7 -; GFX8-NEXT: s_bfe_i32 s7, s7, s9 -; GFX8-NEXT: s_sext_i32_i16 s11, s2 -; GFX8-NEXT: s_bfe_i32 s2, s2, s9 -; GFX8-NEXT: s_sext_i32_i16 s15, s6 -; GFX8-NEXT: s_bfe_i32 s6, s6, s9 +; GFX8-NEXT: s_bfe_i32 s0, s0, 0x100010 +; GFX8-NEXT: s_sext_i32_i16 s9, s1 +; GFX8-NEXT: s_bfe_i32 s1, s1, 0x100010 +; GFX8-NEXT: s_sext_i32_i16 s12, s4 +; GFX8-NEXT: s_bfe_i32 s4, s4, 0x100010 +; GFX8-NEXT: s_sext_i32_i16 s13, s5 +; GFX8-NEXT: s_bfe_i32 s5, s5, 0x100010 +; GFX8-NEXT: s_sext_i32_i16 s10, s2 +; GFX8-NEXT: s_bfe_i32 s2, s2, 0x100010 +; GFX8-NEXT: s_sext_i32_i16 s14, s6 +; GFX8-NEXT: s_bfe_i32 s6, s6, 0x100010 ; GFX8-NEXT: s_ashr_i32 s0, s0, s4 -; GFX8-NEXT: s_ashr_i32 s4, s10, s14 +; GFX8-NEXT: s_ashr_i32 s4, s9, s13 ; GFX8-NEXT: s_ashr_i32 s1, s1, s5 -; GFX8-NEXT: s_ashr_i32 s3, s3, s7 -; GFX8-NEXT: s_mov_b32 s7, 0xffff -; GFX8-NEXT: s_ashr_i32 s5, s11, s15 +; GFX8-NEXT: s_sext_i32_i16 s11, s3 +; GFX8-NEXT: s_bfe_i32 s3, s3, 0x100010 +; GFX8-NEXT: s_sext_i32_i16 s15, s7 +; GFX8-NEXT: s_bfe_i32 s7, s7, 0x100010 +; GFX8-NEXT: s_ashr_i32 s5, s10, s14 ; GFX8-NEXT: s_ashr_i32 s2, s2, s6 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_and_b32 s4, s4, s7 -; GFX8-NEXT: s_ashr_i32 s8, s8, s13 -; GFX8-NEXT: s_ashr_i32 s6, s12, s16 +; GFX8-NEXT: s_and_b32 s4, s4, 0xffff +; GFX8-NEXT: s_ashr_i32 s8, s8, s12 +; GFX8-NEXT: s_ashr_i32 s6, s11, s15 +; GFX8-NEXT: s_ashr_i32 s3, s3, s7 ; GFX8-NEXT: s_or_b32 s1, s1, s4 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16 -; GFX8-NEXT: s_and_b32 s4, s5, s7 +; GFX8-NEXT: s_and_b32 s4, s5, 0xffff ; GFX8-NEXT: s_lshl_b32 s0, s0, 16 -; GFX8-NEXT: s_and_b32 s8, s8, s7 +; GFX8-NEXT: s_and_b32 s7, s8, 0xffff ; GFX8-NEXT: s_or_b32 s2, s2, s4 ; GFX8-NEXT: s_lshl_b32 s3, s3, 16 -; GFX8-NEXT: s_and_b32 s4, s6, s7 -; GFX8-NEXT: s_or_b32 s0, s0, s8 +; GFX8-NEXT: s_and_b32 s4, s6, 0xffff +; GFX8-NEXT: s_or_b32 s0, s0, s7 ; GFX8-NEXT: s_or_b32 s3, s3, s4 ; 
GFX8-NEXT: ; return to shader part epilog ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll @@ -111,9 +111,8 @@ ; ; GFX10-LABEL: s_bswap_v2i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s2, 0x10203 -; GFX10-NEXT: v_perm_b32 v0, 0, s0, s2 -; GFX10-NEXT: v_perm_b32 v1, 0, s1, s2 +; GFX10-NEXT: v_perm_b32 v0, 0, s0, 0x10203 +; GFX10-NEXT: v_perm_b32 v1, 0, s1, 0x10203 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: v_readfirstlane_b32 s1, v1 ; GFX10-NEXT: ; return to shader part epilog @@ -154,9 +153,8 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_mov_b32 s4, 0x10203 -; GFX10-NEXT: v_perm_b32 v0, 0, v0, s4 -; GFX10-NEXT: v_perm_b32 v1, 0, v1, s4 +; GFX10-NEXT: v_perm_b32 v0, 0, v0, 0x10203 +; GFX10-NEXT: v_perm_b32 v1, 0, v1, 0x10203 ; GFX10-NEXT: s_setpc_b64 s[30:31] %bswap = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %src) ret <2 x i32> %bswap @@ -200,9 +198,8 @@ ; ; GFX10-LABEL: s_bswap_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s2, 0x10203 -; GFX10-NEXT: v_perm_b32 v0, 0, s1, s2 -; GFX10-NEXT: v_perm_b32 v1, 0, s0, s2 +; GFX10-NEXT: v_perm_b32 v0, 0, s1, 0x10203 +; GFX10-NEXT: v_perm_b32 v1, 0, s0, 0x10203 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: v_readfirstlane_b32 s1, v1 ; GFX10-NEXT: ; return to shader part epilog @@ -246,9 +243,8 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_mov_b32 s4, 0x10203 -; GFX10-NEXT: v_perm_b32 v2, 0, v1, s4 -; GFX10-NEXT: v_perm_b32 v1, 0, v0, s4 +; GFX10-NEXT: v_perm_b32 v2, 0, v1, 0x10203 +; GFX10-NEXT: v_perm_b32 v1, 0, v0, 0x10203 ; GFX10-NEXT: v_mov_b32_e32 v0, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] %bswap = call i64 @llvm.bswap.i64(i64 %src) @@ -313,11 +309,10 @@ ; ; 
GFX10-LABEL: s_bswap_v2i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s4, 0x10203 -; GFX10-NEXT: v_perm_b32 v0, 0, s1, s4 -; GFX10-NEXT: v_perm_b32 v1, 0, s0, s4 -; GFX10-NEXT: v_perm_b32 v2, 0, s3, s4 -; GFX10-NEXT: v_perm_b32 v3, 0, s2, s4 +; GFX10-NEXT: v_perm_b32 v0, 0, s1, 0x10203 +; GFX10-NEXT: v_perm_b32 v1, 0, s0, 0x10203 +; GFX10-NEXT: v_perm_b32 v2, 0, s3, 0x10203 +; GFX10-NEXT: v_perm_b32 v3, 0, s2, 0x10203 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: v_readfirstlane_b32 s1, v1 ; GFX10-NEXT: v_readfirstlane_b32 s2, v2 @@ -376,11 +371,10 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_mov_b32 s4, 0x10203 -; GFX10-NEXT: v_perm_b32 v4, 0, v1, s4 -; GFX10-NEXT: v_perm_b32 v5, 0, v3, s4 -; GFX10-NEXT: v_perm_b32 v1, 0, v0, s4 -; GFX10-NEXT: v_perm_b32 v3, 0, v2, s4 +; GFX10-NEXT: v_perm_b32 v4, 0, v1, 0x10203 +; GFX10-NEXT: v_perm_b32 v5, 0, v3, 0x10203 +; GFX10-NEXT: v_perm_b32 v1, 0, v0, 0x10203 +; GFX10-NEXT: v_perm_b32 v3, 0, v2, 0x10203 ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: v_mov_b32_e32 v2, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -457,12 +451,11 @@ define amdgpu_ps i32 @s_bswap_v2i16(<2 x i16> inreg %src) { ; GFX7-LABEL: s_bswap_v2i16: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_mov_b32 s3, 0x80008 ; GFX7-NEXT: s_lshl_b32 s2, s0, 8 -; GFX7-NEXT: s_bfe_u32 s0, s0, s3 +; GFX7-NEXT: s_bfe_u32 s0, s0, 0x80008 ; GFX7-NEXT: s_or_b32 s0, s0, s2 ; GFX7-NEXT: s_lshl_b32 s2, s1, 8 -; GFX7-NEXT: s_bfe_u32 s1, s1, s3 +; GFX7-NEXT: s_bfe_u32 s1, s1, 0x80008 ; GFX7-NEXT: s_or_b32 s1, s1, s2 ; GFX7-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX7-NEXT: s_bfe_u32 s0, s0, 0x100000 @@ -647,9 +640,8 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_mov_b32 s4, 0x10203 -; GFX10-NEXT: v_perm_b32 v1, 0, v1, s4 -; GFX10-NEXT: v_perm_b32 v2, 0, v0, s4 +; GFX10-NEXT: v_perm_b32 v1, 0, v1, 0x10203 +; GFX10-NEXT: 
v_perm_b32 v2, 0, v0, 0x10203 ; GFX10-NEXT: v_lshrrev_b64 v[0:1], 16, v[1:2] ; GFX10-NEXT: s_setpc_b64 s[30:31] %trunc = trunc i64 %src to i48 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll @@ -763,25 +763,22 @@ ; GCN-LABEL: extractelement_sgpr_v8i8_sgpr_idx: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GCN-NEXT: s_mov_b32 s7, 0x80008 -; GCN-NEXT: s_movk_i32 s5, 0xff ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_bfe_u32 s8, s0, s7 -; GCN-NEXT: s_and_b32 s6, s0, s5 -; GCN-NEXT: s_lshl_b32 s8, s8, 8 -; GCN-NEXT: s_or_b32 s6, s6, s8 -; GCN-NEXT: s_mov_b32 s8, 0x80010 +; GCN-NEXT: s_bfe_u32 s6, s0, 0x80008 ; GCN-NEXT: s_lshr_b32 s2, s0, 24 -; GCN-NEXT: s_bfe_u32 s0, s0, s8 +; GCN-NEXT: s_and_b32 s5, s0, 0xff +; GCN-NEXT: s_lshl_b32 s6, s6, 8 +; GCN-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GCN-NEXT: s_or_b32 s5, s5, s6 ; GCN-NEXT: s_lshl_b32 s0, s0, 16 -; GCN-NEXT: s_or_b32 s0, s6, s0 +; GCN-NEXT: s_or_b32 s0, s5, s0 ; GCN-NEXT: s_lshl_b32 s2, s2, 24 -; GCN-NEXT: s_or_b32 s0, s0, s2 -; GCN-NEXT: s_and_b32 s2, s1, s5 -; GCN-NEXT: s_bfe_u32 s5, s1, s7 +; GCN-NEXT: s_bfe_u32 s5, s1, 0x80008 ; GCN-NEXT: s_lshr_b32 s3, s1, 24 +; GCN-NEXT: s_or_b32 s0, s0, s2 +; GCN-NEXT: s_and_b32 s2, s1, 0xff ; GCN-NEXT: s_lshl_b32 s5, s5, 8 -; GCN-NEXT: s_bfe_u32 s1, s1, s8 +; GCN-NEXT: s_bfe_u32 s1, s1, 0x80010 ; GCN-NEXT: s_or_b32 s2, s2, s5 ; GCN-NEXT: s_lshl_b32 s1, s1, 16 ; GCN-NEXT: s_or_b32 s1, s2, s1 @@ -798,32 +795,29 @@ ; GFX10-LABEL: extractelement_sgpr_v8i8_sgpr_idx: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX10-NEXT: s_mov_b32 s3, 0x80008 -; GFX10-NEXT: s_movk_i32 s2, 0xff -; GFX10-NEXT: s_mov_b32 s5, 0x80010 -; GFX10-NEXT: s_lshr_b32 s6, s4, 2 +; GFX10-NEXT: s_lshr_b32 s2, s4, 2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX10-NEXT: s_bfe_u32 s10, s0, s3 -; GFX10-NEXT: s_bfe_u32 s3, s1, s3 -; GFX10-NEXT: s_lshr_b32 s7, s0, 24 -; GFX10-NEXT: s_lshr_b32 s8, s1, 24 -; GFX10-NEXT: s_and_b32 s9, s0, s2 -; GFX10-NEXT: s_bfe_u32 s0, s0, s5 -; GFX10-NEXT: s_and_b32 s2, s1, s2 -; GFX10-NEXT: s_bfe_u32 s1, s1, s5 -; GFX10-NEXT: s_lshl_b32 s5, s10, 8 -; GFX10-NEXT: s_lshl_b32 s3, s3, 8 +; GFX10-NEXT: s_bfe_u32 s7, s0, 0x80008 +; GFX10-NEXT: s_bfe_u32 s9, s1, 0x80008 +; GFX10-NEXT: s_lshr_b32 s3, s0, 24 +; GFX10-NEXT: s_lshr_b32 s5, s1, 24 +; GFX10-NEXT: s_and_b32 s6, s0, 0xff +; GFX10-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX10-NEXT: s_and_b32 s8, s1, 0xff +; GFX10-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX10-NEXT: s_lshl_b32 s7, s7, 8 +; GFX10-NEXT: s_lshl_b32 s9, s9, 8 ; GFX10-NEXT: s_lshl_b32 s0, s0, 16 ; GFX10-NEXT: s_lshl_b32 s1, s1, 16 -; GFX10-NEXT: s_or_b32 s5, s9, s5 -; GFX10-NEXT: s_or_b32 s2, s2, s3 -; GFX10-NEXT: s_lshl_b32 s7, s7, 24 -; GFX10-NEXT: s_lshl_b32 s8, s8, 24 -; GFX10-NEXT: s_or_b32 s0, s5, s0 -; GFX10-NEXT: s_or_b32 s1, s2, s1 -; GFX10-NEXT: s_or_b32 s0, s0, s7 -; GFX10-NEXT: s_or_b32 s1, s1, s8 -; GFX10-NEXT: s_cmp_eq_u32 s6, 1 +; GFX10-NEXT: s_or_b32 s6, s6, s7 +; GFX10-NEXT: s_or_b32 s7, s8, s9 +; GFX10-NEXT: s_lshl_b32 s3, s3, 24 +; GFX10-NEXT: s_lshl_b32 s5, s5, 24 +; GFX10-NEXT: s_or_b32 s0, s6, s0 +; GFX10-NEXT: s_or_b32 s1, s7, s1 +; GFX10-NEXT: s_or_b32 s0, s0, s3 +; GFX10-NEXT: s_or_b32 s1, s1, s5 +; GFX10-NEXT: s_cmp_eq_u32 s2, 1 ; GFX10-NEXT: s_cselect_b32 s0, s1, s0 ; GFX10-NEXT: s_and_b32 s1, s4, 3 ; GFX10-NEXT: s_lshl_b32 s1, s1, 3 @@ -934,7 +928,6 @@ ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX10-NEXT: s_mov_b32 s0, 8 ; GFX10-NEXT: s_mov_b32 s1, 16 -; GFX10-NEXT: s_movk_i32 s3, 0xff ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 @@ -942,9 +935,9 @@ ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, s0, v1 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v6, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v7, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v0, v0, s3, v3 +; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX10-NEXT: v_and_or_b32 v1, v1, s3, v5 +; GFX10-NEXT: v_and_or_b32 v1, v1, 0xff, v5 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v4 ; GFX10-NEXT: s_lshr_b32 s0, s2, 2 ; GFX10-NEXT: v_or3_b32 v0, v0, v6, v2 @@ -1063,7 +1056,6 @@ ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX10-NEXT: s_mov_b32 s4, 8 ; GFX10-NEXT: s_mov_b32 s5, 16 -; GFX10-NEXT: s_movk_i32 s6, 0xff ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX10-NEXT: v_lshlrev_b32_sdwa v4, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 @@ -1071,9 +1063,9 @@ ; GFX10-NEXT: v_lshlrev_b32_sdwa v6, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v7, s5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v8, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v0, v0, s6, v4 +; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v4 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GFX10-NEXT: v_and_or_b32 v1, v1, s6, v6 +; GFX10-NEXT: v_and_or_b32 v1, v1, 0xff, v6 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v5 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 2, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 3, v2 @@ -1093,27 +1085,26 @@ ; GCN-LABEL: extractelement_sgpr_v8i8_vgpr_idx: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GCN-NEXT: s_mov_b32 s6, 0x80008 -; GCN-NEXT: s_movk_i32 s4, 0xff ; GCN-NEXT: v_lshrrev_b32_e32 v1, 2, v0 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GCN-NEXT: v_and_b32_e32 v0, 3, v0 +; GCN-NEXT: 
v_lshlrev_b32_e32 v0, 3, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_bfe_u32 s7, s0, s6 -; GCN-NEXT: s_and_b32 s5, s0, s4 -; GCN-NEXT: s_lshl_b32 s7, s7, 8 -; GCN-NEXT: s_or_b32 s5, s5, s7 -; GCN-NEXT: s_mov_b32 s7, 0x80010 +; GCN-NEXT: s_bfe_u32 s5, s0, 0x80008 ; GCN-NEXT: s_lshr_b32 s2, s0, 24 -; GCN-NEXT: s_bfe_u32 s0, s0, s7 +; GCN-NEXT: s_and_b32 s4, s0, 0xff +; GCN-NEXT: s_lshl_b32 s5, s5, 8 +; GCN-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GCN-NEXT: s_or_b32 s4, s4, s5 ; GCN-NEXT: s_lshl_b32 s0, s0, 16 -; GCN-NEXT: s_or_b32 s0, s5, s0 +; GCN-NEXT: s_or_b32 s0, s4, s0 ; GCN-NEXT: s_lshl_b32 s2, s2, 24 -; GCN-NEXT: s_or_b32 s0, s0, s2 -; GCN-NEXT: s_and_b32 s2, s1, s4 -; GCN-NEXT: s_bfe_u32 s4, s1, s6 +; GCN-NEXT: s_bfe_u32 s4, s1, 0x80008 ; GCN-NEXT: s_lshr_b32 s3, s1, 24 +; GCN-NEXT: s_or_b32 s0, s0, s2 +; GCN-NEXT: s_and_b32 s2, s1, 0xff ; GCN-NEXT: s_lshl_b32 s4, s4, 8 -; GCN-NEXT: s_bfe_u32 s1, s1, s7 +; GCN-NEXT: s_bfe_u32 s1, s1, 0x80010 ; GCN-NEXT: s_or_b32 s2, s2, s4 ; GCN-NEXT: s_lshl_b32 s1, s1, 16 ; GCN-NEXT: s_or_b32 s1, s2, s1 @@ -1121,9 +1112,7 @@ ; GCN-NEXT: s_or_b32 s1, s1, s2 ; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: v_mov_b32_e32 v3, s1 -; GCN-NEXT: v_and_b32_e32 v0, 3, v0 ; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GCN-NEXT: v_lshrrev_b32_e32 v0, v0, v1 ; GCN-NEXT: v_readfirstlane_b32 s0, v0 ; GCN-NEXT: ; return to shader part epilog @@ -1131,33 +1120,30 @@ ; GFX10-LABEL: extractelement_sgpr_v8i8_vgpr_idx: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX10-NEXT: s_mov_b32 s3, 0x80008 -; GFX10-NEXT: s_movk_i32 s2, 0xff -; GFX10-NEXT: s_mov_b32 s4, 0x80010 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 2, v0 ; GFX10-NEXT: v_and_b32_e32 v0, 3, v0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_bfe_u32 s8, s0, s3 -; GFX10-NEXT: s_bfe_u32 s3, s1, s3 -; GFX10-NEXT: s_lshr_b32 s6, s1, 24 -; 
GFX10-NEXT: s_and_b32 s7, s0, s2 -; GFX10-NEXT: s_and_b32 s2, s1, s2 -; GFX10-NEXT: s_bfe_u32 s1, s1, s4 -; GFX10-NEXT: s_lshl_b32 s3, s3, 8 +; GFX10-NEXT: s_bfe_u32 s7, s1, 0x80008 +; GFX10-NEXT: s_lshr_b32 s3, s1, 24 +; GFX10-NEXT: s_and_b32 s6, s1, 0xff +; GFX10-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX10-NEXT: s_lshl_b32 s7, s7, 8 ; GFX10-NEXT: s_lshl_b32 s1, s1, 16 -; GFX10-NEXT: s_or_b32 s2, s2, s3 -; GFX10-NEXT: s_lshl_b32 s3, s6, 24 -; GFX10-NEXT: s_or_b32 s1, s2, s1 -; GFX10-NEXT: s_lshr_b32 s5, s0, 24 -; GFX10-NEXT: s_bfe_u32 s0, s0, s4 -; GFX10-NEXT: s_lshl_b32 s4, s8, 8 +; GFX10-NEXT: s_or_b32 s6, s6, s7 +; GFX10-NEXT: s_bfe_u32 s5, s0, 0x80008 +; GFX10-NEXT: s_lshl_b32 s3, s3, 24 +; GFX10-NEXT: s_or_b32 s1, s6, s1 +; GFX10-NEXT: s_lshr_b32 s2, s0, 24 +; GFX10-NEXT: s_and_b32 s4, s0, 0xff +; GFX10-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX10-NEXT: s_lshl_b32 s5, s5, 8 ; GFX10-NEXT: s_or_b32 s1, s1, s3 ; GFX10-NEXT: s_lshl_b32 s0, s0, 16 -; GFX10-NEXT: s_or_b32 s3, s7, s4 +; GFX10-NEXT: s_or_b32 s3, s4, s5 ; GFX10-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-NEXT: s_lshl_b32 s2, s5, 24 +; GFX10-NEXT: s_lshl_b32 s2, s2, 24 ; GFX10-NEXT: s_or_b32 s0, s3, s0 ; GFX10-NEXT: s_or_b32 s0, s0, s2 ; GFX10-NEXT: v_cndmask_b32_e32 v1, s0, v2, vcc_lo @@ -2089,45 +2075,42 @@ ; GCN-LABEL: extractelement_sgpr_v16i8_sgpr_idx: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GCN-NEXT: s_mov_b32 s11, 0x80008 -; GCN-NEXT: s_movk_i32 s9, 0xff ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_bfe_u32 s12, s0, s11 -; GCN-NEXT: s_and_b32 s10, s0, s9 -; GCN-NEXT: s_lshl_b32 s12, s12, 8 -; GCN-NEXT: s_or_b32 s10, s10, s12 -; GCN-NEXT: s_mov_b32 s12, 0x80010 +; GCN-NEXT: s_bfe_u32 s10, s0, 0x80008 ; GCN-NEXT: s_lshr_b32 s5, s0, 24 -; GCN-NEXT: s_bfe_u32 s0, s0, s12 +; GCN-NEXT: s_and_b32 s9, s0, 0xff +; GCN-NEXT: s_lshl_b32 s10, s10, 8 +; GCN-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GCN-NEXT: s_or_b32 s9, s9, s10 ; GCN-NEXT: s_lshl_b32 s0, s0, 16 -; GCN-NEXT: s_or_b32 s0, s10, s0 +; 
GCN-NEXT: s_or_b32 s0, s9, s0 ; GCN-NEXT: s_lshl_b32 s5, s5, 24 -; GCN-NEXT: s_bfe_u32 s10, s1, s11 +; GCN-NEXT: s_bfe_u32 s9, s1, 0x80008 ; GCN-NEXT: s_lshr_b32 s6, s1, 24 ; GCN-NEXT: s_or_b32 s0, s0, s5 -; GCN-NEXT: s_and_b32 s5, s1, s9 -; GCN-NEXT: s_lshl_b32 s10, s10, 8 -; GCN-NEXT: s_bfe_u32 s1, s1, s12 -; GCN-NEXT: s_or_b32 s5, s5, s10 +; GCN-NEXT: s_and_b32 s5, s1, 0xff +; GCN-NEXT: s_lshl_b32 s9, s9, 8 +; GCN-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GCN-NEXT: s_or_b32 s5, s5, s9 ; GCN-NEXT: s_lshl_b32 s1, s1, 16 ; GCN-NEXT: s_or_b32 s1, s5, s1 ; GCN-NEXT: s_lshl_b32 s5, s6, 24 -; GCN-NEXT: s_bfe_u32 s6, s2, s11 +; GCN-NEXT: s_bfe_u32 s6, s2, 0x80008 ; GCN-NEXT: s_lshr_b32 s7, s2, 24 ; GCN-NEXT: s_or_b32 s1, s1, s5 -; GCN-NEXT: s_and_b32 s5, s2, s9 +; GCN-NEXT: s_and_b32 s5, s2, 0xff ; GCN-NEXT: s_lshl_b32 s6, s6, 8 -; GCN-NEXT: s_bfe_u32 s2, s2, s12 +; GCN-NEXT: s_bfe_u32 s2, s2, 0x80010 ; GCN-NEXT: s_or_b32 s5, s5, s6 ; GCN-NEXT: s_lshl_b32 s2, s2, 16 ; GCN-NEXT: s_or_b32 s2, s5, s2 ; GCN-NEXT: s_lshl_b32 s5, s7, 24 -; GCN-NEXT: s_bfe_u32 s6, s3, s11 +; GCN-NEXT: s_bfe_u32 s6, s3, 0x80008 ; GCN-NEXT: s_lshr_b32 s8, s3, 24 ; GCN-NEXT: s_or_b32 s2, s2, s5 -; GCN-NEXT: s_and_b32 s5, s3, s9 +; GCN-NEXT: s_and_b32 s5, s3, 0xff ; GCN-NEXT: s_lshl_b32 s6, s6, 8 -; GCN-NEXT: s_bfe_u32 s3, s3, s12 +; GCN-NEXT: s_bfe_u32 s3, s3, 0x80010 ; GCN-NEXT: s_or_b32 s5, s5, s6 ; GCN-NEXT: s_lshl_b32 s3, s3, 16 ; GCN-NEXT: s_or_b32 s3, s5, s3 @@ -2148,50 +2131,47 @@ ; GFX10-LABEL: extractelement_sgpr_v16i8_sgpr_idx: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GFX10-NEXT: s_mov_b32 s6, 0x80008 -; GFX10-NEXT: s_movk_i32 s5, 0xff -; GFX10-NEXT: s_mov_b32 s7, 0x80010 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_bfe_u32 s13, s0, s6 -; GFX10-NEXT: s_lshr_b32 s8, s0, 24 -; GFX10-NEXT: s_and_b32 s12, s0, s5 -; GFX10-NEXT: s_bfe_u32 s0, s0, s7 -; GFX10-NEXT: s_lshl_b32 s13, s13, 8 -; GFX10-NEXT: s_bfe_u32 s15, s1, s6 -; GFX10-NEXT: s_bfe_u32 s17, s2, s6 -; 
GFX10-NEXT: s_lshl_b32 s0, s0, 16 -; GFX10-NEXT: s_or_b32 s12, s12, s13 -; GFX10-NEXT: s_bfe_u32 s6, s3, s6 -; GFX10-NEXT: s_lshr_b32 s9, s1, 24 -; GFX10-NEXT: s_lshr_b32 s10, s2, 24 -; GFX10-NEXT: s_lshr_b32 s11, s3, 24 -; GFX10-NEXT: s_and_b32 s14, s1, s5 -; GFX10-NEXT: s_bfe_u32 s1, s1, s7 -; GFX10-NEXT: s_and_b32 s16, s2, s5 -; GFX10-NEXT: s_lshl_b32 s8, s8, 24 -; GFX10-NEXT: s_lshl_b32 s15, s15, 8 -; GFX10-NEXT: s_lshl_b32 s17, s17, 8 -; GFX10-NEXT: s_or_b32 s0, s12, s0 -; GFX10-NEXT: s_bfe_u32 s2, s2, s7 -; GFX10-NEXT: s_and_b32 s5, s3, s5 -; GFX10-NEXT: s_lshl_b32 s6, s6, 8 -; GFX10-NEXT: s_bfe_u32 s3, s3, s7 +; GFX10-NEXT: s_bfe_u32 s10, s0, 0x80008 +; GFX10-NEXT: s_bfe_u32 s12, s1, 0x80008 +; GFX10-NEXT: s_lshr_b32 s6, s1, 24 +; GFX10-NEXT: s_and_b32 s9, s0, 0xff +; GFX10-NEXT: s_and_b32 s11, s1, 0xff +; GFX10-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX10-NEXT: s_lshl_b32 s10, s10, 8 +; GFX10-NEXT: s_lshl_b32 s12, s12, 8 +; GFX10-NEXT: s_lshr_b32 s5, s0, 24 +; GFX10-NEXT: s_bfe_u32 s0, s0, 0x80010 ; GFX10-NEXT: s_lshl_b32 s1, s1, 16 -; GFX10-NEXT: s_or_b32 s13, s14, s15 -; GFX10-NEXT: s_or_b32 s0, s0, s8 -; GFX10-NEXT: s_or_b32 s8, s16, s17 +; GFX10-NEXT: s_or_b32 s9, s9, s10 +; GFX10-NEXT: s_or_b32 s10, s11, s12 +; GFX10-NEXT: s_bfe_u32 s14, s2, 0x80008 +; GFX10-NEXT: s_lshl_b32 s0, s0, 16 +; GFX10-NEXT: s_lshl_b32 s6, s6, 24 +; GFX10-NEXT: s_or_b32 s1, s10, s1 +; GFX10-NEXT: s_lshr_b32 s7, s2, 24 +; GFX10-NEXT: s_and_b32 s13, s2, 0xff +; GFX10-NEXT: s_bfe_u32 s2, s2, 0x80010 +; GFX10-NEXT: s_lshl_b32 s5, s5, 24 +; GFX10-NEXT: s_lshl_b32 s14, s14, 8 +; GFX10-NEXT: s_or_b32 s0, s9, s0 +; GFX10-NEXT: s_or_b32 s1, s1, s6 +; GFX10-NEXT: s_bfe_u32 s6, s3, 0x80008 +; GFX10-NEXT: s_lshr_b32 s8, s3, 24 ; GFX10-NEXT: s_lshl_b32 s2, s2, 16 -; GFX10-NEXT: s_or_b32 s5, s5, s6 +; GFX10-NEXT: s_or_b32 s11, s13, s14 +; GFX10-NEXT: s_or_b32 s0, s0, s5 +; GFX10-NEXT: s_lshl_b32 s5, s7, 24 +; GFX10-NEXT: s_and_b32 s7, s3, 0xff +; GFX10-NEXT: s_lshl_b32 s6, s6, 8 +; GFX10-NEXT: 
s_bfe_u32 s3, s3, 0x80010 +; GFX10-NEXT: s_or_b32 s2, s11, s2 +; GFX10-NEXT: s_or_b32 s6, s7, s6 ; GFX10-NEXT: s_lshl_b32 s3, s3, 16 -; GFX10-NEXT: s_lshl_b32 s9, s9, 24 -; GFX10-NEXT: s_or_b32 s1, s13, s1 -; GFX10-NEXT: s_or_b32 s2, s8, s2 -; GFX10-NEXT: s_lshl_b32 s8, s10, 24 -; GFX10-NEXT: s_or_b32 s3, s5, s3 -; GFX10-NEXT: s_lshl_b32 s5, s11, 24 +; GFX10-NEXT: s_or_b32 s2, s2, s5 +; GFX10-NEXT: s_or_b32 s3, s6, s3 +; GFX10-NEXT: s_lshl_b32 s5, s8, 24 ; GFX10-NEXT: s_lshr_b32 s6, s4, 2 -; GFX10-NEXT: s_or_b32 s1, s1, s9 -; GFX10-NEXT: s_or_b32 s2, s2, s8 ; GFX10-NEXT: s_or_b32 s3, s3, s5 ; GFX10-NEXT: s_cmp_eq_u32 s6, 1 ; GFX10-NEXT: s_cselect_b32 s0, s1, s0 @@ -2371,37 +2351,35 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off ; GFX10-NEXT: s_mov_b32 s0, 8 -; GFX10-NEXT: v_mov_b32_e32 v5, 8 +; GFX10-NEXT: v_mov_b32_e32 v4, 8 ; GFX10-NEXT: s_mov_b32 s1, 16 -; GFX10-NEXT: s_movk_i32 s3, 0xff -; GFX10-NEXT: v_mov_b32_e32 v4, 0xff -; GFX10-NEXT: v_mov_b32_e32 v6, 16 +; GFX10-NEXT: v_mov_b32_e32 v5, 16 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v8, 24, v1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v10, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v12, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshrrev_b32_e32 v9, 24, v2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v11, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v13, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v14, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_and_or_b32 v0, v0, s3, v10 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v9, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:BYTE_1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v11, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshrrev_b32_e32 v8, 24, v2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v10, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v12, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v13, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v9 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX10-NEXT: v_and_or_b32 v1, v1, 0xff, v11 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; GFX10-NEXT: v_and_or_b32 v1, v1, s3, v12 -; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8 ; GFX10-NEXT: s_lshr_b32 s0, s2, 2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v15, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v2, v2, v4, v14 -; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v9 -; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshrrev_b32_e32 v10, 24, v3 -; GFX10-NEXT: v_or3_b32 v0, v0, v11, v7 -; GFX10-NEXT: v_or3_b32 v1, v1, v13, v8 +; GFX10-NEXT: v_lshlrev_b32_sdwa v14, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_and_or_b32 v2, 0xff, v2, v13 +; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshrrev_b32_e32 v9, 24, v3 +; GFX10-NEXT: v_or3_b32 v0, v0, v10, v6 +; GFX10-NEXT: v_or3_b32 v1, v1, v12, v7 ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 1 -; GFX10-NEXT: v_or3_b32 v2, v2, v15, v9 -; GFX10-NEXT: v_and_or_b32 v4, v3, v4, v5 -; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v10 +; GFX10-NEXT: 
v_or3_b32 v2, v2, v14, v8 +; GFX10-NEXT: v_and_or_b32 v4, 0xff, v3, v4 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v9 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 2 ; GFX10-NEXT: v_or3_b32 v1, v4, v3, v5 @@ -2583,43 +2561,41 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_load_dwordx4 v[3:6], v[0:1], off ; GFX10-NEXT: s_mov_b32 s4, 8 -; GFX10-NEXT: v_mov_b32_e32 v1, 8 +; GFX10-NEXT: v_mov_b32_e32 v0, 8 ; GFX10-NEXT: s_mov_b32 s5, 16 -; GFX10-NEXT: s_movk_i32 s6, 0xff -; GFX10-NEXT: v_mov_b32_e32 v0, 0xff -; GFX10-NEXT: v_mov_b32_e32 v7, 16 -; GFX10-NEXT: v_lshrrev_b32_e32 v8, 2, v2 +; GFX10-NEXT: v_mov_b32_e32 v1, 16 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 2, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 3, v2 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v8 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v9, 24, v3 -; GFX10-NEXT: v_lshrrev_b32_e32 v10, 24, v4 -; GFX10-NEXT: v_lshlrev_b32_sdwa v13, s4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v15, s4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshrrev_b32_e32 v11, 24, v5 -; GFX10-NEXT: v_lshlrev_b32_sdwa v14, s5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v16, s5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v17, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_and_or_b32 v3, v3, s6, v13 +; GFX10-NEXT: v_lshrrev_b32_e32 v8, 24, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v9, 24, v4 +; GFX10-NEXT: v_lshlrev_b32_sdwa v12, s4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v14, s4, v4 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshrrev_b32_e32 v10, 24, v5 +; GFX10-NEXT: v_lshlrev_b32_sdwa v13, s5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v15, s5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v16, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_and_or_b32 v3, v3, 0xff, v12 +; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; GFX10-NEXT: v_and_or_b32 v4, v4, 0xff, v14 ; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v9 -; GFX10-NEXT: v_and_or_b32 v4, v4, s6, v15 +; GFX10-NEXT: v_lshrrev_b32_e32 v11, 24, v6 +; GFX10-NEXT: v_lshlrev_b32_sdwa v17, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_and_or_b32 v5, 0xff, v5, v16 ; GFX10-NEXT: v_lshlrev_b32_e32 v10, 24, v10 -; GFX10-NEXT: v_lshrrev_b32_e32 v12, 24, v6 -; GFX10-NEXT: v_lshlrev_b32_sdwa v18, v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_and_or_b32 v5, v5, v0, v17 -; GFX10-NEXT: v_lshlrev_b32_e32 v11, 24, v11 -; GFX10-NEXT: v_or3_b32 v3, v3, v14, v9 -; GFX10-NEXT: v_or3_b32 v4, v4, v16, v10 -; GFX10-NEXT: v_lshlrev_b32_sdwa v7, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v0, v6, v0, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 24, v12 -; GFX10-NEXT: v_or3_b32 v5, v5, v18, v11 +; GFX10-NEXT: v_or3_b32 v3, v3, v13, v8 +; GFX10-NEXT: v_or3_b32 v4, v4, v15, v9 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_and_or_b32 v0, 0xff, v6, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, 
v11 +; GFX10-NEXT: v_or3_b32 v5, v5, v17, v10 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v8 -; GFX10-NEXT: v_or3_b32 v0, v0, v7, v1 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v7 +; GFX10-NEXT: v_or3_b32 v0, v0, v1, v6 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v8 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v7 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v1, v0 @@ -2633,47 +2609,46 @@ ; GCN-LABEL: extractelement_sgpr_v16i8_vgpr_idx: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GCN-NEXT: s_mov_b32 s10, 0x80008 -; GCN-NEXT: s_movk_i32 s8, 0xff ; GCN-NEXT: v_lshrrev_b32_e32 v1, 2, v0 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GCN-NEXT: v_and_b32_e32 v0, 3, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_bfe_u32 s11, s0, s10 -; GCN-NEXT: s_and_b32 s9, s0, s8 -; GCN-NEXT: s_lshl_b32 s11, s11, 8 -; GCN-NEXT: s_or_b32 s9, s9, s11 -; GCN-NEXT: s_mov_b32 s11, 0x80010 +; GCN-NEXT: s_bfe_u32 s9, s0, 0x80008 ; GCN-NEXT: s_lshr_b32 s4, s0, 24 -; GCN-NEXT: s_bfe_u32 s0, s0, s11 +; GCN-NEXT: s_and_b32 s8, s0, 0xff +; GCN-NEXT: s_lshl_b32 s9, s9, 8 +; GCN-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GCN-NEXT: s_or_b32 s8, s8, s9 ; GCN-NEXT: s_lshl_b32 s0, s0, 16 -; GCN-NEXT: s_or_b32 s0, s9, s0 +; GCN-NEXT: s_or_b32 s0, s8, s0 ; GCN-NEXT: s_lshl_b32 s4, s4, 24 -; GCN-NEXT: s_bfe_u32 s9, s1, s10 +; GCN-NEXT: s_bfe_u32 s8, s1, 0x80008 ; GCN-NEXT: s_lshr_b32 s5, s1, 24 ; GCN-NEXT: s_or_b32 s0, s0, s4 -; GCN-NEXT: s_and_b32 s4, s1, s8 -; GCN-NEXT: s_lshl_b32 s9, s9, 8 -; GCN-NEXT: s_bfe_u32 s1, s1, s11 -; GCN-NEXT: s_or_b32 s4, s4, s9 +; GCN-NEXT: s_and_b32 s4, s1, 0xff +; GCN-NEXT: s_lshl_b32 s8, s8, 8 +; GCN-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GCN-NEXT: s_or_b32 s4, s4, s8 ; GCN-NEXT: s_lshl_b32 s1, s1, 16 ; GCN-NEXT: s_or_b32 s1, s4, s1 ; GCN-NEXT: 
s_lshl_b32 s4, s5, 24 -; GCN-NEXT: s_bfe_u32 s5, s2, s10 +; GCN-NEXT: s_bfe_u32 s5, s2, 0x80008 ; GCN-NEXT: s_lshr_b32 s6, s2, 24 ; GCN-NEXT: s_or_b32 s1, s1, s4 -; GCN-NEXT: s_and_b32 s4, s2, s8 +; GCN-NEXT: s_and_b32 s4, s2, 0xff ; GCN-NEXT: s_lshl_b32 s5, s5, 8 -; GCN-NEXT: s_bfe_u32 s2, s2, s11 +; GCN-NEXT: s_bfe_u32 s2, s2, 0x80010 ; GCN-NEXT: s_or_b32 s4, s4, s5 ; GCN-NEXT: s_lshl_b32 s2, s2, 16 ; GCN-NEXT: s_or_b32 s2, s4, s2 ; GCN-NEXT: s_lshl_b32 s4, s6, 24 -; GCN-NEXT: s_bfe_u32 s5, s3, s10 +; GCN-NEXT: s_bfe_u32 s5, s3, 0x80008 ; GCN-NEXT: s_lshr_b32 s7, s3, 24 ; GCN-NEXT: s_or_b32 s2, s2, s4 -; GCN-NEXT: s_and_b32 s4, s3, s8 +; GCN-NEXT: s_and_b32 s4, s3, 0xff ; GCN-NEXT: s_lshl_b32 s5, s5, 8 -; GCN-NEXT: s_bfe_u32 s3, s3, s11 +; GCN-NEXT: s_bfe_u32 s3, s3, 0x80010 ; GCN-NEXT: s_or_b32 s4, s4, s5 ; GCN-NEXT: s_lshl_b32 s3, s3, 16 ; GCN-NEXT: s_or_b32 s3, s4, s3 @@ -2687,9 +2662,7 @@ ; GCN-NEXT: v_mov_b32_e32 v5, s3 ; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v0, 3, v0 ; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v5, vcc -; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GCN-NEXT: v_lshrrev_b32_e32 v0, v0, v1 ; GCN-NEXT: v_readfirstlane_b32 s0, v0 ; GCN-NEXT: ; return to shader part epilog @@ -2697,58 +2670,55 @@ ; GFX10-LABEL: extractelement_sgpr_v16i8_vgpr_idx: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GFX10-NEXT: s_mov_b32 s5, 0x80008 -; GFX10-NEXT: s_movk_i32 s4, 0xff -; GFX10-NEXT: s_mov_b32 s6, 0x80010 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 2, v0 ; GFX10-NEXT: v_and_b32_e32 v0, 3, v0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_bfe_u32 s12, s0, s5 -; GFX10-NEXT: s_bfe_u32 s14, s1, s5 -; GFX10-NEXT: s_lshr_b32 s8, s1, 24 -; GFX10-NEXT: s_and_b32 s11, s0, s4 -; GFX10-NEXT: s_and_b32 s13, s1, s4 -; GFX10-NEXT: s_bfe_u32 s1, s1, s6 -; GFX10-NEXT: s_lshl_b32 s12, s12, 8 -; 
GFX10-NEXT: s_lshl_b32 s14, s14, 8 +; GFX10-NEXT: s_bfe_u32 s9, s0, 0x80008 +; GFX10-NEXT: s_bfe_u32 s11, s1, 0x80008 +; GFX10-NEXT: s_lshr_b32 s5, s1, 24 +; GFX10-NEXT: s_and_b32 s8, s0, 0xff +; GFX10-NEXT: s_and_b32 s10, s1, 0xff +; GFX10-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX10-NEXT: s_lshl_b32 s9, s9, 8 +; GFX10-NEXT: s_lshl_b32 s11, s11, 8 ; GFX10-NEXT: s_lshl_b32 s1, s1, 16 -; GFX10-NEXT: s_or_b32 s11, s11, s12 -; GFX10-NEXT: s_or_b32 s12, s13, s14 -; GFX10-NEXT: s_lshl_b32 s8, s8, 24 -; GFX10-NEXT: s_or_b32 s1, s12, s1 -; GFX10-NEXT: s_lshr_b32 s7, s0, 24 -; GFX10-NEXT: s_bfe_u32 s0, s0, s6 -; GFX10-NEXT: s_or_b32 s1, s1, s8 +; GFX10-NEXT: s_or_b32 s8, s8, s9 +; GFX10-NEXT: s_or_b32 s9, s10, s11 +; GFX10-NEXT: s_lshl_b32 s5, s5, 24 +; GFX10-NEXT: s_or_b32 s1, s9, s1 +; GFX10-NEXT: s_lshr_b32 s4, s0, 24 +; GFX10-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX10-NEXT: s_or_b32 s1, s1, s5 ; GFX10-NEXT: s_lshl_b32 s0, s0, 16 -; GFX10-NEXT: s_bfe_u32 s16, s2, s5 +; GFX10-NEXT: s_bfe_u32 s13, s2, 0x80008 ; GFX10-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-NEXT: s_lshl_b32 s7, s7, 24 -; GFX10-NEXT: s_or_b32 s0, s11, s0 -; GFX10-NEXT: s_lshr_b32 s9, s2, 24 -; GFX10-NEXT: s_and_b32 s15, s2, s4 -; GFX10-NEXT: s_lshl_b32 s16, s16, 8 -; GFX10-NEXT: s_bfe_u32 s2, s2, s6 -; GFX10-NEXT: s_or_b32 s0, s0, s7 -; GFX10-NEXT: s_or_b32 s7, s15, s16 +; GFX10-NEXT: s_lshl_b32 s4, s4, 24 +; GFX10-NEXT: s_or_b32 s0, s8, s0 +; GFX10-NEXT: s_lshr_b32 s6, s2, 24 +; GFX10-NEXT: s_and_b32 s12, s2, 0xff +; GFX10-NEXT: s_bfe_u32 s2, s2, 0x80010 +; GFX10-NEXT: s_lshl_b32 s13, s13, 8 +; GFX10-NEXT: s_or_b32 s0, s0, s4 ; GFX10-NEXT: s_lshl_b32 s2, s2, 16 -; GFX10-NEXT: s_bfe_u32 s5, s3, s5 +; GFX10-NEXT: s_or_b32 s10, s12, s13 +; GFX10-NEXT: s_bfe_u32 s5, s3, 0x80008 ; GFX10-NEXT: v_cndmask_b32_e32 v2, s0, v2, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v1 -; GFX10-NEXT: s_or_b32 s2, s7, s2 -; GFX10-NEXT: s_lshl_b32 s7, s9, 24 -; GFX10-NEXT: s_and_b32 s4, s3, s4 +; GFX10-NEXT: s_or_b32 s2, s10, s2 +; 
GFX10-NEXT: s_lshl_b32 s4, s6, 24 +; GFX10-NEXT: s_and_b32 s6, s3, 0xff ; GFX10-NEXT: s_lshl_b32 s5, s5, 8 -; GFX10-NEXT: s_bfe_u32 s1, s3, s6 -; GFX10-NEXT: s_or_b32 s2, s2, s7 -; GFX10-NEXT: s_lshr_b32 s10, s3, 24 -; GFX10-NEXT: s_or_b32 s3, s4, s5 +; GFX10-NEXT: s_bfe_u32 s1, s3, 0x80010 +; GFX10-NEXT: s_or_b32 s2, s2, s4 +; GFX10-NEXT: s_lshr_b32 s7, s3, 24 +; GFX10-NEXT: s_or_b32 s3, s6, s5 ; GFX10-NEXT: s_lshl_b32 s1, s1, 16 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s2, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v1 ; GFX10-NEXT: s_or_b32 s0, s3, s1 -; GFX10-NEXT: s_lshl_b32 s1, s10, 24 +; GFX10-NEXT: s_lshl_b32 s1, s7, 24 ; GFX10-NEXT: s_or_b32 s3, s0, s1 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s3, vcc_lo ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v0, v1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll @@ -901,19 +901,17 @@ ; GFX10-FLUSH: ; %bb.0: ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-FLUSH-NEXT: s_mov_b32 s4, 0x6f800000 -; GFX10-FLUSH-NEXT: s_mov_b32 s5, 0x2f800000 -; GFX10-FLUSH-NEXT: v_cmp_gt_f32_e64 s6, |v2|, s4 -; GFX10-FLUSH-NEXT: v_cmp_gt_f32_e64 s4, |v3|, s4 -; GFX10-FLUSH-NEXT: v_cndmask_b32_e64 v4, 1.0, s5, s6 -; GFX10-FLUSH-NEXT: v_cndmask_b32_e64 v5, 1.0, s5, s4 +; GFX10-FLUSH-NEXT: v_cmp_lt_f32_e64 s4, 0x6f800000, |v2| +; GFX10-FLUSH-NEXT: v_cndmask_b32_e64 v4, 1.0, 0x2f800000, s4 +; GFX10-FLUSH-NEXT: v_cmp_lt_f32_e64 s4, 0x6f800000, |v3| ; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4 -; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5 +; GFX10-FLUSH-NEXT: v_cndmask_b32_e64 v5, 1.0, 0x2f800000, s4 ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v2, v2 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5 ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v3, v3 ; GFX10-FLUSH-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX10-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v3 ; 
GFX10-FLUSH-NEXT: v_mul_f32_e32 v0, v4, v0 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v3 ; GFX10-FLUSH-NEXT: v_mul_f32_e32 v1, v5, v1 ; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv <2 x float> %a, %b, !fpmath !0 @@ -1334,19 +1332,17 @@ ; GFX10-IEEE: ; %bb.0: ; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-IEEE-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-IEEE-NEXT: s_mov_b32 s4, 0x6f800000 -; GFX10-IEEE-NEXT: s_mov_b32 s5, 0x2f800000 -; GFX10-IEEE-NEXT: v_cmp_gt_f32_e64 s6, |v0|, s4 -; GFX10-IEEE-NEXT: v_cmp_gt_f32_e64 s4, |v1|, s4 -; GFX10-IEEE-NEXT: v_cndmask_b32_e64 v2, 1.0, s5, s6 -; GFX10-IEEE-NEXT: v_cndmask_b32_e64 v3, 1.0, s5, s4 +; GFX10-IEEE-NEXT: v_cmp_lt_f32_e64 s4, 0x6f800000, |v0| +; GFX10-IEEE-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x2f800000, s4 +; GFX10-IEEE-NEXT: v_cmp_lt_f32_e64 s4, 0x6f800000, |v1| ; GFX10-IEEE-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX10-IEEE-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x2f800000, s4 ; GFX10-IEEE-NEXT: v_rcp_f32_e32 v0, v0 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v1, v1, v3 ; GFX10-IEEE-NEXT: v_rcp_f32_e32 v1, v1 ; GFX10-IEEE-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX10-IEEE-NEXT: v_mul_f32_e32 v0, v2, v0 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX10-IEEE-NEXT: v_mul_f32_e32 v1, v3, v1 ; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31] ; @@ -1492,19 +1488,17 @@ ; GFX10-FLUSH: ; %bb.0: ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-FLUSH-NEXT: s_mov_b32 s4, 0x6f800000 -; GFX10-FLUSH-NEXT: s_mov_b32 s5, 0x2f800000 -; GFX10-FLUSH-NEXT: v_cmp_gt_f32_e64 s6, |v2|, s4 -; GFX10-FLUSH-NEXT: v_cmp_gt_f32_e64 s4, |v3|, s4 -; GFX10-FLUSH-NEXT: v_cndmask_b32_e64 v4, 1.0, s5, s6 -; GFX10-FLUSH-NEXT: v_cndmask_b32_e64 v5, 1.0, s5, s4 +; GFX10-FLUSH-NEXT: v_cmp_lt_f32_e64 s4, 0x6f800000, |v2| +; GFX10-FLUSH-NEXT: v_cndmask_b32_e64 v4, 1.0, 0x2f800000, s4 +; GFX10-FLUSH-NEXT: 
v_cmp_lt_f32_e64 s4, 0x6f800000, |v3| ; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4 -; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5 +; GFX10-FLUSH-NEXT: v_cndmask_b32_e64 v5, 1.0, 0x2f800000, s4 ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v2, v2 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5 ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v3, v3 ; GFX10-FLUSH-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX10-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v3 ; GFX10-FLUSH-NEXT: v_mul_f32_e32 v0, v4, v0 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v3 ; GFX10-FLUSH-NEXT: v_mul_f32_e32 v1, v5, v1 ; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv arcp <2 x float> %a, %b, !fpmath !0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll @@ -79,12 +79,11 @@ ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, 0, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: v_mov_b32_e32 v2, 4 -; GFX10-NEXT: v_mov_b32_e32 v3, 15 +; GFX10-NEXT: v_mov_b32_e32 v2, 15 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1 -; GFX10-NEXT: v_add_nc_u32_e32 v0, v2, v0 -; GFX10-NEXT: v_add_nc_u32_e32 v1, v2, v1 -; GFX10-NEXT: scratch_store_dword v0, v3, off +; GFX10-NEXT: v_add_nc_u32_e32 v0, 4, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v1, 4, v1 +; GFX10-NEXT: scratch_store_dword v0, v2, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, v1, off offset:124 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -127,12 +126,11 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_and_b32_e32 v1, 15, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: v_mov_b32_e32 v2, s32 -; GFX10-NEXT: v_mov_b32_e32 v3, 15 +; GFX10-NEXT: v_mov_b32_e32 v2, 15 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1 -; GFX10-NEXT: v_add_nc_u32_e32 v0, v2, v0 -; GFX10-NEXT: v_add_nc_u32_e32 v1, v2, v1 -; GFX10-NEXT: scratch_store_dword 
v0, v3, off +; GFX10-NEXT: v_add_nc_u32_e32 v0, s32, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v1, s32, v1 +; GFX10-NEXT: scratch_store_dword v0, v2, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, v1, off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -259,14 +257,13 @@ ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, 0, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: v_mov_b32_e32 v2, 0x104 -; GFX10-NEXT: v_mov_b32_e32 v3, 15 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1 -; GFX10-NEXT: v_add_nc_u32_e32 v0, v2, v0 -; GFX10-NEXT: v_add_nc_u32_e32 v1, v2, v1 -; GFX10-NEXT: scratch_load_dword v2, off, off offset:4 glc dlc +; GFX10-NEXT: v_mov_b32_e32 v2, 15 +; GFX10-NEXT: scratch_load_dword v3, off, off offset:4 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: scratch_store_dword v0, v3, off +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1 +; GFX10-NEXT: v_add_nc_u32_e32 v0, 0x104, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x104, v1 +; GFX10-NEXT: scratch_store_dword v0, v2, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, v1, off offset:124 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -314,16 +311,16 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_and_b32_e32 v1, 15, v0 -; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x100 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: v_mov_b32_e32 v2, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v3, 15 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1 -; GFX10-NEXT: v_add_nc_u32_e32 v0, v2, v0 -; GFX10-NEXT: v_add_nc_u32_e32 v1, v2, v1 -; GFX10-NEXT: scratch_load_dword v2, off, s32 glc dlc +; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x100 +; GFX10-NEXT: v_mov_b32_e32 v2, 15 +; GFX10-NEXT: scratch_load_dword v3, off, s32 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: scratch_store_dword v0, v3, off +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1 +; GFX10-NEXT: v_add_nc_u32_e32 v0, 
vcc_lo, v0 +; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x100 +; GFX10-NEXT: v_add_nc_u32_e32 v1, vcc_lo, v1 +; GFX10-NEXT: scratch_store_dword v0, v2, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, v1, off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -431,14 +428,13 @@ ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, 0, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: v_mov_b32_e32 v2, 0x4004 -; GFX10-NEXT: v_mov_b32_e32 v3, 15 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1 -; GFX10-NEXT: v_add_nc_u32_e32 v0, v2, v0 -; GFX10-NEXT: v_add_nc_u32_e32 v1, v2, v1 -; GFX10-NEXT: scratch_load_dword v2, off, off offset:4 glc dlc +; GFX10-NEXT: v_mov_b32_e32 v2, 15 +; GFX10-NEXT: scratch_load_dword v3, off, off offset:4 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: scratch_store_dword v0, v3, off +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1 +; GFX10-NEXT: v_add_nc_u32_e32 v0, 0x4004, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x4004, v1 +; GFX10-NEXT: scratch_store_dword v0, v2, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, v1, off offset:124 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -486,16 +482,16 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_and_b32_e32 v1, 15, v0 -; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x4004 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: v_mov_b32_e32 v2, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v3, 15 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1 -; GFX10-NEXT: v_add_nc_u32_e32 v0, v2, v0 -; GFX10-NEXT: v_add_nc_u32_e32 v1, v2, v1 -; GFX10-NEXT: scratch_load_dword v2, off, s32 offset:4 glc dlc +; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x4004 +; GFX10-NEXT: v_mov_b32_e32 v2, 15 +; GFX10-NEXT: scratch_load_dword v3, off, s32 offset:4 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: scratch_store_dword v0, v3, off +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1 +; 
GFX10-NEXT: v_add_nc_u32_e32 v0, vcc_lo, v0 +; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x4004 +; GFX10-NEXT: v_add_nc_u32_e32 v1, vcc_lo, v1 +; GFX10-NEXT: scratch_store_dword v0, v2, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, v1, off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll @@ -325,10 +325,8 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_load_dword v3, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 s2, 0x80000000 -; GFX10-NEXT: v_sub_f32_e32 v1, s2, v1 -; GFX10-NEXT: v_sub_f32_e64 v3, s2, |v3| +; GFX10-NEXT: v_sub_f32_e32 v1, 0x80000000, v1 +; GFX10-NEXT: v_sub_f32_e64 v3, 0x80000000, |v3| ; GFX10-NEXT: v_med3_f32 v1, v1, |v2|, v3 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm @@ -447,11 +445,9 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_load_dword v3, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 s2, 0x80000000 -; GFX10-NEXT: v_sub_f32_e64 v1, s2, |v1| -; GFX10-NEXT: v_sub_f32_e64 v2, s2, |v2| -; GFX10-NEXT: v_sub_f32_e64 v3, s2, |v3| +; GFX10-NEXT: v_sub_f32_e64 v1, 0x80000000, |v1| +; GFX10-NEXT: v_sub_f32_e64 v2, 0x80000000, |v2| +; GFX10-NEXT: v_sub_f32_e64 v3, 0x80000000, |v3| ; GFX10-NEXT: v_med3_f32 v1, v1, v2, v3 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll @@ -473,9 +473,8 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_mov_b32 
s4, 0x80008000 -; GFX10-NEXT: v_xor_b32_e32 v0, s4, v0 -; GFX10-NEXT: v_xor_b32_e32 v1, s4, v1 +; GFX10-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 +; GFX10-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 ; GFX10-NEXT: v_log_f16_e32 v2, v0 ; GFX10-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll @@ -9,8 +9,8 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v0, 7 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX6-NEXT: s_and_b32 s2, s2, 0x7f ; GFX6-NEXT: s_movk_i32 s3, 0x7f -; GFX6-NEXT: s_and_b32 s2, s2, s3 ; GFX6-NEXT: s_bfe_u32 s1, s1, 0x60001 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -39,9 +39,9 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, 7 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX8-NEXT: s_and_b32 s2, s2, 0x7f +; GFX8-NEXT: s_and_b32 s1, s1, 0x7f ; GFX8-NEXT: s_movk_i32 s3, 0x7f -; GFX8-NEXT: s_and_b32 s2, s2, s3 -; GFX8-NEXT: s_and_b32 s1, s1, s3 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 @@ -71,9 +71,9 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, 7 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: s_and_b32 s2, s2, 0x7f +; GFX9-NEXT: s_and_b32 s1, s1, 0x7f ; GFX9-NEXT: s_movk_i32 s3, 0x7f -; GFX9-NEXT: s_and_b32 s2, s2, s3 -; GFX9-NEXT: s_and_b32 s1, s1, s3 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 @@ -102,11 +102,10 @@ ; GFX10-LABEL: s_fshl_i7: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, 7 -; GFX10-NEXT: s_movk_i32 s3, 0x7f -; GFX10-NEXT: s_and_b32 s2, s2, s3 -; GFX10-NEXT: s_and_b32 s1, s1, s3 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX10-NEXT: 
s_and_b32 s2, s2, 0x7f +; GFX10-NEXT: s_and_b32 s1, s1, 0x7f ; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: s_lshr_b32 s1, s1, 1 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -123,8 +122,8 @@ ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX10-NEXT: v_sub_nc_u16 v1, 6, v0 -; GFX10-NEXT: v_and_b32_e32 v0, s3, v0 -; GFX10-NEXT: v_and_b32_e32 v1, s3, v1 +; GFX10-NEXT: v_and_b32_e32 v0, 0x7f, v0 +; GFX10-NEXT: v_and_b32_e32 v1, 0x7f, v1 ; GFX10-NEXT: v_lshlrev_b16 v0, v0, s0 ; GFX10-NEXT: v_lshrrev_b16 v1, v1, s1 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 @@ -250,10 +249,9 @@ ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 7, v2 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v2 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v3, 0x7f -; GFX10-NEXT: v_sub_nc_u16 v4, 6, v2 -; GFX10-NEXT: v_and_b32_e32 v2, v2, v3 -; GFX10-NEXT: v_and_b32_e32 v3, v4, v3 +; GFX10-NEXT: v_sub_nc_u16 v3, 6, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0x7f, v2 +; GFX10-NEXT: v_and_b32_e32 v3, 0x7f, v3 ; GFX10-NEXT: v_lshlrev_b16 v0, v2, v0 ; GFX10-NEXT: v_lshrrev_b16 v1, v3, v1 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 @@ -541,68 +539,65 @@ ; GFX6-NEXT: s_lshr_b32 s1, s1, 1 ; GFX6-NEXT: s_lshl_b32 s2, s3, s2 ; GFX6-NEXT: s_lshr_b32 s1, s1, s4 -; GFX6-NEXT: s_movk_i32 s6, 0xff ; GFX6-NEXT: s_or_b32 s1, s2, s1 -; GFX6-NEXT: s_and_b32 s1, s1, s6 -; GFX6-NEXT: s_and_b32 s0, s0, s6 +; GFX6-NEXT: s_and_b32 s1, s1, 0xff +; GFX6-NEXT: s_and_b32 s0, s0, 0xff ; GFX6-NEXT: s_lshl_b32 s1, s1, 8 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_fshl_v2i8: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_and_b32 s6, s2, 7 -; GFX8-NEXT: s_lshr_b32 s3, s0, 8 -; GFX8-NEXT: s_lshl_b32 s0, s0, s6 -; GFX8-NEXT: s_movk_i32 s6, 0xff ; GFX8-NEXT: s_lshr_b32 s4, s1, 8 -; GFX8-NEXT: s_and_b32 s1, s1, s6 +; GFX8-NEXT: s_and_b32 s1, s1, 0xff ; 
GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX8-NEXT: s_lshr_b32 s5, s2, 8 +; GFX8-NEXT: s_and_b32 s6, s2, 7 ; GFX8-NEXT: s_andn2_b32 s2, 7, s2 ; GFX8-NEXT: s_lshr_b32 s1, s1, 1 +; GFX8-NEXT: s_lshr_b32 s3, s0, 8 +; GFX8-NEXT: s_lshl_b32 s0, s0, s6 ; GFX8-NEXT: s_lshr_b32 s1, s1, s2 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: s_and_b32 s1, s5, 7 ; GFX8-NEXT: s_lshl_b32 s1, s3, s1 -; GFX8-NEXT: s_and_b32 s3, s4, s6 +; GFX8-NEXT: s_and_b32 s3, s4, 0xff ; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 ; GFX8-NEXT: s_andn2_b32 s2, 7, s5 ; GFX8-NEXT: s_lshr_b32 s3, s3, 1 ; GFX8-NEXT: s_lshr_b32 s2, s3, s2 ; GFX8-NEXT: s_or_b32 s1, s1, s2 -; GFX8-NEXT: s_and_b32 s1, s1, s6 +; GFX8-NEXT: s_and_b32 s1, s1, 0xff ; GFX8-NEXT: s_bfe_u32 s2, 8, 0x100000 -; GFX8-NEXT: s_and_b32 s0, s0, s6 +; GFX8-NEXT: s_and_b32 s0, s0, 0xff ; GFX8-NEXT: s_lshl_b32 s1, s1, s2 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_fshl_v2i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_and_b32 s6, s2, 7 -; GFX9-NEXT: s_lshr_b32 s3, s0, 8 -; GFX9-NEXT: s_lshl_b32 s0, s0, s6 -; GFX9-NEXT: s_movk_i32 s6, 0xff ; GFX9-NEXT: s_lshr_b32 s4, s1, 8 -; GFX9-NEXT: s_and_b32 s1, s1, s6 +; GFX9-NEXT: s_and_b32 s1, s1, 0xff ; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX9-NEXT: s_lshr_b32 s5, s2, 8 +; GFX9-NEXT: s_and_b32 s6, s2, 7 ; GFX9-NEXT: s_andn2_b32 s2, 7, s2 ; GFX9-NEXT: s_lshr_b32 s1, s1, 1 +; GFX9-NEXT: s_lshr_b32 s3, s0, 8 +; GFX9-NEXT: s_lshl_b32 s0, s0, s6 ; GFX9-NEXT: s_lshr_b32 s1, s1, s2 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: s_and_b32 s1, s5, 7 ; GFX9-NEXT: s_lshl_b32 s1, s3, s1 -; GFX9-NEXT: s_and_b32 s3, s4, s6 +; GFX9-NEXT: s_and_b32 s3, s4, 0xff ; GFX9-NEXT: s_bfe_u32 s3, s3, 0x100000 ; GFX9-NEXT: s_andn2_b32 s2, 7, s5 ; GFX9-NEXT: s_lshr_b32 s3, s3, 1 ; GFX9-NEXT: s_lshr_b32 s2, s3, s2 ; GFX9-NEXT: s_or_b32 s1, s1, s2 -; GFX9-NEXT: s_and_b32 s1, s1, s6 +; GFX9-NEXT: s_and_b32 s1, s1, 0xff ; GFX9-NEXT: s_bfe_u32 s2, 8, 0x100000 -; GFX9-NEXT: s_and_b32 s0, s0, s6 +; 
GFX9-NEXT: s_and_b32 s0, s0, 0xff ; GFX9-NEXT: s_lshl_b32 s1, s1, s2 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog @@ -610,28 +605,27 @@ ; GFX10-LABEL: s_fshl_v2i8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_lshr_b32 s4, s1, 8 -; GFX10-NEXT: s_movk_i32 s6, 0xff ; GFX10-NEXT: s_lshr_b32 s5, s2, 8 -; GFX10-NEXT: s_and_b32 s4, s4, s6 -; GFX10-NEXT: s_and_b32 s7, s2, 7 -; GFX10-NEXT: s_and_b32 s1, s1, s6 +; GFX10-NEXT: s_and_b32 s4, s4, 0xff +; GFX10-NEXT: s_and_b32 s6, s2, 7 +; GFX10-NEXT: s_and_b32 s1, s1, 0xff ; GFX10-NEXT: s_bfe_u32 s4, s4, 0x100000 ; GFX10-NEXT: s_lshr_b32 s3, s0, 8 ; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX10-NEXT: s_lshl_b32 s0, s0, s7 -; GFX10-NEXT: s_and_b32 s7, s5, 7 +; GFX10-NEXT: s_lshl_b32 s0, s0, s6 +; GFX10-NEXT: s_and_b32 s6, s5, 7 ; GFX10-NEXT: s_andn2_b32 s5, 7, s5 ; GFX10-NEXT: s_lshr_b32 s4, s4, 1 ; GFX10-NEXT: s_andn2_b32 s2, 7, s2 ; GFX10-NEXT: s_lshr_b32 s1, s1, 1 -; GFX10-NEXT: s_lshl_b32 s3, s3, s7 +; GFX10-NEXT: s_lshl_b32 s3, s3, s6 ; GFX10-NEXT: s_lshr_b32 s4, s4, s5 ; GFX10-NEXT: s_lshr_b32 s1, s1, s2 ; GFX10-NEXT: s_or_b32 s2, s3, s4 ; GFX10-NEXT: s_or_b32 s0, s0, s1 -; GFX10-NEXT: s_and_b32 s1, s2, s6 +; GFX10-NEXT: s_and_b32 s1, s2, 0xff ; GFX10-NEXT: s_bfe_u32 s2, 8, 0x100000 -; GFX10-NEXT: s_and_b32 s0, s0, s6 +; GFX10-NEXT: s_and_b32 s0, s0, 0xff ; GFX10-NEXT: s_lshl_b32 s1, s1, s2 ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: ; return to shader part epilog @@ -729,20 +723,20 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v1 -; GFX10-NEXT: s_movk_i32 s4, 0xff ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v0 ; GFX10-NEXT: v_xor_b32_e32 v7, -1, v2 +; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX10-NEXT: v_xor_b32_e32 v6, -1, v3 -; GFX10-NEXT: v_and_b32_e32 v4, s4, v4 -; GFX10-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX10-NEXT: v_and_b32_e32 v4, 0xff, v4 ; GFX10-NEXT: v_and_b32_e32 v3, 7, v3 ; GFX10-NEXT: v_and_b32_e32 v2, 7, v2 
+; GFX10-NEXT: v_and_b32_e32 v7, 7, v7 ; GFX10-NEXT: v_and_b32_e32 v6, 7, v6 ; GFX10-NEXT: v_lshrrev_b16 v4, 1, v4 -; GFX10-NEXT: v_and_b32_e32 v7, 7, v7 ; GFX10-NEXT: v_lshrrev_b16 v1, 1, v1 ; GFX10-NEXT: v_lshlrev_b16 v3, v3, v5 ; GFX10-NEXT: v_lshlrev_b16 v0, v2, v0 +; GFX10-NEXT: s_movk_i32 s4, 0xff ; GFX10-NEXT: v_lshrrev_b16 v4, v6, v4 ; GFX10-NEXT: v_lshrrev_b16 v1, v7, v1 ; GFX10-NEXT: v_or_b32_e32 v2, v3, v4 @@ -785,22 +779,21 @@ ; GFX6-NEXT: s_bfe_u32 s4, s1, 0x80010 ; GFX6-NEXT: s_andn2_b32 s6, 7, s7 ; GFX6-NEXT: s_lshr_b32 s4, s4, 1 -; GFX6-NEXT: s_movk_i32 s10, 0xff ; GFX6-NEXT: s_lshr_b32 s4, s4, s6 ; GFX6-NEXT: s_or_b32 s3, s3, s4 ; GFX6-NEXT: s_and_b32 s4, s8, 7 ; GFX6-NEXT: s_andn2_b32 s6, 7, s8 ; GFX6-NEXT: s_lshr_b32 s1, s1, 25 -; GFX6-NEXT: s_and_b32 s2, s2, s10 +; GFX6-NEXT: s_and_b32 s2, s2, 0xff ; GFX6-NEXT: s_lshl_b32 s4, s5, s4 ; GFX6-NEXT: s_lshr_b32 s1, s1, s6 -; GFX6-NEXT: s_and_b32 s0, s0, s10 +; GFX6-NEXT: s_and_b32 s0, s0, 0xff ; GFX6-NEXT: s_lshl_b32 s2, s2, 8 ; GFX6-NEXT: s_or_b32 s1, s4, s1 ; GFX6-NEXT: s_or_b32 s0, s0, s2 -; GFX6-NEXT: s_and_b32 s2, s3, s10 +; GFX6-NEXT: s_and_b32 s2, s3, 0xff ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_and_b32 s1, s1, s10 +; GFX6-NEXT: s_and_b32 s1, s1, 0xff ; GFX6-NEXT: s_or_b32 s0, s0, s2 ; GFX6-NEXT: s_lshl_b32 s1, s1, 24 ; GFX6-NEXT: s_or_b32 s0, s0, s1 @@ -808,11 +801,10 @@ ; ; GFX8-LABEL: s_fshl_v4i8: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_movk_i32 s13, 0xff ; GFX8-NEXT: s_lshr_b32 s6, s1, 8 ; GFX8-NEXT: s_lshr_b32 s7, s1, 16 ; GFX8-NEXT: s_lshr_b32 s8, s1, 24 -; GFX8-NEXT: s_and_b32 s1, s1, s13 +; GFX8-NEXT: s_and_b32 s1, s1, 0xff ; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX8-NEXT: s_lshr_b32 s9, s2, 8 ; GFX8-NEXT: s_lshr_b32 s10, s2, 16 @@ -828,7 +820,7 @@ ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: s_and_b32 s1, s9, 7 ; GFX8-NEXT: s_lshl_b32 s1, s3, s1 -; GFX8-NEXT: s_and_b32 s3, s6, s13 +; GFX8-NEXT: s_and_b32 s3, s6, 0xff ; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 ; GFX8-NEXT: 
s_andn2_b32 s2, 7, s9 ; GFX8-NEXT: s_lshr_b32 s3, s3, 1 @@ -836,37 +828,36 @@ ; GFX8-NEXT: s_or_b32 s1, s1, s2 ; GFX8-NEXT: s_and_b32 s2, s10, 7 ; GFX8-NEXT: s_lshl_b32 s2, s4, s2 -; GFX8-NEXT: s_and_b32 s4, s7, s13 +; GFX8-NEXT: s_and_b32 s4, s7, 0xff ; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 ; GFX8-NEXT: s_andn2_b32 s3, 7, s10 ; GFX8-NEXT: s_lshr_b32 s4, s4, 1 ; GFX8-NEXT: s_lshr_b32 s3, s4, s3 ; GFX8-NEXT: s_or_b32 s2, s2, s3 ; GFX8-NEXT: s_and_b32 s3, s11, 7 -; GFX8-NEXT: s_and_b32 s1, s1, s13 +; GFX8-NEXT: s_and_b32 s1, s1, 0xff ; GFX8-NEXT: s_andn2_b32 s4, 7, s11 ; GFX8-NEXT: s_lshl_b32 s3, s5, s3 ; GFX8-NEXT: s_lshr_b32 s5, s8, 1 -; GFX8-NEXT: s_and_b32 s0, s0, s13 +; GFX8-NEXT: s_and_b32 s0, s0, 0xff ; GFX8-NEXT: s_lshl_b32 s1, s1, 8 ; GFX8-NEXT: s_lshr_b32 s4, s5, s4 ; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s1, s2, s13 +; GFX8-NEXT: s_and_b32 s1, s2, 0xff ; GFX8-NEXT: s_or_b32 s3, s3, s4 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s1, s3, s13 +; GFX8-NEXT: s_and_b32 s1, s3, 0xff ; GFX8-NEXT: s_lshl_b32 s1, s1, 24 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_fshl_v4i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_movk_i32 s13, 0xff ; GFX9-NEXT: s_lshr_b32 s6, s1, 8 ; GFX9-NEXT: s_lshr_b32 s7, s1, 16 ; GFX9-NEXT: s_lshr_b32 s8, s1, 24 -; GFX9-NEXT: s_and_b32 s1, s1, s13 +; GFX9-NEXT: s_and_b32 s1, s1, 0xff ; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX9-NEXT: s_lshr_b32 s9, s2, 8 ; GFX9-NEXT: s_lshr_b32 s10, s2, 16 @@ -882,7 +873,7 @@ ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: s_and_b32 s1, s9, 7 ; GFX9-NEXT: s_lshl_b32 s1, s3, s1 -; GFX9-NEXT: s_and_b32 s3, s6, s13 +; GFX9-NEXT: s_and_b32 s3, s6, 0xff ; GFX9-NEXT: s_bfe_u32 s3, s3, 0x100000 ; GFX9-NEXT: s_andn2_b32 s2, 7, s9 ; GFX9-NEXT: s_lshr_b32 s3, s3, 1 @@ -890,79 +881,78 @@ ; GFX9-NEXT: s_or_b32 s1, s1, s2 ; GFX9-NEXT: s_and_b32 s2, s10, 7 ; GFX9-NEXT: s_lshl_b32 s2, s4, s2 -; GFX9-NEXT: s_and_b32 s4, 
s7, s13 +; GFX9-NEXT: s_and_b32 s4, s7, 0xff ; GFX9-NEXT: s_bfe_u32 s4, s4, 0x100000 ; GFX9-NEXT: s_andn2_b32 s3, 7, s10 ; GFX9-NEXT: s_lshr_b32 s4, s4, 1 ; GFX9-NEXT: s_lshr_b32 s3, s4, s3 ; GFX9-NEXT: s_or_b32 s2, s2, s3 ; GFX9-NEXT: s_and_b32 s3, s11, 7 -; GFX9-NEXT: s_and_b32 s1, s1, s13 +; GFX9-NEXT: s_and_b32 s1, s1, 0xff ; GFX9-NEXT: s_andn2_b32 s4, 7, s11 ; GFX9-NEXT: s_lshl_b32 s3, s5, s3 ; GFX9-NEXT: s_lshr_b32 s5, s8, 1 -; GFX9-NEXT: s_and_b32 s0, s0, s13 +; GFX9-NEXT: s_and_b32 s0, s0, 0xff ; GFX9-NEXT: s_lshl_b32 s1, s1, 8 ; GFX9-NEXT: s_lshr_b32 s4, s5, s4 ; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: s_and_b32 s1, s2, s13 +; GFX9-NEXT: s_and_b32 s1, s2, 0xff ; GFX9-NEXT: s_or_b32 s3, s3, s4 ; GFX9-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: s_and_b32 s1, s3, s13 +; GFX9-NEXT: s_and_b32 s1, s3, 0xff ; GFX9-NEXT: s_lshl_b32 s1, s1, 24 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_fshl_v4i8: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_movk_i32 s11, 0xff ; GFX10-NEXT: s_lshr_b32 s6, s1, 8 ; GFX10-NEXT: s_lshr_b32 s7, s1, 16 ; GFX10-NEXT: s_lshr_b32 s8, s1, 24 -; GFX10-NEXT: s_and_b32 s1, s1, s11 +; GFX10-NEXT: s_and_b32 s1, s1, 0xff ; GFX10-NEXT: s_lshr_b32 s9, s2, 8 ; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX10-NEXT: s_lshr_b32 s10, s2, 16 -; GFX10-NEXT: s_lshr_b32 s12, s2, 24 -; GFX10-NEXT: s_and_b32 s13, s2, 7 +; GFX10-NEXT: s_lshr_b32 s11, s2, 24 +; GFX10-NEXT: s_and_b32 s12, s2, 7 ; GFX10-NEXT: s_andn2_b32 s2, 7, s2 ; GFX10-NEXT: s_lshr_b32 s1, s1, 1 ; GFX10-NEXT: s_lshr_b32 s3, s0, 8 ; GFX10-NEXT: s_lshr_b32 s1, s1, s2 -; GFX10-NEXT: s_and_b32 s2, s6, s11 +; GFX10-NEXT: s_and_b32 s2, s6, 0xff ; GFX10-NEXT: s_and_b32 s6, s9, 7 ; GFX10-NEXT: s_bfe_u32 s2, s2, 0x100000 ; GFX10-NEXT: s_andn2_b32 s9, 7, s9 ; GFX10-NEXT: s_lshr_b32 s2, s2, 1 ; GFX10-NEXT: s_lshr_b32 s4, s0, 16 ; GFX10-NEXT: s_lshr_b32 s5, s0, 24 -; GFX10-NEXT: s_lshl_b32 s0, s0, s13 +; GFX10-NEXT: s_lshl_b32 s0, 
s0, s12 ; GFX10-NEXT: s_lshl_b32 s3, s3, s6 ; GFX10-NEXT: s_lshr_b32 s2, s2, s9 ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: s_or_b32 s1, s3, s2 -; GFX10-NEXT: s_and_b32 s2, s7, s11 +; GFX10-NEXT: s_and_b32 s2, s7, 0xff ; GFX10-NEXT: s_and_b32 s3, s10, 7 ; GFX10-NEXT: s_bfe_u32 s2, s2, 0x100000 ; GFX10-NEXT: s_andn2_b32 s6, 7, s10 ; GFX10-NEXT: s_lshr_b32 s2, s2, 1 ; GFX10-NEXT: s_lshl_b32 s3, s4, s3 ; GFX10-NEXT: s_lshr_b32 s2, s2, s6 -; GFX10-NEXT: s_and_b32 s4, s12, 7 -; GFX10-NEXT: s_andn2_b32 s6, 7, s12 +; GFX10-NEXT: s_and_b32 s4, s11, 7 +; GFX10-NEXT: s_andn2_b32 s6, 7, s11 ; GFX10-NEXT: s_lshr_b32 s7, s8, 1 ; GFX10-NEXT: s_lshl_b32 s4, s5, s4 ; GFX10-NEXT: s_lshr_b32 s5, s7, s6 ; GFX10-NEXT: s_or_b32 s2, s3, s2 -; GFX10-NEXT: s_and_b32 s1, s1, s11 +; GFX10-NEXT: s_and_b32 s1, s1, 0xff ; GFX10-NEXT: s_or_b32 s3, s4, s5 -; GFX10-NEXT: s_and_b32 s0, s0, s11 +; GFX10-NEXT: s_and_b32 s0, s0, 0xff ; GFX10-NEXT: s_lshl_b32 s1, s1, 8 -; GFX10-NEXT: s_and_b32 s2, s2, s11 +; GFX10-NEXT: s_and_b32 s2, s2, 0xff ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: s_lshl_b32 s1, s2, 16 -; GFX10-NEXT: s_and_b32 s2, s3, s11 +; GFX10-NEXT: s_and_b32 s2, s3, 0xff ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: s_lshl_b32 s1, s2, 24 ; GFX10-NEXT: s_or_b32 s0, s0, s1 @@ -1136,51 +1126,50 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_lshrrev_b32_e32 v8, 8, v2 -; GFX10-NEXT: v_and_b32_e32 v11, 7, v2 +; GFX10-NEXT: v_and_b32_e32 v10, 7, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 8, v1 -; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v2 -; GFX10-NEXT: v_xor_b32_e32 v10, -1, v2 +; GFX10-NEXT: v_xor_b32_e32 v9, -1, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v2 -; GFX10-NEXT: v_lshlrev_b16 v0, v11, v0 -; GFX10-NEXT: v_xor_b32_e32 v11, -1, v8 +; GFX10-NEXT: 
v_lshlrev_b16 v0, v10, v0 +; GFX10-NEXT: v_xor_b32_e32 v10, -1, v8 ; GFX10-NEXT: v_and_b32_e32 v8, 7, v8 ; GFX10-NEXT: v_mov_b32_e32 v13, 0xff -; GFX10-NEXT: s_movk_i32 s4, 0xff ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v1 -; GFX10-NEXT: v_and_b32_e32 v12, s4, v1 -; GFX10-NEXT: v_and_b32_e32 v6, s4, v6 +; GFX10-NEXT: v_and_b32_e32 v12, 0xff, v1 +; GFX10-NEXT: v_and_b32_e32 v6, 0xff, v6 ; GFX10-NEXT: v_lshlrev_b16 v3, v8, v3 -; GFX10-NEXT: v_xor_b32_e32 v8, -1, v9 +; GFX10-NEXT: v_xor_b32_e32 v8, -1, v11 ; GFX10-NEXT: v_and_b32_sdwa v1, v1, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_xor_b32_e32 v13, -1, v2 -; GFX10-NEXT: v_and_b32_e32 v11, 7, v11 +; GFX10-NEXT: v_and_b32_e32 v10, 7, v10 ; GFX10-NEXT: v_lshrrev_b16 v6, 1, v6 -; GFX10-NEXT: v_and_b32_e32 v9, 7, v9 +; GFX10-NEXT: v_and_b32_e32 v11, 7, v11 ; GFX10-NEXT: v_and_b32_e32 v8, 7, v8 ; GFX10-NEXT: v_lshrrev_b16 v1, 1, v1 ; GFX10-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX10-NEXT: v_and_b32_e32 v13, 7, v13 ; GFX10-NEXT: v_lshrrev_b16 v7, 1, v7 -; GFX10-NEXT: v_and_b32_e32 v10, 7, v10 +; GFX10-NEXT: v_and_b32_e32 v9, 7, v9 ; GFX10-NEXT: v_lshrrev_b16 v12, 1, v12 -; GFX10-NEXT: v_lshrrev_b16 v6, v11, v6 -; GFX10-NEXT: v_lshlrev_b16 v4, v9, v4 +; GFX10-NEXT: v_lshrrev_b16 v6, v10, v6 +; GFX10-NEXT: v_lshlrev_b16 v4, v11, v4 ; GFX10-NEXT: v_lshrrev_b16 v1, v8, v1 ; GFX10-NEXT: v_lshlrev_b16 v2, v2, v5 ; GFX10-NEXT: v_lshrrev_b16 v5, v13, v7 -; GFX10-NEXT: v_lshrrev_b16 v7, v10, v12 +; GFX10-NEXT: v_lshrrev_b16 v7, v9, v12 ; GFX10-NEXT: v_or_b32_e32 v3, v3, v6 ; GFX10-NEXT: v_mov_b32_e32 v6, 8 ; GFX10-NEXT: v_or_b32_e32 v1, v4, v1 ; GFX10-NEXT: v_or_b32_e32 v2, v2, v5 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v7 ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX10-NEXT: v_and_b32_e32 v2, s4, v2 -; GFX10-NEXT: v_and_or_b32 v0, v0, s4, v3 +; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1 +; 
GFX10-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX10-NEXT: v_or3_b32 v0, v0, v1, v2 @@ -1199,8 +1188,8 @@ ; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_mov_b32_e32 v1, 0xffffffe8 +; GFX6-NEXT: s_and_b32 s2, s2, 0xffffff ; GFX6-NEXT: s_mov_b32 s3, 0xffffff -; GFX6-NEXT: s_and_b32 s2, s2, s3 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: s_bfe_u32 s1, s1, 0x170001 @@ -1230,8 +1219,8 @@ ; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0xffffffe8 +; GFX8-NEXT: s_and_b32 s2, s2, 0xffffff ; GFX8-NEXT: s_mov_b32 s3, 0xffffff -; GFX8-NEXT: s_and_b32 s2, s2, s3 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: s_bfe_u32 s1, s1, 0x170001 @@ -1261,8 +1250,8 @@ ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffffffe8 +; GFX9-NEXT: s_and_b32 s2, s2, 0xffffff ; GFX9-NEXT: s_mov_b32 s3, 0xffffff -; GFX9-NEXT: s_and_b32 s2, s2, s3 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: s_bfe_u32 s1, s1, 0x170001 @@ -1289,9 +1278,8 @@ ; GFX10-LABEL: s_fshl_i24: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 -; GFX10-NEXT: s_mov_b32 s3, 0xffffff +; GFX10-NEXT: s_and_b32 s2, s2, 0xffffff ; GFX10-NEXT: s_bfe_u32 s1, s1, 0x170001 -; GFX10-NEXT: s_and_b32 s2, s2, s3 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -1308,8 +1296,8 @@ ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX10-NEXT: v_sub_nc_u32_e32 v1, 23, v0 -; GFX10-NEXT: v_and_b32_e32 v0, s3, v0 -; GFX10-NEXT: v_and_b32_e32 v1, s3, v1 +; 
GFX10-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffffff, v1 ; GFX10-NEXT: v_lshrrev_b32_e64 v1, v1, s1 ; GFX10-NEXT: v_lshl_or_b32 v0, s0, v0, v1 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 @@ -1433,11 +1421,10 @@ ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 24, v2 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v2 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v3, 0xffffff -; GFX10-NEXT: v_sub_nc_u32_e32 v4, 23, v2 -; GFX10-NEXT: v_and_b32_e32 v2, v2, v3 -; GFX10-NEXT: v_and_b32_e32 v4, v4, v3 -; GFX10-NEXT: v_lshrrev_b32_e32 v1, v4, v1 +; GFX10-NEXT: v_sub_nc_u32_e32 v3, 23, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffffff, v2 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffffff, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, v3, v1 ; GFX10-NEXT: v_lshl_or_b32 v0, v0, v2, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call i24 @llvm.fshl.i24(i24 %lhs, i24 %rhs, i24 %amt) @@ -1449,25 +1436,23 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX6-NEXT: s_movk_i32 s9, 0xff -; GFX6-NEXT: s_mov_b32 s11, 0x80008 ; GFX6-NEXT: s_lshr_b32 s6, s0, 16 ; GFX6-NEXT: s_lshr_b32 s7, s0, 24 -; GFX6-NEXT: s_and_b32 s10, s0, s9 -; GFX6-NEXT: s_bfe_u32 s0, s0, s11 +; GFX6-NEXT: s_and_b32 s10, s0, 0xff +; GFX6-NEXT: s_bfe_u32 s0, s0, 0x80008 ; GFX6-NEXT: s_lshl_b32 s0, s0, 8 -; GFX6-NEXT: s_and_b32 s6, s6, s9 +; GFX6-NEXT: s_and_b32 s6, s6, 0xff ; GFX6-NEXT: s_or_b32 s0, s10, s0 ; GFX6-NEXT: s_bfe_u32 s6, s6, 0x100000 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: s_lshr_b32 s8, s1, 8 ; GFX6-NEXT: s_bfe_u32 s0, s0, 0x100000 ; GFX6-NEXT: s_lshl_b32 s6, s6, 16 -; GFX6-NEXT: s_and_b32 s1, s1, s9 +; GFX6-NEXT: s_and_b32 s1, s1, 0xff ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: s_or_b32 s0, s0, s6 ; GFX6-NEXT: s_lshl_b32 s1, s1, 8 -; GFX6-NEXT: s_and_b32 s6, s8, s9 +; GFX6-NEXT: s_and_b32 s6, s8, 0xff ; GFX6-NEXT: s_or_b32 s1, s7, s1 ; GFX6-NEXT: s_bfe_u32 s6, s6, 0x100000 ; GFX6-NEXT: 
s_bfe_u32 s1, s1, 0x100000 @@ -1476,20 +1461,20 @@ ; GFX6-NEXT: s_or_b32 s1, s1, s6 ; GFX6-NEXT: s_lshr_b32 s6, s2, 16 ; GFX6-NEXT: s_lshr_b32 s7, s2, 24 -; GFX6-NEXT: s_and_b32 s10, s2, s9 -; GFX6-NEXT: s_bfe_u32 s2, s2, s11 +; GFX6-NEXT: s_and_b32 s10, s2, 0xff +; GFX6-NEXT: s_bfe_u32 s2, s2, 0x80008 ; GFX6-NEXT: v_mul_lo_u32 v2, v1, v0 ; GFX6-NEXT: s_lshl_b32 s2, s2, 8 -; GFX6-NEXT: s_and_b32 s6, s6, s9 +; GFX6-NEXT: s_and_b32 s6, s6, 0xff ; GFX6-NEXT: s_or_b32 s2, s10, s2 ; GFX6-NEXT: s_bfe_u32 s6, s6, 0x100000 ; GFX6-NEXT: s_lshr_b32 s8, s3, 8 ; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000 ; GFX6-NEXT: s_lshl_b32 s6, s6, 16 -; GFX6-NEXT: s_and_b32 s3, s3, s9 +; GFX6-NEXT: s_and_b32 s3, s3, 0xff ; GFX6-NEXT: s_or_b32 s2, s2, s6 ; GFX6-NEXT: s_lshl_b32 s3, s3, 8 -; GFX6-NEXT: s_and_b32 s6, s8, s9 +; GFX6-NEXT: s_and_b32 s6, s8, 0xff ; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX6-NEXT: s_or_b32 s3, s7, s3 ; GFX6-NEXT: s_bfe_u32 s6, s6, 0x100000 @@ -1498,10 +1483,10 @@ ; GFX6-NEXT: s_or_b32 s3, s3, s6 ; GFX6-NEXT: s_lshr_b32 s6, s4, 16 ; GFX6-NEXT: s_lshr_b32 s7, s4, 24 -; GFX6-NEXT: s_and_b32 s10, s4, s9 -; GFX6-NEXT: s_bfe_u32 s4, s4, s11 +; GFX6-NEXT: s_and_b32 s10, s4, 0xff +; GFX6-NEXT: s_bfe_u32 s4, s4, 0x80008 ; GFX6-NEXT: s_lshl_b32 s4, s4, 8 -; GFX6-NEXT: s_and_b32 s6, s6, s9 +; GFX6-NEXT: s_and_b32 s6, s6, 0xff ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, 24 ; GFX6-NEXT: s_or_b32 s4, s10, s4 @@ -1515,14 +1500,14 @@ ; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX6-NEXT: s_lshr_b32 s8, s5, 8 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, 24 -; GFX6-NEXT: s_and_b32 s5, s5, s9 +; GFX6-NEXT: s_and_b32 s5, s5, 0xff ; GFX6-NEXT: v_mul_lo_u32 v1, v1, v2 ; GFX6-NEXT: s_lshl_b32 s5, s5, 8 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 ; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 24, v0 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX6-NEXT: v_mul_hi_u32 v1, v2, v1 -; GFX6-NEXT: s_and_b32 s6, s8, s9 +; GFX6-NEXT: s_and_b32 s6, s8, 0xff ; GFX6-NEXT: v_cndmask_b32_e32 
v0, v0, v3, vcc ; GFX6-NEXT: s_or_b32 s5, s7, s5 ; GFX6-NEXT: s_bfe_u32 s6, s6, 0x100000 @@ -1555,6 +1540,7 @@ ; GFX6-NEXT: v_and_b32_e32 v1, v1, v4 ; GFX6-NEXT: s_lshr_b32 s0, s3, 1 ; GFX6-NEXT: v_and_b32_e32 v2, v2, v4 +; GFX6-NEXT: s_movk_i32 s9, 0xff ; GFX6-NEXT: v_lshl_b32_e32 v1, s1, v1 ; GFX6-NEXT: v_lshr_b32_e32 v2, s0, v2 ; GFX6-NEXT: v_bfe_u32 v3, v0, 8, 8 @@ -1579,25 +1565,24 @@ ; GFX8-LABEL: s_fshl_v2i24: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s6, s0, 8 -; GFX8-NEXT: s_movk_i32 s10, 0xff -; GFX8-NEXT: s_and_b32 s6, s6, s10 -; GFX8-NEXT: s_bfe_u32 s11, 8, 0x100000 +; GFX8-NEXT: s_and_b32 s6, s6, 0xff +; GFX8-NEXT: s_bfe_u32 s10, 8, 0x100000 ; GFX8-NEXT: s_lshr_b32 s7, s0, 16 ; GFX8-NEXT: s_lshr_b32 s8, s0, 24 -; GFX8-NEXT: s_and_b32 s0, s0, s10 -; GFX8-NEXT: s_lshl_b32 s6, s6, s11 +; GFX8-NEXT: s_and_b32 s0, s0, 0xff +; GFX8-NEXT: s_lshl_b32 s6, s6, s10 ; GFX8-NEXT: s_or_b32 s0, s0, s6 -; GFX8-NEXT: s_and_b32 s6, s7, s10 +; GFX8-NEXT: s_and_b32 s6, s7, 0xff ; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 ; GFX8-NEXT: s_bfe_u32 s6, s6, 0x100000 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: s_lshr_b32 s9, s1, 8 ; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 ; GFX8-NEXT: s_lshl_b32 s6, s6, 16 -; GFX8-NEXT: s_and_b32 s1, s1, s10 +; GFX8-NEXT: s_and_b32 s1, s1, 0xff ; GFX8-NEXT: s_or_b32 s0, s0, s6 -; GFX8-NEXT: s_lshl_b32 s1, s1, s11 -; GFX8-NEXT: s_and_b32 s6, s9, s10 +; GFX8-NEXT: s_lshl_b32 s1, s1, s10 +; GFX8-NEXT: s_and_b32 s6, s9, 0xff ; GFX8-NEXT: s_or_b32 s1, s8, s1 ; GFX8-NEXT: s_bfe_u32 s6, s6, 0x100000 ; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 @@ -1606,23 +1591,23 @@ ; GFX8-NEXT: s_or_b32 s1, s1, s6 ; GFX8-NEXT: s_lshr_b32 s6, s2, 8 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX8-NEXT: s_and_b32 s6, s6, s10 +; GFX8-NEXT: s_and_b32 s6, s6, 0xff ; GFX8-NEXT: s_lshr_b32 s7, s2, 16 ; GFX8-NEXT: s_lshr_b32 s8, s2, 24 -; GFX8-NEXT: s_and_b32 s2, s2, s10 -; GFX8-NEXT: s_lshl_b32 s6, s6, s11 +; GFX8-NEXT: s_and_b32 s2, s2, 0xff +; GFX8-NEXT: s_lshl_b32 s6, s6, s10 
; GFX8-NEXT: s_or_b32 s2, s2, s6 -; GFX8-NEXT: s_and_b32 s6, s7, s10 +; GFX8-NEXT: s_and_b32 s6, s7, 0xff ; GFX8-NEXT: v_mov_b32_e32 v1, 0xffffffe8 ; GFX8-NEXT: s_bfe_u32 s6, s6, 0x100000 ; GFX8-NEXT: v_mul_lo_u32 v2, v1, v0 ; GFX8-NEXT: s_lshr_b32 s9, s3, 8 ; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 ; GFX8-NEXT: s_lshl_b32 s6, s6, 16 -; GFX8-NEXT: s_and_b32 s3, s3, s10 +; GFX8-NEXT: s_and_b32 s3, s3, 0xff ; GFX8-NEXT: s_or_b32 s2, s2, s6 -; GFX8-NEXT: s_lshl_b32 s3, s3, s11 -; GFX8-NEXT: s_and_b32 s6, s9, s10 +; GFX8-NEXT: s_lshl_b32 s3, s3, s10 +; GFX8-NEXT: s_and_b32 s6, s9, 0xff ; GFX8-NEXT: s_or_b32 s3, s8, s3 ; GFX8-NEXT: s_bfe_u32 s6, s6, 0x100000 ; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 @@ -1630,13 +1615,13 @@ ; GFX8-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX8-NEXT: s_or_b32 s3, s3, s6 ; GFX8-NEXT: s_lshr_b32 s6, s4, 8 -; GFX8-NEXT: s_and_b32 s6, s6, s10 +; GFX8-NEXT: s_and_b32 s6, s6, 0xff ; GFX8-NEXT: s_lshr_b32 s7, s4, 16 ; GFX8-NEXT: s_lshr_b32 s8, s4, 24 -; GFX8-NEXT: s_and_b32 s4, s4, s10 -; GFX8-NEXT: s_lshl_b32 s6, s6, s11 +; GFX8-NEXT: s_and_b32 s4, s4, 0xff +; GFX8-NEXT: s_lshl_b32 s6, s6, s10 ; GFX8-NEXT: s_or_b32 s4, s4, s6 -; GFX8-NEXT: s_and_b32 s6, s7, s10 +; GFX8-NEXT: s_and_b32 s6, s7, 0xff ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v2, 24 ; GFX8-NEXT: s_bfe_u32 s6, s6, 0x100000 @@ -1649,14 +1634,14 @@ ; GFX8-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX8-NEXT: s_lshr_b32 s9, s5, 8 ; GFX8-NEXT: v_mul_lo_u32 v0, v0, 24 -; GFX8-NEXT: s_and_b32 s5, s5, s10 +; GFX8-NEXT: s_and_b32 s5, s5, 0xff ; GFX8-NEXT: v_mul_lo_u32 v1, v1, v2 -; GFX8-NEXT: s_lshl_b32 s5, s5, s11 +; GFX8-NEXT: s_lshl_b32 s5, s5, s10 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0 ; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, 24, v0 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX8-NEXT: v_mul_hi_u32 v1, v2, v1 -; GFX8-NEXT: s_and_b32 s6, s9, s10 +; GFX8-NEXT: s_and_b32 s6, s9, 0xff ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX8-NEXT: s_or_b32 s5, s8, s5 ; GFX8-NEXT: 
s_bfe_u32 s6, s6, 0x100000 @@ -1698,7 +1683,7 @@ ; GFX8-NEXT: v_or_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX8-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX8-NEXT: v_and_b32_e32 v3, s10, v1 +; GFX8-NEXT: v_and_b32_e32 v3, 0xff, v1 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v3 @@ -1712,27 +1697,26 @@ ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: s_lshr_b32 s7, s0, 8 -; GFX9-NEXT: s_movk_i32 s12, 0xff -; GFX9-NEXT: s_and_b32 s7, s7, s12 +; GFX9-NEXT: s_and_b32 s7, s7, 0xff +; GFX9-NEXT: s_bfe_u32 s12, 8, 0x100000 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 -; GFX9-NEXT: s_bfe_u32 s13, 8, 0x100000 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: s_lshr_b32 s9, s0, 16 ; GFX9-NEXT: s_lshr_b32 s10, s0, 24 -; GFX9-NEXT: s_and_b32 s0, s0, s12 -; GFX9-NEXT: s_lshl_b32 s7, s7, s13 +; GFX9-NEXT: s_and_b32 s0, s0, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s7, s12 ; GFX9-NEXT: s_or_b32 s0, s0, s7 -; GFX9-NEXT: s_and_b32 s7, s9, s12 +; GFX9-NEXT: s_and_b32 s7, s9, 0xff ; GFX9-NEXT: s_bfe_u32 s7, s7, 0x100000 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffffffe8 ; GFX9-NEXT: s_lshr_b32 s11, s1, 8 ; GFX9-NEXT: s_bfe_u32 s0, s0, 0x100000 ; GFX9-NEXT: s_lshl_b32 s7, s7, 16 -; GFX9-NEXT: s_and_b32 s1, s1, s12 +; GFX9-NEXT: s_and_b32 s1, s1, 0xff ; GFX9-NEXT: v_mul_lo_u32 v2, v1, v0 ; GFX9-NEXT: s_or_b32 s0, s0, s7 -; GFX9-NEXT: s_lshl_b32 s1, s1, s13 -; GFX9-NEXT: s_and_b32 s7, s11, s12 +; GFX9-NEXT: s_lshl_b32 s1, s1, s12 +; GFX9-NEXT: s_and_b32 s7, s11, 0xff ; GFX9-NEXT: s_or_b32 s1, s10, s1 ; GFX9-NEXT: s_bfe_u32 s7, s7, 0x100000 ; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 @@ -1740,24 +1724,24 @@ ; GFX9-NEXT: s_or_b32 s1, s1, s7 ; GFX9-NEXT: s_lshr_b32 s7, s2, 8 ; 
GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX9-NEXT: s_and_b32 s7, s7, s12 +; GFX9-NEXT: s_and_b32 s7, s7, 0xff ; GFX9-NEXT: s_lshr_b32 s9, s2, 16 ; GFX9-NEXT: s_lshr_b32 s10, s2, 24 -; GFX9-NEXT: s_and_b32 s2, s2, s12 -; GFX9-NEXT: s_lshl_b32 s7, s7, s13 +; GFX9-NEXT: s_and_b32 s2, s2, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s7, s12 ; GFX9-NEXT: s_or_b32 s2, s2, s7 -; GFX9-NEXT: s_and_b32 s7, s9, s12 +; GFX9-NEXT: s_and_b32 s7, s9, 0xff ; GFX9-NEXT: s_bfe_u32 s7, s7, 0x100000 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, 24 ; GFX9-NEXT: s_lshr_b32 s11, s3, 8 ; GFX9-NEXT: s_bfe_u32 s2, s2, 0x100000 ; GFX9-NEXT: s_lshl_b32 s7, s7, 16 -; GFX9-NEXT: s_and_b32 s3, s3, s12 +; GFX9-NEXT: s_and_b32 s3, s3, 0xff ; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX9-NEXT: s_or_b32 s2, s2, s7 -; GFX9-NEXT: s_lshl_b32 s3, s3, s13 -; GFX9-NEXT: s_and_b32 s7, s11, s12 +; GFX9-NEXT: s_lshl_b32 s3, s3, s12 +; GFX9-NEXT: s_and_b32 s7, s11, 0xff ; GFX9-NEXT: s_or_b32 s3, s10, s3 ; GFX9-NEXT: s_bfe_u32 s7, s7, 0x100000 ; GFX9-NEXT: s_bfe_u32 s3, s3, 0x100000 @@ -1765,14 +1749,14 @@ ; GFX9-NEXT: s_or_b32 s3, s3, s7 ; GFX9-NEXT: s_lshr_b32 s7, s4, 8 ; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 -; GFX9-NEXT: s_and_b32 s7, s7, s12 +; GFX9-NEXT: s_and_b32 s7, s7, 0xff ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX9-NEXT: s_lshr_b32 s9, s4, 16 ; GFX9-NEXT: s_lshr_b32 s10, s4, 24 -; GFX9-NEXT: s_and_b32 s4, s4, s12 -; GFX9-NEXT: s_lshl_b32 s7, s7, s13 +; GFX9-NEXT: s_and_b32 s4, s4, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s7, s12 ; GFX9-NEXT: s_or_b32 s4, s4, s7 -; GFX9-NEXT: s_and_b32 s7, s9, s12 +; GFX9-NEXT: s_and_b32 s7, s9, 0xff ; GFX9-NEXT: s_bfe_u32 s7, s7, 0x100000 ; GFX9-NEXT: s_bfe_u32 s4, s4, 0x100000 ; GFX9-NEXT: s_lshl_b32 s7, s7, 16 @@ -1780,10 +1764,10 @@ ; GFX9-NEXT: s_or_b32 s4, s4, s7 ; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX9-NEXT: s_lshr_b32 s11, s5, 8 -; GFX9-NEXT: s_and_b32 s5, s5, s12 +; GFX9-NEXT: s_and_b32 s5, s5, 0xff ; GFX9-NEXT: v_mul_hi_u32 v1, v2, v1 -; 
GFX9-NEXT: s_lshl_b32 s5, s5, s13 -; GFX9-NEXT: s_and_b32 s7, s11, s12 +; GFX9-NEXT: s_lshl_b32 s5, s5, s12 +; GFX9-NEXT: s_and_b32 s7, s11, 0xff ; GFX9-NEXT: s_or_b32 s5, s10, s5 ; GFX9-NEXT: s_bfe_u32 s7, s7, 0x100000 ; GFX9-NEXT: v_mul_lo_u32 v0, v0, 24 @@ -1822,10 +1806,11 @@ ; GFX9-NEXT: v_lshrrev_b32_e64 v2, v2, s0 ; GFX9-NEXT: s_mov_b32 s6, 8 ; GFX9-NEXT: v_lshl_or_b32 v1, s1, v1, v2 +; GFX9-NEXT: s_movk_i32 s0, 0xff ; GFX9-NEXT: s_mov_b32 s8, 16 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_and_b32_e32 v3, s12, v1 -; GFX9-NEXT: v_and_or_b32 v2, v0, s12, v2 +; GFX9-NEXT: v_and_b32_e32 v3, s0, v1 +; GFX9-NEXT: v_and_or_b32 v2, v0, s0, v2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; GFX9-NEXT: v_or3_b32 v0, v2, v0, v3 @@ -1840,120 +1825,117 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, 24 -; GFX10-NEXT: s_movk_i32 s9, 0xff -; GFX10-NEXT: s_lshr_b32 s10, s1, 8 -; GFX10-NEXT: s_bfe_u32 s11, 8, 0x100000 +; GFX10-NEXT: s_lshr_b32 s6, s0, 8 +; GFX10-NEXT: s_lshr_b32 s7, s0, 16 +; GFX10-NEXT: s_and_b32 s6, s6, 0xff ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX10-NEXT: s_and_b32 s1, s1, s9 -; GFX10-NEXT: s_lshr_b32 s6, s0, 8 +; GFX10-NEXT: s_bfe_u32 s10, 8, 0x100000 ; GFX10-NEXT: s_lshr_b32 s8, s0, 24 -; GFX10-NEXT: s_lshl_b32 s1, s1, s11 -; GFX10-NEXT: s_and_b32 s6, s6, s9 -; GFX10-NEXT: s_or_b32 s1, s8, s1 -; GFX10-NEXT: s_lshr_b32 s8, s4, 8 -; GFX10-NEXT: s_lshr_b32 s7, s0, 16 +; GFX10-NEXT: s_and_b32 s0, s0, 0xff +; GFX10-NEXT: s_lshl_b32 s6, s6, s10 +; GFX10-NEXT: s_and_b32 s7, s7, 0xff +; GFX10-NEXT: s_or_b32 s0, s0, s6 +; GFX10-NEXT: s_bfe_u32 s6, s7, 0x100000 +; GFX10-NEXT: s_lshr_b32 s7, s4, 8 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_mul_f32_e32 v1, 
0x4f7ffffe, v1 -; GFX10-NEXT: s_and_b32 s0, s0, s9 -; GFX10-NEXT: s_lshl_b32 s6, s6, s11 -; GFX10-NEXT: s_and_b32 s8, s8, s9 +; GFX10-NEXT: s_and_b32 s7, s7, 0xff +; GFX10-NEXT: s_lshr_b32 s11, s4, 16 +; GFX10-NEXT: s_lshr_b32 s12, s4, 24 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX10-NEXT: s_or_b32 s0, s0, s6 -; GFX10-NEXT: s_and_b32 s6, s7, s9 -; GFX10-NEXT: s_and_b32 s7, s10, s9 +; GFX10-NEXT: s_and_b32 s4, s4, 0xff +; GFX10-NEXT: s_lshl_b32 s7, s7, s10 +; GFX10-NEXT: s_lshr_b32 s13, s5, 8 ; GFX10-NEXT: v_mul_lo_u32 v2, 0xffffffe8, v0 ; GFX10-NEXT: v_mul_lo_u32 v3, 0xffffffe8, v1 -; GFX10-NEXT: s_lshr_b32 s10, s4, 16 -; GFX10-NEXT: s_lshr_b32 s12, s4, 24 -; GFX10-NEXT: s_and_b32 s4, s4, s9 -; GFX10-NEXT: s_lshl_b32 s8, s8, s11 -; GFX10-NEXT: s_lshr_b32 s13, s5, 8 -; GFX10-NEXT: s_or_b32 s4, s4, s8 +; GFX10-NEXT: s_or_b32 s4, s4, s7 +; GFX10-NEXT: s_and_b32 s7, s11, 0xff +; GFX10-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX10-NEXT: s_bfe_u32 s7, s7, 0x100000 +; GFX10-NEXT: s_and_b32 s5, s5, 0xff +; GFX10-NEXT: s_lshl_b32 s7, s7, 16 ; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX10-NEXT: s_and_b32 s8, s10, s9 ; GFX10-NEXT: v_mul_hi_u32 v3, v1, v3 -; GFX10-NEXT: s_bfe_u32 s8, s8, 0x100000 -; GFX10-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX10-NEXT: s_lshl_b32 s8, s8, 16 -; GFX10-NEXT: s_and_b32 s5, s5, s9 -; GFX10-NEXT: s_or_b32 s4, s4, s8 -; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2 -; GFX10-NEXT: s_lshl_b32 s5, s5, s11 -; GFX10-NEXT: s_and_b32 s8, s13, s9 +; GFX10-NEXT: s_lshl_b32 s5, s5, s10 +; GFX10-NEXT: s_or_b32 s4, s4, s7 +; GFX10-NEXT: s_and_b32 s7, s13, 0xff ; GFX10-NEXT: s_or_b32 s5, s12, s5 -; GFX10-NEXT: s_bfe_u32 s8, s8, 0x100000 +; GFX10-NEXT: s_bfe_u32 s7, s7, 0x100000 +; GFX10-NEXT: s_bfe_u32 s5, s5, 0x100000 +; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3 +; GFX10-NEXT: s_lshl_b32 s7, s7, 16 +; GFX10-NEXT: s_lshr_b32 s9, s1, 8 +; GFX10-NEXT: s_or_b32 s5, s5, s7 ; GFX10-NEXT: 
v_mul_hi_u32 v0, s4, v0 -; GFX10-NEXT: s_bfe_u32 s5, s5, 0x100000 -; GFX10-NEXT: s_lshl_b32 s8, s8, 16 -; GFX10-NEXT: s_lshr_b32 s10, s2, 16 -; GFX10-NEXT: s_or_b32 s5, s5, s8 -; GFX10-NEXT: s_lshr_b32 s8, s2, 8 ; GFX10-NEXT: v_mul_hi_u32 v1, s5, v1 -; GFX10-NEXT: s_and_b32 s8, s8, s9 +; GFX10-NEXT: s_and_b32 s1, s1, 0xff +; GFX10-NEXT: s_and_b32 s7, s9, 0xff +; GFX10-NEXT: s_lshl_b32 s1, s1, s10 +; GFX10-NEXT: s_lshr_b32 s9, s2, 16 +; GFX10-NEXT: s_or_b32 s1, s8, s1 +; GFX10-NEXT: s_lshr_b32 s8, s2, 8 ; GFX10-NEXT: v_mul_lo_u32 v0, v0, 24 -; GFX10-NEXT: s_and_b32 s12, s2, s9 -; GFX10-NEXT: s_lshl_b32 s8, s8, s11 -; GFX10-NEXT: s_and_b32 s10, s10, s9 -; GFX10-NEXT: s_or_b32 s8, s12, s8 -; GFX10-NEXT: s_lshr_b32 s2, s2, 24 ; GFX10-NEXT: v_mul_lo_u32 v1, v1, 24 -; GFX10-NEXT: s_bfe_u32 s6, s6, 0x100000 -; GFX10-NEXT: v_sub_nc_u32_e32 v0, s4, v0 -; GFX10-NEXT: s_bfe_u32 s4, s8, 0x100000 -; GFX10-NEXT: s_bfe_u32 s8, s10, 0x100000 +; GFX10-NEXT: s_and_b32 s8, s8, 0xff +; GFX10-NEXT: s_lshr_b32 s11, s2, 24 +; GFX10-NEXT: s_and_b32 s2, s2, 0xff +; GFX10-NEXT: s_lshl_b32 s8, s8, s10 ; GFX10-NEXT: s_bfe_u32 s7, s7, 0x100000 -; GFX10-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 24, v0 +; GFX10-NEXT: s_or_b32 s2, s2, s8 +; GFX10-NEXT: v_sub_nc_u32_e32 v0, s4, v0 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, s5, v1 +; GFX10-NEXT: s_lshr_b32 s4, s3, 8 +; GFX10-NEXT: s_and_b32 s5, s9, 0xff +; GFX10-NEXT: s_and_b32 s3, s3, 0xff +; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 24, v0 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 -; GFX10-NEXT: s_lshl_b32 s5, s8, 16 -; GFX10-NEXT: s_lshr_b32 s8, s3, 8 -; GFX10-NEXT: s_and_b32 s3, s3, s9 ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 24, v1 +; GFX10-NEXT: s_bfe_u32 s5, s5, 0x100000 +; GFX10-NEXT: s_lshl_b32 s3, s3, s10 +; GFX10-NEXT: s_and_b32 s4, s4, 0xff ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1 -; GFX10-NEXT: s_lshl_b32 s3, s3, s11 -; GFX10-NEXT: s_or_b32 s4, s4, s5 -; 
GFX10-NEXT: s_or_b32 s2, s2, s3 +; GFX10-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX10-NEXT: s_lshl_b32 s5, s5, 16 +; GFX10-NEXT: s_or_b32 s3, s11, s3 ; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 24, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 -; GFX10-NEXT: s_and_b32 s3, s8, s9 -; GFX10-NEXT: s_mov_b32 s5, 0xffffff +; GFX10-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX10-NEXT: s_or_b32 s2, s2, s5 ; GFX10-NEXT: s_bfe_u32 s3, s3, 0x100000 ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 24, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1 -; GFX10-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX10-NEXT: s_lshl_b32 s3, s3, 16 -; GFX10-NEXT: s_lshl_b32 s6, s6, 16 +; GFX10-NEXT: s_lshl_b32 s4, s4, 16 +; GFX10-NEXT: s_lshr_b32 s2, s2, 1 +; GFX10-NEXT: s_or_b32 s3, s3, s4 ; GFX10-NEXT: v_sub_nc_u32_e32 v2, 23, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v3, 0xffffff -; GFX10-NEXT: s_or_b32 s2, s2, s3 -; GFX10-NEXT: s_lshr_b32 s3, s4, 1 -; GFX10-NEXT: v_and_b32_e32 v2, s5, v2 -; GFX10-NEXT: v_sub_nc_u32_e32 v4, 23, v1 -; GFX10-NEXT: s_lshr_b32 s2, s2, 1 -; GFX10-NEXT: v_and_b32_e32 v0, s5, v0 -; GFX10-NEXT: v_and_b32_e32 v1, v1, v3 -; GFX10-NEXT: v_lshrrev_b32_e64 v2, v2, s3 -; GFX10-NEXT: v_and_b32_e32 v4, v4, v3 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GFX10-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX10-NEXT: s_lshl_b32 s6, s6, 16 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffffff, v2 +; GFX10-NEXT: v_sub_nc_u32_e32 v3, 23, v1 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffffff, v1 ; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX10-NEXT: s_lshl_b32 s7, s7, 16 +; GFX10-NEXT: v_lshrrev_b32_e64 v2, v2, s2 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffffff, v3 +; GFX10-NEXT: s_lshr_b32 s2, s3, 1 ; GFX10-NEXT: s_or_b32 s0, s0, s6 ; GFX10-NEXT: s_or_b32 s1, s1, s7 -; GFX10-NEXT: v_lshrrev_b32_e64 v3, v4, s2 ; GFX10-NEXT: v_lshl_or_b32 v0, s0, v0, v2 +; GFX10-NEXT: v_lshrrev_b32_e64 v3, v3, s2 ; 
GFX10-NEXT: s_mov_b32 s0, 8 -; GFX10-NEXT: v_lshl_or_b32 v1, s1, v1, v3 ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshl_or_b32 v1, s1, v1, v3 ; GFX10-NEXT: s_mov_b32 s0, 16 -; GFX10-NEXT: v_and_b32_e32 v3, s9, v1 -; GFX10-NEXT: v_and_or_b32 v2, v0, s9, v2 +; GFX10-NEXT: v_and_or_b32 v2, v0, 0xff, v2 +; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_bfe_u32 v4, v1, 8, 8 ; GFX10-NEXT: v_bfe_u32 v1, v1, 16, 8 @@ -2141,12 +2123,11 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v6, 24 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v7, 24 -; GFX10-NEXT: v_mov_b32_e32 v10, 0xffffff ; GFX10-NEXT: v_and_b32_e32 v4, 0xffffff, v4 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffffff, v5 ; GFX10-NEXT: v_bfe_u32 v2, v2, 1, 23 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v6, v6 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v7, v7 -; GFX10-NEXT: v_and_b32_e32 v5, v5, v10 ; GFX10-NEXT: v_bfe_u32 v3, v3, 1, 23 ; GFX10-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 ; GFX10-NEXT: v_mul_f32_e32 v7, 0x4f7ffffe, v7 @@ -2177,12 +2158,12 @@ ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v5 ; GFX10-NEXT: v_sub_nc_u32_e32 v6, 23, v4 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc_lo -; GFX10-NEXT: v_and_b32_e32 v4, v4, v10 -; GFX10-NEXT: v_and_b32_e32 v6, v6, v10 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffffff, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffffff, v6 ; GFX10-NEXT: v_sub_nc_u32_e32 v7, 23, v5 -; GFX10-NEXT: v_and_b32_e32 v5, v5, v10 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffffff, v5 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, v6, v2 -; GFX10-NEXT: v_and_b32_e32 v7, v7, v10 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffffff, v7 ; GFX10-NEXT: v_lshl_or_b32 v0, v0, v4, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, v7, v3 ; GFX10-NEXT: v_lshl_or_b32 v1, v1, v5, v3 @@ -3176,19 +3157,18 @@ ; GFX6-LABEL: s_fshl_v2i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: 
s_and_b32 s6, s4, 15 -; GFX6-NEXT: s_bfe_u32 s6, s6, 0x100000 ; GFX6-NEXT: s_andn2_b32 s4, 15, s4 -; GFX6-NEXT: s_lshl_b32 s0, s0, s6 -; GFX6-NEXT: s_mov_b32 s6, 0xf0001 -; GFX6-NEXT: s_bfe_u32 s2, s2, s6 +; GFX6-NEXT: s_bfe_u32 s6, s6, 0x100000 +; GFX6-NEXT: s_bfe_u32 s2, s2, 0xf0001 ; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX6-NEXT: s_lshl_b32 s0, s0, s6 ; GFX6-NEXT: s_lshr_b32 s2, s2, s4 ; GFX6-NEXT: s_or_b32 s0, s0, s2 ; GFX6-NEXT: s_and_b32 s2, s5, 15 ; GFX6-NEXT: s_andn2_b32 s4, 15, s5 ; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000 ; GFX6-NEXT: s_lshl_b32 s1, s1, s2 -; GFX6-NEXT: s_bfe_u32 s2, s3, s6 +; GFX6-NEXT: s_bfe_u32 s2, s3, 0xf0001 ; GFX6-NEXT: s_bfe_u32 s3, s4, 0x100000 ; GFX6-NEXT: s_lshr_b32 s2, s2, s3 ; GFX6-NEXT: s_or_b32 s1, s1, s2 @@ -3229,22 +3209,20 @@ ; ; GFX9-LABEL: s_fshl_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s3, 0xf000f -; GFX9-NEXT: s_and_b32 s4, s2, s3 -; GFX9-NEXT: s_andn2_b32 s2, s3, s2 -; GFX9-NEXT: s_lshr_b32 s3, s0, 16 -; GFX9-NEXT: s_lshr_b32 s5, s4, 16 -; GFX9-NEXT: s_lshl_b32 s0, s0, s4 -; GFX9-NEXT: s_lshl_b32 s3, s3, s5 -; GFX9-NEXT: s_mov_b32 s4, 0xffff +; GFX9-NEXT: s_and_b32 s3, s2, 0xf000f +; GFX9-NEXT: s_lshr_b32 s4, s0, 16 +; GFX9-NEXT: s_lshr_b32 s5, s3, 16 +; GFX9-NEXT: s_lshl_b32 s0, s0, s3 +; GFX9-NEXT: s_lshl_b32 s3, s4, s5 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s3 ; GFX9-NEXT: s_lshr_b32 s3, s1, 16 -; GFX9-NEXT: s_and_b32 s1, s1, s4 +; GFX9-NEXT: s_and_b32 s1, s1, 0xffff ; GFX9-NEXT: s_lshr_b32 s1, s1, 0x10001 ; GFX9-NEXT: s_lshr_b32 s3, s3, 1 +; GFX9-NEXT: s_andn2_b32 s2, 0xf000f, s2 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s3 ; GFX9-NEXT: s_lshr_b32 s3, s1, 16 -; GFX9-NEXT: s_and_b32 s1, s1, s4 +; GFX9-NEXT: s_and_b32 s1, s1, 0xffff ; GFX9-NEXT: s_lshr_b32 s4, s2, 16 ; GFX9-NEXT: s_lshr_b32 s1, s1, s2 ; GFX9-NEXT: s_lshr_b32 s2, s3, s4 @@ -3254,22 +3232,20 @@ ; ; GFX10-LABEL: s_fshl_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s5, 0xffff -; GFX10-NEXT: s_mov_b32 s3, 0xf000f -; GFX10-NEXT: s_and_b32 s7, 
s1, s5 +; GFX10-NEXT: s_and_b32 s6, s1, 0xffff ; GFX10-NEXT: s_lshr_b32 s1, s1, 16 -; GFX10-NEXT: s_lshr_b32 s7, s7, 0x10001 +; GFX10-NEXT: s_and_b32 s3, s2, 0xf000f +; GFX10-NEXT: s_lshr_b32 s6, s6, 0x10001 ; GFX10-NEXT: s_lshr_b32 s1, s1, 1 -; GFX10-NEXT: s_and_b32 s4, s2, s3 -; GFX10-NEXT: s_andn2_b32 s2, s3, s2 -; GFX10-NEXT: s_pack_ll_b32_b16 s1, s7, s1 -; GFX10-NEXT: s_lshr_b32 s3, s0, 16 -; GFX10-NEXT: s_lshr_b32 s6, s4, 16 -; GFX10-NEXT: s_lshl_b32 s0, s0, s4 +; GFX10-NEXT: s_andn2_b32 s2, 0xf000f, s2 +; GFX10-NEXT: s_lshr_b32 s4, s0, 16 +; GFX10-NEXT: s_lshr_b32 s5, s3, 16 +; GFX10-NEXT: s_pack_ll_b32_b16 s1, s6, s1 +; GFX10-NEXT: s_lshl_b32 s0, s0, s3 +; GFX10-NEXT: s_lshl_b32 s3, s4, s5 ; GFX10-NEXT: s_lshr_b32 s4, s1, 16 -; GFX10-NEXT: s_and_b32 s1, s1, s5 +; GFX10-NEXT: s_and_b32 s1, s1, 0xffff ; GFX10-NEXT: s_lshr_b32 s5, s2, 16 -; GFX10-NEXT: s_lshl_b32 s3, s3, s6 ; GFX10-NEXT: s_lshr_b32 s1, s1, s2 ; GFX10-NEXT: s_lshr_b32 s2, s4, s5 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s3 @@ -3347,10 +3323,9 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_xor_b32_e32 v3, -1, v2 -; GFX10-NEXT: s_mov_b32 s4, 0xf000f +; GFX10-NEXT: v_and_b32_e32 v2, 0xf000f, v2 ; GFX10-NEXT: v_pk_lshrrev_b16 v1, 1, v1 op_sel_hi:[0,1] -; GFX10-NEXT: v_and_b32_e32 v2, s4, v2 -; GFX10-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 0xf000f, v3 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, v2, v0 ; GFX10-NEXT: v_pk_lshrrev_b16 v1, v3, v1 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 @@ -3420,19 +3395,18 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: v_and_b32_e32 v2, 15, v0 ; GFX6-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX6-NEXT: v_bfe_u32 v2, v2, 0, 16 ; GFX6-NEXT: v_and_b32_e32 v0, 15, v0 +; GFX6-NEXT: v_bfe_u32 v2, v2, 0, 16 ; GFX6-NEXT: v_lshl_b32_e32 v2, s0, v2 -; GFX6-NEXT: s_mov_b32 s0, 0xf0001 -; GFX6-NEXT: s_bfe_u32 s2, s2, s0 +; GFX6-NEXT: s_bfe_u32 s0, s2, 0xf0001 ; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16 -; GFX6-NEXT: v_lshr_b32_e32 v0, 
s2, v0 +; GFX6-NEXT: v_lshr_b32_e32 v0, s0, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX6-NEXT: v_and_b32_e32 v2, 15, v1 ; GFX6-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX6-NEXT: v_and_b32_e32 v1, 15, v1 ; GFX6-NEXT: v_bfe_u32 v2, v2, 0, 16 -; GFX6-NEXT: s_bfe_u32 s0, s3, s0 +; GFX6-NEXT: s_bfe_u32 s0, s3, 0xf0001 ; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16 ; GFX6-NEXT: v_lshl_b32_e32 v2, s1, v2 ; GFX6-NEXT: v_lshr_b32_e32 v1, s0, v1 @@ -3488,13 +3462,12 @@ ; GFX10-LABEL: v_fshl_v2i16_ssv: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_xor_b32_e32 v1, -1, v0 -; GFX10-NEXT: s_mov_b32 s2, 0xf000f -; GFX10-NEXT: s_lshr_b32 s3, s1, 16 +; GFX10-NEXT: s_lshr_b32 s2, s1, 16 ; GFX10-NEXT: s_and_b32 s1, s1, 0xffff -; GFX10-NEXT: v_and_b32_e32 v0, s2, v0 -; GFX10-NEXT: v_and_b32_e32 v1, s2, v1 +; GFX10-NEXT: v_and_b32_e32 v0, 0xf000f, v0 ; GFX10-NEXT: s_lshr_b32 s1, s1, 0x10001 -; GFX10-NEXT: s_lshr_b32 s2, s3, 1 +; GFX10-NEXT: v_and_b32_e32 v1, 0xf000f, v1 +; GFX10-NEXT: s_lshr_b32 s2, s2, 1 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s2 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, v0, s0 ; GFX10-NEXT: v_pk_lshrrev_b16 v1, v1, s1 @@ -3556,13 +3529,12 @@ ; ; GFX9-LABEL: v_fshl_v2i16_svs: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s2, 0xf000f -; GFX9-NEXT: s_and_b32 s3, s1, s2 -; GFX9-NEXT: s_andn2_b32 s1, s2, s1 -; GFX9-NEXT: s_lshr_b32 s2, s0, 16 -; GFX9-NEXT: s_lshr_b32 s4, s3, 16 -; GFX9-NEXT: s_lshl_b32 s0, s0, s3 -; GFX9-NEXT: s_lshl_b32 s2, s2, s4 +; GFX9-NEXT: s_and_b32 s2, s1, 0xf000f +; GFX9-NEXT: s_lshr_b32 s3, s0, 16 +; GFX9-NEXT: s_lshr_b32 s4, s2, 16 +; GFX9-NEXT: s_andn2_b32 s1, 0xf000f, s1 +; GFX9-NEXT: s_lshl_b32 s0, s0, s2 +; GFX9-NEXT: s_lshl_b32 s2, s3, s4 ; GFX9-NEXT: v_pk_lshrrev_b16 v0, 1, v0 op_sel_hi:[0,1] ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 ; GFX9-NEXT: v_pk_lshrrev_b16 v0, s1, v0 @@ -3571,15 +3543,14 @@ ; ; GFX10-LABEL: v_fshl_v2i16_svs: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s2, 0xf000f ; GFX10-NEXT: v_pk_lshrrev_b16 v0, 1, v0 op_sel_hi:[0,1] -; GFX10-NEXT: s_and_b32 s3, s1, s2 
-; GFX10-NEXT: s_andn2_b32 s1, s2, s1 -; GFX10-NEXT: s_lshr_b32 s2, s0, 16 -; GFX10-NEXT: s_lshr_b32 s4, s3, 16 +; GFX10-NEXT: s_and_b32 s2, s1, 0xf000f +; GFX10-NEXT: s_andn2_b32 s1, 0xf000f, s1 +; GFX10-NEXT: s_lshr_b32 s3, s0, 16 +; GFX10-NEXT: s_lshr_b32 s4, s2, 16 ; GFX10-NEXT: v_pk_lshrrev_b16 v0, s1, v0 -; GFX10-NEXT: s_lshl_b32 s0, s0, s3 -; GFX10-NEXT: s_lshl_b32 s1, s2, s4 +; GFX10-NEXT: s_lshl_b32 s0, s0, s2 +; GFX10-NEXT: s_lshl_b32 s1, s3, s4 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s1 ; GFX10-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX10-NEXT: ; return to shader part epilog @@ -3592,19 +3563,18 @@ ; GFX6-LABEL: v_fshl_v2i16_vss: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_and_b32 s4, s2, 15 -; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000 ; GFX6-NEXT: s_andn2_b32 s2, 15, s2 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX6-NEXT: s_mov_b32 s4, 0xf0001 -; GFX6-NEXT: s_bfe_u32 s0, s0, s4 +; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX6-NEXT: s_bfe_u32 s0, s0, 0xf0001 ; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX6-NEXT: s_lshr_b32 s0, s0, s2 ; GFX6-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX6-NEXT: s_and_b32 s0, s3, 15 ; GFX6-NEXT: s_andn2_b32 s2, 15, s3 ; GFX6-NEXT: s_bfe_u32 s0, s0, 0x100000 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, s0, v1 -; GFX6-NEXT: s_bfe_u32 s0, s1, s4 +; GFX6-NEXT: s_bfe_u32 s0, s1, 0xf0001 ; GFX6-NEXT: s_bfe_u32 s1, s2, 0x100000 ; GFX6-NEXT: s_lshr_b32 s0, s0, s1 ; GFX6-NEXT: v_or_b32_e32 v1, s0, v1 @@ -3642,18 +3612,16 @@ ; ; GFX9-LABEL: v_fshl_v2i16_vss: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s2, 0xf000f -; GFX9-NEXT: s_and_b32 s3, s1, s2 -; GFX9-NEXT: v_pk_lshlrev_b16 v0, s3, v0 -; GFX9-NEXT: s_mov_b32 s3, 0xffff -; GFX9-NEXT: s_andn2_b32 s1, s2, s1 +; GFX9-NEXT: s_and_b32 s2, s1, 0xf000f +; GFX9-NEXT: v_pk_lshlrev_b16 v0, s2, v0 ; GFX9-NEXT: s_lshr_b32 s2, s0, 16 -; GFX9-NEXT: s_and_b32 s0, s0, s3 +; GFX9-NEXT: s_and_b32 s0, s0, 0xffff ; GFX9-NEXT: s_lshr_b32 s0, s0, 0x10001 ; GFX9-NEXT: s_lshr_b32 s2, s2, 1 +; GFX9-NEXT: 
s_andn2_b32 s1, 0xf000f, s1 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 ; GFX9-NEXT: s_lshr_b32 s2, s0, 16 -; GFX9-NEXT: s_and_b32 s0, s0, s3 +; GFX9-NEXT: s_and_b32 s0, s0, 0xffff ; GFX9-NEXT: s_lshr_b32 s3, s1, 16 ; GFX9-NEXT: s_lshr_b32 s0, s0, s1 ; GFX9-NEXT: s_lshr_b32 s1, s2, s3 @@ -3663,18 +3631,16 @@ ; ; GFX10-LABEL: v_fshl_v2i16_vss: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s3, 0xffff -; GFX10-NEXT: s_mov_b32 s2, 0xf000f -; GFX10-NEXT: s_and_b32 s5, s0, s3 +; GFX10-NEXT: s_and_b32 s3, s0, 0xffff ; GFX10-NEXT: s_lshr_b32 s0, s0, 16 -; GFX10-NEXT: s_lshr_b32 s5, s5, 0x10001 +; GFX10-NEXT: s_lshr_b32 s3, s3, 0x10001 ; GFX10-NEXT: s_lshr_b32 s0, s0, 1 -; GFX10-NEXT: s_and_b32 s4, s1, s2 -; GFX10-NEXT: s_andn2_b32 s1, s2, s1 -; GFX10-NEXT: s_pack_ll_b32_b16 s0, s5, s0 -; GFX10-NEXT: v_pk_lshlrev_b16 v0, s4, v0 +; GFX10-NEXT: s_and_b32 s2, s1, 0xf000f +; GFX10-NEXT: s_andn2_b32 s1, 0xf000f, s1 +; GFX10-NEXT: s_pack_ll_b32_b16 s0, s3, s0 +; GFX10-NEXT: v_pk_lshlrev_b16 v0, s2, v0 ; GFX10-NEXT: s_lshr_b32 s2, s0, 16 -; GFX10-NEXT: s_and_b32 s0, s0, s3 +; GFX10-NEXT: s_and_b32 s0, s0, 0xffff ; GFX10-NEXT: s_lshr_b32 s3, s1, 16 ; GFX10-NEXT: s_lshr_b32 s0, s0, s1 ; GFX10-NEXT: s_lshr_b32 s1, s2, s3 @@ -3704,19 +3670,18 @@ ; GFX6-LABEL: s_fshl_v4i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_and_b32 s12, s8, 15 -; GFX6-NEXT: s_bfe_u32 s12, s12, 0x100000 ; GFX6-NEXT: s_andn2_b32 s8, 15, s8 -; GFX6-NEXT: s_lshl_b32 s0, s0, s12 -; GFX6-NEXT: s_mov_b32 s12, 0xf0001 -; GFX6-NEXT: s_bfe_u32 s4, s4, s12 +; GFX6-NEXT: s_bfe_u32 s12, s12, 0x100000 +; GFX6-NEXT: s_bfe_u32 s4, s4, 0xf0001 ; GFX6-NEXT: s_bfe_u32 s8, s8, 0x100000 +; GFX6-NEXT: s_lshl_b32 s0, s0, s12 ; GFX6-NEXT: s_lshr_b32 s4, s4, s8 ; GFX6-NEXT: s_or_b32 s0, s0, s4 ; GFX6-NEXT: s_and_b32 s4, s9, 15 ; GFX6-NEXT: s_andn2_b32 s8, 15, s9 ; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000 ; GFX6-NEXT: s_lshl_b32 s1, s1, s4 -; GFX6-NEXT: s_bfe_u32 s4, s5, s12 +; GFX6-NEXT: s_bfe_u32 s4, s5, 0xf0001 ; GFX6-NEXT: s_bfe_u32 s5, s8, 0x100000 
; GFX6-NEXT: s_lshr_b32 s4, s4, s5 ; GFX6-NEXT: s_or_b32 s1, s1, s4 @@ -3724,7 +3689,7 @@ ; GFX6-NEXT: s_andn2_b32 s5, 15, s10 ; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000 ; GFX6-NEXT: s_lshl_b32 s2, s2, s4 -; GFX6-NEXT: s_bfe_u32 s4, s6, s12 +; GFX6-NEXT: s_bfe_u32 s4, s6, 0xf0001 ; GFX6-NEXT: s_bfe_u32 s5, s5, 0x100000 ; GFX6-NEXT: s_lshr_b32 s4, s4, s5 ; GFX6-NEXT: s_or_b32 s2, s2, s4 @@ -3732,7 +3697,7 @@ ; GFX6-NEXT: s_andn2_b32 s5, 15, s11 ; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000 ; GFX6-NEXT: s_lshl_b32 s3, s3, s4 -; GFX6-NEXT: s_bfe_u32 s4, s7, s12 +; GFX6-NEXT: s_bfe_u32 s4, s7, 0xf0001 ; GFX6-NEXT: s_bfe_u32 s5, s5, 0x100000 ; GFX6-NEXT: s_lshr_b32 s4, s4, s5 ; GFX6-NEXT: s_bfe_u32 s1, s1, 0x100000 @@ -3801,42 +3766,39 @@ ; ; GFX9-LABEL: s_fshl_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s6, 0xf000f -; GFX9-NEXT: s_and_b32 s7, s4, s6 -; GFX9-NEXT: s_lshr_b32 s9, s0, 16 -; GFX9-NEXT: s_lshr_b32 s10, s7, 16 -; GFX9-NEXT: s_lshl_b32 s0, s0, s7 -; GFX9-NEXT: s_lshl_b32 s7, s9, s10 -; GFX9-NEXT: s_mov_b32 s9, 0xffff -; GFX9-NEXT: s_mov_b32 s8, 0x10001 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s7 -; GFX9-NEXT: s_lshr_b32 s7, s2, 16 -; GFX9-NEXT: s_and_b32 s2, s2, s9 -; GFX9-NEXT: s_lshr_b32 s2, s2, s8 -; GFX9-NEXT: s_lshr_b32 s7, s7, 1 -; GFX9-NEXT: s_andn2_b32 s4, s6, s4 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s7 -; GFX9-NEXT: s_lshr_b32 s7, s2, 16 -; GFX9-NEXT: s_and_b32 s2, s2, s9 -; GFX9-NEXT: s_lshr_b32 s10, s4, 16 +; GFX9-NEXT: s_and_b32 s6, s4, 0xf000f +; GFX9-NEXT: s_lshr_b32 s7, s0, 16 +; GFX9-NEXT: s_lshr_b32 s8, s6, 16 +; GFX9-NEXT: s_lshl_b32 s0, s0, s6 +; GFX9-NEXT: s_lshl_b32 s6, s7, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s6 +; GFX9-NEXT: s_lshr_b32 s6, s2, 16 +; GFX9-NEXT: s_and_b32 s2, s2, 0xffff +; GFX9-NEXT: s_lshr_b32 s2, s2, 0x10001 +; GFX9-NEXT: s_lshr_b32 s6, s6, 1 +; GFX9-NEXT: s_andn2_b32 s4, 0xf000f, s4 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s6 +; GFX9-NEXT: s_lshr_b32 s6, s2, 16 +; GFX9-NEXT: s_and_b32 s2, s2, 0xffff +; GFX9-NEXT: 
s_lshr_b32 s7, s4, 16 ; GFX9-NEXT: s_lshr_b32 s2, s2, s4 -; GFX9-NEXT: s_lshr_b32 s4, s7, s10 +; GFX9-NEXT: s_lshr_b32 s4, s6, s7 ; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s4 ; GFX9-NEXT: s_or_b32 s0, s0, s2 -; GFX9-NEXT: s_and_b32 s2, s5, s6 -; GFX9-NEXT: s_andn2_b32 s4, s6, s5 +; GFX9-NEXT: s_and_b32 s2, s5, 0xf000f +; GFX9-NEXT: s_andn2_b32 s4, 0xf000f, s5 ; GFX9-NEXT: s_lshr_b32 s5, s1, 16 ; GFX9-NEXT: s_lshr_b32 s6, s2, 16 ; GFX9-NEXT: s_lshl_b32 s1, s1, s2 ; GFX9-NEXT: s_lshl_b32 s2, s5, s6 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s2 ; GFX9-NEXT: s_lshr_b32 s2, s3, 16 -; GFX9-NEXT: s_and_b32 s3, s3, s9 -; GFX9-NEXT: s_lshr_b32 s3, s3, s8 +; GFX9-NEXT: s_and_b32 s3, s3, 0xffff +; GFX9-NEXT: s_lshr_b32 s3, s3, 0x10001 ; GFX9-NEXT: s_lshr_b32 s2, s2, 1 ; GFX9-NEXT: s_pack_ll_b32_b16 s2, s3, s2 ; GFX9-NEXT: s_lshr_b32 s3, s2, 16 -; GFX9-NEXT: s_and_b32 s2, s2, s9 +; GFX9-NEXT: s_and_b32 s2, s2, 0xffff ; GFX9-NEXT: s_lshr_b32 s5, s4, 16 ; GFX9-NEXT: s_lshr_b32 s2, s2, s4 ; GFX9-NEXT: s_lshr_b32 s3, s3, s5 @@ -3846,40 +3808,37 @@ ; ; GFX10-LABEL: s_fshl_v4i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s10, 0xffff -; GFX10-NEXT: s_mov_b32 s6, 0xf000f -; GFX10-NEXT: s_mov_b32 s8, 0x10001 -; GFX10-NEXT: s_and_b32 s12, s2, s10 +; GFX10-NEXT: s_and_b32 s9, s2, 0xffff ; GFX10-NEXT: s_lshr_b32 s2, s2, 16 -; GFX10-NEXT: s_and_b32 s7, s4, s6 -; GFX10-NEXT: s_lshr_b32 s12, s12, s8 +; GFX10-NEXT: s_and_b32 s6, s4, 0xf000f +; GFX10-NEXT: s_lshr_b32 s9, s9, 0x10001 ; GFX10-NEXT: s_lshr_b32 s2, s2, 1 -; GFX10-NEXT: s_andn2_b32 s4, s6, s4 -; GFX10-NEXT: s_lshr_b32 s9, s0, 16 -; GFX10-NEXT: s_lshr_b32 s11, s7, 16 -; GFX10-NEXT: s_pack_ll_b32_b16 s2, s12, s2 -; GFX10-NEXT: s_lshl_b32 s0, s0, s7 -; GFX10-NEXT: s_lshl_b32 s7, s9, s11 -; GFX10-NEXT: s_lshr_b32 s9, s2, 16 -; GFX10-NEXT: s_and_b32 s2, s2, s10 -; GFX10-NEXT: s_lshr_b32 s11, s4, 16 +; GFX10-NEXT: s_andn2_b32 s4, 0xf000f, s4 +; GFX10-NEXT: s_lshr_b32 s7, s0, 16 +; GFX10-NEXT: s_lshr_b32 s8, s6, 16 +; GFX10-NEXT: 
s_pack_ll_b32_b16 s2, s9, s2 +; GFX10-NEXT: s_lshl_b32 s0, s0, s6 +; GFX10-NEXT: s_lshl_b32 s6, s7, s8 +; GFX10-NEXT: s_lshr_b32 s7, s2, 16 +; GFX10-NEXT: s_and_b32 s2, s2, 0xffff +; GFX10-NEXT: s_lshr_b32 s8, s4, 16 ; GFX10-NEXT: s_lshr_b32 s2, s2, s4 -; GFX10-NEXT: s_lshr_b32 s4, s9, s11 -; GFX10-NEXT: s_and_b32 s9, s3, s10 +; GFX10-NEXT: s_lshr_b32 s4, s7, s8 +; GFX10-NEXT: s_and_b32 s8, s3, 0xffff ; GFX10-NEXT: s_lshr_b32 s3, s3, 16 ; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s4 -; GFX10-NEXT: s_and_b32 s4, s5, s6 -; GFX10-NEXT: s_lshr_b32 s8, s9, s8 +; GFX10-NEXT: s_and_b32 s4, s5, 0xf000f +; GFX10-NEXT: s_lshr_b32 s8, s8, 0x10001 ; GFX10-NEXT: s_lshr_b32 s3, s3, 1 -; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s7 -; GFX10-NEXT: s_andn2_b32 s5, s6, s5 +; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s6 +; GFX10-NEXT: s_andn2_b32 s5, 0xf000f, s5 ; GFX10-NEXT: s_lshr_b32 s6, s1, 16 ; GFX10-NEXT: s_lshr_b32 s7, s4, 16 ; GFX10-NEXT: s_pack_ll_b32_b16 s3, s8, s3 ; GFX10-NEXT: s_lshl_b32 s1, s1, s4 ; GFX10-NEXT: s_lshl_b32 s4, s6, s7 ; GFX10-NEXT: s_lshr_b32 s6, s3, 16 -; GFX10-NEXT: s_and_b32 s3, s3, s10 +; GFX10-NEXT: s_and_b32 s3, s3, 0xffff ; GFX10-NEXT: s_lshr_b32 s7, s5, 16 ; GFX10-NEXT: s_lshr_b32 s3, s3, s5 ; GFX10-NEXT: s_lshr_b32 s5, s6, s7 @@ -4003,16 +3962,15 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_xor_b32_e32 v6, -1, v4 ; GFX10-NEXT: v_xor_b32_e32 v7, -1, v5 -; GFX10-NEXT: s_mov_b32 s4, 0xf000f +; GFX10-NEXT: v_and_b32_e32 v4, 0xf000f, v4 ; GFX10-NEXT: v_pk_lshrrev_b16 v2, 1, v2 op_sel_hi:[0,1] -; GFX10-NEXT: v_and_b32_e32 v4, s4, v4 -; GFX10-NEXT: v_and_b32_e32 v6, s4, v6 -; GFX10-NEXT: v_and_b32_e32 v5, s4, v5 -; GFX10-NEXT: v_and_b32_e32 v7, s4, v7 +; GFX10-NEXT: v_and_b32_e32 v5, 0xf000f, v5 +; GFX10-NEXT: v_and_b32_e32 v6, 0xf000f, v6 +; GFX10-NEXT: v_and_b32_e32 v7, 0xf000f, v7 ; GFX10-NEXT: v_pk_lshrrev_b16 v3, 1, v3 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v0, v4, v0 -; GFX10-NEXT: v_pk_lshrrev_b16 v2, v6, v2 ; GFX10-NEXT: 
v_pk_lshlrev_b16 v1, v5, v1 +; GFX10-NEXT: v_pk_lshrrev_b16 v2, v6, v2 ; GFX10-NEXT: v_pk_lshrrev_b16 v3, v7, v3 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX10-NEXT: v_or_b32_e32 v1, v1, v3 @@ -4893,14 +4851,13 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_movk_i32 s4, 0x7f -; GFX10-NEXT: v_lshrrev_b64 v[4:5], 1, v[4:5] -; GFX10-NEXT: v_and_b32_e32 v18, s4, v8 +; GFX10-NEXT: v_and_b32_e32 v18, 0x7f, v8 ; GFX10-NEXT: v_xor_b32_e32 v8, -1, v8 +; GFX10-NEXT: v_lshrrev_b64 v[4:5], 1, v[4:5] ; GFX10-NEXT: v_lshlrev_b32_e32 v12, 31, v6 ; GFX10-NEXT: v_lshrrev_b64 v[6:7], 1, v[6:7] ; GFX10-NEXT: v_sub_nc_u32_e32 v9, 64, v18 -; GFX10-NEXT: v_and_b32_e32 v19, s4, v8 +; GFX10-NEXT: v_and_b32_e32 v19, 0x7f, v8 ; GFX10-NEXT: v_lshlrev_b64 v[10:11], v18, v[2:3] ; GFX10-NEXT: v_or_b32_e32 v5, v5, v12 ; GFX10-NEXT: v_subrev_nc_u32_e32 v20, 64, v18 @@ -5104,44 +5061,43 @@ ; ; GFX10-LABEL: v_fshl_i128_ssv: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_movk_i32 s9, 0x7f -; GFX10-NEXT: v_xor_b32_e32 v4, -1, v0 -; GFX10-NEXT: v_and_b32_e32 v12, s9, v0 +; GFX10-NEXT: v_and_b32_e32 v12, 0x7f, v0 +; GFX10-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX10-NEXT: s_mov_b32 s8, 0 ; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], 1 -; GFX10-NEXT: v_and_b32_e32 v13, s9, v4 +; GFX10-NEXT: s_lshl_b32 s9, s6, 31 ; GFX10-NEXT: v_sub_nc_u32_e32 v2, 64, v12 +; GFX10-NEXT: v_and_b32_e32 v13, 0x7f, v0 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v12, s[2:3] -; GFX10-NEXT: s_lshl_b32 s9, s6, 31 -; GFX10-NEXT: s_lshr_b64 s[6:7], s[6:7], 1 ; GFX10-NEXT: s_or_b64 s[8:9], s[4:5], s[8:9] +; GFX10-NEXT: s_lshr_b64 s[6:7], s[6:7], 1 ; GFX10-NEXT: v_lshrrev_b64 v[2:3], v2, s[0:1] +; GFX10-NEXT: v_sub_nc_u32_e32 v8, 64, v13 ; GFX10-NEXT: v_subrev_nc_u32_e32 v10, 64, v12 -; GFX10-NEXT: v_subrev_nc_u32_e32 v14, 64, v13 ; GFX10-NEXT: v_lshrrev_b64 v[6:7], v13, s[8:9] ; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v12 ; GFX10-NEXT: v_lshlrev_b64 v[4:5], v12, s[0:1] ; 
GFX10-NEXT: v_or_b32_e32 v2, v2, v0 -; GFX10-NEXT: v_sub_nc_u32_e32 v0, 64, v13 +; GFX10-NEXT: v_subrev_nc_u32_e32 v0, 64, v13 +; GFX10-NEXT: v_lshlrev_b64 v[8:9], v8, s[6:7] ; GFX10-NEXT: v_lshlrev_b64 v[10:11], v10, s[0:1] ; GFX10-NEXT: v_or_b32_e32 v3, v3, v1 ; GFX10-NEXT: v_cmp_gt_u32_e64 s0, 64, v13 +; GFX10-NEXT: v_lshrrev_b64 v[0:1], v0, s[6:7] ; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 0, v13 -; GFX10-NEXT: v_lshlrev_b64 v[8:9], v0, s[6:7] -; GFX10-NEXT: v_lshrrev_b64 v[0:1], v14, s[6:7] -; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 0, v12 -; GFX10-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc_lo ; GFX10-NEXT: v_or_b32_e32 v6, v6, v8 ; GFX10-NEXT: v_or_b32_e32 v7, v7, v9 ; GFX10-NEXT: v_cndmask_b32_e32 v8, v10, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v10, v11, v3, vcc_lo ; GFX10-NEXT: v_lshrrev_b64 v[2:3], v13, s[6:7] ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v6, s0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 0, v12 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v7, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s8, s1 ; GFX10-NEXT: v_cndmask_b32_e64 v6, v8, s2, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v7, v10, s3, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s8, s1 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s9, s1 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, v2, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, v3, s0 @@ -6341,14 +6297,13 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_movk_i32 s7, 0x7f -; GFX10-NEXT: v_lshrrev_b64 v[8:9], 1, v[8:9] -; GFX10-NEXT: v_and_b32_e32 v27, s7, v16 +; GFX10-NEXT: v_and_b32_e32 v27, 0x7f, v16 ; GFX10-NEXT: v_xor_b32_e32 v16, -1, v16 +; GFX10-NEXT: v_lshrrev_b64 v[8:9], 1, v[8:9] ; GFX10-NEXT: v_lshlrev_b32_e32 v21, 31, v10 ; GFX10-NEXT: v_lshrrev_b64 v[10:11], 1, v[10:11] ; GFX10-NEXT: v_sub_nc_u32_e32 v17, 64, v27 -; GFX10-NEXT: v_and_b32_e32 
v28, s7, v16 +; GFX10-NEXT: v_and_b32_e32 v28, 0x7f, v16 ; GFX10-NEXT: v_lshlrev_b64 v[18:19], v27, v[2:3] ; GFX10-NEXT: v_or_b32_e32 v9, v9, v21 ; GFX10-NEXT: v_subrev_nc_u32_e32 v29, 64, v27 @@ -6380,7 +6335,7 @@ ; GFX10-NEXT: v_cndmask_b32_e64 v2, v18, v2, s6 ; GFX10-NEXT: v_cndmask_b32_e64 v8, v10, v9, s5 ; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, v0, s4 -; GFX10-NEXT: v_and_b32_e32 v23, s7, v20 +; GFX10-NEXT: v_and_b32_e32 v23, 0x7f, v20 ; GFX10-NEXT: v_or_b32_e32 v0, v21, v3 ; GFX10-NEXT: v_xor_b32_e32 v3, -1, v20 ; GFX10-NEXT: v_cndmask_b32_e64 v24, 0, v1, s4 @@ -6389,7 +6344,7 @@ ; GFX10-NEXT: v_or_b32_e32 v2, v2, v9 ; GFX10-NEXT: v_lshrrev_b64 v[8:9], 1, v[12:13] ; GFX10-NEXT: v_lshlrev_b32_e32 v16, 31, v14 -; GFX10-NEXT: v_and_b32_e32 v25, s7, v3 +; GFX10-NEXT: v_and_b32_e32 v25, 0x7f, v3 ; GFX10-NEXT: v_lshrrev_b64 v[10:11], v10, v[4:5] ; GFX10-NEXT: v_lshlrev_b64 v[12:13], v23, v[6:7] ; GFX10-NEXT: v_lshrrev_b64 v[14:15], 1, v[14:15] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll @@ -9,12 +9,12 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v0, 7 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX6-NEXT: s_and_b32 s2, s2, 0x7f ; GFX6-NEXT: s_movk_i32 s3, 0x7f -; GFX6-NEXT: s_and_b32 s2, s2, s3 ; GFX6-NEXT: s_lshl_b32 s0, s0, 1 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: s_and_b32 s1, s1, s3 +; GFX6-NEXT: s_and_b32 s1, s1, 0x7f ; GFX6-NEXT: v_mul_lo_u32 v1, -7, v0 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 @@ -40,12 +40,12 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, 7 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX8-NEXT: s_and_b32 s2, s2, 0x7f ; GFX8-NEXT: s_movk_i32 s3, 0x7f -; GFX8-NEXT: s_and_b32 s2, s2, s3 ; GFX8-NEXT: s_lshl_b32 s0, s0, 1 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; 
GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX8-NEXT: s_and_b32 s1, s1, s3 +; GFX8-NEXT: s_and_b32 s1, s1, 0x7f ; GFX8-NEXT: v_mul_lo_u32 v1, -7, v0 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 @@ -71,12 +71,12 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, 7 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: s_and_b32 s2, s2, 0x7f ; GFX9-NEXT: s_movk_i32 s3, 0x7f -; GFX9-NEXT: s_and_b32 s2, s2, s3 ; GFX9-NEXT: s_lshl_b32 s0, s0, 1 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: s_and_b32 s1, s1, s3 +; GFX9-NEXT: s_and_b32 s1, s1, 0x7f ; GFX9-NEXT: v_mul_lo_u32 v1, -7, v0 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 @@ -101,10 +101,9 @@ ; GFX10-LABEL: s_fshr_i7: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, 7 -; GFX10-NEXT: s_movk_i32 s3, 0x7f +; GFX10-NEXT: s_and_b32 s2, s2, 0x7f ; GFX10-NEXT: s_lshl_b32 s0, s0, 1 -; GFX10-NEXT: s_and_b32 s2, s2, s3 -; GFX10-NEXT: s_and_b32 s1, s1, s3 +; GFX10-NEXT: s_and_b32 s1, s1, 0x7f ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -121,8 +120,8 @@ ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX10-NEXT: v_sub_nc_u16 v1, 6, v0 -; GFX10-NEXT: v_and_b32_e32 v0, s3, v0 -; GFX10-NEXT: v_and_b32_e32 v1, s3, v1 +; GFX10-NEXT: v_and_b32_e32 v0, 0x7f, v0 +; GFX10-NEXT: v_and_b32_e32 v1, 0x7f, v1 ; GFX10-NEXT: v_lshrrev_b16 v0, v0, s1 ; GFX10-NEXT: v_lshlrev_b16 v1, v1, s0 ; GFX10-NEXT: v_or_b32_e32 v0, v1, v0 @@ -249,12 +248,11 @@ ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 7, v2 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v2 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v3, 0x7f -; GFX10-NEXT: v_sub_nc_u16 v4, 6, v2 -; GFX10-NEXT: v_and_b32_e32 v2, v2, v3 -; GFX10-NEXT: v_and_b32_e32 v4, v4, v3 +; GFX10-NEXT: v_sub_nc_u16 v3, 6, v2 +; 
GFX10-NEXT: v_and_b32_e32 v2, 0x7f, v2 +; GFX10-NEXT: v_and_b32_e32 v3, 0x7f, v3 ; GFX10-NEXT: v_lshrrev_b16 v1, v2, v1 -; GFX10-NEXT: v_lshlrev_b16 v0, v4, v0 +; GFX10-NEXT: v_lshlrev_b16 v0, v3, v0 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call i7 @llvm.fshr.i7(i7 %lhs, i7 %rhs, i7 %amt) @@ -530,10 +528,9 @@ ; GFX6-NEXT: s_lshr_b32 s4, s2, 8 ; GFX6-NEXT: s_and_b32 s5, s2, 7 ; GFX6-NEXT: s_andn2_b32 s2, 7, s2 -; GFX6-NEXT: s_movk_i32 s6, 0xff ; GFX6-NEXT: s_lshl_b32 s0, s0, 1 ; GFX6-NEXT: s_lshl_b32 s0, s0, s2 -; GFX6-NEXT: s_and_b32 s2, s1, s6 +; GFX6-NEXT: s_and_b32 s2, s1, 0xff ; GFX6-NEXT: s_lshr_b32 s2, s2, s5 ; GFX6-NEXT: s_or_b32 s0, s0, s2 ; GFX6-NEXT: s_and_b32 s2, s4, 7 @@ -543,8 +540,8 @@ ; GFX6-NEXT: s_lshl_b32 s3, s3, s4 ; GFX6-NEXT: s_lshr_b32 s1, s1, s2 ; GFX6-NEXT: s_or_b32 s1, s3, s1 -; GFX6-NEXT: s_and_b32 s1, s1, s6 -; GFX6-NEXT: s_and_b32 s0, s0, s6 +; GFX6-NEXT: s_and_b32 s1, s1, 0xff +; GFX6-NEXT: s_and_b32 s0, s0, 0xff ; GFX6-NEXT: s_lshl_b32 s1, s1, 8 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: ; return to shader part epilog @@ -552,28 +549,27 @@ ; GFX8-LABEL: s_fshr_v2i8: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s3, s0, 8 +; GFX8-NEXT: s_lshr_b32 s4, s1, 8 ; GFX8-NEXT: s_lshr_b32 s5, s2, 8 ; GFX8-NEXT: s_and_b32 s6, s2, 7 ; GFX8-NEXT: s_andn2_b32 s2, 7, s2 ; GFX8-NEXT: s_lshl_b32 s0, s0, 1 +; GFX8-NEXT: s_and_b32 s1, s1, 0xff ; GFX8-NEXT: s_lshl_b32 s0, s0, s2 -; GFX8-NEXT: s_movk_i32 s2, 0xff -; GFX8-NEXT: s_lshr_b32 s4, s1, 8 -; GFX8-NEXT: s_and_b32 s1, s1, s2 ; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX8-NEXT: s_andn2_b32 s2, 7, s5 +; GFX8-NEXT: s_lshl_b32 s3, s3, 1 ; GFX8-NEXT: s_lshr_b32 s1, s1, s6 -; GFX8-NEXT: s_and_b32 s4, s4, s2 +; GFX8-NEXT: s_lshl_b32 s2, s3, s2 +; GFX8-NEXT: s_and_b32 s3, s4, 0xff ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: s_and_b32 s1, s5, 7 -; GFX8-NEXT: s_andn2_b32 s5, 7, s5 -; GFX8-NEXT: s_lshl_b32 s3, s3, 1 -; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 -; 
GFX8-NEXT: s_lshl_b32 s3, s3, s5 -; GFX8-NEXT: s_lshr_b32 s1, s4, s1 -; GFX8-NEXT: s_or_b32 s1, s3, s1 -; GFX8-NEXT: s_and_b32 s0, s0, s2 -; GFX8-NEXT: s_and_b32 s1, s1, s2 +; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX8-NEXT: s_lshr_b32 s1, s3, s1 +; GFX8-NEXT: s_or_b32 s1, s2, s1 +; GFX8-NEXT: s_and_b32 s1, s1, 0xff ; GFX8-NEXT: s_bfe_u32 s2, 8, 0x100000 +; GFX8-NEXT: s_and_b32 s0, s0, 0xff ; GFX8-NEXT: s_lshl_b32 s1, s1, s2 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog @@ -581,28 +577,27 @@ ; GFX9-LABEL: s_fshr_v2i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_lshr_b32 s3, s0, 8 +; GFX9-NEXT: s_lshr_b32 s4, s1, 8 ; GFX9-NEXT: s_lshr_b32 s5, s2, 8 ; GFX9-NEXT: s_and_b32 s6, s2, 7 ; GFX9-NEXT: s_andn2_b32 s2, 7, s2 ; GFX9-NEXT: s_lshl_b32 s0, s0, 1 +; GFX9-NEXT: s_and_b32 s1, s1, 0xff ; GFX9-NEXT: s_lshl_b32 s0, s0, s2 -; GFX9-NEXT: s_movk_i32 s2, 0xff -; GFX9-NEXT: s_lshr_b32 s4, s1, 8 -; GFX9-NEXT: s_and_b32 s1, s1, s2 ; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX9-NEXT: s_andn2_b32 s2, 7, s5 +; GFX9-NEXT: s_lshl_b32 s3, s3, 1 ; GFX9-NEXT: s_lshr_b32 s1, s1, s6 -; GFX9-NEXT: s_and_b32 s4, s4, s2 +; GFX9-NEXT: s_lshl_b32 s2, s3, s2 +; GFX9-NEXT: s_and_b32 s3, s4, 0xff ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: s_and_b32 s1, s5, 7 -; GFX9-NEXT: s_andn2_b32 s5, 7, s5 -; GFX9-NEXT: s_lshl_b32 s3, s3, 1 -; GFX9-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX9-NEXT: s_lshl_b32 s3, s3, s5 -; GFX9-NEXT: s_lshr_b32 s1, s4, s1 -; GFX9-NEXT: s_or_b32 s1, s3, s1 -; GFX9-NEXT: s_and_b32 s0, s0, s2 -; GFX9-NEXT: s_and_b32 s1, s1, s2 +; GFX9-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX9-NEXT: s_lshr_b32 s1, s3, s1 +; GFX9-NEXT: s_or_b32 s1, s2, s1 +; GFX9-NEXT: s_and_b32 s1, s1, 0xff ; GFX9-NEXT: s_bfe_u32 s2, 8, 0x100000 +; GFX9-NEXT: s_and_b32 s0, s0, 0xff ; GFX9-NEXT: s_lshl_b32 s1, s1, s2 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog @@ -610,14 +605,13 @@ ; GFX10-LABEL: s_fshr_v2i8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_lshr_b32 s4, 
s1, 8 -; GFX10-NEXT: s_movk_i32 s7, 0xff ; GFX10-NEXT: s_lshr_b32 s3, s0, 8 ; GFX10-NEXT: s_lshr_b32 s5, s2, 8 ; GFX10-NEXT: s_and_b32 s6, s2, 7 ; GFX10-NEXT: s_andn2_b32 s2, 7, s2 ; GFX10-NEXT: s_lshl_b32 s0, s0, 1 -; GFX10-NEXT: s_and_b32 s4, s4, s7 -; GFX10-NEXT: s_and_b32 s1, s1, s7 +; GFX10-NEXT: s_and_b32 s4, s4, 0xff +; GFX10-NEXT: s_and_b32 s1, s1, 0xff ; GFX10-NEXT: s_lshl_b32 s0, s0, s2 ; GFX10-NEXT: s_and_b32 s2, s5, 7 ; GFX10-NEXT: s_andn2_b32 s5, 7, s5 @@ -629,9 +623,9 @@ ; GFX10-NEXT: s_lshr_b32 s1, s1, s6 ; GFX10-NEXT: s_or_b32 s2, s3, s2 ; GFX10-NEXT: s_or_b32 s0, s0, s1 -; GFX10-NEXT: s_and_b32 s1, s2, s7 +; GFX10-NEXT: s_and_b32 s1, s2, 0xff ; GFX10-NEXT: s_bfe_u32 s2, 8, 0x100000 -; GFX10-NEXT: s_and_b32 s0, s0, s7 +; GFX10-NEXT: s_and_b32 s0, s0, 0xff ; GFX10-NEXT: s_lshl_b32 s1, s1, s2 ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: ; return to shader part epilog @@ -729,18 +723,18 @@ ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; GFX10-NEXT: s_movk_i32 s4, 0xff ; GFX10-NEXT: v_and_b32_e32 v7, 7, v2 -; GFX10-NEXT: v_xor_b32_e32 v6, -1, v3 ; GFX10-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX10-NEXT: v_xor_b32_e32 v6, -1, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 7, v3 ; GFX10-NEXT: v_lshlrev_b16 v4, 1, v4 -; GFX10-NEXT: v_and_b32_e32 v5, s4, v5 -; GFX10-NEXT: v_and_b32_e32 v6, 7, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xff, v5 ; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0 -; GFX10-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX10-NEXT: v_and_b32_e32 v6, 7, v6 +; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX10-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX10-NEXT: v_lshrrev_b16 v3, v3, v5 +; GFX10-NEXT: s_movk_i32 s4, 0xff ; GFX10-NEXT: v_lshlrev_b16 v4, v6, v4 ; GFX10-NEXT: v_lshrrev_b16 v1, v7, v1 ; GFX10-NEXT: v_lshlrev_b16 v0, v2, v0 @@ -768,10 +762,9 @@ ; GFX6-NEXT: s_lshr_b32 s9, s2, 24 ; GFX6-NEXT: s_and_b32 s10, s2, 7 ; GFX6-NEXT: s_andn2_b32 s2, 7, s2 -; GFX6-NEXT: s_movk_i32 s11, 0xff ; GFX6-NEXT: 
s_lshl_b32 s0, s0, 1 ; GFX6-NEXT: s_lshl_b32 s0, s0, s2 -; GFX6-NEXT: s_and_b32 s2, s1, s11 +; GFX6-NEXT: s_and_b32 s2, s1, 0xff ; GFX6-NEXT: s_lshr_b32 s2, s2, s10 ; GFX6-NEXT: s_or_b32 s0, s0, s2 ; GFX6-NEXT: s_and_b32 s2, s7, 7 @@ -792,24 +785,23 @@ ; GFX6-NEXT: s_and_b32 s3, s9, 7 ; GFX6-NEXT: s_andn2_b32 s4, 7, s9 ; GFX6-NEXT: s_lshl_b32 s5, s5, 1 -; GFX6-NEXT: s_and_b32 s2, s2, s11 +; GFX6-NEXT: s_and_b32 s2, s2, 0xff ; GFX6-NEXT: s_lshl_b32 s4, s5, s4 ; GFX6-NEXT: s_lshr_b32 s3, s6, s3 -; GFX6-NEXT: s_and_b32 s0, s0, s11 +; GFX6-NEXT: s_and_b32 s0, s0, 0xff ; GFX6-NEXT: s_lshl_b32 s2, s2, 8 -; GFX6-NEXT: s_and_b32 s1, s1, s11 +; GFX6-NEXT: s_and_b32 s1, s1, 0xff ; GFX6-NEXT: s_or_b32 s3, s4, s3 ; GFX6-NEXT: s_or_b32 s0, s0, s2 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s3, s11 +; GFX6-NEXT: s_and_b32 s1, s3, 0xff ; GFX6-NEXT: s_lshl_b32 s1, s1, 24 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_fshr_v4i8: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_movk_i32 s13, 0xff ; GFX8-NEXT: s_lshr_b32 s3, s0, 8 ; GFX8-NEXT: s_lshr_b32 s4, s0, 16 ; GFX8-NEXT: s_lshr_b32 s5, s0, 24 @@ -822,14 +814,14 @@ ; GFX8-NEXT: s_and_b32 s12, s2, 7 ; GFX8-NEXT: s_andn2_b32 s2, 7, s2 ; GFX8-NEXT: s_lshl_b32 s0, s0, 1 -; GFX8-NEXT: s_and_b32 s1, s1, s13 +; GFX8-NEXT: s_and_b32 s1, s1, 0xff ; GFX8-NEXT: s_lshl_b32 s0, s0, s2 ; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX8-NEXT: s_andn2_b32 s2, 7, s9 ; GFX8-NEXT: s_lshl_b32 s3, s3, 1 ; GFX8-NEXT: s_lshr_b32 s1, s1, s12 ; GFX8-NEXT: s_lshl_b32 s2, s3, s2 -; GFX8-NEXT: s_and_b32 s3, s6, s13 +; GFX8-NEXT: s_and_b32 s3, s6, 0xff ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: s_and_b32 s1, s9, 7 ; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 @@ -837,33 +829,32 @@ ; GFX8-NEXT: s_andn2_b32 s3, 7, s10 ; GFX8-NEXT: s_lshl_b32 s4, s4, 1 ; GFX8-NEXT: s_lshl_b32 s3, s4, s3 -; GFX8-NEXT: s_and_b32 s4, s7, s13 +; GFX8-NEXT: s_and_b32 s4, s7, 0xff ; GFX8-NEXT: s_or_b32 s1, 
s2, s1 ; GFX8-NEXT: s_and_b32 s2, s10, 7 ; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 ; GFX8-NEXT: s_lshr_b32 s2, s4, s2 -; GFX8-NEXT: s_and_b32 s1, s1, s13 +; GFX8-NEXT: s_and_b32 s1, s1, 0xff ; GFX8-NEXT: s_or_b32 s2, s3, s2 ; GFX8-NEXT: s_and_b32 s3, s11, 7 ; GFX8-NEXT: s_andn2_b32 s4, 7, s11 ; GFX8-NEXT: s_lshl_b32 s5, s5, 1 -; GFX8-NEXT: s_and_b32 s0, s0, s13 +; GFX8-NEXT: s_and_b32 s0, s0, 0xff ; GFX8-NEXT: s_lshl_b32 s1, s1, 8 ; GFX8-NEXT: s_lshl_b32 s4, s5, s4 ; GFX8-NEXT: s_lshr_b32 s3, s8, s3 ; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s1, s2, s13 +; GFX8-NEXT: s_and_b32 s1, s2, 0xff ; GFX8-NEXT: s_or_b32 s3, s4, s3 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s1, s3, s13 +; GFX8-NEXT: s_and_b32 s1, s3, 0xff ; GFX8-NEXT: s_lshl_b32 s1, s1, 24 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_fshr_v4i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_movk_i32 s13, 0xff ; GFX9-NEXT: s_lshr_b32 s3, s0, 8 ; GFX9-NEXT: s_lshr_b32 s4, s0, 16 ; GFX9-NEXT: s_lshr_b32 s5, s0, 24 @@ -876,14 +867,14 @@ ; GFX9-NEXT: s_and_b32 s12, s2, 7 ; GFX9-NEXT: s_andn2_b32 s2, 7, s2 ; GFX9-NEXT: s_lshl_b32 s0, s0, 1 -; GFX9-NEXT: s_and_b32 s1, s1, s13 +; GFX9-NEXT: s_and_b32 s1, s1, 0xff ; GFX9-NEXT: s_lshl_b32 s0, s0, s2 ; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX9-NEXT: s_andn2_b32 s2, 7, s9 ; GFX9-NEXT: s_lshl_b32 s3, s3, 1 ; GFX9-NEXT: s_lshr_b32 s1, s1, s12 ; GFX9-NEXT: s_lshl_b32 s2, s3, s2 -; GFX9-NEXT: s_and_b32 s3, s6, s13 +; GFX9-NEXT: s_and_b32 s3, s6, 0xff ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: s_and_b32 s1, s9, 7 ; GFX9-NEXT: s_bfe_u32 s3, s3, 0x100000 @@ -891,26 +882,26 @@ ; GFX9-NEXT: s_andn2_b32 s3, 7, s10 ; GFX9-NEXT: s_lshl_b32 s4, s4, 1 ; GFX9-NEXT: s_lshl_b32 s3, s4, s3 -; GFX9-NEXT: s_and_b32 s4, s7, s13 +; GFX9-NEXT: s_and_b32 s4, s7, 0xff ; GFX9-NEXT: s_or_b32 s1, s2, s1 ; GFX9-NEXT: s_and_b32 s2, s10, 7 ; GFX9-NEXT: s_bfe_u32 s4, s4, 0x100000 ; GFX9-NEXT: 
s_lshr_b32 s2, s4, s2 -; GFX9-NEXT: s_and_b32 s1, s1, s13 +; GFX9-NEXT: s_and_b32 s1, s1, 0xff ; GFX9-NEXT: s_or_b32 s2, s3, s2 ; GFX9-NEXT: s_and_b32 s3, s11, 7 ; GFX9-NEXT: s_andn2_b32 s4, 7, s11 ; GFX9-NEXT: s_lshl_b32 s5, s5, 1 -; GFX9-NEXT: s_and_b32 s0, s0, s13 +; GFX9-NEXT: s_and_b32 s0, s0, 0xff ; GFX9-NEXT: s_lshl_b32 s1, s1, 8 ; GFX9-NEXT: s_lshl_b32 s4, s5, s4 ; GFX9-NEXT: s_lshr_b32 s3, s8, s3 ; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: s_and_b32 s1, s2, s13 +; GFX9-NEXT: s_and_b32 s1, s2, 0xff ; GFX9-NEXT: s_or_b32 s3, s4, s3 ; GFX9-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: s_and_b32 s1, s3, s13 +; GFX9-NEXT: s_and_b32 s1, s3, 0xff ; GFX9-NEXT: s_lshl_b32 s1, s1, 24 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog @@ -918,7 +909,6 @@ ; GFX10-LABEL: s_fshr_v4i8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_lshr_b32 s6, s1, 8 -; GFX10-NEXT: s_movk_i32 s13, 0xff ; GFX10-NEXT: s_lshr_b32 s3, s0, 8 ; GFX10-NEXT: s_lshr_b32 s4, s0, 16 ; GFX10-NEXT: s_lshr_b32 s5, s0, 24 @@ -929,9 +919,9 @@ ; GFX10-NEXT: s_lshr_b32 s11, s2, 24 ; GFX10-NEXT: s_and_b32 s12, s2, 7 ; GFX10-NEXT: s_andn2_b32 s2, 7, s2 -; GFX10-NEXT: s_and_b32 s1, s1, s13 +; GFX10-NEXT: s_and_b32 s1, s1, 0xff ; GFX10-NEXT: s_lshl_b32 s0, s0, 1 -; GFX10-NEXT: s_and_b32 s6, s6, s13 +; GFX10-NEXT: s_and_b32 s6, s6, 0xff ; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX10-NEXT: s_lshl_b32 s0, s0, s2 ; GFX10-NEXT: s_and_b32 s2, s9, 7 @@ -941,7 +931,7 @@ ; GFX10-NEXT: s_lshr_b32 s1, s1, s12 ; GFX10-NEXT: s_lshl_b32 s3, s3, s9 ; GFX10-NEXT: s_lshr_b32 s2, s6, s2 -; GFX10-NEXT: s_and_b32 s6, s7, s13 +; GFX10-NEXT: s_and_b32 s6, s7, 0xff ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: s_or_b32 s1, s3, s2 ; GFX10-NEXT: s_and_b32 s2, s10, 7 @@ -956,14 +946,14 @@ ; GFX10-NEXT: s_lshl_b32 s4, s5, s4 ; GFX10-NEXT: s_lshr_b32 s5, s8, s6 ; GFX10-NEXT: s_or_b32 s2, s3, s2 -; GFX10-NEXT: s_and_b32 s1, s1, s13 +; GFX10-NEXT: s_and_b32 s1, s1, 0xff ; GFX10-NEXT: 
s_or_b32 s3, s4, s5 -; GFX10-NEXT: s_and_b32 s0, s0, s13 +; GFX10-NEXT: s_and_b32 s0, s0, 0xff ; GFX10-NEXT: s_lshl_b32 s1, s1, 8 -; GFX10-NEXT: s_and_b32 s2, s2, s13 +; GFX10-NEXT: s_and_b32 s2, s2, 0xff ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: s_lshl_b32 s1, s2, 16 -; GFX10-NEXT: s_and_b32 s2, s3, s13 +; GFX10-NEXT: s_and_b32 s2, s3, 0xff ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: s_lshl_b32 s1, s2, 24 ; GFX10-NEXT: s_or_b32 s0, s0, s1 @@ -1136,52 +1126,51 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshrrev_b32_e32 v6, 8, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v0 -; GFX10-NEXT: v_xor_b32_e32 v8, -1, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; GFX10-NEXT: v_xor_b32_e32 v7, -1, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v12, 24, v2 -; GFX10-NEXT: v_xor_b32_e32 v11, -1, v6 +; GFX10-NEXT: v_xor_b32_e32 v10, -1, v5 ; GFX10-NEXT: v_lshlrev_b16 v3, 1, v3 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v7, 8, v1 -; GFX10-NEXT: v_and_b32_e32 v11, 7, v11 -; GFX10-NEXT: v_and_b32_e32 v8, 7, v8 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v8, 8, v1 +; GFX10-NEXT: v_and_b32_e32 v10, 7, v10 +; GFX10-NEXT: v_and_b32_e32 v7, 7, v7 ; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0 ; GFX10-NEXT: v_mov_b32_e32 v13, 0xff ; GFX10-NEXT: v_xor_b32_e32 v14, -1, v12 -; GFX10-NEXT: v_lshlrev_b16 v3, v11, v3 -; GFX10-NEXT: v_xor_b32_e32 v11, -1, v10 -; GFX10-NEXT: s_movk_i32 s4, 0xff +; GFX10-NEXT: v_lshlrev_b16 v3, v10, v3 +; GFX10-NEXT: v_xor_b32_e32 v10, -1, v11 ; GFX10-NEXT: v_lshrrev_b32_e32 v9, 24, v1 -; GFX10-NEXT: v_lshlrev_b16 v0, v8, v0 -; GFX10-NEXT: v_and_b32_e32 v8, s4, v1 -; GFX10-NEXT: v_and_b32_e32 v6, 7, v6 -; GFX10-NEXT: v_and_b32_e32 v7, s4, v7 -; GFX10-NEXT: v_and_b32_e32 v10, 7, v10 +; 
GFX10-NEXT: v_lshlrev_b16 v0, v7, v0 +; GFX10-NEXT: v_and_b32_e32 v7, 0xff, v1 +; GFX10-NEXT: v_and_b32_e32 v5, 7, v5 +; GFX10-NEXT: v_and_b32_e32 v8, 0xff, v8 ; GFX10-NEXT: v_and_b32_e32 v11, 7, v11 +; GFX10-NEXT: v_and_b32_e32 v10, 7, v10 ; GFX10-NEXT: v_lshlrev_b16 v4, 1, v4 ; GFX10-NEXT: v_and_b32_sdwa v1, v1, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_and_b32_e32 v13, 7, v14 -; GFX10-NEXT: v_lshlrev_b16 v5, 1, v5 +; GFX10-NEXT: v_lshlrev_b16 v6, 1, v6 ; GFX10-NEXT: v_and_b32_e32 v12, 7, v12 ; GFX10-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX10-NEXT: v_lshrrev_b16 v6, v6, v7 -; GFX10-NEXT: v_lshlrev_b16 v4, v11, v4 -; GFX10-NEXT: v_lshrrev_b16 v1, v10, v1 -; GFX10-NEXT: v_lshlrev_b16 v5, v13, v5 -; GFX10-NEXT: v_lshrrev_b16 v7, v12, v9 -; GFX10-NEXT: v_lshrrev_b16 v2, v2, v8 -; GFX10-NEXT: v_or_b32_e32 v3, v3, v6 -; GFX10-NEXT: v_mov_b32_e32 v6, 8 +; GFX10-NEXT: v_lshrrev_b16 v5, v5, v8 +; GFX10-NEXT: v_lshlrev_b16 v4, v10, v4 +; GFX10-NEXT: v_lshrrev_b16 v1, v11, v1 +; GFX10-NEXT: v_lshlrev_b16 v6, v13, v6 +; GFX10-NEXT: v_lshrrev_b16 v8, v12, v9 +; GFX10-NEXT: v_lshrrev_b16 v2, v2, v7 +; GFX10-NEXT: v_or_b32_e32 v3, v3, v5 +; GFX10-NEXT: v_mov_b32_e32 v5, 8 ; GFX10-NEXT: v_or_b32_e32 v1, v4, v1 -; GFX10-NEXT: v_or_b32_e32 v4, v5, v7 +; GFX10-NEXT: v_or_b32_e32 v4, v6, v8 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX10-NEXT: v_and_b32_e32 v3, s4, v4 -; GFX10-NEXT: v_and_or_b32 v0, v0, s4, v2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v3 ; GFX10-NEXT: v_or3_b32 v0, v0, v1, v2 @@ -1200,12 +1189,12 @@ ; 
GFX6-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_mov_b32_e32 v1, 0xffffffe8 +; GFX6-NEXT: s_and_b32 s2, s2, 0xffffff ; GFX6-NEXT: s_mov_b32 s3, 0xffffff -; GFX6-NEXT: s_and_b32 s2, s2, s3 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: s_lshl_b32 s0, s0, 1 -; GFX6-NEXT: s_and_b32 s1, s1, s3 +; GFX6-NEXT: s_and_b32 s1, s1, 0xffffff ; GFX6-NEXT: v_mul_lo_u32 v1, v1, v0 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 @@ -1232,12 +1221,12 @@ ; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0xffffffe8 +; GFX8-NEXT: s_and_b32 s2, s2, 0xffffff ; GFX8-NEXT: s_mov_b32 s3, 0xffffff -; GFX8-NEXT: s_and_b32 s2, s2, s3 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: s_lshl_b32 s0, s0, 1 -; GFX8-NEXT: s_and_b32 s1, s1, s3 +; GFX8-NEXT: s_and_b32 s1, s1, 0xffffff ; GFX8-NEXT: v_mul_lo_u32 v1, v1, v0 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 @@ -1264,11 +1253,11 @@ ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffffffe8 +; GFX9-NEXT: s_and_b32 s2, s2, 0xffffff ; GFX9-NEXT: s_mov_b32 s3, 0xffffff -; GFX9-NEXT: s_and_b32 s2, s2, s3 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: s_and_b32 s1, s1, s3 +; GFX9-NEXT: s_and_b32 s1, s1, 0xffffff ; GFX9-NEXT: s_lshl_b32 s0, s0, 1 ; GFX9-NEXT: v_mul_lo_u32 v1, v1, v0 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 @@ -1293,10 +1282,9 @@ ; GFX10-LABEL: s_fshr_i24: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 -; GFX10-NEXT: s_mov_b32 s3, 0xffffff +; GFX10-NEXT: s_and_b32 s2, s2, 0xffffff +; GFX10-NEXT: s_and_b32 s1, s1, 0xffffff ; GFX10-NEXT: s_lshl_b32 s0, s0, 1 -; GFX10-NEXT: s_and_b32 s2, s2, s3 -; GFX10-NEXT: s_and_b32 s1, s1, s3 ; GFX10-NEXT: 
v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -1313,8 +1301,8 @@ ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX10-NEXT: v_sub_nc_u32_e32 v1, 23, v0 -; GFX10-NEXT: v_and_b32_e32 v0, s3, v0 -; GFX10-NEXT: v_and_b32_e32 v1, s3, v1 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffffff, v1 ; GFX10-NEXT: v_lshrrev_b32_e64 v0, v0, s1 ; GFX10-NEXT: v_lshl_or_b32 v0, s0, v1, v0 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 @@ -1425,6 +1413,7 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v3, 24 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffffff, v2 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffffff, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v3, v3 ; GFX10-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 @@ -1432,9 +1421,7 @@ ; GFX10-NEXT: v_mul_lo_u32 v4, 0xffffffe8, v3 ; GFX10-NEXT: v_mul_hi_u32 v4, v3, v4 ; GFX10-NEXT: v_add_nc_u32_e32 v3, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, 0xffffff ; GFX10-NEXT: v_mul_hi_u32 v3, v2, v3 -; GFX10-NEXT: v_and_b32_e32 v1, v1, v4 ; GFX10-NEXT: v_mul_lo_u32 v3, v3, 24 ; GFX10-NEXT: v_sub_nc_u32_e32 v2, v2, v3 ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 24, v2 @@ -1444,8 +1431,8 @@ ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v2 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo ; GFX10-NEXT: v_sub_nc_u32_e32 v3, 23, v2 -; GFX10-NEXT: v_and_b32_e32 v2, v2, v4 -; GFX10-NEXT: v_and_b32_e32 v3, v3, v4 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffffff, v2 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffffff, v3 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, v2, v1 ; GFX10-NEXT: v_lshl_or_b32 v0, v0, v3, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -1458,38 +1445,36 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX6-NEXT: s_movk_i32 s9, 0xff -; GFX6-NEXT: s_mov_b32 s11, 0x80008 ; GFX6-NEXT: s_lshr_b32 s6, s0, 16 -; GFX6-NEXT: 
v_mul_f32_e32 v0, 0x4f7ffffe, v0 -; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: s_lshr_b32 s7, s0, 24 ; GFX6-NEXT: s_lshr_b32 s8, s1, 8 -; GFX6-NEXT: s_and_b32 s10, s0, s9 -; GFX6-NEXT: s_bfe_u32 s0, s0, s11 -; GFX6-NEXT: s_and_b32 s1, s1, s9 +; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX6-NEXT: s_and_b32 s10, s0, 0xff +; GFX6-NEXT: s_bfe_u32 s0, s0, 0x80008 +; GFX6-NEXT: s_and_b32 s1, s1, 0xff ; GFX6-NEXT: s_lshl_b32 s0, s0, 8 ; GFX6-NEXT: s_lshl_b32 s1, s1, 8 ; GFX6-NEXT: v_mov_b32_e32 v1, 0xffffffe8 ; GFX6-NEXT: s_or_b32 s0, s10, s0 ; GFX6-NEXT: s_or_b32 s1, s7, s1 -; GFX6-NEXT: s_and_b32 s7, s8, s9 +; GFX6-NEXT: s_and_b32 s7, s8, 0xff ; GFX6-NEXT: s_lshr_b32 s8, s2, 16 ; GFX6-NEXT: s_lshr_b32 s10, s2, 24 -; GFX6-NEXT: s_and_b32 s13, s2, s9 -; GFX6-NEXT: s_bfe_u32 s2, s2, s11 +; GFX6-NEXT: s_and_b32 s12, s2, 0xff +; GFX6-NEXT: s_bfe_u32 s2, s2, 0x80008 ; GFX6-NEXT: v_mul_lo_u32 v2, v1, v0 ; GFX6-NEXT: s_lshl_b32 s2, s2, 8 -; GFX6-NEXT: s_and_b32 s8, s8, s9 -; GFX6-NEXT: s_or_b32 s2, s13, s2 +; GFX6-NEXT: s_and_b32 s8, s8, 0xff +; GFX6-NEXT: s_or_b32 s2, s12, s2 ; GFX6-NEXT: s_bfe_u32 s8, s8, 0x100000 -; GFX6-NEXT: s_lshr_b32 s12, s3, 8 +; GFX6-NEXT: s_lshr_b32 s11, s3, 8 ; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000 ; GFX6-NEXT: s_lshl_b32 s8, s8, 16 -; GFX6-NEXT: s_and_b32 s3, s3, s9 +; GFX6-NEXT: s_and_b32 s3, s3, 0xff ; GFX6-NEXT: s_or_b32 s2, s2, s8 ; GFX6-NEXT: s_lshl_b32 s3, s3, 8 -; GFX6-NEXT: s_and_b32 s8, s12, s9 +; GFX6-NEXT: s_and_b32 s8, s11, 0xff ; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX6-NEXT: s_or_b32 s3, s10, s3 ; GFX6-NEXT: s_bfe_u32 s8, s8, 0x100000 @@ -1498,13 +1483,13 @@ ; GFX6-NEXT: s_or_b32 s3, s3, s8 ; GFX6-NEXT: s_lshr_b32 s8, s4, 16 ; GFX6-NEXT: s_lshr_b32 s10, s4, 24 -; GFX6-NEXT: s_and_b32 s13, s4, s9 -; GFX6-NEXT: s_bfe_u32 s4, s4, s11 +; GFX6-NEXT: s_and_b32 s12, s4, 0xff +; GFX6-NEXT: s_bfe_u32 s4, s4, 0x80008 ; GFX6-NEXT: s_lshl_b32 s4, s4, 8 -; GFX6-NEXT: s_and_b32 s8, s8, s9 +; 
GFX6-NEXT: s_and_b32 s8, s8, 0xff ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, 24 -; GFX6-NEXT: s_or_b32 s4, s13, s4 +; GFX6-NEXT: s_or_b32 s4, s12, s4 ; GFX6-NEXT: s_bfe_u32 s8, s8, 0x100000 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000 @@ -1513,16 +1498,16 @@ ; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX6-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX6-NEXT: s_lshr_b32 s12, s5, 8 +; GFX6-NEXT: s_lshr_b32 s11, s5, 8 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, 24 -; GFX6-NEXT: s_and_b32 s5, s5, s9 +; GFX6-NEXT: s_and_b32 s5, s5, 0xff ; GFX6-NEXT: v_mul_lo_u32 v1, v1, v2 ; GFX6-NEXT: s_lshl_b32 s5, s5, 8 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 ; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 24, v0 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX6-NEXT: v_mul_hi_u32 v1, v2, v1 -; GFX6-NEXT: s_and_b32 s8, s12, s9 +; GFX6-NEXT: s_and_b32 s8, s11, 0xff ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX6-NEXT: s_or_b32 s5, s10, s5 ; GFX6-NEXT: s_bfe_u32 s8, s8, 0x100000 @@ -1534,7 +1519,7 @@ ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; GFX6-NEXT: v_mul_hi_u32 v1, s5, v1 -; GFX6-NEXT: s_and_b32 s6, s6, s9 +; GFX6-NEXT: s_and_b32 s6, s6, 0xff ; GFX6-NEXT: s_bfe_u32 s0, s0, 0x100000 ; GFX6-NEXT: s_bfe_u32 s6, s6, 0x100000 ; GFX6-NEXT: v_mul_lo_u32 v1, v1, 24 @@ -1564,6 +1549,7 @@ ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: v_and_b32_e32 v2, v2, v4 ; GFX6-NEXT: v_and_b32_e32 v1, v1, v4 +; GFX6-NEXT: s_movk_i32 s9, 0xff ; GFX6-NEXT: v_lshl_b32_e32 v2, s0, v2 ; GFX6-NEXT: v_lshr_b32_e32 v1, s3, v1 ; GFX6-NEXT: v_bfe_u32 v3, v0, 8, 8 @@ -1589,55 +1575,54 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX8-NEXT: s_movk_i32 s10, 0xff ; GFX8-NEXT: s_lshr_b32 s9, s1, 8 -; GFX8-NEXT: s_bfe_u32 s11, 8, 0x100000 -; GFX8-NEXT: s_and_b32 s1, s1, s10 +; GFX8-NEXT: s_bfe_u32 s10, 8, 0x100000 +; 
GFX8-NEXT: s_and_b32 s1, s1, 0xff ; GFX8-NEXT: s_lshr_b32 s6, s0, 8 ; GFX8-NEXT: s_lshr_b32 s8, s0, 24 -; GFX8-NEXT: s_lshl_b32 s1, s1, s11 +; GFX8-NEXT: s_lshl_b32 s1, s1, s10 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 -; GFX8-NEXT: s_and_b32 s6, s6, s10 +; GFX8-NEXT: s_and_b32 s6, s6, 0xff ; GFX8-NEXT: s_or_b32 s1, s8, s1 ; GFX8-NEXT: s_lshr_b32 s8, s2, 8 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: s_lshr_b32 s7, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, s10 -; GFX8-NEXT: s_lshl_b32 s6, s6, s11 -; GFX8-NEXT: s_and_b32 s8, s8, s10 +; GFX8-NEXT: s_and_b32 s0, s0, 0xff +; GFX8-NEXT: s_lshl_b32 s6, s6, s10 +; GFX8-NEXT: s_and_b32 s8, s8, 0xff ; GFX8-NEXT: s_or_b32 s0, s0, s6 -; GFX8-NEXT: s_and_b32 s6, s7, s10 -; GFX8-NEXT: s_and_b32 s7, s9, s10 +; GFX8-NEXT: s_and_b32 s6, s7, 0xff +; GFX8-NEXT: s_and_b32 s7, s9, 0xff ; GFX8-NEXT: s_lshr_b32 s9, s2, 16 -; GFX8-NEXT: s_lshr_b32 s12, s2, 24 -; GFX8-NEXT: s_and_b32 s2, s2, s10 -; GFX8-NEXT: s_lshl_b32 s8, s8, s11 +; GFX8-NEXT: s_lshr_b32 s11, s2, 24 +; GFX8-NEXT: s_and_b32 s2, s2, 0xff +; GFX8-NEXT: s_lshl_b32 s8, s8, s10 ; GFX8-NEXT: s_or_b32 s2, s2, s8 -; GFX8-NEXT: s_and_b32 s8, s9, s10 +; GFX8-NEXT: s_and_b32 s8, s9, 0xff ; GFX8-NEXT: v_mov_b32_e32 v1, 0xffffffe8 ; GFX8-NEXT: s_bfe_u32 s8, s8, 0x100000 ; GFX8-NEXT: v_mul_lo_u32 v2, v1, v0 -; GFX8-NEXT: s_lshr_b32 s13, s3, 8 +; GFX8-NEXT: s_lshr_b32 s12, s3, 8 ; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 ; GFX8-NEXT: s_lshl_b32 s8, s8, 16 -; GFX8-NEXT: s_and_b32 s3, s3, s10 +; GFX8-NEXT: s_and_b32 s3, s3, 0xff ; GFX8-NEXT: s_or_b32 s2, s2, s8 -; GFX8-NEXT: s_lshl_b32 s3, s3, s11 -; GFX8-NEXT: s_and_b32 s8, s13, s10 -; GFX8-NEXT: s_or_b32 s3, s12, s3 +; GFX8-NEXT: s_lshl_b32 s3, s3, s10 +; GFX8-NEXT: s_and_b32 s8, s12, 0xff +; GFX8-NEXT: s_or_b32 s3, s11, s3 ; GFX8-NEXT: s_bfe_u32 s8, s8, 0x100000 ; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 ; GFX8-NEXT: s_lshl_b32 s8, s8, 16 ; GFX8-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX8-NEXT: s_or_b32 s3, s3, s8 ; GFX8-NEXT: s_lshr_b32 
s8, s4, 8 -; GFX8-NEXT: s_and_b32 s8, s8, s10 +; GFX8-NEXT: s_and_b32 s8, s8, 0xff ; GFX8-NEXT: s_lshr_b32 s9, s4, 16 -; GFX8-NEXT: s_lshr_b32 s12, s4, 24 -; GFX8-NEXT: s_and_b32 s4, s4, s10 -; GFX8-NEXT: s_lshl_b32 s8, s8, s11 +; GFX8-NEXT: s_lshr_b32 s11, s4, 24 +; GFX8-NEXT: s_and_b32 s4, s4, 0xff +; GFX8-NEXT: s_lshl_b32 s8, s8, s10 ; GFX8-NEXT: s_or_b32 s4, s4, s8 -; GFX8-NEXT: s_and_b32 s8, s9, s10 +; GFX8-NEXT: s_and_b32 s8, s9, 0xff ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v2, 24 ; GFX8-NEXT: s_bfe_u32 s8, s8, 0x100000 @@ -1648,18 +1633,18 @@ ; GFX8-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX8-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GFX8-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX8-NEXT: s_lshr_b32 s13, s5, 8 +; GFX8-NEXT: s_lshr_b32 s12, s5, 8 ; GFX8-NEXT: v_mul_lo_u32 v0, v0, 24 -; GFX8-NEXT: s_and_b32 s5, s5, s10 +; GFX8-NEXT: s_and_b32 s5, s5, 0xff ; GFX8-NEXT: v_mul_lo_u32 v1, v1, v2 -; GFX8-NEXT: s_lshl_b32 s5, s5, s11 +; GFX8-NEXT: s_lshl_b32 s5, s5, s10 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0 ; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, 24, v0 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX8-NEXT: v_mul_hi_u32 v1, v2, v1 -; GFX8-NEXT: s_and_b32 s8, s13, s10 +; GFX8-NEXT: s_and_b32 s8, s12, 0xff ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX8-NEXT: s_or_b32 s5, s12, s5 +; GFX8-NEXT: s_or_b32 s5, s11, s5 ; GFX8-NEXT: s_bfe_u32 s8, s8, 0x100000 ; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, 24, v0 ; GFX8-NEXT: s_bfe_u32 s5, s5, 0x100000 @@ -1707,7 +1692,7 @@ ; GFX8-NEXT: v_or_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX8-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX8-NEXT: v_and_b32_e32 v3, s10, v1 +; GFX8-NEXT: v_and_b32_e32 v3, 0xff, v1 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 
; GFX8-NEXT: v_or_b32_e32 v0, v0, v3 @@ -1721,45 +1706,44 @@ ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffffffe8 -; GFX9-NEXT: s_movk_i32 s12, 0xff ; GFX9-NEXT: s_lshr_b32 s11, s1, 8 +; GFX9-NEXT: s_bfe_u32 s12, 8, 0x100000 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: s_bfe_u32 s13, 8, 0x100000 -; GFX9-NEXT: s_and_b32 s1, s1, s12 +; GFX9-NEXT: s_and_b32 s1, s1, 0xff ; GFX9-NEXT: s_lshr_b32 s7, s0, 8 -; GFX9-NEXT: v_mul_lo_u32 v2, v1, v0 ; GFX9-NEXT: s_lshr_b32 s10, s0, 24 -; GFX9-NEXT: s_lshl_b32 s1, s1, s13 -; GFX9-NEXT: s_and_b32 s7, s7, s12 -; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 +; GFX9-NEXT: v_mul_lo_u32 v2, v1, v0 +; GFX9-NEXT: s_lshl_b32 s1, s1, s12 +; GFX9-NEXT: s_and_b32 s7, s7, 0xff ; GFX9-NEXT: s_or_b32 s1, s10, s1 +; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX9-NEXT: s_lshr_b32 s10, s2, 8 ; GFX9-NEXT: s_lshr_b32 s9, s0, 16 -; GFX9-NEXT: s_and_b32 s0, s0, s12 -; GFX9-NEXT: s_lshl_b32 s7, s7, s13 -; GFX9-NEXT: s_and_b32 s10, s10, s12 +; GFX9-NEXT: s_and_b32 s0, s0, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s7, s12 +; GFX9-NEXT: s_and_b32 s10, s10, 0xff ; GFX9-NEXT: s_or_b32 s0, s0, s7 -; GFX9-NEXT: s_and_b32 s7, s9, s12 -; GFX9-NEXT: s_and_b32 s9, s11, s12 +; GFX9-NEXT: s_and_b32 s7, s9, 0xff +; GFX9-NEXT: s_and_b32 s9, s11, 0xff ; GFX9-NEXT: s_lshr_b32 s11, s2, 16 -; GFX9-NEXT: s_lshr_b32 s14, s2, 24 -; GFX9-NEXT: s_and_b32 s2, s2, s12 -; GFX9-NEXT: s_lshl_b32 s10, s10, s13 +; GFX9-NEXT: s_lshr_b32 s13, s2, 24 +; GFX9-NEXT: s_and_b32 s2, s2, 0xff +; GFX9-NEXT: s_lshl_b32 s10, s10, s12 ; GFX9-NEXT: s_or_b32 s2, s2, s10 -; GFX9-NEXT: s_and_b32 s10, s11, s12 +; GFX9-NEXT: s_and_b32 s10, s11, 0xff ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, 24 ; GFX9-NEXT: s_bfe_u32 s10, s10, 0x100000 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GFX9-NEXT: s_lshr_b32 s15, s3, 8 +; GFX9-NEXT: s_lshr_b32 s14, s3, 8 ; GFX9-NEXT: s_bfe_u32 
s2, s2, 0x100000 ; GFX9-NEXT: s_lshl_b32 s10, s10, 16 -; GFX9-NEXT: s_and_b32 s3, s3, s12 +; GFX9-NEXT: s_and_b32 s3, s3, 0xff ; GFX9-NEXT: s_or_b32 s2, s2, s10 -; GFX9-NEXT: s_lshl_b32 s3, s3, s13 -; GFX9-NEXT: s_and_b32 s10, s15, s12 -; GFX9-NEXT: s_or_b32 s3, s14, s3 +; GFX9-NEXT: s_lshl_b32 s3, s3, s12 +; GFX9-NEXT: s_and_b32 s10, s14, 0xff +; GFX9-NEXT: s_or_b32 s3, s13, s3 ; GFX9-NEXT: s_bfe_u32 s10, s10, 0x100000 ; GFX9-NEXT: s_bfe_u32 s3, s3, 0x100000 ; GFX9-NEXT: s_lshl_b32 s10, s10, 16 @@ -1767,25 +1751,25 @@ ; GFX9-NEXT: s_or_b32 s3, s3, s10 ; GFX9-NEXT: s_lshr_b32 s10, s4, 8 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX9-NEXT: s_and_b32 s10, s10, s12 +; GFX9-NEXT: s_and_b32 s10, s10, 0xff ; GFX9-NEXT: s_lshr_b32 s11, s4, 16 -; GFX9-NEXT: s_lshr_b32 s14, s4, 24 -; GFX9-NEXT: s_and_b32 s4, s4, s12 -; GFX9-NEXT: s_lshl_b32 s10, s10, s13 +; GFX9-NEXT: s_lshr_b32 s13, s4, 24 +; GFX9-NEXT: s_and_b32 s4, s4, 0xff +; GFX9-NEXT: s_lshl_b32 s10, s10, s12 ; GFX9-NEXT: s_or_b32 s4, s4, s10 -; GFX9-NEXT: s_and_b32 s10, s11, s12 +; GFX9-NEXT: s_and_b32 s10, s11, 0xff ; GFX9-NEXT: s_bfe_u32 s10, s10, 0x100000 ; GFX9-NEXT: v_mul_lo_u32 v1, v1, v2 ; GFX9-NEXT: s_bfe_u32 s4, s4, 0x100000 ; GFX9-NEXT: s_lshl_b32 s10, s10, 16 ; GFX9-NEXT: s_or_b32 s4, s4, s10 ; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 -; GFX9-NEXT: s_lshr_b32 s15, s5, 8 -; GFX9-NEXT: s_and_b32 s5, s5, s12 +; GFX9-NEXT: s_lshr_b32 s14, s5, 8 +; GFX9-NEXT: s_and_b32 s5, s5, 0xff ; GFX9-NEXT: v_mul_hi_u32 v1, v2, v1 -; GFX9-NEXT: s_lshl_b32 s5, s5, s13 -; GFX9-NEXT: s_and_b32 s10, s15, s12 -; GFX9-NEXT: s_or_b32 s5, s14, s5 +; GFX9-NEXT: s_lshl_b32 s5, s5, s12 +; GFX9-NEXT: s_and_b32 s10, s14, 0xff +; GFX9-NEXT: s_or_b32 s5, s13, s5 ; GFX9-NEXT: s_bfe_u32 s10, s10, 0x100000 ; GFX9-NEXT: s_bfe_u32 s5, s5, 0x100000 ; GFX9-NEXT: s_lshl_b32 s10, s10, 16 @@ -1831,10 +1815,11 @@ ; GFX9-NEXT: v_lshrrev_b32_e64 v1, v1, s3 ; GFX9-NEXT: s_mov_b32 s6, 8 ; GFX9-NEXT: v_lshl_or_b32 v1, s0, v3, v1 +; GFX9-NEXT: s_movk_i32 s0, 
0xff ; GFX9-NEXT: s_mov_b32 s8, 16 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_and_b32_e32 v3, s12, v1 -; GFX9-NEXT: v_and_or_b32 v2, v0, s12, v2 +; GFX9-NEXT: v_and_b32_e32 v3, s0, v1 +; GFX9-NEXT: v_and_or_b32 v2, v0, s0, v2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; GFX9-NEXT: v_or3_b32 v0, v2, v0, v3 @@ -1849,122 +1834,119 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, 24 -; GFX10-NEXT: s_movk_i32 s9, 0xff -; GFX10-NEXT: s_lshr_b32 s12, s4, 8 +; GFX10-NEXT: s_lshr_b32 s9, s1, 8 ; GFX10-NEXT: s_bfe_u32 s10, 8, 0x100000 +; GFX10-NEXT: s_and_b32 s1, s1, 0xff ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX10-NEXT: s_lshr_b32 s13, s4, 16 -; GFX10-NEXT: s_and_b32 s12, s12, s9 -; GFX10-NEXT: s_lshr_b32 s14, s4, 24 -; GFX10-NEXT: s_and_b32 s4, s4, s9 -; GFX10-NEXT: s_lshl_b32 s12, s12, s10 -; GFX10-NEXT: s_and_b32 s13, s13, s9 -; GFX10-NEXT: s_or_b32 s4, s4, s12 -; GFX10-NEXT: s_bfe_u32 s12, s13, 0x100000 +; GFX10-NEXT: s_lshr_b32 s6, s0, 8 +; GFX10-NEXT: s_lshr_b32 s8, s0, 24 +; GFX10-NEXT: s_lshl_b32 s1, s1, s10 +; GFX10-NEXT: s_and_b32 s6, s6, 0xff +; GFX10-NEXT: s_or_b32 s1, s8, s1 +; GFX10-NEXT: s_lshr_b32 s8, s4, 8 +; GFX10-NEXT: s_lshr_b32 s7, s0, 16 +; GFX10-NEXT: s_and_b32 s0, s0, 0xff ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 -; GFX10-NEXT: s_lshr_b32 s15, s5, 8 -; GFX10-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX10-NEXT: s_lshl_b32 s12, s12, 16 +; GFX10-NEXT: s_lshl_b32 s6, s6, s10 +; GFX10-NEXT: s_and_b32 s8, s8, 0xff +; GFX10-NEXT: s_or_b32 s0, s0, s6 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX10-NEXT: s_and_b32 s5, s5, s9 -; GFX10-NEXT: s_or_b32 s4, s4, s12 -; GFX10-NEXT: s_lshl_b32 
s5, s5, s10 +; GFX10-NEXT: s_and_b32 s6, s7, 0xff +; GFX10-NEXT: s_and_b32 s7, s9, 0xff +; GFX10-NEXT: s_lshr_b32 s9, s4, 16 ; GFX10-NEXT: v_mul_lo_u32 v2, 0xffffffe8, v0 ; GFX10-NEXT: v_mul_lo_u32 v3, 0xffffffe8, v1 -; GFX10-NEXT: s_and_b32 s12, s15, s9 -; GFX10-NEXT: s_or_b32 s5, s14, s5 -; GFX10-NEXT: s_bfe_u32 s12, s12, 0x100000 -; GFX10-NEXT: s_bfe_u32 s5, s5, 0x100000 -; GFX10-NEXT: s_lshl_b32 s12, s12, 16 -; GFX10-NEXT: s_lshr_b32 s11, s1, 8 +; GFX10-NEXT: s_lshr_b32 s11, s4, 24 +; GFX10-NEXT: s_and_b32 s4, s4, 0xff +; GFX10-NEXT: s_lshl_b32 s8, s8, s10 +; GFX10-NEXT: s_lshr_b32 s12, s5, 8 +; GFX10-NEXT: s_or_b32 s4, s4, s8 +; GFX10-NEXT: s_and_b32 s8, s9, 0xff ; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX10-NEXT: s_or_b32 s5, s5, s12 -; GFX10-NEXT: s_and_b32 s1, s1, s9 -; GFX10-NEXT: s_lshr_b32 s6, s0, 8 -; GFX10-NEXT: s_lshr_b32 s8, s0, 24 -; GFX10-NEXT: s_lshl_b32 s1, s1, s10 -; GFX10-NEXT: s_and_b32 s6, s6, s9 -; GFX10-NEXT: s_or_b32 s1, s8, s1 +; GFX10-NEXT: v_mul_hi_u32 v3, v1, v3 +; GFX10-NEXT: s_bfe_u32 s8, s8, 0x100000 +; GFX10-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX10-NEXT: s_and_b32 s5, s5, 0xff +; GFX10-NEXT: s_lshl_b32 s8, s8, 16 +; GFX10-NEXT: s_lshl_b32 s5, s5, s10 +; GFX10-NEXT: s_or_b32 s4, s4, s8 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2 -; GFX10-NEXT: v_mul_hi_u32 v2, v1, v3 -; GFX10-NEXT: s_lshr_b32 s8, s2, 8 -; GFX10-NEXT: s_lshr_b32 s7, s0, 16 -; GFX10-NEXT: s_and_b32 s0, s0, s9 +; GFX10-NEXT: s_and_b32 s8, s12, 0xff +; GFX10-NEXT: s_or_b32 s5, s11, s5 +; GFX10-NEXT: s_bfe_u32 s8, s8, 0x100000 +; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3 ; GFX10-NEXT: v_mul_hi_u32 v0, s4, v0 -; GFX10-NEXT: s_lshl_b32 s6, s6, s10 -; GFX10-NEXT: s_and_b32 s8, s8, s9 -; GFX10-NEXT: s_or_b32 s0, s0, s6 -; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v2 -; GFX10-NEXT: s_and_b32 s6, s7, s9 -; GFX10-NEXT: s_and_b32 s7, s11, s9 -; GFX10-NEXT: s_lshr_b32 s11, s2, 16 -; GFX10-NEXT: v_mul_lo_u32 v0, v0, 24 +; GFX10-NEXT: s_bfe_u32 s5, s5, 0x100000 +; GFX10-NEXT: s_lshl_b32 
s8, s8, 16 +; GFX10-NEXT: s_lshr_b32 s9, s2, 8 +; GFX10-NEXT: s_or_b32 s5, s5, s8 +; GFX10-NEXT: s_lshr_b32 s8, s2, 16 ; GFX10-NEXT: v_mul_hi_u32 v1, s5, v1 -; GFX10-NEXT: s_lshr_b32 s13, s2, 24 -; GFX10-NEXT: s_and_b32 s2, s2, s9 -; GFX10-NEXT: s_lshl_b32 s8, s8, s10 +; GFX10-NEXT: s_and_b32 s9, s9, 0xff +; GFX10-NEXT: v_mul_lo_u32 v0, v0, 24 +; GFX10-NEXT: s_lshr_b32 s11, s2, 24 ; GFX10-NEXT: s_lshr_b32 s12, s3, 8 -; GFX10-NEXT: s_or_b32 s2, s2, s8 -; GFX10-NEXT: s_and_b32 s8, s11, s9 -; GFX10-NEXT: v_sub_nc_u32_e32 v0, s4, v0 +; GFX10-NEXT: s_and_b32 s2, s2, 0xff +; GFX10-NEXT: s_lshl_b32 s9, s9, s10 +; GFX10-NEXT: s_and_b32 s8, s8, 0xff ; GFX10-NEXT: v_mul_lo_u32 v1, v1, 24 +; GFX10-NEXT: s_and_b32 s3, s3, 0xff +; GFX10-NEXT: v_sub_nc_u32_e32 v0, s4, v0 +; GFX10-NEXT: s_or_b32 s2, s2, s9 ; GFX10-NEXT: s_bfe_u32 s4, s8, 0x100000 +; GFX10-NEXT: s_lshl_b32 s3, s3, s10 ; GFX10-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX10-NEXT: s_lshl_b32 s4, s4, 16 ; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 24, v0 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 -; GFX10-NEXT: s_and_b32 s3, s3, s9 -; GFX10-NEXT: s_or_b32 s2, s2, s4 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, s5, v1 -; GFX10-NEXT: s_mov_b32 s4, 0xffffff +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 +; GFX10-NEXT: s_and_b32 s5, s12, 0xff +; GFX10-NEXT: s_lshl_b32 s4, s4, 16 +; GFX10-NEXT: s_or_b32 s3, s11, s3 +; GFX10-NEXT: s_bfe_u32 s5, s5, 0x100000 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo -; GFX10-NEXT: s_lshl_b32 s3, s3, s10 -; GFX10-NEXT: s_and_b32 s5, s12, s9 ; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 24, v1 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 24, v0 -; GFX10-NEXT: s_or_b32 s3, s13, s3 -; GFX10-NEXT: s_bfe_u32 s5, s5, 0x100000 ; GFX10-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX10-NEXT: s_lshl_b32 s5, s5, 16 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 24, v0 +; GFX10-NEXT: s_or_b32 s2, s2, s4 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, 
v0 -; GFX10-NEXT: s_lshl_b32 s5, s5, 16 ; GFX10-NEXT: s_bfe_u32 s0, s0, 0x100000 ; GFX10-NEXT: s_bfe_u32 s6, s6, 0x100000 +; GFX10-NEXT: s_or_b32 s3, s3, s5 ; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 24, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1 -; GFX10-NEXT: s_or_b32 s3, s3, s5 ; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX10-NEXT: s_bfe_u32 s7, s7, 0x100000 +; GFX10-NEXT: s_lshl_b32 s4, s6, 17 ; GFX10-NEXT: v_sub_nc_u32_e32 v3, 23, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v2, 0xffffff -; GFX10-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX10-NEXT: s_lshl_b32 s5, s6, 17 -; GFX10-NEXT: v_and_b32_e32 v3, s4, v3 -; GFX10-NEXT: v_sub_nc_u32_e32 v4, 23, v1 -; GFX10-NEXT: v_and_b32_e32 v1, v1, v2 -; GFX10-NEXT: v_lshrrev_b32_e64 v0, v0, s2 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffffff, v0 ; GFX10-NEXT: s_lshl_b32 s0, s0, 1 +; GFX10-NEXT: s_lshl_b32 s1, s1, 1 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffffff, v3 +; GFX10-NEXT: v_sub_nc_u32_e32 v2, 23, v1 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffffff, v1 +; GFX10-NEXT: v_lshrrev_b32_e64 v0, v0, s2 +; GFX10-NEXT: s_or_b32 s0, s4, s0 ; GFX10-NEXT: s_lshl_b32 s2, s7, 17 -; GFX10-NEXT: v_and_b32_e32 v2, v4, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; GFX10-NEXT: v_lshrrev_b32_e64 v1, v1, s3 -; GFX10-NEXT: s_or_b32 s0, s5, s0 -; GFX10-NEXT: s_lshl_b32 s1, s1, 1 ; GFX10-NEXT: v_lshl_or_b32 v0, s0, v3, v0 ; GFX10-NEXT: s_or_b32 s0, s2, s1 ; GFX10-NEXT: v_lshl_or_b32 v1, s0, v2, v1 ; GFX10-NEXT: s_mov_b32 s0, 8 ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: s_mov_b32 s0, 16 -; GFX10-NEXT: v_and_b32_e32 v3, s9, v1 +; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v1 ; GFX10-NEXT: v_bfe_u32 v4, v1, 8, 8 ; GFX10-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX10-NEXT: v_and_or_b32 v2, v0, s9, v2 +; GFX10-NEXT: v_and_or_b32 v2, v0, 0xff, v2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s0, v0 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; GFX10-NEXT: v_lshl_or_b32 v1, v1, 8, v4 @@ -2156,14 +2138,13 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v6, 24 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v7, 24 -; GFX10-NEXT: v_mov_b32_e32 v10, 0xffffff ; GFX10-NEXT: v_and_b32_e32 v4, 0xffffff, v4 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffffff, v5 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v6, v6 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v7, v7 -; GFX10-NEXT: v_and_b32_e32 v5, v5, v10 -; GFX10-NEXT: v_and_b32_e32 v2, v2, v10 -; GFX10-NEXT: v_and_b32_e32 v3, v3, v10 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffffff, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 1, v1 ; GFX10-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 ; GFX10-NEXT: v_mul_f32_e32 v7, 0x4f7ffffe, v7 @@ -2194,12 +2175,12 @@ ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v5 ; GFX10-NEXT: v_sub_nc_u32_e32 v6, 23, v4 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc_lo -; GFX10-NEXT: v_and_b32_e32 v4, v4, v10 -; GFX10-NEXT: v_and_b32_e32 v6, v6, v10 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffffff, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffffff, v6 ; GFX10-NEXT: v_sub_nc_u32_e32 v7, 23, v5 -; GFX10-NEXT: v_and_b32_e32 v5, v5, v10 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffffff, v5 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v2 -; GFX10-NEXT: v_and_b32_e32 v4, v7, v10 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffffff, v7 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v3 ; GFX10-NEXT: v_lshl_or_b32 v0, v0, v6, v2 ; GFX10-NEXT: v_lshl_or_b32 v1, v1, v4, v3 @@ -3032,25 +3013,24 @@ ; GFX6-NEXT: s_and_b32 s4, s4, 0xffff ; GFX6-NEXT: s_or_b32 s4, s5, s4 ; GFX6-NEXT: s_bfe_u32 s5, 1, 0x100000 -; GFX6-NEXT: s_mov_b32 s6, 0xf0001 ; GFX6-NEXT: s_lshl_b32 s0, s0, s5 -; GFX6-NEXT: s_bfe_u32 s7, s2, s6 -; GFX6-NEXT: s_bfe_u32 s8, 14, 0x100000 +; GFX6-NEXT: s_bfe_u32 s6, s2, 0xf0001 +; GFX6-NEXT: 
s_bfe_u32 s7, 14, 0x100000 ; GFX6-NEXT: s_lshl_b32 s1, s1, s5 -; GFX6-NEXT: s_bfe_u32 s5, s3, s6 -; GFX6-NEXT: s_lshr_b32 s7, s7, s8 -; GFX6-NEXT: s_lshr_b32 s5, s5, s8 +; GFX6-NEXT: s_bfe_u32 s5, s3, 0xf0001 +; GFX6-NEXT: s_lshr_b32 s6, s6, s7 +; GFX6-NEXT: s_lshr_b32 s5, s5, s7 ; GFX6-NEXT: s_xor_b32 s4, s4, -1 -; GFX6-NEXT: s_or_b32 s0, s0, s7 +; GFX6-NEXT: s_or_b32 s0, s0, s6 ; GFX6-NEXT: s_or_b32 s1, s1, s5 ; GFX6-NEXT: s_lshl_b32 s2, s2, 1 ; GFX6-NEXT: s_lshr_b32 s5, s4, 16 -; GFX6-NEXT: s_and_b32 s7, s4, 15 +; GFX6-NEXT: s_and_b32 s6, s4, 15 ; GFX6-NEXT: s_andn2_b32 s4, 15, s4 -; GFX6-NEXT: s_bfe_u32 s7, s7, 0x100000 -; GFX6-NEXT: s_bfe_u32 s2, s2, s6 +; GFX6-NEXT: s_bfe_u32 s6, s6, 0x100000 +; GFX6-NEXT: s_bfe_u32 s2, s2, 0xf0001 ; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX6-NEXT: s_lshl_b32 s0, s0, s7 +; GFX6-NEXT: s_lshl_b32 s0, s0, s6 ; GFX6-NEXT: s_lshr_b32 s2, s2, s4 ; GFX6-NEXT: s_or_b32 s0, s0, s2 ; GFX6-NEXT: s_and_b32 s2, s5, 15 @@ -3058,7 +3038,7 @@ ; GFX6-NEXT: s_andn2_b32 s4, 15, s5 ; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000 ; GFX6-NEXT: s_lshl_b32 s1, s1, s2 -; GFX6-NEXT: s_bfe_u32 s2, s3, s6 +; GFX6-NEXT: s_bfe_u32 s2, s3, 0xf0001 ; GFX6-NEXT: s_bfe_u32 s3, s4, 0x100000 ; GFX6-NEXT: s_lshr_b32 s2, s2, s3 ; GFX6-NEXT: s_or_b32 s1, s1, s2 @@ -3111,45 +3091,43 @@ ; ; GFX9-LABEL: s_fshr_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s3, 0xf000f -; GFX9-NEXT: s_and_b32 s4, s2, s3 -; GFX9-NEXT: s_andn2_b32 s2, s3, s2 -; GFX9-NEXT: s_lshr_b32 s3, s0, 16 +; GFX9-NEXT: s_lshr_b32 s4, s0, 16 ; GFX9-NEXT: s_lshl_b32 s0, s0, 0x10001 -; GFX9-NEXT: s_lshl_b32 s3, s3, 1 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s3 -; GFX9-NEXT: s_lshr_b32 s3, s0, 16 +; GFX9-NEXT: s_lshl_b32 s4, s4, 1 +; GFX9-NEXT: s_and_b32 s3, s2, 0xf000f +; GFX9-NEXT: s_andn2_b32 s2, 0xf000f, s2 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s4 +; GFX9-NEXT: s_lshr_b32 s4, s0, 16 ; GFX9-NEXT: s_lshr_b32 s5, s2, 16 ; GFX9-NEXT: s_lshl_b32 s0, s0, s2 -; GFX9-NEXT: s_lshl_b32 s2, s3, s5 +; GFX9-NEXT: 
s_lshl_b32 s2, s4, s5 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 ; GFX9-NEXT: s_lshr_b32 s2, s1, 16 ; GFX9-NEXT: s_and_b32 s1, s1, 0xffff -; GFX9-NEXT: s_lshr_b32 s3, s4, 16 -; GFX9-NEXT: s_lshr_b32 s1, s1, s4 -; GFX9-NEXT: s_lshr_b32 s2, s2, s3 +; GFX9-NEXT: s_lshr_b32 s4, s3, 16 +; GFX9-NEXT: s_lshr_b32 s1, s1, s3 +; GFX9-NEXT: s_lshr_b32 s2, s2, s4 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s2 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_fshr_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_lshr_b32 s4, s0, 16 -; GFX10-NEXT: s_mov_b32 s3, 0xf000f +; GFX10-NEXT: s_lshr_b32 s3, s0, 16 ; GFX10-NEXT: s_lshl_b32 s0, s0, 0x10001 -; GFX10-NEXT: s_lshl_b32 s4, s4, 1 -; GFX10-NEXT: s_and_b32 s5, s2, s3 -; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s4 -; GFX10-NEXT: s_andn2_b32 s2, s3, s2 +; GFX10-NEXT: s_lshl_b32 s3, s3, 1 +; GFX10-NEXT: s_and_b32 s4, s2, 0xf000f +; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s3 +; GFX10-NEXT: s_andn2_b32 s2, 0xf000f, s2 ; GFX10-NEXT: s_lshr_b32 s3, s0, 16 -; GFX10-NEXT: s_lshr_b32 s4, s2, 16 +; GFX10-NEXT: s_lshr_b32 s5, s2, 16 ; GFX10-NEXT: s_lshl_b32 s0, s0, s2 -; GFX10-NEXT: s_lshl_b32 s2, s3, s4 +; GFX10-NEXT: s_lshl_b32 s2, s3, s5 ; GFX10-NEXT: s_lshr_b32 s3, s1, 16 ; GFX10-NEXT: s_and_b32 s1, s1, 0xffff -; GFX10-NEXT: s_lshr_b32 s4, s5, 16 -; GFX10-NEXT: s_lshr_b32 s1, s1, s5 -; GFX10-NEXT: s_lshr_b32 s3, s3, s4 +; GFX10-NEXT: s_lshr_b32 s5, s4, 16 +; GFX10-NEXT: s_lshr_b32 s1, s1, s4 +; GFX10-NEXT: s_lshr_b32 s3, s3, s5 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s3 ; GFX10-NEXT: s_or_b32 s0, s0, s1 @@ -3252,10 +3230,9 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_xor_b32_e32 v3, -1, v2 -; GFX10-NEXT: s_mov_b32 s4, 0xf000f ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1] -; GFX10-NEXT: v_and_b32_e32 v2, s4, v2 -; GFX10-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX10-NEXT: v_and_b32_e32 v2, 0xf000f, 
v2 +; GFX10-NEXT: v_and_b32_e32 v3, 0xf000f, v3 ; GFX10-NEXT: v_pk_lshrrev_b16 v1, v2, v1 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, v3, v0 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 @@ -3326,35 +3303,34 @@ ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX6-NEXT: s_mov_b32 s5, 0xf0001 ; GFX6-NEXT: s_bfe_u32 s4, 1, 0x100000 -; GFX6-NEXT: s_bfe_u32 s6, s2, s5 -; GFX6-NEXT: s_bfe_u32 s7, 14, 0x100000 +; GFX6-NEXT: s_bfe_u32 s5, s2, 0xf0001 +; GFX6-NEXT: s_bfe_u32 s6, 14, 0x100000 ; GFX6-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX6-NEXT: s_lshl_b32 s0, s0, s4 -; GFX6-NEXT: s_lshr_b32 s6, s6, s7 +; GFX6-NEXT: s_lshr_b32 s5, s5, s6 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX6-NEXT: v_and_b32_e32 v2, 15, v0 ; GFX6-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX6-NEXT: s_or_b32 s0, s0, s6 +; GFX6-NEXT: s_or_b32 s0, s0, s5 ; GFX6-NEXT: s_lshl_b32 s2, s2, 1 ; GFX6-NEXT: v_and_b32_e32 v0, 15, v0 ; GFX6-NEXT: v_bfe_u32 v2, v2, 0, 16 ; GFX6-NEXT: v_lshl_b32_e32 v2, s0, v2 -; GFX6-NEXT: s_bfe_u32 s0, s2, s5 +; GFX6-NEXT: s_bfe_u32 s0, s2, 0xf0001 ; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16 ; GFX6-NEXT: v_lshr_b32_e32 v0, s0, v0 ; GFX6-NEXT: s_lshl_b32 s1, s1, s4 -; GFX6-NEXT: s_bfe_u32 s4, s3, s5 +; GFX6-NEXT: s_bfe_u32 s4, s3, 0xf0001 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX6-NEXT: v_and_b32_e32 v2, 15, v1 ; GFX6-NEXT: v_xor_b32_e32 v1, -1, v1 -; GFX6-NEXT: s_lshr_b32 s4, s4, s7 +; GFX6-NEXT: s_lshr_b32 s4, s4, s6 ; GFX6-NEXT: s_lshl_b32 s3, s3, 1 ; GFX6-NEXT: v_and_b32_e32 v1, 15, v1 ; GFX6-NEXT: s_or_b32 s1, s1, s4 ; GFX6-NEXT: v_bfe_u32 v2, v2, 0, 16 -; GFX6-NEXT: s_bfe_u32 s0, s3, s5 +; GFX6-NEXT: s_bfe_u32 s0, s3, 0xf0001 ; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16 ; GFX6-NEXT: v_lshl_b32_e32 v2, s1, v2 ; GFX6-NEXT: v_lshr_b32_e32 v1, s0, v1 @@ -3421,12 +3397,11 @@ ; GFX10-LABEL: v_fshr_v2i16_ssv: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_xor_b32_e32 v1, -1, v0 -; GFX10-NEXT: s_mov_b32 s2, 0xf000f -; GFX10-NEXT: s_lshr_b32 s3, s0, 16 -; 
GFX10-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX10-NEXT: s_lshr_b32 s2, s0, 16 +; GFX10-NEXT: v_and_b32_e32 v0, 0xf000f, v0 ; GFX10-NEXT: s_lshl_b32 s0, s0, 0x10001 -; GFX10-NEXT: v_and_b32_e32 v1, s2, v1 -; GFX10-NEXT: s_lshl_b32 s2, s3, 1 +; GFX10-NEXT: s_lshl_b32 s2, s2, 1 +; GFX10-NEXT: v_and_b32_e32 v1, 0xf000f, v1 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 ; GFX10-NEXT: v_pk_lshrrev_b16 v0, v0, s1 ; GFX10-NEXT: v_pk_lshlrev_b16 v1, v1, s0 @@ -3514,36 +3489,34 @@ ; ; GFX9-LABEL: v_fshr_v2i16_svs: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s2, 0xf000f -; GFX9-NEXT: s_and_b32 s3, s1, s2 -; GFX9-NEXT: s_andn2_b32 s1, s2, s1 -; GFX9-NEXT: s_lshr_b32 s2, s0, 16 +; GFX9-NEXT: s_lshr_b32 s3, s0, 16 ; GFX9-NEXT: s_lshl_b32 s0, s0, 0x10001 -; GFX9-NEXT: s_lshl_b32 s2, s2, 1 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 -; GFX9-NEXT: s_lshr_b32 s2, s0, 16 +; GFX9-NEXT: s_lshl_b32 s3, s3, 1 +; GFX9-NEXT: s_and_b32 s2, s1, 0xf000f +; GFX9-NEXT: s_andn2_b32 s1, 0xf000f, s1 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s3 +; GFX9-NEXT: s_lshr_b32 s3, s0, 16 ; GFX9-NEXT: s_lshr_b32 s4, s1, 16 ; GFX9-NEXT: s_lshl_b32 s0, s0, s1 -; GFX9-NEXT: s_lshl_b32 s1, s2, s4 +; GFX9-NEXT: s_lshl_b32 s1, s3, s4 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s1 -; GFX9-NEXT: v_pk_lshrrev_b16 v0, s3, v0 +; GFX9-NEXT: v_pk_lshrrev_b16 v0, s2, v0 ; GFX9-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: v_fshr_v2i16_svs: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_lshr_b32 s3, s0, 16 -; GFX10-NEXT: s_mov_b32 s2, 0xf000f +; GFX10-NEXT: s_lshr_b32 s2, s0, 16 ; GFX10-NEXT: s_lshl_b32 s0, s0, 0x10001 -; GFX10-NEXT: s_lshl_b32 s3, s3, 1 -; GFX10-NEXT: s_and_b32 s4, s1, s2 -; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s3 -; GFX10-NEXT: s_andn2_b32 s1, s2, s1 +; GFX10-NEXT: s_lshl_b32 s2, s2, 1 +; GFX10-NEXT: s_and_b32 s3, s1, 0xf000f +; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX10-NEXT: s_andn2_b32 s1, 0xf000f, s1 ; GFX10-NEXT: s_lshr_b32 s2, s0, 16 -; GFX10-NEXT: s_lshr_b32 s3, s1, 16 -; 
GFX10-NEXT: v_pk_lshrrev_b16 v0, s4, v0 +; GFX10-NEXT: s_lshr_b32 s4, s1, 16 +; GFX10-NEXT: v_pk_lshrrev_b16 v0, s3, v0 ; GFX10-NEXT: s_lshl_b32 s0, s0, s1 -; GFX10-NEXT: s_lshl_b32 s1, s2, s3 +; GFX10-NEXT: s_lshl_b32 s1, s2, s4 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s1 ; GFX10-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX10-NEXT: ; return to shader part epilog @@ -3559,25 +3532,24 @@ ; GFX6-NEXT: s_and_b32 s2, s2, 0xffff ; GFX6-NEXT: s_or_b32 s2, s3, s2 ; GFX6-NEXT: s_bfe_u32 s3, 1, 0x100000 -; GFX6-NEXT: s_mov_b32 s4, 0xf0001 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, s3, v0 -; GFX6-NEXT: s_bfe_u32 s5, s0, s4 -; GFX6-NEXT: s_bfe_u32 s6, 14, 0x100000 +; GFX6-NEXT: s_bfe_u32 s4, s0, 0xf0001 +; GFX6-NEXT: s_bfe_u32 s5, 14, 0x100000 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, s3, v1 -; GFX6-NEXT: s_bfe_u32 s3, s1, s4 -; GFX6-NEXT: s_lshr_b32 s5, s5, s6 -; GFX6-NEXT: s_lshr_b32 s3, s3, s6 +; GFX6-NEXT: s_bfe_u32 s3, s1, 0xf0001 +; GFX6-NEXT: s_lshr_b32 s4, s4, s5 +; GFX6-NEXT: s_lshr_b32 s3, s3, s5 ; GFX6-NEXT: s_xor_b32 s2, s2, -1 -; GFX6-NEXT: v_or_b32_e32 v0, s5, v0 +; GFX6-NEXT: v_or_b32_e32 v0, s4, v0 ; GFX6-NEXT: v_or_b32_e32 v1, s3, v1 ; GFX6-NEXT: s_lshl_b32 s0, s0, 1 ; GFX6-NEXT: s_lshr_b32 s3, s2, 16 -; GFX6-NEXT: s_and_b32 s5, s2, 15 +; GFX6-NEXT: s_and_b32 s4, s2, 15 ; GFX6-NEXT: s_andn2_b32 s2, 15, s2 -; GFX6-NEXT: s_bfe_u32 s5, s5, 0x100000 -; GFX6-NEXT: s_bfe_u32 s0, s0, s4 +; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX6-NEXT: s_bfe_u32 s0, s0, 0xf0001 ; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s5, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX6-NEXT: s_lshr_b32 s0, s0, s2 ; GFX6-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX6-NEXT: s_and_b32 s0, s3, 15 @@ -3585,7 +3557,7 @@ ; GFX6-NEXT: s_andn2_b32 s2, 15, s3 ; GFX6-NEXT: s_bfe_u32 s0, s0, 0x100000 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, s0, v1 -; GFX6-NEXT: s_bfe_u32 s0, s1, s4 +; GFX6-NEXT: s_bfe_u32 s0, s1, 0xf0001 ; GFX6-NEXT: s_bfe_u32 s1, s2, 0x100000 ; GFX6-NEXT: s_lshr_b32 s0, s0, s1 ; GFX6-NEXT: 
v_or_b32_e32 v1, s0, v1 @@ -3635,32 +3607,30 @@ ; ; GFX9-LABEL: v_fshr_v2i16_vss: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s2, 0xf000f -; GFX9-NEXT: s_and_b32 s3, s1, s2 -; GFX9-NEXT: s_andn2_b32 s1, s2, s1 +; GFX9-NEXT: s_and_b32 s2, s1, 0xf000f +; GFX9-NEXT: s_andn2_b32 s1, 0xf000f, s1 ; GFX9-NEXT: v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_lshlrev_b16 v0, s1, v0 ; GFX9-NEXT: s_lshr_b32 s1, s0, 16 ; GFX9-NEXT: s_and_b32 s0, s0, 0xffff -; GFX9-NEXT: s_lshr_b32 s2, s3, 16 -; GFX9-NEXT: s_lshr_b32 s0, s0, s3 -; GFX9-NEXT: s_lshr_b32 s1, s1, s2 +; GFX9-NEXT: s_lshr_b32 s3, s2, 16 +; GFX9-NEXT: s_lshr_b32 s0, s0, s2 +; GFX9-NEXT: s_lshr_b32 s1, s1, s3 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s1 ; GFX9-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: v_fshr_v2i16_vss: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s2, 0xf000f ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1] -; GFX10-NEXT: s_and_b32 s3, s1, s2 -; GFX10-NEXT: s_andn2_b32 s1, s2, s1 -; GFX10-NEXT: s_lshr_b32 s2, s0, 16 +; GFX10-NEXT: s_and_b32 s2, s1, 0xf000f +; GFX10-NEXT: s_andn2_b32 s1, 0xf000f, s1 +; GFX10-NEXT: s_lshr_b32 s3, s0, 16 ; GFX10-NEXT: s_and_b32 s0, s0, 0xffff -; GFX10-NEXT: s_lshr_b32 s4, s3, 16 +; GFX10-NEXT: s_lshr_b32 s4, s2, 16 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, s1, v0 -; GFX10-NEXT: s_lshr_b32 s0, s0, s3 -; GFX10-NEXT: s_lshr_b32 s1, s2, s4 +; GFX10-NEXT: s_lshr_b32 s0, s0, s2 +; GFX10-NEXT: s_lshr_b32 s1, s3, s4 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s1 ; GFX10-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX10-NEXT: ; return to shader part epilog @@ -3686,41 +3656,39 @@ define amdgpu_ps <2 x i32> @s_fshr_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %rhs, <4 x i16> inreg %amt) { ; GFX6-LABEL: s_fshr_v4i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s12, 0xffff ; GFX6-NEXT: s_lshl_b32 s9, s9, 16 -; GFX6-NEXT: s_and_b32 s8, s8, s12 +; GFX6-NEXT: s_and_b32 s8, s8, 0xffff ; GFX6-NEXT: s_or_b32 s8, s9, s8 ; GFX6-NEXT: s_lshl_b32 s9, s11, 16 
-; GFX6-NEXT: s_and_b32 s10, s10, s12 -; GFX6-NEXT: s_mov_b32 s11, 0xf0001 +; GFX6-NEXT: s_and_b32 s10, s10, 0xffff ; GFX6-NEXT: s_or_b32 s9, s9, s10 ; GFX6-NEXT: s_bfe_u32 s10, 1, 0x100000 -; GFX6-NEXT: s_bfe_u32 s12, s4, s11 -; GFX6-NEXT: s_bfe_u32 s13, 14, 0x100000 +; GFX6-NEXT: s_bfe_u32 s11, s4, 0xf0001 +; GFX6-NEXT: s_bfe_u32 s12, 14, 0x100000 ; GFX6-NEXT: s_lshl_b32 s0, s0, s10 -; GFX6-NEXT: s_lshr_b32 s12, s12, s13 -; GFX6-NEXT: s_or_b32 s0, s0, s12 -; GFX6-NEXT: s_bfe_u32 s12, s5, s11 +; GFX6-NEXT: s_lshr_b32 s11, s11, s12 +; GFX6-NEXT: s_or_b32 s0, s0, s11 +; GFX6-NEXT: s_bfe_u32 s11, s5, 0xf0001 ; GFX6-NEXT: s_lshl_b32 s1, s1, s10 -; GFX6-NEXT: s_lshr_b32 s12, s12, s13 +; GFX6-NEXT: s_lshr_b32 s11, s11, s12 ; GFX6-NEXT: s_xor_b32 s8, s8, -1 -; GFX6-NEXT: s_or_b32 s1, s1, s12 +; GFX6-NEXT: s_or_b32 s1, s1, s11 ; GFX6-NEXT: s_lshl_b32 s4, s4, 1 -; GFX6-NEXT: s_lshr_b32 s12, s8, 16 -; GFX6-NEXT: s_and_b32 s14, s8, 15 +; GFX6-NEXT: s_lshr_b32 s11, s8, 16 +; GFX6-NEXT: s_and_b32 s13, s8, 15 ; GFX6-NEXT: s_andn2_b32 s8, 15, s8 -; GFX6-NEXT: s_bfe_u32 s14, s14, 0x100000 -; GFX6-NEXT: s_bfe_u32 s4, s4, s11 +; GFX6-NEXT: s_bfe_u32 s13, s13, 0x100000 +; GFX6-NEXT: s_bfe_u32 s4, s4, 0xf0001 ; GFX6-NEXT: s_bfe_u32 s8, s8, 0x100000 -; GFX6-NEXT: s_lshl_b32 s0, s0, s14 +; GFX6-NEXT: s_lshl_b32 s0, s0, s13 ; GFX6-NEXT: s_lshr_b32 s4, s4, s8 ; GFX6-NEXT: s_or_b32 s0, s0, s4 -; GFX6-NEXT: s_and_b32 s4, s12, 15 +; GFX6-NEXT: s_and_b32 s4, s11, 15 ; GFX6-NEXT: s_lshl_b32 s5, s5, 1 -; GFX6-NEXT: s_andn2_b32 s8, 15, s12 +; GFX6-NEXT: s_andn2_b32 s8, 15, s11 ; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000 ; GFX6-NEXT: s_lshl_b32 s1, s1, s4 -; GFX6-NEXT: s_bfe_u32 s4, s5, s11 +; GFX6-NEXT: s_bfe_u32 s4, s5, 0xf0001 ; GFX6-NEXT: s_bfe_u32 s5, s8, 0x100000 ; GFX6-NEXT: s_lshr_b32 s4, s4, s5 ; GFX6-NEXT: s_or_b32 s1, s1, s4 @@ -3729,12 +3697,12 @@ ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: s_lshl_b32 s1, s2, s10 -; GFX6-NEXT: s_bfe_u32 s2, s6, s11 -; 
GFX6-NEXT: s_lshr_b32 s2, s2, s13 +; GFX6-NEXT: s_bfe_u32 s2, s6, 0xf0001 +; GFX6-NEXT: s_lshr_b32 s2, s2, s12 ; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: s_lshl_b32 s2, s3, s10 -; GFX6-NEXT: s_bfe_u32 s3, s7, s11 -; GFX6-NEXT: s_lshr_b32 s3, s3, s13 +; GFX6-NEXT: s_bfe_u32 s3, s7, 0xf0001 +; GFX6-NEXT: s_lshr_b32 s3, s3, s12 ; GFX6-NEXT: s_xor_b32 s5, s9, -1 ; GFX6-NEXT: s_or_b32 s2, s2, s3 ; GFX6-NEXT: s_lshl_b32 s3, s6, 1 @@ -3743,7 +3711,7 @@ ; GFX6-NEXT: s_and_b32 s7, s5, 15 ; GFX6-NEXT: s_andn2_b32 s5, 15, s5 ; GFX6-NEXT: s_bfe_u32 s7, s7, 0x100000 -; GFX6-NEXT: s_bfe_u32 s3, s3, s11 +; GFX6-NEXT: s_bfe_u32 s3, s3, 0xf0001 ; GFX6-NEXT: s_bfe_u32 s5, s5, 0x100000 ; GFX6-NEXT: s_lshl_b32 s1, s1, s7 ; GFX6-NEXT: s_lshr_b32 s3, s3, s5 @@ -3752,7 +3720,7 @@ ; GFX6-NEXT: s_andn2_b32 s5, 15, s6 ; GFX6-NEXT: s_bfe_u32 s3, s3, 0x100000 ; GFX6-NEXT: s_lshl_b32 s2, s2, s3 -; GFX6-NEXT: s_bfe_u32 s3, s4, s11 +; GFX6-NEXT: s_bfe_u32 s3, s4, 0xf0001 ; GFX6-NEXT: s_bfe_u32 s4, s5, 0x100000 ; GFX6-NEXT: s_lshr_b32 s3, s3, s4 ; GFX6-NEXT: s_or_b32 s2, s2, s3 @@ -3840,31 +3808,28 @@ ; ; GFX9-LABEL: s_fshr_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s8, 0x10001 -; GFX9-NEXT: s_lshr_b32 s9, s0, 16 -; GFX9-NEXT: s_mov_b32 s6, 0xf000f -; GFX9-NEXT: s_lshl_b32 s0, s0, s8 -; GFX9-NEXT: s_lshl_b32 s9, s9, 1 -; GFX9-NEXT: s_and_b32 s7, s4, s6 -; GFX9-NEXT: s_andn2_b32 s4, s6, s4 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s9 -; GFX9-NEXT: s_lshr_b32 s9, s0, 16 -; GFX9-NEXT: s_lshr_b32 s10, s4, 16 +; GFX9-NEXT: s_lshr_b32 s7, s0, 16 +; GFX9-NEXT: s_lshl_b32 s0, s0, 0x10001 +; GFX9-NEXT: s_lshl_b32 s7, s7, 1 +; GFX9-NEXT: s_and_b32 s6, s4, 0xf000f +; GFX9-NEXT: s_andn2_b32 s4, 0xf000f, s4 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GFX9-NEXT: s_lshr_b32 s7, s0, 16 +; GFX9-NEXT: s_lshr_b32 s8, s4, 16 ; GFX9-NEXT: s_lshl_b32 s0, s0, s4 -; GFX9-NEXT: s_lshl_b32 s4, s9, s10 -; GFX9-NEXT: s_mov_b32 s9, 0xffff +; GFX9-NEXT: s_lshl_b32 s4, s7, s8 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s4 ; 
GFX9-NEXT: s_lshr_b32 s4, s2, 16 -; GFX9-NEXT: s_and_b32 s2, s2, s9 -; GFX9-NEXT: s_lshr_b32 s10, s7, 16 -; GFX9-NEXT: s_lshr_b32 s2, s2, s7 -; GFX9-NEXT: s_lshr_b32 s4, s4, s10 +; GFX9-NEXT: s_and_b32 s2, s2, 0xffff +; GFX9-NEXT: s_lshr_b32 s7, s6, 16 +; GFX9-NEXT: s_lshr_b32 s2, s2, s6 +; GFX9-NEXT: s_lshr_b32 s4, s4, s7 ; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s4 ; GFX9-NEXT: s_or_b32 s0, s0, s2 -; GFX9-NEXT: s_and_b32 s2, s5, s6 -; GFX9-NEXT: s_andn2_b32 s4, s6, s5 +; GFX9-NEXT: s_and_b32 s2, s5, 0xf000f +; GFX9-NEXT: s_andn2_b32 s4, 0xf000f, s5 ; GFX9-NEXT: s_lshr_b32 s5, s1, 16 -; GFX9-NEXT: s_lshl_b32 s1, s1, s8 +; GFX9-NEXT: s_lshl_b32 s1, s1, 0x10001 ; GFX9-NEXT: s_lshl_b32 s5, s5, 1 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s5 ; GFX9-NEXT: s_lshr_b32 s5, s1, 16 @@ -3873,7 +3838,7 @@ ; GFX9-NEXT: s_lshl_b32 s4, s5, s6 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s4 ; GFX9-NEXT: s_lshr_b32 s4, s3, 16 -; GFX9-NEXT: s_and_b32 s3, s3, s9 +; GFX9-NEXT: s_and_b32 s3, s3, 0xffff ; GFX9-NEXT: s_lshr_b32 s5, s2, 16 ; GFX9-NEXT: s_lshr_b32 s2, s3, s2 ; GFX9-NEXT: s_lshr_b32 s3, s4, s5 @@ -3883,41 +3848,38 @@ ; ; GFX10-LABEL: s_fshr_v4i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s7, 0x10001 -; GFX10-NEXT: s_lshr_b32 s8, s0, 16 -; GFX10-NEXT: s_mov_b32 s6, 0xf000f -; GFX10-NEXT: s_lshl_b32 s0, s0, s7 -; GFX10-NEXT: s_lshl_b32 s8, s8, 1 -; GFX10-NEXT: s_and_b32 s9, s4, s6 -; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s8 -; GFX10-NEXT: s_andn2_b32 s4, s6, s4 -; GFX10-NEXT: s_lshr_b32 s8, s0, 16 -; GFX10-NEXT: s_lshr_b32 s10, s4, 16 +; GFX10-NEXT: s_lshr_b32 s6, s0, 16 +; GFX10-NEXT: s_lshl_b32 s0, s0, 0x10001 +; GFX10-NEXT: s_lshl_b32 s6, s6, 1 +; GFX10-NEXT: s_and_b32 s7, s4, 0xf000f +; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s6 +; GFX10-NEXT: s_andn2_b32 s4, 0xf000f, s4 +; GFX10-NEXT: s_lshr_b32 s6, s0, 16 +; GFX10-NEXT: s_lshr_b32 s8, s4, 16 ; GFX10-NEXT: s_lshl_b32 s0, s0, s4 -; GFX10-NEXT: s_lshl_b32 s4, s8, s10 -; GFX10-NEXT: s_mov_b32 s8, 0xffff +; GFX10-NEXT: s_lshl_b32 s4, 
s6, s8 +; GFX10-NEXT: s_lshr_b32 s6, s2, 16 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s4 ; GFX10-NEXT: s_lshr_b32 s4, s1, 16 -; GFX10-NEXT: s_lshl_b32 s1, s1, s7 +; GFX10-NEXT: s_and_b32 s2, s2, 0xffff +; GFX10-NEXT: s_lshr_b32 s8, s7, 16 +; GFX10-NEXT: s_lshl_b32 s1, s1, 0x10001 ; GFX10-NEXT: s_lshl_b32 s4, s4, 1 -; GFX10-NEXT: s_and_b32 s7, s5, s6 +; GFX10-NEXT: s_lshr_b32 s2, s2, s7 +; GFX10-NEXT: s_lshr_b32 s6, s6, s8 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s4 -; GFX10-NEXT: s_andn2_b32 s4, s6, s5 +; GFX10-NEXT: s_andn2_b32 s4, 0xf000f, s5 +; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s6 +; GFX10-NEXT: s_and_b32 s6, s5, 0xf000f ; GFX10-NEXT: s_lshr_b32 s5, s1, 16 -; GFX10-NEXT: s_lshr_b32 s6, s4, 16 -; GFX10-NEXT: s_lshr_b32 s10, s2, 16 -; GFX10-NEXT: s_and_b32 s2, s2, s8 -; GFX10-NEXT: s_lshr_b32 s11, s9, 16 +; GFX10-NEXT: s_lshr_b32 s7, s4, 16 ; GFX10-NEXT: s_lshl_b32 s1, s1, s4 -; GFX10-NEXT: s_lshl_b32 s4, s5, s6 +; GFX10-NEXT: s_lshl_b32 s4, s5, s7 ; GFX10-NEXT: s_lshr_b32 s5, s3, 16 -; GFX10-NEXT: s_and_b32 s3, s3, s8 -; GFX10-NEXT: s_lshr_b32 s6, s7, 16 -; GFX10-NEXT: s_lshr_b32 s2, s2, s9 -; GFX10-NEXT: s_lshr_b32 s9, s10, s11 -; GFX10-NEXT: s_lshr_b32 s3, s3, s7 -; GFX10-NEXT: s_lshr_b32 s5, s5, s6 -; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s9 +; GFX10-NEXT: s_and_b32 s3, s3, 0xffff +; GFX10-NEXT: s_lshr_b32 s7, s6, 16 +; GFX10-NEXT: s_lshr_b32 s3, s3, s6 +; GFX10-NEXT: s_lshr_b32 s5, s5, s7 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s4 ; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s5 ; GFX10-NEXT: s_or_b32 s0, s0, s2 @@ -4090,16 +4052,15 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_xor_b32_e32 v6, -1, v4 ; GFX10-NEXT: v_xor_b32_e32 v7, -1, v5 -; GFX10-NEXT: s_mov_b32 s4, 0xf000f +; GFX10-NEXT: v_and_b32_e32 v4, 0xf000f, v4 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1] -; GFX10-NEXT: v_and_b32_e32 v4, s4, v4 -; GFX10-NEXT: v_and_b32_e32 v6, s4, v6 -; GFX10-NEXT: v_and_b32_e32 v5, s4, v5 -; GFX10-NEXT: v_and_b32_e32 v7, s4, v7 +; GFX10-NEXT: 
v_and_b32_e32 v5, 0xf000f, v5 +; GFX10-NEXT: v_and_b32_e32 v6, 0xf000f, v6 ; GFX10-NEXT: v_pk_lshlrev_b16 v1, 1, v1 op_sel_hi:[0,1] +; GFX10-NEXT: v_and_b32_e32 v7, 0xf000f, v7 ; GFX10-NEXT: v_pk_lshrrev_b16 v2, v4, v2 -; GFX10-NEXT: v_pk_lshlrev_b16 v0, v6, v0 ; GFX10-NEXT: v_pk_lshrrev_b16 v3, v5, v3 +; GFX10-NEXT: v_pk_lshlrev_b16 v0, v6, v0 ; GFX10-NEXT: v_pk_lshlrev_b16 v1, v7, v1 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX10-NEXT: v_or_b32_e32 v1, v1, v3 @@ -5008,35 +4969,34 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_xor_b32_e32 v9, -1, v8 -; GFX10-NEXT: s_movk_i32 s4, 0x7f ; GFX10-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] ; GFX10-NEXT: v_lshrrev_b32_e32 v10, 31, v1 -; GFX10-NEXT: v_and_b32_e32 v19, s4, v8 -; GFX10-NEXT: v_and_b32_e32 v18, s4, v9 +; GFX10-NEXT: v_and_b32_e32 v19, 0x7f, v8 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] +; GFX10-NEXT: v_and_b32_e32 v18, 0x7f, v9 ; GFX10-NEXT: v_or_b32_e32 v2, v2, v10 ; GFX10-NEXT: v_sub_nc_u32_e32 v16, 64, v19 +; GFX10-NEXT: v_subrev_nc_u32_e32 v20, 64, v19 ; GFX10-NEXT: v_sub_nc_u32_e32 v10, 64, v18 ; GFX10-NEXT: v_subrev_nc_u32_e32 v21, 64, v18 -; GFX10-NEXT: v_subrev_nc_u32_e32 v20, 64, v19 ; GFX10-NEXT: v_lshlrev_b64 v[8:9], v18, v[2:3] ; GFX10-NEXT: v_lshrrev_b64 v[12:13], v19, v[4:5] -; GFX10-NEXT: v_lshrrev_b64 v[10:11], v10, v[0:1] ; GFX10-NEXT: v_lshlrev_b64 v[16:17], v16, v[6:7] +; GFX10-NEXT: v_lshrrev_b64 v[10:11], v10, v[0:1] ; GFX10-NEXT: v_lshlrev_b64 v[14:15], v18, v[0:1] ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v21, v[0:1] ; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v18 ; GFX10-NEXT: v_cmp_gt_u32_e64 s4, 64, v19 +; GFX10-NEXT: v_or_b32_e32 v12, v12, v16 ; GFX10-NEXT: v_or_b32_e32 v10, v10, v8 ; GFX10-NEXT: v_or_b32_e32 v11, v11, v9 ; GFX10-NEXT: v_lshrrev_b64 v[8:9], v20, v[6:7] -; GFX10-NEXT: v_or_b32_e32 v12, v12, v16 ; GFX10-NEXT: v_or_b32_e32 v13, v13, v17 +; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 0, v19 ; GFX10-NEXT: v_cndmask_b32_e32 
v10, v0, v10, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v11, v1, v11, vcc_lo ; GFX10-NEXT: v_lshrrev_b64 v[0:1], v19, v[6:7] ; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v12, s4 -; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 0, v19 ; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v18 ; GFX10-NEXT: v_cndmask_b32_e64 v6, v9, v13, s4 ; GFX10-NEXT: v_cndmask_b32_e32 v14, 0, v14, vcc_lo @@ -5219,36 +5179,35 @@ ; GFX10-LABEL: v_fshr_i128_ssv: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_xor_b32_e32 v1, -1, v0 -; GFX10-NEXT: s_movk_i32 s10, 0x7f +; GFX10-NEXT: v_and_b32_e32 v13, 0x7f, v0 ; GFX10-NEXT: s_mov_b32 s9, 0 -; GFX10-NEXT: v_and_b32_e32 v13, s10, v0 ; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 -; GFX10-NEXT: v_and_b32_e32 v12, s10, v1 ; GFX10-NEXT: s_lshr_b32 s8, s1, 31 -; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 +; GFX10-NEXT: v_and_b32_e32 v12, 0x7f, v1 ; GFX10-NEXT: v_sub_nc_u32_e32 v8, 64, v13 +; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 ; GFX10-NEXT: s_or_b64 s[8:9], s[2:3], s[8:9] +; GFX10-NEXT: v_subrev_nc_u32_e32 v14, 64, v13 ; GFX10-NEXT: v_sub_nc_u32_e32 v2, 64, v12 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v12, s[8:9] ; GFX10-NEXT: v_subrev_nc_u32_e32 v10, 64, v12 -; GFX10-NEXT: v_subrev_nc_u32_e32 v14, 64, v13 ; GFX10-NEXT: v_lshrrev_b64 v[4:5], v13, s[4:5] -; GFX10-NEXT: v_lshrrev_b64 v[2:3], v2, s[0:1] ; GFX10-NEXT: v_lshlrev_b64 v[8:9], v8, s[6:7] -; GFX10-NEXT: v_lshlrev_b64 v[10:11], v10, s[0:1] +; GFX10-NEXT: v_lshrrev_b64 v[2:3], v2, s[0:1] ; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v12 +; GFX10-NEXT: v_lshlrev_b64 v[10:11], v10, s[0:1] ; GFX10-NEXT: v_lshlrev_b64 v[6:7], v12, s[0:1] ; GFX10-NEXT: v_cmp_gt_u32_e64 s0, 64, v13 +; GFX10-NEXT: v_or_b32_e32 v4, v4, v8 ; GFX10-NEXT: v_or_b32_e32 v2, v2, v0 ; GFX10-NEXT: v_or_b32_e32 v3, v3, v1 ; GFX10-NEXT: v_lshrrev_b64 v[0:1], v14, s[6:7] -; GFX10-NEXT: v_or_b32_e32 v4, v4, v8 ; GFX10-NEXT: v_or_b32_e32 v5, v5, v9 +; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 0, v13 ; GFX10-NEXT: v_cndmask_b32_e32 v8, v10, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v10, 
v11, v3, vcc_lo ; GFX10-NEXT: v_lshrrev_b64 v[2:3], v13, s[6:7] ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 0, v13 ; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v12 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v5, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc_lo @@ -6462,11 +6421,10 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_xor_b32_e32 v17, -1, v16 -; GFX10-NEXT: s_movk_i32 s5, 0x7f ; GFX10-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] -; GFX10-NEXT: v_and_b32_e32 v26, s5, v16 +; GFX10-NEXT: v_and_b32_e32 v26, 0x7f, v16 ; GFX10-NEXT: v_lshlrev_b64 v[6:7], 1, v[6:7] -; GFX10-NEXT: v_and_b32_e32 v25, s5, v17 +; GFX10-NEXT: v_and_b32_e32 v25, 0x7f, v17 ; GFX10-NEXT: v_lshrrev_b32_e32 v17, 31, v1 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] ; GFX10-NEXT: v_subrev_nc_u32_e32 v27, 64, v26 @@ -6500,14 +6458,14 @@ ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v17, s4 ; GFX10-NEXT: v_lshrrev_b64 v[2:3], v26, v[10:11] ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo -; GFX10-NEXT: v_and_b32_e32 v25, s5, v16 +; GFX10-NEXT: v_and_b32_e32 v25, 0x7f, v16 ; GFX10-NEXT: v_lshrrev_b32_e32 v8, 31, v5 ; GFX10-NEXT: v_lshlrev_b64 v[4:5], 1, v[4:5] ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo ; GFX10-NEXT: v_or_b32_e32 v0, v23, v0 ; GFX10-NEXT: v_sub_nc_u32_e32 v9, 64, v25 ; GFX10-NEXT: v_or_b32_e32 v6, v6, v8 -; GFX10-NEXT: v_and_b32_e32 v23, s5, v20 +; GFX10-NEXT: v_and_b32_e32 v23, 0x7f, v20 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, v2, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v26, 0, v3, s4 ; GFX10-NEXT: v_lshrrev_b64 v[8:9], v9, v[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll @@ -9,14 +9,13 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NEXT: s_and_b32 s1, s5, 1 
-; GFX9-NEXT: s_mov_b32 s2, 0xffff ; GFX9-NEXT: s_lshl_b32 s1, s1, 4 -; GFX9-NEXT: s_and_b32 s3, s4, s2 -; GFX9-NEXT: s_lshl_b32 s3, s3, s1 -; GFX9-NEXT: s_lshl_b32 s1, s2, s1 +; GFX9-NEXT: s_and_b32 s2, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s2, s2, s1 +; GFX9-NEXT: s_lshl_b32 s1, 0xffff, s1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_andn2_b32 s0, s0, s1 -; GFX9-NEXT: s_or_b32 s0, s0, s3 +; GFX9-NEXT: s_or_b32 s0, s0, s2 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 @@ -27,14 +26,13 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX8-NEXT: s_and_b32 s1, s5, 1 -; GFX8-NEXT: s_mov_b32 s2, 0xffff ; GFX8-NEXT: s_lshl_b32 s1, s1, 4 -; GFX8-NEXT: s_and_b32 s3, s4, s2 -; GFX8-NEXT: s_lshl_b32 s3, s3, s1 -; GFX8-NEXT: s_lshl_b32 s1, s2, s1 +; GFX8-NEXT: s_and_b32 s2, s4, 0xffff +; GFX8-NEXT: s_lshl_b32 s2, s2, s1 +; GFX8-NEXT: s_lshl_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_andn2_b32 s0, s0, s1 -; GFX8-NEXT: s_or_b32 s0, s0, s3 +; GFX8-NEXT: s_or_b32 s0, s0, s2 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 @@ -45,14 +43,13 @@ ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX7-NEXT: s_and_b32 s1, s5, 1 -; GFX7-NEXT: s_mov_b32 s2, 0xffff ; GFX7-NEXT: s_lshl_b32 s1, s1, 4 -; GFX7-NEXT: s_and_b32 s3, s4, s2 -; GFX7-NEXT: s_lshl_b32 s3, s3, s1 -; GFX7-NEXT: s_lshl_b32 s1, s2, s1 +; GFX7-NEXT: s_and_b32 s2, s4, 0xffff +; GFX7-NEXT: s_lshl_b32 s2, s2, s1 +; GFX7-NEXT: s_lshl_b32 s1, 0xffff, s1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_andn2_b32 s0, s0, s1 -; GFX7-NEXT: s_or_b32 s0, s0, s3 +; GFX7-NEXT: s_or_b32 s0, s0, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, 0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 @@ -63,15 +60,14 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX10-NEXT: s_and_b32 s1, s5, 1 -; GFX10-NEXT: s_mov_b32 s2, 0xffff +; GFX10-NEXT: s_and_b32 s2, s4, 
0xffff ; GFX10-NEXT: s_lshl_b32 s1, s1, 4 -; GFX10-NEXT: s_and_b32 s3, s4, s2 -; GFX10-NEXT: s_lshl_b32 s2, s2, s1 -; GFX10-NEXT: s_lshl_b32 s1, s3, s1 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_lshl_b32 s3, 0xffff, s1 +; GFX10-NEXT: s_lshl_b32 s1, s2, s1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_andn2_b32 s0, s0, s2 +; GFX10-NEXT: s_andn2_b32 s0, s0, s3 ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s0 ; GFX10-NEXT: global_store_dword v[0:1], v2, off @@ -87,13 +83,12 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dword v2, v[0:1], off ; GFX9-NEXT: s_and_b32 s0, s3, 1 -; GFX9-NEXT: s_mov_b32 s1, 0xffff ; GFX9-NEXT: s_lshl_b32 s0, s0, 4 -; GFX9-NEXT: s_and_b32 s2, s2, s1 -; GFX9-NEXT: s_lshl_b32 s2, s2, s0 -; GFX9-NEXT: s_lshl_b32 s0, s1, s0 +; GFX9-NEXT: s_and_b32 s1, s2, 0xffff +; GFX9-NEXT: s_lshl_b32 s1, s1, s0 +; GFX9-NEXT: s_lshl_b32 s0, 0xffff, s0 ; GFX9-NEXT: s_not_b32 s0, s0 -; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -104,36 +99,34 @@ ; GFX8-LABEL: insertelement_v_v2i16_s_s: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_and_b32 s1, s3, 1 -; GFX8-NEXT: s_mov_b32 s0, 0xffff -; GFX8-NEXT: s_lshl_b32 s1, s1, 4 -; GFX8-NEXT: s_and_b32 s2, s2, s0 -; GFX8-NEXT: s_lshl_b32 s0, s0, s1 +; GFX8-NEXT: s_and_b32 s0, s3, 1 +; GFX8-NEXT: s_and_b32 s1, s2, 0xffff +; GFX8-NEXT: s_lshl_b32 s0, s0, 4 +; GFX8-NEXT: s_lshl_b32 s1, s1, s0 +; GFX8-NEXT: s_lshl_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_not_b32 s0, s0 -; GFX8-NEXT: s_lshl_b32 s2, s2, s1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v2, s0, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: v_or_b32_e32 v2, s2, v2 +; GFX8-NEXT: v_or_b32_e32 v2, s1, v2 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX7-LABEL: insertelement_v_v2i16_s_s: ; 
GFX7: ; %bb.0: ; GFX7-NEXT: flat_load_dword v0, v[0:1] -; GFX7-NEXT: s_and_b32 s1, s3, 1 -; GFX7-NEXT: s_mov_b32 s0, 0xffff -; GFX7-NEXT: s_lshl_b32 s1, s1, 4 -; GFX7-NEXT: s_and_b32 s2, s2, s0 -; GFX7-NEXT: s_lshl_b32 s0, s0, s1 +; GFX7-NEXT: s_and_b32 s0, s3, 1 +; GFX7-NEXT: s_and_b32 s1, s2, 0xffff +; GFX7-NEXT: s_lshl_b32 s0, s0, 4 +; GFX7-NEXT: s_lshl_b32 s1, s1, s0 +; GFX7-NEXT: s_lshl_b32 s0, 0xffff, s0 ; GFX7-NEXT: s_not_b32 s0, s0 -; GFX7-NEXT: s_lshl_b32 s2, s2, s1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v2, s0, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, 0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: v_or_b32_e32 v2, s2, v2 +; GFX7-NEXT: v_or_b32_e32 v2, s1, v2 ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -141,13 +134,12 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_dword v2, v[0:1], off ; GFX10-NEXT: s_and_b32 s0, s3, 1 -; GFX10-NEXT: s_mov_b32 s1, 0xffff +; GFX10-NEXT: s_and_b32 s1, s2, 0xffff ; GFX10-NEXT: s_lshl_b32 s0, s0, 4 -; GFX10-NEXT: s_and_b32 s2, s2, s1 -; GFX10-NEXT: s_lshl_b32 s1, s1, s0 -; GFX10-NEXT: s_lshl_b32 s0, s2, s0 -; GFX10-NEXT: s_not_b32 s1, s1 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_lshl_b32 s2, 0xffff, s0 +; GFX10-NEXT: s_lshl_b32 s0, s1, s0 +; GFX10-NEXT: s_not_b32 s1, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_and_or_b32 v2, v2, s1, s0 @@ -165,9 +157,8 @@ ; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NEXT: s_and_b32 s1, s4, 1 ; GFX9-NEXT: s_lshl_b32 s1, s1, 4 -; GFX9-NEXT: s_mov_b32 s2, 0xffff -; GFX9-NEXT: v_and_b32_e32 v2, s2, v0 -; GFX9-NEXT: s_lshl_b32 s2, s2, s1 +; GFX9-NEXT: s_lshl_b32 s2, 0xffff, s1 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_andn2_b32 s0, s0, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s0 @@ -182,9 +173,8 @@ ; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX8-NEXT: s_and_b32 s1, s4, 1 ; GFX8-NEXT: s_lshl_b32 s1, s1, 4 -; GFX8-NEXT: s_mov_b32 s2, 0xffff ; GFX8-NEXT: 
v_mov_b32_e32 v1, s1 -; GFX8-NEXT: s_lshl_b32 s1, s2, s1 +; GFX8-NEXT: s_lshl_b32 s1, 0xffff, s1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_andn2_b32 s0, s0, s1 @@ -198,11 +188,10 @@ ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX7-NEXT: s_and_b32 s1, s4, 1 -; GFX7-NEXT: s_mov_b32 s2, 0xffff ; GFX7-NEXT: s_lshl_b32 s1, s1, 4 -; GFX7-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, s1, v0 -; GFX7-NEXT: s_lshl_b32 s1, s2, s1 +; GFX7-NEXT: s_lshl_b32 s1, 0xffff, s1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_andn2_b32 s0, s0, s1 ; GFX7-NEXT: v_mov_b32_e32 v0, 0 @@ -215,11 +204,10 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX10-NEXT: s_and_b32 s1, s4, 1 -; GFX10-NEXT: s_mov_b32 s2, 0xffff +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v0 ; GFX10-NEXT: s_lshl_b32 s1, s1, 4 -; GFX10-NEXT: v_and_b32_e32 v2, s2, v0 -; GFX10-NEXT: s_lshl_b32 s2, s2, s1 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_lshl_b32 s2, 0xffff, s1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_andn2_b32 s0, s0, s2 @@ -237,9 +225,9 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: s_mov_b32 s1, 0xffff ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX9-NEXT: s_and_b32 s2, s4, s1 +; GFX9-NEXT: s_mov_b32 s1, 0xffff +; GFX9-NEXT: s_and_b32 s2, s4, 0xffff ; GFX9-NEXT: v_lshlrev_b32_e64 v2, v0, s2 ; GFX9-NEXT: v_lshlrev_b32_e64 v0, v0, s1 ; GFX9-NEXT: v_xor_b32_e32 v3, -1, v0 @@ -254,9 +242,9 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: s_mov_b32 s1, 0xffff ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX8-NEXT: s_and_b32 s2, s4, s1 +; GFX8-NEXT: s_mov_b32 s1, 0xffff +; GFX8-NEXT: s_and_b32 s2, s4, 0xffff ; GFX8-NEXT: 
v_lshlrev_b32_e64 v2, v0, s2 ; GFX8-NEXT: v_lshlrev_b32_e64 v0, v0, s1 ; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 @@ -272,11 +260,10 @@ ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX7-NEXT: s_mov_b32 s1, 0xffff ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX7-NEXT: s_and_b32 s2, s4, s1 -; GFX7-NEXT: v_lshl_b32_e32 v2, s2, v0 -; GFX7-NEXT: v_lshl_b32_e32 v0, s1, v0 +; GFX7-NEXT: s_and_b32 s1, s4, 0xffff +; GFX7-NEXT: v_lshl_b32_e32 v2, s1, v0 +; GFX7-NEXT: v_lshl_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v3, s0, v0 @@ -290,10 +277,9 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX10-NEXT: s_mov_b32 s1, 0xffff +; GFX10-NEXT: s_and_b32 s1, s4, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX10-NEXT: v_lshlrev_b32_e64 v1, v0, s1 -; GFX10-NEXT: s_and_b32 s1, s4, s1 +; GFX10-NEXT: v_lshlrev_b32_e64 v1, v0, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e64 v2, v0, s1 ; GFX10-NEXT: v_xor_b32_e32 v3, -1, v1 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 @@ -364,9 +350,8 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX10-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX10-NEXT: s_mov_b32 s1, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 4, v1 -; GFX10-NEXT: v_lshlrev_b32_e64 v2, v1, s1 +; GFX10-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -386,9 +371,9 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dword v3, v[0:1], off ; GFX9-NEXT: v_and_b32_e32 v0, 1, v2 -; GFX9-NEXT: s_mov_b32 s0, 0xffff ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX9-NEXT: s_and_b32 s1, s2, s0 +; GFX9-NEXT: s_mov_b32 s0, 0xffff +; GFX9-NEXT: s_and_b32 s1, s2, 0xffff ; GFX9-NEXT: v_lshlrev_b32_e64 v2, v0, s1 ; GFX9-NEXT: v_lshlrev_b32_e64 v0, v0, s0 ; 
GFX9-NEXT: v_xor_b32_e32 v4, -1, v0 @@ -402,9 +387,9 @@ ; GFX8-LABEL: insertelement_v_v2i16_s_v: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_mov_b32 s0, 0xffff ; GFX8-NEXT: v_and_b32_e32 v1, 1, v2 -; GFX8-NEXT: s_and_b32 s1, s2, s0 +; GFX8-NEXT: s_mov_b32 s0, 0xffff +; GFX8-NEXT: s_and_b32 s1, s2, 0xffff ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX8-NEXT: v_lshlrev_b32_e64 v2, v1, s1 ; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s0 @@ -420,12 +405,11 @@ ; GFX7-LABEL: insertelement_v_v2i16_s_v: ; GFX7: ; %bb.0: ; GFX7-NEXT: flat_load_dword v0, v[0:1] -; GFX7-NEXT: s_mov_b32 s0, 0xffff ; GFX7-NEXT: v_and_b32_e32 v1, 1, v2 -; GFX7-NEXT: s_and_b32 s1, s2, s0 +; GFX7-NEXT: s_and_b32 s0, s2, 0xffff ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 -; GFX7-NEXT: v_lshl_b32_e32 v2, s1, v1 -; GFX7-NEXT: v_lshl_b32_e32 v1, s0, v1 +; GFX7-NEXT: v_lshl_b32_e32 v2, s0, v1 +; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1 ; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v3, v0, v1 @@ -439,10 +423,9 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: v_and_b32_e32 v0, 1, v2 -; GFX10-NEXT: s_mov_b32 s0, 0xffff +; GFX10-NEXT: s_and_b32 s0, s2, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX10-NEXT: v_lshlrev_b32_e64 v1, v0, s0 -; GFX10-NEXT: s_and_b32 s0, s2, s0 +; GFX10-NEXT: v_lshlrev_b32_e64 v1, v0, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e64 v2, v0, s0 ; GFX10-NEXT: v_xor_b32_e32 v4, -1, v1 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 @@ -463,9 +446,8 @@ ; GFX9-NEXT: global_load_dword v3, v[0:1], off ; GFX9-NEXT: s_and_b32 s0, s2, 1 ; GFX9-NEXT: s_lshl_b32 s0, s0, 4 -; GFX9-NEXT: s_mov_b32 s1, 0xffff ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX9-NEXT: s_lshl_b32 s0, s1, s0 +; GFX9-NEXT: s_lshl_b32 s0, 0xffff, s0 ; GFX9-NEXT: s_not_b32 s0, s0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 @@ -477,11 +459,10 @@ 
; GFX8-LABEL: insertelement_v_v2i16_v_s: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_and_b32 s1, s2, 1 -; GFX8-NEXT: s_mov_b32 s0, 0xffff -; GFX8-NEXT: s_lshl_b32 s1, s1, 4 -; GFX8-NEXT: s_lshl_b32 s0, s0, s1 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_and_b32 s0, s2, 1 +; GFX8-NEXT: s_lshl_b32 s0, s0, 4 +; GFX8-NEXT: v_mov_b32_e32 v1, s0 +; GFX8-NEXT: s_lshl_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_not_b32 s0, s0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -495,13 +476,12 @@ ; GFX7-LABEL: insertelement_v_v2i16_v_s: ; GFX7: ; %bb.0: ; GFX7-NEXT: flat_load_dword v0, v[0:1] -; GFX7-NEXT: s_and_b32 s1, s2, 1 -; GFX7-NEXT: s_mov_b32 s0, 0xffff -; GFX7-NEXT: s_lshl_b32 s1, s1, 4 -; GFX7-NEXT: v_and_b32_e32 v1, s0, v2 -; GFX7-NEXT: s_lshl_b32 s0, s0, s1 +; GFX7-NEXT: s_and_b32 s0, s2, 1 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX7-NEXT: s_lshl_b32 s0, s0, 4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, s0, v1 +; GFX7-NEXT: s_lshl_b32 s0, 0xffff, s0 ; GFX7-NEXT: s_not_b32 s0, s0 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, s1, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v3, s0, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, 0 @@ -514,12 +494,11 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: s_and_b32 s0, s2, 1 -; GFX10-NEXT: s_mov_b32 s1, 0xffff -; GFX10-NEXT: s_lshl_b32 s0, s0, 4 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: s_lshl_b32 s0, s1, s0 +; GFX10-NEXT: s_lshl_b32 s0, s0, 4 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: s_lshl_b32 s0, 0xffff, s0 ; GFX10-NEXT: s_not_b32 s0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_and_or_b32 v2, v3, s0, v2 @@ -587,9 +566,8 @@ ; GFX10: ; 
%bb.0: ; GFX10-NEXT: global_load_dword v4, v[0:1], off ; GFX10-NEXT: v_and_b32_e32 v0, 1, v3 -; GFX10-NEXT: s_mov_b32 s0, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX10-NEXT: v_lshlrev_b32_e64 v1, v0, s0 +; GFX10-NEXT: v_lshlrev_b32_e64 v1, v0, 0xffff ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_xor_b32_e32 v3, -1, v1 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 @@ -665,22 +643,21 @@ ; GFX9-LABEL: insertelement_v_v4i16_s_s: ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: s_mov_b32 s0, 0xffff -; GFX9-NEXT: s_lshr_b32 s1, s3, 1 -; GFX9-NEXT: s_and_b32 s3, s3, 1 -; GFX9-NEXT: s_and_b32 s2, s2, s0 -; GFX9-NEXT: s_lshl_b32 s3, s3, 4 -; GFX9-NEXT: s_lshl_b32 s2, s2, s3 -; GFX9-NEXT: s_lshl_b32 s0, s0, s3 -; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s1, 1 -; GFX9-NEXT: s_not_b32 s0, s0 +; GFX9-NEXT: s_and_b32 s1, s3, 1 +; GFX9-NEXT: s_lshr_b32 s0, s3, 1 +; GFX9-NEXT: s_and_b32 s2, s2, 0xffff +; GFX9-NEXT: s_lshl_b32 s1, s1, 4 +; GFX9-NEXT: s_lshl_b32 s2, s2, s1 +; GFX9-NEXT: s_lshl_b32 s1, 0xffff, s1 +; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 +; GFX9-NEXT: s_not_b32 s1, s1 ; GFX9-NEXT: v_mov_b32_e32 v4, s2 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc -; GFX9-NEXT: v_and_or_b32 v4, v5, s0, v4 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, 0 +; GFX9-NEXT: v_and_or_b32 v4, v5, s1, v4 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off @@ -689,22 +666,21 @@ ; GFX8-LABEL: insertelement_v_v4i16_s_s: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX8-NEXT: s_lshr_b32 s1, s3, 1 -; GFX8-NEXT: s_and_b32 s3, s3, 1 -; GFX8-NEXT: s_mov_b32 s0, 0xffff -; GFX8-NEXT: s_lshl_b32 s3, s3, 4 -; GFX8-NEXT: s_and_b32 s2, s2, s0 -; 
GFX8-NEXT: s_lshl_b32 s0, s0, s3 -; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s1, 1 -; GFX8-NEXT: s_not_b32 s0, s0 -; GFX8-NEXT: s_lshl_b32 s2, s2, s3 +; GFX8-NEXT: s_and_b32 s1, s3, 1 +; GFX8-NEXT: s_lshr_b32 s0, s3, 1 +; GFX8-NEXT: s_and_b32 s2, s2, 0xffff +; GFX8-NEXT: s_lshl_b32 s1, s1, 4 +; GFX8-NEXT: s_lshl_b32 s2, s2, s1 +; GFX8-NEXT: s_lshl_b32 s1, 0xffff, s1 +; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 +; GFX8-NEXT: s_not_b32 s1, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v4, s0, v4 +; GFX8-NEXT: v_and_b32_e32 v4, s1, v4 ; GFX8-NEXT: v_or_b32_e32 v4, s2, v4 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, 0 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -713,22 +689,21 @@ ; GFX7-LABEL: insertelement_v_v4i16_s_s: ; GFX7: ; %bb.0: ; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX7-NEXT: s_lshr_b32 s1, s3, 1 -; GFX7-NEXT: s_and_b32 s3, s3, 1 -; GFX7-NEXT: s_mov_b32 s0, 0xffff -; GFX7-NEXT: s_lshl_b32 s3, s3, 4 -; GFX7-NEXT: s_and_b32 s2, s2, s0 -; GFX7-NEXT: s_lshl_b32 s0, s0, s3 -; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s1, 1 -; GFX7-NEXT: s_not_b32 s0, s0 -; GFX7-NEXT: s_lshl_b32 s2, s2, s3 +; GFX7-NEXT: s_and_b32 s1, s3, 1 +; GFX7-NEXT: s_lshr_b32 s0, s3, 1 +; GFX7-NEXT: s_and_b32 s2, s2, 0xffff +; GFX7-NEXT: s_lshl_b32 s1, s1, 4 +; GFX7-NEXT: s_lshl_b32 s2, s2, s1 +; GFX7-NEXT: s_lshl_b32 s1, 0xffff, s1 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 +; GFX7-NEXT: s_not_b32 s1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, 0 ; GFX7-NEXT: v_mov_b32_e32 v3, 0 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v4, s0, v4 +; GFX7-NEXT: v_and_b32_e32 v4, s1, v4 ; GFX7-NEXT: v_or_b32_e32 v4, s2, v4 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, 0 +; GFX7-NEXT: 
v_cmp_eq_u32_e64 s[0:1], s0, 0 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -737,23 +712,22 @@ ; GFX10-LABEL: insertelement_v_v4i16_s_s: ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX10-NEXT: s_lshr_b32 s1, s3, 1 -; GFX10-NEXT: s_and_b32 s3, s3, 1 -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s1, 1 -; GFX10-NEXT: s_mov_b32 s0, 0xffff -; GFX10-NEXT: s_lshl_b32 s3, s3, 4 -; GFX10-NEXT: s_and_b32 s2, s2, s0 -; GFX10-NEXT: s_lshl_b32 s0, s0, s3 -; GFX10-NEXT: s_lshl_b32 s2, s2, s3 -; GFX10-NEXT: s_not_b32 s0, s0 +; GFX10-NEXT: s_lshr_b32 s0, s3, 1 +; GFX10-NEXT: s_and_b32 s1, s3, 1 +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 1 +; GFX10-NEXT: s_lshl_b32 s1, s1, 4 +; GFX10-NEXT: s_and_b32 s2, s2, 0xffff +; GFX10-NEXT: s_lshl_b32 s3, 0xffff, s1 +; GFX10-NEXT: s_lshl_b32 s1, s2, s1 +; GFX10-NEXT: s_not_b32 s2, s3 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s0, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc_lo -; GFX10-NEXT: v_and_or_b32 v4, v2, s0, s2 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s1, 0 +; GFX10-NEXT: v_and_or_b32 v4, v2, s2, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo ; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX10-NEXT: s_endpgm %vec = load <4 x i16>, <4 x i16> addrspace(1 )* %ptr @@ -768,18 +742,17 @@ ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX9-NEXT: s_lshr_b32 s2, s4, 1 ; GFX9-NEXT: s_cmp_eq_u32 s2, 1 -; GFX9-NEXT: s_mov_b32 s5, 0xffff -; GFX9-NEXT: v_and_b32_e32 v0, s5, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_cselect_b32 s3, s1, s0 ; GFX9-NEXT: s_and_b32 s4, s4, 1 ; GFX9-NEXT: s_lshl_b32 s4, s4, 4 -; GFX9-NEXT: 
s_lshl_b32 s5, s5, s4 +; GFX9-NEXT: s_lshl_b32 s5, 0xffff, s4 ; GFX9-NEXT: s_andn2_b32 s3, s3, s5 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_lshl_or_b32 v4, v0, s4, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -794,21 +767,20 @@ ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX8-NEXT: s_lshr_b32 s2, s4, 1 ; GFX8-NEXT: s_cmp_eq_u32 s2, 1 -; GFX8-NEXT: s_mov_b32 s5, 0xffff ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_cselect_b32 s3, s1, s0 ; GFX8-NEXT: s_and_b32 s4, s4, 1 ; GFX8-NEXT: s_lshl_b32 s4, s4, 4 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 -; GFX8-NEXT: s_lshl_b32 s4, s5, s4 +; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s4 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: s_andn2_b32 s3, s3, s4 ; GFX8-NEXT: v_or_b32_e32 v4, s3, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s2, 1 ; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc @@ -820,18 +792,17 @@ ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX7-NEXT: s_lshr_b32 s2, s4, 1 ; GFX7-NEXT: s_cmp_eq_u32 s2, 1 -; GFX7-NEXT: s_mov_b32 s5, 0xffff -; GFX7-NEXT: v_and_b32_e32 v0, s5, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_cselect_b32 s3, s1, s0 ; GFX7-NEXT: s_and_b32 s4, s4, 1 ; GFX7-NEXT: s_lshl_b32 s4, s4, 4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX7-NEXT: s_lshl_b32 s4, s5, s4 +; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s4 ; GFX7-NEXT: s_andn2_b32 s3, s3, s4 ; GFX7-NEXT: v_or_b32_e32 v4, s3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: 
v_cmp_eq_u32_e64 vcc, s2, 0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX7-NEXT: v_mov_b32_e32 v2, 0 @@ -845,9 +816,8 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX10-NEXT: s_lshr_b32 s2, s4, 1 -; GFX10-NEXT: s_mov_b32 s5, 0xffff +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v0 ; GFX10-NEXT: s_cmp_eq_u32 s2, 1 -; GFX10-NEXT: v_and_b32_e32 v2, s5, v0 ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_cselect_b32 s3, s1, s0 @@ -855,7 +825,7 @@ ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: s_lshl_b32 s4, s4, 4 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: s_lshl_b32 s5, s5, s4 +; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 ; GFX10-NEXT: s_andn2_b32 s3, s3, s5 ; GFX10-NEXT: v_lshl_or_b32 v4, v2, s4, s3 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -877,13 +847,13 @@ ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: s_mov_b32 s2, 0xffff ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX9-NEXT: s_and_b32 s3, s4, s2 +; GFX9-NEXT: s_mov_b32 s2, 0xffff +; GFX9-NEXT: s_and_b32 s3, s4, 0xffff ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_lshlrev_b32_e64 v3, v0, s3 ; GFX9-NEXT: v_lshlrev_b32_e64 v0, v0, s2 @@ -904,13 +874,13 @@ ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: s_mov_b32 s2, 0xffff ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s0 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX8-NEXT: s_and_b32 s3, s4, s2 +; GFX8-NEXT: s_mov_b32 s2, 0xffff +; GFX8-NEXT: 
s_and_b32 s3, s4, 0xffff ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX8-NEXT: v_lshlrev_b32_e64 v3, v0, s3 ; GFX8-NEXT: v_lshlrev_b32_e64 v0, v0, s2 @@ -932,16 +902,15 @@ ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 1, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX7-NEXT: s_mov_b32 s2, 0xffff ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s0 ; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX7-NEXT: s_and_b32 s3, s4, s2 +; GFX7-NEXT: s_and_b32 s2, s4, 0xffff ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX7-NEXT: v_lshl_b32_e32 v3, s3, v0 -; GFX7-NEXT: v_lshl_b32_e32 v0, s2, v0 +; GFX7-NEXT: v_lshl_b32_e32 v3, s2, v0 +; GFX7-NEXT: v_lshl_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX7-NEXT: v_and_b32_e32 v0, v1, v0 ; GFX7-NEXT: v_or_b32_e32 v4, v0, v3 @@ -960,12 +929,11 @@ ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX10-NEXT: v_and_b32_e32 v1, 1, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 1, v0 -; GFX10-NEXT: s_mov_b32 s2, 0xffff -; GFX10-NEXT: s_and_b32 s3, s4, s2 +; GFX10-NEXT: s_and_b32 s2, s4, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 -; GFX10-NEXT: v_lshlrev_b32_e64 v2, v1, s2 -; GFX10-NEXT: v_lshlrev_b32_e64 v3, v1, s3 +; GFX10-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff +; GFX10-NEXT: v_lshlrev_b32_e64 v3, v1, s2 ; GFX10-NEXT: v_xor_b32_e32 v2, -1, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s1 @@ -1073,10 +1041,9 @@ ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX10-NEXT: v_and_b32_e32 v2, 1, v1 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 1, v1 -; GFX10-NEXT: s_mov_b32 s2, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 4, v2 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 -; GFX10-NEXT: v_lshlrev_b32_e64 v3, v2, s2 +; GFX10-NEXT: v_lshlrev_b32_e64 v3, v2, 0xffff ; GFX10-NEXT: 
v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_xor_b32_e32 v3, -1, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1102,10 +1069,10 @@ ; GFX9-LABEL: insertelement_v_v4i16_s_v: ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: s_mov_b32 s0, 0xffff ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 1, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX9-NEXT: s_and_b32 s1, s2, s0 +; GFX9-NEXT: s_mov_b32 s0, 0xffff +; GFX9-NEXT: s_and_b32 s1, s2, 0xffff ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 4, v2 ; GFX9-NEXT: v_lshlrev_b32_e64 v6, v2, s1 ; GFX9-NEXT: v_lshlrev_b32_e64 v2, v2, s0 @@ -1125,10 +1092,10 @@ ; GFX8-LABEL: insertelement_v_v4i16_s_v: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX8-NEXT: s_mov_b32 s0, 0xffff ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 1, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX8-NEXT: s_and_b32 s1, s2, s0 +; GFX8-NEXT: s_mov_b32 s0, 0xffff +; GFX8-NEXT: s_and_b32 s1, s2, 0xffff ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 4, v2 ; GFX8-NEXT: v_lshlrev_b32_e64 v6, v2, s1 ; GFX8-NEXT: v_lshlrev_b32_e64 v2, v2, s0 @@ -1149,13 +1116,12 @@ ; GFX7-LABEL: insertelement_v_v4i16_s_v: ; GFX7: ; %bb.0: ; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX7-NEXT: s_mov_b32 s0, 0xffff ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 1, v2 ; GFX7-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX7-NEXT: s_and_b32 s1, s2, s0 +; GFX7-NEXT: s_and_b32 s0, s2, 0xffff ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 4, v2 -; GFX7-NEXT: v_lshl_b32_e32 v6, s1, v2 -; GFX7-NEXT: v_lshl_b32_e32 v2, s0, v2 +; GFX7-NEXT: v_lshl_b32_e32 v6, s0, v2 +; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v2 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 ; GFX7-NEXT: v_xor_b32_e32 v2, -1, v2 ; GFX7-NEXT: v_mov_b32_e32 v3, 0 @@ -1175,11 +1141,10 @@ ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX10-NEXT: v_and_b32_e32 v3, 1, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 1, v2 -; GFX10-NEXT: s_mov_b32 s0, 0xffff +; GFX10-NEXT: s_and_b32 s0, s2, 0xffff ; 
GFX10-NEXT: v_lshlrev_b32_e32 v3, 4, v3 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5 -; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX10-NEXT: s_and_b32 s0, s2, s0 +; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e64 v2, v3, s0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v5 ; GFX10-NEXT: v_xor_b32_e32 v3, -1, v4 @@ -1202,20 +1167,19 @@ ; GFX9-LABEL: insertelement_v_v4i16_v_s: ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: s_lshr_b32 s1, s2, 1 -; GFX9-NEXT: s_and_b32 s2, s2, 1 -; GFX9-NEXT: s_mov_b32 s0, 0xffff -; GFX9-NEXT: s_lshl_b32 s2, s2, 4 -; GFX9-NEXT: s_lshl_b32 s0, s0, s2 -; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s1, 1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX9-NEXT: s_not_b32 s0, s0 +; GFX9-NEXT: s_and_b32 s1, s2, 1 +; GFX9-NEXT: s_lshr_b32 s0, s2, 1 +; GFX9-NEXT: s_lshl_b32 s1, s1, 4 +; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX9-NEXT: s_lshl_b32 s1, 0xffff, s1 +; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 +; GFX9-NEXT: s_not_b32 s1, s1 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc -; GFX9-NEXT: v_and_or_b32 v2, v5, s0, v2 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, 0 +; GFX9-NEXT: v_and_or_b32 v2, v5, s1, v2 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX9-NEXT: global_store_dwordx2 v[3:4], v[0:1], off @@ -1224,22 +1188,21 @@ ; GFX8-LABEL: insertelement_v_v4i16_v_s: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX8-NEXT: s_lshr_b32 s1, s2, 1 -; GFX8-NEXT: s_and_b32 s2, s2, 1 -; GFX8-NEXT: s_mov_b32 s0, 0xffff -; GFX8-NEXT: s_lshl_b32 s2, s2, 4 -; GFX8-NEXT: v_mov_b32_e32 v5, s2 -; GFX8-NEXT: s_lshl_b32 s0, s0, s2 -; GFX8-NEXT: v_cmp_eq_u32_e64 
vcc, s1, 1 +; GFX8-NEXT: s_and_b32 s1, s2, 1 +; GFX8-NEXT: s_lshr_b32 s0, s2, 1 +; GFX8-NEXT: s_lshl_b32 s1, s1, 4 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: s_lshl_b32 s1, 0xffff, s1 +; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: s_not_b32 s0, s0 +; GFX8-NEXT: s_not_b32 s1, s1 ; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: v_mov_b32_e32 v4, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v5, s0, v5 +; GFX8-NEXT: v_and_b32_e32 v5, s1, v5 ; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, 0 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX8-NEXT: flat_store_dwordx2 v[3:4], v[0:1] @@ -1248,22 +1211,21 @@ ; GFX7-LABEL: insertelement_v_v4i16_v_s: ; GFX7: ; %bb.0: ; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX7-NEXT: s_lshr_b32 s1, s2, 1 -; GFX7-NEXT: s_and_b32 s2, s2, 1 -; GFX7-NEXT: s_mov_b32 s0, 0xffff -; GFX7-NEXT: s_lshl_b32 s2, s2, 4 -; GFX7-NEXT: v_and_b32_e32 v2, s0, v2 -; GFX7-NEXT: s_lshl_b32 s0, s0, s2 -; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s1, 1 -; GFX7-NEXT: s_not_b32 s0, s0 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, s2, v2 +; GFX7-NEXT: s_and_b32 s1, s2, 1 +; GFX7-NEXT: s_lshr_b32 s0, s2, 1 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX7-NEXT: s_lshl_b32 s1, s1, 4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, s1, v2 +; GFX7-NEXT: s_lshl_b32 s1, 0xffff, s1 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 +; GFX7-NEXT: s_not_b32 s1, s1 ; GFX7-NEXT: v_mov_b32_e32 v3, 0 ; GFX7-NEXT: v_mov_b32_e32 v4, 0 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v5, s0, v5 +; GFX7-NEXT: v_and_b32_e32 v5, s1, v5 ; GFX7-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, 0 +; GFX7-NEXT: v_cmp_eq_u32_e64 
s[0:1], s0, 0 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX7-NEXT: flat_store_dwordx2 v[3:4], v[0:1] @@ -1272,18 +1234,17 @@ ; GFX10-LABEL: insertelement_v_v4i16_v_s: ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX10-NEXT: s_lshr_b32 s1, s2, 1 ; GFX10-NEXT: s_and_b32 s0, s2, 1 -; GFX10-NEXT: s_lshr_b32 s2, s2, 1 +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s1, 1 ; GFX10-NEXT: s_lshl_b32 s0, s0, 4 -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 1 -; GFX10-NEXT: s_mov_b32 s1, 0xffff ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: s_lshl_b32 s0, s1, s0 +; GFX10-NEXT: s_lshl_b32 s0, 0xffff, s0 ; GFX10-NEXT: s_not_b32 s0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc_lo ; GFX10-NEXT: v_and_or_b32 v4, v3, s0, v2 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s2, 0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s1, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo @@ -1371,12 +1332,11 @@ ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX10-NEXT: v_and_b32_e32 v4, 1, v3 ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 1, v3 -; GFX10-NEXT: s_mov_b32 s0, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 4, v4 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 -; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, s0 -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v6 +; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_xor_b32_e32 v3, -1, v5 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc_lo @@ -1399,8 +1359,8 @@ ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX9-NEXT: s_lshr_b32 s6, s5, 1 ; GFX9-NEXT: 
s_cmp_eq_u32 s6, 1 -; GFX9-NEXT: s_mov_b32 s8, 0xffff ; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_cselect_b32 s7, s1, s0 ; GFX9-NEXT: s_cmp_eq_u32 s6, 2 @@ -1409,9 +1369,9 @@ ; GFX9-NEXT: s_cselect_b32 s7, s3, s7 ; GFX9-NEXT: s_and_b32 s5, s5, 1 ; GFX9-NEXT: s_lshl_b32 s5, s5, 4 -; GFX9-NEXT: s_and_b32 s4, s4, s8 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff ; GFX9-NEXT: s_lshl_b32 s4, s4, s5 -; GFX9-NEXT: s_lshl_b32 s5, s8, s5 +; GFX9-NEXT: s_lshl_b32 s5, 0xffff, s5 ; GFX9-NEXT: s_andn2_b32 s5, s7, s5 ; GFX9-NEXT: s_or_b32 s4, s5, s4 ; GFX9-NEXT: s_cmp_eq_u32 s6, 0 @@ -1423,7 +1383,6 @@ ; GFX9-NEXT: s_cmp_eq_u32 s6, 3 ; GFX9-NEXT: s_cselect_b32 s3, s4, s3 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 @@ -1435,8 +1394,8 @@ ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX8-NEXT: s_lshr_b32 s6, s5, 1 ; GFX8-NEXT: s_cmp_eq_u32 s6, 1 -; GFX8-NEXT: s_mov_b32 s8, 0xffff ; GFX8-NEXT: v_mov_b32_e32 v4, 0 +; GFX8-NEXT: v_mov_b32_e32 v5, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_cselect_b32 s7, s1, s0 ; GFX8-NEXT: s_cmp_eq_u32 s6, 2 @@ -1445,9 +1404,9 @@ ; GFX8-NEXT: s_cselect_b32 s7, s3, s7 ; GFX8-NEXT: s_and_b32 s5, s5, 1 ; GFX8-NEXT: s_lshl_b32 s5, s5, 4 -; GFX8-NEXT: s_and_b32 s4, s4, s8 +; GFX8-NEXT: s_and_b32 s4, s4, 0xffff ; GFX8-NEXT: s_lshl_b32 s4, s4, s5 -; GFX8-NEXT: s_lshl_b32 s5, s8, s5 +; GFX8-NEXT: s_lshl_b32 s5, 0xffff, s5 ; GFX8-NEXT: s_andn2_b32 s5, s7, s5 ; GFX8-NEXT: s_or_b32 s4, s5, s4 ; GFX8-NEXT: s_cmp_eq_u32 s6, 0 @@ -1459,7 +1418,6 @@ ; GFX8-NEXT: s_cmp_eq_u32 s6, 3 ; GFX8-NEXT: s_cselect_b32 s3, s4, s3 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v5, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 @@ -1471,7 +1429,6 @@ ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX7-NEXT: 
s_lshr_b32 s6, s5, 1 ; GFX7-NEXT: s_cmp_eq_u32 s6, 1 -; GFX7-NEXT: s_mov_b32 s8, 0xffff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_cselect_b32 s7, s1, s0 ; GFX7-NEXT: s_cmp_eq_u32 s6, 2 @@ -1480,9 +1437,9 @@ ; GFX7-NEXT: s_cselect_b32 s7, s3, s7 ; GFX7-NEXT: s_and_b32 s5, s5, 1 ; GFX7-NEXT: s_lshl_b32 s5, s5, 4 -; GFX7-NEXT: s_and_b32 s4, s4, s8 +; GFX7-NEXT: s_and_b32 s4, s4, 0xffff ; GFX7-NEXT: s_lshl_b32 s4, s4, s5 -; GFX7-NEXT: s_lshl_b32 s5, s8, s5 +; GFX7-NEXT: s_lshl_b32 s5, 0xffff, s5 ; GFX7-NEXT: s_andn2_b32 s5, s7, s5 ; GFX7-NEXT: s_or_b32 s4, s5, s4 ; GFX7-NEXT: s_cmp_eq_u32 s6, 0 @@ -1507,9 +1464,8 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX10-NEXT: s_lshr_b32 s6, s5, 1 -; GFX10-NEXT: s_mov_b32 s8, 0xffff -; GFX10-NEXT: s_cmp_eq_u32 s6, 1 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: s_cmp_eq_u32 s6, 1 ; GFX10-NEXT: v_mov_b32_e32 v5, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_cselect_b32 s7, s1, s0 @@ -1518,9 +1474,9 @@ ; GFX10-NEXT: s_cmp_eq_u32 s6, 3 ; GFX10-NEXT: s_cselect_b32 s7, s3, s7 ; GFX10-NEXT: s_and_b32 s5, s5, 1 -; GFX10-NEXT: s_and_b32 s4, s4, s8 +; GFX10-NEXT: s_and_b32 s4, s4, 0xffff ; GFX10-NEXT: s_lshl_b32 s5, s5, 4 -; GFX10-NEXT: s_lshl_b32 s8, s8, s5 +; GFX10-NEXT: s_lshl_b32 s8, 0xffff, s5 ; GFX10-NEXT: s_lshl_b32 s4, s4, s5 ; GFX10-NEXT: s_andn2_b32 s5, s7, s8 ; GFX10-NEXT: s_or_b32 s4, s5, s4 @@ -1548,17 +1504,16 @@ ; GFX9-LABEL: insertelement_v_v8i16_s_s: ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX9-NEXT: s_and_b32 s1, s3, 1 -; GFX9-NEXT: s_mov_b32 s0, 0xffff +; GFX9-NEXT: s_and_b32 s0, s3, 1 ; GFX9-NEXT: s_lshr_b32 s4, s3, 1 -; GFX9-NEXT: s_lshl_b32 s1, s1, 4 -; GFX9-NEXT: s_and_b32 s2, s2, s0 -; GFX9-NEXT: s_lshl_b32 s0, s0, s1 +; GFX9-NEXT: s_and_b32 s1, s2, 0xffff +; GFX9-NEXT: s_lshl_b32 s0, s0, 4 +; GFX9-NEXT: s_lshl_b32 s1, s1, s0 +; GFX9-NEXT: s_lshl_b32 s0, 0xffff, s0 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 -; GFX9-NEXT: s_lshl_b32 s2, 
s2, s1 ; GFX9-NEXT: s_not_b32 s5, s0 +; GFX9-NEXT: v_mov_b32_e32 v6, s1 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 -; GFX9-NEXT: v_mov_b32_e32 v6, s2 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 @@ -1578,14 +1533,13 @@ ; GFX8-LABEL: insertelement_v_v8i16_s_s: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; GFX8-NEXT: s_and_b32 s1, s3, 1 -; GFX8-NEXT: s_mov_b32 s0, 0xffff +; GFX8-NEXT: s_and_b32 s0, s3, 1 ; GFX8-NEXT: s_lshr_b32 s4, s3, 1 -; GFX8-NEXT: s_lshl_b32 s1, s1, 4 -; GFX8-NEXT: s_and_b32 s2, s2, s0 -; GFX8-NEXT: s_lshl_b32 s0, s0, s1 +; GFX8-NEXT: s_and_b32 s1, s2, 0xffff +; GFX8-NEXT: s_lshl_b32 s0, s0, 4 +; GFX8-NEXT: s_lshl_b32 s5, s1, s0 +; GFX8-NEXT: s_lshl_b32 s0, 0xffff, s0 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 -; GFX8-NEXT: s_lshl_b32 s5, s2, s1 ; GFX8-NEXT: s_not_b32 s6, s0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 @@ -1611,14 +1565,13 @@ ; GFX7-NEXT: s_mov_b32 s11, 0xf000 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_and_b32 s1, s3, 1 -; GFX7-NEXT: s_mov_b32 s0, 0xffff +; GFX7-NEXT: s_and_b32 s0, s3, 1 ; GFX7-NEXT: s_lshr_b32 s4, s3, 1 -; GFX7-NEXT: s_lshl_b32 s1, s1, 4 -; GFX7-NEXT: s_and_b32 s2, s2, s0 -; GFX7-NEXT: s_lshl_b32 s0, s0, s1 +; GFX7-NEXT: s_and_b32 s1, s2, 0xffff +; GFX7-NEXT: s_lshl_b32 s0, s0, 4 +; GFX7-NEXT: s_lshl_b32 s5, s1, s0 +; GFX7-NEXT: s_lshl_b32 s0, 0xffff, s0 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 -; GFX7-NEXT: s_lshl_b32 s5, s2, s1 ; GFX7-NEXT: s_not_b32 s6, s0 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 @@ -1646,9 +1599,8 @@ ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s4, 2 ; GFX10-NEXT: s_lshl_b32 s3, s1, 4 ; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s4, 3 -; GFX10-NEXT: s_mov_b32 s5, 0xffff -; GFX10-NEXT: s_and_b32 s2, s2, s5 -; GFX10-NEXT: s_lshl_b32 s5, s5, s3 +; GFX10-NEXT: s_and_b32 
s2, s2, 0xffff +; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s3 ; GFX10-NEXT: s_lshl_b32 s2, s2, s3 ; GFX10-NEXT: s_not_b32 s3, s5 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -1677,8 +1629,8 @@ ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX9-NEXT: s_lshr_b32 s5, s4, 1 ; GFX9-NEXT: s_cmp_eq_u32 s5, 1 -; GFX9-NEXT: s_mov_b32 s7, 0xffff -; GFX9-NEXT: v_and_b32_e32 v0, s7, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s5, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_cselect_b32 s6, s1, s0 ; GFX9-NEXT: s_cmp_eq_u32 s5, 2 @@ -1687,12 +1639,11 @@ ; GFX9-NEXT: s_cselect_b32 s6, s3, s6 ; GFX9-NEXT: s_and_b32 s4, s4, 1 ; GFX9-NEXT: s_lshl_b32 s4, s4, 4 -; GFX9-NEXT: s_lshl_b32 s7, s7, s4 +; GFX9-NEXT: s_lshl_b32 s7, 0xffff, s4 ; GFX9-NEXT: s_andn2_b32 s6, s6, s7 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: v_lshl_or_b32 v6, v0, s4, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s5, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s5, 1 @@ -1713,8 +1664,8 @@ ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX8-NEXT: s_lshr_b32 s5, s4, 1 ; GFX8-NEXT: s_cmp_eq_u32 s5, 1 -; GFX8-NEXT: s_mov_b32 s7, 0xffff ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_cselect_b32 s6, s1, s0 ; GFX8-NEXT: s_cmp_eq_u32 s5, 2 @@ -1724,7 +1675,7 @@ ; GFX8-NEXT: s_and_b32 s4, s4, 1 ; GFX8-NEXT: s_lshl_b32 s4, s4, 4 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 -; GFX8-NEXT: s_lshl_b32 s4, s7, s4 +; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s4 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: s_andn2_b32 s4, s6, s4 ; GFX8-NEXT: v_or_b32_e32 v6, s4, v0 @@ -1737,7 +1688,6 @@ ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s5, 2 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc -; GFX8-NEXT: v_mov_b32_e32 v4, 0 ; GFX8-NEXT: 
v_cmp_eq_u32_e64 vcc, s5, 3 ; GFX8-NEXT: v_mov_b32_e32 v5, 0 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc @@ -1749,8 +1699,8 @@ ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX7-NEXT: s_lshr_b32 s5, s4, 1 ; GFX7-NEXT: s_cmp_eq_u32 s5, 1 -; GFX7-NEXT: s_mov_b32 s7, 0xffff -; GFX7-NEXT: v_and_b32_e32 v0, s7, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s5, 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_cselect_b32 s6, s1, s0 ; GFX7-NEXT: s_cmp_eq_u32 s5, 2 @@ -1760,11 +1710,10 @@ ; GFX7-NEXT: s_and_b32 s4, s4, 1 ; GFX7-NEXT: s_lshl_b32 s4, s4, 4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX7-NEXT: s_lshl_b32 s4, s7, s4 +; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s4 ; GFX7-NEXT: s_andn2_b32 s4, s6, s4 ; GFX7-NEXT: v_or_b32_e32 v4, s4, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s5, 0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s5, 1 @@ -1785,9 +1734,8 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX10-NEXT: s_lshr_b32 s5, s4, 1 -; GFX10-NEXT: s_mov_b32 s7, 0xffff +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff, v0 ; GFX10-NEXT: s_cmp_eq_u32 s5, 1 -; GFX10-NEXT: v_and_b32_e32 v4, s7, v0 ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s5, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_cselect_b32 s6, s1, s0 @@ -1801,7 +1749,7 @@ ; GFX10-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-NEXT: s_lshl_b32 s4, s4, 4 ; GFX10-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-NEXT: s_lshl_b32 s7, s7, s4 +; GFX10-NEXT: s_lshl_b32 s7, 0xffff, s4 ; GFX10-NEXT: s_andn2_b32 s6, s6, s7 ; GFX10-NEXT: v_lshl_or_b32 v6, v4, s4, s6 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 @@ -1828,15 +1776,15 @@ ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: s_mov_b32 s5, 0xffff +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s8 ; 
GFX9-NEXT: v_mov_b32_e32 v2, s9 ; GFX9-NEXT: v_mov_b32_e32 v3, s10 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX9-NEXT: s_and_b32 s4, s4, s5 +; GFX9-NEXT: s_mov_b32 s5, 0xffff +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff ; GFX9-NEXT: v_mov_b32_e32 v5, s11 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4 @@ -1865,15 +1813,15 @@ ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 1, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: s_mov_b32 s5, 0xffff +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s8 ; GFX8-NEXT: v_mov_b32_e32 v2, s9 ; GFX8-NEXT: v_mov_b32_e32 v3, s10 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX8-NEXT: s_and_b32 s4, s4, s5 +; GFX8-NEXT: s_mov_b32 s5, 0xffff +; GFX8-NEXT: s_and_b32 s4, s4, 0xffff ; GFX8-NEXT: v_mov_b32_e32 v5, s11 ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4 @@ -1903,20 +1851,19 @@ ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 1, v0 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX7-NEXT: s_mov_b32 s5, 0xffff +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s9 ; GFX7-NEXT: v_mov_b32_e32 v3, s10 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX7-NEXT: s_and_b32 s4, s4, s5 +; GFX7-NEXT: s_and_b32 s4, s4, 0xffff ; GFX7-NEXT: v_mov_b32_e32 v5, s11 ; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] ; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v2, s4, v0 -; GFX7-NEXT: v_lshl_b32_e32 v0, s5, v0 +; GFX7-NEXT: v_lshl_b32_e32 v0, 0xffff, v0 ; 
GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3] ; GFX7-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX7-NEXT: v_and_b32_e32 v0, v1, v0 @@ -1939,15 +1886,14 @@ ; GFX10-LABEL: insertelement_s_v8i16_s_v: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0 -; GFX10-NEXT: v_and_b32_e32 v1, 1, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 1, v0 -; GFX10-NEXT: s_mov_b32 s0, 0xffff -; GFX10-NEXT: s_and_b32 s1, s4, s0 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 4, v1 +; GFX10-NEXT: v_and_b32_e32 v1, 1, v0 +; GFX10-NEXT: s_and_b32 s1, s4, 0xffff ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 -; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v6 -; GFX10-NEXT: v_lshlrev_b32_e64 v2, v1, s0 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v6 +; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v6 +; GFX10-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v1, s1 ; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v6 ; GFX10-NEXT: v_xor_b32_e32 v5, -1, v2 @@ -2091,15 +2037,15 @@ ; GFX10-LABEL: insertelement_s_v8i16_v_v: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 -; GFX10-NEXT: v_and_b32_e32 v2, 1, v1 ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 1, v1 -; GFX10-NEXT: s_mov_b32 s0, 0xffff -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 4, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 1, v1 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 4, v2 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v6 ; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v6 +; GFX10-NEXT: s_mov_b32 null, 0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v6 -; GFX10-NEXT: v_lshlrev_b32_e64 v3, v2, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v6 +; GFX10-NEXT: v_lshlrev_b32_e64 v3, v2, 0xffff ; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_xor_b32_e32 v5, -1, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2130,10 +2076,10 @@ ; GFX9-LABEL: insertelement_v_v8i16_s_v: ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dwordx4 v[3:6], v[0:1], off -; 
GFX9-NEXT: s_mov_b32 s0, 0xffff ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 1, v2 ; GFX9-NEXT: v_and_b32_e32 v1, 1, v2 -; GFX9-NEXT: s_and_b32 s1, s2, s0 +; GFX9-NEXT: s_mov_b32 s0, 0xffff +; GFX9-NEXT: s_and_b32 s1, s2, 0xffff ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-NEXT: v_lshlrev_b32_e64 v2, v1, s1 @@ -2159,10 +2105,10 @@ ; GFX8-LABEL: insertelement_v_v8i16_s_v: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dwordx4 v[3:6], v[0:1] -; GFX8-NEXT: s_mov_b32 s0, 0xffff ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 1, v2 ; GFX8-NEXT: v_and_b32_e32 v1, 1, v2 -; GFX8-NEXT: s_and_b32 s1, s2, s0 +; GFX8-NEXT: s_mov_b32 s0, 0xffff +; GFX8-NEXT: s_and_b32 s1, s2, 0xffff ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX8-NEXT: v_lshlrev_b32_e64 v2, v1, s1 @@ -2192,15 +2138,14 @@ ; GFX7-NEXT: s_mov_b32 s11, 0xf000 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b32 s0, 0xffff ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 1, v2 ; GFX7-NEXT: v_and_b32_e32 v1, 1, v2 -; GFX7-NEXT: s_and_b32 s1, s2, s0 +; GFX7-NEXT: s_and_b32 s0, s2, 0xffff ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX7-NEXT: v_lshl_b32_e32 v2, s1, v1 -; GFX7-NEXT: v_lshl_b32_e32 v1, s0, v1 +; GFX7-NEXT: v_lshl_b32_e32 v2, s0, v1 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0 +; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0 ; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 @@ -2221,15 +2166,14 @@ ; GFX10-LABEL: insertelement_v_v8i16_s_v: ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_dwordx4 v[3:6], v[0:1], off -; GFX10-NEXT: v_and_b32_e32 v0, 1, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 1, v2 -; GFX10-NEXT: s_mov_b32 s0, 0xffff -; GFX10-NEXT: s_and_b32 s1, s2, s0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 1, v2 +; GFX10-NEXT: s_and_b32 s1, s2, 0xffff ; 
GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v1 -; GFX10-NEXT: v_lshlrev_b32_e64 v7, v0, s0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v1 +; GFX10-NEXT: v_lshlrev_b32_e64 v7, v0, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e64 v0, v0, s1 ; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v1 ; GFX10-NEXT: v_xor_b32_e32 v7, -1, v7 @@ -2256,13 +2200,12 @@ ; GFX9-LABEL: insertelement_v_v8i16_v_s: ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dwordx4 v[3:6], v[0:1], off -; GFX9-NEXT: s_and_b32 s1, s2, 1 -; GFX9-NEXT: s_mov_b32 s0, 0xffff +; GFX9-NEXT: s_and_b32 s0, s2, 1 ; GFX9-NEXT: s_lshr_b32 s4, s2, 1 -; GFX9-NEXT: s_lshl_b32 s1, s1, 4 -; GFX9-NEXT: s_lshl_b32 s0, s0, s1 +; GFX9-NEXT: s_lshl_b32 s0, s0, 4 +; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX9-NEXT: s_lshl_b32 s0, 0xffff, s0 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX9-NEXT: s_not_b32 s5, s0 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 @@ -2284,13 +2227,12 @@ ; GFX8-LABEL: insertelement_v_v8i16_v_s: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dwordx4 v[3:6], v[0:1] -; GFX8-NEXT: s_and_b32 s1, s2, 1 -; GFX8-NEXT: s_mov_b32 s0, 0xffff +; GFX8-NEXT: s_and_b32 s0, s2, 1 ; GFX8-NEXT: s_lshr_b32 s4, s2, 1 -; GFX8-NEXT: s_lshl_b32 s1, s1, 4 -; GFX8-NEXT: s_lshl_b32 s0, s0, s1 +; GFX8-NEXT: s_lshl_b32 s0, s0, 4 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: s_lshl_b32 s0, 0xffff, s0 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 -; GFX8-NEXT: v_mov_b32_e32 v0, s1 ; GFX8-NEXT: s_not_b32 s5, s0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 @@ -2317,14 +2259,13 @@ ; GFX7-NEXT: s_mov_b32 s11, 0xf000 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: buffer_load_dwordx4 v[3:6], 
v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_and_b32 s1, s2, 1 -; GFX7-NEXT: s_mov_b32 s0, 0xffff +; GFX7-NEXT: s_and_b32 s0, s2, 1 ; GFX7-NEXT: s_lshr_b32 s4, s2, 1 -; GFX7-NEXT: s_lshl_b32 s1, s1, 4 -; GFX7-NEXT: v_and_b32_e32 v0, s0, v2 -; GFX7-NEXT: s_lshl_b32 s0, s0, s1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX7-NEXT: s_lshl_b32 s0, s0, 4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s0, v0 +; GFX7-NEXT: s_lshl_b32 s0, 0xffff, s0 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s1, v0 ; GFX7-NEXT: s_not_b32 s5, s0 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 @@ -2352,9 +2293,8 @@ ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s3, 2 ; GFX10-NEXT: s_lshl_b32 s2, s1, 4 ; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s3, 3 -; GFX10-NEXT: s_mov_b32 s4, 0xffff ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: s_lshl_b32 s2, s4, s2 +; GFX10-NEXT: s_lshl_b32 s2, 0xffff, s2 ; GFX10-NEXT: v_mov_b32_e32 v7, 0 ; GFX10-NEXT: s_not_b32 s2, s2 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 @@ -2469,15 +2409,14 @@ ; GFX10-LABEL: insertelement_v_v8i16_v_v: ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_dwordx4 v[4:7], v[0:1], off -; GFX10-NEXT: v_and_b32_e32 v0, 1, v3 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 1, v3 -; GFX10-NEXT: s_mov_b32 s0, 0xffff -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 1, v3 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v1 ; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v1 ; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v1 -; GFX10-NEXT: v_lshlrev_b32_e64 v8, v0, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v1 +; GFX10-NEXT: v_lshlrev_b32_e64 v8, v0, 0xffff ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_xor_b32_e32 v2, -1, v8 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 @@ -2505,8 +2444,8 @@ ; 
GFX9-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 ; GFX9-NEXT: s_lshr_b32 s7, s5, 1 ; GFX9-NEXT: s_cmp_eq_u32 s7, 1 -; GFX9-NEXT: s_mov_b32 s2, 0xffff ; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_cselect_b32 s0, s9, s8 ; GFX9-NEXT: s_cmp_eq_u32 s7, 2 @@ -2523,11 +2462,11 @@ ; GFX9-NEXT: s_cselect_b32 s0, s15, s0 ; GFX9-NEXT: s_and_b32 s1, s5, 1 ; GFX9-NEXT: s_lshl_b32 s1, s1, 4 -; GFX9-NEXT: s_and_b32 s3, s4, s2 -; GFX9-NEXT: s_lshl_b32 s3, s3, s1 -; GFX9-NEXT: s_lshl_b32 s1, s2, s1 +; GFX9-NEXT: s_and_b32 s2, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s2, s2, s1 +; GFX9-NEXT: s_lshl_b32 s1, 0xffff, s1 ; GFX9-NEXT: s_andn2_b32 s0, s0, s1 -; GFX9-NEXT: s_or_b32 s16, s0, s3 +; GFX9-NEXT: s_or_b32 s16, s0, s2 ; GFX9-NEXT: s_cmp_eq_u32 s7, 0 ; GFX9-NEXT: s_cselect_b32 s0, s16, s8 ; GFX9-NEXT: s_cmp_eq_u32 s7, 1 @@ -2544,7 +2483,6 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: s_cselect_b32 s6, s16, s14 ; GFX9-NEXT: s_cmp_eq_u32 s7, 7 -; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 @@ -2564,8 +2502,8 @@ ; GFX8-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 ; GFX8-NEXT: s_lshr_b32 s7, s5, 1 ; GFX8-NEXT: s_cmp_eq_u32 s7, 1 -; GFX8-NEXT: s_mov_b32 s2, 0xffff ; GFX8-NEXT: v_mov_b32_e32 v4, 0 +; GFX8-NEXT: v_mov_b32_e32 v5, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_cselect_b32 s0, s9, s8 ; GFX8-NEXT: s_cmp_eq_u32 s7, 2 @@ -2582,11 +2520,11 @@ ; GFX8-NEXT: s_cselect_b32 s0, s15, s0 ; GFX8-NEXT: s_and_b32 s1, s5, 1 ; GFX8-NEXT: s_lshl_b32 s1, s1, 4 -; GFX8-NEXT: s_and_b32 s3, s4, s2 -; GFX8-NEXT: s_lshl_b32 s3, s3, s1 -; GFX8-NEXT: s_lshl_b32 s1, s2, s1 +; GFX8-NEXT: s_and_b32 s2, s4, 0xffff +; GFX8-NEXT: s_lshl_b32 s2, s2, s1 +; GFX8-NEXT: s_lshl_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_andn2_b32 s0, s0, s1 -; GFX8-NEXT: s_or_b32 s16, s0, s3 +; GFX8-NEXT: s_or_b32 s16, s0, s2 ; GFX8-NEXT: s_cmp_eq_u32 s7, 0 ; GFX8-NEXT: 
s_cselect_b32 s0, s16, s8 ; GFX8-NEXT: s_cmp_eq_u32 s7, 1 @@ -2603,7 +2541,6 @@ ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: s_cselect_b32 s6, s16, s14 ; GFX8-NEXT: s_cmp_eq_u32 s7, 7 -; GFX8-NEXT: v_mov_b32_e32 v5, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 @@ -2623,7 +2560,6 @@ ; GFX7-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 ; GFX7-NEXT: s_lshr_b32 s7, s5, 1 ; GFX7-NEXT: s_cmp_eq_u32 s7, 1 -; GFX7-NEXT: s_mov_b32 s2, 0xffff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_cselect_b32 s0, s9, s8 ; GFX7-NEXT: s_cmp_eq_u32 s7, 2 @@ -2640,11 +2576,11 @@ ; GFX7-NEXT: s_cselect_b32 s0, s15, s0 ; GFX7-NEXT: s_and_b32 s1, s5, 1 ; GFX7-NEXT: s_lshl_b32 s1, s1, 4 -; GFX7-NEXT: s_and_b32 s3, s4, s2 -; GFX7-NEXT: s_lshl_b32 s3, s3, s1 -; GFX7-NEXT: s_lshl_b32 s1, s2, s1 +; GFX7-NEXT: s_and_b32 s2, s4, 0xffff +; GFX7-NEXT: s_lshl_b32 s2, s2, s1 +; GFX7-NEXT: s_lshl_b32 s1, 0xffff, s1 ; GFX7-NEXT: s_andn2_b32 s0, s0, s1 -; GFX7-NEXT: s_or_b32 s16, s0, s3 +; GFX7-NEXT: s_or_b32 s16, s0, s2 ; GFX7-NEXT: s_cmp_eq_u32 s7, 0 ; GFX7-NEXT: s_cselect_b32 s0, s16, s8 ; GFX7-NEXT: s_cmp_eq_u32 s7, 1 @@ -2681,9 +2617,8 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 ; GFX10-NEXT: s_lshr_b32 s7, s5, 1 -; GFX10-NEXT: s_mov_b32 s2, 0xffff -; GFX10-NEXT: s_cmp_eq_u32 s7, 1 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 +; GFX10-NEXT: s_cmp_eq_u32 s7, 1 ; GFX10-NEXT: v_mov_b32_e32 v9, 0 ; GFX10-NEXT: v_mov_b32_e32 v10, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2701,11 +2636,11 @@ ; GFX10-NEXT: s_cmp_eq_u32 s7, 7 ; GFX10-NEXT: s_cselect_b32 s0, s15, s0 ; GFX10-NEXT: s_and_b32 s1, s5, 1 -; GFX10-NEXT: s_and_b32 s3, s4, s2 +; GFX10-NEXT: s_and_b32 s2, s4, 0xffff ; GFX10-NEXT: s_lshl_b32 s1, s1, 4 -; GFX10-NEXT: s_lshl_b32 s2, s2, s1 -; GFX10-NEXT: s_lshl_b32 s1, s3, s1 -; GFX10-NEXT: s_andn2_b32 s0, s0, s2 +; GFX10-NEXT: s_lshl_b32 s3, 0xffff, s1 +; GFX10-NEXT: s_lshl_b32 s1, s2, s1 +; GFX10-NEXT: s_andn2_b32 s0, s0, s3 ; 
GFX10-NEXT: s_or_b32 s16, s0, s1 ; GFX10-NEXT: s_cmp_eq_u32 s7, 0 ; GFX10-NEXT: s_cselect_b32 s0, s16, s8 @@ -2746,17 +2681,16 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off ; GFX9-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16 -; GFX9-NEXT: s_and_b32 s1, s3, 1 -; GFX9-NEXT: s_mov_b32 s0, 0xffff +; GFX9-NEXT: s_and_b32 s0, s3, 1 ; GFX9-NEXT: s_lshr_b32 s12, s3, 1 -; GFX9-NEXT: s_lshl_b32 s1, s1, 4 -; GFX9-NEXT: s_and_b32 s2, s2, s0 -; GFX9-NEXT: s_lshl_b32 s0, s0, s1 +; GFX9-NEXT: s_and_b32 s1, s2, 0xffff +; GFX9-NEXT: s_lshl_b32 s0, s0, 4 +; GFX9-NEXT: s_lshl_b32 s1, s1, s0 +; GFX9-NEXT: s_lshl_b32 s0, 0xffff, s0 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s12, 1 -; GFX9-NEXT: s_lshl_b32 s2, s2, s1 ; GFX9-NEXT: s_not_b32 s13, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s12, 2 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s12, 3 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], s12, 4 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], s12, 5 @@ -2795,14 +2729,13 @@ ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[4:5] -; GFX8-NEXT: s_and_b32 s1, s3, 1 -; GFX8-NEXT: s_mov_b32 s0, 0xffff +; GFX8-NEXT: s_and_b32 s0, s3, 1 ; GFX8-NEXT: s_lshr_b32 s12, s3, 1 -; GFX8-NEXT: s_lshl_b32 s1, s1, 4 -; GFX8-NEXT: s_and_b32 s2, s2, s0 -; GFX8-NEXT: s_lshl_b32 s0, s0, s1 +; GFX8-NEXT: s_and_b32 s1, s2, 0xffff +; GFX8-NEXT: s_lshl_b32 s0, s0, 4 +; GFX8-NEXT: s_lshl_b32 s13, s1, s0 +; GFX8-NEXT: s_lshl_b32 s0, 0xffff, s0 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s12, 1 -; GFX8-NEXT: s_lshl_b32 s13, s2, s1 ; GFX8-NEXT: s_not_b32 s14, s0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s12, 2 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], s12, 3 @@ -2845,14 +2778,13 @@ ; GFX7-NEXT: s_mov_b64 s[16:17], 0 ; GFX7-NEXT: buffer_load_dwordx4 v[2:5], v[0:1], s[16:19], 0 addr64 ; GFX7-NEXT: buffer_load_dwordx4 v[6:9], v[0:1], s[16:19], 0 addr64 offset:16 -; GFX7-NEXT: 
s_and_b32 s1, s3, 1 -; GFX7-NEXT: s_mov_b32 s0, 0xffff +; GFX7-NEXT: s_and_b32 s0, s3, 1 ; GFX7-NEXT: s_lshr_b32 s12, s3, 1 -; GFX7-NEXT: s_lshl_b32 s1, s1, 4 -; GFX7-NEXT: s_and_b32 s2, s2, s0 -; GFX7-NEXT: s_lshl_b32 s0, s0, s1 +; GFX7-NEXT: s_and_b32 s1, s2, 0xffff +; GFX7-NEXT: s_lshl_b32 s0, s0, 4 +; GFX7-NEXT: s_lshl_b32 s13, s1, s0 +; GFX7-NEXT: s_lshl_b32 s0, 0xffff, s0 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s12, 1 -; GFX7-NEXT: s_lshl_b32 s13, s2, s1 ; GFX7-NEXT: s_not_b32 s14, s0 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s12, 2 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], s12, 3 @@ -2892,22 +2824,21 @@ ; GFX10-NEXT: global_load_dwordx4 v[2:5], v[0:1], off ; GFX10-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16 ; GFX10-NEXT: s_lshr_b32 s7, s3, 1 -; GFX10-NEXT: s_mov_b32 s8, 0xffff +; GFX10-NEXT: s_and_b32 s8, s2, 0xffff ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s7, 1 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s7, 2 ; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s7, 3 ; GFX10-NEXT: v_cmp_eq_u32_e64 s4, s7, 4 ; GFX10-NEXT: v_cmp_eq_u32_e64 s5, s7, 5 ; GFX10-NEXT: v_cmp_eq_u32_e64 s6, s7, 6 -; GFX10-NEXT: s_and_b32 s9, s2, s8 ; GFX10-NEXT: v_cmp_eq_u32_e64 s2, s7, 7 ; GFX10-NEXT: s_and_b32 s3, s3, 1 ; GFX10-NEXT: v_mov_b32_e32 v10, 0 ; GFX10-NEXT: s_lshl_b32 s3, s3, 4 ; GFX10-NEXT: v_mov_b32_e32 v11, 0 -; GFX10-NEXT: s_lshl_b32 s8, s8, s3 -; GFX10-NEXT: s_lshl_b32 s3, s9, s3 -; GFX10-NEXT: s_not_b32 s8, s8 +; GFX10-NEXT: s_lshl_b32 s9, 0xffff, s3 +; GFX10-NEXT: s_lshl_b32 s3, s8, s3 +; GFX10-NEXT: s_not_b32 s8, s9 ; GFX10-NEXT: v_mov_b32_e32 v13, 0 ; GFX10-NEXT: s_waitcnt vmcnt(1) ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo @@ -2944,8 +2875,8 @@ ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 ; GFX9-NEXT: s_lshr_b32 s2, s4, 1 ; GFX9-NEXT: s_cmp_eq_u32 s2, 1 -; GFX9-NEXT: s_mov_b32 s3, 0xffff -; GFX9-NEXT: v_and_b32_e32 v0, s3, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_cselect_b32 s0, s9, 
s8 ; GFX9-NEXT: s_cmp_eq_u32 s2, 2 @@ -2962,12 +2893,11 @@ ; GFX9-NEXT: s_cselect_b32 s0, s15, s0 ; GFX9-NEXT: s_and_b32 s1, s4, 1 ; GFX9-NEXT: s_lshl_b32 s1, s1, 4 -; GFX9-NEXT: s_lshl_b32 s3, s3, s1 +; GFX9-NEXT: s_lshl_b32 s3, 0xffff, s1 ; GFX9-NEXT: s_andn2_b32 s0, s0, s3 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: v_lshl_or_b32 v8, v0, s1, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, s9 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 1 @@ -3003,8 +2933,8 @@ ; GFX8-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 ; GFX8-NEXT: s_lshr_b32 s2, s4, 1 ; GFX8-NEXT: s_cmp_eq_u32 s2, 1 -; GFX8-NEXT: s_mov_b32 s3, 0xffff ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0 +; GFX8-NEXT: v_mov_b32_e32 v10, 16 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_cselect_b32 s0, s9, s8 ; GFX8-NEXT: s_cmp_eq_u32 s2, 2 @@ -3022,7 +2952,7 @@ ; GFX8-NEXT: s_and_b32 s1, s4, 1 ; GFX8-NEXT: s_lshl_b32 s1, s1, 4 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: s_lshl_b32 s1, s3, s1 +; GFX8-NEXT: s_lshl_b32 s1, 0xffff, s1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: s_andn2_b32 s0, s0, s1 ; GFX8-NEXT: v_or_b32_e32 v8, s0, v0 @@ -3051,7 +2981,6 @@ ; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc ; GFX8-NEXT: v_mov_b32_e32 v8, 0 ; GFX8-NEXT: v_mov_b32_e32 v9, 0 -; GFX8-NEXT: v_mov_b32_e32 v10, 16 ; GFX8-NEXT: v_mov_b32_e32 v11, 0 ; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[4:7] @@ -3062,8 +2991,8 @@ ; GFX7-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 ; GFX7-NEXT: s_lshr_b32 s2, s4, 1 ; GFX7-NEXT: s_cmp_eq_u32 s2, 1 -; GFX7-NEXT: s_mov_b32 s3, 0xffff -; GFX7-NEXT: v_and_b32_e32 v0, s3, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_cselect_b32 s0, s9, s8 ; GFX7-NEXT: s_cmp_eq_u32 s2, 2 @@ -3081,11 
+3010,10 @@ ; GFX7-NEXT: s_and_b32 s1, s4, 1 ; GFX7-NEXT: s_lshl_b32 s1, s1, 4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, s1, v0 -; GFX7-NEXT: s_lshl_b32 s1, s3, s1 +; GFX7-NEXT: s_lshl_b32 s1, 0xffff, s1 ; GFX7-NEXT: s_andn2_b32 s0, s0, s1 ; GFX7-NEXT: v_or_b32_e32 v8, s0, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s8 -; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0 ; GFX7-NEXT: v_mov_b32_e32 v1, s9 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s2, 1 @@ -3120,9 +3048,8 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 ; GFX10-NEXT: s_lshr_b32 s0, s4, 1 -; GFX10-NEXT: s_mov_b32 s3, 0xffff +; GFX10-NEXT: v_and_b32_e32 v8, 0xffff, v0 ; GFX10-NEXT: s_cmp_eq_u32 s0, 1 -; GFX10-NEXT: v_and_b32_e32 v8, s3, v0 ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 0 ; GFX10-NEXT: v_mov_b32_e32 v11, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -3149,7 +3076,7 @@ ; GFX10-NEXT: v_mov_b32_e32 v6, s14 ; GFX10-NEXT: s_lshl_b32 s2, s2, 4 ; GFX10-NEXT: v_mov_b32_e32 v7, s15 -; GFX10-NEXT: s_lshl_b32 s3, s3, s2 +; GFX10-NEXT: s_lshl_b32 s3, 0xffff, s2 ; GFX10-NEXT: s_andn2_b32 s1, s1, s3 ; GFX10-NEXT: v_lshl_or_b32 v10, v8, s2, s1 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 @@ -3201,12 +3128,12 @@ ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[14:15] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v8 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: s_mov_b32 s5, 0xffff ; GFX9-NEXT: v_mov_b32_e32 v7, s22 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[6:7] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v8 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX9-NEXT: s_and_b32 s4, s4, s5 +; GFX9-NEXT: s_mov_b32 s5, 0xffff +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff ; GFX9-NEXT: v_mov_b32_e32 v9, s23 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[8:9] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v8 @@ -3261,12 +3188,12 @@ ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[14:15] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v8 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: s_mov_b32 s5, 0xffff ; GFX8-NEXT: 
v_mov_b32_e32 v7, s22 ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[6:7] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v8 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX8-NEXT: s_and_b32 s4, s4, s5 +; GFX8-NEXT: s_mov_b32 s5, 0xffff +; GFX8-NEXT: s_and_b32 s4, s4, 0xffff ; GFX8-NEXT: v_mov_b32_e32 v9, s23 ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[8:9] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v8 @@ -3322,17 +3249,16 @@ ; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[14:15] ; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v8 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX7-NEXT: s_mov_b32 s5, 0xffff ; GFX7-NEXT: v_mov_b32_e32 v7, s22 ; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[6:7] ; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v8 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX7-NEXT: s_and_b32 s4, s4, s5 +; GFX7-NEXT: s_and_b32 s4, s4, 0xffff ; GFX7-NEXT: v_mov_b32_e32 v9, s23 ; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[8:9] ; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v8 ; GFX7-NEXT: v_lshl_b32_e32 v2, s4, v0 -; GFX7-NEXT: v_lshl_b32_e32 v0, s5, v0 +; GFX7-NEXT: v_lshl_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[10:11] ; GFX7-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX7-NEXT: v_and_b32_e32 v0, v1, v0 @@ -3367,9 +3293,8 @@ ; GFX10-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 ; GFX10-NEXT: v_lshrrev_b32_e32 v10, 1, v0 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX10-NEXT: s_mov_b32 s5, 0xffff +; GFX10-NEXT: s_and_b32 s5, s4, 0xffff ; GFX10-NEXT: v_mov_b32_e32 v12, 0 -; GFX10-NEXT: s_and_b32 s6, s4, s5 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v10 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v10 ; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v10 @@ -3377,10 +3302,10 @@ ; GFX10-NEXT: v_cmp_eq_u32_e64 s3, 5, v10 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 6, v10 -; GFX10-NEXT: v_lshlrev_b32_e64 v2, v0, s5 -; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 7, v10 -; GFX10-NEXT: v_lshlrev_b32_e64 v8, v0, s6 ; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v10 +; GFX10-NEXT: 
v_lshlrev_b32_e64 v2, v0, 0xffff +; GFX10-NEXT: v_lshlrev_b32_e64 v8, v0, s5 +; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 7, v10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s9 ; GFX10-NEXT: v_xor_b32_e32 v9, -1, v2 @@ -3606,22 +3531,22 @@ ; GFX10-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 ; GFX10-NEXT: v_lshrrev_b32_e32 v10, 1, v1 ; GFX10-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX10-NEXT: s_mov_b32 s4, 0xffff ; GFX10-NEXT: v_mov_b32_e32 v12, 0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v10 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v10 ; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v10 +; GFX10-NEXT: s_mov_b32 null, 0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 4, v10 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX10-NEXT: v_cmp_eq_u32_e64 s3, 5, v10 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 4, v1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 6, v10 ; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 7, v10 ; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v10 -; GFX10-NEXT: v_lshlrev_b32_e64 v3, v1, s4 -; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 6, v10 +; GFX10-NEXT: v_lshlrev_b32_e64 v3, v1, 0xffff ; GFX10-NEXT: v_lshlrev_b32_sdwa v8, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_xor_b32_e32 v9, -1, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, s9 +; GFX10-NEXT: v_xor_b32_e32 v9, -1, v3 ; GFX10-NEXT: v_cndmask_b32_e32 v2, s8, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s10, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s11, s1 @@ -3663,10 +3588,10 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dwordx4 v[3:6], v[0:1], off ; GFX9-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:16 -; GFX9-NEXT: s_mov_b32 s0, 0xffff ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 1, v2 ; GFX9-NEXT: v_and_b32_e32 v1, 1, v2 -; GFX9-NEXT: s_and_b32 s1, s2, s0 +; GFX9-NEXT: s_mov_b32 s0, 0xffff +; GFX9-NEXT: s_and_b32 s1, s2, 0xffff ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-NEXT: v_lshlrev_b32_e64 v2, v1, s1 @@ -3711,10 +3636,10 @@ ; GFX8-NEXT: 
v_addc_u32_e32 v8, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dwordx4 v[3:6], v[0:1] ; GFX8-NEXT: flat_load_dwordx4 v[7:10], v[7:8] -; GFX8-NEXT: s_mov_b32 s0, 0xffff ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 1, v2 ; GFX8-NEXT: v_and_b32_e32 v1, 1, v2 -; GFX8-NEXT: s_and_b32 s1, s2, s0 +; GFX8-NEXT: s_mov_b32 s0, 0xffff +; GFX8-NEXT: s_and_b32 s1, s2, 0xffff ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX8-NEXT: v_lshlrev_b32_e64 v2, v1, s1 @@ -3761,19 +3686,18 @@ ; GFX7-NEXT: s_mov_b64 s[16:17], 0 ; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[16:19], 0 addr64 ; GFX7-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[16:19], 0 addr64 offset:16 -; GFX7-NEXT: s_mov_b32 s0, 0xffff ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 1, v2 ; GFX7-NEXT: v_and_b32_e32 v1, 1, v2 -; GFX7-NEXT: s_and_b32 s1, s2, s0 +; GFX7-NEXT: s_and_b32 s0, s2, 0xffff ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX7-NEXT: v_lshl_b32_e32 v2, s1, v1 -; GFX7-NEXT: v_lshl_b32_e32 v1, s0, v1 +; GFX7-NEXT: v_lshl_b32_e32 v2, s0, v1 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 4, v0 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v0 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v0 +; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v0 ; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v0 @@ -3809,9 +3733,8 @@ ; GFX10-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:16 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 1, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX10-NEXT: s_mov_b32 s5, 0xffff +; GFX10-NEXT: s_and_b32 s5, s2, 0xffff ; GFX10-NEXT: v_mov_b32_e32 v14, 0 -; GFX10-NEXT: s_and_b32 s6, s2, s5 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v0 @@ -3819,10 +3742,10 @@ ; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 5, v0 ; GFX10-NEXT: 
v_lshlrev_b32_e32 v2, 4, v2 ; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 6, v0 -; GFX10-NEXT: v_lshlrev_b32_e64 v11, v2, s5 -; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 7, v0 -; GFX10-NEXT: v_lshlrev_b32_e64 v2, v2, s6 ; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v0 +; GFX10-NEXT: v_lshlrev_b32_e64 v11, v2, 0xffff +; GFX10-NEXT: v_lshlrev_b32_e64 v2, v2, s5 +; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 7, v0 ; GFX10-NEXT: v_xor_b32_e32 v11, -1, v11 ; GFX10-NEXT: s_waitcnt vmcnt(1) ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo @@ -3859,13 +3782,12 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dwordx4 v[3:6], v[0:1], off ; GFX9-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:16 -; GFX9-NEXT: s_and_b32 s1, s2, 1 -; GFX9-NEXT: s_mov_b32 s0, 0xffff +; GFX9-NEXT: s_and_b32 s0, s2, 1 ; GFX9-NEXT: s_lshr_b32 s12, s2, 1 -; GFX9-NEXT: s_lshl_b32 s1, s1, 4 -; GFX9-NEXT: s_lshl_b32 s0, s0, s1 +; GFX9-NEXT: s_lshl_b32 s0, s0, 4 +; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX9-NEXT: s_lshl_b32 s0, 0xffff, s0 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s12, 1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX9-NEXT: s_not_b32 s13, s0 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s12, 2 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s12, 3 @@ -3906,13 +3828,12 @@ ; GFX8-NEXT: v_addc_u32_e32 v8, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dwordx4 v[3:6], v[0:1] ; GFX8-NEXT: flat_load_dwordx4 v[7:10], v[7:8] -; GFX8-NEXT: s_and_b32 s1, s2, 1 -; GFX8-NEXT: s_mov_b32 s0, 0xffff +; GFX8-NEXT: s_and_b32 s0, s2, 1 ; GFX8-NEXT: s_lshr_b32 s12, s2, 1 -; GFX8-NEXT: s_lshl_b32 s1, s1, 4 -; GFX8-NEXT: s_lshl_b32 s0, s0, s1 +; GFX8-NEXT: s_lshl_b32 s0, s0, 4 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: s_lshl_b32 s0, 0xffff, s0 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s12, 1 -; GFX8-NEXT: v_mov_b32_e32 v0, s1 ; GFX8-NEXT: s_not_b32 s13, s0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s12, 2 ; GFX8-NEXT: v_cmp_eq_u32_e64 
s[2:3], s12, 3 @@ -3956,14 +3877,13 @@ ; GFX7-NEXT: s_mov_b64 s[16:17], 0 ; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[16:19], 0 addr64 ; GFX7-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[16:19], 0 addr64 offset:16 -; GFX7-NEXT: s_and_b32 s1, s2, 1 -; GFX7-NEXT: s_mov_b32 s0, 0xffff +; GFX7-NEXT: s_and_b32 s0, s2, 1 ; GFX7-NEXT: s_lshr_b32 s12, s2, 1 -; GFX7-NEXT: s_lshl_b32 s1, s1, 4 -; GFX7-NEXT: v_and_b32_e32 v0, s0, v2 -; GFX7-NEXT: s_lshl_b32 s0, s0, s1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX7-NEXT: s_lshl_b32 s0, s0, 4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s0, v0 +; GFX7-NEXT: s_lshl_b32 s0, 0xffff, s0 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s12, 1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s1, v0 ; GFX7-NEXT: s_not_b32 s13, s0 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s12, 2 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], s12, 3 @@ -4012,9 +3932,8 @@ ; GFX10-NEXT: v_cmp_eq_u32_e64 s2, s6, 6 ; GFX10-NEXT: s_lshl_b32 s7, s5, 4 ; GFX10-NEXT: v_cmp_eq_u32_e64 s5, s6, 7 -; GFX10-NEXT: s_mov_b32 s8, 0xffff ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: s_lshl_b32 s7, s8, s7 +; GFX10-NEXT: s_lshl_b32 s7, 0xffff, s7 ; GFX10-NEXT: v_cmp_eq_u32_e64 s6, s6, 0 ; GFX10-NEXT: s_not_b32 s7, s7 ; GFX10-NEXT: v_mov_b32_e32 v11, 0 @@ -4197,18 +4116,17 @@ ; GFX10-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:16 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 1, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX10-NEXT: s_mov_b32 s4, 0xffff ; GFX10-NEXT: v_mov_b32_e32 v15, 0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 4, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 4, v3 ; GFX10-NEXT: v_cmp_eq_u32_e64 s3, 5, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 6, v0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 7, v0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v0 -; GFX10-NEXT: v_lshlrev_b32_e64 
v12, v3, s4 -; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 6, v0 +; GFX10-NEXT: v_lshlrev_b32_e64 v12, v3, 0xffff ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_xor_b32_e32 v3, -1, v12 ; GFX10-NEXT: v_mov_b32_e32 v12, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll @@ -722,33 +722,34 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dword v0, v[0:1], off ; GFX9-NEXT: s_mov_b32 s0, 8 -; GFX9-NEXT: s_movk_i32 s4, 0xff ; GFX9-NEXT: s_and_b32 s3, s3, 3 ; GFX9-NEXT: s_mov_b32 s1, 16 -; GFX9-NEXT: s_and_b32 s2, s2, s4 +; GFX9-NEXT: s_movk_i32 s4, 0xff +; GFX9-NEXT: s_and_b32 s2, s2, 0xff ; GFX9-NEXT: s_lshl_b32 s3, s3, 3 ; GFX9-NEXT: s_lshl_b32 s2, s2, s3 -; GFX9-NEXT: s_lshl_b32 s3, s4, s3 +; GFX9-NEXT: s_lshl_b32 s3, 0xff, s3 ; GFX9-NEXT: s_not_b32 s3, s3 -; GFX9-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, 8 -; GFX9-NEXT: v_mov_b32_e32 v2, 16 +; GFX9-NEXT: v_mov_b32_e32 v4, s2 +; GFX9-NEXT: v_mov_b32_e32 v2, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, 0xff +; GFX9-NEXT: v_mov_b32_e32 v3, 16 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v6, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v7, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v6 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX9-NEXT: v_or3_b32 v0, v0, v7, v5 +; GFX9-NEXT: v_and_or_b32 v0, v0, s3, v4 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v5, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v6, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v5 +; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v2, v0, v1, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX9-NEXT: v_or3_b32 v0, v0, v6, v4 -; GFX9-NEXT: v_and_or_b32 v0, v0, s3, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v4, v0, s4, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_or3_b32 v2, v4, v2, v3 +; GFX9-NEXT: v_or3_b32 v2, v2, v3, v4 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -757,13 +758,12 @@ ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v1, 8 ; GFX8-NEXT: v_mov_b32_e32 v2, 16 -; GFX8-NEXT: s_and_b32 s1, s3, 3 -; GFX8-NEXT: s_movk_i32 s0, 0xff -; GFX8-NEXT: s_lshl_b32 s1, s1, 3 -; GFX8-NEXT: s_and_b32 s2, s2, s0 -; GFX8-NEXT: s_lshl_b32 s0, s0, s1 +; GFX8-NEXT: s_and_b32 s0, s3, 3 +; GFX8-NEXT: s_and_b32 s1, s2, 0xff +; GFX8-NEXT: s_lshl_b32 s0, s0, 3 +; GFX8-NEXT: s_lshl_b32 s1, s1, s0 +; GFX8-NEXT: s_lshl_b32 s0, 0xff, s0 ; GFX8-NEXT: s_not_b32 s0, s0 -; GFX8-NEXT: s_lshl_b32 s2, s2, s1 ; GFX8-NEXT: v_mov_b32_e32 v3, 8 ; GFX8-NEXT: v_mov_b32_e32 v4, 16 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -775,7 +775,7 @@ ; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_and_b32_e32 v0, s0, v0 -; GFX8-NEXT: v_or_b32_e32 v0, s2, v0 +; GFX8-NEXT: v_or_b32_e32 v0, s1, v0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: 
v_lshrrev_b32_e32 v1, 24, v0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 @@ -794,18 +794,17 @@ ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_movk_i32 s0, 0xff -; GFX7-NEXT: s_and_b32 s1, s3, 3 -; GFX7-NEXT: s_and_b32 s2, s2, s0 -; GFX7-NEXT: s_lshl_b32 s1, s1, 3 -; GFX7-NEXT: s_lshl_b32 s2, s2, s1 -; GFX7-NEXT: s_lshl_b32 s1, s0, s1 -; GFX7-NEXT: s_not_b32 s1, s1 +; GFX7-NEXT: s_and_b32 s0, s3, 3 +; GFX7-NEXT: s_and_b32 s1, s2, 0xff +; GFX7-NEXT: s_lshl_b32 s0, s0, 3 +; GFX7-NEXT: s_lshl_b32 s1, s1, s0 +; GFX7-NEXT: s_lshl_b32 s0, 0xff, s0 +; GFX7-NEXT: s_not_b32 s0, s0 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_bfe_u32 v3, v0, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v0 -; GFX7-NEXT: v_and_b32_e32 v2, s0, v0 +; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v0 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 @@ -813,11 +812,11 @@ ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v0, s1, v0 -; GFX7-NEXT: v_or_b32_e32 v0, s2, v0 +; GFX7-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX7-NEXT: v_or_b32_e32 v0, s1, v0 ; GFX7-NEXT: v_bfe_u32 v3, v0, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v0 -; GFX7-NEXT: v_and_b32_e32 v2, s0, v0 +; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v0 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 @@ -832,28 +831,27 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: s_mov_b32 s0, 8 -; GFX10-NEXT: s_movk_i32 s1, 0xff -; GFX10-NEXT: s_and_b32 s2, s2, s1 +; GFX10-NEXT: s_and_b32 s1, s2, 0xff ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX10-NEXT: s_mov_b32 s0, 16 ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v0, v0, s1, v1 +; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 24, v2 ; GFX10-NEXT: s_and_b32 s0, s3, 3 ; GFX10-NEXT: v_mov_b32_e32 v2, 16 ; GFX10-NEXT: s_lshl_b32 s0, s0, 3 ; GFX10-NEXT: v_or3_b32 v0, v0, v3, v1 -; GFX10-NEXT: s_lshl_b32 s3, s1, s0 -; GFX10-NEXT: s_lshl_b32 s0, s2, s0 -; GFX10-NEXT: s_not_b32 s2, s3 +; GFX10-NEXT: s_lshl_b32 s2, 0xff, s0 +; GFX10-NEXT: s_lshl_b32 s0, s1, s0 +; GFX10-NEXT: s_not_b32 s1, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, 8 -; GFX10-NEXT: v_and_or_b32 v0, v0, s2, s0 +; GFX10-NEXT: v_and_or_b32 v0, v0, s1, s0 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v4, v0, s1, v1 +; GFX10-NEXT: v_and_or_b32 v4, 0xff, v0, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -877,7 +875,7 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bfe_u32 s7, s0, 0x80008 ; GFX9-NEXT: s_lshr_b32 s3, s0, 24 -; GFX9-NEXT: s_and_b32 s6, s0, s5 +; GFX9-NEXT: s_and_b32 s6, s0, 0xff ; GFX9-NEXT: s_lshl_b32 s7, s7, 8 ; GFX9-NEXT: s_bfe_u32 s0, s0, 0x80010 ; GFX9-NEXT: s_or_b32 s6, s6, s7 @@ -887,7 +885,7 @@ ; GFX9-NEXT: s_or_b32 s0, s0, s3 ; GFX9-NEXT: s_and_b32 s3, s4, 3 ; GFX9-NEXT: s_lshl_b32 s3, s3, 3 -; GFX9-NEXT: s_lshl_b32 s4, s5, s3 +; GFX9-NEXT: s_lshl_b32 s4, 0xff, s3 ; GFX9-NEXT: s_andn2_b32 s0, s0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: v_lshl_or_b32 v0, v0, s3, v1 @@ -904,27 +902,26 @@ ; ; GFX8-LABEL: insertelement_s_v4i8_v_s: ; GFX8: ; %bb.0: -; GFX8-NEXT: 
s_load_dword s1, s[2:3], 0x0 -; GFX8-NEXT: s_movk_i32 s0, 0xff +; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v2, 8 ; GFX8-NEXT: v_mov_b32_e32 v3, 16 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_u32 s5, s1, 0x80008 -; GFX8-NEXT: s_lshr_b32 s2, s1, 24 -; GFX8-NEXT: s_and_b32 s3, s1, s0 -; GFX8-NEXT: s_lshl_b32 s5, s5, 8 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x80010 -; GFX8-NEXT: s_or_b32 s3, s3, s5 -; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_or_b32 s1, s3, s1 -; GFX8-NEXT: s_lshl_b32 s2, s2, 24 -; GFX8-NEXT: s_or_b32 s1, s1, s2 -; GFX8-NEXT: s_and_b32 s2, s4, 3 -; GFX8-NEXT: s_lshl_b32 s2, s2, 3 -; GFX8-NEXT: v_mov_b32_e32 v1, s2 -; GFX8-NEXT: s_lshl_b32 s0, s0, s2 +; GFX8-NEXT: s_bfe_u32 s3, s0, 0x80008 +; GFX8-NEXT: s_lshr_b32 s1, s0, 24 +; GFX8-NEXT: s_and_b32 s2, s0, 0xff +; GFX8-NEXT: s_lshl_b32 s3, s3, 8 +; GFX8-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX8-NEXT: s_or_b32 s2, s2, s3 +; GFX8-NEXT: s_lshl_b32 s0, s0, 16 +; GFX8-NEXT: s_or_b32 s0, s2, s0 +; GFX8-NEXT: s_lshl_b32 s1, s1, 24 +; GFX8-NEXT: s_or_b32 s0, s0, s1 +; GFX8-NEXT: s_and_b32 s1, s4, 3 +; GFX8-NEXT: s_lshl_b32 s1, s1, 3 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_lshl_b32 s1, 0xff, s1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: s_andn2_b32 s0, s1, s0 +; GFX8-NEXT: s_andn2_b32 s0, s0, s1 ; GFX8-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 24, v0 @@ -946,7 +943,7 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_bfe_u32 s3, s0, 0x80008 ; GFX7-NEXT: s_lshr_b32 s1, s0, 24 -; GFX7-NEXT: s_and_b32 s2, s0, s5 +; GFX7-NEXT: s_and_b32 s2, s0, 0xff ; GFX7-NEXT: s_lshl_b32 s3, s3, 8 ; GFX7-NEXT: s_bfe_u32 s0, s0, 0x80010 ; GFX7-NEXT: s_or_b32 s2, s2, s3 @@ -957,7 +954,7 @@ ; GFX7-NEXT: s_and_b32 s1, s4, 3 ; GFX7-NEXT: s_lshl_b32 s1, s1, 3 ; GFX7-NEXT: 
v_lshlrev_b32_e32 v0, s1, v0 -; GFX7-NEXT: s_lshl_b32 s1, s5, s1 +; GFX7-NEXT: s_lshl_b32 s1, 0xff, s1 ; GFX7-NEXT: s_andn2_b32 s0, s0, s1 ; GFX7-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX7-NEXT: v_bfe_u32 v3, v0, 8, 8 @@ -979,30 +976,29 @@ ; GFX10-LABEL: insertelement_s_v4i8_v_s: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX10-NEXT: s_movk_i32 s2, 0xff ; GFX10-NEXT: s_and_b32 s1, s4, 3 -; GFX10-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX10-NEXT: s_lshl_b32 s1, s1, 3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_bfe_u32 s5, s0, 0x80008 -; GFX10-NEXT: s_lshr_b32 s3, s0, 24 -; GFX10-NEXT: s_and_b32 s4, s0, s2 +; GFX10-NEXT: s_bfe_u32 s4, s0, 0x80008 +; GFX10-NEXT: s_lshr_b32 s2, s0, 24 +; GFX10-NEXT: s_and_b32 s3, s0, 0xff ; GFX10-NEXT: s_bfe_u32 s0, s0, 0x80010 -; GFX10-NEXT: s_lshl_b32 s5, s5, 8 +; GFX10-NEXT: s_lshl_b32 s4, s4, 8 ; GFX10-NEXT: s_lshl_b32 s0, s0, 16 -; GFX10-NEXT: s_or_b32 s4, s4, s5 -; GFX10-NEXT: s_lshl_b32 s3, s3, 24 -; GFX10-NEXT: s_or_b32 s0, s4, s0 -; GFX10-NEXT: s_lshl_b32 s4, s2, s1 -; GFX10-NEXT: s_or_b32 s0, s0, s3 -; GFX10-NEXT: s_andn2_b32 s0, s0, s4 +; GFX10-NEXT: s_or_b32 s3, s3, s4 +; GFX10-NEXT: s_lshl_b32 s2, s2, 24 +; GFX10-NEXT: s_or_b32 s0, s3, s0 +; GFX10-NEXT: s_lshl_b32 s3, 0xff, s1 +; GFX10-NEXT: s_or_b32 s0, s0, s2 +; GFX10-NEXT: s_andn2_b32 s0, s0, s3 ; GFX10-NEXT: v_lshl_or_b32 v0, v0, s1, s0 ; GFX10-NEXT: s_mov_b32 s0, 8 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX10-NEXT: s_mov_b32 s0, 16 ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v4, v0, s2, v1 +; GFX10-NEXT: v_and_or_b32 v4, v0, 0xff, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -1019,14 +1015,14 @@ ; GFX9-LABEL: 
insertelement_s_v4i8_s_v: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX9-NEXT: s_movk_i32 s5, 0xff ; GFX9-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX9-NEXT: s_movk_i32 s5, 0xff ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NEXT: s_mov_b32 s1, 8 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bfe_u32 s7, s0, 0x80008 ; GFX9-NEXT: s_lshr_b32 s3, s0, 24 -; GFX9-NEXT: s_and_b32 s6, s0, s5 +; GFX9-NEXT: s_and_b32 s6, s0, 0xff ; GFX9-NEXT: s_lshl_b32 s7, s7, 8 ; GFX9-NEXT: s_bfe_u32 s0, s0, 0x80010 ; GFX9-NEXT: s_or_b32 s6, s6, s7 @@ -1034,7 +1030,7 @@ ; GFX9-NEXT: s_or_b32 s0, s6, s0 ; GFX9-NEXT: s_lshl_b32 s3, s3, 24 ; GFX9-NEXT: s_or_b32 s0, s0, s3 -; GFX9-NEXT: s_and_b32 s3, s4, s5 +; GFX9-NEXT: s_and_b32 s3, s4, 0xff ; GFX9-NEXT: v_lshlrev_b32_e64 v1, v0, s3 ; GFX9-NEXT: v_lshlrev_b32_e64 v0, v0, s5 ; GFX9-NEXT: v_xor_b32_e32 v0, -1, v0 @@ -1054,14 +1050,14 @@ ; GFX8-LABEL: insertelement_s_v4i8_s_v: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX8-NEXT: s_movk_i32 s0, 0xff ; GFX8-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX8-NEXT: s_movk_i32 s0, 0xff ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_bfe_u32 s5, s1, 0x80008 ; GFX8-NEXT: s_lshr_b32 s2, s1, 24 -; GFX8-NEXT: s_and_b32 s3, s1, s0 +; GFX8-NEXT: s_and_b32 s3, s1, 0xff ; GFX8-NEXT: s_lshl_b32 s5, s5, 8 ; GFX8-NEXT: s_bfe_u32 s1, s1, 0x80010 ; GFX8-NEXT: s_or_b32 s3, s3, s5 @@ -1069,7 +1065,7 @@ ; GFX8-NEXT: s_or_b32 s1, s3, s1 ; GFX8-NEXT: s_lshl_b32 s2, s2, 24 ; GFX8-NEXT: s_or_b32 s1, s1, s2 -; GFX8-NEXT: s_and_b32 s2, s4, s0 +; GFX8-NEXT: s_and_b32 s2, s4, 0xff ; GFX8-NEXT: v_lshlrev_b32_e64 v1, v0, s2 ; GFX8-NEXT: v_lshlrev_b32_e64 v0, v0, s0 ; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 @@ -1091,13 +1087,13 @@ ; GFX7-LABEL: insertelement_s_v4i8_s_v: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX7-NEXT: s_movk_i32 s5, 0xff ; GFX7-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX7-NEXT: s_movk_i32 s5, 0xff ; 
GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_bfe_u32 s3, s0, 0x80008 ; GFX7-NEXT: s_lshr_b32 s1, s0, 24 -; GFX7-NEXT: s_and_b32 s2, s0, s5 +; GFX7-NEXT: s_and_b32 s2, s0, 0xff ; GFX7-NEXT: s_lshl_b32 s3, s3, 8 ; GFX7-NEXT: s_bfe_u32 s0, s0, 0x80010 ; GFX7-NEXT: s_or_b32 s2, s2, s3 @@ -1105,7 +1101,7 @@ ; GFX7-NEXT: s_or_b32 s0, s2, s0 ; GFX7-NEXT: s_lshl_b32 s1, s1, 24 ; GFX7-NEXT: s_or_b32 s0, s0, s1 -; GFX7-NEXT: s_and_b32 s1, s4, s5 +; GFX7-NEXT: s_and_b32 s1, s4, 0xff ; GFX7-NEXT: v_lshl_b32_e32 v1, s1, v0 ; GFX7-NEXT: v_lshl_b32_e32 v0, s5, v0 ; GFX7-NEXT: v_xor_b32_e32 v0, -1, v0 @@ -1131,30 +1127,29 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX10-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX10-NEXT: s_movk_i32 s1, 0xff -; GFX10-NEXT: s_and_b32 s2, s4, s1 +; GFX10-NEXT: s_and_b32 s1, s4, 0xff ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX10-NEXT: v_lshlrev_b32_e64 v1, v0, s1 -; GFX10-NEXT: v_lshlrev_b32_e64 v0, v0, s2 +; GFX10-NEXT: v_lshlrev_b32_e64 v1, v0, 0xff +; GFX10-NEXT: v_lshlrev_b32_e64 v0, v0, s1 ; GFX10-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_bfe_u32 s4, s0, 0x80008 -; GFX10-NEXT: s_lshr_b32 s2, s0, 24 -; GFX10-NEXT: s_and_b32 s3, s0, s1 +; GFX10-NEXT: s_bfe_u32 s3, s0, 0x80008 +; GFX10-NEXT: s_lshr_b32 s1, s0, 24 +; GFX10-NEXT: s_and_b32 s2, s0, 0xff ; GFX10-NEXT: s_bfe_u32 s0, s0, 0x80010 -; GFX10-NEXT: s_lshl_b32 s4, s4, 8 +; GFX10-NEXT: s_lshl_b32 s3, s3, 8 ; GFX10-NEXT: s_lshl_b32 s0, s0, 16 -; GFX10-NEXT: s_or_b32 s3, s3, s4 -; GFX10-NEXT: s_lshl_b32 s2, s2, 24 -; GFX10-NEXT: s_or_b32 s0, s3, s0 -; GFX10-NEXT: s_or_b32 s0, s0, s2 +; GFX10-NEXT: s_or_b32 s2, s2, s3 +; GFX10-NEXT: s_lshl_b32 s1, s1, 24 +; GFX10-NEXT: s_or_b32 s0, s2, s0 +; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: v_and_or_b32 v0, s0, v1, v0 ; GFX10-NEXT: s_mov_b32 s0, 8 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:BYTE_1 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX10-NEXT: s_mov_b32 s0, 16 ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v4, v0, s1, v1 +; GFX10-NEXT: v_and_or_b32 v4, v0, 0xff, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -1171,14 +1166,14 @@ ; GFX9-LABEL: insertelement_s_v4i8_v_v: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX9-NEXT: s_movk_i32 s4, 0xff ; GFX9-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX9-NEXT: s_movk_i32 s4, 0xff ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bfe_u32 s6, s0, 0x80008 ; GFX9-NEXT: s_lshr_b32 s3, s0, 24 -; GFX9-NEXT: s_and_b32 s5, s0, s4 +; GFX9-NEXT: s_and_b32 s5, s0, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s6, 8 ; GFX9-NEXT: s_bfe_u32 s0, s0, 0x80010 ; GFX9-NEXT: s_or_b32 s5, s5, s6 @@ -1205,14 +1200,14 @@ ; GFX8-LABEL: insertelement_s_v4i8_v_v: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX8-NEXT: s_movk_i32 s0, 0xff ; GFX8-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX8-NEXT: s_movk_i32 s0, 0xff ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_bfe_u32 s4, s1, 0x80008 ; GFX8-NEXT: s_lshr_b32 s2, s1, 24 -; GFX8-NEXT: s_and_b32 s3, s1, s0 +; GFX8-NEXT: s_and_b32 s3, s1, 0xff ; GFX8-NEXT: s_lshl_b32 s4, s4, 8 ; GFX8-NEXT: s_bfe_u32 s1, s1, 0x80010 ; GFX8-NEXT: s_or_b32 s3, s3, s4 @@ -1248,7 +1243,7 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_bfe_u32 s3, s0, 0x80008 ; GFX7-NEXT: s_lshr_b32 s1, s0, 24 -; GFX7-NEXT: s_and_b32 s2, s0, s4 +; GFX7-NEXT: s_and_b32 s2, s0, 0xff ; GFX7-NEXT: s_lshl_b32 s3, s3, 8 ; GFX7-NEXT: s_bfe_u32 s0, s0, 
0x80010 ; GFX7-NEXT: s_or_b32 s2, s2, s3 @@ -1281,29 +1276,28 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX10-NEXT: v_and_b32_e32 v1, 3, v1 -; GFX10-NEXT: s_movk_i32 s1, 0xff ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v1 -; GFX10-NEXT: v_lshlrev_b32_e64 v2, v1, s1 +; GFX10-NEXT: v_lshlrev_b32_e64 v2, v1, 0xff ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: v_xor_b32_e32 v1, -1, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_bfe_u32 s4, s0, 0x80008 -; GFX10-NEXT: s_lshr_b32 s2, s0, 24 -; GFX10-NEXT: s_and_b32 s3, s0, s1 +; GFX10-NEXT: s_bfe_u32 s3, s0, 0x80008 +; GFX10-NEXT: s_lshr_b32 s1, s0, 24 +; GFX10-NEXT: s_and_b32 s2, s0, 0xff ; GFX10-NEXT: s_bfe_u32 s0, s0, 0x80010 -; GFX10-NEXT: s_lshl_b32 s4, s4, 8 +; GFX10-NEXT: s_lshl_b32 s3, s3, 8 ; GFX10-NEXT: s_lshl_b32 s0, s0, 16 -; GFX10-NEXT: s_or_b32 s3, s3, s4 -; GFX10-NEXT: s_lshl_b32 s2, s2, 24 -; GFX10-NEXT: s_or_b32 s0, s3, s0 -; GFX10-NEXT: s_or_b32 s0, s0, s2 +; GFX10-NEXT: s_or_b32 s2, s2, s3 +; GFX10-NEXT: s_lshl_b32 s1, s1, 24 +; GFX10-NEXT: s_or_b32 s0, s2, s0 +; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: v_and_or_b32 v0, s0, v1, v0 ; GFX10-NEXT: s_mov_b32 s0, 8 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX10-NEXT: s_mov_b32 s0, 16 ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v4, v0, s1, v1 +; GFX10-NEXT: v_and_or_b32 v4, v0, 0xff, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -1321,62 +1315,63 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dword v0, v[0:1], off ; GFX9-NEXT: s_mov_b32 s0, 8 -; GFX9-NEXT: s_movk_i32 s3, 0xff ; GFX9-NEXT: v_and_b32_e32 v2, 3, v2 ; GFX9-NEXT: s_mov_b32 s1, 16 -; GFX9-NEXT: s_and_b32 s2, s2, s3 
+; GFX9-NEXT: s_movk_i32 s3, 0xff +; GFX9-NEXT: s_and_b32 s2, s2, 0xff ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX9-NEXT: v_lshlrev_b32_e64 v4, v2, s2 +; GFX9-NEXT: v_lshlrev_b32_e64 v5, v2, s2 ; GFX9-NEXT: v_lshlrev_b32_e64 v2, v2, s3 ; GFX9-NEXT: v_xor_b32_e32 v2, -1, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, 8 -; GFX9-NEXT: v_mov_b32_e32 v3, 16 +; GFX9-NEXT: v_mov_b32_e32 v3, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, 0xff +; GFX9-NEXT: v_mov_b32_e32 v4, 16 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v6, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v7, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v0, v0, s3, v6 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v5 -; GFX9-NEXT: v_or3_b32 v0, v0, v7, v5 -; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 24, v0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v7, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v8, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v0, v0, s3, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX9-NEXT: v_or3_b32 v0, v0, v8, v6 +; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v5 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v4, v0, s3, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v3, v0, v1, v3 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; 
GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_or3_b32 v2, v4, v3, v2 +; GFX9-NEXT: v_or3_b32 v2, v3, v4, v2 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_v_v4i8_s_v: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, 8 -; GFX8-NEXT: s_movk_i32 s0, 0xff -; GFX8-NEXT: v_mov_b32_e32 v3, 16 +; GFX8-NEXT: v_mov_b32_e32 v3, 8 ; GFX8-NEXT: v_and_b32_e32 v2, 3, v2 -; GFX8-NEXT: s_and_b32 s1, s2, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0xff +; GFX8-NEXT: v_mov_b32_e32 v4, 16 +; GFX8-NEXT: s_and_b32 s0, s2, 0xff ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX8-NEXT: v_lshlrev_b32_e64 v6, v2, s1 -; GFX8-NEXT: v_lshlrev_b32_e64 v2, v2, s0 -; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2 -; GFX8-NEXT: v_mov_b32_e32 v4, 8 -; GFX8-NEXT: v_mov_b32_e32 v5, 16 +; GFX8-NEXT: v_lshlrev_b32_e64 v7, v2, s0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, v2, v1 +; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX8-NEXT: v_mov_b32_e32 v5, 8 +; GFX8-NEXT: v_mov_b32_e32 v6, 16 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v7 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v3 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX8-NEXT: v_and_b32_e32 v0, v0, v2 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v6 -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 
+; GFX8-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX8-NEXT: v_and_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v7 +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 24, v0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 24, v1 ; GFX8-NEXT: v_or_b32_e32 v3, v0, v3 @@ -1392,37 +1387,37 @@ ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_movk_i32 s0, 0xff -; GFX7-NEXT: v_and_b32_e32 v1, 3, v2 -; GFX7-NEXT: s_and_b32 s1, s2, s0 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v1 -; GFX7-NEXT: v_lshl_b32_e32 v2, s1, v1 -; GFX7-NEXT: v_lshl_b32_e32 v1, s0, v1 -; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX7-NEXT: v_and_b32_e32 v2, 3, v2 +; GFX7-NEXT: v_mov_b32_e32 v1, 0xff +; GFX7-NEXT: s_and_b32 s0, s2, 0xff +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX7-NEXT: v_lshl_b32_e32 v3, s0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, v2, v1 +; GFX7-NEXT: v_xor_b32_e32 v2, -1, v2 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_bfe_u32 v5, v0, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 -; GFX7-NEXT: v_and_b32_e32 v4, s0, v0 +; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v0 +; GFX7-NEXT: v_and_b32_e32 v5, 0xff, v0 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 ; GFX7-NEXT: 
v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v5, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX7-NEXT: v_and_b32_e32 v0, v0, v2 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 -; GFX7-NEXT: v_and_b32_e32 v0, v0, v1 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: v_bfe_u32 v3, v0, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v0 -; GFX7-NEXT: v_and_b32_e32 v2, s0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v0 +; GFX7-NEXT: v_and_b32_e32 v1, v0, v1 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; @@ -1431,17 +1426,16 @@ ; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: v_and_b32_e32 v1, 3, v2 ; GFX10-NEXT: s_mov_b32 s0, 8 -; GFX10-NEXT: s_movk_i32 s1, 0xff ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v1 -; GFX10-NEXT: v_lshlrev_b32_e64 v4, v1, s1 +; GFX10-NEXT: v_lshlrev_b32_e64 v4, v1, 0xff ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX10-NEXT: s_mov_b32 s0, 16 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v0, v0, s1, v2 +; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v3 -; GFX10-NEXT: s_and_b32 s0, s2, s1 +; GFX10-NEXT: s_and_b32 s0, 
s2, 0xff ; GFX10-NEXT: v_xor_b32_e32 v3, -1, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v1, v1, s0 ; GFX10-NEXT: v_or3_b32 v0, v0, v5, v2 @@ -1451,7 +1445,7 @@ ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v2, v0, s1, v2 +; GFX10-NEXT: v_and_or_b32 v2, 0xff, v0, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -1474,40 +1468,40 @@ ; GFX9-NEXT: s_movk_i32 s3, 0xff ; GFX9-NEXT: s_lshl_b32 s2, s2, 3 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: s_lshl_b32 s2, s3, s2 +; GFX9-NEXT: s_lshl_b32 s2, 0xff, s2 ; GFX9-NEXT: s_not_b32 s2, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, 8 -; GFX9-NEXT: v_mov_b32_e32 v3, 16 +; GFX9-NEXT: v_mov_b32_e32 v3, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, 0xff +; GFX9-NEXT: v_mov_b32_e32 v4, 16 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v5, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v6, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v0, v0, s3, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX9-NEXT: v_or3_b32 v0, v0, v6, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v6, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v7, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v0, v0, s3, v6 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX9-NEXT: v_or3_b32 v0, v0, v7, v5 ; GFX9-NEXT: v_and_or_b32 v0, v0, s2, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v0 -; 
GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v4, v0, s3, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v3, v0, v1, v3 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_or3_b32 v2, v4, v3, v2 +; GFX9-NEXT: v_or3_b32 v2, v3, v4, v2 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_v_v4i8_v_s: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_and_b32 s1, s2, 3 +; GFX8-NEXT: s_and_b32 s0, s2, 3 ; GFX8-NEXT: v_mov_b32_e32 v1, 8 -; GFX8-NEXT: s_lshl_b32 s1, s1, 3 +; GFX8-NEXT: s_lshl_b32 s0, s0, 3 ; GFX8-NEXT: v_mov_b32_e32 v3, 16 -; GFX8-NEXT: v_mov_b32_e32 v6, s1 -; GFX8-NEXT: s_movk_i32 s0, 0xff +; GFX8-NEXT: v_mov_b32_e32 v6, s0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: s_lshl_b32 s0, s0, s1 +; GFX8-NEXT: s_lshl_b32 s0, 0xff, s0 ; GFX8-NEXT: s_not_b32 s0, s0 ; GFX8-NEXT: v_mov_b32_e32 v4, 8 ; GFX8-NEXT: v_mov_b32_e32 v5, 16 @@ -1544,7 +1538,7 @@ ; GFX7-NEXT: v_and_b32_e32 v1, s0, v2 ; GFX7-NEXT: s_lshl_b32 s1, s1, 3 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, s1, v1 -; GFX7-NEXT: s_lshl_b32 s1, s0, s1 +; GFX7-NEXT: s_lshl_b32 s1, 0xff, s1 ; GFX7-NEXT: s_not_b32 s1, s1 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -1562,7 +1556,7 @@ ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: v_bfe_u32 v3, v0, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v0 -; GFX7-NEXT: v_and_b32_e32 v2, s0, v0 +; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v0 ; 
GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 @@ -1577,27 +1571,26 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: s_mov_b32 s0, 8 -; GFX10-NEXT: s_mov_b32 s1, 16 -; GFX10-NEXT: s_and_b32 s2, s2, 3 +; GFX10-NEXT: s_and_b32 s1, s2, 3 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0 -; GFX10-NEXT: s_movk_i32 s0, 0xff -; GFX10-NEXT: v_lshlrev_b32_sdwa v4, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: s_lshl_b32 s1, s2, 3 -; GFX10-NEXT: v_and_or_b32 v0, v0, s0, v1 +; GFX10-NEXT: s_mov_b32 s0, 16 +; GFX10-NEXT: v_lshlrev_b32_sdwa v4, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 24, v3 -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: s_lshl_b32 s1, s0, s1 -; GFX10-NEXT: s_not_b32 s1, s1 +; GFX10-NEXT: s_lshl_b32 s0, s1, 3 +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: s_lshl_b32 s0, 0xff, s0 ; GFX10-NEXT: v_or3_b32 v0, v0, v4, v1 +; GFX10-NEXT: s_not_b32 s0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, 8 -; GFX10-NEXT: v_and_or_b32 v0, v0, s1, v2 +; GFX10-NEXT: v_and_or_b32 v0, v0, s0, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, 16 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v4, v0, s0, v1 +; GFX10-NEXT: v_and_or_b32 v4, 0xff, v0, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; GFX10-NEXT: 
v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -1725,9 +1718,8 @@ ; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: v_and_b32_e32 v1, 3, v3 ; GFX10-NEXT: s_mov_b32 s0, 8 -; GFX10-NEXT: s_movk_i32 s1, 0xff ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v1 -; GFX10-NEXT: v_lshlrev_b32_e64 v5, v1, s1 +; GFX10-NEXT: v_lshlrev_b32_e64 v5, v1, 0xff ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: v_xor_b32_e32 v2, -1, v5 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -1735,7 +1727,7 @@ ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v0 ; GFX10-NEXT: s_mov_b32 s0, 16 ; GFX10-NEXT: v_lshlrev_b32_sdwa v6, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v0, v0, s1, v3 +; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v4 ; GFX10-NEXT: v_or3_b32 v0, v0, v6, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, 8 @@ -1761,28 +1753,25 @@ ; GFX9-LABEL: insertelement_s_v8i8_s_s: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX9-NEXT: s_mov_b32 s8, 0x80008 -; GFX9-NEXT: s_movk_i32 s6, 0xff ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_u32 s9, s0, s8 -; GFX9-NEXT: s_and_b32 s7, s0, s6 -; GFX9-NEXT: s_lshl_b32 s9, s9, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: s_mov_b32 s9, 0x80010 +; GFX9-NEXT: s_bfe_u32 s7, s0, 0x80008 ; GFX9-NEXT: s_lshr_b32 s2, s0, 24 -; GFX9-NEXT: s_bfe_u32 s0, s0, s9 +; GFX9-NEXT: s_and_b32 s6, s0, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s7, 8 +; GFX9-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX9-NEXT: s_or_b32 s6, s6, s7 ; GFX9-NEXT: s_lshl_b32 s0, s0, 16 -; GFX9-NEXT: s_or_b32 s0, s7, s0 +; GFX9-NEXT: s_or_b32 s0, s6, s0 ; GFX9-NEXT: s_lshl_b32 s2, s2, 24 -; GFX9-NEXT: s_bfe_u32 s7, s1, s8 +; GFX9-NEXT: s_bfe_u32 s6, s1, 0x80008 ; GFX9-NEXT: s_lshr_b32 s3, s1, 24 ; GFX9-NEXT: s_or_b32 s0, s0, s2 -; GFX9-NEXT: s_and_b32 s2, s1, s6 
-; GFX9-NEXT: s_lshl_b32 s7, s7, 8 -; GFX9-NEXT: s_bfe_u32 s1, s1, s9 -; GFX9-NEXT: s_or_b32 s2, s2, s7 +; GFX9-NEXT: s_and_b32 s2, s1, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s6, 8 +; GFX9-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX9-NEXT: s_or_b32 s2, s2, s6 ; GFX9-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9-NEXT: s_or_b32 s1, s2, s1 ; GFX9-NEXT: s_lshl_b32 s2, s3, 24 @@ -1792,30 +1781,30 @@ ; GFX9-NEXT: s_cselect_b32 s3, s1, s0 ; GFX9-NEXT: s_and_b32 s5, s5, 3 ; GFX9-NEXT: s_lshl_b32 s5, s5, 3 -; GFX9-NEXT: s_and_b32 s4, s4, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xff ; GFX9-NEXT: s_lshl_b32 s4, s4, s5 -; GFX9-NEXT: s_lshl_b32 s5, s6, s5 +; GFX9-NEXT: s_lshl_b32 s5, 0xff, s5 ; GFX9-NEXT: s_andn2_b32 s3, s3, s5 ; GFX9-NEXT: s_or_b32 s3, s3, s4 ; GFX9-NEXT: s_cmp_eq_u32 s2, 0 ; GFX9-NEXT: s_cselect_b32 s0, s3, s0 ; GFX9-NEXT: s_cmp_eq_u32 s2, 1 ; GFX9-NEXT: s_cselect_b32 s1, s3, s1 -; GFX9-NEXT: s_bfe_u32 s5, s0, s8 +; GFX9-NEXT: s_bfe_u32 s5, s0, 0x80008 ; GFX9-NEXT: s_lshr_b32 s2, s0, 24 -; GFX9-NEXT: s_and_b32 s4, s0, s6 +; GFX9-NEXT: s_and_b32 s4, s0, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s5, 8 -; GFX9-NEXT: s_bfe_u32 s0, s0, s9 +; GFX9-NEXT: s_bfe_u32 s0, s0, 0x80010 ; GFX9-NEXT: s_or_b32 s4, s4, s5 ; GFX9-NEXT: s_lshl_b32 s0, s0, 16 ; GFX9-NEXT: s_or_b32 s0, s4, s0 ; GFX9-NEXT: s_lshl_b32 s2, s2, 24 -; GFX9-NEXT: s_bfe_u32 s4, s1, s8 +; GFX9-NEXT: s_bfe_u32 s4, s1, 0x80008 ; GFX9-NEXT: s_lshr_b32 s3, s1, 24 ; GFX9-NEXT: s_or_b32 s0, s0, s2 -; GFX9-NEXT: s_and_b32 s2, s1, s6 +; GFX9-NEXT: s_and_b32 s2, s1, 0xff ; GFX9-NEXT: s_lshl_b32 s4, s4, 8 -; GFX9-NEXT: s_bfe_u32 s1, s1, s9 +; GFX9-NEXT: s_bfe_u32 s1, s1, 0x80010 ; GFX9-NEXT: s_or_b32 s2, s2, s4 ; GFX9-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9-NEXT: s_or_b32 s1, s2, s1 @@ -1829,28 +1818,25 @@ ; GFX8-LABEL: insertelement_s_v8i8_s_s: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX8-NEXT: s_mov_b32 s8, 0x80008 -; GFX8-NEXT: s_movk_i32 s6, 0xff ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; 
GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_u32 s9, s0, s8 -; GFX8-NEXT: s_and_b32 s7, s0, s6 -; GFX8-NEXT: s_lshl_b32 s9, s9, 8 -; GFX8-NEXT: s_or_b32 s7, s7, s9 -; GFX8-NEXT: s_mov_b32 s9, 0x80010 +; GFX8-NEXT: s_bfe_u32 s7, s0, 0x80008 ; GFX8-NEXT: s_lshr_b32 s2, s0, 24 -; GFX8-NEXT: s_bfe_u32 s0, s0, s9 +; GFX8-NEXT: s_and_b32 s6, s0, 0xff +; GFX8-NEXT: s_lshl_b32 s7, s7, 8 +; GFX8-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX8-NEXT: s_or_b32 s6, s6, s7 ; GFX8-NEXT: s_lshl_b32 s0, s0, 16 -; GFX8-NEXT: s_or_b32 s0, s7, s0 +; GFX8-NEXT: s_or_b32 s0, s6, s0 ; GFX8-NEXT: s_lshl_b32 s2, s2, 24 -; GFX8-NEXT: s_bfe_u32 s7, s1, s8 +; GFX8-NEXT: s_bfe_u32 s6, s1, 0x80008 ; GFX8-NEXT: s_lshr_b32 s3, s1, 24 ; GFX8-NEXT: s_or_b32 s0, s0, s2 -; GFX8-NEXT: s_and_b32 s2, s1, s6 -; GFX8-NEXT: s_lshl_b32 s7, s7, 8 -; GFX8-NEXT: s_bfe_u32 s1, s1, s9 -; GFX8-NEXT: s_or_b32 s2, s2, s7 +; GFX8-NEXT: s_and_b32 s2, s1, 0xff +; GFX8-NEXT: s_lshl_b32 s6, s6, 8 +; GFX8-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX8-NEXT: s_or_b32 s2, s2, s6 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s1, s2, s1 ; GFX8-NEXT: s_lshl_b32 s2, s3, 24 @@ -1860,30 +1846,30 @@ ; GFX8-NEXT: s_cselect_b32 s3, s1, s0 ; GFX8-NEXT: s_and_b32 s5, s5, 3 ; GFX8-NEXT: s_lshl_b32 s5, s5, 3 -; GFX8-NEXT: s_and_b32 s4, s4, s6 +; GFX8-NEXT: s_and_b32 s4, s4, 0xff ; GFX8-NEXT: s_lshl_b32 s4, s4, s5 -; GFX8-NEXT: s_lshl_b32 s5, s6, s5 +; GFX8-NEXT: s_lshl_b32 s5, 0xff, s5 ; GFX8-NEXT: s_andn2_b32 s3, s3, s5 ; GFX8-NEXT: s_or_b32 s3, s3, s4 ; GFX8-NEXT: s_cmp_eq_u32 s2, 0 ; GFX8-NEXT: s_cselect_b32 s0, s3, s0 ; GFX8-NEXT: s_cmp_eq_u32 s2, 1 ; GFX8-NEXT: s_cselect_b32 s1, s3, s1 -; GFX8-NEXT: s_bfe_u32 s5, s0, s8 +; GFX8-NEXT: s_bfe_u32 s5, s0, 0x80008 ; GFX8-NEXT: s_lshr_b32 s2, s0, 24 -; GFX8-NEXT: s_and_b32 s4, s0, s6 +; GFX8-NEXT: s_and_b32 s4, s0, 0xff ; GFX8-NEXT: s_lshl_b32 s5, s5, 8 -; GFX8-NEXT: s_bfe_u32 s0, s0, s9 +; GFX8-NEXT: s_bfe_u32 s0, s0, 0x80010 ; GFX8-NEXT: s_or_b32 s4, s4, s5 ; GFX8-NEXT: s_lshl_b32 
s0, s0, 16 ; GFX8-NEXT: s_or_b32 s0, s4, s0 ; GFX8-NEXT: s_lshl_b32 s2, s2, 24 -; GFX8-NEXT: s_bfe_u32 s4, s1, s8 +; GFX8-NEXT: s_bfe_u32 s4, s1, 0x80008 ; GFX8-NEXT: s_lshr_b32 s3, s1, 24 ; GFX8-NEXT: s_or_b32 s0, s0, s2 -; GFX8-NEXT: s_and_b32 s2, s1, s6 +; GFX8-NEXT: s_and_b32 s2, s1, 0xff ; GFX8-NEXT: s_lshl_b32 s4, s4, 8 -; GFX8-NEXT: s_bfe_u32 s1, s1, s9 +; GFX8-NEXT: s_bfe_u32 s1, s1, 0x80010 ; GFX8-NEXT: s_or_b32 s2, s2, s4 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s1, s2, s1 @@ -1897,26 +1883,23 @@ ; GFX7-LABEL: insertelement_s_v8i8_s_s: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX7-NEXT: s_mov_b32 s8, 0x80008 -; GFX7-NEXT: s_movk_i32 s6, 0xff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_bfe_u32 s9, s0, s8 -; GFX7-NEXT: s_and_b32 s7, s0, s6 -; GFX7-NEXT: s_lshl_b32 s9, s9, 8 -; GFX7-NEXT: s_or_b32 s7, s7, s9 -; GFX7-NEXT: s_mov_b32 s9, 0x80010 +; GFX7-NEXT: s_bfe_u32 s7, s0, 0x80008 ; GFX7-NEXT: s_lshr_b32 s2, s0, 24 -; GFX7-NEXT: s_bfe_u32 s0, s0, s9 +; GFX7-NEXT: s_and_b32 s6, s0, 0xff +; GFX7-NEXT: s_lshl_b32 s7, s7, 8 +; GFX7-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX7-NEXT: s_or_b32 s6, s6, s7 ; GFX7-NEXT: s_lshl_b32 s0, s0, 16 -; GFX7-NEXT: s_or_b32 s0, s7, s0 +; GFX7-NEXT: s_or_b32 s0, s6, s0 ; GFX7-NEXT: s_lshl_b32 s2, s2, 24 -; GFX7-NEXT: s_bfe_u32 s7, s1, s8 +; GFX7-NEXT: s_bfe_u32 s6, s1, 0x80008 ; GFX7-NEXT: s_lshr_b32 s3, s1, 24 ; GFX7-NEXT: s_or_b32 s0, s0, s2 -; GFX7-NEXT: s_and_b32 s2, s1, s6 -; GFX7-NEXT: s_lshl_b32 s7, s7, 8 -; GFX7-NEXT: s_bfe_u32 s1, s1, s9 -; GFX7-NEXT: s_or_b32 s2, s2, s7 +; GFX7-NEXT: s_and_b32 s2, s1, 0xff +; GFX7-NEXT: s_lshl_b32 s6, s6, 8 +; GFX7-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX7-NEXT: s_or_b32 s2, s2, s6 ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 ; GFX7-NEXT: s_or_b32 s1, s2, s1 ; GFX7-NEXT: s_lshl_b32 s2, s3, 24 @@ -1926,30 +1909,30 @@ ; GFX7-NEXT: s_cselect_b32 s3, s1, s0 ; GFX7-NEXT: s_and_b32 s5, s5, 3 ; GFX7-NEXT: s_lshl_b32 s5, s5, 3 -; GFX7-NEXT: s_and_b32 s4, s4, 
s6 +; GFX7-NEXT: s_and_b32 s4, s4, 0xff ; GFX7-NEXT: s_lshl_b32 s4, s4, s5 -; GFX7-NEXT: s_lshl_b32 s5, s6, s5 +; GFX7-NEXT: s_lshl_b32 s5, 0xff, s5 ; GFX7-NEXT: s_andn2_b32 s3, s3, s5 ; GFX7-NEXT: s_or_b32 s3, s3, s4 ; GFX7-NEXT: s_cmp_eq_u32 s2, 0 ; GFX7-NEXT: s_cselect_b32 s4, s3, s0 ; GFX7-NEXT: s_cmp_eq_u32 s2, 1 ; GFX7-NEXT: s_cselect_b32 s3, s3, s1 -; GFX7-NEXT: s_bfe_u32 s10, s4, s8 +; GFX7-NEXT: s_bfe_u32 s7, s4, 0x80008 ; GFX7-NEXT: s_lshr_b32 s2, s4, 24 -; GFX7-NEXT: s_and_b32 s7, s4, s6 -; GFX7-NEXT: s_lshl_b32 s10, s10, 8 -; GFX7-NEXT: s_bfe_u32 s4, s4, s9 -; GFX7-NEXT: s_or_b32 s7, s7, s10 +; GFX7-NEXT: s_and_b32 s6, s4, 0xff +; GFX7-NEXT: s_lshl_b32 s7, s7, 8 +; GFX7-NEXT: s_bfe_u32 s4, s4, 0x80010 +; GFX7-NEXT: s_or_b32 s6, s6, s7 ; GFX7-NEXT: s_lshl_b32 s4, s4, 16 -; GFX7-NEXT: s_or_b32 s4, s7, s4 +; GFX7-NEXT: s_or_b32 s4, s6, s4 ; GFX7-NEXT: s_lshl_b32 s2, s2, 24 -; GFX7-NEXT: s_or_b32 s2, s4, s2 -; GFX7-NEXT: s_and_b32 s4, s3, s6 -; GFX7-NEXT: s_bfe_u32 s6, s3, s8 +; GFX7-NEXT: s_bfe_u32 s6, s3, 0x80008 ; GFX7-NEXT: s_lshr_b32 s5, s3, 24 +; GFX7-NEXT: s_or_b32 s2, s4, s2 +; GFX7-NEXT: s_and_b32 s4, s3, 0xff ; GFX7-NEXT: s_lshl_b32 s6, s6, 8 -; GFX7-NEXT: s_bfe_u32 s3, s3, s9 +; GFX7-NEXT: s_bfe_u32 s3, s3, 0x80010 ; GFX7-NEXT: s_or_b32 s4, s4, s6 ; GFX7-NEXT: s_lshl_b32 s3, s3, 16 ; GFX7-NEXT: s_or_b32 s3, s4, s3 @@ -1966,66 +1949,63 @@ ; GFX10-LABEL: insertelement_s_v8i8_s_s: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX10-NEXT: s_mov_b32 s3, 0x80008 -; GFX10-NEXT: s_movk_i32 s2, 0xff -; GFX10-NEXT: s_mov_b32 s6, 0x80010 -; GFX10-NEXT: s_lshr_b32 s7, s5, 2 +; GFX10-NEXT: s_lshr_b32 s2, s5, 2 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_bfe_u32 s11, s0, s3 -; GFX10-NEXT: s_bfe_u32 s13, s1, s3 -; GFX10-NEXT: s_lshr_b32 s8, s0, 24 -; GFX10-NEXT: s_lshr_b32 s9, s1, 24 -; GFX10-NEXT: s_and_b32 s10, s0, s2 -; GFX10-NEXT: s_bfe_u32 s0, s0, s6 -; 
GFX10-NEXT: s_and_b32 s12, s1, s2 -; GFX10-NEXT: s_bfe_u32 s1, s1, s6 -; GFX10-NEXT: s_lshl_b32 s11, s11, 8 -; GFX10-NEXT: s_lshl_b32 s13, s13, 8 +; GFX10-NEXT: s_bfe_u32 s8, s0, 0x80008 +; GFX10-NEXT: s_bfe_u32 s10, s1, 0x80008 +; GFX10-NEXT: s_lshr_b32 s3, s0, 24 +; GFX10-NEXT: s_lshr_b32 s6, s1, 24 +; GFX10-NEXT: s_and_b32 s7, s0, 0xff +; GFX10-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX10-NEXT: s_and_b32 s9, s1, 0xff +; GFX10-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX10-NEXT: s_lshl_b32 s8, s8, 8 +; GFX10-NEXT: s_lshl_b32 s10, s10, 8 ; GFX10-NEXT: s_lshl_b32 s0, s0, 16 ; GFX10-NEXT: s_lshl_b32 s1, s1, 16 -; GFX10-NEXT: s_or_b32 s10, s10, s11 -; GFX10-NEXT: s_or_b32 s11, s12, s13 -; GFX10-NEXT: s_lshl_b32 s8, s8, 24 -; GFX10-NEXT: s_lshl_b32 s9, s9, 24 -; GFX10-NEXT: s_or_b32 s0, s10, s0 -; GFX10-NEXT: s_or_b32 s1, s11, s1 -; GFX10-NEXT: s_or_b32 s0, s0, s8 -; GFX10-NEXT: s_or_b32 s1, s1, s9 -; GFX10-NEXT: s_cmp_eq_u32 s7, 1 -; GFX10-NEXT: s_cselect_b32 s8, s1, s0 +; GFX10-NEXT: s_or_b32 s7, s7, s8 +; GFX10-NEXT: s_or_b32 s8, s9, s10 +; GFX10-NEXT: s_lshl_b32 s3, s3, 24 +; GFX10-NEXT: s_lshl_b32 s6, s6, 24 +; GFX10-NEXT: s_or_b32 s0, s7, s0 +; GFX10-NEXT: s_or_b32 s1, s8, s1 +; GFX10-NEXT: s_or_b32 s0, s0, s3 +; GFX10-NEXT: s_or_b32 s1, s1, s6 +; GFX10-NEXT: s_cmp_eq_u32 s2, 1 +; GFX10-NEXT: s_cselect_b32 s3, s1, s0 ; GFX10-NEXT: s_and_b32 s5, s5, 3 -; GFX10-NEXT: s_and_b32 s4, s4, s2 +; GFX10-NEXT: s_and_b32 s4, s4, 0xff ; GFX10-NEXT: s_lshl_b32 s5, s5, 3 -; GFX10-NEXT: s_lshl_b32 s9, s2, s5 +; GFX10-NEXT: s_lshl_b32 s6, 0xff, s5 ; GFX10-NEXT: s_lshl_b32 s4, s4, s5 -; GFX10-NEXT: s_andn2_b32 s5, s8, s9 -; GFX10-NEXT: s_or_b32 s4, s5, s4 -; GFX10-NEXT: s_cmp_eq_u32 s7, 0 -; GFX10-NEXT: s_cselect_b32 s0, s4, s0 -; GFX10-NEXT: s_cmp_eq_u32 s7, 1 -; GFX10-NEXT: s_cselect_b32 s1, s4, s1 -; GFX10-NEXT: s_bfe_u32 s7, s0, s3 -; GFX10-NEXT: s_bfe_u32 s3, s1, s3 -; GFX10-NEXT: s_and_b32 s5, s0, s2 -; GFX10-NEXT: s_lshr_b32 s8, s1, 24 -; GFX10-NEXT: s_and_b32 s2, s1, s2 -; 
GFX10-NEXT: s_lshl_b32 s3, s3, 8 -; GFX10-NEXT: s_bfe_u32 s1, s1, s6 -; GFX10-NEXT: s_lshr_b32 s4, s0, 24 -; GFX10-NEXT: s_bfe_u32 s0, s0, s6 -; GFX10-NEXT: s_lshl_b32 s7, s7, 8 -; GFX10-NEXT: s_or_b32 s2, s2, s3 -; GFX10-NEXT: s_lshl_b32 s1, s1, 16 -; GFX10-NEXT: s_lshl_b32 s0, s0, 16 -; GFX10-NEXT: s_or_b32 s5, s5, s7 -; GFX10-NEXT: s_or_b32 s1, s2, s1 -; GFX10-NEXT: s_lshl_b32 s2, s8, 24 -; GFX10-NEXT: s_or_b32 s0, s5, s0 -; GFX10-NEXT: s_lshl_b32 s3, s4, 24 -; GFX10-NEXT: s_or_b32 s1, s1, s2 -; GFX10-NEXT: s_or_b32 s0, s0, s3 +; GFX10-NEXT: s_andn2_b32 s3, s3, s6 +; GFX10-NEXT: s_or_b32 s3, s3, s4 +; GFX10-NEXT: s_cmp_eq_u32 s2, 0 +; GFX10-NEXT: s_cselect_b32 s0, s3, s0 +; GFX10-NEXT: s_cmp_eq_u32 s2, 1 +; GFX10-NEXT: s_cselect_b32 s1, s3, s1 +; GFX10-NEXT: s_bfe_u32 s4, s0, 0x80008 +; GFX10-NEXT: s_lshr_b32 s2, s0, 24 +; GFX10-NEXT: s_and_b32 s3, s0, 0xff +; GFX10-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX10-NEXT: s_lshl_b32 s4, s4, 8 +; GFX10-NEXT: s_bfe_u32 s7, s1, 0x80008 +; GFX10-NEXT: s_lshr_b32 s5, s1, 24 +; GFX10-NEXT: s_lshl_b32 s0, s0, 16 +; GFX10-NEXT: s_and_b32 s6, s1, 0xff +; GFX10-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX10-NEXT: s_or_b32 s3, s3, s4 +; GFX10-NEXT: s_lshl_b32 s4, s7, 8 +; GFX10-NEXT: s_or_b32 s0, s3, s0 +; GFX10-NEXT: s_or_b32 s3, s6, s4 +; GFX10-NEXT: s_lshl_b32 s1, s1, 16 +; GFX10-NEXT: s_lshl_b32 s2, s2, 24 +; GFX10-NEXT: s_or_b32 s1, s3, s1 +; GFX10-NEXT: s_lshl_b32 s3, s5, 24 +; GFX10-NEXT: s_or_b32 s0, s0, s2 +; GFX10-NEXT: s_or_b32 s1, s1, s3 ; GFX10-NEXT: v_mov_b32_e32 v3, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s0 ; GFX10-NEXT: global_store_dwordx2 v[0:1], v[2:3], off @@ -2045,47 +2025,48 @@ ; GFX9-NEXT: s_movk_i32 s4, 0xff ; GFX9-NEXT: s_lshr_b32 s5, s3, 2 ; GFX9-NEXT: s_and_b32 s3, s3, 3 -; GFX9-NEXT: s_and_b32 s2, s2, s4 +; GFX9-NEXT: s_and_b32 s2, s2, 0xff ; GFX9-NEXT: s_lshl_b32 s3, s3, 3 ; GFX9-NEXT: s_lshl_b32 s2, s2, s3 -; GFX9-NEXT: s_lshl_b32 s3, s4, s3 +; GFX9-NEXT: s_lshl_b32 s3, 0xff, s3 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, 
s5, 1 ; GFX9-NEXT: s_not_b32 s3, s3 -; GFX9-NEXT: v_mov_b32_e32 v6, s2 -; GFX9-NEXT: v_mov_b32_e32 v4, 8 -; GFX9-NEXT: v_mov_b32_e32 v5, 16 +; GFX9-NEXT: v_mov_b32_e32 v7, s2 +; GFX9-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-NEXT: v_mov_b32_e32 v4, 0xff +; GFX9-NEXT: v_mov_b32_e32 v6, 16 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 24, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v10, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v12, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v11, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v13, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v10 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; GFX9-NEXT: v_and_or_b32 v1, v1, s4, v12 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; GFX9-NEXT: v_or3_b32 v0, v0, v11, v8 +; GFX9-NEXT: v_or3_b32 v1, v1, v13, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v0, v1, vcc +; GFX9-NEXT: v_and_or_b32 v7, v8, s3, v7 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s5, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v7, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v8, 24, v1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v9, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v11, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v10, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v12, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v9 +; GFX9-NEXT: 
v_lshlrev_b32_sdwa v9, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v10, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v6, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v0, v0, v4, v9 ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; GFX9-NEXT: v_and_or_b32 v1, v1, s4, v11 -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; GFX9-NEXT: v_and_or_b32 v1, v1, v4, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 24, v8 ; GFX9-NEXT: v_or3_b32 v0, v0, v10, v7 -; GFX9-NEXT: v_or3_b32 v1, v1, v12, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v0, v1, vcc -; GFX9-NEXT: v_and_or_b32 v6, v7, s3, v6 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s5, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 24, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v8, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v4, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v9, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v8 -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 24, v6 -; GFX9-NEXT: v_and_or_b32 v1, v1, s4, v4 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 24, v7 -; GFX9-NEXT: v_or3_b32 v0, v0, v9, v6 -; GFX9-NEXT: v_or3_b32 v1, v1, v5, v4 +; GFX9-NEXT: v_or3_b32 v1, v1, v6, v4 ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX9-NEXT: s_endpgm ; @@ -2094,15 +2075,14 @@ ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v4, 8 ; 
GFX8-NEXT: v_mov_b32_e32 v5, 16 -; GFX8-NEXT: s_lshr_b32 s1, s3, 2 -; GFX8-NEXT: s_and_b32 s3, s3, 3 -; GFX8-NEXT: s_movk_i32 s0, 0xff -; GFX8-NEXT: s_lshl_b32 s3, s3, 3 -; GFX8-NEXT: s_and_b32 s2, s2, s0 -; GFX8-NEXT: s_lshl_b32 s0, s0, s3 -; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s1, 1 -; GFX8-NEXT: s_not_b32 s0, s0 -; GFX8-NEXT: s_lshl_b32 s2, s2, s3 +; GFX8-NEXT: s_and_b32 s1, s3, 3 +; GFX8-NEXT: s_lshr_b32 s0, s3, 2 +; GFX8-NEXT: s_and_b32 s2, s2, 0xff +; GFX8-NEXT: s_lshl_b32 s1, s1, 3 +; GFX8-NEXT: s_lshl_b32 s2, s2, s1 +; GFX8-NEXT: s_lshl_b32 s1, 0xff, s1 +; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 +; GFX8-NEXT: s_not_b32 s1, s1 ; GFX8-NEXT: v_mov_b32_e32 v6, 8 ; GFX8-NEXT: v_mov_b32_e32 v7, 16 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 @@ -2123,9 +2103,9 @@ ; GFX8-NEXT: v_or_b32_e32 v0, v0, v8 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v4, s0, v4 +; GFX8-NEXT: v_and_b32_e32 v4, s1, v4 ; GFX8-NEXT: v_or_b32_e32 v4, s2, v4 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, 0 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX8-NEXT: v_lshlrev_b32_sdwa v8, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 @@ -2151,63 +2131,64 @@ ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_movk_i32 s6, 0xff -; GFX7-NEXT: s_and_b32 s1, s3, 3 -; GFX7-NEXT: s_lshr_b32 s0, s3, 2 -; GFX7-NEXT: s_and_b32 s2, s2, s6 -; GFX7-NEXT: s_lshl_b32 s1, s1, 3 -; GFX7-NEXT: s_lshl_b32 s2, s2, s1 -; GFX7-NEXT: s_lshl_b32 s1, s6, s1 -; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 -; GFX7-NEXT: s_not_b32 s1, s1 +; GFX7-NEXT: s_movk_i32 s0, 0xff +; GFX7-NEXT: s_lshr_b32 s1, s3, 2 +; GFX7-NEXT: s_and_b32 s3, s3, 3 +; GFX7-NEXT: s_and_b32 s2, s2, 0xff +; GFX7-NEXT: s_lshl_b32 s3, s3, 3 +; GFX7-NEXT: s_lshl_b32 s2, s2, s3 +; GFX7-NEXT: 
s_lshl_b32 s3, 0xff, s3 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s1, 1 +; GFX7-NEXT: s_not_b32 s3, s3 +; GFX7-NEXT: v_mov_b32_e32 v2, 0xff +; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_bfe_u32 v5, v0, 8, 8 -; GFX7-NEXT: v_bfe_u32 v7, v1, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v1 -; GFX7-NEXT: v_and_b32_e32 v4, s6, v0 +; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8 +; GFX7-NEXT: v_bfe_u32 v8, v1, 8, 8 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v1 +; GFX7-NEXT: v_and_b32_e32 v5, s0, v0 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_and_b32_e32 v6, s6, v1 +; GFX7-NEXT: v_and_b32_e32 v7, s0, v1 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 8, v8 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 -; GFX7-NEXT: v_or_b32_e32 v5, v6, v7 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v7, v8 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX7-NEXT: v_or_b32_e32 v1, v5, v1 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v2, s1, v2 -; GFX7-NEXT: v_or_b32_e32 v2, s2, v2 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0 -; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX7-NEXT: v_bfe_u32 v5, v0, 8, 8 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v5, v0 +; GFX7-NEXT: v_or_b32_e32 v1, v6, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX7-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc +; GFX7-NEXT: v_and_b32_e32 v3, s3, v3 +; GFX7-NEXT: 
v_or_b32_e32 v3, s2, v3 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, 0 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8 ; GFX7-NEXT: v_bfe_u32 v7, v1, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v1 -; GFX7-NEXT: v_and_b32_e32 v4, s6, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v1 +; GFX7-NEXT: v_and_b32_e32 v5, v0, v2 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_and_b32_e32 v6, s6, v1 +; GFX7-NEXT: v_and_b32_e32 v2, v1, v2 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v7, 8, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 -; GFX7-NEXT: v_or_b32_e32 v5, v6, v7 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX7-NEXT: v_or_b32_e32 v1, v5, v1 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v5, v0 +; GFX7-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 ; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; @@ -2216,8 +2197,7 @@ ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX10-NEXT: s_mov_b32 s0, 8 ; GFX10-NEXT: s_mov_b32 s1, 16 -; GFX10-NEXT: s_movk_i32 s4, 0xff -; GFX10-NEXT: s_and_b32 s2, s2, s4 +; GFX10-NEXT: s_and_b32 s2, s2, 0xff ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 @@ -2225,9 +2205,9 
@@ ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v6, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v7, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v0, v0, s4, v3 +; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX10-NEXT: v_and_or_b32 v1, v1, s4, v5 +; GFX10-NEXT: v_and_or_b32 v1, v1, 0xff, v5 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v4 ; GFX10-NEXT: s_lshr_b32 s0, s3, 2 ; GFX10-NEXT: s_and_b32 s1, s3, 3 @@ -2236,7 +2216,7 @@ ; GFX10-NEXT: v_or3_b32 v1, v1, v7, v3 ; GFX10-NEXT: s_lshl_b32 s1, s1, 3 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s0, 0 -; GFX10-NEXT: s_lshl_b32 s3, s4, s1 +; GFX10-NEXT: s_lshl_b32 s3, 0xff, s1 ; GFX10-NEXT: s_lshl_b32 s1, s2, s1 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc_lo ; GFX10-NEXT: s_not_b32 s2, s3 @@ -2251,9 +2231,9 @@ ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v7, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v8, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v2, v0, s4, v5 +; GFX10-NEXT: v_and_or_b32 v2, 0xff, v0, v5 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX10-NEXT: v_and_or_b32 v3, v1, s4, v3 +; GFX10-NEXT: v_and_or_b32 v3, 0xff, v1, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v6 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -2271,28 +2251,27 @@ ; GFX9-LABEL: insertelement_s_v8i8_v_s: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX9-NEXT: s_mov_b32 s9, 0x80008 -; GFX9-NEXT: s_movk_i32 s7, 0xff -; GFX9-NEXT: v_and_b32_e32 v0, s7, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX9-NEXT: s_mov_b32 s2, 8 +; 
GFX9-NEXT: s_mov_b32 s3, 16 +; GFX9-NEXT: v_mov_b32_e32 v2, 0xff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_u32 s10, s0, s9 -; GFX9-NEXT: s_and_b32 s8, s0, s7 -; GFX9-NEXT: s_lshl_b32 s10, s10, 8 -; GFX9-NEXT: s_or_b32 s8, s8, s10 -; GFX9-NEXT: s_mov_b32 s10, 0x80010 +; GFX9-NEXT: s_bfe_u32 s8, s0, 0x80008 ; GFX9-NEXT: s_lshr_b32 s5, s0, 24 -; GFX9-NEXT: s_bfe_u32 s0, s0, s10 +; GFX9-NEXT: s_and_b32 s7, s0, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s8, 8 +; GFX9-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX9-NEXT: s_or_b32 s7, s7, s8 ; GFX9-NEXT: s_lshl_b32 s0, s0, 16 -; GFX9-NEXT: s_or_b32 s0, s8, s0 +; GFX9-NEXT: s_or_b32 s0, s7, s0 ; GFX9-NEXT: s_lshl_b32 s5, s5, 24 -; GFX9-NEXT: s_bfe_u32 s8, s1, s9 +; GFX9-NEXT: s_bfe_u32 s7, s1, 0x80008 ; GFX9-NEXT: s_lshr_b32 s6, s1, 24 ; GFX9-NEXT: s_or_b32 s0, s0, s5 -; GFX9-NEXT: s_and_b32 s5, s1, s7 -; GFX9-NEXT: s_lshl_b32 s8, s8, 8 -; GFX9-NEXT: s_bfe_u32 s1, s1, s10 -; GFX9-NEXT: s_or_b32 s5, s5, s8 +; GFX9-NEXT: s_and_b32 s5, s1, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s7, 8 +; GFX9-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX9-NEXT: s_or_b32 s5, s5, s7 ; GFX9-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9-NEXT: s_or_b32 s1, s5, s1 ; GFX9-NEXT: s_lshl_b32 s5, s6, 24 @@ -2302,59 +2281,55 @@ ; GFX9-NEXT: s_cselect_b32 s6, s1, s0 ; GFX9-NEXT: s_and_b32 s4, s4, 3 ; GFX9-NEXT: s_lshl_b32 s4, s4, 3 -; GFX9-NEXT: s_lshl_b32 s8, s7, s4 -; GFX9-NEXT: s_andn2_b32 s6, s6, s8 +; GFX9-NEXT: s_lshl_b32 s7, 0xff, s4 +; GFX9-NEXT: s_andn2_b32 s6, s6, s7 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: v_lshl_or_b32 v2, v0, s4, v1 +; GFX9-NEXT: v_lshl_or_b32 v3, v0, s4, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s5, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s5, 1 -; GFX9-NEXT: s_mov_b32 s3, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v0 -; GFX9-NEXT: 
v_lshlrev_b32_sdwa v4, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_and_or_b32 v4, v0, s7, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v5, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_and_or_b32 v5, v0, v2, v5 ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v1 -; GFX9-NEXT: v_or3_b32 v0, v4, v0, v2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_and_or_b32 v4, v1, s7, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v1 +; GFX9-NEXT: v_or3_b32 v0, v5, v0, v3 +; GFX9-NEXT: v_lshlrev_b32_sdwa v3, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_and_or_b32 v5, v1, v2, v3 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: v_or3_b32 v1, v4, v1, v5 +; GFX9-NEXT: v_or3_b32 v1, v5, v1, v4 ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_s_v8i8_v_s: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX8-NEXT: s_mov_b32 s7, 0x80008 -; GFX8-NEXT: s_movk_i32 s5, 0xff ; GFX8-NEXT: v_mov_b32_e32 v4, 8 ; GFX8-NEXT: v_mov_b32_e32 v6, 16 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_u32 s8, s0, s7 -; GFX8-NEXT: s_and_b32 s6, s0, s5 -; GFX8-NEXT: s_lshl_b32 s8, s8, 8 -; GFX8-NEXT: s_or_b32 s6, s6, s8 -; GFX8-NEXT: s_mov_b32 s8, 0x80010 +; GFX8-NEXT: s_bfe_u32 s6, s0, 0x80008 ; GFX8-NEXT: s_lshr_b32 s2, s0, 24 -; 
GFX8-NEXT: s_bfe_u32 s0, s0, s8 +; GFX8-NEXT: s_and_b32 s5, s0, 0xff +; GFX8-NEXT: s_lshl_b32 s6, s6, 8 +; GFX8-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX8-NEXT: s_or_b32 s5, s5, s6 ; GFX8-NEXT: s_lshl_b32 s0, s0, 16 -; GFX8-NEXT: s_or_b32 s0, s6, s0 +; GFX8-NEXT: s_or_b32 s0, s5, s0 ; GFX8-NEXT: s_lshl_b32 s2, s2, 24 -; GFX8-NEXT: s_bfe_u32 s6, s1, s7 +; GFX8-NEXT: s_bfe_u32 s5, s1, 0x80008 ; GFX8-NEXT: s_lshr_b32 s3, s1, 24 ; GFX8-NEXT: s_or_b32 s0, s0, s2 -; GFX8-NEXT: s_and_b32 s2, s1, s5 -; GFX8-NEXT: s_lshl_b32 s6, s6, 8 -; GFX8-NEXT: s_bfe_u32 s1, s1, s8 -; GFX8-NEXT: s_or_b32 s2, s2, s6 +; GFX8-NEXT: s_and_b32 s2, s1, 0xff +; GFX8-NEXT: s_lshl_b32 s5, s5, 8 +; GFX8-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX8-NEXT: s_or_b32 s2, s2, s5 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s1, s2, s1 ; GFX8-NEXT: s_lshl_b32 s2, s3, 24 @@ -2365,7 +2340,7 @@ ; GFX8-NEXT: s_and_b32 s4, s4, 3 ; GFX8-NEXT: s_lshl_b32 s4, s4, 3 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 -; GFX8-NEXT: s_lshl_b32 s4, s5, s4 +; GFX8-NEXT: s_lshl_b32 s4, 0xff, s4 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: s_andn2_b32 s3, s3, s4 ; GFX8-NEXT: v_or_b32_e32 v2, s3, v0 @@ -2397,27 +2372,25 @@ ; GFX7-LABEL: insertelement_s_v8i8_v_s: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX7-NEXT: s_mov_b32 s7, 0x80008 -; GFX7-NEXT: s_movk_i32 s5, 0xff -; GFX7-NEXT: v_and_b32_e32 v0, s5, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX7-NEXT: v_mov_b32_e32 v2, 0xff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_bfe_u32 s8, s0, s7 -; GFX7-NEXT: s_and_b32 s6, s0, s5 -; GFX7-NEXT: s_lshl_b32 s8, s8, 8 -; GFX7-NEXT: s_or_b32 s6, s6, s8 -; GFX7-NEXT: s_mov_b32 s8, 0x80010 +; GFX7-NEXT: s_bfe_u32 s6, s0, 0x80008 ; GFX7-NEXT: s_lshr_b32 s2, s0, 24 -; GFX7-NEXT: s_bfe_u32 s0, s0, s8 +; GFX7-NEXT: s_and_b32 s5, s0, 0xff +; GFX7-NEXT: s_lshl_b32 s6, s6, 8 +; GFX7-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX7-NEXT: s_or_b32 
s5, s5, s6 ; GFX7-NEXT: s_lshl_b32 s0, s0, 16 -; GFX7-NEXT: s_or_b32 s0, s6, s0 +; GFX7-NEXT: s_or_b32 s0, s5, s0 ; GFX7-NEXT: s_lshl_b32 s2, s2, 24 -; GFX7-NEXT: s_bfe_u32 s6, s1, s7 +; GFX7-NEXT: s_bfe_u32 s5, s1, 0x80008 ; GFX7-NEXT: s_lshr_b32 s3, s1, 24 ; GFX7-NEXT: s_or_b32 s0, s0, s2 -; GFX7-NEXT: s_and_b32 s2, s1, s5 -; GFX7-NEXT: s_lshl_b32 s6, s6, 8 -; GFX7-NEXT: s_bfe_u32 s1, s1, s8 -; GFX7-NEXT: s_or_b32 s2, s2, s6 +; GFX7-NEXT: s_and_b32 s2, s1, 0xff +; GFX7-NEXT: s_lshl_b32 s5, s5, 8 +; GFX7-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX7-NEXT: s_or_b32 s2, s2, s5 ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 ; GFX7-NEXT: s_or_b32 s1, s2, s1 ; GFX7-NEXT: s_lshl_b32 s2, s3, 24 @@ -2428,34 +2401,34 @@ ; GFX7-NEXT: s_and_b32 s4, s4, 3 ; GFX7-NEXT: s_lshl_b32 s4, s4, 3 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX7-NEXT: s_lshl_b32 s4, s5, s4 +; GFX7-NEXT: s_lshl_b32 s4, 0xff, s4 ; GFX7-NEXT: s_andn2_b32 s3, s3, s4 -; GFX7-NEXT: v_or_b32_e32 v2, s3, v0 +; GFX7-NEXT: v_or_b32_e32 v3, s3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s2, 1 -; GFX7-NEXT: v_bfe_u32 v5, v0, 8, 8 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v0 -; GFX7-NEXT: v_and_b32_e32 v4, s5, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX7-NEXT: v_and_b32_e32 v5, v0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX7-NEXT: v_bfe_u32 v4, v1, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v1 -; GFX7-NEXT: 
v_or_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_and_b32_e32 v2, s5, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v5, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX7-NEXT: v_bfe_u32 v3, v1, 8, 8 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v1 +; GFX7-NEXT: v_and_b32_e32 v2, v1, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_or_b32_e32 v1, v2, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v4 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-NEXT: s_mov_b32 s2, -1 @@ -2466,57 +2439,54 @@ ; GFX10-LABEL: insertelement_s_v8i8_v_s: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX10-NEXT: s_mov_b32 s3, 0x80008 -; GFX10-NEXT: s_movk_i32 s2, 0xff -; GFX10-NEXT: s_mov_b32 s5, 0x80010 -; GFX10-NEXT: s_lshr_b32 s6, s4, 2 -; GFX10-NEXT: v_and_b32_e32 v2, s2, v0 -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s6, 0 +; GFX10-NEXT: s_lshr_b32 s2, s4, 2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xff, v0 +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_bfe_u32 s10, s0, s3 -; GFX10-NEXT: s_bfe_u32 s3, s1, s3 -; GFX10-NEXT: s_lshr_b32 s7, s0, 24 -; GFX10-NEXT: s_lshr_b32 s8, s1, 24 -; GFX10-NEXT: s_and_b32 s9, s0, s2 -; GFX10-NEXT: s_bfe_u32 s0, s0, s5 -; GFX10-NEXT: s_and_b32 s11, s1, s2 -; GFX10-NEXT: s_bfe_u32 s1, s1, s5 -; GFX10-NEXT: s_lshl_b32 s5, s10, 8 -; GFX10-NEXT: s_lshl_b32 s3, s3, 8 +; GFX10-NEXT: s_bfe_u32 s7, s0, 0x80008 +; GFX10-NEXT: s_bfe_u32 s9, s1, 0x80008 +; GFX10-NEXT: s_lshr_b32 s3, s0, 24 +; GFX10-NEXT: s_lshr_b32 s5, s1, 24 +; GFX10-NEXT: s_and_b32 s6, s0, 0xff +; GFX10-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX10-NEXT: s_and_b32 s8, s1, 0xff +; GFX10-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX10-NEXT: s_lshl_b32 s7, s7, 8 
+; GFX10-NEXT: s_lshl_b32 s9, s9, 8 ; GFX10-NEXT: s_lshl_b32 s0, s0, 16 ; GFX10-NEXT: s_lshl_b32 s1, s1, 16 -; GFX10-NEXT: s_or_b32 s5, s9, s5 -; GFX10-NEXT: s_or_b32 s3, s11, s3 -; GFX10-NEXT: s_lshl_b32 s7, s7, 24 -; GFX10-NEXT: s_lshl_b32 s8, s8, 24 -; GFX10-NEXT: s_or_b32 s0, s5, s0 -; GFX10-NEXT: s_or_b32 s1, s3, s1 -; GFX10-NEXT: s_or_b32 s0, s0, s7 -; GFX10-NEXT: s_or_b32 s1, s1, s8 -; GFX10-NEXT: s_cmp_eq_u32 s6, 1 +; GFX10-NEXT: s_or_b32 s6, s6, s7 +; GFX10-NEXT: s_or_b32 s7, s8, s9 +; GFX10-NEXT: s_lshl_b32 s3, s3, 24 +; GFX10-NEXT: s_lshl_b32 s5, s5, 24 +; GFX10-NEXT: s_or_b32 s0, s6, s0 +; GFX10-NEXT: s_or_b32 s1, s7, s1 +; GFX10-NEXT: s_or_b32 s0, s0, s3 +; GFX10-NEXT: s_or_b32 s1, s1, s5 +; GFX10-NEXT: s_cmp_eq_u32 s2, 1 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: s_cselect_b32 s3, s1, s0 ; GFX10-NEXT: s_and_b32 s4, s4, 3 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: s_lshl_b32 s4, s4, 3 ; GFX10-NEXT: s_mov_b32 s0, 8 -; GFX10-NEXT: s_lshl_b32 s5, s2, s4 +; GFX10-NEXT: s_lshl_b32 s5, 0xff, s4 ; GFX10-NEXT: s_mov_b32 s1, 16 ; GFX10-NEXT: s_andn2_b32 s3, s3, s5 ; GFX10-NEXT: v_lshl_or_b32 v2, v2, s4, s3 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s6, 1 +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX10-NEXT: v_lshlrev_b32_sdwa v6, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v3, v0, s2, v3 +; GFX10-NEXT: v_and_or_b32 v3, 0xff, v0, v3 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v7, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; 
GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX10-NEXT: v_and_or_b32 v5, v1, s2, v5 +; GFX10-NEXT: v_and_or_b32 v5, 0xff, v1, v5 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_or3_b32 v2, v3, v6, v2 @@ -2533,27 +2503,26 @@ ; GFX9-LABEL: insertelement_s_v8i8_s_v: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX9-NEXT: s_mov_b32 s9, 0x80008 -; GFX9-NEXT: s_movk_i32 s7, 0xff ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX9-NEXT: s_movk_i32 s7, 0xff +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_u32 s10, s0, s9 -; GFX9-NEXT: s_and_b32 s8, s0, s7 -; GFX9-NEXT: s_lshl_b32 s10, s10, 8 -; GFX9-NEXT: s_or_b32 s8, s8, s10 -; GFX9-NEXT: s_mov_b32 s10, 0x80010 +; GFX9-NEXT: s_bfe_u32 s9, s0, 0x80008 ; GFX9-NEXT: s_lshr_b32 s5, s0, 24 -; GFX9-NEXT: s_bfe_u32 s0, s0, s10 +; GFX9-NEXT: s_and_b32 s8, s0, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s9, 8 +; GFX9-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX9-NEXT: s_or_b32 s8, s8, s9 ; GFX9-NEXT: s_lshl_b32 s0, s0, 16 ; GFX9-NEXT: s_or_b32 s0, s8, s0 ; GFX9-NEXT: s_lshl_b32 s5, s5, 24 -; GFX9-NEXT: s_bfe_u32 s8, s1, s9 +; GFX9-NEXT: s_bfe_u32 s8, s1, 0x80008 ; GFX9-NEXT: s_lshr_b32 s6, s1, 24 ; GFX9-NEXT: s_or_b32 s0, s0, s5 -; GFX9-NEXT: s_and_b32 s5, s1, s7 +; GFX9-NEXT: s_and_b32 s5, s1, 0xff ; GFX9-NEXT: s_lshl_b32 s8, s8, 8 -; GFX9-NEXT: s_bfe_u32 s1, s1, s10 +; GFX9-NEXT: s_bfe_u32 s1, s1, 0x80010 ; GFX9-NEXT: s_or_b32 s5, s5, s8 ; GFX9-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9-NEXT: s_or_b32 s1, s5, s1 @@ -2561,9 +2530,8 @@ ; GFX9-NEXT: s_or_b32 s1, s1, s5 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX9-NEXT: s_and_b32 s4, s4, s7 +; GFX9-NEXT: s_and_b32 s4, s4, 0xff ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_lshlrev_b32_e64 v3, v0, s4 ; GFX9-NEXT: v_lshlrev_b32_e64 v0, v0, s7 
@@ -2575,16 +2543,17 @@ ; GFX9-NEXT: s_mov_b32 s2, 8 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] ; GFX9-NEXT: s_mov_b32 s3, 16 +; GFX9-NEXT: v_mov_b32_e32 v4, 0xff ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v4, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v5, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX9-NEXT: v_and_or_b32 v4, v0, s7, v4 +; GFX9-NEXT: v_and_or_b32 v5, v0, v4, v5 ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v1 -; GFX9-NEXT: v_or3_b32 v0, v4, v0, v2 +; GFX9-NEXT: v_or3_b32 v0, v5, v0, v2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_and_or_b32 v4, v1, s7, v2 +; GFX9-NEXT: v_and_or_b32 v4, v1, v4, v2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v3 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -2596,27 +2565,26 @@ ; GFX8-LABEL: insertelement_s_v8i8_s_v: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX8-NEXT: s_mov_b32 s7, 0x80008 -; GFX8-NEXT: s_movk_i32 s5, 0xff ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX8-NEXT: s_movk_i32 s5, 0xff +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_u32 s8, s0, s7 -; GFX8-NEXT: s_and_b32 s6, s0, s5 -; GFX8-NEXT: s_lshl_b32 s8, s8, 8 -; GFX8-NEXT: s_or_b32 s6, s6, s8 -; GFX8-NEXT: s_mov_b32 s8, 0x80010 +; GFX8-NEXT: s_bfe_u32 s7, s0, 0x80008 ; GFX8-NEXT: s_lshr_b32 s2, s0, 24 -; GFX8-NEXT: s_bfe_u32 s0, s0, s8 +; GFX8-NEXT: s_and_b32 s6, s0, 0xff +; GFX8-NEXT: s_lshl_b32 s7, s7, 8 +; GFX8-NEXT: s_bfe_u32 s0, s0, 0x80010 
+; GFX8-NEXT: s_or_b32 s6, s6, s7 ; GFX8-NEXT: s_lshl_b32 s0, s0, 16 ; GFX8-NEXT: s_or_b32 s0, s6, s0 ; GFX8-NEXT: s_lshl_b32 s2, s2, 24 -; GFX8-NEXT: s_bfe_u32 s6, s1, s7 +; GFX8-NEXT: s_bfe_u32 s6, s1, 0x80008 ; GFX8-NEXT: s_lshr_b32 s3, s1, 24 ; GFX8-NEXT: s_or_b32 s0, s0, s2 -; GFX8-NEXT: s_and_b32 s2, s1, s5 +; GFX8-NEXT: s_and_b32 s2, s1, 0xff ; GFX8-NEXT: s_lshl_b32 s6, s6, 8 -; GFX8-NEXT: s_bfe_u32 s1, s1, s8 +; GFX8-NEXT: s_bfe_u32 s1, s1, 0x80010 ; GFX8-NEXT: s_or_b32 s2, s2, s6 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s1, s2, s1 @@ -2624,9 +2592,8 @@ ; GFX8-NEXT: s_or_b32 s1, s1, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s0 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX8-NEXT: s_and_b32 s2, s4, s5 +; GFX8-NEXT: s_and_b32 s2, s4, 0xff ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX8-NEXT: v_lshlrev_b32_e64 v3, v0, s2 ; GFX8-NEXT: v_lshlrev_b32_e64 v0, v0, s5 @@ -2662,27 +2629,26 @@ ; GFX7-LABEL: insertelement_s_v8i8_s_v: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX7-NEXT: s_mov_b32 s7, 0x80008 -; GFX7-NEXT: s_movk_i32 s5, 0xff ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 2, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX7-NEXT: s_movk_i32 s5, 0xff +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_bfe_u32 s8, s0, s7 -; GFX7-NEXT: s_and_b32 s6, s0, s5 -; GFX7-NEXT: s_lshl_b32 s8, s8, 8 -; GFX7-NEXT: s_or_b32 s6, s6, s8 -; GFX7-NEXT: s_mov_b32 s8, 0x80010 +; GFX7-NEXT: s_bfe_u32 s7, s0, 0x80008 ; GFX7-NEXT: s_lshr_b32 s2, s0, 24 -; GFX7-NEXT: s_bfe_u32 s0, s0, s8 +; GFX7-NEXT: s_and_b32 s6, s0, 0xff +; GFX7-NEXT: s_lshl_b32 s7, s7, 8 +; GFX7-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX7-NEXT: s_or_b32 s6, s6, s7 ; GFX7-NEXT: s_lshl_b32 s0, s0, 16 ; GFX7-NEXT: s_or_b32 s0, s6, s0 ; GFX7-NEXT: s_lshl_b32 s2, s2, 24 -; GFX7-NEXT: s_bfe_u32 s6, s1, s7 +; GFX7-NEXT: s_bfe_u32 s6, s1, 0x80008 ; GFX7-NEXT: s_lshr_b32 s3, s1, 
24 ; GFX7-NEXT: s_or_b32 s0, s0, s2 -; GFX7-NEXT: s_and_b32 s2, s1, s5 +; GFX7-NEXT: s_and_b32 s2, s1, 0xff ; GFX7-NEXT: s_lshl_b32 s6, s6, 8 -; GFX7-NEXT: s_bfe_u32 s1, s1, s8 +; GFX7-NEXT: s_bfe_u32 s1, s1, 0x80010 ; GFX7-NEXT: s_or_b32 s2, s2, s6 ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 ; GFX7-NEXT: s_or_b32 s1, s2, s1 @@ -2690,9 +2656,8 @@ ; GFX7-NEXT: s_or_b32 s1, s1, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s0 ; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX7-NEXT: s_and_b32 s2, s4, s5 +; GFX7-NEXT: s_and_b32 s2, s4, 0xff ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX7-NEXT: v_lshl_b32_e32 v3, s2, v0 ; GFX7-NEXT: v_lshl_b32_e32 v0, s5, v0 @@ -2716,7 +2681,7 @@ ; GFX7-NEXT: v_bfe_u32 v4, v1, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_and_b32_e32 v2, s5, v1 +; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v4 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 @@ -2734,38 +2699,35 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX10-NEXT: v_and_b32_e32 v1, 3, v0 -; GFX10-NEXT: s_mov_b32 s3, 0x80008 -; GFX10-NEXT: s_movk_i32 s2, 0xff -; GFX10-NEXT: s_mov_b32 s5, 0x80010 -; GFX10-NEXT: s_and_b32 s4, s4, s2 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX10-NEXT: s_and_b32 s2, s4, 0xff ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 2, v0 -; GFX10-NEXT: v_lshlrev_b32_e64 v3, v1, s4 -; GFX10-NEXT: v_lshlrev_b32_e64 v0, v1, s2 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 +; GFX10-NEXT: v_lshlrev_b32_e64 v3, v1, s2 +; GFX10-NEXT: v_lshlrev_b32_e64 v0, v1, 0xff ; GFX10-NEXT: v_xor_b32_e32 v4, -1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_bfe_u32 s8, s0, s3 -; GFX10-NEXT: s_bfe_u32 s3, s1, s3 -; GFX10-NEXT: s_lshr_b32 s6, s1, 24 -; GFX10-NEXT: s_and_b32 s9, s1, s2 -; GFX10-NEXT: s_bfe_u32 s1, s1, s5 -; GFX10-NEXT: s_lshl_b32 
s3, s3, 8 +; GFX10-NEXT: s_bfe_u32 s7, s1, 0x80008 +; GFX10-NEXT: s_lshr_b32 s3, s1, 24 +; GFX10-NEXT: s_and_b32 s6, s1, 0xff +; GFX10-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX10-NEXT: s_lshl_b32 s7, s7, 8 ; GFX10-NEXT: s_lshl_b32 s1, s1, 16 -; GFX10-NEXT: s_or_b32 s3, s9, s3 -; GFX10-NEXT: s_lshl_b32 s6, s6, 24 -; GFX10-NEXT: s_or_b32 s1, s3, s1 -; GFX10-NEXT: s_lshr_b32 s4, s0, 24 -; GFX10-NEXT: s_and_b32 s7, s0, s2 -; GFX10-NEXT: s_bfe_u32 s0, s0, s5 -; GFX10-NEXT: s_lshl_b32 s5, s8, 8 -; GFX10-NEXT: s_or_b32 s1, s1, s6 +; GFX10-NEXT: s_or_b32 s6, s6, s7 +; GFX10-NEXT: s_bfe_u32 s5, s0, 0x80008 +; GFX10-NEXT: s_lshl_b32 s3, s3, 24 +; GFX10-NEXT: s_or_b32 s1, s6, s1 +; GFX10-NEXT: s_lshr_b32 s2, s0, 24 +; GFX10-NEXT: s_and_b32 s4, s0, 0xff +; GFX10-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX10-NEXT: s_lshl_b32 s5, s5, 8 +; GFX10-NEXT: s_or_b32 s1, s1, s3 ; GFX10-NEXT: s_lshl_b32 s0, s0, 16 -; GFX10-NEXT: s_lshl_b32 s3, s4, 24 -; GFX10-NEXT: s_or_b32 s4, s7, s5 +; GFX10-NEXT: s_or_b32 s3, s4, s5 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: s_or_b32 s0, s4, s0 -; GFX10-NEXT: s_or_b32 s0, s0, s3 +; GFX10-NEXT: s_lshl_b32 s2, s2, 24 +; GFX10-NEXT: s_or_b32 s0, s3, s0 +; GFX10-NEXT: s_or_b32 s0, s0, s2 ; GFX10-NEXT: v_cndmask_b32_e32 v5, s0, v1, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 @@ -2781,9 +2743,9 @@ ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v6, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v7, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v3, v0, s2, v3 +; GFX10-NEXT: v_and_or_b32 v3, 0xff, v0, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX10-NEXT: v_and_or_b32 v5, v1, s2, v5 +; GFX10-NEXT: v_and_or_b32 v5, 0xff, v1, v5 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; 
GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -2801,33 +2763,31 @@ ; GFX9-LABEL: insertelement_s_v8i8_v_v: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX9-NEXT: s_mov_b32 s8, 0x80008 -; GFX9-NEXT: s_movk_i32 s6, 0xff ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 2, v1 ; GFX9-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX9-NEXT: s_movk_i32 s6, 0xff +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_u32 s9, s0, s8 -; GFX9-NEXT: s_and_b32 s7, s0, s6 -; GFX9-NEXT: s_lshl_b32 s9, s9, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: s_mov_b32 s9, 0x80010 +; GFX9-NEXT: s_bfe_u32 s8, s0, 0x80008 ; GFX9-NEXT: s_lshr_b32 s4, s0, 24 -; GFX9-NEXT: s_bfe_u32 s0, s0, s9 +; GFX9-NEXT: s_and_b32 s7, s0, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s8, 8 +; GFX9-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX9-NEXT: s_or_b32 s7, s7, s8 ; GFX9-NEXT: s_lshl_b32 s0, s0, 16 ; GFX9-NEXT: s_or_b32 s0, s7, s0 ; GFX9-NEXT: s_lshl_b32 s4, s4, 24 -; GFX9-NEXT: s_bfe_u32 s7, s1, s8 +; GFX9-NEXT: s_bfe_u32 s7, s1, 0x80008 ; GFX9-NEXT: s_lshr_b32 s5, s1, 24 ; GFX9-NEXT: s_or_b32 s0, s0, s4 -; GFX9-NEXT: s_and_b32 s4, s1, s6 +; GFX9-NEXT: s_and_b32 s4, s1, 0xff ; GFX9-NEXT: s_lshl_b32 s7, s7, 8 -; GFX9-NEXT: s_bfe_u32 s1, s1, s9 +; GFX9-NEXT: s_bfe_u32 s1, s1, 0x80010 ; GFX9-NEXT: s_or_b32 s4, s4, s7 ; GFX9-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9-NEXT: s_or_b32 s1, s4, s1 ; GFX9-NEXT: s_lshl_b32 s4, s5, 24 ; GFX9-NEXT: s_or_b32 s1, s1, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX9-NEXT: v_mov_b32_e32 v3, s0 ; GFX9-NEXT: v_mov_b32_e32 v4, s1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 @@ -2842,16 +2802,17 @@ ; GFX9-NEXT: s_mov_b32 s2, 8 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] ; GFX9-NEXT: s_mov_b32 s3, 16 +; GFX9-NEXT: v_mov_b32_e32 v4, 0xff ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v4, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v5, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX9-NEXT: v_and_or_b32 v4, v0, s6, v4 +; GFX9-NEXT: v_and_or_b32 v5, v0, v4, v5 ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v1 -; GFX9-NEXT: v_or3_b32 v0, v4, v0, v2 +; GFX9-NEXT: v_or3_b32 v0, v5, v0, v2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_and_or_b32 v4, v1, s6, v2 +; GFX9-NEXT: v_and_or_b32 v4, v1, v4, v2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v3 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -2863,33 +2824,31 @@ ; GFX8-LABEL: insertelement_s_v8i8_v_v: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX8-NEXT: s_mov_b32 s6, 0x80008 -; GFX8-NEXT: s_movk_i32 s4, 0xff ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 2, v1 ; GFX8-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX8-NEXT: s_movk_i32 s4, 0xff +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_u32 s7, s0, s6 -; GFX8-NEXT: s_and_b32 s5, s0, s4 -; GFX8-NEXT: s_lshl_b32 s7, s7, 8 -; GFX8-NEXT: s_or_b32 s5, s5, s7 -; GFX8-NEXT: s_mov_b32 s7, 0x80010 +; GFX8-NEXT: s_bfe_u32 s6, s0, 0x80008 ; GFX8-NEXT: s_lshr_b32 s2, s0, 24 -; GFX8-NEXT: s_bfe_u32 s0, s0, s7 +; GFX8-NEXT: s_and_b32 s5, s0, 0xff +; GFX8-NEXT: s_lshl_b32 s6, s6, 8 +; GFX8-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX8-NEXT: s_or_b32 s5, s5, s6 ; GFX8-NEXT: s_lshl_b32 s0, s0, 16 ; GFX8-NEXT: s_or_b32 s0, s5, s0 ; GFX8-NEXT: s_lshl_b32 s2, s2, 24 -; GFX8-NEXT: s_bfe_u32 s5, s1, s6 +; GFX8-NEXT: s_bfe_u32 s5, s1, 0x80008 ; GFX8-NEXT: s_lshr_b32 s3, s1, 24 ; GFX8-NEXT: s_or_b32 s0, s0, s2 -; GFX8-NEXT: s_and_b32 s2, s1, s4 +; GFX8-NEXT: s_and_b32 s2, s1, 0xff ; GFX8-NEXT: s_lshl_b32 s5, s5, 8 -; GFX8-NEXT: 
s_bfe_u32 s1, s1, s7 +; GFX8-NEXT: s_bfe_u32 s1, s1, 0x80010 ; GFX8-NEXT: s_or_b32 s2, s2, s5 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s1, s2, s1 ; GFX8-NEXT: s_lshl_b32 s2, s3, 24 ; GFX8-NEXT: s_or_b32 s1, s1, s2 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX8-NEXT: v_mov_b32_e32 v3, s0 ; GFX8-NEXT: v_mov_b32_e32 v4, s1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 @@ -2928,33 +2887,31 @@ ; GFX7-LABEL: insertelement_s_v8i8_v_v: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX7-NEXT: s_mov_b32 s6, 0x80008 ; GFX7-NEXT: s_movk_i32 s4, 0xff ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 2, v1 ; GFX7-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_bfe_u32 s7, s0, s6 -; GFX7-NEXT: s_and_b32 s5, s0, s4 -; GFX7-NEXT: s_lshl_b32 s7, s7, 8 -; GFX7-NEXT: s_or_b32 s5, s5, s7 -; GFX7-NEXT: s_mov_b32 s7, 0x80010 +; GFX7-NEXT: s_bfe_u32 s6, s0, 0x80008 ; GFX7-NEXT: s_lshr_b32 s2, s0, 24 -; GFX7-NEXT: s_bfe_u32 s0, s0, s7 +; GFX7-NEXT: s_and_b32 s5, s0, 0xff +; GFX7-NEXT: s_lshl_b32 s6, s6, 8 +; GFX7-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX7-NEXT: s_or_b32 s5, s5, s6 ; GFX7-NEXT: s_lshl_b32 s0, s0, 16 ; GFX7-NEXT: s_or_b32 s0, s5, s0 ; GFX7-NEXT: s_lshl_b32 s2, s2, 24 -; GFX7-NEXT: s_bfe_u32 s5, s1, s6 +; GFX7-NEXT: s_bfe_u32 s5, s1, 0x80008 ; GFX7-NEXT: s_lshr_b32 s3, s1, 24 ; GFX7-NEXT: s_or_b32 s0, s0, s2 -; GFX7-NEXT: s_and_b32 s2, s1, s4 +; GFX7-NEXT: s_and_b32 s2, s1, 0xff ; GFX7-NEXT: s_lshl_b32 s5, s5, 8 -; GFX7-NEXT: s_bfe_u32 s1, s1, s7 +; GFX7-NEXT: s_bfe_u32 s1, s1, 0x80010 ; GFX7-NEXT: s_or_b32 s2, s2, s5 ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 ; GFX7-NEXT: s_or_b32 s1, s2, s1 ; GFX7-NEXT: s_lshl_b32 s2, s3, 24 ; GFX7-NEXT: s_or_b32 s1, s1, s2 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 ; GFX7-NEXT: v_mov_b32_e32 v3, s0 ; GFX7-NEXT: v_mov_b32_e32 v4, s1 @@ -2982,7 +2939,7 @@ ; GFX7-NEXT: v_bfe_u32 v4, v1, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 
v3, 24, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_and_b32_e32 v2, s4, v1 +; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v4 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 @@ -2999,38 +2956,35 @@ ; GFX10-LABEL: insertelement_s_v8i8_v_v: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX10-NEXT: s_mov_b32 s3, 0x80008 ; GFX10-NEXT: v_and_b32_e32 v2, 3, v1 -; GFX10-NEXT: s_movk_i32 s2, 0xff -; GFX10-NEXT: s_mov_b32 s4, 0x80010 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 2, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v2 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 ; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_lshlrev_b32_e64 v0, v2, s2 +; GFX10-NEXT: v_lshlrev_b32_e64 v0, v2, 0xff ; GFX10-NEXT: v_xor_b32_e32 v2, -1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_bfe_u32 s8, s0, s3 -; GFX10-NEXT: s_bfe_u32 s3, s1, s3 -; GFX10-NEXT: s_lshr_b32 s6, s1, 24 -; GFX10-NEXT: s_and_b32 s9, s1, s2 -; GFX10-NEXT: s_bfe_u32 s1, s1, s4 -; GFX10-NEXT: s_lshl_b32 s3, s3, 8 +; GFX10-NEXT: s_bfe_u32 s7, s1, 0x80008 +; GFX10-NEXT: s_lshr_b32 s3, s1, 24 +; GFX10-NEXT: s_and_b32 s6, s1, 0xff +; GFX10-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX10-NEXT: s_lshl_b32 s7, s7, 8 ; GFX10-NEXT: s_lshl_b32 s1, s1, 16 -; GFX10-NEXT: s_or_b32 s3, s9, s3 -; GFX10-NEXT: s_lshl_b32 s6, s6, 24 -; GFX10-NEXT: s_or_b32 s1, s3, s1 -; GFX10-NEXT: s_lshr_b32 s5, s0, 24 -; GFX10-NEXT: s_and_b32 s7, s0, s2 -; GFX10-NEXT: s_bfe_u32 s0, s0, s4 -; GFX10-NEXT: s_lshl_b32 s4, s8, 8 -; GFX10-NEXT: s_or_b32 s1, s1, s6 -; GFX10-NEXT: s_lshl_b32 s0, s0, 16 -; GFX10-NEXT: s_or_b32 s4, s7, s4 +; GFX10-NEXT: s_or_b32 s6, s6, s7 +; GFX10-NEXT: s_bfe_u32 s5, s0, 0x80008 +; GFX10-NEXT: s_lshl_b32 s3, s3, 24 +; GFX10-NEXT: s_or_b32 s1, s6, s1 +; GFX10-NEXT: s_lshr_b32 s2, s0, 24 +; GFX10-NEXT: s_and_b32 s4, s0, 0xff +; GFX10-NEXT: s_bfe_u32 s0, s0, 0x80010 +; 
GFX10-NEXT: s_lshl_b32 s5, s5, 8 +; GFX10-NEXT: s_or_b32 s1, s1, s3 +; GFX10-NEXT: s_lshl_b32 s0, s0, 16 +; GFX10-NEXT: s_or_b32 s3, s4, s5 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: s_lshl_b32 s3, s5, 24 -; GFX10-NEXT: s_or_b32 s0, s4, s0 -; GFX10-NEXT: s_or_b32 s0, s0, s3 +; GFX10-NEXT: s_lshl_b32 s2, s2, 24 +; GFX10-NEXT: s_or_b32 s0, s3, s0 +; GFX10-NEXT: s_or_b32 s0, s0, s2 ; GFX10-NEXT: v_cndmask_b32_e32 v5, s0, v1, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 @@ -3046,9 +3000,9 @@ ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v6, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v7, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v3, v0, s2, v3 +; GFX10-NEXT: v_and_or_b32 v3, 0xff, v0, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX10-NEXT: v_and_or_b32 v5, v1, s2, v5 +; GFX10-NEXT: v_and_or_b32 v5, 0xff, v1, v5 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -3069,102 +3023,103 @@ ; GFX9-NEXT: s_mov_b32 s0, 8 ; GFX9-NEXT: s_mov_b32 s1, 16 ; GFX9-NEXT: s_movk_i32 s3, 0xff -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 2, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 2, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 3, v2 -; GFX9-NEXT: s_and_b32 s2, s2, s3 +; GFX9-NEXT: v_mov_b32_e32 v5, 0xff +; GFX9-NEXT: s_and_b32 s2, s2, 0xff ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX9-NEXT: v_lshlrev_b32_e64 v8, v2, s2 -; GFX9-NEXT: v_lshlrev_b32_e64 v2, v2, s3 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 +; GFX9-NEXT: v_lshlrev_b32_e64 v9, v2, s2 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, v2, v5 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 ; GFX9-NEXT: v_xor_b32_e32 v2, -1, v2 -; GFX9-NEXT: v_mov_b32_e32 v5, 8 -; GFX9-NEXT: v_mov_b32_e32 v6, 16 +; GFX9-NEXT: v_mov_b32_e32 v6, 8 +; GFX9-NEXT: 
v_mov_b32_e32 v7, 16 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 24, v1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v11, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v13, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v12, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v14, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v0, v0, s3, v11 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 24, v9 -; GFX9-NEXT: v_and_or_b32 v1, v1, s3, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 24, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v12, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v14, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v13, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v15, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v0, v0, s3, v12 ; GFX9-NEXT: v_lshlrev_b32_e32 v10, 24, v10 -; GFX9-NEXT: v_or3_b32 v0, v0, v12, v9 -; GFX9-NEXT: v_or3_b32 v1, v1, v14, v10 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v0, v1, vcc -; GFX9-NEXT: v_and_or_b32 v2, v9, v2, v8 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v7 +; GFX9-NEXT: v_and_or_b32 v1, v1, s3, v14 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; GFX9-NEXT: v_or3_b32 v0, v0, v13, v10 +; GFX9-NEXT: v_or3_b32 v1, v1, v15, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v0, v1, vcc +; GFX9-NEXT: v_and_or_b32 v2, v10, v2, v9 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX9-NEXT: 
v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v8, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v9, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v6, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v0, v0, s3, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 24, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v9, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v6, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v10, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v7, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v0, v0, v5, v9 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX9-NEXT: v_and_or_b32 v1, v1, s3, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v7 -; GFX9-NEXT: v_or3_b32 v0, v0, v9, v2 -; GFX9-NEXT: v_or3_b32 v1, v1, v6, v5 +; GFX9-NEXT: v_and_or_b32 v1, v1, v5, v6 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v8 +; GFX9-NEXT: v_or3_b32 v0, v0, v10, v2 +; GFX9-NEXT: v_or3_b32 v1, v1, v7, v5 ; GFX9-NEXT: global_store_dwordx2 v[3:4], v[0:1], off ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_v_v8i8_s_v: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v5, 8 -; GFX8-NEXT: v_mov_b32_e32 v6, 16 -; GFX8-NEXT: s_movk_i32 s0, 0xff -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 2, v2 +; GFX8-NEXT: v_mov_b32_e32 v6, 8 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 2, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 3, v2 -; GFX8-NEXT: s_and_b32 s1, s2, s0 +; GFX8-NEXT: 
v_mov_b32_e32 v5, 0xff +; GFX8-NEXT: v_mov_b32_e32 v7, 16 +; GFX8-NEXT: s_and_b32 s0, s2, 0xff ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX8-NEXT: v_lshlrev_b32_e64 v10, v2, s1 -; GFX8-NEXT: v_lshlrev_b32_e64 v2, v2, s0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9 +; GFX8-NEXT: v_lshlrev_b32_e64 v11, v2, s0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, v2, v5 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10 ; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v9 -; GFX8-NEXT: v_mov_b32_e32 v7, 8 -; GFX8-NEXT: v_mov_b32_e32 v8, 16 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v10 +; GFX8-NEXT: v_mov_b32_e32 v8, 8 +; GFX8-NEXT: v_mov_b32_e32 v9, 16 ; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: v_mov_b32_e32 v4, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_sdwa v13, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v13, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 24, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v12, 24, v1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v14, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v14, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v7, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; 
GFX8-NEXT: v_lshlrev_b32_e32 v11, 24, v11 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 24, v12 +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 24, v12 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v14 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v7 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v6 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v11 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v5 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc ; GFX8-NEXT: v_and_b32_e32 v2, v5, v2 -; GFX8-NEXT: v_or_b32_e32 v2, v2, v10 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v11 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v7, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v8, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 24, v1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v9, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v8, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v7, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v9, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v8 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 24, v5 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v9 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v8 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v7 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v9 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v5 ; GFX8-NEXT: flat_store_dwordx2 v[3:4], v[0:1] @@ -3176,63 +3131,64 @@ ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_movk_i32 s3, 0xff -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 2, v2 +; GFX7-NEXT: s_movk_i32 s0, 0xff +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 2, v2 ; GFX7-NEXT: v_and_b32_e32 v2, 3, v2 -; GFX7-NEXT: s_and_b32 s0, s2, s3 +; GFX7-NEXT: v_mov_b32_e32 v3, 0xff +; GFX7-NEXT: s_and_b32 s1, s2, 0xff ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX7-NEXT: v_lshl_b32_e32 v4, s0, v2 -; GFX7-NEXT: v_lshl_b32_e32 v2, s3, v2 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 +; GFX7-NEXT: v_lshl_b32_e32 v5, s1, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, v2, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 ; GFX7-NEXT: v_xor_b32_e32 v2, -1, v2 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v3 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_bfe_u32 v8, v0, 8, 8 -; GFX7-NEXT: v_bfe_u32 v10, v1, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v1 -; GFX7-NEXT: v_and_b32_e32 v7, s3, v0 +; GFX7-NEXT: v_bfe_u32 v9, v0, 8, 8 +; GFX7-NEXT: v_bfe_u32 v11, v1, 8, 8 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX7-NEXT: v_and_b32_e32 v8, s0, v0 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_and_b32_e32 v9, s3, v1 +; GFX7-NEXT: v_and_b32_e32 v10, s0, v1 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v8, 8, v8 -; GFX7-NEXT: v_lshlrev_b32_e32 v10, 8, v10 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; 
GFX7-NEXT: v_lshlrev_b32_e32 v11, 8, v11 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_or_b32_e32 v7, v7, v8 -; GFX7-NEXT: v_or_b32_e32 v8, v9, v10 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX7-NEXT: v_or_b32_e32 v8, v8, v9 +; GFX7-NEXT: v_or_b32_e32 v9, v10, v11 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 24, v6 -; GFX7-NEXT: v_or_b32_e32 v0, v7, v0 -; GFX7-NEXT: v_or_b32_e32 v1, v8, v1 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v5 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v6 -; GFX7-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v2, v5, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX7-NEXT: v_or_b32_e32 v0, v8, v0 +; GFX7-NEXT: v_or_b32_e32 v1, v9, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v6 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v7 +; GFX7-NEXT: v_cndmask_b32_e32 v6, v0, v1, vcc +; GFX7-NEXT: v_and_b32_e32 v2, v6, v2 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v5 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v4 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX7-NEXT: v_bfe_u32 v5, v0, 8, 8 +; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8 ; GFX7-NEXT: v_bfe_u32 v7, v1, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v1 -; GFX7-NEXT: v_and_b32_e32 v4, s3, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v1 +; GFX7-NEXT: v_and_b32_e32 v5, v0, v3 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_and_b32_e32 v6, s3, v1 +; GFX7-NEXT: v_and_b32_e32 v3, v1, v3 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v7, 8, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 -; GFX7-NEXT: v_or_b32_e32 v5, v6, v7 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX7-NEXT: 
v_lshlrev_b32_e32 v3, 24, v3 -; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX7-NEXT: v_or_b32_e32 v1, v5, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v5, v0 +; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 ; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; @@ -3242,7 +3198,6 @@ ; GFX10-NEXT: s_mov_b32 s0, 8 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v2 ; GFX10-NEXT: s_mov_b32 s1, 16 -; GFX10-NEXT: s_movk_i32 s3, 0xff ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 2, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 @@ -3253,12 +3208,12 @@ ; GFX10-NEXT: v_lshlrev_b32_sdwa v7, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v8, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v9, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v0, v0, s3, v5 +; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v5 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX10-NEXT: v_and_or_b32 v1, v1, s3, v7 +; GFX10-NEXT: v_and_or_b32 v1, v1, 0xff, v7 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v6 -; GFX10-NEXT: v_lshlrev_b32_e64 v6, v3, s3 -; GFX10-NEXT: s_and_b32 s0, s2, s3 +; GFX10-NEXT: v_lshlrev_b32_e64 v6, v3, 0xff +; GFX10-NEXT: s_and_b32 s0, s2, 0xff ; GFX10-NEXT: v_or3_b32 v0, v0, v8, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v3, v3, s0 ; GFX10-NEXT: v_or3_b32 v1, v1, v9, v5 @@ -3276,9 +3231,9 @@ ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v7, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v5, v0, 
s3, v5 +; GFX10-NEXT: v_and_or_b32 v5, 0xff, v0, v5 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX10-NEXT: v_and_or_b32 v8, v1, s3, v2 +; GFX10-NEXT: v_and_or_b32 v8, 0xff, v1, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -3303,60 +3258,60 @@ ; GFX9-NEXT: s_and_b32 s2, s2, 3 ; GFX9-NEXT: s_lshl_b32 s2, s2, 3 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: s_lshl_b32 s2, s3, s2 +; GFX9-NEXT: s_lshl_b32 s2, 0xff, s2 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 ; GFX9-NEXT: s_not_b32 s2, s2 -; GFX9-NEXT: v_mov_b32_e32 v5, 8 -; GFX9-NEXT: v_mov_b32_e32 v6, 16 +; GFX9-NEXT: v_mov_b32_e32 v6, 8 +; GFX9-NEXT: v_mov_b32_e32 v5, 0xff +; GFX9-NEXT: v_mov_b32_e32 v7, 16 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 24, v1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v9, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v11, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v10, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v12, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v0, v0, s3, v9 -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; GFX9-NEXT: v_and_or_b32 v1, v1, s3, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 24, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v10, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v12, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v11, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; 
GFX9-NEXT: v_lshlrev_b32_sdwa v13, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v0, v0, s3, v10 ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 24, v8 -; GFX9-NEXT: v_or3_b32 v0, v0, v10, v7 -; GFX9-NEXT: v_or3_b32 v1, v1, v12, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v0, v1, vcc -; GFX9-NEXT: v_and_or_b32 v2, v7, s2, v2 +; GFX9-NEXT: v_and_or_b32 v1, v1, s3, v12 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; GFX9-NEXT: v_or3_b32 v0, v0, v11, v8 +; GFX9-NEXT: v_or3_b32 v1, v1, v13, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v0, v1, vcc +; GFX9-NEXT: v_and_or_b32 v2, v8, s2, v2 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v8, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v9, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v6, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v0, v0, s3, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 24, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v9, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v6, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v10, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v7, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v0, v0, v5, v9 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX9-NEXT: v_and_or_b32 v1, v1, s3, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v7 -; 
GFX9-NEXT: v_or3_b32 v0, v0, v9, v2 -; GFX9-NEXT: v_or3_b32 v1, v1, v6, v5 +; GFX9-NEXT: v_and_or_b32 v1, v1, v5, v6 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v8 +; GFX9-NEXT: v_or3_b32 v0, v0, v10, v2 +; GFX9-NEXT: v_or3_b32 v1, v1, v7, v5 ; GFX9-NEXT: global_store_dwordx2 v[3:4], v[0:1], off ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_v_v8i8_v_s: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX8-NEXT: s_lshr_b32 s1, s2, 2 -; GFX8-NEXT: s_and_b32 s2, s2, 3 +; GFX8-NEXT: s_and_b32 s1, s2, 3 ; GFX8-NEXT: v_mov_b32_e32 v5, 8 -; GFX8-NEXT: s_lshl_b32 s2, s2, 3 +; GFX8-NEXT: s_lshl_b32 s1, s1, 3 ; GFX8-NEXT: v_mov_b32_e32 v6, 16 -; GFX8-NEXT: v_mov_b32_e32 v9, s2 +; GFX8-NEXT: v_mov_b32_e32 v9, s1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: s_movk_i32 s0, 0xff -; GFX8-NEXT: s_lshl_b32 s0, s0, s2 -; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s1, 1 -; GFX8-NEXT: s_not_b32 s0, s0 +; GFX8-NEXT: s_lshr_b32 s0, s2, 2 +; GFX8-NEXT: s_lshl_b32 s1, 0xff, s1 +; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 +; GFX8-NEXT: s_not_b32 s1, s1 ; GFX8-NEXT: v_mov_b32_e32 v7, 8 ; GFX8-NEXT: v_mov_b32_e32 v8, 16 ; GFX8-NEXT: v_mov_b32_e32 v3, 0 @@ -3377,9 +3332,9 @@ ; GFX8-NEXT: v_or_b32_e32 v0, v0, v9 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v5 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v5, s0, v5 +; GFX8-NEXT: v_and_b32_e32 v5, s1, v5 ; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, 0 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 @@ -3405,63 +3360,64 @@ ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_movk_i32 s3, 0xff -; GFX7-NEXT: s_and_b32 
s1, s2, 3 -; GFX7-NEXT: s_lshr_b32 s0, s2, 2 -; GFX7-NEXT: v_and_b32_e32 v2, s3, v2 -; GFX7-NEXT: s_lshl_b32 s1, s1, 3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, s1, v2 -; GFX7-NEXT: s_lshl_b32 s1, s3, s1 -; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 -; GFX7-NEXT: s_not_b32 s1, s1 +; GFX7-NEXT: s_movk_i32 s0, 0xff +; GFX7-NEXT: v_mov_b32_e32 v3, 0xff +; GFX7-NEXT: s_lshr_b32 s1, s2, 2 +; GFX7-NEXT: s_and_b32 s2, s2, 3 +; GFX7-NEXT: v_and_b32_e32 v2, v2, v3 +; GFX7-NEXT: s_lshl_b32 s2, s2, 3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, s2, v2 +; GFX7-NEXT: s_lshl_b32 s2, 0xff, s2 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s1, 1 +; GFX7-NEXT: s_not_b32 s2, s2 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8 -; GFX7-NEXT: v_bfe_u32 v8, v1, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v1 -; GFX7-NEXT: v_and_b32_e32 v5, s3, v0 +; GFX7-NEXT: v_bfe_u32 v7, v0, 8, 8 +; GFX7-NEXT: v_bfe_u32 v9, v1, 8, 8 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v1 +; GFX7-NEXT: v_and_b32_e32 v6, s0, v0 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_and_b32_e32 v7, s3, v1 +; GFX7-NEXT: v_and_b32_e32 v8, s0, v1 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_or_b32_e32 v5, v5, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v7, v8 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX7-NEXT: v_or_b32_e32 v6, v6, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v8, v9 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX7-NEXT: v_or_b32_e32 v0, v5, v0 -; GFX7-NEXT: v_or_b32_e32 v1, v6, v1 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v3, s1, v3 -; GFX7-NEXT: 
v_or_b32_e32 v2, v3, v2 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX7-NEXT: v_or_b32_e32 v0, v6, v0 +; GFX7-NEXT: v_or_b32_e32 v1, v7, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v5 +; GFX7-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc +; GFX7-NEXT: v_and_b32_e32 v4, s2, v4 +; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, 0 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX7-NEXT: v_bfe_u32 v5, v0, 8, 8 +; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8 ; GFX7-NEXT: v_bfe_u32 v7, v1, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v1 -; GFX7-NEXT: v_and_b32_e32 v4, s3, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v1 +; GFX7-NEXT: v_and_b32_e32 v5, v0, v3 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_and_b32_e32 v6, s3, v1 +; GFX7-NEXT: v_and_b32_e32 v3, v1, v3 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v7, 8, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 -; GFX7-NEXT: v_or_b32_e32 v5, v6, v7 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX7-NEXT: v_or_b32_e32 v1, v5, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v5, v0 +; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 ; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; @@ -3470,7 +3426,6 @@ ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX10-NEXT: s_mov_b32 s0, 8 ; GFX10-NEXT: s_mov_b32 s1, 16 -; GFX10-NEXT: s_movk_i32 
s3, 0xff ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX10-NEXT: v_lshlrev_b32_sdwa v4, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 @@ -3478,9 +3433,9 @@ ; GFX10-NEXT: v_lshlrev_b32_sdwa v6, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v7, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v8, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v0, v0, s3, v4 +; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v4 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GFX10-NEXT: v_and_or_b32 v1, v1, s3, v6 +; GFX10-NEXT: v_and_or_b32 v1, v1, 0xff, v6 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v5 ; GFX10-NEXT: s_lshr_b32 s1, s2, 2 ; GFX10-NEXT: s_and_b32 s0, s2, 3 @@ -3489,7 +3444,7 @@ ; GFX10-NEXT: v_or3_b32 v1, v1, v8, v4 ; GFX10-NEXT: s_lshl_b32 s0, s0, 3 ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: s_lshl_b32 s0, s3, s0 +; GFX10-NEXT: s_lshl_b32 s0, 0xff, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc_lo ; GFX10-NEXT: s_not_b32 s0, s0 ; GFX10-NEXT: v_and_or_b32 v2, v3, s0, v2 @@ -3505,8 +3460,8 @@ ; GFX10-NEXT: v_lshlrev_b32_sdwa v7, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v8, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX10-NEXT: v_and_or_b32 v2, v0, s3, v5 -; GFX10-NEXT: v_and_or_b32 v3, v1, s3, v3 +; GFX10-NEXT: v_and_or_b32 v2, 0xff, v0, v5 +; GFX10-NEXT: v_and_or_b32 v3, 0xff, v1, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v6 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -3700,48 +3655,46 @@ ; GFX10-NEXT: s_mov_b32 s0, 8 ; GFX10-NEXT: v_and_b32_e32 v4, 3, v3 ; GFX10-NEXT: s_mov_b32 s1, 16 -; 
GFX10-NEXT: s_movk_i32 s2, 0xff -; GFX10-NEXT: v_mov_b32_e32 v5, 0xff ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 2, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v0 -; GFX10-NEXT: v_lshlrev_b32_sdwa v7, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshrrev_b32_e32 v8, 24, v1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v9, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v10, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v11, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v0, v0, s2, v7 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6 -; GFX10-NEXT: v_and_or_b32 v1, v1, s2, v9 -; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v8 -; GFX10-NEXT: v_lshlrev_b32_e32 v8, v4, v5 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v6, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v8, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v9, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v10, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX10-NEXT: v_and_or_b32 v1, v1, 0xff, v8 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v7 +; GFX10-NEXT: v_lshlrev_b32_e64 v7, v4, 0xff ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v3 -; GFX10-NEXT: v_or3_b32 v0, v0, v10, v6 +; GFX10-NEXT: v_or3_b32 v0, v0, v9, v5 ; GFX10-NEXT: 
v_mov_b32_e32 v3, 8 -; GFX10-NEXT: v_or3_b32 v1, v1, v11, v7 -; GFX10-NEXT: v_xor_b32_e32 v4, -1, v8 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v0, v1, vcc_lo -; GFX10-NEXT: v_and_or_b32 v2, v6, v4, v2 +; GFX10-NEXT: v_or3_b32 v1, v1, v10, v6 +; GFX10-NEXT: v_xor_b32_e32 v4, -1, v7 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc_lo +; GFX10-NEXT: v_and_or_b32 v2, v5, v4, v2 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v2, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v2, 16 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX10-NEXT: v_lshlrev_b32_sdwa v6, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v8, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v9, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v2, v0, v5, v6 +; GFX10-NEXT: v_lshlrev_b32_sdwa v7, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v8, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_and_or_b32 v2, 0xff, v0, v5 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX10-NEXT: v_and_or_b32 v3, v1, v5, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v7 +; GFX10-NEXT: v_and_or_b32 v3, 0xff, v1, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v6 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_or3_b32 v2, v2, v8, v4 -; GFX10-NEXT: v_or3_b32 v3, v3, v9, v5 +; GFX10-NEXT: v_or3_b32 v2, v2, v7, v4 +; GFX10-NEXT: v_or3_b32 v3, v3, v8, v5 ; GFX10-NEXT: global_store_dwordx2 v[0:1], 
v[2:3], off ; GFX10-NEXT: s_endpgm %vec = load <8 x i8>, <8 x i8> addrspace(1)* %ptr @@ -3754,47 +3707,44 @@ ; GFX9-LABEL: insertelement_s_v16i8_s_s: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GFX9-NEXT: s_mov_b32 s12, 0x80008 -; GFX9-NEXT: s_movk_i32 s10, 0xff ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_u32 s13, s0, s12 -; GFX9-NEXT: s_and_b32 s11, s0, s10 -; GFX9-NEXT: s_lshl_b32 s13, s13, 8 -; GFX9-NEXT: s_or_b32 s11, s11, s13 -; GFX9-NEXT: s_mov_b32 s13, 0x80010 +; GFX9-NEXT: s_bfe_u32 s11, s0, 0x80008 ; GFX9-NEXT: s_lshr_b32 s6, s0, 24 -; GFX9-NEXT: s_bfe_u32 s0, s0, s13 +; GFX9-NEXT: s_and_b32 s10, s0, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s11, 8 +; GFX9-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX9-NEXT: s_or_b32 s10, s10, s11 ; GFX9-NEXT: s_lshl_b32 s0, s0, 16 -; GFX9-NEXT: s_or_b32 s0, s11, s0 +; GFX9-NEXT: s_or_b32 s0, s10, s0 ; GFX9-NEXT: s_lshl_b32 s6, s6, 24 -; GFX9-NEXT: s_bfe_u32 s11, s1, s12 +; GFX9-NEXT: s_bfe_u32 s10, s1, 0x80008 ; GFX9-NEXT: s_lshr_b32 s7, s1, 24 ; GFX9-NEXT: s_or_b32 s0, s0, s6 -; GFX9-NEXT: s_and_b32 s6, s1, s10 -; GFX9-NEXT: s_lshl_b32 s11, s11, 8 -; GFX9-NEXT: s_bfe_u32 s1, s1, s13 -; GFX9-NEXT: s_or_b32 s6, s6, s11 +; GFX9-NEXT: s_and_b32 s6, s1, 0xff +; GFX9-NEXT: s_lshl_b32 s10, s10, 8 +; GFX9-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX9-NEXT: s_or_b32 s6, s6, s10 ; GFX9-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9-NEXT: s_or_b32 s1, s6, s1 ; GFX9-NEXT: s_lshl_b32 s6, s7, 24 -; GFX9-NEXT: s_bfe_u32 s7, s2, s12 +; GFX9-NEXT: s_bfe_u32 s7, s2, 0x80008 ; GFX9-NEXT: s_lshr_b32 s8, s2, 24 ; GFX9-NEXT: s_or_b32 s1, s1, s6 -; GFX9-NEXT: s_and_b32 s6, s2, s10 +; GFX9-NEXT: s_and_b32 s6, s2, 0xff ; GFX9-NEXT: s_lshl_b32 s7, s7, 8 -; GFX9-NEXT: s_bfe_u32 s2, s2, s13 +; GFX9-NEXT: s_bfe_u32 s2, s2, 0x80010 ; GFX9-NEXT: s_or_b32 s6, s6, s7 ; GFX9-NEXT: s_lshl_b32 s2, s2, 16 ; GFX9-NEXT: s_or_b32 s2, s6, s2 ; GFX9-NEXT: s_lshl_b32 s6, s8, 24 -; GFX9-NEXT: 
s_bfe_u32 s7, s3, s12 +; GFX9-NEXT: s_bfe_u32 s7, s3, 0x80008 ; GFX9-NEXT: s_lshr_b32 s9, s3, 24 ; GFX9-NEXT: s_or_b32 s2, s2, s6 -; GFX9-NEXT: s_and_b32 s6, s3, s10 +; GFX9-NEXT: s_and_b32 s6, s3, 0xff ; GFX9-NEXT: s_lshl_b32 s7, s7, 8 -; GFX9-NEXT: s_bfe_u32 s3, s3, s13 +; GFX9-NEXT: s_bfe_u32 s3, s3, 0x80010 ; GFX9-NEXT: s_or_b32 s6, s6, s7 ; GFX9-NEXT: s_lshl_b32 s3, s3, 16 ; GFX9-NEXT: s_or_b32 s3, s6, s3 @@ -3809,9 +3759,9 @@ ; GFX9-NEXT: s_cselect_b32 s7, s3, s7 ; GFX9-NEXT: s_and_b32 s5, s5, 3 ; GFX9-NEXT: s_lshl_b32 s5, s5, 3 -; GFX9-NEXT: s_and_b32 s4, s4, s10 +; GFX9-NEXT: s_and_b32 s4, s4, 0xff ; GFX9-NEXT: s_lshl_b32 s4, s4, s5 -; GFX9-NEXT: s_lshl_b32 s5, s10, s5 +; GFX9-NEXT: s_lshl_b32 s5, 0xff, s5 ; GFX9-NEXT: s_andn2_b32 s5, s7, s5 ; GFX9-NEXT: s_or_b32 s4, s5, s4 ; GFX9-NEXT: s_cmp_eq_u32 s6, 0 @@ -3822,41 +3772,41 @@ ; GFX9-NEXT: s_cselect_b32 s2, s4, s2 ; GFX9-NEXT: s_cmp_eq_u32 s6, 3 ; GFX9-NEXT: s_cselect_b32 s3, s4, s3 -; GFX9-NEXT: s_bfe_u32 s9, s0, s12 +; GFX9-NEXT: s_bfe_u32 s9, s0, 0x80008 ; GFX9-NEXT: s_lshr_b32 s4, s0, 24 -; GFX9-NEXT: s_and_b32 s8, s0, s10 +; GFX9-NEXT: s_and_b32 s8, s0, 0xff ; GFX9-NEXT: s_lshl_b32 s9, s9, 8 -; GFX9-NEXT: s_bfe_u32 s0, s0, s13 +; GFX9-NEXT: s_bfe_u32 s0, s0, 0x80010 ; GFX9-NEXT: s_or_b32 s8, s8, s9 ; GFX9-NEXT: s_lshl_b32 s0, s0, 16 ; GFX9-NEXT: s_or_b32 s0, s8, s0 ; GFX9-NEXT: s_lshl_b32 s4, s4, 24 -; GFX9-NEXT: s_bfe_u32 s8, s1, s12 +; GFX9-NEXT: s_bfe_u32 s8, s1, 0x80008 ; GFX9-NEXT: s_lshr_b32 s5, s1, 24 ; GFX9-NEXT: s_or_b32 s0, s0, s4 -; GFX9-NEXT: s_and_b32 s4, s1, s10 +; GFX9-NEXT: s_and_b32 s4, s1, 0xff ; GFX9-NEXT: s_lshl_b32 s8, s8, 8 -; GFX9-NEXT: s_bfe_u32 s1, s1, s13 +; GFX9-NEXT: s_bfe_u32 s1, s1, 0x80010 ; GFX9-NEXT: s_or_b32 s4, s4, s8 ; GFX9-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9-NEXT: s_or_b32 s1, s4, s1 ; GFX9-NEXT: s_lshl_b32 s4, s5, 24 -; GFX9-NEXT: s_bfe_u32 s5, s2, s12 +; GFX9-NEXT: s_bfe_u32 s5, s2, 0x80008 ; GFX9-NEXT: s_lshr_b32 s6, s2, 24 ; GFX9-NEXT: s_or_b32 s1, s1, s4 -; 
GFX9-NEXT: s_and_b32 s4, s2, s10 +; GFX9-NEXT: s_and_b32 s4, s2, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s5, 8 -; GFX9-NEXT: s_bfe_u32 s2, s2, s13 +; GFX9-NEXT: s_bfe_u32 s2, s2, 0x80010 ; GFX9-NEXT: s_or_b32 s4, s4, s5 ; GFX9-NEXT: s_lshl_b32 s2, s2, 16 ; GFX9-NEXT: s_or_b32 s2, s4, s2 ; GFX9-NEXT: s_lshl_b32 s4, s6, 24 -; GFX9-NEXT: s_bfe_u32 s5, s3, s12 +; GFX9-NEXT: s_bfe_u32 s5, s3, 0x80008 ; GFX9-NEXT: s_lshr_b32 s7, s3, 24 ; GFX9-NEXT: s_or_b32 s2, s2, s4 -; GFX9-NEXT: s_and_b32 s4, s3, s10 +; GFX9-NEXT: s_and_b32 s4, s3, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s5, 8 -; GFX9-NEXT: s_bfe_u32 s3, s3, s13 +; GFX9-NEXT: s_bfe_u32 s3, s3, 0x80010 ; GFX9-NEXT: s_or_b32 s4, s4, s5 ; GFX9-NEXT: s_lshl_b32 s3, s3, 16 ; GFX9-NEXT: s_or_b32 s3, s4, s3 @@ -3872,47 +3822,44 @@ ; GFX8-LABEL: insertelement_s_v16i8_s_s: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GFX8-NEXT: s_mov_b32 s12, 0x80008 -; GFX8-NEXT: s_movk_i32 s10, 0xff ; GFX8-NEXT: v_mov_b32_e32 v4, 0 ; GFX8-NEXT: v_mov_b32_e32 v5, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_u32 s13, s0, s12 -; GFX8-NEXT: s_and_b32 s11, s0, s10 -; GFX8-NEXT: s_lshl_b32 s13, s13, 8 -; GFX8-NEXT: s_or_b32 s11, s11, s13 -; GFX8-NEXT: s_mov_b32 s13, 0x80010 +; GFX8-NEXT: s_bfe_u32 s11, s0, 0x80008 ; GFX8-NEXT: s_lshr_b32 s6, s0, 24 -; GFX8-NEXT: s_bfe_u32 s0, s0, s13 +; GFX8-NEXT: s_and_b32 s10, s0, 0xff +; GFX8-NEXT: s_lshl_b32 s11, s11, 8 +; GFX8-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX8-NEXT: s_or_b32 s10, s10, s11 ; GFX8-NEXT: s_lshl_b32 s0, s0, 16 -; GFX8-NEXT: s_or_b32 s0, s11, s0 +; GFX8-NEXT: s_or_b32 s0, s10, s0 ; GFX8-NEXT: s_lshl_b32 s6, s6, 24 -; GFX8-NEXT: s_bfe_u32 s11, s1, s12 +; GFX8-NEXT: s_bfe_u32 s10, s1, 0x80008 ; GFX8-NEXT: s_lshr_b32 s7, s1, 24 ; GFX8-NEXT: s_or_b32 s0, s0, s6 -; GFX8-NEXT: s_and_b32 s6, s1, s10 -; GFX8-NEXT: s_lshl_b32 s11, s11, 8 -; GFX8-NEXT: s_bfe_u32 s1, s1, s13 -; GFX8-NEXT: s_or_b32 s6, s6, s11 +; GFX8-NEXT: s_and_b32 s6, s1, 0xff +; GFX8-NEXT: s_lshl_b32 s10, s10, 8 
+; GFX8-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX8-NEXT: s_or_b32 s6, s6, s10 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s1, s6, s1 ; GFX8-NEXT: s_lshl_b32 s6, s7, 24 -; GFX8-NEXT: s_bfe_u32 s7, s2, s12 +; GFX8-NEXT: s_bfe_u32 s7, s2, 0x80008 ; GFX8-NEXT: s_lshr_b32 s8, s2, 24 ; GFX8-NEXT: s_or_b32 s1, s1, s6 -; GFX8-NEXT: s_and_b32 s6, s2, s10 +; GFX8-NEXT: s_and_b32 s6, s2, 0xff ; GFX8-NEXT: s_lshl_b32 s7, s7, 8 -; GFX8-NEXT: s_bfe_u32 s2, s2, s13 +; GFX8-NEXT: s_bfe_u32 s2, s2, 0x80010 ; GFX8-NEXT: s_or_b32 s6, s6, s7 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16 ; GFX8-NEXT: s_or_b32 s2, s6, s2 ; GFX8-NEXT: s_lshl_b32 s6, s8, 24 -; GFX8-NEXT: s_bfe_u32 s7, s3, s12 +; GFX8-NEXT: s_bfe_u32 s7, s3, 0x80008 ; GFX8-NEXT: s_lshr_b32 s9, s3, 24 ; GFX8-NEXT: s_or_b32 s2, s2, s6 -; GFX8-NEXT: s_and_b32 s6, s3, s10 +; GFX8-NEXT: s_and_b32 s6, s3, 0xff ; GFX8-NEXT: s_lshl_b32 s7, s7, 8 -; GFX8-NEXT: s_bfe_u32 s3, s3, s13 +; GFX8-NEXT: s_bfe_u32 s3, s3, 0x80010 ; GFX8-NEXT: s_or_b32 s6, s6, s7 ; GFX8-NEXT: s_lshl_b32 s3, s3, 16 ; GFX8-NEXT: s_or_b32 s3, s6, s3 @@ -3927,9 +3874,9 @@ ; GFX8-NEXT: s_cselect_b32 s7, s3, s7 ; GFX8-NEXT: s_and_b32 s5, s5, 3 ; GFX8-NEXT: s_lshl_b32 s5, s5, 3 -; GFX8-NEXT: s_and_b32 s4, s4, s10 +; GFX8-NEXT: s_and_b32 s4, s4, 0xff ; GFX8-NEXT: s_lshl_b32 s4, s4, s5 -; GFX8-NEXT: s_lshl_b32 s5, s10, s5 +; GFX8-NEXT: s_lshl_b32 s5, 0xff, s5 ; GFX8-NEXT: s_andn2_b32 s5, s7, s5 ; GFX8-NEXT: s_or_b32 s4, s5, s4 ; GFX8-NEXT: s_cmp_eq_u32 s6, 0 @@ -3940,41 +3887,41 @@ ; GFX8-NEXT: s_cselect_b32 s2, s4, s2 ; GFX8-NEXT: s_cmp_eq_u32 s6, 3 ; GFX8-NEXT: s_cselect_b32 s3, s4, s3 -; GFX8-NEXT: s_bfe_u32 s9, s0, s12 +; GFX8-NEXT: s_bfe_u32 s9, s0, 0x80008 ; GFX8-NEXT: s_lshr_b32 s4, s0, 24 -; GFX8-NEXT: s_and_b32 s8, s0, s10 +; GFX8-NEXT: s_and_b32 s8, s0, 0xff ; GFX8-NEXT: s_lshl_b32 s9, s9, 8 -; GFX8-NEXT: s_bfe_u32 s0, s0, s13 +; GFX8-NEXT: s_bfe_u32 s0, s0, 0x80010 ; GFX8-NEXT: s_or_b32 s8, s8, s9 ; GFX8-NEXT: s_lshl_b32 s0, s0, 16 ; GFX8-NEXT: s_or_b32 s0, 
s8, s0 ; GFX8-NEXT: s_lshl_b32 s4, s4, 24 -; GFX8-NEXT: s_bfe_u32 s8, s1, s12 +; GFX8-NEXT: s_bfe_u32 s8, s1, 0x80008 ; GFX8-NEXT: s_lshr_b32 s5, s1, 24 ; GFX8-NEXT: s_or_b32 s0, s0, s4 -; GFX8-NEXT: s_and_b32 s4, s1, s10 +; GFX8-NEXT: s_and_b32 s4, s1, 0xff ; GFX8-NEXT: s_lshl_b32 s8, s8, 8 -; GFX8-NEXT: s_bfe_u32 s1, s1, s13 +; GFX8-NEXT: s_bfe_u32 s1, s1, 0x80010 ; GFX8-NEXT: s_or_b32 s4, s4, s8 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s1, s4, s1 ; GFX8-NEXT: s_lshl_b32 s4, s5, 24 -; GFX8-NEXT: s_bfe_u32 s5, s2, s12 +; GFX8-NEXT: s_bfe_u32 s5, s2, 0x80008 ; GFX8-NEXT: s_lshr_b32 s6, s2, 24 ; GFX8-NEXT: s_or_b32 s1, s1, s4 -; GFX8-NEXT: s_and_b32 s4, s2, s10 +; GFX8-NEXT: s_and_b32 s4, s2, 0xff ; GFX8-NEXT: s_lshl_b32 s5, s5, 8 -; GFX8-NEXT: s_bfe_u32 s2, s2, s13 +; GFX8-NEXT: s_bfe_u32 s2, s2, 0x80010 ; GFX8-NEXT: s_or_b32 s4, s4, s5 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16 ; GFX8-NEXT: s_or_b32 s2, s4, s2 ; GFX8-NEXT: s_lshl_b32 s4, s6, 24 -; GFX8-NEXT: s_bfe_u32 s5, s3, s12 +; GFX8-NEXT: s_bfe_u32 s5, s3, 0x80008 ; GFX8-NEXT: s_lshr_b32 s7, s3, 24 ; GFX8-NEXT: s_or_b32 s2, s2, s4 -; GFX8-NEXT: s_and_b32 s4, s3, s10 +; GFX8-NEXT: s_and_b32 s4, s3, 0xff ; GFX8-NEXT: s_lshl_b32 s5, s5, 8 -; GFX8-NEXT: s_bfe_u32 s3, s3, s13 +; GFX8-NEXT: s_bfe_u32 s3, s3, 0x80010 ; GFX8-NEXT: s_or_b32 s4, s4, s5 ; GFX8-NEXT: s_lshl_b32 s3, s3, 16 ; GFX8-NEXT: s_or_b32 s3, s4, s3 @@ -3990,45 +3937,42 @@ ; GFX7-LABEL: insertelement_s_v16i8_s_s: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GFX7-NEXT: s_mov_b32 s12, 0x80008 -; GFX7-NEXT: s_movk_i32 s10, 0xff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_bfe_u32 s13, s0, s12 -; GFX7-NEXT: s_and_b32 s11, s0, s10 -; GFX7-NEXT: s_lshl_b32 s13, s13, 8 -; GFX7-NEXT: s_or_b32 s11, s11, s13 -; GFX7-NEXT: s_mov_b32 s13, 0x80010 +; GFX7-NEXT: s_bfe_u32 s11, s0, 0x80008 ; GFX7-NEXT: s_lshr_b32 s6, s0, 24 -; GFX7-NEXT: s_bfe_u32 s0, s0, s13 +; GFX7-NEXT: s_and_b32 s10, s0, 0xff +; GFX7-NEXT: s_lshl_b32 s11, 
s11, 8 +; GFX7-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX7-NEXT: s_or_b32 s10, s10, s11 ; GFX7-NEXT: s_lshl_b32 s0, s0, 16 -; GFX7-NEXT: s_or_b32 s0, s11, s0 +; GFX7-NEXT: s_or_b32 s0, s10, s0 ; GFX7-NEXT: s_lshl_b32 s6, s6, 24 -; GFX7-NEXT: s_bfe_u32 s11, s1, s12 +; GFX7-NEXT: s_bfe_u32 s10, s1, 0x80008 ; GFX7-NEXT: s_lshr_b32 s7, s1, 24 ; GFX7-NEXT: s_or_b32 s0, s0, s6 -; GFX7-NEXT: s_and_b32 s6, s1, s10 -; GFX7-NEXT: s_lshl_b32 s11, s11, 8 -; GFX7-NEXT: s_bfe_u32 s1, s1, s13 -; GFX7-NEXT: s_or_b32 s6, s6, s11 +; GFX7-NEXT: s_and_b32 s6, s1, 0xff +; GFX7-NEXT: s_lshl_b32 s10, s10, 8 +; GFX7-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX7-NEXT: s_or_b32 s6, s6, s10 ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 ; GFX7-NEXT: s_or_b32 s1, s6, s1 ; GFX7-NEXT: s_lshl_b32 s6, s7, 24 -; GFX7-NEXT: s_bfe_u32 s7, s2, s12 +; GFX7-NEXT: s_bfe_u32 s7, s2, 0x80008 ; GFX7-NEXT: s_lshr_b32 s8, s2, 24 ; GFX7-NEXT: s_or_b32 s1, s1, s6 -; GFX7-NEXT: s_and_b32 s6, s2, s10 +; GFX7-NEXT: s_and_b32 s6, s2, 0xff ; GFX7-NEXT: s_lshl_b32 s7, s7, 8 -; GFX7-NEXT: s_bfe_u32 s2, s2, s13 +; GFX7-NEXT: s_bfe_u32 s2, s2, 0x80010 ; GFX7-NEXT: s_or_b32 s6, s6, s7 ; GFX7-NEXT: s_lshl_b32 s2, s2, 16 ; GFX7-NEXT: s_or_b32 s2, s6, s2 ; GFX7-NEXT: s_lshl_b32 s6, s8, 24 -; GFX7-NEXT: s_bfe_u32 s7, s3, s12 +; GFX7-NEXT: s_bfe_u32 s7, s3, 0x80008 ; GFX7-NEXT: s_lshr_b32 s9, s3, 24 ; GFX7-NEXT: s_or_b32 s2, s2, s6 -; GFX7-NEXT: s_and_b32 s6, s3, s10 +; GFX7-NEXT: s_and_b32 s6, s3, 0xff ; GFX7-NEXT: s_lshl_b32 s7, s7, 8 -; GFX7-NEXT: s_bfe_u32 s3, s3, s13 +; GFX7-NEXT: s_bfe_u32 s3, s3, 0x80010 ; GFX7-NEXT: s_or_b32 s6, s6, s7 ; GFX7-NEXT: s_lshl_b32 s3, s3, 16 ; GFX7-NEXT: s_or_b32 s3, s6, s3 @@ -4043,9 +3987,9 @@ ; GFX7-NEXT: s_cselect_b32 s7, s3, s7 ; GFX7-NEXT: s_and_b32 s5, s5, 3 ; GFX7-NEXT: s_lshl_b32 s5, s5, 3 -; GFX7-NEXT: s_and_b32 s4, s4, s10 +; GFX7-NEXT: s_and_b32 s4, s4, 0xff ; GFX7-NEXT: s_lshl_b32 s4, s4, s5 -; GFX7-NEXT: s_lshl_b32 s5, s10, s5 +; GFX7-NEXT: s_lshl_b32 s5, 0xff, s5 ; GFX7-NEXT: s_andn2_b32 s5, s7, 
s5 ; GFX7-NEXT: s_or_b32 s4, s5, s4 ; GFX7-NEXT: s_cmp_eq_u32 s6, 0 @@ -4056,41 +4000,41 @@ ; GFX7-NEXT: s_cselect_b32 s2, s4, s2 ; GFX7-NEXT: s_cmp_eq_u32 s6, 3 ; GFX7-NEXT: s_cselect_b32 s3, s4, s3 -; GFX7-NEXT: s_bfe_u32 s14, s5, s12 +; GFX7-NEXT: s_bfe_u32 s11, s5, 0x80008 ; GFX7-NEXT: s_lshr_b32 s4, s5, 24 -; GFX7-NEXT: s_and_b32 s11, s5, s10 -; GFX7-NEXT: s_lshl_b32 s14, s14, 8 -; GFX7-NEXT: s_bfe_u32 s5, s5, s13 -; GFX7-NEXT: s_or_b32 s11, s11, s14 +; GFX7-NEXT: s_and_b32 s10, s5, 0xff +; GFX7-NEXT: s_lshl_b32 s11, s11, 8 +; GFX7-NEXT: s_bfe_u32 s5, s5, 0x80010 +; GFX7-NEXT: s_or_b32 s10, s10, s11 ; GFX7-NEXT: s_lshl_b32 s5, s5, 16 -; GFX7-NEXT: s_or_b32 s5, s11, s5 +; GFX7-NEXT: s_or_b32 s5, s10, s5 ; GFX7-NEXT: s_lshl_b32 s4, s4, 24 -; GFX7-NEXT: s_bfe_u32 s11, s7, s12 +; GFX7-NEXT: s_bfe_u32 s10, s7, 0x80008 ; GFX7-NEXT: s_lshr_b32 s6, s7, 24 ; GFX7-NEXT: s_or_b32 s4, s5, s4 -; GFX7-NEXT: s_and_b32 s5, s7, s10 -; GFX7-NEXT: s_lshl_b32 s11, s11, 8 -; GFX7-NEXT: s_bfe_u32 s7, s7, s13 -; GFX7-NEXT: s_or_b32 s5, s5, s11 +; GFX7-NEXT: s_and_b32 s5, s7, 0xff +; GFX7-NEXT: s_lshl_b32 s10, s10, 8 +; GFX7-NEXT: s_bfe_u32 s7, s7, 0x80010 +; GFX7-NEXT: s_or_b32 s5, s5, s10 ; GFX7-NEXT: s_lshl_b32 s7, s7, 16 ; GFX7-NEXT: s_or_b32 s5, s5, s7 ; GFX7-NEXT: s_lshl_b32 s6, s6, 24 -; GFX7-NEXT: s_bfe_u32 s7, s2, s12 +; GFX7-NEXT: s_bfe_u32 s7, s2, 0x80008 ; GFX7-NEXT: s_lshr_b32 s8, s2, 24 ; GFX7-NEXT: s_or_b32 s5, s5, s6 -; GFX7-NEXT: s_and_b32 s6, s2, s10 +; GFX7-NEXT: s_and_b32 s6, s2, 0xff ; GFX7-NEXT: s_lshl_b32 s7, s7, 8 -; GFX7-NEXT: s_bfe_u32 s2, s2, s13 +; GFX7-NEXT: s_bfe_u32 s2, s2, 0x80010 ; GFX7-NEXT: s_or_b32 s6, s6, s7 ; GFX7-NEXT: s_lshl_b32 s2, s2, 16 ; GFX7-NEXT: s_or_b32 s2, s6, s2 ; GFX7-NEXT: s_lshl_b32 s6, s8, 24 -; GFX7-NEXT: s_bfe_u32 s7, s3, s12 +; GFX7-NEXT: s_bfe_u32 s7, s3, 0x80008 ; GFX7-NEXT: s_lshr_b32 s9, s3, 24 ; GFX7-NEXT: s_or_b32 s6, s2, s6 -; GFX7-NEXT: s_and_b32 s2, s3, s10 +; GFX7-NEXT: s_and_b32 s2, s3, 0xff ; GFX7-NEXT: s_lshl_b32 
s7, s7, 8 -; GFX7-NEXT: s_bfe_u32 s3, s3, s13 +; GFX7-NEXT: s_bfe_u32 s3, s3, 0x80010 ; GFX7-NEXT: s_or_b32 s2, s2, s7 ; GFX7-NEXT: s_lshl_b32 s3, s3, 16 ; GFX7-NEXT: s_or_b32 s2, s2, s3 @@ -4109,112 +4053,109 @@ ; GFX10-LABEL: insertelement_s_v16i8_s_s: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GFX10-NEXT: s_mov_b32 s7, 0x80008 -; GFX10-NEXT: s_movk_i32 s6, 0xff -; GFX10-NEXT: s_mov_b32 s8, 0x80010 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_mov_b32_e32 v5, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_bfe_u32 s14, s0, s7 -; GFX10-NEXT: s_lshr_b32 s9, s0, 24 -; GFX10-NEXT: s_and_b32 s13, s0, s6 -; GFX10-NEXT: s_bfe_u32 s0, s0, s8 -; GFX10-NEXT: s_bfe_u32 s16, s1, s7 -; GFX10-NEXT: s_lshl_b32 s14, s14, 8 -; GFX10-NEXT: s_lshr_b32 s10, s1, 24 -; GFX10-NEXT: s_and_b32 s15, s1, s6 -; GFX10-NEXT: s_bfe_u32 s1, s1, s8 -; GFX10-NEXT: s_lshl_b32 s0, s0, 16 -; GFX10-NEXT: s_lshl_b32 s16, s16, 8 -; GFX10-NEXT: s_or_b32 s13, s13, s14 -; GFX10-NEXT: s_bfe_u32 s18, s2, s7 -; GFX10-NEXT: s_lshl_b32 s9, s9, 24 +; GFX10-NEXT: s_bfe_u32 s11, s0, 0x80008 +; GFX10-NEXT: s_bfe_u32 s13, s1, 0x80008 +; GFX10-NEXT: s_lshr_b32 s7, s1, 24 +; GFX10-NEXT: s_and_b32 s10, s0, 0xff +; GFX10-NEXT: s_and_b32 s12, s1, 0xff +; GFX10-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX10-NEXT: s_lshl_b32 s11, s11, 8 +; GFX10-NEXT: s_lshl_b32 s13, s13, 8 +; GFX10-NEXT: s_lshr_b32 s6, s0, 24 +; GFX10-NEXT: s_bfe_u32 s0, s0, 0x80010 ; GFX10-NEXT: s_lshl_b32 s1, s1, 16 -; GFX10-NEXT: s_or_b32 s14, s15, s16 -; GFX10-NEXT: s_or_b32 s0, s13, s0 -; GFX10-NEXT: s_lshr_b32 s11, s2, 24 -; GFX10-NEXT: s_and_b32 s17, s2, s6 -; GFX10-NEXT: s_lshl_b32 s10, s10, 24 -; GFX10-NEXT: s_or_b32 s1, s14, s1 -; GFX10-NEXT: s_or_b32 s0, s0, s9 -; GFX10-NEXT: s_lshl_b32 s9, s18, 8 -; GFX10-NEXT: s_bfe_u32 s2, s2, s8 -; GFX10-NEXT: s_or_b32 s9, s17, s9 +; GFX10-NEXT: s_or_b32 s10, s10, s11 +; GFX10-NEXT: s_or_b32 s11, s12, s13 +; GFX10-NEXT: s_bfe_u32 s15, s2, 0x80008 +; GFX10-NEXT: s_lshl_b32 s0, 
s0, 16 +; GFX10-NEXT: s_lshl_b32 s7, s7, 24 +; GFX10-NEXT: s_or_b32 s1, s11, s1 +; GFX10-NEXT: s_lshr_b32 s8, s2, 24 +; GFX10-NEXT: s_and_b32 s14, s2, 0xff +; GFX10-NEXT: s_bfe_u32 s2, s2, 0x80010 +; GFX10-NEXT: s_lshl_b32 s6, s6, 24 +; GFX10-NEXT: s_lshl_b32 s15, s15, 8 +; GFX10-NEXT: s_or_b32 s0, s10, s0 +; GFX10-NEXT: s_or_b32 s1, s1, s7 +; GFX10-NEXT: s_bfe_u32 s7, s3, 0x80008 +; GFX10-NEXT: s_lshr_b32 s9, s3, 24 +; GFX10-NEXT: s_or_b32 s12, s14, s15 +; GFX10-NEXT: s_or_b32 s0, s0, s6 ; GFX10-NEXT: s_lshl_b32 s2, s2, 16 -; GFX10-NEXT: s_or_b32 s1, s1, s10 -; GFX10-NEXT: s_bfe_u32 s10, s3, s7 -; GFX10-NEXT: s_lshr_b32 s12, s3, 24 -; GFX10-NEXT: s_or_b32 s2, s9, s2 -; GFX10-NEXT: s_lshl_b32 s9, s11, 24 -; GFX10-NEXT: s_and_b32 s11, s3, s6 -; GFX10-NEXT: s_lshl_b32 s10, s10, 8 -; GFX10-NEXT: s_bfe_u32 s3, s3, s8 -; GFX10-NEXT: s_or_b32 s10, s11, s10 +; GFX10-NEXT: s_lshl_b32 s6, s8, 24 +; GFX10-NEXT: s_and_b32 s8, s3, 0xff +; GFX10-NEXT: s_lshl_b32 s7, s7, 8 +; GFX10-NEXT: s_bfe_u32 s3, s3, 0x80010 +; GFX10-NEXT: s_or_b32 s2, s12, s2 +; GFX10-NEXT: s_or_b32 s7, s8, s7 ; GFX10-NEXT: s_lshl_b32 s3, s3, 16 -; GFX10-NEXT: s_or_b32 s2, s2, s9 -; GFX10-NEXT: s_or_b32 s3, s10, s3 -; GFX10-NEXT: s_lshl_b32 s9, s12, 24 -; GFX10-NEXT: s_lshr_b32 s10, s5, 2 -; GFX10-NEXT: s_or_b32 s3, s3, s9 -; GFX10-NEXT: s_cmp_eq_u32 s10, 1 -; GFX10-NEXT: s_cselect_b32 s9, s1, s0 -; GFX10-NEXT: s_cmp_eq_u32 s10, 2 -; GFX10-NEXT: s_cselect_b32 s9, s2, s9 -; GFX10-NEXT: s_cmp_eq_u32 s10, 3 -; GFX10-NEXT: s_cselect_b32 s9, s3, s9 +; GFX10-NEXT: s_or_b32 s2, s2, s6 +; GFX10-NEXT: s_or_b32 s3, s7, s3 +; GFX10-NEXT: s_lshl_b32 s6, s9, 24 +; GFX10-NEXT: s_lshr_b32 s7, s5, 2 +; GFX10-NEXT: s_or_b32 s3, s3, s6 +; GFX10-NEXT: s_cmp_eq_u32 s7, 1 +; GFX10-NEXT: s_cselect_b32 s6, s1, s0 +; GFX10-NEXT: s_cmp_eq_u32 s7, 2 +; GFX10-NEXT: s_cselect_b32 s6, s2, s6 +; GFX10-NEXT: s_cmp_eq_u32 s7, 3 +; GFX10-NEXT: s_cselect_b32 s6, s3, s6 ; GFX10-NEXT: s_and_b32 s5, s5, 3 -; GFX10-NEXT: s_and_b32 s4, s4, s6 
+; GFX10-NEXT: s_and_b32 s4, s4, 0xff ; GFX10-NEXT: s_lshl_b32 s5, s5, 3 -; GFX10-NEXT: s_lshl_b32 s11, s6, s5 +; GFX10-NEXT: s_lshl_b32 s8, 0xff, s5 ; GFX10-NEXT: s_lshl_b32 s4, s4, s5 -; GFX10-NEXT: s_andn2_b32 s5, s9, s11 +; GFX10-NEXT: s_andn2_b32 s5, s6, s8 ; GFX10-NEXT: s_or_b32 s4, s5, s4 -; GFX10-NEXT: s_cmp_eq_u32 s10, 0 +; GFX10-NEXT: s_cmp_eq_u32 s7, 0 ; GFX10-NEXT: s_cselect_b32 s0, s4, s0 -; GFX10-NEXT: s_cmp_eq_u32 s10, 1 +; GFX10-NEXT: s_cmp_eq_u32 s7, 1 ; GFX10-NEXT: s_cselect_b32 s1, s4, s1 -; GFX10-NEXT: s_cmp_eq_u32 s10, 2 +; GFX10-NEXT: s_cmp_eq_u32 s7, 2 ; GFX10-NEXT: s_cselect_b32 s2, s4, s2 -; GFX10-NEXT: s_cmp_eq_u32 s10, 3 +; GFX10-NEXT: s_cmp_eq_u32 s7, 3 ; GFX10-NEXT: s_cselect_b32 s3, s4, s3 -; GFX10-NEXT: s_bfe_u32 s10, s0, s7 +; GFX10-NEXT: s_bfe_u32 s7, s0, 0x80008 ; GFX10-NEXT: s_lshr_b32 s4, s0, 24 -; GFX10-NEXT: s_and_b32 s11, s0, s6 -; GFX10-NEXT: s_lshl_b32 s10, s10, 8 -; GFX10-NEXT: s_bfe_u32 s0, s0, s8 -; GFX10-NEXT: s_or_b32 s10, s11, s10 +; GFX10-NEXT: s_and_b32 s8, s0, 0xff +; GFX10-NEXT: s_lshl_b32 s7, s7, 8 +; GFX10-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX10-NEXT: s_or_b32 s7, s8, s7 ; GFX10-NEXT: s_lshl_b32 s0, s0, 16 ; GFX10-NEXT: s_lshr_b32 s5, s1, 24 -; GFX10-NEXT: s_or_b32 s0, s10, s0 -; GFX10-NEXT: s_bfe_u32 s10, s1, s7 +; GFX10-NEXT: s_or_b32 s0, s7, s0 +; GFX10-NEXT: s_bfe_u32 s7, s1, 0x80008 ; GFX10-NEXT: s_lshl_b32 s4, s4, 24 -; GFX10-NEXT: s_and_b32 s12, s1, s6 -; GFX10-NEXT: s_lshl_b32 s10, s10, 8 -; GFX10-NEXT: s_bfe_u32 s1, s1, s8 -; GFX10-NEXT: s_or_b32 s10, s12, s10 +; GFX10-NEXT: s_and_b32 s9, s1, 0xff +; GFX10-NEXT: s_lshl_b32 s7, s7, 8 +; GFX10-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX10-NEXT: s_or_b32 s7, s9, s7 ; GFX10-NEXT: s_lshl_b32 s1, s1, 16 ; GFX10-NEXT: s_or_b32 s0, s0, s4 ; GFX10-NEXT: s_lshl_b32 s4, s5, 24 -; GFX10-NEXT: s_bfe_u32 s5, s2, s7 -; GFX10-NEXT: s_lshr_b32 s9, s2, 24 -; GFX10-NEXT: s_or_b32 s1, s10, s1 -; GFX10-NEXT: s_and_b32 s10, s2, s6 +; GFX10-NEXT: s_bfe_u32 s5, s2, 0x80008 +; 
GFX10-NEXT: s_lshr_b32 s6, s2, 24 +; GFX10-NEXT: s_or_b32 s1, s7, s1 +; GFX10-NEXT: s_and_b32 s7, s2, 0xff ; GFX10-NEXT: s_lshl_b32 s5, s5, 8 -; GFX10-NEXT: s_bfe_u32 s2, s2, s8 -; GFX10-NEXT: s_or_b32 s5, s10, s5 +; GFX10-NEXT: s_bfe_u32 s2, s2, 0x80010 +; GFX10-NEXT: s_or_b32 s5, s7, s5 ; GFX10-NEXT: s_lshl_b32 s2, s2, 16 ; GFX10-NEXT: s_or_b32 s1, s1, s4 -; GFX10-NEXT: s_bfe_u32 s4, s3, s7 -; GFX10-NEXT: s_lshr_b32 s11, s3, 24 +; GFX10-NEXT: s_bfe_u32 s4, s3, 0x80008 +; GFX10-NEXT: s_lshr_b32 s8, s3, 24 ; GFX10-NEXT: s_or_b32 s2, s5, s2 -; GFX10-NEXT: s_and_b32 s5, s3, s6 +; GFX10-NEXT: s_and_b32 s5, s3, 0xff ; GFX10-NEXT: s_lshl_b32 s4, s4, 8 -; GFX10-NEXT: s_bfe_u32 s3, s3, s8 +; GFX10-NEXT: s_bfe_u32 s3, s3, 0x80010 ; GFX10-NEXT: s_or_b32 s4, s5, s4 ; GFX10-NEXT: s_lshl_b32 s3, s3, 16 -; GFX10-NEXT: s_lshl_b32 s5, s9, 24 +; GFX10-NEXT: s_lshl_b32 s5, s6, 24 ; GFX10-NEXT: s_or_b32 s3, s4, s3 -; GFX10-NEXT: s_lshl_b32 s4, s11, 24 +; GFX10-NEXT: s_lshl_b32 s4, s8, 24 ; GFX10-NEXT: s_or_b32 s2, s2, s5 ; GFX10-NEXT: s_or_b32 s3, s3, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 @@ -4235,19 +4176,18 @@ ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off ; GFX9-NEXT: s_mov_b32 s0, 8 ; GFX9-NEXT: s_mov_b32 s1, 16 -; GFX9-NEXT: s_movk_i32 s6, 0xff -; GFX9-NEXT: v_mov_b32_e32 v6, 8 -; GFX9-NEXT: v_mov_b32_e32 v7, 16 -; GFX9-NEXT: s_lshr_b32 s4, s3, 2 +; GFX9-NEXT: s_movk_i32 s4, 0xff +; GFX9-NEXT: v_mov_b32_e32 v7, 8 +; GFX9-NEXT: v_mov_b32_e32 v6, 0xff +; GFX9-NEXT: v_mov_b32_e32 v8, 16 +; GFX9-NEXT: s_lshr_b32 s5, s3, 2 ; GFX9-NEXT: s_and_b32 s3, s3, 3 -; GFX9-NEXT: s_and_b32 s2, s2, s6 +; GFX9-NEXT: s_and_b32 s2, s2, 0xff ; GFX9-NEXT: s_lshl_b32 s3, s3, 3 -; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 +; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s5, 1 ; GFX9-NEXT: s_lshl_b32 s2, s2, s3 -; GFX9-NEXT: s_lshl_b32 s3, s6, s3 -; GFX9-NEXT: s_not_b32 s5, s3 -; GFX9-NEXT: v_mov_b32_e32 v8, s2 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 +; GFX9-NEXT: s_lshl_b32 s3, 0xff, s3 +; GFX9-NEXT: 
s_not_b32 s6, s3 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -4258,57 +4198,59 @@ ; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v14, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v16, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v17, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_and_or_b32 v0, v0, s6, v13 -; GFX9-NEXT: v_and_or_b32 v1, v1, s6, v15 +; GFX9-NEXT: v_lshlrev_b32_sdwa v17, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v13 +; GFX9-NEXT: v_and_or_b32 v1, v1, s4, v15 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 24, v9 ; GFX9-NEXT: v_lshlrev_b32_e32 v10, 24, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v12, 24, v3 -; GFX9-NEXT: v_lshlrev_b32_sdwa v18, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v19, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_and_or_b32 v2, v2, s6, v17 +; GFX9-NEXT: v_lshlrev_b32_sdwa v18, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v19, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_and_or_b32 v2, v2, v6, v17 ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 24, v11 ; GFX9-NEXT: v_or3_b32 v0, v0, v14, v9 ; GFX9-NEXT: v_or3_b32 v1, v1, v16, v10 -; GFX9-NEXT: v_and_or_b32 v13, v3, s6, v19 -; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v13, v3, v6, v19 +; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_e32 v12, 24, v12 ; GFX9-NEXT: v_or3_b32 v2, v2, v18, v11 ; GFX9-NEXT: 
v_cndmask_b32_e32 v9, v0, v1, vcc -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s5, 2 +; GFX9-NEXT: v_mov_b32_e32 v15, s2 ; GFX9-NEXT: v_or3_b32 v3, v13, v3, v12 ; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v2, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s5, 3 ; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v3, s[2:3] -; GFX9-NEXT: v_and_or_b32 v8, v9, s5, v8 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v8, s[2:3] -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 24, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 24, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v3 -; GFX9-NEXT: v_lshlrev_b32_sdwa v12, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v14, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v16, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v6, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v13, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v15, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v17, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v7, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v0, v0, s6, v12 -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 24, v8 -; GFX9-NEXT: v_and_or_b32 v1, v1, s6, v14 +; GFX9-NEXT: v_and_or_b32 v9, v9, s6, v15 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], s5, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[4:5] +; GFX9-NEXT: 
v_cndmask_b32_e32 v1, v1, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[2:3] +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 24, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 24, v3 +; GFX9-NEXT: v_lshlrev_b32_sdwa v13, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v15, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v17, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v7, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v14, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v16, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v18, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v8, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v0, v0, v6, v13 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 24, v9 -; GFX9-NEXT: v_and_or_b32 v2, v2, s6, v16 +; GFX9-NEXT: v_and_or_b32 v1, v1, v6, v15 ; GFX9-NEXT: v_lshlrev_b32_e32 v10, 24, v10 -; GFX9-NEXT: v_and_or_b32 v3, v3, s6, v6 -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 24, v11 -; GFX9-NEXT: v_or3_b32 v0, v0, v13, v8 -; GFX9-NEXT: v_or3_b32 v1, v1, v15, v9 -; GFX9-NEXT: v_or3_b32 v2, v2, v17, v10 -; GFX9-NEXT: v_or3_b32 v3, v3, v7, v6 +; GFX9-NEXT: v_and_or_b32 v2, v2, v6, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; GFX9-NEXT: v_and_or_b32 v3, v3, v6, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 24, v12 +; GFX9-NEXT: v_or3_b32 v0, v0, v14, v9 +; GFX9-NEXT: v_or3_b32 v1, v1, v16, v10 +; GFX9-NEXT: v_or3_b32 v2, v2, v18, v11 +; GFX9-NEXT: v_or3_b32 v3, v3, v8, 
v6 ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX9-NEXT: s_endpgm ; @@ -4319,14 +4261,13 @@ ; GFX8-NEXT: v_mov_b32_e32 v7, 16 ; GFX8-NEXT: v_mov_b32_e32 v8, 8 ; GFX8-NEXT: v_mov_b32_e32 v9, 16 -; GFX8-NEXT: s_and_b32 s1, s3, 3 -; GFX8-NEXT: s_movk_i32 s0, 0xff +; GFX8-NEXT: s_and_b32 s0, s3, 3 ; GFX8-NEXT: s_lshr_b32 s4, s3, 2 -; GFX8-NEXT: s_lshl_b32 s1, s1, 3 -; GFX8-NEXT: s_and_b32 s2, s2, s0 -; GFX8-NEXT: s_lshl_b32 s0, s0, s1 +; GFX8-NEXT: s_and_b32 s1, s2, 0xff +; GFX8-NEXT: s_lshl_b32 s0, s0, 3 +; GFX8-NEXT: s_lshl_b32 s5, s1, s0 +; GFX8-NEXT: s_lshl_b32 s0, 0xff, s0 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 -; GFX8-NEXT: s_lshl_b32 s5, s2, s1 ; GFX8-NEXT: s_not_b32 s6, s0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 @@ -4408,108 +4349,109 @@ ; GFX7-NEXT: s_mov_b32 s11, 0xf000 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_movk_i32 s6, 0xff -; GFX7-NEXT: s_and_b32 s0, s3, 3 +; GFX7-NEXT: s_movk_i32 s0, 0xff +; GFX7-NEXT: v_mov_b32_e32 v4, 0xff +; GFX7-NEXT: s_and_b32 s1, s3, 3 ; GFX7-NEXT: s_lshr_b32 s4, s3, 2 -; GFX7-NEXT: s_and_b32 s1, s2, s6 -; GFX7-NEXT: s_lshl_b32 s0, s0, 3 -; GFX7-NEXT: s_lshl_b32 s5, s1, s0 -; GFX7-NEXT: s_lshl_b32 s0, s6, s0 +; GFX7-NEXT: s_and_b32 s2, s2, 0xff +; GFX7-NEXT: s_lshl_b32 s1, s1, 3 +; GFX7-NEXT: s_lshl_b32 s5, s2, s1 +; GFX7-NEXT: s_lshl_b32 s1, 0xff, s1 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 -; GFX7-NEXT: s_not_b32 s7, s0 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 +; GFX7-NEXT: s_not_b32 s6, s1 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 ; GFX7-NEXT: s_mov_b32 s10, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_bfe_u32 v9, v0, 8, 8 -; GFX7-NEXT: v_bfe_u32 v11, v1, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v1 -; GFX7-NEXT: v_and_b32_e32 v8, s6, v0 +; GFX7-NEXT: v_bfe_u32 v10, v0, 8, 8 +; GFX7-NEXT: v_bfe_u32 v12, v1, 8, 8 +; GFX7-NEXT: 
v_lshrrev_b32_e32 v5, 24, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v1 +; GFX7-NEXT: v_and_b32_e32 v9, s0, v0 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_and_b32_e32 v10, s6, v1 +; GFX7-NEXT: v_and_b32_e32 v11, s0, v1 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX7-NEXT: v_bfe_u32 v13, v2, 8, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; GFX7-NEXT: v_lshlrev_b32_e32 v11, 8, v11 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v2 -; GFX7-NEXT: v_and_b32_e32 v12, s6, v2 +; GFX7-NEXT: v_bfe_u32 v14, v2, 8, 8 +; GFX7-NEXT: v_lshlrev_b32_e32 v10, 8, v10 +; GFX7-NEXT: v_lshlrev_b32_e32 v12, 8, v12 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v2 +; GFX7-NEXT: v_and_b32_e32 v13, v2, v4 ; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 8 -; GFX7-NEXT: v_bfe_u32 v15, v3, 8, 8 +; GFX7-NEXT: v_bfe_u32 v16, v3, 8, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v13, 8, v13 -; GFX7-NEXT: v_or_b32_e32 v8, v8, v9 -; GFX7-NEXT: v_or_b32_e32 v9, v10, v11 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v3 -; GFX7-NEXT: v_and_b32_e32 v14, s6, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; GFX7-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX7-NEXT: v_or_b32_e32 v10, v11, v12 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 24, v3 +; GFX7-NEXT: v_and_b32_e32 v15, v3, v4 ; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 24, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v15, 8, v15 -; GFX7-NEXT: v_or_b32_e32 v10, v12, v13 -; GFX7-NEXT: v_or_b32_e32 v0, v8, v0 -; GFX7-NEXT: v_or_b32_e32 v1, v9, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 24, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_or_b32_e32 v11, v14, v15 -; GFX7-NEXT: v_or_b32_e32 v2, v10, v2 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v16, 8, v16 +; GFX7-NEXT: v_or_b32_e32 v11, v13, v14 +; GFX7-NEXT: 
v_or_b32_e32 v0, v9, v0 +; GFX7-NEXT: v_or_b32_e32 v1, v10, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; GFX7-NEXT: v_or_b32_e32 v3, v11, v3 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v6 -; GFX7-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc -; GFX7-NEXT: v_or_b32_e32 v3, v3, v7 -; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v2, s[0:1] -; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v3, s[2:3] -; GFX7-NEXT: v_and_b32_e32 v4, s7, v4 -; GFX7-NEXT: v_or_b32_e32 v4, s5, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v12, v15, v16 +; GFX7-NEXT: v_or_b32_e32 v2, v11, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; GFX7-NEXT: v_or_b32_e32 v3, v12, v3 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v7 +; GFX7-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v8 +; GFX7-NEXT: v_cndmask_b32_e64 v5, v5, v2, s[0:1] +; GFX7-NEXT: v_cndmask_b32_e64 v5, v5, v3, s[2:3] +; GFX7-NEXT: v_and_b32_e32 v5, s6, v5 +; GFX7-NEXT: v_or_b32_e32 v5, s5, v5 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0 -; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1] -; GFX7-NEXT: v_bfe_u32 v9, v0, 8, 8 -; GFX7-NEXT: v_bfe_u32 v11, v1, 8, 8 -; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[2:3] -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v1 -; GFX7-NEXT: v_and_b32_e32 v8, s6, v0 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] +; GFX7-NEXT: v_bfe_u32 v10, v0, 8, 8 +; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[2:3] +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v0 +; GFX7-NEXT: v_and_b32_e32 v9, v0, v4 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_and_b32_e32 v10, s6, v1 -; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX7-NEXT: v_bfe_u32 v13, v2, 8, 8 -; 
GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; GFX7-NEXT: v_lshlrev_b32_e32 v11, 8, v11 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v2 -; GFX7-NEXT: v_and_b32_e32 v12, s6, v2 +; GFX7-NEXT: v_bfe_u32 v12, v1, 8, 8 +; GFX7-NEXT: v_bfe_u32 v14, v2, 8, 8 +; GFX7-NEXT: v_lshlrev_b32_e32 v10, 8, v10 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v2 +; GFX7-NEXT: v_and_b32_e32 v11, v1, v4 +; GFX7-NEXT: v_and_b32_e32 v13, v2, v4 ; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v13, 8, v13 -; GFX7-NEXT: v_or_b32_e32 v8, v8, v9 -; GFX7-NEXT: v_or_b32_e32 v9, v10, v11 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v12, 8, v12 +; GFX7-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; GFX7-NEXT: v_or_b32_e32 v9, v9, v10 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX7-NEXT: v_or_b32_e32 v10, v11, v12 +; GFX7-NEXT: v_or_b32_e32 v11, v13, v14 +; GFX7-NEXT: v_or_b32_e32 v0, v9, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_or_b32_e32 v10, v12, v13 -; GFX7-NEXT: v_or_b32_e32 v0, v8, v0 -; GFX7-NEXT: v_or_b32_e32 v1, v9, v1 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v5 -; GFX7-NEXT: v_or_b32_e32 v2, v10, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v6 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX7-NEXT: v_or_b32_e32 v2, v11, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 24, v7 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v5 ; GFX7-NEXT: v_bfe_u32 v5, v3, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v3 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 -; GFX7-NEXT: v_and_b32_e32 v4, s6, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 24, v3 +; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 +; GFX7-NEXT: v_and_b32_e32 v4, v3, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 8 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: 
v_lshlrev_b32_e32 v6, 24, v6 +; GFX7-NEXT: v_or_b32_e32 v1, v10, v1 ; GFX7-NEXT: v_or_b32_e32 v3, v4, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v8 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v6 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; GFX7-NEXT: s_endpgm @@ -4520,11 +4462,10 @@ ; GFX10-NEXT: s_mov_b32 s0, 8 ; GFX10-NEXT: v_mov_b32_e32 v4, 8 ; GFX10-NEXT: s_mov_b32 s1, 16 -; GFX10-NEXT: s_movk_i32 s4, 0xff ; GFX10-NEXT: v_mov_b32_e32 v5, 16 -; GFX10-NEXT: s_lshr_b32 s5, s3, 2 -; GFX10-NEXT: s_and_b32 s2, s2, s4 -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s5, 1 +; GFX10-NEXT: s_lshr_b32 s4, s3, 2 +; GFX10-NEXT: s_and_b32 s2, s2, 0xff +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s4, 1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v1 @@ -4534,34 +4475,34 @@ ; GFX10-NEXT: v_lshlrev_b32_sdwa v11, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v13, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v14, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_and_or_b32 v0, v0, s4, v10 +; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v10 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6 -; GFX10-NEXT: v_and_or_b32 v1, v1, s4, v12 +; GFX10-NEXT: v_and_or_b32 v1, v1, 0xff, v12 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v7 ; GFX10-NEXT: v_lshrrev_b32_e32 v9, 24, v3 ; GFX10-NEXT: v_lshlrev_b32_sdwa v15, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v16, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_and_or_b32 v2, v2, s4, v14 +; GFX10-NEXT: v_and_or_b32 v2, 0xff, v2, v14 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8 ; GFX10-NEXT: v_or3_b32 v0, v0, v11, v6 ; GFX10-NEXT: v_or3_b32 v1, v1, v13, v7 ; 
GFX10-NEXT: v_lshlrev_b32_sdwa v10, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v3, v3, s4, v16 +; GFX10-NEXT: v_and_or_b32 v3, 0xff, v3, v16 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v9 ; GFX10-NEXT: v_or3_b32 v2, v2, v15, v8 ; GFX10-NEXT: v_cndmask_b32_e32 v7, v0, v1, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s5, 2 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s4, 2 ; GFX10-NEXT: s_and_b32 s1, s3, 3 ; GFX10-NEXT: v_or3_b32 v3, v3, v10, v6 ; GFX10-NEXT: s_lshl_b32 s3, s1, 3 -; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s5, 3 +; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s4, 3 ; GFX10-NEXT: v_cndmask_b32_e64 v6, v7, v2, s0 -; GFX10-NEXT: s_lshl_b32 s6, s4, s3 +; GFX10-NEXT: s_lshl_b32 s5, 0xff, s3 ; GFX10-NEXT: s_lshl_b32 s2, s2, s3 -; GFX10-NEXT: s_not_b32 s3, s6 +; GFX10-NEXT: s_not_b32 s3, s5 ; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v3, s1 ; GFX10-NEXT: v_and_or_b32 v6, v6, s3, s2 -; GFX10-NEXT: v_cmp_eq_u32_e64 s2, s5, 0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s2, s4, 0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v6, s2 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0 @@ -4578,13 +4519,13 @@ ; GFX10-NEXT: v_lshlrev_b32_sdwa v13, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v15, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v16, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v0, v0, s4, v10 +; GFX10-NEXT: v_and_or_b32 v0, 0xff, v0, v10 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6 -; GFX10-NEXT: v_and_or_b32 v1, v1, s4, v12 +; GFX10-NEXT: v_and_or_b32 v1, 0xff, v1, v12 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; GFX10-NEXT: v_and_or_b32 v2, v2, s4, v14 +; GFX10-NEXT: v_and_or_b32 v2, 0xff, v2, v14 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8 -; GFX10-NEXT: v_and_or_b32 v3, v3, s4, v4 +; GFX10-NEXT: v_and_or_b32 v3, 0xff, v3, v4 
; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v9 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_mov_b32_e32 v5, 0 @@ -4604,47 +4545,46 @@ ; GFX9-LABEL: insertelement_s_v16i8_v_s: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GFX9-NEXT: s_mov_b32 s13, 0x80008 -; GFX9-NEXT: s_movk_i32 s11, 0xff -; GFX9-NEXT: v_and_b32_e32 v0, s11, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX9-NEXT: s_mov_b32 s5, 8 +; GFX9-NEXT: s_mov_b32 s6, 16 +; GFX9-NEXT: v_mov_b32_e32 v4, 0xff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_u32 s14, s0, s13 -; GFX9-NEXT: s_and_b32 s12, s0, s11 -; GFX9-NEXT: s_lshl_b32 s14, s14, 8 -; GFX9-NEXT: s_or_b32 s12, s12, s14 -; GFX9-NEXT: s_mov_b32 s14, 0x80010 +; GFX9-NEXT: s_bfe_u32 s12, s0, 0x80008 ; GFX9-NEXT: s_lshr_b32 s7, s0, 24 -; GFX9-NEXT: s_bfe_u32 s0, s0, s14 +; GFX9-NEXT: s_and_b32 s11, s0, 0xff +; GFX9-NEXT: s_lshl_b32 s12, s12, 8 +; GFX9-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX9-NEXT: s_or_b32 s11, s11, s12 ; GFX9-NEXT: s_lshl_b32 s0, s0, 16 -; GFX9-NEXT: s_or_b32 s0, s12, s0 +; GFX9-NEXT: s_or_b32 s0, s11, s0 ; GFX9-NEXT: s_lshl_b32 s7, s7, 24 -; GFX9-NEXT: s_bfe_u32 s12, s1, s13 +; GFX9-NEXT: s_bfe_u32 s11, s1, 0x80008 ; GFX9-NEXT: s_lshr_b32 s8, s1, 24 ; GFX9-NEXT: s_or_b32 s0, s0, s7 -; GFX9-NEXT: s_and_b32 s7, s1, s11 -; GFX9-NEXT: s_lshl_b32 s12, s12, 8 -; GFX9-NEXT: s_bfe_u32 s1, s1, s14 -; GFX9-NEXT: s_or_b32 s7, s7, s12 +; GFX9-NEXT: s_and_b32 s7, s1, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s11, 8 +; GFX9-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX9-NEXT: s_or_b32 s7, s7, s11 ; GFX9-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9-NEXT: s_or_b32 s1, s7, s1 ; GFX9-NEXT: s_lshl_b32 s7, s8, 24 -; GFX9-NEXT: s_bfe_u32 s8, s2, s13 +; GFX9-NEXT: s_bfe_u32 s8, s2, 0x80008 ; GFX9-NEXT: s_lshr_b32 s9, s2, 24 ; GFX9-NEXT: s_or_b32 s1, s1, s7 -; GFX9-NEXT: s_and_b32 s7, s2, s11 +; GFX9-NEXT: s_and_b32 s7, s2, 0xff ; GFX9-NEXT: s_lshl_b32 s8, s8, 8 -; GFX9-NEXT: s_bfe_u32 s2, s2, s14 +; GFX9-NEXT: s_bfe_u32 s2, s2, 0x80010 ; GFX9-NEXT: 
s_or_b32 s7, s7, s8 ; GFX9-NEXT: s_lshl_b32 s2, s2, 16 ; GFX9-NEXT: s_or_b32 s2, s7, s2 ; GFX9-NEXT: s_lshl_b32 s7, s9, 24 -; GFX9-NEXT: s_bfe_u32 s8, s3, s13 +; GFX9-NEXT: s_bfe_u32 s8, s3, 0x80008 ; GFX9-NEXT: s_lshr_b32 s10, s3, 24 ; GFX9-NEXT: s_or_b32 s2, s2, s7 -; GFX9-NEXT: s_and_b32 s7, s3, s11 +; GFX9-NEXT: s_and_b32 s7, s3, 0xff ; GFX9-NEXT: s_lshl_b32 s8, s8, 8 -; GFX9-NEXT: s_bfe_u32 s3, s3, s14 +; GFX9-NEXT: s_bfe_u32 s3, s3, 0x80010 ; GFX9-NEXT: s_or_b32 s7, s7, s8 ; GFX9-NEXT: s_lshl_b32 s3, s3, 16 ; GFX9-NEXT: s_or_b32 s3, s7, s3 @@ -4659,48 +4599,47 @@ ; GFX9-NEXT: s_cselect_b32 s8, s3, s8 ; GFX9-NEXT: s_and_b32 s4, s4, 3 ; GFX9-NEXT: s_lshl_b32 s4, s4, 3 -; GFX9-NEXT: s_lshl_b32 s9, s11, s4 +; GFX9-NEXT: s_lshl_b32 s9, 0xff, s4 ; GFX9-NEXT: s_andn2_b32 s8, s8, s9 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 -; GFX9-NEXT: v_lshl_or_b32 v4, v0, s4, v1 +; GFX9-NEXT: v_lshl_or_b32 v5, v0, s4, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s7, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s7, 1 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s7, 2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s7, 3 -; GFX9-NEXT: s_mov_b32 s6, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v8, s5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_and_or_b32 v8, v0, s11, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v9, s5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: 
v_and_or_b32 v9, v0, v4, v9 ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v1 -; GFX9-NEXT: v_or3_b32 v0, v8, v0, v4 -; GFX9-NEXT: v_lshlrev_b32_sdwa v4, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_and_or_b32 v4, v1, s11, v4 -; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v5 -; GFX9-NEXT: v_or3_b32 v1, v4, v1, v5 -; GFX9-NEXT: v_mov_b32_e32 v4, 8 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 24, v2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_mov_b32_e32 v8, 16 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v3 -; GFX9-NEXT: v_and_or_b32 v5, v2, s11, v5 -; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 24, v1 +; GFX9-NEXT: v_or3_b32 v0, v9, v0, v5 +; GFX9-NEXT: v_lshlrev_b32_sdwa v5, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_and_or_b32 v5, v1, v4, v5 +; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_e32 v6, 24, v6 -; GFX9-NEXT: v_lshlrev_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_or3_b32 v2, v5, v2, v6 -; GFX9-NEXT: v_and_or_b32 v6, v3, s11, v4 -; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_or3_b32 v1, v5, v1, v6 +; GFX9-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v6, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_mov_b32_e32 v9, 16 +; GFX9-NEXT: 
v_lshrrev_b32_e32 v8, 24, v3 +; GFX9-NEXT: v_and_or_b32 v6, v2, v4, v6 +; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_or3_b32 v2, v6, v2, v7 +; GFX9-NEXT: v_and_or_b32 v6, v3, v4, v5 +; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 24, v8 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: v_or3_b32 v3, v6, v3, v7 @@ -4710,47 +4649,44 @@ ; GFX8-LABEL: insertelement_s_v16i8_v_s: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GFX8-NEXT: s_mov_b32 s11, 0x80008 -; GFX8-NEXT: s_movk_i32 s9, 0xff ; GFX8-NEXT: v_mov_b32_e32 v8, 8 ; GFX8-NEXT: v_mov_b32_e32 v10, 16 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_u32 s12, s0, s11 -; GFX8-NEXT: s_and_b32 s10, s0, s9 -; GFX8-NEXT: s_lshl_b32 s12, s12, 8 -; GFX8-NEXT: s_or_b32 s10, s10, s12 -; GFX8-NEXT: s_mov_b32 s12, 0x80010 +; GFX8-NEXT: s_bfe_u32 s10, s0, 0x80008 ; GFX8-NEXT: s_lshr_b32 s5, s0, 24 -; GFX8-NEXT: s_bfe_u32 s0, s0, s12 +; GFX8-NEXT: s_and_b32 s9, s0, 0xff +; GFX8-NEXT: s_lshl_b32 s10, s10, 8 +; GFX8-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX8-NEXT: s_or_b32 s9, s9, s10 ; GFX8-NEXT: s_lshl_b32 s0, s0, 16 -; GFX8-NEXT: s_or_b32 s0, s10, s0 +; GFX8-NEXT: s_or_b32 s0, s9, s0 ; GFX8-NEXT: s_lshl_b32 s5, s5, 24 -; GFX8-NEXT: s_bfe_u32 s10, s1, s11 +; GFX8-NEXT: s_bfe_u32 s9, s1, 0x80008 ; GFX8-NEXT: s_lshr_b32 s6, s1, 24 ; GFX8-NEXT: s_or_b32 s0, s0, s5 -; GFX8-NEXT: s_and_b32 s5, s1, s9 -; GFX8-NEXT: s_lshl_b32 s10, s10, 8 -; GFX8-NEXT: s_bfe_u32 s1, s1, s12 -; GFX8-NEXT: s_or_b32 s5, s5, s10 +; GFX8-NEXT: s_and_b32 s5, s1, 0xff +; GFX8-NEXT: s_lshl_b32 s9, s9, 8 +; GFX8-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX8-NEXT: s_or_b32 s5, s5, s9 ; 
GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s1, s5, s1 ; GFX8-NEXT: s_lshl_b32 s5, s6, 24 -; GFX8-NEXT: s_bfe_u32 s6, s2, s11 +; GFX8-NEXT: s_bfe_u32 s6, s2, 0x80008 ; GFX8-NEXT: s_lshr_b32 s7, s2, 24 ; GFX8-NEXT: s_or_b32 s1, s1, s5 -; GFX8-NEXT: s_and_b32 s5, s2, s9 +; GFX8-NEXT: s_and_b32 s5, s2, 0xff ; GFX8-NEXT: s_lshl_b32 s6, s6, 8 -; GFX8-NEXT: s_bfe_u32 s2, s2, s12 +; GFX8-NEXT: s_bfe_u32 s2, s2, 0x80010 ; GFX8-NEXT: s_or_b32 s5, s5, s6 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16 ; GFX8-NEXT: s_or_b32 s2, s5, s2 ; GFX8-NEXT: s_lshl_b32 s5, s7, 24 -; GFX8-NEXT: s_bfe_u32 s6, s3, s11 +; GFX8-NEXT: s_bfe_u32 s6, s3, 0x80008 ; GFX8-NEXT: s_lshr_b32 s8, s3, 24 ; GFX8-NEXT: s_or_b32 s2, s2, s5 -; GFX8-NEXT: s_and_b32 s5, s3, s9 +; GFX8-NEXT: s_and_b32 s5, s3, 0xff ; GFX8-NEXT: s_lshl_b32 s6, s6, 8 -; GFX8-NEXT: s_bfe_u32 s3, s3, s12 +; GFX8-NEXT: s_bfe_u32 s3, s3, 0x80010 ; GFX8-NEXT: s_or_b32 s5, s5, s6 ; GFX8-NEXT: s_lshl_b32 s3, s3, 16 ; GFX8-NEXT: s_or_b32 s3, s5, s3 @@ -4766,7 +4702,7 @@ ; GFX8-NEXT: s_and_b32 s4, s4, 3 ; GFX8-NEXT: s_lshl_b32 s4, s4, 3 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 -; GFX8-NEXT: s_lshl_b32 s4, s9, s4 +; GFX8-NEXT: s_lshl_b32 s4, 0xff, s4 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: s_andn2_b32 s4, s6, s4 ; GFX8-NEXT: v_or_b32_e32 v4, s4, v0 @@ -4820,46 +4756,44 @@ ; GFX7-LABEL: insertelement_s_v16i8_v_s: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GFX7-NEXT: s_mov_b32 s11, 0x80008 -; GFX7-NEXT: s_movk_i32 s9, 0xff -; GFX7-NEXT: v_and_b32_e32 v0, s9, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX7-NEXT: v_mov_b32_e32 v4, 0xff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_bfe_u32 s12, s0, s11 -; GFX7-NEXT: s_and_b32 s10, s0, s9 -; GFX7-NEXT: s_lshl_b32 s12, s12, 8 -; GFX7-NEXT: s_or_b32 s10, s10, s12 -; GFX7-NEXT: s_mov_b32 s12, 0x80010 +; GFX7-NEXT: s_bfe_u32 s10, s0, 0x80008 ; GFX7-NEXT: s_lshr_b32 s5, s0, 24 -; GFX7-NEXT: 
s_bfe_u32 s0, s0, s12 +; GFX7-NEXT: s_and_b32 s9, s0, 0xff +; GFX7-NEXT: s_lshl_b32 s10, s10, 8 +; GFX7-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX7-NEXT: s_or_b32 s9, s9, s10 ; GFX7-NEXT: s_lshl_b32 s0, s0, 16 -; GFX7-NEXT: s_or_b32 s0, s10, s0 +; GFX7-NEXT: s_or_b32 s0, s9, s0 ; GFX7-NEXT: s_lshl_b32 s5, s5, 24 -; GFX7-NEXT: s_bfe_u32 s10, s1, s11 +; GFX7-NEXT: s_bfe_u32 s9, s1, 0x80008 ; GFX7-NEXT: s_lshr_b32 s6, s1, 24 ; GFX7-NEXT: s_or_b32 s0, s0, s5 -; GFX7-NEXT: s_and_b32 s5, s1, s9 -; GFX7-NEXT: s_lshl_b32 s10, s10, 8 -; GFX7-NEXT: s_bfe_u32 s1, s1, s12 -; GFX7-NEXT: s_or_b32 s5, s5, s10 +; GFX7-NEXT: s_and_b32 s5, s1, 0xff +; GFX7-NEXT: s_lshl_b32 s9, s9, 8 +; GFX7-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX7-NEXT: s_or_b32 s5, s5, s9 ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 ; GFX7-NEXT: s_or_b32 s1, s5, s1 ; GFX7-NEXT: s_lshl_b32 s5, s6, 24 -; GFX7-NEXT: s_bfe_u32 s6, s2, s11 +; GFX7-NEXT: s_bfe_u32 s6, s2, 0x80008 ; GFX7-NEXT: s_lshr_b32 s7, s2, 24 ; GFX7-NEXT: s_or_b32 s1, s1, s5 -; GFX7-NEXT: s_and_b32 s5, s2, s9 +; GFX7-NEXT: s_and_b32 s5, s2, 0xff ; GFX7-NEXT: s_lshl_b32 s6, s6, 8 -; GFX7-NEXT: s_bfe_u32 s2, s2, s12 +; GFX7-NEXT: s_bfe_u32 s2, s2, 0x80010 ; GFX7-NEXT: s_or_b32 s5, s5, s6 ; GFX7-NEXT: s_lshl_b32 s2, s2, 16 ; GFX7-NEXT: s_or_b32 s2, s5, s2 ; GFX7-NEXT: s_lshl_b32 s5, s7, 24 -; GFX7-NEXT: s_bfe_u32 s6, s3, s11 +; GFX7-NEXT: s_bfe_u32 s6, s3, 0x80008 ; GFX7-NEXT: s_lshr_b32 s8, s3, 24 ; GFX7-NEXT: s_or_b32 s2, s2, s5 -; GFX7-NEXT: s_and_b32 s5, s3, s9 +; GFX7-NEXT: s_and_b32 s5, s3, 0xff ; GFX7-NEXT: s_lshl_b32 s6, s6, 8 -; GFX7-NEXT: s_bfe_u32 s3, s3, s12 +; GFX7-NEXT: s_bfe_u32 s3, s3, 0x80010 ; GFX7-NEXT: s_or_b32 s5, s5, s6 ; GFX7-NEXT: s_lshl_b32 s3, s3, 16 ; GFX7-NEXT: s_or_b32 s3, s5, s3 @@ -4875,60 +4809,60 @@ ; GFX7-NEXT: s_and_b32 s4, s4, 3 ; GFX7-NEXT: s_lshl_b32 s4, s4, 3 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX7-NEXT: s_lshl_b32 s4, s9, s4 +; GFX7-NEXT: s_lshl_b32 s4, 0xff, s4 ; GFX7-NEXT: s_andn2_b32 s4, s6, s4 -; GFX7-NEXT: 
v_or_b32_e32 v4, s4, v0 +; GFX7-NEXT: v_or_b32_e32 v5, s4, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s5, 0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s5, 1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s5, 2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s5, 3 -; GFX7-NEXT: v_bfe_u32 v9, v0, 8, 8 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX7-NEXT: v_and_b32_e32 v8, s9, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX7-NEXT: v_bfe_u32 v10, v0, 8, 8 +; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v0 +; GFX7-NEXT: v_and_b32_e32 v9, v0, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v10, 8, v10 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_or_b32_e32 v8, v8, v9 +; GFX7-NEXT: v_or_b32_e32 v9, v9, v10 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v8, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX7-NEXT: v_bfe_u32 v8, v1, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v1 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX7-NEXT: v_and_b32_e32 v4, s9, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; GFX7-NEXT: v_or_b32_e32 v0, v9, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX7-NEXT: v_bfe_u32 v9, v1, 8, 8 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX7-NEXT: v_and_b32_e32 v5, v1, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX7-NEXT: v_or_b32_e32 v4, v4, v8 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v9 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_or_b32_e32 v1, v4, v1 -; GFX7-NEXT: 
v_lshlrev_b32_e32 v4, 24, v5 -; GFX7-NEXT: v_bfe_u32 v5, v2, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v2 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX7-NEXT: v_and_b32_e32 v4, s9, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX7-NEXT: v_or_b32_e32 v1, v5, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 24, v6 +; GFX7-NEXT: v_bfe_u32 v6, v2, 8, 8 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v2 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v5 +; GFX7-NEXT: v_and_b32_e32 v5, v2, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 ; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 8 -; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v6 +; GFX7-NEXT: v_or_b32_e32 v2, v5, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 24, v7 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v5 ; GFX7-NEXT: v_bfe_u32 v5, v3, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v3 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 -; GFX7-NEXT: v_and_b32_e32 v4, s9, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 24, v3 +; GFX7-NEXT: v_and_b32_e32 v4, v3, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 8 ; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_or_b32_e32 v3, v4, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v8 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX7-NEXT: s_mov_b32 s2, -1 @@ -4939,81 +4873,78 @@ ; GFX10-LABEL: insertelement_s_v16i8_v_s: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GFX10-NEXT: s_mov_b32 s6, 0x80008 -; GFX10-NEXT: s_movk_i32 s5, 0xff -; GFX10-NEXT: s_mov_b32 s7, 0x80010 -; GFX10-NEXT: v_and_b32_e32 v0, s5, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX10-NEXT: v_mov_b32_e32 v10, 8 ; GFX10-NEXT: v_mov_b32_e32 v12, 16 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_bfe_u32 s13, s0, s6 -; GFX10-NEXT: s_bfe_u32 s15, s1, s6 -; GFX10-NEXT: 
s_lshr_b32 s8, s0, 24 -; GFX10-NEXT: s_lshr_b32 s9, s1, 24 -; GFX10-NEXT: s_and_b32 s12, s0, s5 -; GFX10-NEXT: s_bfe_u32 s0, s0, s7 -; GFX10-NEXT: s_and_b32 s14, s1, s5 -; GFX10-NEXT: s_bfe_u32 s1, s1, s7 -; GFX10-NEXT: s_lshl_b32 s13, s13, 8 -; GFX10-NEXT: s_lshl_b32 s15, s15, 8 -; GFX10-NEXT: s_lshl_b32 s0, s0, 16 +; GFX10-NEXT: s_bfe_u32 s10, s0, 0x80008 +; GFX10-NEXT: s_bfe_u32 s12, s1, 0x80008 +; GFX10-NEXT: s_lshr_b32 s6, s1, 24 +; GFX10-NEXT: s_and_b32 s9, s0, 0xff +; GFX10-NEXT: s_and_b32 s11, s1, 0xff +; GFX10-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX10-NEXT: s_lshl_b32 s10, s10, 8 +; GFX10-NEXT: s_lshl_b32 s12, s12, 8 +; GFX10-NEXT: s_lshr_b32 s5, s0, 24 +; GFX10-NEXT: s_bfe_u32 s0, s0, 0x80010 ; GFX10-NEXT: s_lshl_b32 s1, s1, 16 -; GFX10-NEXT: s_or_b32 s12, s12, s13 -; GFX10-NEXT: s_or_b32 s13, s14, s15 -; GFX10-NEXT: s_bfe_u32 s17, s2, s6 -; GFX10-NEXT: s_lshl_b32 s8, s8, 24 -; GFX10-NEXT: s_lshl_b32 s9, s9, 24 -; GFX10-NEXT: s_or_b32 s0, s12, s0 -; GFX10-NEXT: s_or_b32 s1, s13, s1 -; GFX10-NEXT: s_bfe_u32 s6, s3, s6 -; GFX10-NEXT: s_lshr_b32 s10, s2, 24 -; GFX10-NEXT: s_lshr_b32 s11, s3, 24 -; GFX10-NEXT: s_and_b32 s16, s2, s5 -; GFX10-NEXT: s_or_b32 s0, s0, s8 -; GFX10-NEXT: s_lshl_b32 s8, s17, 8 -; GFX10-NEXT: s_bfe_u32 s2, s2, s7 -; GFX10-NEXT: s_or_b32 s1, s1, s9 -; GFX10-NEXT: s_and_b32 s9, s3, s5 -; GFX10-NEXT: s_lshl_b32 s6, s6, 8 -; GFX10-NEXT: s_bfe_u32 s3, s3, s7 -; GFX10-NEXT: s_or_b32 s8, s16, s8 +; GFX10-NEXT: s_or_b32 s9, s9, s10 +; GFX10-NEXT: s_or_b32 s10, s11, s12 +; GFX10-NEXT: s_bfe_u32 s14, s2, 0x80008 +; GFX10-NEXT: s_lshl_b32 s0, s0, 16 +; GFX10-NEXT: s_lshl_b32 s6, s6, 24 +; GFX10-NEXT: s_or_b32 s1, s10, s1 +; GFX10-NEXT: s_lshr_b32 s7, s2, 24 +; GFX10-NEXT: s_and_b32 s13, s2, 0xff +; GFX10-NEXT: s_bfe_u32 s2, s2, 0x80010 +; GFX10-NEXT: s_lshl_b32 s5, s5, 24 +; GFX10-NEXT: s_lshl_b32 s14, s14, 8 +; GFX10-NEXT: s_or_b32 s0, s9, s0 +; GFX10-NEXT: s_or_b32 s1, s1, s6 +; GFX10-NEXT: s_bfe_u32 s6, s3, 0x80008 +; GFX10-NEXT: s_lshr_b32 
s8, s3, 24 +; GFX10-NEXT: s_or_b32 s11, s13, s14 +; GFX10-NEXT: s_or_b32 s0, s0, s5 ; GFX10-NEXT: s_lshl_b32 s2, s2, 16 -; GFX10-NEXT: s_or_b32 s6, s9, s6 +; GFX10-NEXT: s_lshl_b32 s5, s7, 24 +; GFX10-NEXT: s_and_b32 s7, s3, 0xff +; GFX10-NEXT: s_lshl_b32 s6, s6, 8 +; GFX10-NEXT: s_bfe_u32 s3, s3, 0x80010 +; GFX10-NEXT: s_or_b32 s2, s11, s2 +; GFX10-NEXT: s_or_b32 s6, s7, s6 ; GFX10-NEXT: s_lshl_b32 s3, s3, 16 -; GFX10-NEXT: s_or_b32 s2, s8, s2 -; GFX10-NEXT: s_lshl_b32 s8, s10, 24 +; GFX10-NEXT: s_or_b32 s2, s2, s5 ; GFX10-NEXT: s_or_b32 s3, s6, s3 -; GFX10-NEXT: s_lshl_b32 s6, s11, 24 -; GFX10-NEXT: s_lshr_b32 s7, s4, 2 -; GFX10-NEXT: s_or_b32 s2, s2, s8 -; GFX10-NEXT: s_or_b32 s3, s3, s6 -; GFX10-NEXT: s_cmp_eq_u32 s7, 1 -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s7, 0 -; GFX10-NEXT: s_cselect_b32 s6, s1, s0 -; GFX10-NEXT: s_cmp_eq_u32 s7, 2 -; GFX10-NEXT: s_cselect_b32 s6, s2, s6 -; GFX10-NEXT: s_cmp_eq_u32 s7, 3 -; GFX10-NEXT: s_cselect_b32 s6, s3, s6 +; GFX10-NEXT: s_lshl_b32 s5, s8, 24 +; GFX10-NEXT: s_lshr_b32 s6, s4, 2 +; GFX10-NEXT: s_or_b32 s3, s3, s5 +; GFX10-NEXT: s_cmp_eq_u32 s6, 1 +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s6, 0 +; GFX10-NEXT: s_cselect_b32 s5, s1, s0 +; GFX10-NEXT: s_cmp_eq_u32 s6, 2 +; GFX10-NEXT: s_cselect_b32 s5, s2, s5 +; GFX10-NEXT: s_cmp_eq_u32 s6, 3 +; GFX10-NEXT: s_cselect_b32 s5, s3, s5 ; GFX10-NEXT: s_and_b32 s4, s4, 3 ; GFX10-NEXT: s_lshl_b32 s4, s4, 3 -; GFX10-NEXT: s_lshl_b32 s8, s5, s4 -; GFX10-NEXT: s_andn2_b32 s6, s6, s8 -; GFX10-NEXT: v_lshl_or_b32 v4, v0, s4, s6 +; GFX10-NEXT: s_lshl_b32 s7, 0xff, s4 +; GFX10-NEXT: s_andn2_b32 s5, s5, s7 +; GFX10-NEXT: v_lshl_or_b32 v4, v0, s4, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-NEXT: s_mov_b32 s0, 8 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s7, 1 +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s6, 1 ; GFX10-NEXT: s_mov_b32 s1, 16 ; 
GFX10-NEXT: v_lshlrev_b32_sdwa v6, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s7, 2 -; GFX10-NEXT: v_and_or_b32 v6, v0, s5, v6 +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s6, 2 +; GFX10-NEXT: v_and_or_b32 v6, 0xff, v0, v6 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s7, 3 +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s6, 3 ; GFX10-NEXT: v_lshlrev_b32_sdwa v9, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v5 ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v2 @@ -5021,15 +4952,15 @@ ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v0 ; GFX10-NEXT: v_lshlrev_b32_sdwa v11, v10, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v9, v1, s5, v9 +; GFX10-NEXT: v_and_or_b32 v9, 0xff, v1, v9 ; GFX10-NEXT: v_lshrrev_b32_e32 v8, 24, v3 ; GFX10-NEXT: v_lshlrev_b32_sdwa v10, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v11, v2, s5, v11 +; GFX10-NEXT: v_and_or_b32 v11, 0xff, v2, v11 ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v12, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; GFX10-NEXT: v_and_or_b32 v10, v3, s5, v10 +; GFX10-NEXT: v_and_or_b32 v10, 0xff, v3, v10 ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v12, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8 ; GFX10-NEXT: v_or3_b32 v0, v6, v0, v4 @@ -5050,60 +4981,58 @@ ; GFX9-LABEL: 
insertelement_s_v16i8_s_v: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GFX9-NEXT: s_mov_b32 s13, 0x80008 -; GFX9-NEXT: s_movk_i32 s12, 0xff ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 2, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GFX9-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX9-NEXT: s_movk_i32 s12, 0xff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_u32 s14, s0, s13 -; GFX9-NEXT: s_and_b32 s8, s0, s12 -; GFX9-NEXT: s_lshl_b32 s14, s14, 8 -; GFX9-NEXT: s_or_b32 s8, s8, s14 -; GFX9-NEXT: s_mov_b32 s14, 0x80010 +; GFX9-NEXT: s_bfe_u32 s13, s0, 0x80008 ; GFX9-NEXT: s_lshr_b32 s5, s0, 24 -; GFX9-NEXT: s_bfe_u32 s0, s0, s14 +; GFX9-NEXT: s_and_b32 s8, s0, 0xff +; GFX9-NEXT: s_lshl_b32 s13, s13, 8 +; GFX9-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX9-NEXT: s_or_b32 s8, s8, s13 ; GFX9-NEXT: s_lshl_b32 s0, s0, 16 ; GFX9-NEXT: s_or_b32 s0, s8, s0 ; GFX9-NEXT: s_lshl_b32 s5, s5, 24 ; GFX9-NEXT: s_or_b32 s8, s0, s5 -; GFX9-NEXT: s_bfe_u32 s5, s1, s13 +; GFX9-NEXT: s_bfe_u32 s5, s1, 0x80008 ; GFX9-NEXT: s_lshr_b32 s9, s1, 24 -; GFX9-NEXT: s_and_b32 s0, s1, s12 +; GFX9-NEXT: s_and_b32 s0, s1, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s5, 8 -; GFX9-NEXT: s_bfe_u32 s1, s1, s14 +; GFX9-NEXT: s_bfe_u32 s1, s1, 0x80010 ; GFX9-NEXT: s_or_b32 s0, s0, s5 ; GFX9-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: s_lshl_b32 s1, s9, 24 ; GFX9-NEXT: s_or_b32 s9, s0, s1 -; GFX9-NEXT: s_bfe_u32 s1, s2, s13 -; GFX9-NEXT: s_and_b32 s0, s2, s12 +; GFX9-NEXT: s_bfe_u32 s1, s2, 0x80008 +; GFX9-NEXT: s_and_b32 s0, s2, 0xff ; GFX9-NEXT: s_lshl_b32 s1, s1, 8 ; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: s_bfe_u32 s1, s2, s14 +; GFX9-NEXT: s_bfe_u32 s1, s2, 0x80010 ; GFX9-NEXT: s_lshr_b32 s10, s2, 24 ; GFX9-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: s_lshl_b32 s1, s10, 24 ; GFX9-NEXT: s_or_b32 s10, s0, s1 -; GFX9-NEXT: s_bfe_u32 s1, s3, s13 -; GFX9-NEXT: s_and_b32 s0, s3, s12 +; GFX9-NEXT: s_bfe_u32 s1, s3, 0x80008 +; GFX9-NEXT: s_and_b32 
s0, s3, 0xff ; GFX9-NEXT: s_lshl_b32 s1, s1, 8 ; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: s_bfe_u32 s1, s3, s14 +; GFX9-NEXT: s_bfe_u32 s1, s3, 0x80010 ; GFX9-NEXT: s_lshr_b32 s11, s3, 24 ; GFX9-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: s_lshl_b32 s1, s11, 24 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 ; GFX9-NEXT: v_mov_b32_e32 v2, s9 -; GFX9-NEXT: v_and_b32_e32 v0, 3, v0 ; GFX9-NEXT: s_or_b32 s11, s0, s1 ; GFX9-NEXT: v_mov_b32_e32 v3, s10 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX9-NEXT: s_and_b32 s4, s4, s12 +; GFX9-NEXT: s_and_b32 s4, s4, 0xff ; GFX9-NEXT: v_mov_b32_e32 v5, s11 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4 @@ -5111,43 +5040,44 @@ ; GFX9-NEXT: v_lshlrev_b32_e64 v0, v0, s12 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3] ; GFX9-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX9-NEXT: v_and_or_b32 v5, v1, v0, v2 +; GFX9-NEXT: v_and_or_b32 v6, v1, v0, v2 ; GFX9-NEXT: v_mov_b32_e32 v0, s8 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 ; GFX9-NEXT: s_mov_b32 s6, 8 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5] ; GFX9-NEXT: s_mov_b32 s7, 16 +; GFX9-NEXT: v_mov_b32_e32 v5, 0xff ; GFX9-NEXT: v_mov_b32_e32 v1, s9 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v8, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v9, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_mov_b32_e32 v2, s10 ; GFX9-NEXT: v_mov_b32_e32 v3, s11 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX9-NEXT: v_and_or_b32 v8, v0, s12, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; GFX9-NEXT: v_and_or_b32 v9, v0, v5, v9 ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: 
v_lshlrev_b32_e32 v4, 24, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[2:3] -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v1 -; GFX9-NEXT: v_or3_b32 v0, v8, v0, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[2:3] +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 24, v1 +; GFX9-NEXT: v_or3_b32 v0, v9, v0, v4 ; GFX9-NEXT: v_lshlrev_b32_sdwa v4, s6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_and_or_b32 v4, v1, s12, v4 +; GFX9-NEXT: v_and_or_b32 v4, v1, v5, v4 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v5 -; GFX9-NEXT: v_or3_b32 v1, v4, v1, v5 -; GFX9-NEXT: v_mov_b32_e32 v4, 8 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 24, v2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_mov_b32_e32 v8, 16 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v3 -; GFX9-NEXT: v_and_or_b32 v5, v2, s12, v5 -; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_e32 v6, 24, v6 -; GFX9-NEXT: v_lshlrev_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_or3_b32 v2, v5, v2, v6 -; GFX9-NEXT: v_and_or_b32 v6, v3, s12, v4 -; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_or3_b32 v1, v4, v1, v6 +; GFX9-NEXT: v_mov_b32_e32 v4, 8 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v6, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_mov_b32_e32 v9, 16 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 24, v3 +; GFX9-NEXT: v_and_or_b32 v6, v2, v5, v6 +; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX9-NEXT: v_lshlrev_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_or3_b32 v2, v6, v2, v7 +; GFX9-NEXT: v_and_or_b32 v6, v3, v5, v4 +; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 24, v8 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: v_or3_b32 v3, v6, v3, v7 @@ -5157,60 +5087,58 @@ ; GFX8-LABEL: insertelement_s_v16i8_s_v: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GFX8-NEXT: s_mov_b32 s13, 0x80008 -; GFX8-NEXT: s_movk_i32 s12, 0xff -; GFX8-NEXT: s_mov_b32 s14, 0x80010 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 2, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GFX8-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX8-NEXT: s_movk_i32 s12, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_u32 s9, s0, s13 +; GFX8-NEXT: s_bfe_u32 s9, s0, 0x80008 ; GFX8-NEXT: s_lshr_b32 s5, s0, 24 -; GFX8-NEXT: s_and_b32 s8, s0, s12 +; GFX8-NEXT: s_and_b32 s8, s0, 0xff ; GFX8-NEXT: s_lshl_b32 s9, s9, 8 -; GFX8-NEXT: s_bfe_u32 s0, s0, s14 +; GFX8-NEXT: s_bfe_u32 s0, s0, 0x80010 ; GFX8-NEXT: s_or_b32 s8, s8, s9 ; GFX8-NEXT: s_lshl_b32 s0, s0, 16 ; GFX8-NEXT: s_or_b32 s0, s8, s0 ; GFX8-NEXT: s_lshl_b32 s5, s5, 24 ; GFX8-NEXT: s_or_b32 s8, s0, s5 -; GFX8-NEXT: s_bfe_u32 s5, s1, s13 +; GFX8-NEXT: s_bfe_u32 s5, s1, 0x80008 ; GFX8-NEXT: s_lshr_b32 s6, s1, 24 -; GFX8-NEXT: s_and_b32 s0, s1, s12 +; GFX8-NEXT: s_and_b32 s0, s1, 0xff ; GFX8-NEXT: s_lshl_b32 s5, s5, 8 -; GFX8-NEXT: s_bfe_u32 s1, s1, s14 +; GFX8-NEXT: s_bfe_u32 s1, s1, 0x80010 ; GFX8-NEXT: s_or_b32 s0, s0, s5 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: s_lshl_b32 s1, s6, 24 ; GFX8-NEXT: s_or_b32 s9, s0, s1 -; GFX8-NEXT: s_bfe_u32 s1, s2, s13 -; GFX8-NEXT: s_and_b32 s0, s2, s12 +; GFX8-NEXT: s_bfe_u32 s1, s2, 0x80008 +; GFX8-NEXT: s_and_b32 
s0, s2, 0xff ; GFX8-NEXT: s_lshl_b32 s1, s1, 8 ; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_bfe_u32 s1, s2, s14 +; GFX8-NEXT: s_bfe_u32 s1, s2, 0x80010 ; GFX8-NEXT: s_lshr_b32 s7, s2, 24 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: s_lshl_b32 s1, s7, 24 ; GFX8-NEXT: s_or_b32 s10, s0, s1 -; GFX8-NEXT: s_bfe_u32 s1, s3, s13 -; GFX8-NEXT: s_and_b32 s0, s3, s12 +; GFX8-NEXT: s_bfe_u32 s1, s3, 0x80008 +; GFX8-NEXT: s_and_b32 s0, s3, 0xff ; GFX8-NEXT: s_lshl_b32 s1, s1, 8 ; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_bfe_u32 s1, s3, s14 +; GFX8-NEXT: s_bfe_u32 s1, s3, 0x80010 ; GFX8-NEXT: s_lshr_b32 s11, s3, 24 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: s_lshl_b32 s1, s11, 24 ; GFX8-NEXT: v_mov_b32_e32 v1, s8 ; GFX8-NEXT: v_mov_b32_e32 v2, s9 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 -; GFX8-NEXT: v_and_b32_e32 v0, 3, v0 ; GFX8-NEXT: s_or_b32 s11, s0, s1 ; GFX8-NEXT: v_mov_b32_e32 v3, s10 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX8-NEXT: s_and_b32 s4, s4, s12 +; GFX8-NEXT: s_and_b32 s4, s4, 0xff ; GFX8-NEXT: v_mov_b32_e32 v5, s11 ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4 @@ -5269,117 +5197,115 @@ ; GFX7-LABEL: insertelement_s_v16i8_s_v: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GFX7-NEXT: s_mov_b32 s13, 0x80008 -; GFX7-NEXT: s_movk_i32 s12, 0xff -; GFX7-NEXT: s_mov_b32 s14, 0x80010 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 2, v0 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GFX7-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_bfe_u32 s9, s0, s13 +; GFX7-NEXT: s_bfe_u32 s9, s0, 0x80008 ; GFX7-NEXT: s_lshr_b32 s5, s0, 24 -; GFX7-NEXT: s_and_b32 s8, s0, s12 +; GFX7-NEXT: s_and_b32 s8, s0, 0xff ; GFX7-NEXT: s_lshl_b32 s9, s9, 8 -; GFX7-NEXT: s_bfe_u32 s0, s0, 
s14 +; GFX7-NEXT: s_bfe_u32 s0, s0, 0x80010 ; GFX7-NEXT: s_or_b32 s8, s8, s9 ; GFX7-NEXT: s_lshl_b32 s0, s0, 16 ; GFX7-NEXT: s_or_b32 s0, s8, s0 ; GFX7-NEXT: s_lshl_b32 s5, s5, 24 ; GFX7-NEXT: s_or_b32 s8, s0, s5 -; GFX7-NEXT: s_bfe_u32 s5, s1, s13 +; GFX7-NEXT: s_bfe_u32 s5, s1, 0x80008 ; GFX7-NEXT: s_lshr_b32 s6, s1, 24 -; GFX7-NEXT: s_and_b32 s0, s1, s12 +; GFX7-NEXT: s_and_b32 s0, s1, 0xff ; GFX7-NEXT: s_lshl_b32 s5, s5, 8 -; GFX7-NEXT: s_bfe_u32 s1, s1, s14 +; GFX7-NEXT: s_bfe_u32 s1, s1, 0x80010 ; GFX7-NEXT: s_or_b32 s0, s0, s5 ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 ; GFX7-NEXT: s_or_b32 s0, s0, s1 ; GFX7-NEXT: s_lshl_b32 s1, s6, 24 ; GFX7-NEXT: s_or_b32 s9, s0, s1 -; GFX7-NEXT: s_bfe_u32 s1, s2, s13 -; GFX7-NEXT: s_and_b32 s0, s2, s12 +; GFX7-NEXT: s_bfe_u32 s1, s2, 0x80008 +; GFX7-NEXT: s_and_b32 s0, s2, 0xff ; GFX7-NEXT: s_lshl_b32 s1, s1, 8 ; GFX7-NEXT: s_or_b32 s0, s0, s1 -; GFX7-NEXT: s_bfe_u32 s1, s2, s14 +; GFX7-NEXT: s_bfe_u32 s1, s2, 0x80010 ; GFX7-NEXT: s_lshr_b32 s7, s2, 24 ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 ; GFX7-NEXT: s_or_b32 s0, s0, s1 ; GFX7-NEXT: s_lshl_b32 s1, s7, 24 ; GFX7-NEXT: s_or_b32 s10, s0, s1 -; GFX7-NEXT: s_bfe_u32 s1, s3, s13 -; GFX7-NEXT: s_and_b32 s0, s3, s12 +; GFX7-NEXT: s_bfe_u32 s1, s3, 0x80008 +; GFX7-NEXT: s_and_b32 s0, s3, 0xff ; GFX7-NEXT: s_lshl_b32 s1, s1, 8 ; GFX7-NEXT: s_or_b32 s0, s0, s1 -; GFX7-NEXT: s_bfe_u32 s1, s3, s14 +; GFX7-NEXT: s_bfe_u32 s1, s3, 0x80010 ; GFX7-NEXT: s_lshr_b32 s11, s3, 24 ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 ; GFX7-NEXT: s_or_b32 s0, s0, s1 ; GFX7-NEXT: s_lshl_b32 s1, s11, 24 ; GFX7-NEXT: v_mov_b32_e32 v1, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s9 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 -; GFX7-NEXT: v_and_b32_e32 v0, 3, v0 ; GFX7-NEXT: s_or_b32 s11, s0, s1 ; GFX7-NEXT: v_mov_b32_e32 v3, s10 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX7-NEXT: s_and_b32 s4, s4, s12 +; GFX7-NEXT: s_and_b32 s4, s4, 0xff ; 
GFX7-NEXT: v_mov_b32_e32 v5, s11 ; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] ; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v2, s4, v0 -; GFX7-NEXT: v_lshl_b32_e32 v0, s12, v0 +; GFX7-NEXT: v_lshl_b32_e32 v0, 0xff, v0 ; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3] ; GFX7-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX7-NEXT: v_and_b32_e32 v0, v1, v0 -; GFX7-NEXT: v_or_b32_e32 v5, v0, v2 +; GFX7-NEXT: v_or_b32_e32 v6, v0, v2 ; GFX7-NEXT: v_mov_b32_e32 v0, s8 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 -; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[4:5] -; GFX7-NEXT: v_bfe_u32 v9, v0, 8, 8 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v5, 0xff +; GFX7-NEXT: v_bfe_u32 v10, v0, 8, 8 ; GFX7-NEXT: v_mov_b32_e32 v1, s9 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX7-NEXT: v_and_b32_e32 v8, s12, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX7-NEXT: v_and_b32_e32 v9, v0, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v10, 8, v10 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX7-NEXT: v_or_b32_e32 v8, v8, v9 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; GFX7-NEXT: v_or_b32_e32 v9, v9, v10 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_mov_b32_e32 v2, s10 ; GFX7-NEXT: v_mov_b32_e32 v3, s11 -; GFX7-NEXT: v_or_b32_e32 v0, v8, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v9, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX7-NEXT: v_bfe_u32 v8, v1, 8, 8 -; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] -; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[2:3] -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v1 +; GFX7-NEXT: v_bfe_u32 v9, v1, 8, 8 +; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1] +; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[2:3] +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX7-NEXT: v_and_b32_e32 v4, s12, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; GFX7-NEXT: v_and_b32_e32 v4, v1, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; 
GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX7-NEXT: v_or_b32_e32 v4, v4, v8 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v9 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_or_b32_e32 v1, v4, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v5 -; GFX7-NEXT: v_bfe_u32 v5, v2, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v6 +; GFX7-NEXT: v_bfe_u32 v6, v2, 8, 8 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v2 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX7-NEXT: v_and_b32_e32 v4, s12, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX7-NEXT: v_and_b32_e32 v4, v2, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 ; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 8 -; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v6 -; GFX7-NEXT: v_bfe_u32 v5, v3, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v7 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 -; GFX7-NEXT: v_and_b32_e32 v4, s12, v3 +; GFX7-NEXT: v_and_b32_e32 v4, v3, v5 +; GFX7-NEXT: v_bfe_u32 v5, v3, 8, 8 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 24, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 8 ; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_or_b32_e32 v3, v4, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v8 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX7-NEXT: s_mov_b32 s2, -1 @@ -5390,54 +5316,51 @@ ; GFX10-LABEL: insertelement_s_v16i8_s_v: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GFX10-NEXT: s_mov_b32 s6, 0x80008 -; GFX10-NEXT: s_movk_i32 s5, 0xff -; GFX10-NEXT: s_mov_b32 s7, 0x80010 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 2, v0 ; GFX10-NEXT: v_and_b32_e32 v0, 3, v0 ; GFX10-NEXT: v_mov_b32_e32 v10, 8 ; GFX10-NEXT: v_mov_b32_e32 v12, 16 ; GFX10-NEXT: v_cmp_eq_u32_e32 
vcc_lo, 1, v4 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX10-NEXT: v_lshlrev_b32_e64 v2, v0, s5 +; GFX10-NEXT: v_lshlrev_b32_e64 v2, v0, 0xff ; GFX10-NEXT: v_xor_b32_e32 v2, -1, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_bfe_u32 s13, s0, s6 -; GFX10-NEXT: s_lshr_b32 s8, s0, 24 -; GFX10-NEXT: s_and_b32 s12, s0, s5 -; GFX10-NEXT: s_bfe_u32 s0, s0, s7 -; GFX10-NEXT: s_lshl_b32 s13, s13, 8 -; GFX10-NEXT: s_bfe_u32 s15, s1, s6 +; GFX10-NEXT: s_bfe_u32 s9, s0, 0x80008 +; GFX10-NEXT: s_bfe_u32 s12, s1, 0x80008 +; GFX10-NEXT: s_lshr_b32 s5, s0, 24 +; GFX10-NEXT: s_and_b32 s8, s0, 0xff +; GFX10-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX10-NEXT: s_lshl_b32 s9, s9, 8 +; GFX10-NEXT: s_lshr_b32 s6, s1, 24 +; GFX10-NEXT: s_and_b32 s10, s1, 0xff +; GFX10-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX10-NEXT: s_lshl_b32 s12, s12, 8 +; GFX10-NEXT: s_bfe_u32 s14, s2, 0x80008 ; GFX10-NEXT: s_lshl_b32 s0, s0, 16 -; GFX10-NEXT: s_or_b32 s12, s12, s13 -; GFX10-NEXT: s_lshr_b32 s9, s1, 24 -; GFX10-NEXT: s_and_b32 s14, s1, s5 -; GFX10-NEXT: s_bfe_u32 s1, s1, s7 -; GFX10-NEXT: s_lshl_b32 s15, s15, 8 -; GFX10-NEXT: s_bfe_u32 s17, s2, s6 -; GFX10-NEXT: s_lshl_b32 s8, s8, 24 -; GFX10-NEXT: s_or_b32 s0, s12, s0 +; GFX10-NEXT: s_or_b32 s8, s8, s9 ; GFX10-NEXT: s_lshl_b32 s1, s1, 16 -; GFX10-NEXT: s_or_b32 s13, s14, s15 -; GFX10-NEXT: s_lshr_b32 s10, s2, 24 -; GFX10-NEXT: s_and_b32 s16, s2, s5 -; GFX10-NEXT: s_or_b32 s8, s0, s8 -; GFX10-NEXT: s_lshl_b32 s0, s17, 8 -; GFX10-NEXT: s_bfe_u32 s2, s2, s7 -; GFX10-NEXT: s_lshl_b32 s9, s9, 24 -; GFX10-NEXT: s_or_b32 s1, s13, s1 -; GFX10-NEXT: s_or_b32 s0, s16, s0 -; GFX10-NEXT: s_lshl_b32 s2, s2, 16 -; GFX10-NEXT: s_or_b32 s9, s1, s9 -; GFX10-NEXT: s_or_b32 s0, s0, s2 -; GFX10-NEXT: s_lshl_b32 s1, s10, 24 -; GFX10-NEXT: s_bfe_u32 s2, s3, s6 +; GFX10-NEXT: s_or_b32 s9, s10, s12 +; GFX10-NEXT: s_lshr_b32 s7, s2, 24 +; GFX10-NEXT: s_and_b32 s13, s2, 0xff +; GFX10-NEXT: s_bfe_u32 s2, s2, 0x80010 +; GFX10-NEXT: s_lshl_b32 s5, s5, 24 +; GFX10-NEXT: 
s_lshl_b32 s14, s14, 8 +; GFX10-NEXT: s_or_b32 s0, s8, s0 +; GFX10-NEXT: s_lshl_b32 s6, s6, 24 +; GFX10-NEXT: s_or_b32 s1, s9, s1 +; GFX10-NEXT: s_or_b32 s10, s13, s14 +; GFX10-NEXT: s_or_b32 s8, s0, s5 +; GFX10-NEXT: s_lshl_b32 s0, s2, 16 +; GFX10-NEXT: s_or_b32 s9, s1, s6 +; GFX10-NEXT: s_or_b32 s0, s10, s0 +; GFX10-NEXT: s_lshl_b32 s1, s7, 24 +; GFX10-NEXT: s_bfe_u32 s2, s3, 0x80008 ; GFX10-NEXT: v_mov_b32_e32 v1, s9 -; GFX10-NEXT: s_and_b32 s6, s3, s5 +; GFX10-NEXT: s_and_b32 s5, s3, 0xff ; GFX10-NEXT: s_lshl_b32 s2, s2, 8 ; GFX10-NEXT: s_or_b32 s10, s0, s1 -; GFX10-NEXT: s_bfe_u32 s1, s3, s7 -; GFX10-NEXT: s_or_b32 s0, s6, s2 +; GFX10-NEXT: s_bfe_u32 s1, s3, 0x80010 +; GFX10-NEXT: s_or_b32 s0, s5, s2 ; GFX10-NEXT: s_lshl_b32 s1, s1, 16 ; GFX10-NEXT: v_cndmask_b32_e32 v1, s8, v1, vcc_lo ; GFX10-NEXT: s_or_b32 s1, s0, s1 @@ -5448,7 +5371,7 @@ ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s10, s0 ; GFX10-NEXT: s_or_b32 s11, s1, s2 ; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v4 -; GFX10-NEXT: s_and_b32 s2, s4, s5 +; GFX10-NEXT: s_and_b32 s2, s4, 0xff ; GFX10-NEXT: v_lshlrev_b32_e64 v0, v0, s2 ; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v4 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s11, s1 @@ -5470,16 +5393,16 @@ ; GFX10-NEXT: v_lshrrev_b32_e32 v8, 24, v3 ; GFX10-NEXT: v_lshlrev_b32_sdwa v11, v10, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v10, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_and_or_b32 v6, v0, s5, v6 +; GFX10-NEXT: v_and_or_b32 v6, 0xff, v0, v6 ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX10-NEXT: v_and_or_b32 v9, v1, s5, v9 +; GFX10-NEXT: v_and_or_b32 v9, 0xff, v1, v9 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v5 -; GFX10-NEXT: v_and_or_b32 v11, 
v2, s5, v11 +; GFX10-NEXT: v_and_or_b32 v11, 0xff, v2, v11 ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v12, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; GFX10-NEXT: v_and_or_b32 v10, v3, s5, v10 +; GFX10-NEXT: v_and_or_b32 v10, 0xff, v3, v10 ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v12, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8 ; GFX10-NEXT: v_or3_b32 v0, v6, v0, v4 @@ -5500,54 +5423,52 @@ ; GFX9-LABEL: insertelement_s_v16i8_v_v: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GFX9-NEXT: s_mov_b32 s12, 0x80008 -; GFX9-NEXT: s_movk_i32 s10, 0xff ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 2, v1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GFX9-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX9-NEXT: s_movk_i32 s10, 0xff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_u32 s13, s0, s12 -; GFX9-NEXT: s_and_b32 s11, s0, s10 -; GFX9-NEXT: s_lshl_b32 s13, s13, 8 -; GFX9-NEXT: s_or_b32 s11, s11, s13 -; GFX9-NEXT: s_mov_b32 s13, 0x80010 +; GFX9-NEXT: s_bfe_u32 s12, s0, 0x80008 ; GFX9-NEXT: s_lshr_b32 s4, s0, 24 -; GFX9-NEXT: s_bfe_u32 s0, s0, s13 +; GFX9-NEXT: s_and_b32 s11, s0, 0xff +; GFX9-NEXT: s_lshl_b32 s12, s12, 8 +; GFX9-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX9-NEXT: s_or_b32 s11, s11, s12 ; GFX9-NEXT: s_lshl_b32 s0, s0, 16 ; GFX9-NEXT: s_or_b32 s0, s11, s0 ; GFX9-NEXT: s_lshl_b32 s4, s4, 24 -; GFX9-NEXT: s_bfe_u32 s11, s1, s12 +; GFX9-NEXT: s_bfe_u32 s11, s1, 0x80008 ; GFX9-NEXT: s_lshr_b32 s5, s1, 24 ; GFX9-NEXT: s_or_b32 s4, s0, s4 -; GFX9-NEXT: s_and_b32 s0, s1, s10 +; GFX9-NEXT: s_and_b32 s0, s1, 0xff ; GFX9-NEXT: s_lshl_b32 s11, s11, 8 -; GFX9-NEXT: s_bfe_u32 s1, s1, s13 +; GFX9-NEXT: s_bfe_u32 s1, s1, 0x80010 ; GFX9-NEXT: s_or_b32 s0, s0, s11 ; GFX9-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: s_lshl_b32 s1, s5, 24 ; GFX9-NEXT: s_or_b32 s5, s0, s1 -; GFX9-NEXT: s_bfe_u32 s1, s2, s12 -; GFX9-NEXT: 
s_and_b32 s0, s2, s10 +; GFX9-NEXT: s_bfe_u32 s1, s2, 0x80008 +; GFX9-NEXT: s_and_b32 s0, s2, 0xff ; GFX9-NEXT: s_lshl_b32 s1, s1, 8 ; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: s_bfe_u32 s1, s2, s13 +; GFX9-NEXT: s_bfe_u32 s1, s2, 0x80010 ; GFX9-NEXT: s_lshr_b32 s6, s2, 24 ; GFX9-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: s_lshl_b32 s1, s6, 24 ; GFX9-NEXT: s_or_b32 s6, s0, s1 -; GFX9-NEXT: s_bfe_u32 s1, s3, s12 -; GFX9-NEXT: s_and_b32 s0, s3, s10 +; GFX9-NEXT: s_bfe_u32 s1, s3, 0x80008 +; GFX9-NEXT: s_and_b32 s0, s3, 0xff ; GFX9-NEXT: s_lshl_b32 s1, s1, 8 ; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: s_bfe_u32 s1, s3, s13 +; GFX9-NEXT: s_bfe_u32 s1, s3, 0x80010 ; GFX9-NEXT: s_lshr_b32 s7, s3, 24 ; GFX9-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: s_lshl_b32 s1, s7, 24 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: v_and_b32_e32 v1, 3, v1 ; GFX9-NEXT: s_or_b32 s7, s0, s1 ; GFX9-NEXT: v_mov_b32_e32 v5, s6 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc @@ -5560,43 +5481,44 @@ ; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s10 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3] ; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1 -; GFX9-NEXT: v_and_or_b32 v5, v2, v1, v0 +; GFX9-NEXT: v_and_or_b32 v6, v2, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 ; GFX9-NEXT: s_mov_b32 s8, 8 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5] ; GFX9-NEXT: s_mov_b32 s9, 16 +; GFX9-NEXT: v_mov_b32_e32 v5, 0xff ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v8, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX9-NEXT: v_and_or_b32 v8, v0, s10, v8 +; GFX9-NEXT: v_lshlrev_b32_sdwa v9, s8, v0 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; GFX9-NEXT: v_and_or_b32 v9, v0, v5, v9 ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[2:3] -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v1 -; GFX9-NEXT: v_or3_b32 v0, v8, v0, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[2:3] +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 24, v1 +; GFX9-NEXT: v_or3_b32 v0, v9, v0, v4 ; GFX9-NEXT: v_lshlrev_b32_sdwa v4, s8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_and_or_b32 v4, v1, s10, v4 +; GFX9-NEXT: v_and_or_b32 v4, v1, v5, v4 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v5 -; GFX9-NEXT: v_or3_b32 v1, v4, v1, v5 -; GFX9-NEXT: v_mov_b32_e32 v4, 8 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 24, v2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_mov_b32_e32 v8, 16 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v3 -; GFX9-NEXT: v_and_or_b32 v5, v2, s10, v5 -; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_e32 v6, 24, v6 -; GFX9-NEXT: v_lshlrev_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_or3_b32 v2, v5, v2, v6 -; GFX9-NEXT: v_and_or_b32 v6, v3, s10, v4 -; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_or3_b32 v1, v4, v1, v6 +; GFX9-NEXT: v_mov_b32_e32 v4, 8 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v6, v4, v2 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_mov_b32_e32 v9, 16 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 24, v3 +; GFX9-NEXT: v_and_or_b32 v6, v2, v5, v6 +; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX9-NEXT: v_lshlrev_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_or3_b32 v2, v6, v2, v7 +; GFX9-NEXT: v_and_or_b32 v6, v3, v5, v4 +; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 24, v8 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: v_or3_b32 v3, v6, v3, v7 @@ -5606,54 +5528,52 @@ ; GFX8-LABEL: insertelement_s_v16i8_v_v: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GFX8-NEXT: s_mov_b32 s10, 0x80008 -; GFX8-NEXT: s_movk_i32 s8, 0xff ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 2, v1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GFX8-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX8-NEXT: s_movk_i32 s8, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_u32 s11, s0, s10 -; GFX8-NEXT: s_and_b32 s9, s0, s8 -; GFX8-NEXT: s_lshl_b32 s11, s11, 8 -; GFX8-NEXT: s_or_b32 s9, s9, s11 -; GFX8-NEXT: s_mov_b32 s11, 0x80010 +; GFX8-NEXT: s_bfe_u32 s10, s0, 0x80008 ; GFX8-NEXT: s_lshr_b32 s4, s0, 24 -; GFX8-NEXT: s_bfe_u32 s0, s0, s11 +; GFX8-NEXT: s_and_b32 s9, s0, 0xff +; GFX8-NEXT: s_lshl_b32 s10, s10, 8 +; GFX8-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX8-NEXT: s_or_b32 s9, s9, s10 ; GFX8-NEXT: s_lshl_b32 s0, s0, 16 ; GFX8-NEXT: s_or_b32 s0, s9, s0 ; GFX8-NEXT: s_lshl_b32 s4, s4, 24 -; GFX8-NEXT: s_bfe_u32 s9, s1, s10 +; GFX8-NEXT: s_bfe_u32 s9, s1, 0x80008 ; GFX8-NEXT: s_lshr_b32 s5, s1, 24 ; GFX8-NEXT: s_or_b32 s4, s0, s4 -; GFX8-NEXT: s_and_b32 s0, s1, s8 +; GFX8-NEXT: s_and_b32 s0, s1, 0xff ; GFX8-NEXT: s_lshl_b32 s9, s9, 8 -; GFX8-NEXT: s_bfe_u32 s1, s1, s11 +; 
GFX8-NEXT: s_bfe_u32 s1, s1, 0x80010 ; GFX8-NEXT: s_or_b32 s0, s0, s9 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: s_lshl_b32 s1, s5, 24 ; GFX8-NEXT: s_or_b32 s5, s0, s1 -; GFX8-NEXT: s_bfe_u32 s1, s2, s10 -; GFX8-NEXT: s_and_b32 s0, s2, s8 +; GFX8-NEXT: s_bfe_u32 s1, s2, 0x80008 +; GFX8-NEXT: s_and_b32 s0, s2, 0xff ; GFX8-NEXT: s_lshl_b32 s1, s1, 8 ; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_bfe_u32 s1, s2, s11 +; GFX8-NEXT: s_bfe_u32 s1, s2, 0x80010 ; GFX8-NEXT: s_lshr_b32 s6, s2, 24 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: s_lshl_b32 s1, s6, 24 ; GFX8-NEXT: s_or_b32 s6, s0, s1 -; GFX8-NEXT: s_bfe_u32 s1, s3, s10 -; GFX8-NEXT: s_and_b32 s0, s3, s8 +; GFX8-NEXT: s_bfe_u32 s1, s3, 0x80008 +; GFX8-NEXT: s_and_b32 s0, s3, 0xff ; GFX8-NEXT: s_lshl_b32 s1, s1, 8 ; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_bfe_u32 s1, s3, s11 +; GFX8-NEXT: s_bfe_u32 s1, s3, 0x80010 ; GFX8-NEXT: s_lshr_b32 s7, s3, 24 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: s_lshl_b32 s1, s7, 24 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: v_and_b32_e32 v1, 3, v1 ; GFX8-NEXT: s_or_b32 s7, s0, s1 ; GFX8-NEXT: v_mov_b32_e32 v5, s6 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc @@ -5717,54 +5637,52 @@ ; GFX7-LABEL: insertelement_s_v16i8_v_v: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GFX7-NEXT: s_mov_b32 s10, 0x80008 -; GFX7-NEXT: s_movk_i32 s8, 0xff ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 2, v1 +; GFX7-NEXT: s_movk_i32 s8, 0xff ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GFX7-NEXT: v_and_b32_e32 v1, 3, v1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_bfe_u32 s11, s0, s10 -; GFX7-NEXT: s_and_b32 s9, s0, s8 -; GFX7-NEXT: s_lshl_b32 s11, s11, 8 -; GFX7-NEXT: s_or_b32 s9, s9, s11 -; GFX7-NEXT: s_mov_b32 s11, 0x80010 +; GFX7-NEXT: s_bfe_u32 s10, s0, 0x80008 ; GFX7-NEXT: s_lshr_b32 s4, s0, 24 -; GFX7-NEXT: s_bfe_u32 s0, s0, s11 
+; GFX7-NEXT: s_and_b32 s9, s0, 0xff +; GFX7-NEXT: s_lshl_b32 s10, s10, 8 +; GFX7-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX7-NEXT: s_or_b32 s9, s9, s10 ; GFX7-NEXT: s_lshl_b32 s0, s0, 16 ; GFX7-NEXT: s_or_b32 s0, s9, s0 ; GFX7-NEXT: s_lshl_b32 s4, s4, 24 -; GFX7-NEXT: s_bfe_u32 s9, s1, s10 +; GFX7-NEXT: s_bfe_u32 s9, s1, 0x80008 ; GFX7-NEXT: s_lshr_b32 s5, s1, 24 ; GFX7-NEXT: s_or_b32 s4, s0, s4 -; GFX7-NEXT: s_and_b32 s0, s1, s8 +; GFX7-NEXT: s_and_b32 s0, s1, 0xff ; GFX7-NEXT: s_lshl_b32 s9, s9, 8 -; GFX7-NEXT: s_bfe_u32 s1, s1, s11 +; GFX7-NEXT: s_bfe_u32 s1, s1, 0x80010 ; GFX7-NEXT: s_or_b32 s0, s0, s9 ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 ; GFX7-NEXT: s_or_b32 s0, s0, s1 ; GFX7-NEXT: s_lshl_b32 s1, s5, 24 ; GFX7-NEXT: s_or_b32 s5, s0, s1 -; GFX7-NEXT: s_bfe_u32 s1, s2, s10 -; GFX7-NEXT: s_and_b32 s0, s2, s8 +; GFX7-NEXT: s_bfe_u32 s1, s2, 0x80008 +; GFX7-NEXT: s_and_b32 s0, s2, 0xff ; GFX7-NEXT: s_lshl_b32 s1, s1, 8 ; GFX7-NEXT: s_or_b32 s0, s0, s1 -; GFX7-NEXT: s_bfe_u32 s1, s2, s11 +; GFX7-NEXT: s_bfe_u32 s1, s2, 0x80010 ; GFX7-NEXT: s_lshr_b32 s6, s2, 24 ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 ; GFX7-NEXT: s_or_b32 s0, s0, s1 ; GFX7-NEXT: s_lshl_b32 s1, s6, 24 ; GFX7-NEXT: s_or_b32 s6, s0, s1 -; GFX7-NEXT: s_bfe_u32 s1, s3, s10 -; GFX7-NEXT: s_and_b32 s0, s3, s8 +; GFX7-NEXT: s_bfe_u32 s1, s3, 0x80008 +; GFX7-NEXT: s_and_b32 s0, s3, 0xff ; GFX7-NEXT: s_lshl_b32 s1, s1, 8 ; GFX7-NEXT: s_or_b32 s0, s0, s1 -; GFX7-NEXT: s_bfe_u32 s1, s3, s11 +; GFX7-NEXT: s_bfe_u32 s1, s3, 0x80010 ; GFX7-NEXT: s_lshr_b32 s7, s3, 24 ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 ; GFX7-NEXT: s_or_b32 s0, s0, s1 ; GFX7-NEXT: s_lshl_b32 s1, s7, 24 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 -; GFX7-NEXT: v_and_b32_e32 v1, 3, v1 ; GFX7-NEXT: s_or_b32 s7, s0, s1 ; GFX7-NEXT: v_mov_b32_e32 v5, s6 ; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc @@ -5779,55 +5697,56 @@ ; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3] ; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX7-NEXT: v_and_b32_e32 
v1, v2, v1 -; GFX7-NEXT: v_or_b32_e32 v5, v1, v0 +; GFX7-NEXT: v_or_b32_e32 v6, v1, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-NEXT: v_mov_b32_e32 v3, s7 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 -; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[4:5] -; GFX7-NEXT: v_bfe_u32 v9, v0, 8, 8 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v5, 0xff +; GFX7-NEXT: v_bfe_u32 v10, v0, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX7-NEXT: v_and_b32_e32 v8, s8, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX7-NEXT: v_and_b32_e32 v9, v0, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v10, 8, v10 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX7-NEXT: v_or_b32_e32 v8, v8, v9 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; GFX7-NEXT: v_or_b32_e32 v9, v9, v10 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v8, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v9, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX7-NEXT: v_bfe_u32 v8, v1, 8, 8 -; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] -; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[2:3] -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v1 +; GFX7-NEXT: v_bfe_u32 v9, v1, 8, 8 +; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1] +; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[2:3] +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX7-NEXT: v_and_b32_e32 v4, s8, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; GFX7-NEXT: v_and_b32_e32 v4, v1, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX7-NEXT: v_or_b32_e32 v4, v4, v8 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v9 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_or_b32_e32 v1, v4, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v5 -; GFX7-NEXT: v_bfe_u32 v5, v2, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v6 +; GFX7-NEXT: 
v_bfe_u32 v6, v2, 8, 8 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v2 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX7-NEXT: v_and_b32_e32 v4, s8, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX7-NEXT: v_and_b32_e32 v4, v2, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 ; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 8 -; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v6 -; GFX7-NEXT: v_bfe_u32 v5, v3, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v7 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 -; GFX7-NEXT: v_and_b32_e32 v4, s8, v3 +; GFX7-NEXT: v_and_b32_e32 v4, v3, v5 +; GFX7-NEXT: v_bfe_u32 v5, v3, 8, 8 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 24, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 8 ; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_or_b32_e32 v3, v4, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v8 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX7-NEXT: s_mov_b32 s2, -1 @@ -5838,65 +5757,62 @@ ; GFX10-LABEL: insertelement_s_v16i8_v_v: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GFX10-NEXT: s_mov_b32 s7, 0x80008 -; GFX10-NEXT: s_movk_i32 s8, 0xff -; GFX10-NEXT: s_mov_b32 s9, 0x80010 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 2, v1 ; GFX10-NEXT: v_and_b32_e32 v1, 3, v1 ; GFX10-NEXT: v_mov_b32_e32 v10, 8 ; GFX10-NEXT: v_mov_b32_e32 v12, 16 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v1 -; GFX10-NEXT: v_lshlrev_b32_e64 v3, v1, s8 +; GFX10-NEXT: v_lshlrev_b32_e64 v3, v1, 0xff ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: v_xor_b32_e32 v1, -1, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_bfe_u32 s12, s0, s7 +; 
GFX10-NEXT: s_bfe_u32 s9, s0, 0x80008 ; GFX10-NEXT: s_lshr_b32 s4, s0, 24 -; GFX10-NEXT: s_and_b32 s11, s0, s8 -; GFX10-NEXT: s_bfe_u32 s0, s0, s9 -; GFX10-NEXT: s_lshl_b32 s12, s12, 8 -; GFX10-NEXT: s_lshl_b32 s0, s0, 16 -; GFX10-NEXT: s_or_b32 s11, s11, s12 -; GFX10-NEXT: s_bfe_u32 s16, s2, s7 -; GFX10-NEXT: s_lshl_b32 s4, s4, 24 -; GFX10-NEXT: s_or_b32 s0, s11, s0 -; GFX10-NEXT: s_bfe_u32 s14, s1, s7 -; GFX10-NEXT: s_and_b32 s15, s2, s8 -; GFX10-NEXT: s_lshl_b32 s16, s16, 8 -; GFX10-NEXT: s_or_b32 s4, s0, s4 -; GFX10-NEXT: s_bfe_u32 s0, s2, s9 +; GFX10-NEXT: s_and_b32 s8, s0, 0xff +; GFX10-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX10-NEXT: s_bfe_u32 s11, s1, 0x80008 +; GFX10-NEXT: s_lshl_b32 s9, s9, 8 ; GFX10-NEXT: s_lshr_b32 s5, s1, 24 -; GFX10-NEXT: s_and_b32 s13, s1, s8 -; GFX10-NEXT: s_bfe_u32 s1, s1, s9 -; GFX10-NEXT: s_lshl_b32 s14, s14, 8 -; GFX10-NEXT: s_lshr_b32 s6, s2, 24 -; GFX10-NEXT: s_or_b32 s2, s15, s16 +; GFX10-NEXT: s_and_b32 s10, s1, 0xff +; GFX10-NEXT: s_bfe_u32 s1, s1, 0x80010 ; GFX10-NEXT: s_lshl_b32 s0, s0, 16 +; GFX10-NEXT: s_lshl_b32 s11, s11, 8 +; GFX10-NEXT: s_or_b32 s8, s8, s9 +; GFX10-NEXT: s_lshl_b32 s4, s4, 24 ; GFX10-NEXT: s_lshl_b32 s1, s1, 16 -; GFX10-NEXT: s_or_b32 s12, s13, s14 -; GFX10-NEXT: s_or_b32 s0, s2, s0 -; GFX10-NEXT: s_lshl_b32 s2, s6, 24 -; GFX10-NEXT: s_or_b32 s1, s12, s1 -; GFX10-NEXT: s_lshl_b32 s5, s5, 24 -; GFX10-NEXT: s_or_b32 s6, s0, s2 -; GFX10-NEXT: s_bfe_u32 s0, s3, s7 -; GFX10-NEXT: s_or_b32 s5, s1, s5 -; GFX10-NEXT: s_and_b32 s1, s3, s8 +; GFX10-NEXT: s_or_b32 s9, s10, s11 +; GFX10-NEXT: s_or_b32 s0, s8, s0 +; GFX10-NEXT: s_or_b32 s1, s9, s1 +; GFX10-NEXT: s_or_b32 s4, s0, s4 +; GFX10-NEXT: s_lshl_b32 s0, s5, 24 +; GFX10-NEXT: s_bfe_u32 s13, s2, 0x80008 +; GFX10-NEXT: s_or_b32 s5, s1, s0 +; GFX10-NEXT: s_bfe_u32 s0, s3, 0x80008 +; GFX10-NEXT: s_and_b32 s1, s3, 0xff ; GFX10-NEXT: s_lshl_b32 s0, s0, 8 ; GFX10-NEXT: v_mov_b32_e32 v2, s5 ; GFX10-NEXT: s_or_b32 s0, s1, s0 -; GFX10-NEXT: s_bfe_u32 s1, s3, s9 -; 
GFX10-NEXT: s_lshr_b32 s10, s3, 24 +; GFX10-NEXT: s_bfe_u32 s1, s3, 0x80010 +; GFX10-NEXT: s_lshr_b32 s6, s2, 24 +; GFX10-NEXT: s_and_b32 s12, s2, 0xff +; GFX10-NEXT: s_bfe_u32 s2, s2, 0x80010 +; GFX10-NEXT: s_lshl_b32 s13, s13, 8 ; GFX10-NEXT: s_lshl_b32 s1, s1, 16 +; GFX10-NEXT: s_lshl_b32 s2, s2, 16 +; GFX10-NEXT: s_or_b32 s10, s12, s13 ; GFX10-NEXT: v_cndmask_b32_e32 v2, s4, v2, vcc_lo ; GFX10-NEXT: s_or_b32 s1, s0, s1 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v4 -; GFX10-NEXT: s_lshl_b32 s2, s10, 24 +; GFX10-NEXT: s_lshr_b32 s7, s3, 24 +; GFX10-NEXT: s_or_b32 s2, s10, s2 +; GFX10-NEXT: s_lshl_b32 s6, s6, 24 ; GFX10-NEXT: s_mov_b32 s3, 8 +; GFX10-NEXT: s_or_b32 s6, s2, s6 +; GFX10-NEXT: s_lshl_b32 s2, s7, 24 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s6, s0 ; GFX10-NEXT: s_or_b32 s7, s1, s2 ; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v4 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s6, s0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v4 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s7, s1 ; GFX10-NEXT: v_and_or_b32 v5, v2, v1, v0 @@ -5917,16 +5833,16 @@ ; GFX10-NEXT: v_lshrrev_b32_e32 v8, 24, v3 ; GFX10-NEXT: v_lshlrev_b32_sdwa v11, v10, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v10, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_and_or_b32 v6, v0, s8, v6 +; GFX10-NEXT: v_and_or_b32 v6, 0xff, v0, v6 ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX10-NEXT: v_and_or_b32 v9, v1, s8, v9 +; GFX10-NEXT: v_and_or_b32 v9, 0xff, v1, v9 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v5 -; GFX10-NEXT: v_and_or_b32 v11, v2, s8, v11 +; GFX10-NEXT: v_and_or_b32 v11, 0xff, v2, v11 ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v12, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 
; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; GFX10-NEXT: v_and_or_b32 v10, v3, s8, v10 +; GFX10-NEXT: v_and_or_b32 v10, 0xff, v3, v10 ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v12, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8 ; GFX10-NEXT: v_or3_b32 v0, v6, v0, v4 @@ -5948,171 +5864,172 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dwordx4 v[3:6], v[0:1], off ; GFX9-NEXT: s_mov_b32 s0, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, 8 ; GFX9-NEXT: s_mov_b32 s1, 16 -; GFX9-NEXT: s_movk_i32 s6, 0xff -; GFX9-NEXT: v_mov_b32_e32 v0, 8 -; GFX9-NEXT: v_mov_b32_e32 v1, 16 -; GFX9-NEXT: v_mov_b32_e32 v7, 0 -; GFX9-NEXT: v_mov_b32_e32 v8, 0 +; GFX9-NEXT: s_movk_i32 s3, 0xff +; GFX9-NEXT: v_mov_b32_e32 v0, 0xff +; GFX9-NEXT: v_mov_b32_e32 v7, 16 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 24, v4 -; GFX9-NEXT: v_lshlrev_b32_sdwa v13, s0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v15, s0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v5 -; GFX9-NEXT: v_lshlrev_b32_sdwa v14, s1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v16, s1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v17, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_and_or_b32 v3, v3, s6, v13 -; GFX9-NEXT: v_and_or_b32 v4, v4, s6, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 2, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 24, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v4 +; GFX9-NEXT: v_lshlrev_b32_sdwa v12, s0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v14, s0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v16, v1, 
v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 24, v5 +; GFX9-NEXT: v_lshlrev_b32_sdwa v13, s1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v15, s1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v3, v3, s3, v12 +; GFX9-NEXT: v_and_or_b32 v12, v4, s3, v14 +; GFX9-NEXT: v_and_or_b32 v14, v5, v0, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 2, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 3, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 24, v8 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 24, v9 -; GFX9-NEXT: v_lshlrev_b32_e32 v10, 24, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 24, v6 -; GFX9-NEXT: v_lshlrev_b32_sdwa v18, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v19, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_and_or_b32 v5, v5, s6, v17 -; GFX9-NEXT: s_and_b32 s0, s2, s6 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v6 +; GFX9-NEXT: v_lshlrev_b32_sdwa v17, v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v18, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: s_and_b32 s0, s2, 0xff ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; GFX9-NEXT: v_or3_b32 v3, v3, v13, v8 +; GFX9-NEXT: v_or3_b32 v8, v12, v15, v9 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v16 +; GFX9-NEXT: v_lshlrev_b32_sdwa v19, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v6, v6, v0, v18 +; GFX9-NEXT: v_lshlrev_b32_e64 v18, v2, s0 ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 24, v11 -; GFX9-NEXT: v_or3_b32 v3, v3, v14, v9 -; GFX9-NEXT: v_or3_b32 v4, v4, v16, v10 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v15 -; GFX9-NEXT: v_and_or_b32 v13, v6, s6, v19 -; GFX9-NEXT: v_lshlrev_b32_sdwa v6, v1, v6 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_e64 v17, v2, s0 -; GFX9-NEXT: v_lshlrev_b32_e32 v12, 24, v12 -; GFX9-NEXT: v_or3_b32 v5, v5, v18, v11 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v3, v4, vcc -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v15 -; GFX9-NEXT: v_lshlrev_b32_e64 v2, v2, s6 -; GFX9-NEXT: v_or3_b32 v6, v13, v6, v12 -; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v5, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v15 +; GFX9-NEXT: v_or3_b32 v9, v14, v17, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v3, v8, vcc +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v16 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, v2, v0 +; GFX9-NEXT: v_or3_b32 v6, v6, v19, v11 +; GFX9-NEXT: v_cndmask_b32_e64 v10, v10, v9, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v16 ; GFX9-NEXT: v_xor_b32_e32 v2, -1, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v6, s[2:3] -; GFX9-NEXT: v_and_or_b32 v2, v9, v2, v17 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 +; GFX9-NEXT: v_cndmask_b32_e64 v10, v10, v6, s[2:3] +; GFX9-NEXT: v_and_or_b32 v2, v10, v2, v18 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v16 ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v2, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v2, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v8, v8, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v2, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[2:3] ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 24, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 24, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v12, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v14, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v16, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v13, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v15, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v17, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v18, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v1, v3, s6, v12 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 24, v6 -; GFX9-NEXT: v_and_or_b32 v4, v4, s6, v14 -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 24, v9 -; GFX9-NEXT: v_and_or_b32 v5, v5, s6, v16 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 24, v10 -; GFX9-NEXT: v_and_or_b32 v10, v2, s6, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 24, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 24, v2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v13, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v15, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v17, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v14, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v16, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v18, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v7, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v3, v3, v0, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX9-NEXT: v_and_or_b32 v8, v8, v0, v15 +; GFX9-NEXT: 
v_lshlrev_b32_e32 v10, 24, v10 +; GFX9-NEXT: v_and_or_b32 v9, v9, v0, v17 ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 24, v11 -; GFX9-NEXT: v_or3_b32 v0, v1, v13, v3 -; GFX9-NEXT: v_or3_b32 v1, v4, v15, v6 -; GFX9-NEXT: v_or3_b32 v2, v5, v17, v9 -; GFX9-NEXT: v_or3_b32 v3, v10, v18, v11 -; GFX9-NEXT: global_store_dwordx4 v[7:8], v[0:3], off +; GFX9-NEXT: v_and_or_b32 v13, v2, v0, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; GFX9-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-NEXT: v_or3_b32 v0, v3, v14, v6 +; GFX9-NEXT: v_or3_b32 v1, v8, v16, v10 +; GFX9-NEXT: v_or3_b32 v2, v9, v18, v11 +; GFX9-NEXT: v_or3_b32 v3, v13, v7, v12 +; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_v_v16i8_s_v: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dwordx4 v[3:6], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, 8 -; GFX8-NEXT: v_mov_b32_e32 v1, 16 -; GFX8-NEXT: v_mov_b32_e32 v9, 8 -; GFX8-NEXT: v_mov_b32_e32 v10, 16 -; GFX8-NEXT: s_movk_i32 s0, 0xff -; GFX8-NEXT: s_and_b32 s1, s2, s0 -; GFX8-NEXT: v_mov_b32_e32 v7, 0 -; GFX8-NEXT: v_mov_b32_e32 v8, 0 +; GFX8-NEXT: v_mov_b32_e32 v1, 8 +; GFX8-NEXT: v_mov_b32_e32 v8, 8 +; GFX8-NEXT: v_mov_b32_e32 v7, 16 +; GFX8-NEXT: v_mov_b32_e32 v0, 0xff +; GFX8-NEXT: v_mov_b32_e32 v9, 16 +; GFX8-NEXT: s_and_b32 s0, s2, 0xff ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_sdwa v15, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v12, 24, v4 -; GFX8-NEXT: v_lshlrev_b32_sdwa v16, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v17, v9, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_or_b32_sdwa 
v3, v3, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshrrev_b32_e32 v13, 24, v5 -; GFX8-NEXT: v_lshlrev_b32_sdwa v18, v10, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v19, v9, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_or_b32_sdwa v4, v5, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshrrev_b32_e32 v15, 2, v2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v14, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v16, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 24, v3 +; GFX8-NEXT: v_lshlrev_b32_sdwa v15, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_or_b32_sdwa v3, v3, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v14, v5, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v16, 2, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 3, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v11, 24, v11 -; GFX8-NEXT: v_lshlrev_b32_e32 v12, 24, v12 -; GFX8-NEXT: v_or_b32_e32 v3, v3, v16 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v14, 24, v6 -; GFX8-NEXT: v_or_b32_sdwa v5, v6, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v10, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v4 +; GFX8-NEXT: v_lshlrev_b32_sdwa v7, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: 
v_lshlrev_b32_sdwa v18, v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v13, 24, v13 -; GFX8-NEXT: v_or_b32_e32 v1, v4, v18 -; GFX8-NEXT: v_or_b32_e32 v3, v3, v11 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v12 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v15 -; GFX8-NEXT: v_lshlrev_b32_e64 v17, v2, s1 -; GFX8-NEXT: v_lshlrev_b32_e64 v2, v2, s0 -; GFX8-NEXT: v_lshlrev_b32_e32 v14, 24, v14 -; GFX8-NEXT: v_or_b32_e32 v4, v5, v6 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v13 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v0, vcc -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v15 -; GFX8-NEXT: v_or_b32_e32 v4, v4, v14 -; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, v1, s[0:1] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v15 -; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, v4, s[2:3] -; GFX8-NEXT: v_and_b32_e32 v2, v5, v2 -; GFX8-NEXT: v_or_b32_e32 v2, v2, v17 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 -; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v2, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v2, v4, v2, s[2:3] -; GFX8-NEXT: v_lshlrev_b32_sdwa v12, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v14, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v16, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v9, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 24, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v13, v10, v3 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v15, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v17, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v10, v10, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_sdwa v3, v3, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX8-NEXT: v_lshrrev_b32_e32 v12, 24, v5 +; GFX8-NEXT: v_lshrrev_b32_e32 v13, 24, v6 +; GFX8-NEXT: v_lshlrev_b32_sdwa v17, v9, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v19, v9, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_or_b32_sdwa v6, v6, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e64 v18, v2, s0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, v2, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 24, v10 +; GFX8-NEXT: v_lshlrev_b32_e32 v10, 24, v11 +; GFX8-NEXT: v_or_b32_e32 v3, v3, v15 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v7 +; GFX8-NEXT: v_lshlrev_b32_e32 v11, 24, v12 +; GFX8-NEXT: v_or_b32_e32 v7, v14, v17 +; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v10 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v16 +; GFX8-NEXT: v_lshlrev_b32_e32 v12, 24, v13 +; GFX8-NEXT: v_or_b32_e32 v6, v6, v19 +; GFX8-NEXT: v_or_b32_e32 v3, v7, v11 +; GFX8-NEXT: v_cndmask_b32_e32 v7, v2, v1, vcc +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v16 
+; GFX8-NEXT: v_or_b32_e32 v6, v6, v12 +; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v3, s[0:1] +; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v16 +; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v6, s[2:3] +; GFX8-NEXT: v_and_b32_e32 v0, v7, v0 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v18 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v16 +; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v0, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v0, s[2:3] +; GFX8-NEXT: v_lshlrev_b32_sdwa v12, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v14, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v16, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 24, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v13, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v15, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v17, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v9, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_or_b32_sdwa v2, v2, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v3, v3, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v8 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v4, 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 24, v6 -; GFX8-NEXT: v_lshlrev_b32_e32 v9, 24, v11 -; GFX8-NEXT: v_or_b32_e32 v3, v3, v13 -; GFX8-NEXT: v_or_b32_e32 v11, v0, v15 -; GFX8-NEXT: v_or_b32_e32 v12, v1, v17 -; GFX8-NEXT: v_or_b32_e32 v10, v2, v10 -; GFX8-NEXT: v_or_b32_e32 v0, v3, v4 -; GFX8-NEXT: v_or_b32_e32 v1, v11, v5 -; GFX8-NEXT: v_or_b32_e32 v2, v12, v6 -; GFX8-NEXT: v_or_b32_e32 v3, v10, v9 -; GFX8-NEXT: flat_store_dwordx4 v[7:8], v[0:3] +; GFX8-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX8-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 24, v11 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v13 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v15 +; GFX8-NEXT: v_or_b32_e32 v3, v3, v17 +; GFX8-NEXT: v_or_b32_e32 v9, v0, v9 +; GFX8-NEXT: v_mov_b32_e32 v5, 0 +; GFX8-NEXT: v_or_b32_e32 v0, v2, v6 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v7 +; GFX8-NEXT: v_or_b32_e32 v2, v3, v10 +; GFX8-NEXT: v_or_b32_e32 v3, v9, v8 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_endpgm ; ; GFX7-LABEL: insertelement_v_v16i8_s_v: @@ -6121,108 +6038,109 @@ ; GFX7-NEXT: s_mov_b32 s11, 0xf000 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_movk_i32 s6, 0xff -; GFX7-NEXT: v_lshrrev_b32_e32 v17, 2, v2 +; GFX7-NEXT: s_movk_i32 s0, 0xff +; GFX7-NEXT: v_mov_b32_e32 v7, 0xff +; GFX7-NEXT: v_lshrrev_b32_e32 v18, 2, v2 ; GFX7-NEXT: v_and_b32_e32 v2, 3, v2 -; GFX7-NEXT: s_and_b32 s0, s2, s6 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v17 -; GFX7-NEXT: v_lshl_b32_e32 v18, s0, v2 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v17 -; GFX7-NEXT: v_lshl_b32_e32 v2, s6, v2 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v17 -; GFX7-NEXT: v_xor_b32_e32 v2, -1, v2 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v17 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v18 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v18 ; GFX7-NEXT: s_mov_b32 
s10, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_bfe_u32 v10, v3, 8, 8 -; GFX7-NEXT: v_bfe_u32 v12, v4, 8, 8 +; GFX7-NEXT: v_bfe_u32 v11, v3, 8, 8 +; GFX7-NEXT: v_bfe_u32 v13, v4, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v4 -; GFX7-NEXT: v_and_b32_e32 v9, s6, v3 +; GFX7-NEXT: v_and_b32_e32 v10, s0, v3 ; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 8 -; GFX7-NEXT: v_and_b32_e32 v11, s6, v4 +; GFX7-NEXT: v_and_b32_e32 v12, s0, v4 ; GFX7-NEXT: v_bfe_u32 v4, v4, 16, 8 -; GFX7-NEXT: v_bfe_u32 v14, v5, 8, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v10, 8, v10 -; GFX7-NEXT: v_lshlrev_b32_e32 v12, 8, v12 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v5 -; GFX7-NEXT: v_and_b32_e32 v13, s6, v5 +; GFX7-NEXT: v_bfe_u32 v15, v5, 8, 8 +; GFX7-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX7-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 24, v5 +; GFX7-NEXT: v_and_b32_e32 v14, v5, v7 ; GFX7-NEXT: v_bfe_u32 v5, v5, 16, 8 -; GFX7-NEXT: v_bfe_u32 v16, v6, 8, 8 +; GFX7-NEXT: v_bfe_u32 v17, v6, 8, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v14, 8, v14 -; GFX7-NEXT: v_or_b32_e32 v9, v9, v10 -; GFX7-NEXT: v_or_b32_e32 v10, v11, v12 -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 24, v6 -; GFX7-NEXT: v_and_b32_e32 v15, s6, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX7-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX7-NEXT: v_or_b32_e32 v11, v12, v13 +; GFX7-NEXT: v_lshrrev_b32_e32 v9, 24, v6 +; GFX7-NEXT: v_and_b32_e32 v16, v6, v7 ; GFX7-NEXT: v_bfe_u32 v6, v6, 16, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v16, 8, v16 -; GFX7-NEXT: v_or_b32_e32 v11, v13, v14 -; GFX7-NEXT: v_or_b32_e32 v3, v9, v3 -; GFX7-NEXT: v_or_b32_e32 v4, v10, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; GFX7-NEXT: v_or_b32_e32 v12, v14, v15 +; 
GFX7-NEXT: v_or_b32_e32 v3, v10, v3 +; GFX7-NEXT: v_or_b32_e32 v4, v11, v4 +; GFX7-NEXT: s_and_b32 s0, s2, 0xff +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 24, v8 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v12, v15, v16 -; GFX7-NEXT: v_or_b32_e32 v5, v11, v5 +; GFX7-NEXT: v_or_b32_e32 v13, v16, v17 +; GFX7-NEXT: v_or_b32_e32 v5, v12, v5 ; GFX7-NEXT: v_or_b32_e32 v0, v3, v0 ; GFX7-NEXT: v_or_b32_e32 v1, v4, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v8, 24, v8 -; GFX7-NEXT: v_or_b32_e32 v6, v12, v6 -; GFX7-NEXT: v_or_b32_e32 v3, v5, v7 +; GFX7-NEXT: v_lshl_b32_e32 v19, s0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; GFX7-NEXT: v_or_b32_e32 v6, v13, v6 +; GFX7-NEXT: v_or_b32_e32 v3, v5, v8 ; GFX7-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc -; GFX7-NEXT: v_or_b32_e32 v4, v6, v8 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v18 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, v2, v7 +; GFX7-NEXT: v_or_b32_e32 v4, v6, v9 ; GFX7-NEXT: v_cndmask_b32_e64 v5, v5, v3, s[0:1] +; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v18 +; GFX7-NEXT: v_xor_b32_e32 v2, -1, v2 ; GFX7-NEXT: v_cndmask_b32_e64 v5, v5, v4, s[2:3] ; GFX7-NEXT: v_and_b32_e32 v2, v5, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v18 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v19 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[4:5] ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX7-NEXT: v_bfe_u32 v10, v0, 8, 8 ; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v2, s[0:1] -; GFX7-NEXT: v_bfe_u32 v9, v0, 8, 8 -; GFX7-NEXT: v_bfe_u32 v11, v1, 8, 8 ; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v2, s[2:3] ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v1 -; GFX7-NEXT: v_and_b32_e32 v8, s6, v0 +; GFX7-NEXT: v_and_b32_e32 v9, v0, v7 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_and_b32_e32 v10, s6, v1 +; GFX7-NEXT: v_bfe_u32 v12, v1, 8, 8 +; GFX7-NEXT: v_lshlrev_b32_e32 v10, 8, v10 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v1 +; GFX7-NEXT: v_and_b32_e32 v11, v1, v7 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX7-NEXT: 
v_bfe_u32 v13, v3, 8, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; GFX7-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX7-NEXT: v_bfe_u32 v14, v3, 8, 8 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v12, 8, v12 +; GFX7-NEXT: v_or_b32_e32 v9, v9, v10 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v3 -; GFX7-NEXT: v_and_b32_e32 v12, s6, v3 +; GFX7-NEXT: v_and_b32_e32 v13, v3, v7 ; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v13, 8, v13 -; GFX7-NEXT: v_or_b32_e32 v8, v8, v9 -; GFX7-NEXT: v_or_b32_e32 v9, v10, v11 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; GFX7-NEXT: v_or_b32_e32 v10, v11, v12 +; GFX7-NEXT: v_or_b32_e32 v0, v9, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 24, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_or_b32_e32 v10, v12, v13 -; GFX7-NEXT: v_or_b32_e32 v0, v8, v0 -; GFX7-NEXT: v_or_b32_e32 v1, v9, v1 +; GFX7-NEXT: v_or_b32_e32 v11, v13, v14 +; GFX7-NEXT: v_or_b32_e32 v1, v10, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v5 -; GFX7-NEXT: v_or_b32_e32 v2, v10, v3 +; GFX7-NEXT: v_or_b32_e32 v2, v11, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v6 ; GFX7-NEXT: v_bfe_u32 v5, v4, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 24, v4 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX7-NEXT: v_and_b32_e32 v3, s6, v4 +; GFX7-NEXT: v_and_b32_e32 v3, v4, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX7-NEXT: v_bfe_u32 v4, v4, 16, 8 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v8 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; GFX7-NEXT: 
s_endpgm @@ -6233,7 +6151,6 @@ ; GFX10-NEXT: s_mov_b32 s0, 8 ; GFX10-NEXT: v_mov_b32_e32 v1, 8 ; GFX10-NEXT: s_mov_b32 s1, 16 -; GFX10-NEXT: s_movk_i32 s3, 0xff ; GFX10-NEXT: v_and_b32_e32 v0, 3, v2 ; GFX10-NEXT: v_mov_b32_e32 v7, 16 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 2, v2 @@ -6248,25 +6165,25 @@ ; GFX10-NEXT: v_lshlrev_b32_sdwa v13, s1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v15, s1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v16, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_and_or_b32 v3, v3, s3, v12 +; GFX10-NEXT: v_and_or_b32 v3, v3, 0xff, v12 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8 -; GFX10-NEXT: v_and_or_b32 v4, v4, s3, v14 +; GFX10-NEXT: v_and_or_b32 v4, v4, 0xff, v14 ; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v9 ; GFX10-NEXT: v_lshrrev_b32_e32 v11, 24, v6 ; GFX10-NEXT: v_lshlrev_b32_sdwa v17, v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v18, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_and_or_b32 v5, v5, s3, v16 +; GFX10-NEXT: v_and_or_b32 v5, 0xff, v5, v16 ; GFX10-NEXT: v_lshlrev_b32_e32 v10, 24, v10 ; GFX10-NEXT: v_or3_b32 v3, v3, v13, v8 ; GFX10-NEXT: v_or3_b32 v4, v4, v15, v9 ; GFX10-NEXT: v_lshlrev_b32_sdwa v12, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v6, v6, s3, v18 +; GFX10-NEXT: v_and_or_b32 v6, 0xff, v6, v18 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v11 ; GFX10-NEXT: v_or3_b32 v5, v5, v17, v10 ; GFX10-NEXT: v_cndmask_b32_e32 v9, v3, v4, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v2 -; GFX10-NEXT: s_and_b32 s1, s2, s3 -; GFX10-NEXT: v_lshlrev_b32_e64 v10, v0, s3 +; GFX10-NEXT: s_and_b32 s1, s2, 0xff +; GFX10-NEXT: v_lshlrev_b32_e64 v10, v0, 0xff ; GFX10-NEXT: v_or3_b32 v6, v6, v12, v8 ; GFX10-NEXT: 
v_lshlrev_b32_e64 v0, v0, s1 ; GFX10-NEXT: v_cndmask_b32_e64 v8, v9, v5, s0 @@ -6291,13 +6208,13 @@ ; GFX10-NEXT: v_lshlrev_b32_sdwa v13, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v15, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v7, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v2, v2, s3, v10 +; GFX10-NEXT: v_and_or_b32 v2, 0xff, v2, v10 ; GFX10-NEXT: v_lshlrev_b32_e32 v10, 24, v5 -; GFX10-NEXT: v_and_or_b32 v3, v3, s3, v12 +; GFX10-NEXT: v_and_or_b32 v3, 0xff, v3, v12 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6 -; GFX10-NEXT: v_and_or_b32 v12, v4, s3, v14 +; GFX10-NEXT: v_and_or_b32 v12, 0xff, v4, v14 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8 -; GFX10-NEXT: v_and_or_b32 v14, v0, s3, v1 +; GFX10-NEXT: v_and_or_b32 v14, 0xff, v0, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v9 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_mov_b32_e32 v5, 0 @@ -6319,95 +6236,95 @@ ; GFX9-NEXT: global_load_dwordx4 v[3:6], v[0:1], off ; GFX9-NEXT: s_mov_b32 s0, 8 ; GFX9-NEXT: s_mov_b32 s1, 16 -; GFX9-NEXT: s_movk_i32 s6, 0xff -; GFX9-NEXT: v_mov_b32_e32 v0, 8 -; GFX9-NEXT: v_mov_b32_e32 v1, 16 +; GFX9-NEXT: s_movk_i32 s3, 0xff +; GFX9-NEXT: v_mov_b32_e32 v1, 8 +; GFX9-NEXT: v_mov_b32_e32 v0, 0xff +; GFX9-NEXT: v_mov_b32_e32 v7, 16 ; GFX9-NEXT: s_lshr_b32 s4, s2, 2 -; GFX9-NEXT: s_and_b32 s2, s2, 3 -; GFX9-NEXT: s_lshl_b32 s2, s2, 3 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: s_lshl_b32 s2, s6, s2 -; GFX9-NEXT: s_not_b32 s5, s2 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 -; GFX9-NEXT: v_mov_b32_e32 v7, 0 -; GFX9-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 24, v4 -; GFX9-NEXT: 
v_lshlrev_b32_sdwa v13, s0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v15, s0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v5 -; GFX9-NEXT: v_lshlrev_b32_sdwa v14, s1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v16, s1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v17, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_and_or_b32 v3, v3, s6, v13 -; GFX9-NEXT: v_and_or_b32 v4, v4, s6, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 24, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v4 +; GFX9-NEXT: v_lshlrev_b32_sdwa v12, s0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v14, s0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: s_and_b32 s0, s2, 3 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 24, v5 +; GFX9-NEXT: v_lshlrev_b32_sdwa v13, s1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v15, s1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v16, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_and_or_b32 v3, v3, s3, v12 +; GFX9-NEXT: v_and_or_b32 v12, v4, s3, v14 +; GFX9-NEXT: s_lshl_b32 s0, s0, 3 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 24, v8 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v6 +; GFX9-NEXT: v_lshlrev_b32_sdwa v17, v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v18, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_and_or_b32 v14, v5, v0, v16 +; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_lshl_b32 s0, 0xff, s0 ; GFX9-NEXT: v_lshlrev_b32_e32 v10, 24, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 24, v6 -; GFX9-NEXT: v_lshlrev_b32_sdwa v18, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v19, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_and_or_b32 v5, v5, s6, v17 +; GFX9-NEXT: v_or3_b32 v3, v3, v13, v8 +; GFX9-NEXT: v_or3_b32 v8, v12, v15, v9 +; GFX9-NEXT: v_lshlrev_b32_sdwa v19, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v6, v6, v0, v18 +; GFX9-NEXT: s_not_b32 s5, s0 ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 24, v11 -; GFX9-NEXT: v_or3_b32 v3, v3, v14, v9 -; GFX9-NEXT: v_or3_b32 v4, v4, v16, v10 -; GFX9-NEXT: v_and_or_b32 v13, v6, s6, v19 -; GFX9-NEXT: v_lshlrev_b32_sdwa v6, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_e32 v12, 24, v12 -; GFX9-NEXT: v_or3_b32 v5, v5, v18, v11 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v3, v4, vcc +; GFX9-NEXT: v_or3_b32 v9, v14, v17, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v3, v8, vcc ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 -; GFX9-NEXT: v_or3_b32 v6, v13, v6, v12 -; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v5, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v6, s[2:3] -; GFX9-NEXT: v_and_or_b32 v2, v9, s5, v2 +; GFX9-NEXT: v_or3_b32 v6, v6, v19, v11 +; GFX9-NEXT: v_cndmask_b32_e64 v10, v10, v9, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 +; GFX9-NEXT: v_cndmask_b32_e64 v10, v10, v6, s[2:3] +; GFX9-NEXT: v_and_or_b32 v2, v10, s5, v2 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v2, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v2, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v8, v8, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v2, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, v2, 
s[2:3] ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 24, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 24, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v12, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v14, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v16, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v13, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v15, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v17, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v18, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v1, v3, s6, v12 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 24, v6 -; GFX9-NEXT: v_and_or_b32 v4, v4, s6, v14 -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 24, v9 -; GFX9-NEXT: v_and_or_b32 v5, v5, s6, v16 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 24, v10 -; GFX9-NEXT: v_and_or_b32 v10, v2, s6, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 24, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 24, v2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v13, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v15, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v17, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; 
GFX9-NEXT: v_lshlrev_b32_sdwa v14, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v16, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v18, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v7, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v3, v3, v0, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX9-NEXT: v_and_or_b32 v8, v8, v0, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; GFX9-NEXT: v_and_or_b32 v9, v9, v0, v17 ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 24, v11 -; GFX9-NEXT: v_or3_b32 v0, v1, v13, v3 -; GFX9-NEXT: v_or3_b32 v1, v4, v15, v6 -; GFX9-NEXT: v_or3_b32 v2, v5, v17, v9 -; GFX9-NEXT: v_or3_b32 v3, v10, v18, v11 -; GFX9-NEXT: global_store_dwordx4 v[7:8], v[0:3], off +; GFX9-NEXT: v_and_or_b32 v13, v2, v0, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; GFX9-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-NEXT: v_or3_b32 v0, v3, v14, v6 +; GFX9-NEXT: v_or3_b32 v1, v8, v16, v10 +; GFX9-NEXT: v_or3_b32 v2, v9, v18, v11 +; GFX9-NEXT: v_or3_b32 v3, v13, v7, v12 +; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_v_v16i8_v_s: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dwordx4 v[3:6], v[0:1] -; GFX8-NEXT: s_and_b32 s1, s2, 3 +; GFX8-NEXT: s_and_b32 s0, s2, 3 ; GFX8-NEXT: v_mov_b32_e32 v0, 8 -; GFX8-NEXT: s_lshl_b32 s1, s1, 3 +; GFX8-NEXT: s_lshl_b32 s0, s0, 3 ; GFX8-NEXT: v_mov_b32_e32 v1, 16 ; GFX8-NEXT: v_mov_b32_e32 v9, 8 -; GFX8-NEXT: v_mov_b32_e32 v11, s1 +; GFX8-NEXT: v_mov_b32_e32 v11, s0 ; GFX8-NEXT: v_mov_b32_e32 v10, 16 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v11, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: s_movk_i32 s0, 0xff ; GFX8-NEXT: s_lshr_b32 s4, s2, 2 -; GFX8-NEXT: s_lshl_b32 s0, s0, s1 +; GFX8-NEXT: s_lshl_b32 s0, 0xff, s0 ; 
GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 ; GFX8-NEXT: s_not_b32 s5, s0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 @@ -6490,108 +6407,109 @@ ; GFX7-NEXT: s_mov_b32 s11, 0xf000 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_movk_i32 s6, 0xff -; GFX7-NEXT: v_and_b32_e32 v0, s6, v2 -; GFX7-NEXT: s_and_b32 s0, s2, 3 +; GFX7-NEXT: s_movk_i32 s0, 0xff +; GFX7-NEXT: v_mov_b32_e32 v7, 0xff ; GFX7-NEXT: s_lshr_b32 s4, s2, 2 -; GFX7-NEXT: s_lshl_b32 s0, s0, 3 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s0, v0 -; GFX7-NEXT: s_lshl_b32 s0, s6, s0 +; GFX7-NEXT: v_and_b32_e32 v2, v2, v7 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 -; GFX7-NEXT: s_not_b32 s5, s0 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 ; GFX7-NEXT: s_mov_b32 s10, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_bfe_u32 v10, v3, 8, 8 -; GFX7-NEXT: v_bfe_u32 v12, v4, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v4 -; GFX7-NEXT: v_and_b32_e32 v9, s6, v3 +; GFX7-NEXT: v_bfe_u32 v11, v3, 8, 8 +; GFX7-NEXT: v_bfe_u32 v13, v4, 8, 8 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v4 +; GFX7-NEXT: v_and_b32_e32 v10, s0, v3 ; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 8 -; GFX7-NEXT: v_and_b32_e32 v11, s6, v4 +; GFX7-NEXT: v_and_b32_e32 v12, s0, v4 ; GFX7-NEXT: v_bfe_u32 v4, v4, 16, 8 -; GFX7-NEXT: v_bfe_u32 v14, v5, 8, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v10, 8, v10 -; GFX7-NEXT: v_lshlrev_b32_e32 v12, 8, v12 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v5 -; GFX7-NEXT: v_and_b32_e32 v13, s6, v5 +; GFX7-NEXT: v_bfe_u32 v15, v5, 8, 8 +; GFX7-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX7-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 24, v5 +; GFX7-NEXT: v_and_b32_e32 v14, v5, v7 ; GFX7-NEXT: v_bfe_u32 v5, v5, 16, 8 -; GFX7-NEXT: v_bfe_u32 v16, v6, 8, 8 +; GFX7-NEXT: v_bfe_u32 v17, v6, 8, 8 +; GFX7-NEXT: s_and_b32 s0, s2, 3 ; GFX7-NEXT: 
v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v14, 8, v14 -; GFX7-NEXT: v_or_b32_e32 v9, v9, v10 -; GFX7-NEXT: v_or_b32_e32 v10, v11, v12 -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 24, v6 -; GFX7-NEXT: v_and_b32_e32 v15, s6, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX7-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX7-NEXT: v_or_b32_e32 v11, v12, v13 +; GFX7-NEXT: v_lshrrev_b32_e32 v9, 24, v6 +; GFX7-NEXT: v_and_b32_e32 v16, v6, v7 ; GFX7-NEXT: v_bfe_u32 v6, v6, 16, 8 +; GFX7-NEXT: s_lshl_b32 s0, s0, 3 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v16, 8, v16 -; GFX7-NEXT: v_or_b32_e32 v11, v13, v14 -; GFX7-NEXT: v_or_b32_e32 v3, v9, v3 -; GFX7-NEXT: v_or_b32_e32 v4, v10, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v12, v15, v16 -; GFX7-NEXT: v_or_b32_e32 v5, v11, v5 -; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; GFX7-NEXT: v_or_b32_e32 v12, v14, v15 +; GFX7-NEXT: v_or_b32_e32 v3, v10, v3 +; GFX7-NEXT: v_or_b32_e32 v4, v11, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, s0, v2 +; GFX7-NEXT: s_lshl_b32 s0, 0xff, s0 ; GFX7-NEXT: v_lshlrev_b32_e32 v8, 24, v8 -; GFX7-NEXT: v_or_b32_e32 v6, v12, v6 -; GFX7-NEXT: v_or_b32_e32 v3, v5, v7 -; GFX7-NEXT: v_cndmask_b32_e32 v5, v1, v2, vcc -; GFX7-NEXT: v_or_b32_e32 v4, v6, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v13, v16, v17 +; GFX7-NEXT: v_or_b32_e32 v5, v12, v5 +; GFX7-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX7-NEXT: v_or_b32_e32 v1, v4, v1 +; GFX7-NEXT: s_not_b32 s5, s0 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; GFX7-NEXT: v_or_b32_e32 v6, v13, v6 +; GFX7-NEXT: v_or_b32_e32 v3, v5, v8 +; GFX7-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc +; GFX7-NEXT: 
v_cmp_eq_u32_e64 s[0:1], s4, 2 +; GFX7-NEXT: v_or_b32_e32 v4, v6, v9 ; GFX7-NEXT: v_cndmask_b32_e64 v5, v5, v3, s[0:1] +; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 ; GFX7-NEXT: v_cndmask_b32_e64 v5, v5, v4, s[2:3] ; GFX7-NEXT: v_and_b32_e32 v5, s5, v5 -; GFX7-NEXT: v_or_b32_e32 v0, v5, v0 +; GFX7-NEXT: v_or_b32_e32 v2, v5, v2 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0 -; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v0, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v0, s[0:1] -; GFX7-NEXT: v_bfe_u32 v9, v1, 8, 8 -; GFX7-NEXT: v_bfe_u32 v11, v2, 8, 8 -; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v0, s[2:3] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v2 -; GFX7-NEXT: v_and_b32_e32 v8, s6, v1 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX7-NEXT: v_bfe_u32 v10, v0, 8, 8 +; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v2, s[0:1] +; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v2, s[2:3] +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v0 +; GFX7-NEXT: v_and_b32_e32 v9, v0, v7 +; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 +; GFX7-NEXT: v_bfe_u32 v12, v1, 8, 8 +; GFX7-NEXT: v_lshlrev_b32_e32 v10, 8, v10 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v1 +; GFX7-NEXT: v_and_b32_e32 v11, v1, v7 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX7-NEXT: v_and_b32_e32 v10, s6, v2 -; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 8 -; GFX7-NEXT: v_bfe_u32 v13, v3, 8, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; GFX7-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX7-NEXT: v_bfe_u32 v14, v3, 8, 8 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v12, 8, v12 +; GFX7-NEXT: v_or_b32_e32 v9, v9, v10 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v3 -; GFX7-NEXT: v_and_b32_e32 v12, s6, v3 +; GFX7-NEXT: v_and_b32_e32 v13, v3, v7 ; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 8 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; 
GFX7-NEXT: v_lshlrev_b32_e32 v13, 8, v13 -; GFX7-NEXT: v_or_b32_e32 v8, v8, v9 -; GFX7-NEXT: v_or_b32_e32 v9, v10, v11 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; GFX7-NEXT: v_or_b32_e32 v10, v11, v12 +; GFX7-NEXT: v_or_b32_e32 v0, v9, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 24, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_or_b32_e32 v10, v12, v13 -; GFX7-NEXT: v_or_b32_e32 v1, v8, v1 -; GFX7-NEXT: v_or_b32_e32 v2, v9, v2 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX7-NEXT: v_or_b32_e32 v1, v2, v5 -; GFX7-NEXT: v_or_b32_e32 v2, v10, v3 +; GFX7-NEXT: v_or_b32_e32 v11, v13, v14 +; GFX7-NEXT: v_or_b32_e32 v1, v10, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v5 +; GFX7-NEXT: v_or_b32_e32 v2, v11, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v6 ; GFX7-NEXT: v_bfe_u32 v5, v4, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 24, v4 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX7-NEXT: v_and_b32_e32 v3, s6, v4 +; GFX7-NEXT: v_and_b32_e32 v3, v4, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX7-NEXT: v_bfe_u32 v4, v4, 16, 8 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v8 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; GFX7-NEXT: s_endpgm @@ -6602,10 +6520,9 @@ ; GFX10-NEXT: s_mov_b32 s0, 8 ; GFX10-NEXT: v_mov_b32_e32 v0, 8 ; GFX10-NEXT: s_mov_b32 s1, 16 -; GFX10-NEXT: s_movk_i32 s3, 0xff ; GFX10-NEXT: v_mov_b32_e32 v1, 16 -; GFX10-NEXT: s_lshr_b32 s4, s2, 2 -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s4, 1 +; GFX10-NEXT: s_lshr_b32 s3, s2, 2 +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s3, 1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v3 ; GFX10-NEXT: v_lshrrev_b32_e32 v8, 24, v4 @@ -6615,34 
+6532,34 @@ ; GFX10-NEXT: v_lshlrev_b32_sdwa v12, s1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v14, s1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v15, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_and_or_b32 v3, v3, s3, v11 +; GFX10-NEXT: v_and_or_b32 v3, v3, 0xff, v11 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; GFX10-NEXT: v_and_or_b32 v4, v4, s3, v13 +; GFX10-NEXT: v_and_or_b32 v4, v4, 0xff, v13 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8 ; GFX10-NEXT: v_lshrrev_b32_e32 v10, 24, v6 ; GFX10-NEXT: v_lshlrev_b32_sdwa v16, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v17, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_and_or_b32 v5, v5, s3, v15 +; GFX10-NEXT: v_and_or_b32 v5, 0xff, v5, v15 ; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v9 ; GFX10-NEXT: v_or3_b32 v3, v3, v12, v7 ; GFX10-NEXT: v_or3_b32 v4, v4, v14, v8 ; GFX10-NEXT: v_lshlrev_b32_sdwa v11, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v6, v6, s3, v17 +; GFX10-NEXT: v_and_or_b32 v6, 0xff, v6, v17 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v10 ; GFX10-NEXT: v_or3_b32 v5, v5, v16, v9 ; GFX10-NEXT: v_cndmask_b32_e32 v8, v3, v4, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s4, 2 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s3, 2 ; GFX10-NEXT: s_and_b32 s1, s2, 3 ; GFX10-NEXT: v_or3_b32 v6, v6, v11, v7 ; GFX10-NEXT: s_lshl_b32 s2, s1, 3 -; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s4, 3 +; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s3, 3 ; GFX10-NEXT: v_cndmask_b32_e64 v7, v8, v5, s0 ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: s_lshl_b32 s2, s3, s2 +; GFX10-NEXT: s_lshl_b32 s2, 0xff, s2 ; GFX10-NEXT: s_not_b32 s2, s2 ; GFX10-NEXT: 
v_cndmask_b32_e64 v7, v7, v6, s1 ; GFX10-NEXT: v_and_or_b32 v2, v7, s2, v2 -; GFX10-NEXT: v_cmp_eq_u32_e64 s2, s4, 0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s2, s3, 0 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v2, s2 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v2, s0 @@ -6659,13 +6576,13 @@ ; GFX10-NEXT: v_lshlrev_b32_sdwa v13, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v15, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v16, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v1, v3, s3, v10 +; GFX10-NEXT: v_and_or_b32 v1, 0xff, v3, v10 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v6 -; GFX10-NEXT: v_and_or_b32 v6, v4, s3, v12 +; GFX10-NEXT: v_and_or_b32 v6, 0xff, v4, v12 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; GFX10-NEXT: v_and_or_b32 v10, v5, s3, v14 +; GFX10-NEXT: v_and_or_b32 v10, 0xff, v5, v14 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8 -; GFX10-NEXT: v_and_or_b32 v12, v2, s3, v0 +; GFX10-NEXT: v_and_or_b32 v12, 0xff, v2, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v9 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_mov_b32_e32 v5, 0 @@ -6969,50 +6886,48 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_dwordx4 v[4:7], v[0:1], off ; GFX10-NEXT: s_mov_b32 s0, 8 -; GFX10-NEXT: v_mov_b32_e32 v8, 8 +; GFX10-NEXT: v_mov_b32_e32 v1, 8 ; GFX10-NEXT: s_mov_b32 s1, 16 -; GFX10-NEXT: s_movk_i32 s2, 0xff ; GFX10-NEXT: v_and_b32_e32 v0, 3, v3 -; GFX10-NEXT: v_mov_b32_e32 v1, 0xff -; GFX10-NEXT: v_mov_b32_e32 v9, 16 +; GFX10-NEXT: v_mov_b32_e32 v8, 16 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 2, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 +; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v3 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v10, 24, v4 -; GFX10-NEXT: v_lshrrev_b32_e32 v11, 24, v5 -; GFX10-NEXT: 
v_lshlrev_b32_sdwa v14, s0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v16, s0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshrrev_b32_e32 v12, 24, v6 -; GFX10-NEXT: v_lshlrev_b32_sdwa v15, s1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v17, s1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v18, v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_and_or_b32 v4, v4, s2, v14 +; GFX10-NEXT: v_lshrrev_b32_e32 v9, 24, v4 +; GFX10-NEXT: v_lshrrev_b32_e32 v10, 24, v5 +; GFX10-NEXT: v_lshlrev_b32_sdwa v13, s0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v15, s0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshrrev_b32_e32 v11, 24, v6 +; GFX10-NEXT: v_lshlrev_b32_sdwa v14, s1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v16, s1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v17, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_and_or_b32 v4, v4, 0xff, v13 +; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; GFX10-NEXT: v_and_or_b32 v5, v5, 0xff, v15 ; GFX10-NEXT: v_lshlrev_b32_e32 v10, 24, v10 -; GFX10-NEXT: v_and_or_b32 v5, v5, s2, v16 +; GFX10-NEXT: v_lshrrev_b32_e32 v12, 24, v7 +; GFX10-NEXT: v_lshlrev_b32_sdwa v18, v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v19, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_and_or_b32 v6, 0xff, v6, v17 ; GFX10-NEXT: v_lshlrev_b32_e32 v11, 24, v11 -; GFX10-NEXT: v_lshrrev_b32_e32 v13, 24, v7 -; GFX10-NEXT: 
v_lshlrev_b32_sdwa v19, v9, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v20, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_and_or_b32 v6, v6, v1, v18 -; GFX10-NEXT: v_lshlrev_b32_e32 v12, 24, v12 -; GFX10-NEXT: v_or3_b32 v4, v4, v15, v10 -; GFX10-NEXT: v_or3_b32 v5, v5, v17, v11 -; GFX10-NEXT: v_lshlrev_b32_sdwa v14, v9, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v7, v7, v1, v20 -; GFX10-NEXT: v_lshlrev_b32_e32 v10, 24, v13 -; GFX10-NEXT: v_or3_b32 v6, v6, v19, v12 -; GFX10-NEXT: v_cndmask_b32_e32 v11, v4, v5, vcc_lo +; GFX10-NEXT: v_or3_b32 v4, v4, v14, v9 +; GFX10-NEXT: v_or3_b32 v5, v5, v16, v10 +; GFX10-NEXT: v_lshlrev_b32_sdwa v13, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_and_or_b32 v7, 0xff, v7, v19 +; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v12 +; GFX10-NEXT: v_or3_b32 v6, v6, v18, v11 +; GFX10-NEXT: v_cndmask_b32_e32 v10, v4, v5, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v12, v0, v1 -; GFX10-NEXT: v_or3_b32 v7, v7, v14, v10 +; GFX10-NEXT: v_lshlrev_b32_e64 v11, v0, 0xff +; GFX10-NEXT: v_or3_b32 v7, v7, v13, v9 ; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_cndmask_b32_e64 v10, v11, v6, s0 -; GFX10-NEXT: v_xor_b32_e32 v2, -1, v12 -; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v3 -; GFX10-NEXT: v_cndmask_b32_e64 v10, v10, v7, s1 -; GFX10-NEXT: v_and_or_b32 v0, v10, v2, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v9, v10, v6, s0 +; GFX10-NEXT: v_xor_b32_e32 v2, -1, v11 +; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v7, s1 +; GFX10-NEXT: v_and_or_b32 v0, v9, v2, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v4, v0, s2 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v0, s0 @@ -7020,29 
+6935,29 @@ ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v3 ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v4 -; GFX10-NEXT: v_lshrrev_b32_e32 v10, 24, v0 -; GFX10-NEXT: v_lshlrev_b32_sdwa v11, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v13, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v15, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v12, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v14, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v16, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v9, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v2, v2, v1, v11 -; GFX10-NEXT: v_lshlrev_b32_e32 v11, 24, v5 -; GFX10-NEXT: v_and_or_b32 v3, v3, v1, v13 +; GFX10-NEXT: v_lshrrev_b32_e32 v9, 24, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v10, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v12, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v14, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v11, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v13, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v15, v8, v4 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_and_or_b32 v2, 0xff, v2, v10 +; GFX10-NEXT: v_lshlrev_b32_e32 v10, 24, v5 +; GFX10-NEXT: v_and_or_b32 v3, 0xff, v3, v12 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6 -; GFX10-NEXT: v_and_or_b32 v13, v4, v1, v15 +; GFX10-NEXT: v_and_or_b32 v12, 0xff, v4, v14 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; GFX10-NEXT: v_and_or_b32 v8, v0, v1, v8 -; GFX10-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; GFX10-NEXT: v_and_or_b32 v14, 0xff, v0, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v9 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_mov_b32_e32 v5, 0 -; GFX10-NEXT: v_or3_b32 v0, v2, v12, v11 -; GFX10-NEXT: v_or3_b32 v1, v3, v14, v6 -; GFX10-NEXT: v_or3_b32 v2, v13, v16, v7 -; GFX10-NEXT: v_or3_b32 v3, v8, v9, v10 +; GFX10-NEXT: v_or3_b32 v0, v2, v11, v10 +; GFX10-NEXT: v_or3_b32 v1, v3, v13, v6 +; GFX10-NEXT: v_or3_b32 v2, v12, v15, v7 +; GFX10-NEXT: v_or3_b32 v3, v14, v8, v9 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX10-NEXT: s_endpgm %vec = load <16 x i8>, <16 x i8> addrspace(1)* %ptr diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll @@ -1261,10 +1261,9 @@ ; GFX10-LABEL: test_div_scale_f32_undef_val_val: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v0, 0x41000000 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s2, v0, v0, s0 +; GFX10-NEXT: v_div_scale_f32 v0, s2, 0x41000000, 0x41000000, s0 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float undef, float 8.0, i1 
false) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.a16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.a16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.a16.ll @@ -505,15 +505,14 @@ ; ; GFX10-LABEL: atomic_add_i32_3d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s6, s8 ; GFX10-NEXT: s_lshl_b32 s8, s0, 16 -; GFX10-NEXT: v_and_or_b32 v1, v1, v4, v2 -; GFX10-NEXT: v_and_or_b32 v2, v3, v4, s8 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v2 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v3, s8 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s3, s5 ; GFX10-NEXT: s_mov_b32 s5, s7 @@ -549,15 +548,14 @@ ; ; GFX10-LABEL: atomic_add_i32_cube: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s6, s8 ; GFX10-NEXT: s_lshl_b32 s8, s0, 16 -; GFX10-NEXT: v_and_or_b32 v1, v1, v4, v2 -; GFX10-NEXT: v_and_or_b32 v2, v3, v4, s8 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v2 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v3, s8 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s3, s5 ; GFX10-NEXT: s_mov_b32 s5, s7 @@ -632,15 +630,14 @@ ; ; GFX10-LABEL: atomic_add_i32_2darray: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s6, s8 ; GFX10-NEXT: s_lshl_b32 s8, s0, 16 -; GFX10-NEXT: v_and_or_b32 v1, v1, v4, v2 -; GFX10-NEXT: v_and_or_b32 v2, v3, v4, s8 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, 
v1, v2 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v3, s8 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s3, s5 ; GFX10-NEXT: s_mov_b32 s5, s7 @@ -676,15 +673,14 @@ ; ; GFX10-LABEL: atomic_add_i32_2dmsaa: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s6, s8 ; GFX10-NEXT: s_lshl_b32 s8, s0, 16 -; GFX10-NEXT: v_and_or_b32 v1, v1, v4, v2 -; GFX10-NEXT: v_and_or_b32 v2, v3, v4, s8 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v2 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v3, s8 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s3, s5 ; GFX10-NEXT: s_mov_b32 s5, s7 @@ -720,14 +716,13 @@ ; ; GFX10-LABEL: atomic_add_i32_2darraymsaa: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v5, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s2, s4 -; GFX10-NEXT: v_and_or_b32 v1, v1, v5, v2 -; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v4 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v2 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v3, v4 ; GFX10-NEXT: s_mov_b32 s3, s5 ; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s5, s7 @@ -1279,15 +1274,14 @@ ; ; GFX10-LABEL: atomic_add_i64_3d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v5, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s6, s8 ; GFX10-NEXT: s_lshl_b32 s8, s0, 16 -; GFX10-NEXT: v_and_or_b32 v2, v2, v5, v3 -; GFX10-NEXT: v_and_or_b32 v3, v4, v5, s8 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v2, v3 +; GFX10-NEXT: v_and_or_b32 v3, 0xffff, v4, s8 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s3, s5 ; GFX10-NEXT: s_mov_b32 s5, s7 @@ -1323,15 +1317,14 @@ ; ; GFX10-LABEL: 
atomic_add_i64_cube: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v5, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s6, s8 ; GFX10-NEXT: s_lshl_b32 s8, s0, 16 -; GFX10-NEXT: v_and_or_b32 v2, v2, v5, v3 -; GFX10-NEXT: v_and_or_b32 v3, v4, v5, s8 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v2, v3 +; GFX10-NEXT: v_and_or_b32 v3, 0xffff, v4, s8 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s3, s5 ; GFX10-NEXT: s_mov_b32 s5, s7 @@ -1406,15 +1399,14 @@ ; ; GFX10-LABEL: atomic_add_i64_2darray: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v5, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s6, s8 ; GFX10-NEXT: s_lshl_b32 s8, s0, 16 -; GFX10-NEXT: v_and_or_b32 v2, v2, v5, v3 -; GFX10-NEXT: v_and_or_b32 v3, v4, v5, s8 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v2, v3 +; GFX10-NEXT: v_and_or_b32 v3, 0xffff, v4, s8 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s3, s5 ; GFX10-NEXT: s_mov_b32 s5, s7 @@ -1450,15 +1442,14 @@ ; ; GFX10-LABEL: atomic_add_i64_2dmsaa: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v5, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s6, s8 ; GFX10-NEXT: s_lshl_b32 s8, s0, 16 -; GFX10-NEXT: v_and_or_b32 v2, v2, v5, v3 -; GFX10-NEXT: v_and_or_b32 v3, v4, v5, s8 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v2, v3 +; GFX10-NEXT: v_and_or_b32 v3, 0xffff, v4, s8 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s3, s5 ; GFX10-NEXT: s_mov_b32 s5, s7 @@ -1494,14 +1485,13 @@ ; ; GFX10-LABEL: atomic_add_i64_2darraymsaa: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v6, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX10-NEXT: 
v_lshlrev_b32_e32 v5, 16, v5 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s2, s4 -; GFX10-NEXT: v_and_or_b32 v2, v2, v6, v3 -; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v5 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v2, v3 +; GFX10-NEXT: v_and_or_b32 v3, 0xffff, v4, v5 ; GFX10-NEXT: s_mov_b32 s3, s5 ; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s5, s7 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll @@ -86,7 +86,6 @@ ; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10NSA-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX10NSA-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 ; GFX10NSA-NEXT: s_mov_b32 s4, s6 @@ -100,8 +99,8 @@ ; GFX10NSA-NEXT: s_mov_b32 s7, s9 ; GFX10NSA-NEXT: s_mov_b32 s9, s11 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 -; GFX10NSA-NEXT: v_and_or_b32 v0, v0, v3, v1 -; GFX10NSA-NEXT: v_and_or_b32 v1, v2, v3, s12 +; GFX10NSA-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX10NSA-NEXT: v_and_or_b32 v1, 0xffff, v2, s12 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_CUBE a16 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) @@ -143,7 +142,6 @@ ; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10NSA-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX10NSA-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 ; GFX10NSA-NEXT: s_mov_b32 s4, s6 @@ -157,8 +155,8 @@ ; GFX10NSA-NEXT: s_mov_b32 s7, s9 ; GFX10NSA-NEXT: s_mov_b32 s9, s11 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 -; GFX10NSA-NEXT: v_and_or_b32 v0, v0, v3, v1 -; GFX10NSA-NEXT: 
v_and_or_b32 v1, v2, v3, s12 +; GFX10NSA-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX10NSA-NEXT: v_and_or_b32 v1, 0xffff, v2, s12 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY a16 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) @@ -252,7 +250,6 @@ ; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10NSA-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX10NSA-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 ; GFX10NSA-NEXT: s_mov_b32 s4, s6 @@ -266,8 +263,8 @@ ; GFX10NSA-NEXT: s_mov_b32 s7, s9 ; GFX10NSA-NEXT: s_mov_b32 s9, s11 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 -; GFX10NSA-NEXT: v_and_or_b32 v0, v0, v3, v1 -; GFX10NSA-NEXT: v_and_or_b32 v1, v2, v3, s12 +; GFX10NSA-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX10NSA-NEXT: v_and_or_b32 v1, 0xffff, v2, s12 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4_cl v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) @@ -309,7 +306,6 @@ ; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10NSA-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10NSA-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 ; GFX10NSA-NEXT: s_mov_b32 s4, s6 @@ -323,8 +319,8 @@ ; GFX10NSA-NEXT: s_mov_b32 s7, s9 ; GFX10NSA-NEXT: s_mov_b32 s9, s11 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 -; GFX10NSA-NEXT: v_and_or_b32 v1, v1, v4, v2 -; GFX10NSA-NEXT: v_and_or_b32 v2, v3, v4, s12 +; GFX10NSA-NEXT: v_and_or_b32 v1, 0xffff, v1, v2 +; GFX10NSA-NEXT: v_and_or_b32 v2, 0xffff, v3, s12 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4_c_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) @@ -470,7 +466,6 @@ ; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: 
s_mov_b32 s0, s2 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10NSA-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10NSA-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 ; GFX10NSA-NEXT: s_mov_b32 s4, s6 @@ -484,8 +479,8 @@ ; GFX10NSA-NEXT: s_mov_b32 s7, s9 ; GFX10NSA-NEXT: s_mov_b32 s9, s11 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 -; GFX10NSA-NEXT: v_and_or_b32 v1, v1, v4, v2 -; GFX10NSA-NEXT: v_and_or_b32 v2, v3, v4, s12 +; GFX10NSA-NEXT: v_and_or_b32 v1, 0xffff, v1, v2 +; GFX10NSA-NEXT: v_and_or_b32 v2, 0xffff, v3, s12 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4_b_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) @@ -527,7 +522,6 @@ ; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10NSA-NEXT: v_mov_b32_e32 v5, 0xffff ; GFX10NSA-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 ; GFX10NSA-NEXT: s_mov_b32 s4, s6 @@ -541,8 +535,8 @@ ; GFX10NSA-NEXT: s_mov_b32 s7, s9 ; GFX10NSA-NEXT: s_mov_b32 s9, s11 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 -; GFX10NSA-NEXT: v_and_or_b32 v2, v2, v5, v3 -; GFX10NSA-NEXT: v_and_or_b32 v3, v4, v5, s12 +; GFX10NSA-NEXT: v_and_or_b32 v2, 0xffff, v2, v3 +; GFX10NSA-NEXT: v_and_or_b32 v3, 0xffff, v4, s12 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4_c_b_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) @@ -578,7 +572,6 @@ ; ; GFX10NSA-LABEL: gather4_l_2d: ; GFX10NSA: ; %bb.0: ; %main_body -; GFX10NSA-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX10NSA-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10NSA-NEXT: s_mov_b32 s0, s2 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 @@ -587,8 +580,8 @@ ; GFX10NSA-NEXT: s_mov_b32 s8, s10 ; GFX10NSA-NEXT: s_mov_b32 s10, s12 ; GFX10NSA-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10NSA-NEXT: v_and_or_b32 v0, v0, v3, v1 -; GFX10NSA-NEXT: v_and_or_b32 v1, v2, v3, s12 +; 
GFX10NSA-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX10NSA-NEXT: v_and_or_b32 v1, 0xffff, v2, s12 ; GFX10NSA-NEXT: s_mov_b32 s1, s3 ; GFX10NSA-NEXT: s_mov_b32 s3, s5 ; GFX10NSA-NEXT: s_mov_b32 s5, s7 @@ -629,7 +622,6 @@ ; ; GFX10NSA-LABEL: gather4_c_l_2d: ; GFX10NSA: ; %bb.0: ; %main_body -; GFX10NSA-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10NSA-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10NSA-NEXT: s_mov_b32 s0, s2 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 @@ -638,8 +630,8 @@ ; GFX10NSA-NEXT: s_mov_b32 s8, s10 ; GFX10NSA-NEXT: s_mov_b32 s10, s12 ; GFX10NSA-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10NSA-NEXT: v_and_or_b32 v1, v1, v4, v2 -; GFX10NSA-NEXT: v_and_or_b32 v2, v3, v4, s12 +; GFX10NSA-NEXT: v_and_or_b32 v1, 0xffff, v1, v2 +; GFX10NSA-NEXT: v_and_or_b32 v2, 0xffff, v3, s12 ; GFX10NSA-NEXT: s_mov_b32 s1, s3 ; GFX10NSA-NEXT: s_mov_b32 s3, s5 ; GFX10NSA-NEXT: s_mov_b32 s5, s7 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll @@ -508,12 +508,11 @@ ; GFX8-UNPACKED-NEXT: s_mov_b32 s6, s8 ; GFX8-UNPACKED-NEXT: s_mov_b32 s7, s9 ; GFX8-UNPACKED-NEXT: image_load v[0:2], v0, s[0:7] dmask:0x7 unorm d16 -; GFX8-UNPACKED-NEXT: s_mov_b32 s0, 0xffff -; GFX8-UNPACKED-NEXT: s_and_b32 s1, s0, s0 -; GFX8-UNPACKED-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-UNPACKED-NEXT: s_and_b32 s0, s0, 0xffff +; GFX8-UNPACKED-NEXT: s_lshl_b32 s0, s0, 16 +; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v3, s0 ; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0) -; GFX8-UNPACKED-NEXT: v_and_b32_e32 v4, s0, v1 +; GFX8-UNPACKED-NEXT: v_and_b32_e32 v4, 0xffff, v1 ; GFX8-UNPACKED-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-UNPACKED-NEXT: v_lshlrev_b32_e32 v2, 16, v4 ; GFX8-UNPACKED-NEXT: 
v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -530,8 +529,7 @@ ; GFX8-PACKED-NEXT: s_mov_b32 s6, s8 ; GFX8-PACKED-NEXT: s_mov_b32 s7, s9 ; GFX8-PACKED-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x7 unorm d16 -; GFX8-PACKED-NEXT: s_mov_b32 s0, 0xffff -; GFX8-PACKED-NEXT: s_and_b32 s0, s0, s0 +; GFX8-PACKED-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-PACKED-NEXT: s_lshl_b32 s0, s0, 16 ; GFX8-PACKED-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-PACKED-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll @@ -24,14 +24,13 @@ ; ; GFX10-LABEL: load_2darraymsaa_v4f32_xyzw: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s2, s4 -; GFX10-NEXT: v_and_or_b32 v0, v0, v4, v1 -; GFX10-NEXT: v_and_or_b32 v1, v2, v4, v3 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v2, v3 ; GFX10-NEXT: s_mov_b32 s3, s5 ; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s5, s7 @@ -79,17 +78,16 @@ ; GFX10-LABEL: load_2darraymsaa_v4f32_xyzw_tfe: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_mov_b32_e32 v5, 0 -; GFX10-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: v_mov_b32_e32 v6, v5 ; GFX10-NEXT: v_mov_b32_e32 v7, v5 ; GFX10-NEXT: v_mov_b32_e32 v8, v5 ; GFX10-NEXT: v_mov_b32_e32 v9, v5 -; GFX10-NEXT: v_and_or_b32 v10, v0, v4, v1 -; GFX10-NEXT: v_and_or_b32 v11, v2, v4, v3 -; GFX10-NEXT: s_mov_b32 s1, s3 +; 
GFX10-NEXT: v_and_or_b32 v10, 0xffff, v0, v1 +; GFX10-NEXT: v_and_or_b32 v11, 0xffff, v2, v3 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 ; GFX10-NEXT: s_mov_b32 s4, s6 @@ -148,17 +146,16 @@ ; GFX10-LABEL: load_2darraymsaa_v4f32_xyzw_tfe_lwe: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_mov_b32_e32 v5, 0 -; GFX10-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: v_mov_b32_e32 v6, v5 ; GFX10-NEXT: v_mov_b32_e32 v7, v5 ; GFX10-NEXT: v_mov_b32_e32 v8, v5 ; GFX10-NEXT: v_mov_b32_e32 v9, v5 -; GFX10-NEXT: v_and_or_b32 v10, v0, v4, v1 -; GFX10-NEXT: v_and_or_b32 v11, v2, v4, v3 -; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: v_and_or_b32 v10, 0xffff, v0, v1 +; GFX10-NEXT: v_and_or_b32 v11, 0xffff, v2, v3 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 ; GFX10-NEXT: s_mov_b32 s4, s6 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll @@ -24,15 +24,14 @@ ; ; GFX10-LABEL: load_3d_v4f32_xyzw: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_mov_b32_e32 v3, 0xffff -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s6, s8 ; GFX10-NEXT: s_lshl_b32 s8, s0, 16 -; GFX10-NEXT: v_and_or_b32 v0, v0, v3, v1 -; GFX10-NEXT: v_and_or_b32 v1, v2, v3, s8 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v3 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v2, s8 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s3, s5 ; GFX10-NEXT: s_mov_b32 s5, s7 @@ -79,7 +78,6 @@ ; GFX10-LABEL: load_3d_v4f32_xyzw_tfe: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_mov_b32_e32 v5, 0 
-; GFX10-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s2, s4 @@ -90,8 +88,8 @@ ; GFX10-NEXT: v_mov_b32_e32 v7, v5 ; GFX10-NEXT: v_mov_b32_e32 v8, v5 ; GFX10-NEXT: v_mov_b32_e32 v9, v5 -; GFX10-NEXT: v_and_or_b32 v10, v0, v3, v1 -; GFX10-NEXT: v_and_or_b32 v11, v2, v3, s8 +; GFX10-NEXT: v_and_or_b32 v10, 0xffff, v0, v1 +; GFX10-NEXT: v_and_or_b32 v11, 0xffff, v2, s8 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s3, s5 ; GFX10-NEXT: s_mov_b32 s5, s7 @@ -148,7 +146,6 @@ ; GFX10-LABEL: load_3d_v4f32_xyzw_tfe_lwe: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_mov_b32_e32 v5, 0 -; GFX10-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s2, s4 @@ -159,8 +156,8 @@ ; GFX10-NEXT: v_mov_b32_e32 v7, v5 ; GFX10-NEXT: v_mov_b32_e32 v8, v5 ; GFX10-NEXT: v_mov_b32_e32 v9, v5 -; GFX10-NEXT: v_and_or_b32 v10, v0, v3, v1 -; GFX10-NEXT: v_and_or_b32 v11, v2, v3, s8 +; GFX10-NEXT: v_and_or_b32 v10, 0xffff, v0, v1 +; GFX10-NEXT: v_and_or_b32 v11, 0xffff, v2, s8 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s3, s5 ; GFX10-NEXT: s_mov_b32 s5, s7 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll @@ -4,10 +4,9 @@ define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s) { ; GFX10-LABEL: sample_d_1d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX10-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10-NEXT: v_and_or_b32 v0, v0, v3, s12 -; GFX10-NEXT: v_and_or_b32 v1, v1, v3, s12 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, s12 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, s12 ; GFX10-NEXT: image_sample_d_g16 
v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -19,11 +18,10 @@ define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { ; GFX10-LABEL: sample_d_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v6, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX10-NEXT: v_and_or_b32 v0, v0, v6, v1 -; GFX10-NEXT: v_and_or_b32 v1, v2, v6, v3 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v2, v3 ; GFX10-NEXT: image_sample_d_g16 v[0:3], [v0, v1, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -37,14 +35,13 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v9, v2 ; GFX10-NEXT: v_mov_b32_e32 v10, v3 -; GFX10-NEXT: v_mov_b32_e32 v11, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX10-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10-NEXT: v_and_or_b32 v3, v9, v11, s12 -; GFX10-NEXT: v_and_or_b32 v2, v0, v11, v1 -; GFX10-NEXT: v_and_or_b32 v4, v10, v11, v4 -; GFX10-NEXT: v_and_or_b32 v5, v5, v11, s12 +; GFX10-NEXT: v_and_or_b32 v3, 0xffff, v9, s12 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v0, v1 +; GFX10-NEXT: v_and_or_b32 v4, 0xffff, v10, v4 +; GFX10-NEXT: v_and_or_b32 v5, 0xffff, v5, s12 ; GFX10-NEXT: image_sample_d_g16 v[0:3], v[2:8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -56,10 +53,9 @@ define amdgpu_ps <4 x float> @sample_c_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s) { ; GFX10-LABEL: sample_c_d_1d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10-NEXT: s_lshl_b32 s12, s0, 16 -; 
GFX10-NEXT: v_and_or_b32 v1, v1, v4, s12 -; GFX10-NEXT: v_and_or_b32 v2, v2, v4, s12 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, s12 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v2, s12 ; GFX10-NEXT: image_sample_c_d_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -71,11 +67,10 @@ define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { ; GFX10-LABEL: sample_c_d_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX10-NEXT: v_and_or_b32 v1, v1, v7, v2 -; GFX10-NEXT: v_and_or_b32 v2, v3, v7, v4 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v2 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v3, v4 ; GFX10-NEXT: image_sample_c_d_g16 v[0:3], [v0, v1, v2, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -87,10 +82,9 @@ define amdgpu_ps <4 x float> @sample_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s, float %clamp) { ; GFX10-LABEL: sample_d_cl_1d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10-NEXT: v_and_or_b32 v0, v0, v4, s12 -; GFX10-NEXT: v_and_or_b32 v1, v1, v4, s12 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, s12 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, s12 ; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -102,11 +96,10 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) { ; GFX10-LABEL: sample_d_cl_2d: ; 
GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX10-NEXT: v_and_or_b32 v0, v0, v7, v1 -; GFX10-NEXT: v_and_or_b32 v1, v2, v7, v3 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v2, v3 ; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v1, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -118,10 +111,9 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp) { ; GFX10-LABEL: sample_c_d_cl_1d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v5, 0xffff ; GFX10-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10-NEXT: v_and_or_b32 v1, v1, v5, s12 -; GFX10-NEXT: v_and_or_b32 v2, v2, v5, s12 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, s12 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v2, s12 ; GFX10-NEXT: image_sample_c_d_cl_g16 v[0:3], v[0:4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -136,11 +128,10 @@ ; GFX10-NEXT: v_mov_b32_e32 v8, v2 ; GFX10-NEXT: v_mov_b32_e32 v9, v3 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v8 -; GFX10-NEXT: v_and_or_b32 v4, v9, v0, v4 -; GFX10-NEXT: v_and_or_b32 v3, v1, v0, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v8 +; GFX10-NEXT: v_and_or_b32 v4, 0xffff, v9, v4 +; GFX10-NEXT: v_and_or_b32 v3, 0xffff, v1, v0 ; GFX10-NEXT: image_sample_c_d_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -152,10 +143,9 @@ define amdgpu_ps <4 x float> @sample_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, 
float %s) { ; GFX10-LABEL: sample_cd_1d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX10-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10-NEXT: v_and_or_b32 v0, v0, v3, s12 -; GFX10-NEXT: v_and_or_b32 v1, v1, v3, s12 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, s12 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, s12 ; GFX10-NEXT: image_sample_cd_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -167,11 +157,10 @@ define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { ; GFX10-LABEL: sample_cd_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v6, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX10-NEXT: v_and_or_b32 v0, v0, v6, v1 -; GFX10-NEXT: v_and_or_b32 v1, v2, v6, v3 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v2, v3 ; GFX10-NEXT: image_sample_cd_g16 v[0:3], [v0, v1, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -183,10 +172,9 @@ define amdgpu_ps <4 x float> @sample_c_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s) { ; GFX10-LABEL: sample_c_cd_1d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10-NEXT: v_and_or_b32 v1, v1, v4, s12 -; GFX10-NEXT: v_and_or_b32 v2, v2, v4, s12 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, s12 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v2, s12 ; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -198,11 +186,10 @@ define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> 
inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { ; GFX10-LABEL: sample_c_cd_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX10-NEXT: v_and_or_b32 v1, v1, v7, v2 -; GFX10-NEXT: v_and_or_b32 v2, v3, v7, v4 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v2 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v3, v4 ; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], [v0, v1, v2, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -214,10 +201,9 @@ define amdgpu_ps <4 x float> @sample_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s, float %clamp) { ; GFX10-LABEL: sample_cd_cl_1d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10-NEXT: v_and_or_b32 v0, v0, v4, s12 -; GFX10-NEXT: v_and_or_b32 v1, v1, v4, s12 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, s12 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, s12 ; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -229,11 +215,10 @@ define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) { ; GFX10-LABEL: sample_cd_cl_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX10-NEXT: v_and_or_b32 v0, v0, v7, v1 -; GFX10-NEXT: v_and_or_b32 v1, v2, v7, v3 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v2, v3 ; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v0, v1, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D 
; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -245,10 +230,9 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp) { ; GFX10-LABEL: sample_c_cd_cl_1d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v5, 0xffff ; GFX10-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10-NEXT: v_and_or_b32 v1, v1, v5, s12 -; GFX10-NEXT: v_and_or_b32 v2, v2, v5, s12 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, s12 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v2, s12 ; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], v[0:4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -263,11 +247,10 @@ ; GFX10-NEXT: v_mov_b32_e32 v8, v2 ; GFX10-NEXT: v_mov_b32_e32 v9, v3 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v8 -; GFX10-NEXT: v_and_or_b32 v4, v9, v0, v4 -; GFX10-NEXT: v_and_or_b32 v3, v1, v0, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v8 +; GFX10-NEXT: v_and_or_b32 v4, 0xffff, v9, v4 +; GFX10-NEXT: v_and_or_b32 v3, 0xffff, v1, v0 ; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -284,11 +267,10 @@ ; GFX10-NEXT: v_mov_b32_e32 v11, v4 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v9 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX10-NEXT: v_and_or_b32 v4, v10, v0, v1 -; GFX10-NEXT: v_and_or_b32 v5, v11, v0, v5 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v9 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX10-NEXT: v_and_or_b32 v4, 0xffff, v10, v0 +; GFX10-NEXT: v_and_or_b32 v5, 0xffff, v11, v1 ; GFX10-NEXT: image_sample_c_d_o_g16 v0, v[2:8], 
s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -305,11 +287,10 @@ ; GFX10-NEXT: v_mov_b32_e32 v11, v4 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v9 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX10-NEXT: v_and_or_b32 v4, v10, v0, v1 -; GFX10-NEXT: v_and_or_b32 v5, v11, v0, v5 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v9 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX10-NEXT: v_and_or_b32 v4, 0xffff, v10, v0 +; GFX10-NEXT: v_and_or_b32 v5, 0xffff, v11, v1 ; GFX10-NEXT: image_sample_c_d_o_g16 v[0:1], v[2:8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll @@ -61,15 +61,14 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> inreg %tdescr) { ; GCN-LABEL: image_bvh_intersect_ray_a16: ; GCN: ; %bb.0: -; GCN-NEXT: s_mov_b32 s4, 0xffff ; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v6 -; GCN-NEXT: v_and_b32_e32 v10, s4, v8 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v8 ; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_and_b32_e32 v9, s4, v9 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_and_or_b32 v5, v6, s4, v5 -; GCN-NEXT: v_and_or_b32 v6, v7, s4, v10 +; GCN-NEXT: v_and_or_b32 v5, v6, 0xffff, v5 +; GCN-NEXT: v_and_or_b32 v6, v7, 0xffff, v10 ; GCN-NEXT: v_lshl_or_b32 v7, v9, 16, v8 ; GCN-NEXT: 
image_bvh_intersect_ray v[0:3], v[0:7], s[0:3] a16 ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -126,15 +125,14 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> inreg %tdescr) { ; GCN-LABEL: image_bvh64_intersect_ray_a16: ; GCN: ; %bb.0: -; GCN-NEXT: s_mov_b32 s4, 0xffff ; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v7 -; GCN-NEXT: v_and_b32_e32 v11, s4, v9 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v9 ; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_and_b32_e32 v10, s4, v10 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_and_or_b32 v6, v7, s4, v6 -; GCN-NEXT: v_and_or_b32 v7, v8, s4, v11 +; GCN-NEXT: v_and_or_b32 v6, v7, 0xffff, v6 +; GCN-NEXT: v_and_or_b32 v7, v8, 0xffff, v11 ; GCN-NEXT: v_lshl_or_b32 v8, v10, 16, v9 ; GCN-NEXT: image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3] a16 ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -232,21 +230,20 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16_vgpr_descr(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr) { ; GFX1030-LABEL: image_bvh_intersect_ray_a16_vgpr_descr: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_mov_b32 s0, 0xffff ; GFX1030-NEXT: v_mov_b32_e32 v14, v0 ; GFX1030-NEXT: v_mov_b32_e32 v15, v1 ; GFX1030-NEXT: v_lshrrev_b32_e32 v0, 16, v6 -; GFX1030-NEXT: v_and_b32_e32 v1, s0, v8 +; GFX1030-NEXT: v_and_b32_e32 v1, 0xffff, v8 ; GFX1030-NEXT: v_mov_b32_e32 v16, v2 ; GFX1030-NEXT: v_mov_b32_e32 v17, v3 ; GFX1030-NEXT: v_lshrrev_b32_e32 v2, 16, v8 ; GFX1030-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX1030-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX1030-NEXT: v_and_b32_e32 v3, s0, v9 +; GFX1030-NEXT: v_and_b32_e32 v3, 0xffff, v9 ; GFX1030-NEXT: v_mov_b32_e32 v18, v4 ; GFX1030-NEXT: s_mov_b32 s1, exec_lo -; GFX1030-NEXT: v_and_or_b32 v19, v6, s0, v0 -; 
GFX1030-NEXT: v_and_or_b32 v20, v7, s0, v1 +; GFX1030-NEXT: v_and_or_b32 v19, v6, 0xffff, v0 +; GFX1030-NEXT: v_and_or_b32 v20, v7, 0xffff, v1 ; GFX1030-NEXT: v_lshl_or_b32 v21, v3, 16, v2 ; GFX1030-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX1030-NEXT: v_readfirstlane_b32 s4, v10 @@ -277,16 +274,15 @@ ; ; GFX1013-LABEL: image_bvh_intersect_ray_a16_vgpr_descr: ; GFX1013: ; %bb.0: -; GFX1013-NEXT: s_mov_b32 s0, 0xffff ; GFX1013-NEXT: v_lshrrev_b32_e32 v5, 16, v6 -; GFX1013-NEXT: v_and_b32_e32 v14, s0, v8 +; GFX1013-NEXT: v_and_b32_e32 v14, 0xffff, v8 ; GFX1013-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX1013-NEXT: v_and_b32_e32 v9, s0, v9 +; GFX1013-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; GFX1013-NEXT: s_mov_b32 s1, exec_lo ; GFX1013-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX1013-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX1013-NEXT: v_and_or_b32 v5, v6, s0, v5 -; GFX1013-NEXT: v_and_or_b32 v6, v7, s0, v14 +; GFX1013-NEXT: v_and_or_b32 v5, v6, 0xffff, v5 +; GFX1013-NEXT: v_and_or_b32 v6, v7, 0xffff, v14 ; GFX1013-NEXT: v_lshl_or_b32 v7, v9, 16, v8 ; GFX1013-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX1013-NEXT: v_readfirstlane_b32 s4, v10 @@ -407,21 +403,20 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr) { ; GFX1030-LABEL: image_bvh64_intersect_ray_a16_vgpr_descr: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_mov_b32 s0, 0xffff ; GFX1030-NEXT: v_mov_b32_e32 v15, v0 ; GFX1030-NEXT: v_mov_b32_e32 v16, v1 ; GFX1030-NEXT: v_lshrrev_b32_e32 v0, 16, v7 -; GFX1030-NEXT: v_and_b32_e32 v1, s0, v9 +; GFX1030-NEXT: v_and_b32_e32 v1, 0xffff, v9 ; GFX1030-NEXT: v_mov_b32_e32 v17, v2 ; GFX1030-NEXT: v_mov_b32_e32 v18, v3 ; GFX1030-NEXT: v_lshrrev_b32_e32 v2, 16, v9 ; GFX1030-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX1030-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX1030-NEXT: v_and_b32_e32 v3, s0, v10 +; GFX1030-NEXT: 
v_and_b32_e32 v3, 0xffff, v10 ; GFX1030-NEXT: v_mov_b32_e32 v19, v4 ; GFX1030-NEXT: v_mov_b32_e32 v20, v5 -; GFX1030-NEXT: v_and_or_b32 v21, v7, s0, v0 -; GFX1030-NEXT: v_and_or_b32 v22, v8, s0, v1 +; GFX1030-NEXT: v_and_or_b32 v21, v7, 0xffff, v0 +; GFX1030-NEXT: v_and_or_b32 v22, v8, 0xffff, v1 ; GFX1030-NEXT: v_lshl_or_b32 v23, v3, 16, v2 ; GFX1030-NEXT: s_mov_b32 s1, exec_lo ; GFX1030-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1 @@ -454,20 +449,19 @@ ; ; GFX1013-LABEL: image_bvh64_intersect_ray_a16_vgpr_descr: ; GFX1013: ; %bb.0: -; GFX1013-NEXT: s_mov_b32 s0, 0xffff ; GFX1013-NEXT: v_mov_b32_e32 v16, v11 ; GFX1013-NEXT: v_lshrrev_b32_e32 v6, 16, v7 -; GFX1013-NEXT: v_and_b32_e32 v11, s0, v9 +; GFX1013-NEXT: v_and_b32_e32 v11, 0xffff, v9 ; GFX1013-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GFX1013-NEXT: v_and_b32_e32 v10, s0, v10 +; GFX1013-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; GFX1013-NEXT: v_mov_b32_e32 v17, v12 ; GFX1013-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; GFX1013-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; GFX1013-NEXT: v_mov_b32_e32 v18, v13 ; GFX1013-NEXT: v_mov_b32_e32 v19, v14 ; GFX1013-NEXT: s_mov_b32 s1, exec_lo -; GFX1013-NEXT: v_and_or_b32 v6, v7, s0, v6 -; GFX1013-NEXT: v_and_or_b32 v7, v8, s0, v11 +; GFX1013-NEXT: v_and_or_b32 v6, v7, 0xffff, v6 +; GFX1013-NEXT: v_and_or_b32 v7, v8, 0xffff, v11 ; GFX1013-NEXT: v_lshl_or_b32 v8, v10, 16, v9 ; GFX1013-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1 ; GFX1013-NEXT: v_readfirstlane_b32 s4, v16 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll @@ -757,10 +757,9 @@ ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX6-NEXT: s_mov_b32 s1, 0x80000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_bfe_i32 s0, s0, s1 -; GFX6-NEXT: s_bfe_i32 s0, 
s0, s1 +; GFX6-NEXT: s_bfe_i32 s0, s0, 0x80000 +; GFX6-NEXT: s_bfe_i32 s0, s0, 0x80000 ; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll @@ -66,17 +66,16 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_mov_b32 s4, 8 -; GFX10-NEXT: s_movk_i32 s5, 0xff ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_and_or_b32 v0, v0, s5, v1 -; GFX10-NEXT: v_and_b32_e32 v1, s5, v2 -; GFX10-NEXT: v_and_b32_e32 v2, s5, v3 +; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v1 +; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xff, v3 ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, s4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_and_b32_e32 v5, s5, v6 -; GFX10-NEXT: v_and_b32_e32 v6, s5, v7 +; GFX10-NEXT: v_and_b32_e32 v5, 0xff, v6 +; GFX10-NEXT: v_and_b32_e32 v6, 0xff, v7 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX10-NEXT: v_and_or_b32 v3, v4, s5, v3 +; GFX10-NEXT: v_and_or_b32 v3, v4, 0xff, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v6 ; GFX10-NEXT: v_or3_b32 v0, v0, v1, v2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll @@ -66,17 +66,16 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_mov_b32 s4, 8 -; GFX10-NEXT: s_movk_i32 s5, 0xff 
; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_and_or_b32 v0, v0, s5, v1 -; GFX10-NEXT: v_and_b32_e32 v1, s5, v2 -; GFX10-NEXT: v_and_b32_e32 v2, s5, v3 +; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v1 +; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xff, v3 ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, s4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_and_b32_e32 v5, s5, v6 -; GFX10-NEXT: v_and_b32_e32 v6, s5, v7 +; GFX10-NEXT: v_and_b32_e32 v5, 0xff, v6 +; GFX10-NEXT: v_and_b32_e32 v6, 0xff, v7 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX10-NEXT: v_and_or_b32 v3, v4, s5, v3 +; GFX10-NEXT: v_and_or_b32 v3, v4, 0xff, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v6 ; GFX10-NEXT: v_or3_b32 v0, v0, v1, v2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll @@ -115,9 +115,8 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_mov_b32 s4, 0xffffff -; GFX10-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX10-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffffff, v1 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffffff, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = lshr i24 %value, %amount @@ -627,9 +626,8 @@ define amdgpu_ps half @lshr_i16_sv(i16 inreg %value, i16 %amount) { ; GFX6-LABEL: lshr_i16_sv: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s1, 0xffff -; GFX6-NEXT: v_and_b32_e32 v0, s1, v0 -; GFX6-NEXT: s_and_b32 s0, s0, s1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: v_lshr_b32_e32 v0, s0, v0 ; GFX6-NEXT: ; return to shader part 
epilog ; @@ -655,9 +653,8 @@ define amdgpu_ps half @lshr_i16_vs(i16 %value, i16 inreg %amount) { ; GFX6-LABEL: lshr_i16_vs: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s1, 0xffff -; GFX6-NEXT: s_and_b32 s0, s0, s1 -; GFX6-NEXT: v_and_b32_e32 v0, s1, v0 +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, s0, v0 ; GFX6-NEXT: ; return to shader part epilog ; @@ -753,9 +750,8 @@ define amdgpu_ps i32 @s_lshr_v2i16(<2 x i16> inreg %value, <2 x i16> inreg %amount) { ; GFX6-LABEL: s_lshr_v2i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s4, 0xffff -; GFX6-NEXT: s_and_b32 s1, s1, s4 -; GFX6-NEXT: s_and_b32 s0, s0, s4 +; GFX6-NEXT: s_and_b32 s1, s1, 0xffff +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: s_lshr_b32 s1, s1, s3 ; GFX6-NEXT: s_lshr_b32 s0, s0, s2 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 @@ -764,14 +760,13 @@ ; ; GFX8-LABEL: s_lshr_v2i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b32 s3, 0xffff ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, s3 -; GFX8-NEXT: s_lshr_b32 s4, s1, 16 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff +; GFX8-NEXT: s_lshr_b32 s3, s1, 16 ; GFX8-NEXT: s_lshr_b32 s0, s0, s1 -; GFX8-NEXT: s_lshr_b32 s1, s2, s4 +; GFX8-NEXT: s_lshr_b32 s1, s2, s3 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_and_b32 s0, s0, s3 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_or_b32 s0, s1, s0 ; GFX8-NEXT: ; return to shader part epilog ; @@ -804,10 +799,10 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_mov_b32 s2, 0xffff ; GFX6-NEXT: v_and_b32_e32 v0, s2, v0 -; GFX6-NEXT: s_and_b32 s0, s0, s2 +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: v_lshr_b32_e32 v0, s0, v0 ; GFX6-NEXT: v_and_b32_e32 v1, s2, v1 -; GFX6-NEXT: s_and_b32 s0, s1, s2 +; GFX6-NEXT: s_and_b32 s0, s1, 0xffff ; GFX6-NEXT: v_lshr_b32_e32 v1, s0, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 @@ -840,10 +835,10 @@ ; GFX6-LABEL: lshr_v2i16_vs: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_mov_b32 s2, 0xffff -; 
GFX6-NEXT: s_and_b32 s0, s0, s2 +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: v_and_b32_e32 v0, s2, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, s0, v0 -; GFX6-NEXT: s_and_b32 s0, s1, s2 +; GFX6-NEXT: s_and_b32 s0, s1, 0xffff ; GFX6-NEXT: v_and_b32_e32 v1, s2, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, s0, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -940,13 +935,12 @@ define amdgpu_ps <2 x i32> @s_lshr_v4i16(<4 x i16> inreg %value, <4 x i16> inreg %amount) { ; GFX6-LABEL: s_lshr_v4i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s8, 0xffff -; GFX6-NEXT: s_and_b32 s1, s1, s8 -; GFX6-NEXT: s_and_b32 s0, s0, s8 +; GFX6-NEXT: s_and_b32 s1, s1, 0xffff +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: s_lshr_b32 s1, s1, s5 -; GFX6-NEXT: s_and_b32 s3, s3, s8 +; GFX6-NEXT: s_and_b32 s3, s3, 0xffff ; GFX6-NEXT: s_lshr_b32 s0, s0, s4 -; GFX6-NEXT: s_and_b32 s2, s2, s8 +; GFX6-NEXT: s_and_b32 s2, s2, 0xffff ; GFX6-NEXT: s_lshr_b32 s3, s3, s7 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_lshr_b32 s2, s2, s6 @@ -957,36 +951,34 @@ ; ; GFX8-LABEL: s_lshr_v4i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b32 s6, 0xffff ; GFX8-NEXT: s_lshr_b32 s4, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, s6 -; GFX8-NEXT: s_lshr_b32 s7, s2, 16 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff +; GFX8-NEXT: s_lshr_b32 s6, s2, 16 ; GFX8-NEXT: s_lshr_b32 s5, s1, 16 -; GFX8-NEXT: s_and_b32 s1, s1, s6 -; GFX8-NEXT: s_lshr_b32 s8, s3, 16 +; GFX8-NEXT: s_and_b32 s1, s1, 0xffff +; GFX8-NEXT: s_lshr_b32 s7, s3, 16 ; GFX8-NEXT: s_lshr_b32 s0, s0, s2 -; GFX8-NEXT: s_lshr_b32 s2, s4, s7 +; GFX8-NEXT: s_lshr_b32 s2, s4, s6 ; GFX8-NEXT: s_lshr_b32 s1, s1, s3 -; GFX8-NEXT: s_lshr_b32 s3, s5, s8 +; GFX8-NEXT: s_lshr_b32 s3, s5, s7 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16 -; GFX8-NEXT: s_and_b32 s0, s0, s6 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_or_b32 s0, s2, s0 ; GFX8-NEXT: s_lshl_b32 s2, s3, 16 -; GFX8-NEXT: s_and_b32 s1, s1, s6 +; GFX8-NEXT: s_and_b32 s1, s1, 0xffff ; GFX8-NEXT: s_or_b32 s1, s2, s1 ; GFX8-NEXT: ; return 
to shader part epilog ; ; GFX9-LABEL: s_lshr_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s5, 0xffff ; GFX9-NEXT: s_lshr_b32 s4, s0, 16 -; GFX9-NEXT: s_and_b32 s0, s0, s5 -; GFX9-NEXT: s_lshr_b32 s6, s2, 16 +; GFX9-NEXT: s_and_b32 s0, s0, 0xffff +; GFX9-NEXT: s_lshr_b32 s5, s2, 16 ; GFX9-NEXT: s_lshr_b32 s0, s0, s2 -; GFX9-NEXT: s_lshr_b32 s2, s4, s6 +; GFX9-NEXT: s_lshr_b32 s2, s4, s5 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 ; GFX9-NEXT: s_lshr_b32 s2, s1, 16 -; GFX9-NEXT: s_and_b32 s1, s1, s5 +; GFX9-NEXT: s_and_b32 s1, s1, 0xffff ; GFX9-NEXT: s_lshr_b32 s4, s3, 16 ; GFX9-NEXT: s_lshr_b32 s1, s1, s3 ; GFX9-NEXT: s_lshr_b32 s2, s2, s4 @@ -995,17 +987,16 @@ ; ; GFX10-LABEL: s_lshr_v4i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s4, 0xffff -; GFX10-NEXT: s_lshr_b32 s5, s0, 16 -; GFX10-NEXT: s_and_b32 s0, s0, s4 -; GFX10-NEXT: s_lshr_b32 s6, s2, 16 +; GFX10-NEXT: s_lshr_b32 s4, s0, 16 +; GFX10-NEXT: s_and_b32 s0, s0, 0xffff +; GFX10-NEXT: s_lshr_b32 s5, s2, 16 ; GFX10-NEXT: s_lshr_b32 s0, s0, s2 -; GFX10-NEXT: s_lshr_b32 s2, s5, s6 -; GFX10-NEXT: s_lshr_b32 s5, s1, 16 -; GFX10-NEXT: s_and_b32 s1, s1, s4 -; GFX10-NEXT: s_lshr_b32 s4, s3, 16 +; GFX10-NEXT: s_lshr_b32 s2, s4, s5 +; GFX10-NEXT: s_lshr_b32 s4, s1, 16 +; GFX10-NEXT: s_and_b32 s1, s1, 0xffff +; GFX10-NEXT: s_lshr_b32 s5, s3, 16 ; GFX10-NEXT: s_lshr_b32 s1, s1, s3 -; GFX10-NEXT: s_lshr_b32 s3, s5, s4 +; GFX10-NEXT: s_lshr_b32 s3, s4, s5 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s3 ; GFX10-NEXT: ; return to shader part epilog @@ -1120,21 +1111,20 @@ define amdgpu_ps <4 x i32> @s_lshr_v8i16(<8 x i16> inreg %value, <8 x i16> inreg %amount) { ; GFX6-LABEL: s_lshr_v8i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s16, 0xffff -; GFX6-NEXT: s_and_b32 s1, s1, s16 -; GFX6-NEXT: s_and_b32 s0, s0, s16 +; GFX6-NEXT: s_and_b32 s1, s1, 0xffff +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: s_lshr_b32 s1, s1, s9 -; GFX6-NEXT: s_and_b32 s3, s3, s16 +; GFX6-NEXT: s_and_b32 s3, 
s3, 0xffff ; GFX6-NEXT: s_lshr_b32 s0, s0, s8 -; GFX6-NEXT: s_and_b32 s2, s2, s16 +; GFX6-NEXT: s_and_b32 s2, s2, 0xffff ; GFX6-NEXT: s_lshr_b32 s3, s3, s11 -; GFX6-NEXT: s_and_b32 s5, s5, s16 -; GFX6-NEXT: s_and_b32 s7, s7, s16 +; GFX6-NEXT: s_and_b32 s5, s5, 0xffff +; GFX6-NEXT: s_and_b32 s7, s7, 0xffff ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_lshr_b32 s2, s2, s10 -; GFX6-NEXT: s_and_b32 s4, s4, s16 +; GFX6-NEXT: s_and_b32 s4, s4, 0xffff ; GFX6-NEXT: s_lshr_b32 s5, s5, s13 -; GFX6-NEXT: s_and_b32 s6, s6, s16 +; GFX6-NEXT: s_and_b32 s6, s6, 0xffff ; GFX6-NEXT: s_lshr_b32 s7, s7, s15 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: s_lshl_b32 s1, s3, 16 @@ -1149,64 +1139,62 @@ ; ; GFX8-LABEL: s_lshr_v8i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b32 s12, 0xffff ; GFX8-NEXT: s_lshr_b32 s8, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, s12 -; GFX8-NEXT: s_lshr_b32 s13, s4, 16 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff +; GFX8-NEXT: s_lshr_b32 s12, s4, 16 ; GFX8-NEXT: s_lshr_b32 s9, s1, 16 -; GFX8-NEXT: s_and_b32 s1, s1, s12 -; GFX8-NEXT: s_lshr_b32 s14, s5, 16 +; GFX8-NEXT: s_and_b32 s1, s1, 0xffff +; GFX8-NEXT: s_lshr_b32 s13, s5, 16 ; GFX8-NEXT: s_lshr_b32 s0, s0, s4 -; GFX8-NEXT: s_lshr_b32 s4, s8, s13 +; GFX8-NEXT: s_lshr_b32 s4, s8, s12 ; GFX8-NEXT: s_lshr_b32 s10, s2, 16 -; GFX8-NEXT: s_and_b32 s2, s2, s12 -; GFX8-NEXT: s_lshr_b32 s15, s6, 16 +; GFX8-NEXT: s_and_b32 s2, s2, 0xffff +; GFX8-NEXT: s_lshr_b32 s14, s6, 16 ; GFX8-NEXT: s_lshr_b32 s1, s1, s5 -; GFX8-NEXT: s_lshr_b32 s5, s9, s14 +; GFX8-NEXT: s_lshr_b32 s5, s9, s13 ; GFX8-NEXT: s_lshl_b32 s4, s4, 16 -; GFX8-NEXT: s_and_b32 s0, s0, s12 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_lshr_b32 s11, s3, 16 -; GFX8-NEXT: s_and_b32 s3, s3, s12 -; GFX8-NEXT: s_lshr_b32 s16, s7, 16 +; GFX8-NEXT: s_and_b32 s3, s3, 0xffff +; GFX8-NEXT: s_lshr_b32 s15, s7, 16 ; GFX8-NEXT: s_lshr_b32 s2, s2, s6 -; GFX8-NEXT: s_lshr_b32 s6, s10, s15 +; GFX8-NEXT: s_lshr_b32 s6, s10, s14 ; GFX8-NEXT: s_or_b32 s0, s4, s0 ; GFX8-NEXT: 
s_lshl_b32 s4, s5, 16 -; GFX8-NEXT: s_and_b32 s1, s1, s12 +; GFX8-NEXT: s_and_b32 s1, s1, 0xffff ; GFX8-NEXT: s_lshr_b32 s3, s3, s7 -; GFX8-NEXT: s_lshr_b32 s7, s11, s16 +; GFX8-NEXT: s_lshr_b32 s7, s11, s15 ; GFX8-NEXT: s_or_b32 s1, s4, s1 ; GFX8-NEXT: s_lshl_b32 s4, s6, 16 -; GFX8-NEXT: s_and_b32 s2, s2, s12 +; GFX8-NEXT: s_and_b32 s2, s2, 0xffff ; GFX8-NEXT: s_or_b32 s2, s4, s2 ; GFX8-NEXT: s_lshl_b32 s4, s7, 16 -; GFX8-NEXT: s_and_b32 s3, s3, s12 +; GFX8-NEXT: s_and_b32 s3, s3, 0xffff ; GFX8-NEXT: s_or_b32 s3, s4, s3 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_lshr_v8i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s9, 0xffff ; GFX9-NEXT: s_lshr_b32 s8, s0, 16 -; GFX9-NEXT: s_and_b32 s0, s0, s9 -; GFX9-NEXT: s_lshr_b32 s10, s4, 16 +; GFX9-NEXT: s_and_b32 s0, s0, 0xffff +; GFX9-NEXT: s_lshr_b32 s9, s4, 16 ; GFX9-NEXT: s_lshr_b32 s0, s0, s4 -; GFX9-NEXT: s_lshr_b32 s4, s8, s10 +; GFX9-NEXT: s_lshr_b32 s4, s8, s9 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s4 ; GFX9-NEXT: s_lshr_b32 s4, s1, 16 -; GFX9-NEXT: s_and_b32 s1, s1, s9 +; GFX9-NEXT: s_and_b32 s1, s1, 0xffff ; GFX9-NEXT: s_lshr_b32 s8, s5, 16 ; GFX9-NEXT: s_lshr_b32 s1, s1, s5 ; GFX9-NEXT: s_lshr_b32 s4, s4, s8 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s4 ; GFX9-NEXT: s_lshr_b32 s4, s2, 16 -; GFX9-NEXT: s_and_b32 s2, s2, s9 +; GFX9-NEXT: s_and_b32 s2, s2, 0xffff ; GFX9-NEXT: s_lshr_b32 s5, s6, 16 ; GFX9-NEXT: s_lshr_b32 s2, s2, s6 ; GFX9-NEXT: s_lshr_b32 s4, s4, s5 ; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s4 ; GFX9-NEXT: s_lshr_b32 s4, s3, 16 -; GFX9-NEXT: s_and_b32 s3, s3, s9 +; GFX9-NEXT: s_and_b32 s3, s3, 0xffff ; GFX9-NEXT: s_lshr_b32 s5, s7, 16 ; GFX9-NEXT: s_lshr_b32 s3, s3, s7 ; GFX9-NEXT: s_lshr_b32 s4, s4, s5 @@ -1215,26 +1203,25 @@ ; ; GFX10-LABEL: s_lshr_v8i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s8, 0xffff -; GFX10-NEXT: s_lshr_b32 s9, s0, 16 -; GFX10-NEXT: s_and_b32 s0, s0, s8 -; GFX10-NEXT: s_lshr_b32 s10, s4, 16 +; GFX10-NEXT: s_lshr_b32 s8, s0, 16 +; GFX10-NEXT: s_and_b32 s0, 
s0, 0xffff +; GFX10-NEXT: s_lshr_b32 s9, s4, 16 ; GFX10-NEXT: s_lshr_b32 s0, s0, s4 -; GFX10-NEXT: s_lshr_b32 s4, s9, s10 -; GFX10-NEXT: s_lshr_b32 s9, s1, 16 -; GFX10-NEXT: s_and_b32 s1, s1, s8 -; GFX10-NEXT: s_lshr_b32 s10, s5, 16 +; GFX10-NEXT: s_lshr_b32 s4, s8, s9 +; GFX10-NEXT: s_lshr_b32 s8, s1, 16 +; GFX10-NEXT: s_and_b32 s1, s1, 0xffff +; GFX10-NEXT: s_lshr_b32 s9, s5, 16 ; GFX10-NEXT: s_lshr_b32 s1, s1, s5 -; GFX10-NEXT: s_lshr_b32 s5, s9, s10 +; GFX10-NEXT: s_lshr_b32 s5, s8, s9 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s4 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5 ; GFX10-NEXT: s_lshr_b32 s4, s2, 16 -; GFX10-NEXT: s_and_b32 s2, s2, s8 +; GFX10-NEXT: s_and_b32 s2, s2, 0xffff ; GFX10-NEXT: s_lshr_b32 s5, s6, 16 ; GFX10-NEXT: s_lshr_b32 s2, s2, s6 ; GFX10-NEXT: s_lshr_b32 s4, s4, s5 ; GFX10-NEXT: s_lshr_b32 s5, s3, 16 -; GFX10-NEXT: s_and_b32 s3, s3, s8 +; GFX10-NEXT: s_and_b32 s3, s3, 0xffff ; GFX10-NEXT: s_lshr_b32 s6, s7, 16 ; GFX10-NEXT: s_lshr_b32 s3, s3, s7 ; GFX10-NEXT: s_lshr_b32 s5, s5, s6 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll @@ -12,25 +12,22 @@ ; ; GFX8-LABEL: s_mul_i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b32 s2, 0xffff -; GFX8-NEXT: s_and_b32 s0, s0, s2 -; GFX8-NEXT: s_and_b32 s1, s1, s2 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff +; GFX8-NEXT: s_and_b32 s1, s1, 0xffff ; GFX8-NEXT: s_mul_i32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_mul_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s2, 0xffff -; GFX9-NEXT: s_and_b32 s0, s0, s2 -; GFX9-NEXT: s_and_b32 s1, s1, s2 +; GFX9-NEXT: s_and_b32 s0, s0, 0xffff +; GFX9-NEXT: s_and_b32 s1, s1, 0xffff ; GFX9-NEXT: s_mul_i32 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_mul_i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s2, 0xffff -; GFX10-NEXT: s_and_b32 s0, s0, s2 -; GFX10-NEXT: s_and_b32 s1, 
s1, s2 +; GFX10-NEXT: s_and_b32 s0, s0, 0xffff +; GFX10-NEXT: s_and_b32 s1, s1, 0xffff ; GFX10-NEXT: s_mul_i32 s0, s0, s1 ; GFX10-NEXT: ; return to shader part epilog %result = mul i16 %num, %den @@ -78,29 +75,26 @@ ; ; GFX8-LABEL: s_mul_i16_zeroext: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b32 s2, 0xffff -; GFX8-NEXT: s_and_b32 s0, s0, s2 -; GFX8-NEXT: s_and_b32 s1, s1, s2 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff +; GFX8-NEXT: s_and_b32 s1, s1, 0xffff ; GFX8-NEXT: s_mul_i32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s0, s0, s2 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_mul_i16_zeroext: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s2, 0xffff -; GFX9-NEXT: s_and_b32 s0, s0, s2 -; GFX9-NEXT: s_and_b32 s1, s1, s2 +; GFX9-NEXT: s_and_b32 s0, s0, 0xffff +; GFX9-NEXT: s_and_b32 s1, s1, 0xffff ; GFX9-NEXT: s_mul_i32 s0, s0, s1 -; GFX9-NEXT: s_and_b32 s0, s0, s2 +; GFX9-NEXT: s_and_b32 s0, s0, 0xffff ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_mul_i16_zeroext: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s2, 0xffff -; GFX10-NEXT: s_and_b32 s0, s0, s2 -; GFX10-NEXT: s_and_b32 s1, s1, s2 +; GFX10-NEXT: s_and_b32 s0, s0, 0xffff +; GFX10-NEXT: s_and_b32 s1, s1, 0xffff ; GFX10-NEXT: s_mul_i32 s0, s0, s1 -; GFX10-NEXT: s_and_b32 s0, s0, s2 +; GFX10-NEXT: s_and_b32 s0, s0, 0xffff ; GFX10-NEXT: ; return to shader part epilog %result = mul i16 %num, %den ret i16 %result @@ -146,27 +140,24 @@ ; ; GFX8-LABEL: s_mul_i16_signext: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b32 s2, 0xffff -; GFX8-NEXT: s_and_b32 s0, s0, s2 -; GFX8-NEXT: s_and_b32 s1, s1, s2 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff +; GFX8-NEXT: s_and_b32 s1, s1, 0xffff ; GFX8-NEXT: s_mul_i32 s0, s0, s1 ; GFX8-NEXT: s_sext_i32_i16 s0, s0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_mul_i16_signext: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s2, 0xffff -; GFX9-NEXT: s_and_b32 s0, s0, s2 -; GFX9-NEXT: s_and_b32 s1, s1, s2 +; GFX9-NEXT: s_and_b32 s0, s0, 0xffff 
+; GFX9-NEXT: s_and_b32 s1, s1, 0xffff ; GFX9-NEXT: s_mul_i32 s0, s0, s1 ; GFX9-NEXT: s_sext_i32_i16 s0, s0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_mul_i16_signext: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s2, 0xffff -; GFX10-NEXT: s_and_b32 s0, s0, s2 -; GFX10-NEXT: s_and_b32 s1, s1, s2 +; GFX10-NEXT: s_and_b32 s0, s0, 0xffff +; GFX10-NEXT: s_and_b32 s1, s1, 0xffff ; GFX10-NEXT: s_mul_i32 s0, s0, s1 ; GFX10-NEXT: s_sext_i32_i16 s0, s0 ; GFX10-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll @@ -429,13 +429,12 @@ define amdgpu_ps i32 @s_orn2_v2i16(<2 x i16> inreg %src0, <2 x i16> inreg %src1) { ; GFX6-LABEL: s_orn2_v2i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s1, 0xffff ; GFX6-NEXT: s_lshl_b32 s0, s3, 16 -; GFX6-NEXT: s_and_b32 s2, s2, s1 -; GFX6-NEXT: s_or_b32 s0, s0, s2 -; GFX6-NEXT: s_lshl_b32 s2, s5, 16 -; GFX6-NEXT: s_and_b32 s1, s4, s1 -; GFX6-NEXT: s_or_b32 s1, s2, s1 +; GFX6-NEXT: s_and_b32 s1, s2, 0xffff +; GFX6-NEXT: s_or_b32 s0, s0, s1 +; GFX6-NEXT: s_lshl_b32 s1, s5, 16 +; GFX6-NEXT: s_and_b32 s2, s4, 0xffff +; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: s_xor_b32 s1, s1, -1 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: ; return to shader part epilog @@ -458,13 +457,12 @@ define amdgpu_ps i32 @s_orn2_v2i16_commute(<2 x i16> inreg %src0, <2 x i16> inreg %src1) { ; GFX6-LABEL: s_orn2_v2i16_commute: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s1, 0xffff ; GFX6-NEXT: s_lshl_b32 s0, s3, 16 -; GFX6-NEXT: s_and_b32 s2, s2, s1 -; GFX6-NEXT: s_or_b32 s0, s0, s2 -; GFX6-NEXT: s_lshl_b32 s2, s5, 16 -; GFX6-NEXT: s_and_b32 s1, s4, s1 -; GFX6-NEXT: s_or_b32 s1, s2, s1 +; GFX6-NEXT: s_and_b32 s1, s2, 0xffff +; GFX6-NEXT: s_or_b32 s0, s0, s1 +; GFX6-NEXT: s_lshl_b32 s1, s5, 16 +; GFX6-NEXT: s_and_b32 s2, s4, 0xffff +; GFX6-NEXT: s_or_b32 s1, s1, s2 ; 
GFX6-NEXT: s_xor_b32 s1, s1, -1 ; GFX6-NEXT: s_or_b32 s0, s1, s0 ; GFX6-NEXT: ; return to shader part epilog @@ -487,13 +485,12 @@ define amdgpu_ps { i32, i32 } @s_orn2_v2i16_multi_use(<2 x i16> inreg %src0, <2 x i16> inreg %src1) { ; GFX6-LABEL: s_orn2_v2i16_multi_use: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s1, 0xffff ; GFX6-NEXT: s_lshl_b32 s0, s3, 16 -; GFX6-NEXT: s_and_b32 s2, s2, s1 -; GFX6-NEXT: s_or_b32 s0, s0, s2 -; GFX6-NEXT: s_lshl_b32 s2, s5, 16 -; GFX6-NEXT: s_and_b32 s1, s4, s1 -; GFX6-NEXT: s_or_b32 s1, s2, s1 +; GFX6-NEXT: s_and_b32 s1, s2, 0xffff +; GFX6-NEXT: s_or_b32 s0, s0, s1 +; GFX6-NEXT: s_lshl_b32 s1, s5, 16 +; GFX6-NEXT: s_and_b32 s2, s4, 0xffff +; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: s_xor_b32 s1, s1, -1 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: ; return to shader part epilog @@ -522,19 +519,18 @@ define amdgpu_ps { i32, i32 } @s_orn2_v2i16_multi_foldable_use(<2 x i16> inreg %src0, <2 x i16> inreg %src1, <2 x i16> inreg %src2) { ; GFX6-LABEL: s_orn2_v2i16_multi_foldable_use: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s1, 0xffff ; GFX6-NEXT: s_lshl_b32 s0, s3, 16 -; GFX6-NEXT: s_and_b32 s2, s2, s1 -; GFX6-NEXT: s_or_b32 s0, s0, s2 -; GFX6-NEXT: s_lshl_b32 s2, s5, 16 -; GFX6-NEXT: s_and_b32 s3, s4, s1 -; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_lshl_b32 s3, s7, 16 -; GFX6-NEXT: s_and_b32 s1, s6, s1 -; GFX6-NEXT: s_or_b32 s1, s3, s1 -; GFX6-NEXT: s_xor_b32 s1, s1, -1 +; GFX6-NEXT: s_and_b32 s1, s2, 0xffff ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_or_b32 s1, s2, s1 +; GFX6-NEXT: s_lshl_b32 s1, s5, 16 +; GFX6-NEXT: s_and_b32 s2, s4, 0xffff +; GFX6-NEXT: s_or_b32 s1, s1, s2 +; GFX6-NEXT: s_lshl_b32 s2, s7, 16 +; GFX6-NEXT: s_and_b32 s3, s6, 0xffff +; GFX6-NEXT: s_or_b32 s2, s2, s3 +; GFX6-NEXT: s_xor_b32 s2, s2, -1 +; GFX6-NEXT: s_or_b32 s0, s0, s2 +; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_orn2_v2i16_multi_foldable_use: @@ -630,18 +626,17 @@ ; GFX6-LABEL: s_orn2_v4i16: 
; GFX6: ; %bb.0: ; GFX6-NEXT: s_lshl_b32 s0, s3, 16 -; GFX6-NEXT: s_mov_b32 s3, 0xffff -; GFX6-NEXT: s_and_b32 s1, s2, s3 +; GFX6-NEXT: s_and_b32 s1, s2, 0xffff ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: s_lshl_b32 s1, s5, 16 -; GFX6-NEXT: s_and_b32 s2, s4, s3 +; GFX6-NEXT: s_and_b32 s2, s4, 0xffff ; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: s_lshl_b32 s2, s7, 16 -; GFX6-NEXT: s_and_b32 s4, s6, s3 -; GFX6-NEXT: s_or_b32 s2, s2, s4 -; GFX6-NEXT: s_lshl_b32 s4, s9, 16 -; GFX6-NEXT: s_and_b32 s3, s8, s3 -; GFX6-NEXT: s_or_b32 s3, s4, s3 +; GFX6-NEXT: s_and_b32 s3, s6, 0xffff +; GFX6-NEXT: s_or_b32 s2, s2, s3 +; GFX6-NEXT: s_lshl_b32 s3, s9, 16 +; GFX6-NEXT: s_and_b32 s4, s8, 0xffff +; GFX6-NEXT: s_or_b32 s3, s3, s4 ; GFX6-NEXT: s_mov_b32 s4, -1 ; GFX6-NEXT: s_mov_b32 s5, s4 ; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5] @@ -673,18 +668,17 @@ ; GFX6-LABEL: s_orn2_v4i16_commute: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_lshl_b32 s0, s3, 16 -; GFX6-NEXT: s_mov_b32 s3, 0xffff -; GFX6-NEXT: s_and_b32 s1, s2, s3 +; GFX6-NEXT: s_and_b32 s1, s2, 0xffff ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: s_lshl_b32 s1, s5, 16 -; GFX6-NEXT: s_and_b32 s2, s4, s3 +; GFX6-NEXT: s_and_b32 s2, s4, 0xffff ; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: s_lshl_b32 s2, s7, 16 -; GFX6-NEXT: s_and_b32 s4, s6, s3 -; GFX6-NEXT: s_or_b32 s2, s2, s4 -; GFX6-NEXT: s_lshl_b32 s4, s9, 16 -; GFX6-NEXT: s_and_b32 s3, s8, s3 -; GFX6-NEXT: s_or_b32 s3, s4, s3 +; GFX6-NEXT: s_and_b32 s3, s6, 0xffff +; GFX6-NEXT: s_or_b32 s2, s2, s3 +; GFX6-NEXT: s_lshl_b32 s3, s9, 16 +; GFX6-NEXT: s_and_b32 s4, s8, 0xffff +; GFX6-NEXT: s_or_b32 s3, s3, s4 ; GFX6-NEXT: s_mov_b32 s4, -1 ; GFX6-NEXT: s_mov_b32 s5, s4 ; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5] @@ -716,18 +710,17 @@ ; GFX6-LABEL: s_orn2_v4i16_multi_use: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_lshl_b32 s0, s3, 16 -; GFX6-NEXT: s_mov_b32 s3, 0xffff -; GFX6-NEXT: s_and_b32 s1, s2, s3 +; GFX6-NEXT: s_and_b32 s1, s2, 0xffff ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: 
s_lshl_b32 s1, s5, 16 -; GFX6-NEXT: s_and_b32 s2, s4, s3 +; GFX6-NEXT: s_and_b32 s2, s4, 0xffff ; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: s_lshl_b32 s2, s7, 16 -; GFX6-NEXT: s_and_b32 s4, s6, s3 -; GFX6-NEXT: s_or_b32 s2, s2, s4 -; GFX6-NEXT: s_lshl_b32 s4, s9, 16 -; GFX6-NEXT: s_and_b32 s3, s8, s3 -; GFX6-NEXT: s_or_b32 s3, s4, s3 +; GFX6-NEXT: s_and_b32 s3, s6, 0xffff +; GFX6-NEXT: s_or_b32 s2, s2, s3 +; GFX6-NEXT: s_lshl_b32 s3, s9, 16 +; GFX6-NEXT: s_and_b32 s4, s8, 0xffff +; GFX6-NEXT: s_or_b32 s3, s3, s4 ; GFX6-NEXT: s_mov_b32 s4, -1 ; GFX6-NEXT: s_mov_b32 s5, s4 ; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5] @@ -766,24 +759,23 @@ define amdgpu_ps { i64, i64 } @s_orn2_v4i16_multi_foldable_use(<4 x i16> inreg %src0, <4 x i16> inreg %src1, <4 x i16> inreg %src2) { ; GFX6-LABEL: s_orn2_v4i16_multi_foldable_use: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s14, 0xffff ; GFX6-NEXT: s_lshl_b32 s0, s3, 16 -; GFX6-NEXT: s_and_b32 s1, s2, s14 +; GFX6-NEXT: s_and_b32 s1, s2, 0xffff ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: s_lshl_b32 s1, s5, 16 -; GFX6-NEXT: s_and_b32 s2, s4, s14 +; GFX6-NEXT: s_and_b32 s2, s4, 0xffff ; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: s_lshl_b32 s2, s7, 16 -; GFX6-NEXT: s_and_b32 s3, s6, s14 +; GFX6-NEXT: s_and_b32 s3, s6, 0xffff ; GFX6-NEXT: s_or_b32 s2, s2, s3 ; GFX6-NEXT: s_lshl_b32 s3, s9, 16 -; GFX6-NEXT: s_and_b32 s4, s8, s14 +; GFX6-NEXT: s_and_b32 s4, s8, 0xffff ; GFX6-NEXT: s_or_b32 s3, s3, s4 ; GFX6-NEXT: s_lshl_b32 s4, s11, 16 -; GFX6-NEXT: s_and_b32 s5, s10, s14 +; GFX6-NEXT: s_and_b32 s5, s10, 0xffff ; GFX6-NEXT: s_or_b32 s4, s4, s5 ; GFX6-NEXT: s_lshl_b32 s5, s13, 16 -; GFX6-NEXT: s_and_b32 s6, s12, s14 +; GFX6-NEXT: s_and_b32 s6, s12, 0xffff ; GFX6-NEXT: s_or_b32 s5, s5, s6 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s7, s6 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll +++ 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll @@ -404,10 +404,9 @@ ; GFX10-NEXT: v_rndne_f16_e32 v2, v0 ; GFX10-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX10-NEXT: v_rndne_f16_e32 v3, v1 -; GFX10-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10-NEXT: v_rndne_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v0, v2, v4, v0 -; GFX10-NEXT: v_and_or_b32 v1, v3, v4, v1 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v2, v0 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v3, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] %roundeven = call <4 x half> @llvm.roundeven.v4f16(<4 x half> %x) ret <4 x half> %roundeven @@ -610,8 +609,8 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_brev_b32 s6, 1 -; GFX6-NEXT: s_mov_b32 s7, 0x43300000 ; GFX6-NEXT: v_and_b32_e32 v5, s6, v1 +; GFX6-NEXT: s_mov_b32 s7, 0x43300000 ; GFX6-NEXT: v_mov_b32_e32 v4, 0 ; GFX6-NEXT: v_or_b32_e32 v5, s7, v5 ; GFX6-NEXT: v_add_f64 v[6:7], v[0:1], v[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll @@ -320,12 +320,11 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_mov_b32 s4, 8 -; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX10-NEXT: v_lshrrev_b32_sdwa v3, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_lshrrev_b32_sdwa v4, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_sdwa v3, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: s_movk_i32 s4, 0xff -; GFX10-NEXT: v_and_or_b32 v0, v0, v2, v3 -; GFX10-NEXT: v_and_or_b32 v1, v1, v2, v4 +; 
GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v2 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v3 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_add_i16 v0, v0, v1 clamp @@ -345,31 +344,28 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_lshr_b32 s2, s0, 8 ; GFX6-NEXT: s_lshl_b32 s0, s0, 24 -; GFX6-NEXT: s_brev_b32 s5, 1 -; GFX6-NEXT: s_min_i32 s7, s0, 0 +; GFX6-NEXT: s_min_i32 s5, s0, 0 ; GFX6-NEXT: s_lshr_b32 s3, s1, 8 ; GFX6-NEXT: s_lshl_b32 s1, s1, 24 -; GFX6-NEXT: s_brev_b32 s4, -2 -; GFX6-NEXT: s_max_i32 s6, s0, 0 -; GFX6-NEXT: s_sub_i32 s7, s5, s7 -; GFX6-NEXT: s_sub_i32 s6, s4, s6 -; GFX6-NEXT: s_max_i32 s1, s7, s1 -; GFX6-NEXT: s_min_i32 s1, s1, s6 +; GFX6-NEXT: s_max_i32 s4, s0, 0 +; GFX6-NEXT: s_sub_i32 s5, 0x80000000, s5 +; GFX6-NEXT: s_sub_i32 s4, 0x7fffffff, s4 +; GFX6-NEXT: s_max_i32 s1, s5, s1 +; GFX6-NEXT: s_min_i32 s1, s1, s4 ; GFX6-NEXT: s_add_i32 s0, s0, s1 ; GFX6-NEXT: s_lshl_b32 s1, s2, 24 +; GFX6-NEXT: s_min_i32 s4, s1, 0 ; GFX6-NEXT: s_lshl_b32 s2, s3, 24 ; GFX6-NEXT: s_max_i32 s3, s1, 0 -; GFX6-NEXT: s_sub_i32 s3, s4, s3 -; GFX6-NEXT: s_min_i32 s4, s1, 0 -; GFX6-NEXT: s_sub_i32 s4, s5, s4 +; GFX6-NEXT: s_sub_i32 s4, 0x80000000, s4 +; GFX6-NEXT: s_sub_i32 s3, 0x7fffffff, s3 ; GFX6-NEXT: s_max_i32 s2, s4, s2 ; GFX6-NEXT: s_min_i32 s2, s2, s3 ; GFX6-NEXT: s_add_i32 s1, s1, s2 ; GFX6-NEXT: s_ashr_i32 s1, s1, 24 -; GFX6-NEXT: s_movk_i32 s2, 0xff ; GFX6-NEXT: s_ashr_i32 s0, s0, 24 -; GFX6-NEXT: s_and_b32 s1, s1, s2 -; GFX6-NEXT: s_and_b32 s0, s0, s2 +; GFX6-NEXT: s_and_b32 s1, s1, 0xff +; GFX6-NEXT: s_and_b32 s0, s0, 0xff ; GFX6-NEXT: s_lshl_b32 s1, s1, 8 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: ; return to shader part epilog @@ -379,32 +375,30 @@ ; GFX8-NEXT: s_bfe_u32 s4, 8, 0x100000 ; GFX8-NEXT: s_lshr_b32 s2, s0, 8 ; GFX8-NEXT: s_lshl_b32 s0, s0, s4 -; GFX8-NEXT: s_sext_i32_i16 s7, s0 -; GFX8-NEXT: s_sext_i32_i16 s8, 0 -; GFX8-NEXT: s_movk_i32 s6, 0x8000 -; GFX8-NEXT: 
s_max_i32 s9, s7, s8 -; GFX8-NEXT: s_min_i32 s7, s7, s8 +; GFX8-NEXT: s_sext_i32_i16 s5, s0 +; GFX8-NEXT: s_sext_i32_i16 s6, 0 +; GFX8-NEXT: s_max_i32 s7, s5, s6 +; GFX8-NEXT: s_min_i32 s5, s5, s6 ; GFX8-NEXT: s_lshr_b32 s3, s1, 8 ; GFX8-NEXT: s_lshl_b32 s1, s1, s4 -; GFX8-NEXT: s_sub_i32 s7, s6, s7 -; GFX8-NEXT: s_movk_i32 s5, 0x7fff -; GFX8-NEXT: s_sext_i32_i16 s7, s7 +; GFX8-NEXT: s_sub_i32 s5, 0xffff8000, s5 +; GFX8-NEXT: s_sext_i32_i16 s5, s5 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_sub_i32 s9, s5, s9 -; GFX8-NEXT: s_max_i32 s1, s7, s1 +; GFX8-NEXT: s_sub_i32 s7, 0x7fff, s7 +; GFX8-NEXT: s_max_i32 s1, s5, s1 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_sext_i32_i16 s7, s9 -; GFX8-NEXT: s_min_i32 s1, s1, s7 +; GFX8-NEXT: s_sext_i32_i16 s5, s7 +; GFX8-NEXT: s_min_i32 s1, s1, s5 ; GFX8-NEXT: s_add_i32 s0, s0, s1 ; GFX8-NEXT: s_lshl_b32 s1, s2, s4 ; GFX8-NEXT: s_lshl_b32 s2, s3, s4 ; GFX8-NEXT: s_sext_i32_i16 s3, s1 -; GFX8-NEXT: s_max_i32 s7, s3, s8 -; GFX8-NEXT: s_min_i32 s3, s3, s8 -; GFX8-NEXT: s_sub_i32 s3, s6, s3 +; GFX8-NEXT: s_max_i32 s5, s3, s6 +; GFX8-NEXT: s_min_i32 s3, s3, s6 +; GFX8-NEXT: s_sub_i32 s3, 0xffff8000, s3 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 -; GFX8-NEXT: s_sub_i32 s5, s5, s7 +; GFX8-NEXT: s_sub_i32 s5, 0x7fff, s5 ; GFX8-NEXT: s_max_i32 s2, s3, s2 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 ; GFX8-NEXT: s_sext_i32_i16 s3, s5 @@ -413,10 +407,9 @@ ; GFX8-NEXT: s_sext_i32_i16 s1, s1 ; GFX8-NEXT: s_sext_i32_i16 s0, s0 ; GFX8-NEXT: s_ashr_i32 s1, s1, s4 -; GFX8-NEXT: s_movk_i32 s2, 0xff ; GFX8-NEXT: s_ashr_i32 s0, s0, s4 -; GFX8-NEXT: s_and_b32 s1, s1, s2 -; GFX8-NEXT: s_and_b32 s0, s0, s2 +; GFX8-NEXT: s_and_b32 s1, s1, 0xff +; GFX8-NEXT: s_and_b32 s0, s0, 0xff ; GFX8-NEXT: s_lshl_b32 s1, s1, s4 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog @@ -424,17 +417,16 @@ ; GFX9-LABEL: s_saddsat_v2i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_lshr_b32 s2, s0, 8 -; GFX9-NEXT: s_lshr_b32 s3, s1, 
8 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX9-NEXT: s_lshr_b32 s3, s1, 8 +; GFX9-NEXT: s_lshr_b32 s2, s0, 16 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s3 -; GFX9-NEXT: s_mov_b32 s2, 0x80008 -; GFX9-NEXT: s_lshr_b32 s3, s0, 16 -; GFX9-NEXT: s_lshl_b32 s0, s0, s2 -; GFX9-NEXT: s_lshl_b32 s3, s3, 8 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s3 -; GFX9-NEXT: s_lshr_b32 s3, s1, 16 -; GFX9-NEXT: s_lshl_b32 s1, s1, s2 -; GFX9-NEXT: s_lshl_b32 s2, s3, 8 +; GFX9-NEXT: s_lshl_b32 s0, s0, 0x80008 +; GFX9-NEXT: s_lshl_b32 s2, s2, 8 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX9-NEXT: s_lshr_b32 s2, s1, 16 +; GFX9-NEXT: s_lshl_b32 s1, s1, 0x80008 +; GFX9-NEXT: s_lshl_b32 s2, s2, 8 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s2 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: v_pk_add_i16 v0, s0, v0 clamp @@ -451,15 +443,14 @@ ; GFX10-NEXT: s_lshr_b32 s3, s1, 8 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s3 -; GFX10-NEXT: s_mov_b32 s2, 0x80008 -; GFX10-NEXT: s_lshr_b32 s3, s0, 16 -; GFX10-NEXT: s_lshr_b32 s4, s1, 16 -; GFX10-NEXT: s_lshl_b32 s0, s0, s2 +; GFX10-NEXT: s_lshr_b32 s2, s0, 16 +; GFX10-NEXT: s_lshr_b32 s3, s1, 16 +; GFX10-NEXT: s_lshl_b32 s0, s0, 0x80008 +; GFX10-NEXT: s_lshl_b32 s2, s2, 8 +; GFX10-NEXT: s_lshl_b32 s1, s1, 0x80008 ; GFX10-NEXT: s_lshl_b32 s3, s3, 8 -; GFX10-NEXT: s_lshl_b32 s1, s1, s2 -; GFX10-NEXT: s_lshl_b32 s2, s4, 8 -; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s3 -; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s2 +; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s3 ; GFX10-NEXT: v_pk_add_i16 v0, s0, s1 clamp ; GFX10-NEXT: s_movk_i32 s0, 0xff ; GFX10-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] @@ -639,24 +630,22 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX10-NEXT: 
v_lshrrev_b32_e32 v4, 24, v1 ; GFX10-NEXT: s_mov_b32 s4, 8 -; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; GFX10-NEXT: v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: v_lshrrev_b32_sdwa v6, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff -; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX10-NEXT: s_movk_i32 s4, 0xff -; GFX10-NEXT: v_and_or_b32 v0, v0, v7, v2 -; GFX10-NEXT: v_and_or_b32 v1, v1, v7, v6 -; GFX10-NEXT: v_and_or_b32 v2, v3, v7, v4 -; GFX10-NEXT: v_and_or_b32 v3, v8, v7, v5 -; GFX10-NEXT: v_mov_b32_e32 v4, 24 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v2 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v6 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v5, v3 +; GFX10-NEXT: v_and_or_b32 v3, 0xffff, v7, v4 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] +; GFX10-NEXT: v_mov_b32_e32 v4, 24 ; GFX10-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_add_i16 v0, v0, v1 clamp @@ -665,9 +654,9 @@ ; GFX10-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_ashrrev_i16 v1, 8, v1 op_sel_hi:[0,1] ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_b32_e32 v3, s4, v1 +; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v0, v0, s4, v2 +; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1 ; GFX10-NEXT: 
s_setpc_b64 s[30:31] @@ -685,59 +674,56 @@ ; GFX6-NEXT: s_lshr_b32 s3, s0, 16 ; GFX6-NEXT: s_lshr_b32 s4, s0, 24 ; GFX6-NEXT: s_lshl_b32 s0, s0, 24 -; GFX6-NEXT: s_brev_b32 s9, 1 -; GFX6-NEXT: s_min_i32 s11, s0, 0 +; GFX6-NEXT: s_min_i32 s9, s0, 0 ; GFX6-NEXT: s_lshr_b32 s5, s1, 8 ; GFX6-NEXT: s_lshr_b32 s6, s1, 16 ; GFX6-NEXT: s_lshr_b32 s7, s1, 24 ; GFX6-NEXT: s_lshl_b32 s1, s1, 24 -; GFX6-NEXT: s_brev_b32 s8, -2 -; GFX6-NEXT: s_max_i32 s10, s0, 0 -; GFX6-NEXT: s_sub_i32 s11, s9, s11 -; GFX6-NEXT: s_sub_i32 s10, s8, s10 -; GFX6-NEXT: s_max_i32 s1, s11, s1 -; GFX6-NEXT: s_min_i32 s1, s1, s10 +; GFX6-NEXT: s_max_i32 s8, s0, 0 +; GFX6-NEXT: s_sub_i32 s9, 0x80000000, s9 +; GFX6-NEXT: s_sub_i32 s8, 0x7fffffff, s8 +; GFX6-NEXT: s_max_i32 s1, s9, s1 +; GFX6-NEXT: s_min_i32 s1, s1, s8 ; GFX6-NEXT: s_add_i32 s0, s0, s1 ; GFX6-NEXT: s_lshl_b32 s1, s2, 24 -; GFX6-NEXT: s_min_i32 s10, s1, 0 +; GFX6-NEXT: s_min_i32 s8, s1, 0 ; GFX6-NEXT: s_lshl_b32 s2, s5, 24 ; GFX6-NEXT: s_max_i32 s5, s1, 0 -; GFX6-NEXT: s_sub_i32 s10, s9, s10 -; GFX6-NEXT: s_sub_i32 s5, s8, s5 -; GFX6-NEXT: s_max_i32 s2, s10, s2 +; GFX6-NEXT: s_sub_i32 s8, 0x80000000, s8 +; GFX6-NEXT: s_sub_i32 s5, 0x7fffffff, s5 +; GFX6-NEXT: s_max_i32 s2, s8, s2 ; GFX6-NEXT: s_min_i32 s2, s2, s5 ; GFX6-NEXT: s_add_i32 s1, s1, s2 ; GFX6-NEXT: s_lshl_b32 s2, s3, 24 ; GFX6-NEXT: s_lshl_b32 s3, s6, 24 ; GFX6-NEXT: s_min_i32 s6, s2, 0 ; GFX6-NEXT: s_max_i32 s5, s2, 0 -; GFX6-NEXT: s_sub_i32 s6, s9, s6 -; GFX6-NEXT: s_sub_i32 s5, s8, s5 +; GFX6-NEXT: s_sub_i32 s6, 0x80000000, s6 +; GFX6-NEXT: s_sub_i32 s5, 0x7fffffff, s5 ; GFX6-NEXT: s_max_i32 s3, s6, s3 ; GFX6-NEXT: s_min_i32 s3, s3, s5 ; GFX6-NEXT: s_add_i32 s2, s2, s3 ; GFX6-NEXT: s_lshl_b32 s3, s4, 24 ; GFX6-NEXT: s_min_i32 s6, s3, 0 +; GFX6-NEXT: s_ashr_i32 s1, s1, 24 ; GFX6-NEXT: s_lshl_b32 s4, s7, 24 ; GFX6-NEXT: s_max_i32 s5, s3, 0 -; GFX6-NEXT: s_sub_i32 s6, s9, s6 -; GFX6-NEXT: s_sub_i32 s5, s8, s5 -; GFX6-NEXT: s_max_i32 s4, s6, s4 -; GFX6-NEXT: s_min_i32 s4, s4, s5 
-; GFX6-NEXT: s_ashr_i32 s1, s1, 24 -; GFX6-NEXT: s_add_i32 s3, s3, s4 -; GFX6-NEXT: s_movk_i32 s4, 0xff +; GFX6-NEXT: s_sub_i32 s6, 0x80000000, s6 ; GFX6-NEXT: s_ashr_i32 s0, s0, 24 -; GFX6-NEXT: s_and_b32 s1, s1, s4 +; GFX6-NEXT: s_sub_i32 s5, 0x7fffffff, s5 +; GFX6-NEXT: s_max_i32 s4, s6, s4 +; GFX6-NEXT: s_and_b32 s1, s1, 0xff ; GFX6-NEXT: s_ashr_i32 s2, s2, 24 -; GFX6-NEXT: s_and_b32 s0, s0, s4 +; GFX6-NEXT: s_min_i32 s4, s4, s5 +; GFX6-NEXT: s_and_b32 s0, s0, 0xff ; GFX6-NEXT: s_lshl_b32 s1, s1, 8 +; GFX6-NEXT: s_add_i32 s3, s3, s4 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s2, s4 +; GFX6-NEXT: s_and_b32 s1, s2, 0xff ; GFX6-NEXT: s_ashr_i32 s3, s3, 24 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s3, s4 +; GFX6-NEXT: s_and_b32 s1, s3, 0xff ; GFX6-NEXT: s_lshl_b32 s1, s1, 24 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: ; return to shader part epilog @@ -749,48 +735,46 @@ ; GFX8-NEXT: s_lshr_b32 s3, s0, 16 ; GFX8-NEXT: s_lshr_b32 s4, s0, 24 ; GFX8-NEXT: s_lshl_b32 s0, s0, s8 -; GFX8-NEXT: s_sext_i32_i16 s11, s0 -; GFX8-NEXT: s_sext_i32_i16 s12, 0 -; GFX8-NEXT: s_movk_i32 s10, 0x8000 -; GFX8-NEXT: s_max_i32 s13, s11, s12 -; GFX8-NEXT: s_min_i32 s11, s11, s12 +; GFX8-NEXT: s_sext_i32_i16 s9, s0 +; GFX8-NEXT: s_sext_i32_i16 s10, 0 +; GFX8-NEXT: s_max_i32 s11, s9, s10 +; GFX8-NEXT: s_min_i32 s9, s9, s10 ; GFX8-NEXT: s_lshr_b32 s5, s1, 8 ; GFX8-NEXT: s_lshr_b32 s6, s1, 16 ; GFX8-NEXT: s_lshr_b32 s7, s1, 24 ; GFX8-NEXT: s_lshl_b32 s1, s1, s8 -; GFX8-NEXT: s_sub_i32 s11, s10, s11 -; GFX8-NEXT: s_movk_i32 s9, 0x7fff -; GFX8-NEXT: s_sext_i32_i16 s11, s11 +; GFX8-NEXT: s_sub_i32 s9, 0xffff8000, s9 +; GFX8-NEXT: s_sext_i32_i16 s9, s9 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_sub_i32 s13, s9, s13 -; GFX8-NEXT: s_max_i32 s1, s11, s1 +; GFX8-NEXT: s_sub_i32 s11, 0x7fff, s11 +; GFX8-NEXT: s_max_i32 s1, s9, s1 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_sext_i32_i16 s11, s13 -; GFX8-NEXT: 
s_min_i32 s1, s1, s11 +; GFX8-NEXT: s_sext_i32_i16 s9, s11 +; GFX8-NEXT: s_min_i32 s1, s1, s9 ; GFX8-NEXT: s_add_i32 s0, s0, s1 ; GFX8-NEXT: s_lshl_b32 s1, s2, s8 ; GFX8-NEXT: s_lshl_b32 s2, s5, s8 ; GFX8-NEXT: s_sext_i32_i16 s5, s1 -; GFX8-NEXT: s_max_i32 s11, s5, s12 -; GFX8-NEXT: s_min_i32 s5, s5, s12 -; GFX8-NEXT: s_sub_i32 s5, s10, s5 +; GFX8-NEXT: s_max_i32 s9, s5, s10 +; GFX8-NEXT: s_min_i32 s5, s5, s10 +; GFX8-NEXT: s_sub_i32 s5, 0xffff8000, s5 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 -; GFX8-NEXT: s_sub_i32 s11, s9, s11 +; GFX8-NEXT: s_sub_i32 s9, 0x7fff, s9 ; GFX8-NEXT: s_max_i32 s2, s5, s2 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 -; GFX8-NEXT: s_sext_i32_i16 s5, s11 +; GFX8-NEXT: s_sext_i32_i16 s5, s9 ; GFX8-NEXT: s_min_i32 s2, s2, s5 ; GFX8-NEXT: s_add_i32 s1, s1, s2 ; GFX8-NEXT: s_lshl_b32 s2, s3, s8 ; GFX8-NEXT: s_sext_i32_i16 s5, s2 ; GFX8-NEXT: s_lshl_b32 s3, s6, s8 -; GFX8-NEXT: s_max_i32 s6, s5, s12 -; GFX8-NEXT: s_min_i32 s5, s5, s12 -; GFX8-NEXT: s_sub_i32 s5, s10, s5 +; GFX8-NEXT: s_max_i32 s6, s5, s10 +; GFX8-NEXT: s_min_i32 s5, s5, s10 +; GFX8-NEXT: s_sub_i32 s5, 0xffff8000, s5 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 -; GFX8-NEXT: s_sub_i32 s6, s9, s6 +; GFX8-NEXT: s_sub_i32 s6, 0x7fff, s6 ; GFX8-NEXT: s_max_i32 s3, s5, s3 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_sext_i32_i16 s5, s6 @@ -798,35 +782,34 @@ ; GFX8-NEXT: s_add_i32 s2, s2, s3 ; GFX8-NEXT: s_lshl_b32 s3, s4, s8 ; GFX8-NEXT: s_sext_i32_i16 s5, s3 -; GFX8-NEXT: s_max_i32 s6, s5, s12 -; GFX8-NEXT: s_min_i32 s5, s5, s12 +; GFX8-NEXT: s_max_i32 s6, s5, s10 +; GFX8-NEXT: s_min_i32 s5, s5, s10 ; GFX8-NEXT: s_lshl_b32 s4, s7, s8 -; GFX8-NEXT: s_sub_i32 s5, s10, s5 +; GFX8-NEXT: s_sub_i32 s5, 0xffff8000, s5 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 -; GFX8-NEXT: s_sub_i32 s6, s9, s6 -; GFX8-NEXT: s_max_i32 s4, s5, s4 -; GFX8-NEXT: s_sext_i32_i16 s4, s4 -; GFX8-NEXT: s_sext_i32_i16 s5, s6 ; GFX8-NEXT: 
s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_min_i32 s4, s4, s5 +; GFX8-NEXT: s_sub_i32 s6, 0x7fff, s6 +; GFX8-NEXT: s_max_i32 s4, s5, s4 ; GFX8-NEXT: s_sext_i32_i16 s0, s0 ; GFX8-NEXT: s_ashr_i32 s1, s1, s8 -; GFX8-NEXT: s_add_i32 s3, s3, s4 -; GFX8-NEXT: s_movk_i32 s4, 0xff +; GFX8-NEXT: s_sext_i32_i16 s4, s4 +; GFX8-NEXT: s_sext_i32_i16 s5, s6 ; GFX8-NEXT: s_ashr_i32 s0, s0, s8 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 -; GFX8-NEXT: s_and_b32 s1, s1, s4 +; GFX8-NEXT: s_min_i32 s4, s4, s5 +; GFX8-NEXT: s_and_b32 s1, s1, 0xff ; GFX8-NEXT: s_ashr_i32 s2, s2, s8 -; GFX8-NEXT: s_and_b32 s0, s0, s4 +; GFX8-NEXT: s_add_i32 s3, s3, s4 +; GFX8-NEXT: s_and_b32 s0, s0, 0xff ; GFX8-NEXT: s_lshl_b32 s1, s1, 8 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s1, s2, s4 +; GFX8-NEXT: s_and_b32 s1, s2, 0xff ; GFX8-NEXT: s_ashr_i32 s3, s3, s8 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s1, s3, s4 +; GFX8-NEXT: s_and_b32 s1, s3, 0xff ; GFX8-NEXT: s_lshl_b32 s1, s1, 24 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog @@ -838,27 +821,26 @@ ; GFX9-NEXT: s_lshr_b32 s6, s0, 24 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s3 ; GFX9-NEXT: s_pack_ll_b32_b16 s3, s4, s6 -; GFX9-NEXT: s_mov_b32 s4, 0x80008 -; GFX9-NEXT: s_lshr_b32 s6, s0, 16 +; GFX9-NEXT: s_lshr_b32 s4, s0, 16 ; GFX9-NEXT: s_lshr_b32 s7, s1, 8 -; GFX9-NEXT: s_lshl_b32 s0, s0, s4 -; GFX9-NEXT: s_lshl_b32 s6, s6, 8 +; GFX9-NEXT: s_lshl_b32 s0, s0, 0x80008 +; GFX9-NEXT: s_lshl_b32 s4, s4, 8 ; GFX9-NEXT: s_lshr_b32 s8, s1, 16 ; GFX9-NEXT: s_lshr_b32 s9, s1, 24 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s6 -; GFX9-NEXT: s_lshr_b32 s6, s3, 16 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s4 +; GFX9-NEXT: s_lshr_b32 s4, s3, 16 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s7 -; GFX9-NEXT: s_lshl_b32 s3, s3, s4 +; GFX9-NEXT: s_lshl_b32 s3, s3, 0x80008 +; GFX9-NEXT: s_lshl_b32 s4, s4, 8 +; GFX9-NEXT: s_lshr_b32 s6, s1, 16 +; GFX9-NEXT: 
s_pack_ll_b32_b16 s3, s3, s4 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s8, s9 +; GFX9-NEXT: s_lshl_b32 s1, s1, 0x80008 +; GFX9-NEXT: s_lshl_b32 s6, s6, 8 +; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s6 +; GFX9-NEXT: s_lshr_b32 s6, s4, 16 +; GFX9-NEXT: s_lshl_b32 s4, s4, 0x80008 ; GFX9-NEXT: s_lshl_b32 s6, s6, 8 -; GFX9-NEXT: s_lshr_b32 s7, s1, 16 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s6, s8, s9 -; GFX9-NEXT: s_lshl_b32 s1, s1, s4 -; GFX9-NEXT: s_lshl_b32 s7, s7, 8 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s7 -; GFX9-NEXT: s_lshr_b32 s7, s6, 16 -; GFX9-NEXT: s_lshl_b32 s4, s6, s4 -; GFX9-NEXT: s_lshl_b32 s6, s7, 8 ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s6 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: v_pk_add_i16 v0, s0, v0 clamp @@ -885,39 +867,37 @@ ; GFX10-NEXT: s_lshr_b32 s4, s0, 24 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 ; GFX10-NEXT: s_pack_ll_b32_b16 s2, s3, s4 -; GFX10-NEXT: s_mov_b32 s3, 0x80008 -; GFX10-NEXT: s_lshr_b32 s4, s0, 16 +; GFX10-NEXT: s_lshr_b32 s3, s0, 16 ; GFX10-NEXT: s_lshr_b32 s5, s1, 8 ; GFX10-NEXT: s_lshr_b32 s6, s1, 16 ; GFX10-NEXT: s_lshr_b32 s7, s1, 24 -; GFX10-NEXT: s_lshl_b32 s0, s0, s3 +; GFX10-NEXT: s_lshl_b32 s0, s0, 0x80008 +; GFX10-NEXT: s_lshl_b32 s3, s3, 8 +; GFX10-NEXT: s_lshr_b32 s4, s2, 16 +; GFX10-NEXT: s_lshl_b32 s2, s2, 0x80008 ; GFX10-NEXT: s_lshl_b32 s4, s4, 8 +; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s3 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5 -; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s4 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, s6, s7 -; GFX10-NEXT: s_lshr_b32 s8, s2, 16 -; GFX10-NEXT: s_lshr_b32 s5, s1, 16 -; GFX10-NEXT: s_lshr_b32 s6, s4, 16 -; GFX10-NEXT: s_lshl_b32 s2, s2, s3 -; GFX10-NEXT: s_lshl_b32 s8, s8, 8 -; GFX10-NEXT: s_lshl_b32 s1, s1, s3 +; GFX10-NEXT: s_pack_ll_b32_b16 s3, s6, s7 +; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s4 +; GFX10-NEXT: s_lshr_b32 s4, s1, 16 +; GFX10-NEXT: s_lshr_b32 s5, s3, 16 +; GFX10-NEXT: s_lshl_b32 s1, s1, 0x80008 +; GFX10-NEXT: s_lshl_b32 s4, s4, 8 +; 
GFX10-NEXT: s_lshl_b32 s3, s3, 0x80008 ; GFX10-NEXT: s_lshl_b32 s5, s5, 8 -; GFX10-NEXT: s_lshl_b32 s3, s4, s3 -; GFX10-NEXT: s_lshl_b32 s4, s6, 8 -; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s8 -; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5 -; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s4 +; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s5 ; GFX10-NEXT: v_pk_add_i16 v0, s0, s1 clamp ; GFX10-NEXT: v_pk_add_i16 v1, s2, s3 clamp ; GFX10-NEXT: s_mov_b32 s0, 8 -; GFX10-NEXT: s_movk_i32 s1, 0xff ; GFX10-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_ashrrev_i16 v1, 8, v1 op_sel_hi:[0,1] ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_b32_e32 v3, s1, v1 +; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v1 ; GFX10-NEXT: s_mov_b32 s0, 24 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v0, v0, s1, v2 +; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 @@ -1269,19 +1249,17 @@ define amdgpu_ps <2 x i32> @s_saddsat_v2i32(<2 x i32> inreg %lhs, <2 x i32> inreg %rhs) { ; GFX6-LABEL: s_saddsat_v2i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_brev_b32 s5, 1 -; GFX6-NEXT: s_min_i32 s7, s0, 0 -; GFX6-NEXT: s_brev_b32 s4, -2 -; GFX6-NEXT: s_max_i32 s6, s0, 0 -; GFX6-NEXT: s_sub_i32 s7, s5, s7 -; GFX6-NEXT: s_sub_i32 s6, s4, s6 -; GFX6-NEXT: s_max_i32 s2, s7, s2 -; GFX6-NEXT: s_min_i32 s2, s2, s6 +; GFX6-NEXT: s_min_i32 s5, s0, 0 +; GFX6-NEXT: s_max_i32 s4, s0, 0 +; GFX6-NEXT: s_sub_i32 s5, 0x80000000, s5 +; GFX6-NEXT: s_sub_i32 s4, 0x7fffffff, s4 +; GFX6-NEXT: s_max_i32 s2, s5, s2 +; GFX6-NEXT: s_min_i32 s2, s2, s4 +; GFX6-NEXT: s_min_i32 s4, s1, 0 ; GFX6-NEXT: s_add_i32 s0, s0, s2 ; GFX6-NEXT: s_max_i32 s2, s1, 0 -; GFX6-NEXT: s_sub_i32 s2, s4, s2 -; GFX6-NEXT: s_min_i32 
s4, s1, 0 -; GFX6-NEXT: s_sub_i32 s4, s5, s4 +; GFX6-NEXT: s_sub_i32 s4, 0x80000000, s4 +; GFX6-NEXT: s_sub_i32 s2, 0x7fffffff, s2 ; GFX6-NEXT: s_max_i32 s3, s4, s3 ; GFX6-NEXT: s_min_i32 s2, s3, s2 ; GFX6-NEXT: s_add_i32 s1, s1, s2 @@ -1289,19 +1267,17 @@ ; ; GFX8-LABEL: s_saddsat_v2i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_brev_b32 s5, 1 -; GFX8-NEXT: s_min_i32 s7, s0, 0 -; GFX8-NEXT: s_brev_b32 s4, -2 -; GFX8-NEXT: s_max_i32 s6, s0, 0 -; GFX8-NEXT: s_sub_i32 s7, s5, s7 -; GFX8-NEXT: s_sub_i32 s6, s4, s6 -; GFX8-NEXT: s_max_i32 s2, s7, s2 -; GFX8-NEXT: s_min_i32 s2, s2, s6 +; GFX8-NEXT: s_min_i32 s5, s0, 0 +; GFX8-NEXT: s_max_i32 s4, s0, 0 +; GFX8-NEXT: s_sub_i32 s5, 0x80000000, s5 +; GFX8-NEXT: s_sub_i32 s4, 0x7fffffff, s4 +; GFX8-NEXT: s_max_i32 s2, s5, s2 +; GFX8-NEXT: s_min_i32 s2, s2, s4 +; GFX8-NEXT: s_min_i32 s4, s1, 0 ; GFX8-NEXT: s_add_i32 s0, s0, s2 ; GFX8-NEXT: s_max_i32 s2, s1, 0 -; GFX8-NEXT: s_sub_i32 s2, s4, s2 -; GFX8-NEXT: s_min_i32 s4, s1, 0 -; GFX8-NEXT: s_sub_i32 s4, s5, s4 +; GFX8-NEXT: s_sub_i32 s4, 0x80000000, s4 +; GFX8-NEXT: s_sub_i32 s2, 0x7fffffff, s2 ; GFX8-NEXT: s_max_i32 s3, s4, s3 ; GFX8-NEXT: s_min_i32 s2, s3, s2 ; GFX8-NEXT: s_add_i32 s1, s1, s2 @@ -1408,26 +1384,24 @@ define amdgpu_ps <3 x i32> @s_saddsat_v3i32(<3 x i32> inreg %lhs, <3 x i32> inreg %rhs) { ; GFX6-LABEL: s_saddsat_v3i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_brev_b32 s7, 1 -; GFX6-NEXT: s_min_i32 s9, s0, 0 -; GFX6-NEXT: s_brev_b32 s6, -2 -; GFX6-NEXT: s_max_i32 s8, s0, 0 -; GFX6-NEXT: s_sub_i32 s9, s7, s9 -; GFX6-NEXT: s_sub_i32 s8, s6, s8 -; GFX6-NEXT: s_max_i32 s3, s9, s3 -; GFX6-NEXT: s_min_i32 s3, s3, s8 -; GFX6-NEXT: s_min_i32 s8, s1, 0 +; GFX6-NEXT: s_min_i32 s7, s0, 0 +; GFX6-NEXT: s_max_i32 s6, s0, 0 +; GFX6-NEXT: s_sub_i32 s7, 0x80000000, s7 +; GFX6-NEXT: s_sub_i32 s6, 0x7fffffff, s6 +; GFX6-NEXT: s_max_i32 s3, s7, s3 +; GFX6-NEXT: s_min_i32 s3, s3, s6 +; GFX6-NEXT: s_min_i32 s6, s1, 0 ; GFX6-NEXT: s_add_i32 s0, s0, s3 ; GFX6-NEXT: s_max_i32 s3, s1, 0 -; 
GFX6-NEXT: s_sub_i32 s8, s7, s8 -; GFX6-NEXT: s_sub_i32 s3, s6, s3 -; GFX6-NEXT: s_max_i32 s4, s8, s4 +; GFX6-NEXT: s_sub_i32 s6, 0x80000000, s6 +; GFX6-NEXT: s_sub_i32 s3, 0x7fffffff, s3 +; GFX6-NEXT: s_max_i32 s4, s6, s4 ; GFX6-NEXT: s_min_i32 s3, s4, s3 ; GFX6-NEXT: s_min_i32 s4, s2, 0 ; GFX6-NEXT: s_add_i32 s1, s1, s3 ; GFX6-NEXT: s_max_i32 s3, s2, 0 -; GFX6-NEXT: s_sub_i32 s4, s7, s4 -; GFX6-NEXT: s_sub_i32 s3, s6, s3 +; GFX6-NEXT: s_sub_i32 s4, 0x80000000, s4 +; GFX6-NEXT: s_sub_i32 s3, 0x7fffffff, s3 ; GFX6-NEXT: s_max_i32 s4, s4, s5 ; GFX6-NEXT: s_min_i32 s3, s4, s3 ; GFX6-NEXT: s_add_i32 s2, s2, s3 @@ -1435,26 +1409,24 @@ ; ; GFX8-LABEL: s_saddsat_v3i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_brev_b32 s7, 1 -; GFX8-NEXT: s_min_i32 s9, s0, 0 -; GFX8-NEXT: s_brev_b32 s6, -2 -; GFX8-NEXT: s_max_i32 s8, s0, 0 -; GFX8-NEXT: s_sub_i32 s9, s7, s9 -; GFX8-NEXT: s_sub_i32 s8, s6, s8 -; GFX8-NEXT: s_max_i32 s3, s9, s3 -; GFX8-NEXT: s_min_i32 s3, s3, s8 -; GFX8-NEXT: s_min_i32 s8, s1, 0 +; GFX8-NEXT: s_min_i32 s7, s0, 0 +; GFX8-NEXT: s_max_i32 s6, s0, 0 +; GFX8-NEXT: s_sub_i32 s7, 0x80000000, s7 +; GFX8-NEXT: s_sub_i32 s6, 0x7fffffff, s6 +; GFX8-NEXT: s_max_i32 s3, s7, s3 +; GFX8-NEXT: s_min_i32 s3, s3, s6 +; GFX8-NEXT: s_min_i32 s6, s1, 0 ; GFX8-NEXT: s_add_i32 s0, s0, s3 ; GFX8-NEXT: s_max_i32 s3, s1, 0 -; GFX8-NEXT: s_sub_i32 s8, s7, s8 -; GFX8-NEXT: s_sub_i32 s3, s6, s3 -; GFX8-NEXT: s_max_i32 s4, s8, s4 +; GFX8-NEXT: s_sub_i32 s6, 0x80000000, s6 +; GFX8-NEXT: s_sub_i32 s3, 0x7fffffff, s3 +; GFX8-NEXT: s_max_i32 s4, s6, s4 ; GFX8-NEXT: s_min_i32 s3, s4, s3 ; GFX8-NEXT: s_min_i32 s4, s2, 0 ; GFX8-NEXT: s_add_i32 s1, s1, s3 ; GFX8-NEXT: s_max_i32 s3, s2, 0 -; GFX8-NEXT: s_sub_i32 s4, s7, s4 -; GFX8-NEXT: s_sub_i32 s3, s6, s3 +; GFX8-NEXT: s_sub_i32 s4, 0x80000000, s4 +; GFX8-NEXT: s_sub_i32 s3, 0x7fffffff, s3 ; GFX8-NEXT: s_max_i32 s4, s4, s5 ; GFX8-NEXT: s_min_i32 s3, s4, s3 ; GFX8-NEXT: s_add_i32 s2, s2, s3 @@ -1582,33 +1554,31 @@ define amdgpu_ps <4 x i32> 
@s_saddsat_v4i32(<4 x i32> inreg %lhs, <4 x i32> inreg %rhs) { ; GFX6-LABEL: s_saddsat_v4i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_brev_b32 s9, 1 -; GFX6-NEXT: s_min_i32 s11, s0, 0 -; GFX6-NEXT: s_brev_b32 s8, -2 -; GFX6-NEXT: s_max_i32 s10, s0, 0 -; GFX6-NEXT: s_sub_i32 s11, s9, s11 -; GFX6-NEXT: s_sub_i32 s10, s8, s10 -; GFX6-NEXT: s_max_i32 s4, s11, s4 -; GFX6-NEXT: s_min_i32 s4, s4, s10 -; GFX6-NEXT: s_min_i32 s10, s1, 0 +; GFX6-NEXT: s_min_i32 s9, s0, 0 +; GFX6-NEXT: s_max_i32 s8, s0, 0 +; GFX6-NEXT: s_sub_i32 s9, 0x80000000, s9 +; GFX6-NEXT: s_sub_i32 s8, 0x7fffffff, s8 +; GFX6-NEXT: s_max_i32 s4, s9, s4 +; GFX6-NEXT: s_min_i32 s4, s4, s8 +; GFX6-NEXT: s_min_i32 s8, s1, 0 ; GFX6-NEXT: s_add_i32 s0, s0, s4 ; GFX6-NEXT: s_max_i32 s4, s1, 0 -; GFX6-NEXT: s_sub_i32 s10, s9, s10 -; GFX6-NEXT: s_sub_i32 s4, s8, s4 -; GFX6-NEXT: s_max_i32 s5, s10, s5 +; GFX6-NEXT: s_sub_i32 s8, 0x80000000, s8 +; GFX6-NEXT: s_sub_i32 s4, 0x7fffffff, s4 +; GFX6-NEXT: s_max_i32 s5, s8, s5 ; GFX6-NEXT: s_min_i32 s4, s5, s4 ; GFX6-NEXT: s_min_i32 s5, s2, 0 ; GFX6-NEXT: s_add_i32 s1, s1, s4 ; GFX6-NEXT: s_max_i32 s4, s2, 0 -; GFX6-NEXT: s_sub_i32 s5, s9, s5 -; GFX6-NEXT: s_sub_i32 s4, s8, s4 +; GFX6-NEXT: s_sub_i32 s5, 0x80000000, s5 +; GFX6-NEXT: s_sub_i32 s4, 0x7fffffff, s4 ; GFX6-NEXT: s_max_i32 s5, s5, s6 ; GFX6-NEXT: s_min_i32 s4, s5, s4 ; GFX6-NEXT: s_min_i32 s5, s3, 0 ; GFX6-NEXT: s_add_i32 s2, s2, s4 ; GFX6-NEXT: s_max_i32 s4, s3, 0 -; GFX6-NEXT: s_sub_i32 s5, s9, s5 -; GFX6-NEXT: s_sub_i32 s4, s8, s4 +; GFX6-NEXT: s_sub_i32 s5, 0x80000000, s5 +; GFX6-NEXT: s_sub_i32 s4, 0x7fffffff, s4 ; GFX6-NEXT: s_max_i32 s5, s5, s7 ; GFX6-NEXT: s_min_i32 s4, s5, s4 ; GFX6-NEXT: s_add_i32 s3, s3, s4 @@ -1616,33 +1586,31 @@ ; ; GFX8-LABEL: s_saddsat_v4i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_brev_b32 s9, 1 -; GFX8-NEXT: s_min_i32 s11, s0, 0 -; GFX8-NEXT: s_brev_b32 s8, -2 -; GFX8-NEXT: s_max_i32 s10, s0, 0 -; GFX8-NEXT: s_sub_i32 s11, s9, s11 -; GFX8-NEXT: s_sub_i32 s10, s8, s10 -; GFX8-NEXT: s_max_i32 
s4, s11, s4 -; GFX8-NEXT: s_min_i32 s4, s4, s10 -; GFX8-NEXT: s_min_i32 s10, s1, 0 +; GFX8-NEXT: s_min_i32 s9, s0, 0 +; GFX8-NEXT: s_max_i32 s8, s0, 0 +; GFX8-NEXT: s_sub_i32 s9, 0x80000000, s9 +; GFX8-NEXT: s_sub_i32 s8, 0x7fffffff, s8 +; GFX8-NEXT: s_max_i32 s4, s9, s4 +; GFX8-NEXT: s_min_i32 s4, s4, s8 +; GFX8-NEXT: s_min_i32 s8, s1, 0 ; GFX8-NEXT: s_add_i32 s0, s0, s4 ; GFX8-NEXT: s_max_i32 s4, s1, 0 -; GFX8-NEXT: s_sub_i32 s10, s9, s10 -; GFX8-NEXT: s_sub_i32 s4, s8, s4 -; GFX8-NEXT: s_max_i32 s5, s10, s5 +; GFX8-NEXT: s_sub_i32 s8, 0x80000000, s8 +; GFX8-NEXT: s_sub_i32 s4, 0x7fffffff, s4 +; GFX8-NEXT: s_max_i32 s5, s8, s5 ; GFX8-NEXT: s_min_i32 s4, s5, s4 ; GFX8-NEXT: s_min_i32 s5, s2, 0 ; GFX8-NEXT: s_add_i32 s1, s1, s4 ; GFX8-NEXT: s_max_i32 s4, s2, 0 -; GFX8-NEXT: s_sub_i32 s5, s9, s5 -; GFX8-NEXT: s_sub_i32 s4, s8, s4 +; GFX8-NEXT: s_sub_i32 s5, 0x80000000, s5 +; GFX8-NEXT: s_sub_i32 s4, 0x7fffffff, s4 ; GFX8-NEXT: s_max_i32 s5, s5, s6 ; GFX8-NEXT: s_min_i32 s4, s5, s4 ; GFX8-NEXT: s_min_i32 s5, s3, 0 ; GFX8-NEXT: s_add_i32 s2, s2, s4 ; GFX8-NEXT: s_max_i32 s4, s3, 0 -; GFX8-NEXT: s_sub_i32 s5, s9, s5 -; GFX8-NEXT: s_sub_i32 s4, s8, s4 +; GFX8-NEXT: s_sub_i32 s5, 0x80000000, s5 +; GFX8-NEXT: s_sub_i32 s4, 0x7fffffff, s4 ; GFX8-NEXT: s_max_i32 s5, s5, s7 ; GFX8-NEXT: s_min_i32 s4, s5, s4 ; GFX8-NEXT: s_add_i32 s3, s3, s4 @@ -1704,21 +1672,20 @@ ; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s5, v6 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s4, v5 ; GFX6-NEXT: v_max_i32_e32 v6, v6, v7 -; GFX6-NEXT: v_bfrev_b32_e32 v13, 1 ; GFX6-NEXT: v_min_i32_e32 v5, v6, v5 ; GFX6-NEXT: v_min_i32_e32 v6, 0, v3 ; GFX6-NEXT: v_bfrev_b32_e32 v11, -2 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; GFX6-NEXT: v_max_i32_e32 v5, 0, v3 -; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v13, v6 +; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 0x80000000, v6 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v11, v5 ; GFX6-NEXT: v_max_i32_e32 v6, v6, v8 ; GFX6-NEXT: v_min_i32_e32 v5, v6, v5 ; GFX6-NEXT: v_min_i32_e32 v6, 0, v4 ; GFX6-NEXT: 
v_add_i32_e32 v3, vcc, v3, v5 ; GFX6-NEXT: v_max_i32_e32 v5, 0, v4 -; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v13, v6 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v11, v5 +; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 0x80000000, v6 +; GFX6-NEXT: v_sub_i32_e32 v5, vcc, 0x7fffffff, v5 ; GFX6-NEXT: v_max_i32_e32 v6, v6, v9 ; GFX6-NEXT: v_min_i32_e32 v5, v6, v5 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v5 @@ -1748,21 +1715,20 @@ ; GFX8-NEXT: v_sub_u32_e32 v6, vcc, s5, v6 ; GFX8-NEXT: v_sub_u32_e32 v5, vcc, s4, v5 ; GFX8-NEXT: v_max_i32_e32 v6, v6, v7 -; GFX8-NEXT: v_bfrev_b32_e32 v13, 1 ; GFX8-NEXT: v_min_i32_e32 v5, v6, v5 ; GFX8-NEXT: v_min_i32_e32 v6, 0, v3 ; GFX8-NEXT: v_bfrev_b32_e32 v11, -2 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 ; GFX8-NEXT: v_max_i32_e32 v5, 0, v3 -; GFX8-NEXT: v_sub_u32_e32 v6, vcc, v13, v6 +; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 0x80000000, v6 ; GFX8-NEXT: v_sub_u32_e32 v5, vcc, v11, v5 ; GFX8-NEXT: v_max_i32_e32 v6, v6, v8 ; GFX8-NEXT: v_min_i32_e32 v5, v6, v5 ; GFX8-NEXT: v_min_i32_e32 v6, 0, v4 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v5 ; GFX8-NEXT: v_max_i32_e32 v5, 0, v4 -; GFX8-NEXT: v_sub_u32_e32 v6, vcc, v13, v6 -; GFX8-NEXT: v_sub_u32_e32 v5, vcc, v11, v5 +; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 0x80000000, v6 +; GFX8-NEXT: v_sub_u32_e32 v5, vcc, 0x7fffffff, v5 ; GFX8-NEXT: v_max_i32_e32 v6, v6, v9 ; GFX8-NEXT: v_min_i32_e32 v5, v6, v5 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v5 @@ -1795,40 +1761,38 @@ define amdgpu_ps <5 x i32> @s_saddsat_v5i32(<5 x i32> inreg %lhs, <5 x i32> inreg %rhs) { ; GFX6-LABEL: s_saddsat_v5i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_brev_b32 s11, 1 -; GFX6-NEXT: s_min_i32 s13, s0, 0 -; GFX6-NEXT: s_brev_b32 s10, -2 -; GFX6-NEXT: s_max_i32 s12, s0, 0 -; GFX6-NEXT: s_sub_i32 s13, s11, s13 -; GFX6-NEXT: s_sub_i32 s12, s10, s12 -; GFX6-NEXT: s_max_i32 s5, s13, s5 -; GFX6-NEXT: s_min_i32 s5, s5, s12 -; GFX6-NEXT: s_min_i32 s12, s1, 0 +; GFX6-NEXT: s_min_i32 s11, s0, 0 +; GFX6-NEXT: s_max_i32 s10, s0, 0 +; GFX6-NEXT: s_sub_i32 s11, 0x80000000, s11 
+; GFX6-NEXT: s_sub_i32 s10, 0x7fffffff, s10 +; GFX6-NEXT: s_max_i32 s5, s11, s5 +; GFX6-NEXT: s_min_i32 s5, s5, s10 +; GFX6-NEXT: s_min_i32 s10, s1, 0 ; GFX6-NEXT: s_add_i32 s0, s0, s5 ; GFX6-NEXT: s_max_i32 s5, s1, 0 -; GFX6-NEXT: s_sub_i32 s12, s11, s12 -; GFX6-NEXT: s_sub_i32 s5, s10, s5 -; GFX6-NEXT: s_max_i32 s6, s12, s6 +; GFX6-NEXT: s_sub_i32 s10, 0x80000000, s10 +; GFX6-NEXT: s_sub_i32 s5, 0x7fffffff, s5 +; GFX6-NEXT: s_max_i32 s6, s10, s6 ; GFX6-NEXT: s_min_i32 s5, s6, s5 ; GFX6-NEXT: s_min_i32 s6, s2, 0 ; GFX6-NEXT: s_add_i32 s1, s1, s5 ; GFX6-NEXT: s_max_i32 s5, s2, 0 -; GFX6-NEXT: s_sub_i32 s6, s11, s6 -; GFX6-NEXT: s_sub_i32 s5, s10, s5 +; GFX6-NEXT: s_sub_i32 s6, 0x80000000, s6 +; GFX6-NEXT: s_sub_i32 s5, 0x7fffffff, s5 ; GFX6-NEXT: s_max_i32 s6, s6, s7 ; GFX6-NEXT: s_min_i32 s5, s6, s5 ; GFX6-NEXT: s_min_i32 s6, s3, 0 ; GFX6-NEXT: s_add_i32 s2, s2, s5 ; GFX6-NEXT: s_max_i32 s5, s3, 0 -; GFX6-NEXT: s_sub_i32 s6, s11, s6 -; GFX6-NEXT: s_sub_i32 s5, s10, s5 +; GFX6-NEXT: s_sub_i32 s6, 0x80000000, s6 +; GFX6-NEXT: s_sub_i32 s5, 0x7fffffff, s5 ; GFX6-NEXT: s_max_i32 s6, s6, s8 ; GFX6-NEXT: s_min_i32 s5, s6, s5 ; GFX6-NEXT: s_min_i32 s6, s4, 0 ; GFX6-NEXT: s_add_i32 s3, s3, s5 ; GFX6-NEXT: s_max_i32 s5, s4, 0 -; GFX6-NEXT: s_sub_i32 s6, s11, s6 -; GFX6-NEXT: s_sub_i32 s5, s10, s5 +; GFX6-NEXT: s_sub_i32 s6, 0x80000000, s6 +; GFX6-NEXT: s_sub_i32 s5, 0x7fffffff, s5 ; GFX6-NEXT: s_max_i32 s6, s6, s9 ; GFX6-NEXT: s_min_i32 s5, s6, s5 ; GFX6-NEXT: s_add_i32 s4, s4, s5 @@ -1836,40 +1800,38 @@ ; ; GFX8-LABEL: s_saddsat_v5i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_brev_b32 s11, 1 -; GFX8-NEXT: s_min_i32 s13, s0, 0 -; GFX8-NEXT: s_brev_b32 s10, -2 -; GFX8-NEXT: s_max_i32 s12, s0, 0 -; GFX8-NEXT: s_sub_i32 s13, s11, s13 -; GFX8-NEXT: s_sub_i32 s12, s10, s12 -; GFX8-NEXT: s_max_i32 s5, s13, s5 -; GFX8-NEXT: s_min_i32 s5, s5, s12 -; GFX8-NEXT: s_min_i32 s12, s1, 0 +; GFX8-NEXT: s_min_i32 s11, s0, 0 +; GFX8-NEXT: s_max_i32 s10, s0, 0 +; GFX8-NEXT: s_sub_i32 s11, 
0x80000000, s11 +; GFX8-NEXT: s_sub_i32 s10, 0x7fffffff, s10 +; GFX8-NEXT: s_max_i32 s5, s11, s5 +; GFX8-NEXT: s_min_i32 s5, s5, s10 +; GFX8-NEXT: s_min_i32 s10, s1, 0 ; GFX8-NEXT: s_add_i32 s0, s0, s5 ; GFX8-NEXT: s_max_i32 s5, s1, 0 -; GFX8-NEXT: s_sub_i32 s12, s11, s12 -; GFX8-NEXT: s_sub_i32 s5, s10, s5 -; GFX8-NEXT: s_max_i32 s6, s12, s6 +; GFX8-NEXT: s_sub_i32 s10, 0x80000000, s10 +; GFX8-NEXT: s_sub_i32 s5, 0x7fffffff, s5 +; GFX8-NEXT: s_max_i32 s6, s10, s6 ; GFX8-NEXT: s_min_i32 s5, s6, s5 ; GFX8-NEXT: s_min_i32 s6, s2, 0 ; GFX8-NEXT: s_add_i32 s1, s1, s5 ; GFX8-NEXT: s_max_i32 s5, s2, 0 -; GFX8-NEXT: s_sub_i32 s6, s11, s6 -; GFX8-NEXT: s_sub_i32 s5, s10, s5 +; GFX8-NEXT: s_sub_i32 s6, 0x80000000, s6 +; GFX8-NEXT: s_sub_i32 s5, 0x7fffffff, s5 ; GFX8-NEXT: s_max_i32 s6, s6, s7 ; GFX8-NEXT: s_min_i32 s5, s6, s5 ; GFX8-NEXT: s_min_i32 s6, s3, 0 ; GFX8-NEXT: s_add_i32 s2, s2, s5 ; GFX8-NEXT: s_max_i32 s5, s3, 0 -; GFX8-NEXT: s_sub_i32 s6, s11, s6 -; GFX8-NEXT: s_sub_i32 s5, s10, s5 +; GFX8-NEXT: s_sub_i32 s6, 0x80000000, s6 +; GFX8-NEXT: s_sub_i32 s5, 0x7fffffff, s5 ; GFX8-NEXT: s_max_i32 s6, s6, s8 ; GFX8-NEXT: s_min_i32 s5, s6, s5 ; GFX8-NEXT: s_min_i32 s6, s4, 0 ; GFX8-NEXT: s_add_i32 s3, s3, s5 ; GFX8-NEXT: s_max_i32 s5, s4, 0 -; GFX8-NEXT: s_sub_i32 s6, s11, s6 -; GFX8-NEXT: s_sub_i32 s5, s10, s5 +; GFX8-NEXT: s_sub_i32 s6, 0x80000000, s6 +; GFX8-NEXT: s_sub_i32 s5, 0x7fffffff, s5 ; GFX8-NEXT: s_max_i32 s6, s6, s9 ; GFX8-NEXT: s_min_i32 s5, s6, s5 ; GFX8-NEXT: s_add_i32 s4, s4, s5 @@ -2203,117 +2165,115 @@ define amdgpu_ps <16 x i32> @s_saddsat_v16i32(<16 x i32> inreg %lhs, <16 x i32> inreg %rhs) { ; GFX6-LABEL: s_saddsat_v16i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_brev_b32 s33, 1 -; GFX6-NEXT: s_min_i32 s35, s0, 0 -; GFX6-NEXT: s_brev_b32 s32, -2 -; GFX6-NEXT: s_max_i32 s34, s0, 0 -; GFX6-NEXT: s_sub_i32 s35, s33, s35 -; GFX6-NEXT: s_sub_i32 s34, s32, s34 -; GFX6-NEXT: s_max_i32 s16, s35, s16 -; GFX6-NEXT: s_min_i32 s16, s16, s34 -; GFX6-NEXT: s_min_i32 s34, 
s1, 0 +; GFX6-NEXT: s_min_i32 s33, s0, 0 +; GFX6-NEXT: s_max_i32 s32, s0, 0 +; GFX6-NEXT: s_sub_i32 s33, 0x80000000, s33 +; GFX6-NEXT: s_sub_i32 s32, 0x7fffffff, s32 +; GFX6-NEXT: s_max_i32 s16, s33, s16 +; GFX6-NEXT: s_min_i32 s16, s16, s32 +; GFX6-NEXT: s_min_i32 s32, s1, 0 ; GFX6-NEXT: s_add_i32 s0, s0, s16 ; GFX6-NEXT: s_max_i32 s16, s1, 0 -; GFX6-NEXT: s_sub_i32 s34, s33, s34 -; GFX6-NEXT: s_sub_i32 s16, s32, s16 -; GFX6-NEXT: s_max_i32 s17, s34, s17 +; GFX6-NEXT: s_sub_i32 s32, 0x80000000, s32 +; GFX6-NEXT: s_sub_i32 s16, 0x7fffffff, s16 +; GFX6-NEXT: s_max_i32 s17, s32, s17 ; GFX6-NEXT: s_min_i32 s16, s17, s16 ; GFX6-NEXT: s_min_i32 s17, s2, 0 ; GFX6-NEXT: s_add_i32 s1, s1, s16 ; GFX6-NEXT: s_max_i32 s16, s2, 0 -; GFX6-NEXT: s_sub_i32 s17, s33, s17 -; GFX6-NEXT: s_sub_i32 s16, s32, s16 +; GFX6-NEXT: s_sub_i32 s17, 0x80000000, s17 +; GFX6-NEXT: s_sub_i32 s16, 0x7fffffff, s16 ; GFX6-NEXT: s_max_i32 s17, s17, s18 ; GFX6-NEXT: s_min_i32 s16, s17, s16 ; GFX6-NEXT: s_min_i32 s17, s3, 0 ; GFX6-NEXT: s_add_i32 s2, s2, s16 ; GFX6-NEXT: s_max_i32 s16, s3, 0 -; GFX6-NEXT: s_sub_i32 s17, s33, s17 -; GFX6-NEXT: s_sub_i32 s16, s32, s16 +; GFX6-NEXT: s_sub_i32 s17, 0x80000000, s17 +; GFX6-NEXT: s_sub_i32 s16, 0x7fffffff, s16 ; GFX6-NEXT: s_max_i32 s17, s17, s19 ; GFX6-NEXT: s_min_i32 s16, s17, s16 ; GFX6-NEXT: s_min_i32 s17, s4, 0 ; GFX6-NEXT: s_add_i32 s3, s3, s16 ; GFX6-NEXT: s_max_i32 s16, s4, 0 -; GFX6-NEXT: s_sub_i32 s17, s33, s17 -; GFX6-NEXT: s_sub_i32 s16, s32, s16 +; GFX6-NEXT: s_sub_i32 s17, 0x80000000, s17 +; GFX6-NEXT: s_sub_i32 s16, 0x7fffffff, s16 ; GFX6-NEXT: s_max_i32 s17, s17, s20 ; GFX6-NEXT: s_min_i32 s16, s17, s16 ; GFX6-NEXT: s_min_i32 s17, s5, 0 ; GFX6-NEXT: s_add_i32 s4, s4, s16 ; GFX6-NEXT: s_max_i32 s16, s5, 0 -; GFX6-NEXT: s_sub_i32 s17, s33, s17 -; GFX6-NEXT: s_sub_i32 s16, s32, s16 +; GFX6-NEXT: s_sub_i32 s17, 0x80000000, s17 +; GFX6-NEXT: s_sub_i32 s16, 0x7fffffff, s16 ; GFX6-NEXT: s_max_i32 s17, s17, s21 ; GFX6-NEXT: s_min_i32 s16, s17, s16 ; 
GFX6-NEXT: s_min_i32 s17, s6, 0 ; GFX6-NEXT: s_add_i32 s5, s5, s16 ; GFX6-NEXT: s_max_i32 s16, s6, 0 -; GFX6-NEXT: s_sub_i32 s17, s33, s17 -; GFX6-NEXT: s_sub_i32 s16, s32, s16 +; GFX6-NEXT: s_sub_i32 s17, 0x80000000, s17 +; GFX6-NEXT: s_sub_i32 s16, 0x7fffffff, s16 ; GFX6-NEXT: s_max_i32 s17, s17, s22 ; GFX6-NEXT: s_min_i32 s16, s17, s16 ; GFX6-NEXT: s_min_i32 s17, s7, 0 ; GFX6-NEXT: s_add_i32 s6, s6, s16 ; GFX6-NEXT: s_max_i32 s16, s7, 0 -; GFX6-NEXT: s_sub_i32 s17, s33, s17 -; GFX6-NEXT: s_sub_i32 s16, s32, s16 +; GFX6-NEXT: s_sub_i32 s17, 0x80000000, s17 +; GFX6-NEXT: s_sub_i32 s16, 0x7fffffff, s16 ; GFX6-NEXT: s_max_i32 s17, s17, s23 ; GFX6-NEXT: s_min_i32 s16, s17, s16 ; GFX6-NEXT: s_min_i32 s17, s8, 0 ; GFX6-NEXT: s_add_i32 s7, s7, s16 ; GFX6-NEXT: s_max_i32 s16, s8, 0 -; GFX6-NEXT: s_sub_i32 s17, s33, s17 -; GFX6-NEXT: s_sub_i32 s16, s32, s16 +; GFX6-NEXT: s_sub_i32 s17, 0x80000000, s17 +; GFX6-NEXT: s_sub_i32 s16, 0x7fffffff, s16 ; GFX6-NEXT: s_max_i32 s17, s17, s24 ; GFX6-NEXT: s_min_i32 s16, s17, s16 ; GFX6-NEXT: s_min_i32 s17, s9, 0 ; GFX6-NEXT: s_add_i32 s8, s8, s16 ; GFX6-NEXT: s_max_i32 s16, s9, 0 -; GFX6-NEXT: s_sub_i32 s17, s33, s17 -; GFX6-NEXT: s_sub_i32 s16, s32, s16 +; GFX6-NEXT: s_sub_i32 s17, 0x80000000, s17 +; GFX6-NEXT: s_sub_i32 s16, 0x7fffffff, s16 ; GFX6-NEXT: s_max_i32 s17, s17, s25 ; GFX6-NEXT: s_min_i32 s16, s17, s16 ; GFX6-NEXT: s_min_i32 s17, s10, 0 ; GFX6-NEXT: s_add_i32 s9, s9, s16 ; GFX6-NEXT: s_max_i32 s16, s10, 0 -; GFX6-NEXT: s_sub_i32 s17, s33, s17 -; GFX6-NEXT: s_sub_i32 s16, s32, s16 +; GFX6-NEXT: s_sub_i32 s17, 0x80000000, s17 +; GFX6-NEXT: s_sub_i32 s16, 0x7fffffff, s16 ; GFX6-NEXT: s_max_i32 s17, s17, s26 ; GFX6-NEXT: s_min_i32 s16, s17, s16 ; GFX6-NEXT: s_min_i32 s17, s11, 0 ; GFX6-NEXT: s_add_i32 s10, s10, s16 ; GFX6-NEXT: s_max_i32 s16, s11, 0 -; GFX6-NEXT: s_sub_i32 s17, s33, s17 -; GFX6-NEXT: s_sub_i32 s16, s32, s16 +; GFX6-NEXT: s_sub_i32 s17, 0x80000000, s17 +; GFX6-NEXT: s_sub_i32 s16, 0x7fffffff, s16 ; 
GFX6-NEXT: s_max_i32 s17, s17, s27 ; GFX6-NEXT: s_min_i32 s16, s17, s16 ; GFX6-NEXT: s_min_i32 s17, s12, 0 ; GFX6-NEXT: s_add_i32 s11, s11, s16 ; GFX6-NEXT: s_max_i32 s16, s12, 0 -; GFX6-NEXT: s_sub_i32 s17, s33, s17 -; GFX6-NEXT: s_sub_i32 s16, s32, s16 +; GFX6-NEXT: s_sub_i32 s17, 0x80000000, s17 +; GFX6-NEXT: s_sub_i32 s16, 0x7fffffff, s16 ; GFX6-NEXT: s_max_i32 s17, s17, s28 ; GFX6-NEXT: s_min_i32 s16, s17, s16 ; GFX6-NEXT: s_min_i32 s17, s13, 0 ; GFX6-NEXT: s_add_i32 s12, s12, s16 ; GFX6-NEXT: s_max_i32 s16, s13, 0 -; GFX6-NEXT: s_sub_i32 s17, s33, s17 -; GFX6-NEXT: s_sub_i32 s16, s32, s16 +; GFX6-NEXT: s_sub_i32 s17, 0x80000000, s17 +; GFX6-NEXT: s_sub_i32 s16, 0x7fffffff, s16 ; GFX6-NEXT: s_max_i32 s17, s17, s29 ; GFX6-NEXT: s_min_i32 s16, s17, s16 ; GFX6-NEXT: s_min_i32 s17, s14, 0 ; GFX6-NEXT: s_add_i32 s13, s13, s16 ; GFX6-NEXT: s_max_i32 s16, s14, 0 -; GFX6-NEXT: s_sub_i32 s17, s33, s17 -; GFX6-NEXT: s_sub_i32 s16, s32, s16 +; GFX6-NEXT: s_sub_i32 s17, 0x80000000, s17 +; GFX6-NEXT: s_sub_i32 s16, 0x7fffffff, s16 ; GFX6-NEXT: s_max_i32 s17, s17, s30 ; GFX6-NEXT: s_min_i32 s16, s17, s16 ; GFX6-NEXT: s_min_i32 s17, s15, 0 ; GFX6-NEXT: s_add_i32 s14, s14, s16 ; GFX6-NEXT: s_max_i32 s16, s15, 0 -; GFX6-NEXT: s_sub_i32 s17, s33, s17 -; GFX6-NEXT: s_sub_i32 s16, s32, s16 +; GFX6-NEXT: s_sub_i32 s17, 0x80000000, s17 +; GFX6-NEXT: s_sub_i32 s16, 0x7fffffff, s16 ; GFX6-NEXT: s_max_i32 s17, s17, s31 ; GFX6-NEXT: s_min_i32 s16, s17, s16 ; GFX6-NEXT: s_add_i32 s15, s15, s16 @@ -2321,117 +2281,115 @@ ; ; GFX8-LABEL: s_saddsat_v16i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_brev_b32 s33, 1 -; GFX8-NEXT: s_min_i32 s35, s0, 0 -; GFX8-NEXT: s_brev_b32 s32, -2 -; GFX8-NEXT: s_max_i32 s34, s0, 0 -; GFX8-NEXT: s_sub_i32 s35, s33, s35 -; GFX8-NEXT: s_sub_i32 s34, s32, s34 -; GFX8-NEXT: s_max_i32 s16, s35, s16 -; GFX8-NEXT: s_min_i32 s16, s16, s34 -; GFX8-NEXT: s_min_i32 s34, s1, 0 +; GFX8-NEXT: s_min_i32 s33, s0, 0 +; GFX8-NEXT: s_max_i32 s32, s0, 0 +; GFX8-NEXT: s_sub_i32 s33, 
0x80000000, s33 +; GFX8-NEXT: s_sub_i32 s32, 0x7fffffff, s32 +; GFX8-NEXT: s_max_i32 s16, s33, s16 +; GFX8-NEXT: s_min_i32 s16, s16, s32 +; GFX8-NEXT: s_min_i32 s32, s1, 0 ; GFX8-NEXT: s_add_i32 s0, s0, s16 ; GFX8-NEXT: s_max_i32 s16, s1, 0 -; GFX8-NEXT: s_sub_i32 s34, s33, s34 -; GFX8-NEXT: s_sub_i32 s16, s32, s16 -; GFX8-NEXT: s_max_i32 s17, s34, s17 +; GFX8-NEXT: s_sub_i32 s32, 0x80000000, s32 +; GFX8-NEXT: s_sub_i32 s16, 0x7fffffff, s16 +; GFX8-NEXT: s_max_i32 s17, s32, s17 ; GFX8-NEXT: s_min_i32 s16, s17, s16 ; GFX8-NEXT: s_min_i32 s17, s2, 0 ; GFX8-NEXT: s_add_i32 s1, s1, s16 ; GFX8-NEXT: s_max_i32 s16, s2, 0 -; GFX8-NEXT: s_sub_i32 s17, s33, s17 -; GFX8-NEXT: s_sub_i32 s16, s32, s16 +; GFX8-NEXT: s_sub_i32 s17, 0x80000000, s17 +; GFX8-NEXT: s_sub_i32 s16, 0x7fffffff, s16 ; GFX8-NEXT: s_max_i32 s17, s17, s18 ; GFX8-NEXT: s_min_i32 s16, s17, s16 ; GFX8-NEXT: s_min_i32 s17, s3, 0 ; GFX8-NEXT: s_add_i32 s2, s2, s16 ; GFX8-NEXT: s_max_i32 s16, s3, 0 -; GFX8-NEXT: s_sub_i32 s17, s33, s17 -; GFX8-NEXT: s_sub_i32 s16, s32, s16 +; GFX8-NEXT: s_sub_i32 s17, 0x80000000, s17 +; GFX8-NEXT: s_sub_i32 s16, 0x7fffffff, s16 ; GFX8-NEXT: s_max_i32 s17, s17, s19 ; GFX8-NEXT: s_min_i32 s16, s17, s16 ; GFX8-NEXT: s_min_i32 s17, s4, 0 ; GFX8-NEXT: s_add_i32 s3, s3, s16 ; GFX8-NEXT: s_max_i32 s16, s4, 0 -; GFX8-NEXT: s_sub_i32 s17, s33, s17 -; GFX8-NEXT: s_sub_i32 s16, s32, s16 +; GFX8-NEXT: s_sub_i32 s17, 0x80000000, s17 +; GFX8-NEXT: s_sub_i32 s16, 0x7fffffff, s16 ; GFX8-NEXT: s_max_i32 s17, s17, s20 ; GFX8-NEXT: s_min_i32 s16, s17, s16 ; GFX8-NEXT: s_min_i32 s17, s5, 0 ; GFX8-NEXT: s_add_i32 s4, s4, s16 ; GFX8-NEXT: s_max_i32 s16, s5, 0 -; GFX8-NEXT: s_sub_i32 s17, s33, s17 -; GFX8-NEXT: s_sub_i32 s16, s32, s16 +; GFX8-NEXT: s_sub_i32 s17, 0x80000000, s17 +; GFX8-NEXT: s_sub_i32 s16, 0x7fffffff, s16 ; GFX8-NEXT: s_max_i32 s17, s17, s21 ; GFX8-NEXT: s_min_i32 s16, s17, s16 ; GFX8-NEXT: s_min_i32 s17, s6, 0 ; GFX8-NEXT: s_add_i32 s5, s5, s16 ; GFX8-NEXT: s_max_i32 s16, s6, 0 -; 
GFX8-NEXT: s_sub_i32 s17, s33, s17 -; GFX8-NEXT: s_sub_i32 s16, s32, s16 +; GFX8-NEXT: s_sub_i32 s17, 0x80000000, s17 +; GFX8-NEXT: s_sub_i32 s16, 0x7fffffff, s16 ; GFX8-NEXT: s_max_i32 s17, s17, s22 ; GFX8-NEXT: s_min_i32 s16, s17, s16 ; GFX8-NEXT: s_min_i32 s17, s7, 0 ; GFX8-NEXT: s_add_i32 s6, s6, s16 ; GFX8-NEXT: s_max_i32 s16, s7, 0 -; GFX8-NEXT: s_sub_i32 s17, s33, s17 -; GFX8-NEXT: s_sub_i32 s16, s32, s16 +; GFX8-NEXT: s_sub_i32 s17, 0x80000000, s17 +; GFX8-NEXT: s_sub_i32 s16, 0x7fffffff, s16 ; GFX8-NEXT: s_max_i32 s17, s17, s23 ; GFX8-NEXT: s_min_i32 s16, s17, s16 ; GFX8-NEXT: s_min_i32 s17, s8, 0 ; GFX8-NEXT: s_add_i32 s7, s7, s16 ; GFX8-NEXT: s_max_i32 s16, s8, 0 -; GFX8-NEXT: s_sub_i32 s17, s33, s17 -; GFX8-NEXT: s_sub_i32 s16, s32, s16 +; GFX8-NEXT: s_sub_i32 s17, 0x80000000, s17 +; GFX8-NEXT: s_sub_i32 s16, 0x7fffffff, s16 ; GFX8-NEXT: s_max_i32 s17, s17, s24 ; GFX8-NEXT: s_min_i32 s16, s17, s16 ; GFX8-NEXT: s_min_i32 s17, s9, 0 ; GFX8-NEXT: s_add_i32 s8, s8, s16 ; GFX8-NEXT: s_max_i32 s16, s9, 0 -; GFX8-NEXT: s_sub_i32 s17, s33, s17 -; GFX8-NEXT: s_sub_i32 s16, s32, s16 +; GFX8-NEXT: s_sub_i32 s17, 0x80000000, s17 +; GFX8-NEXT: s_sub_i32 s16, 0x7fffffff, s16 ; GFX8-NEXT: s_max_i32 s17, s17, s25 ; GFX8-NEXT: s_min_i32 s16, s17, s16 ; GFX8-NEXT: s_min_i32 s17, s10, 0 ; GFX8-NEXT: s_add_i32 s9, s9, s16 ; GFX8-NEXT: s_max_i32 s16, s10, 0 -; GFX8-NEXT: s_sub_i32 s17, s33, s17 -; GFX8-NEXT: s_sub_i32 s16, s32, s16 +; GFX8-NEXT: s_sub_i32 s17, 0x80000000, s17 +; GFX8-NEXT: s_sub_i32 s16, 0x7fffffff, s16 ; GFX8-NEXT: s_max_i32 s17, s17, s26 ; GFX8-NEXT: s_min_i32 s16, s17, s16 ; GFX8-NEXT: s_min_i32 s17, s11, 0 ; GFX8-NEXT: s_add_i32 s10, s10, s16 ; GFX8-NEXT: s_max_i32 s16, s11, 0 -; GFX8-NEXT: s_sub_i32 s17, s33, s17 -; GFX8-NEXT: s_sub_i32 s16, s32, s16 +; GFX8-NEXT: s_sub_i32 s17, 0x80000000, s17 +; GFX8-NEXT: s_sub_i32 s16, 0x7fffffff, s16 ; GFX8-NEXT: s_max_i32 s17, s17, s27 ; GFX8-NEXT: s_min_i32 s16, s17, s16 ; GFX8-NEXT: s_min_i32 s17, s12, 0 ; 
GFX8-NEXT: s_add_i32 s11, s11, s16 ; GFX8-NEXT: s_max_i32 s16, s12, 0 -; GFX8-NEXT: s_sub_i32 s17, s33, s17 -; GFX8-NEXT: s_sub_i32 s16, s32, s16 +; GFX8-NEXT: s_sub_i32 s17, 0x80000000, s17 +; GFX8-NEXT: s_sub_i32 s16, 0x7fffffff, s16 ; GFX8-NEXT: s_max_i32 s17, s17, s28 ; GFX8-NEXT: s_min_i32 s16, s17, s16 ; GFX8-NEXT: s_min_i32 s17, s13, 0 ; GFX8-NEXT: s_add_i32 s12, s12, s16 ; GFX8-NEXT: s_max_i32 s16, s13, 0 -; GFX8-NEXT: s_sub_i32 s17, s33, s17 -; GFX8-NEXT: s_sub_i32 s16, s32, s16 +; GFX8-NEXT: s_sub_i32 s17, 0x80000000, s17 +; GFX8-NEXT: s_sub_i32 s16, 0x7fffffff, s16 ; GFX8-NEXT: s_max_i32 s17, s17, s29 ; GFX8-NEXT: s_min_i32 s16, s17, s16 ; GFX8-NEXT: s_min_i32 s17, s14, 0 ; GFX8-NEXT: s_add_i32 s13, s13, s16 ; GFX8-NEXT: s_max_i32 s16, s14, 0 -; GFX8-NEXT: s_sub_i32 s17, s33, s17 -; GFX8-NEXT: s_sub_i32 s16, s32, s16 +; GFX8-NEXT: s_sub_i32 s17, 0x80000000, s17 +; GFX8-NEXT: s_sub_i32 s16, 0x7fffffff, s16 ; GFX8-NEXT: s_max_i32 s17, s17, s30 ; GFX8-NEXT: s_min_i32 s16, s17, s16 ; GFX8-NEXT: s_min_i32 s17, s15, 0 ; GFX8-NEXT: s_add_i32 s14, s14, s16 ; GFX8-NEXT: s_max_i32 s16, s15, 0 -; GFX8-NEXT: s_sub_i32 s17, s33, s17 -; GFX8-NEXT: s_sub_i32 s16, s32, s16 +; GFX8-NEXT: s_sub_i32 s17, 0x80000000, s17 +; GFX8-NEXT: s_sub_i32 s16, 0x7fffffff, s16 ; GFX8-NEXT: s_max_i32 s17, s17, s31 ; GFX8-NEXT: s_min_i32 s16, s17, s16 ; GFX8-NEXT: s_add_i32 s15, s15, s16 @@ -2773,60 +2731,55 @@ ; GFX6-LABEL: s_saddsat_v2i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_lshl_b32 s0, s0, 16 -; GFX6-NEXT: s_brev_b32 s5, 1 -; GFX6-NEXT: s_min_i32 s7, s0, 0 +; GFX6-NEXT: s_min_i32 s5, s0, 0 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_brev_b32 s4, -2 -; GFX6-NEXT: s_max_i32 s6, s0, 0 -; GFX6-NEXT: s_sub_i32 s7, s5, s7 -; GFX6-NEXT: s_sub_i32 s6, s4, s6 -; GFX6-NEXT: s_max_i32 s2, s7, s2 -; GFX6-NEXT: s_min_i32 s2, s2, s6 +; GFX6-NEXT: s_max_i32 s4, s0, 0 +; GFX6-NEXT: s_sub_i32 s5, 0x80000000, s5 +; GFX6-NEXT: s_sub_i32 s4, 0x7fffffff, s4 +; GFX6-NEXT: s_max_i32 s2, s5, s2 ; 
GFX6-NEXT: s_lshl_b32 s1, s1, 16 +; GFX6-NEXT: s_min_i32 s2, s2, s4 +; GFX6-NEXT: s_min_i32 s4, s1, 0 ; GFX6-NEXT: s_add_i32 s0, s0, s2 ; GFX6-NEXT: s_lshl_b32 s2, s3, 16 ; GFX6-NEXT: s_max_i32 s3, s1, 0 -; GFX6-NEXT: s_sub_i32 s3, s4, s3 -; GFX6-NEXT: s_min_i32 s4, s1, 0 -; GFX6-NEXT: s_sub_i32 s4, s5, s4 +; GFX6-NEXT: s_sub_i32 s4, 0x80000000, s4 +; GFX6-NEXT: s_sub_i32 s3, 0x7fffffff, s3 ; GFX6-NEXT: s_max_i32 s2, s4, s2 ; GFX6-NEXT: s_min_i32 s2, s2, s3 ; GFX6-NEXT: s_add_i32 s1, s1, s2 ; GFX6-NEXT: s_ashr_i32 s1, s1, 16 -; GFX6-NEXT: s_mov_b32 s2, 0xffff ; GFX6-NEXT: s_ashr_i32 s0, s0, 16 -; GFX6-NEXT: s_and_b32 s1, s1, s2 -; GFX6-NEXT: s_and_b32 s0, s0, s2 +; GFX6-NEXT: s_and_b32 s1, s1, 0xffff +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_saddsat_v2i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_sext_i32_i16 s6, s0 -; GFX8-NEXT: s_sext_i32_i16 s7, 0 -; GFX8-NEXT: s_movk_i32 s5, 0x8000 -; GFX8-NEXT: s_max_i32 s8, s6, s7 -; GFX8-NEXT: s_min_i32 s6, s6, s7 -; GFX8-NEXT: s_sub_i32 s6, s5, s6 +; GFX8-NEXT: s_sext_i32_i16 s4, s0 +; GFX8-NEXT: s_sext_i32_i16 s5, 0 +; GFX8-NEXT: s_max_i32 s6, s4, s5 +; GFX8-NEXT: s_min_i32 s4, s4, s5 +; GFX8-NEXT: s_sub_i32 s4, 0xffff8000, s4 ; GFX8-NEXT: s_lshr_b32 s3, s1, 16 -; GFX8-NEXT: s_movk_i32 s4, 0x7fff -; GFX8-NEXT: s_sext_i32_i16 s6, s6 +; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_sub_i32 s8, s4, s8 -; GFX8-NEXT: s_max_i32 s1, s6, s1 +; GFX8-NEXT: s_sub_i32 s6, 0x7fff, s6 +; GFX8-NEXT: s_max_i32 s1, s4, s1 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_sext_i32_i16 s6, s8 +; GFX8-NEXT: s_sext_i32_i16 s4, s6 ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 -; GFX8-NEXT: s_min_i32 s1, s1, s6 +; GFX8-NEXT: s_min_i32 s1, s1, s4 ; GFX8-NEXT: s_add_i32 s0, s0, s1 ; GFX8-NEXT: s_sext_i32_i16 s1, s2 -; GFX8-NEXT: s_max_i32 s6, s1, s7 -; GFX8-NEXT: s_min_i32 s1, s1, s7 -; GFX8-NEXT: 
s_sub_i32 s1, s5, s1 +; GFX8-NEXT: s_max_i32 s4, s1, s5 +; GFX8-NEXT: s_min_i32 s1, s1, s5 +; GFX8-NEXT: s_sub_i32 s1, 0xffff8000, s1 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 -; GFX8-NEXT: s_sub_i32 s4, s4, s6 +; GFX8-NEXT: s_sub_i32 s4, 0x7fff, s4 ; GFX8-NEXT: s_max_i32 s1, s1, s3 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 ; GFX8-NEXT: s_sext_i32_i16 s3, s4 @@ -2859,22 +2812,20 @@ ; GFX6-LABEL: saddsat_v2i16_sv: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_lshl_b32 s0, s0, 16 -; GFX6-NEXT: s_brev_b32 s3, 1 -; GFX6-NEXT: s_min_i32 s5, s0, 0 +; GFX6-NEXT: s_min_i32 s3, s0, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: s_brev_b32 s2, -2 -; GFX6-NEXT: s_max_i32 s4, s0, 0 -; GFX6-NEXT: s_sub_i32 s5, s3, s5 -; GFX6-NEXT: s_sub_i32 s4, s2, s4 -; GFX6-NEXT: v_max_i32_e32 v0, s5, v0 -; GFX6-NEXT: v_min_i32_e32 v0, s4, v0 +; GFX6-NEXT: s_max_i32 s2, s0, 0 +; GFX6-NEXT: s_sub_i32 s3, 0x80000000, s3 +; GFX6-NEXT: s_sub_i32 s2, 0x7fffffff, s2 +; GFX6-NEXT: v_max_i32_e32 v0, s3, v0 +; GFX6-NEXT: v_min_i32_e32 v0, s2, v0 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0 ; GFX6-NEXT: s_lshl_b32 s0, s1, 16 -; GFX6-NEXT: s_max_i32 s1, s0, 0 -; GFX6-NEXT: s_sub_i32 s1, s2, s1 ; GFX6-NEXT: s_min_i32 s2, s0, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: s_sub_i32 s2, s3, s2 +; GFX6-NEXT: s_max_i32 s1, s0, 0 +; GFX6-NEXT: s_sub_i32 s2, 0x80000000, s2 +; GFX6-NEXT: s_sub_i32 s1, 0x7fffffff, s1 ; GFX6-NEXT: v_max_i32_e32 v1, s2, v1 ; GFX6-NEXT: v_min_i32_e32 v1, s1, v1 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, s0, v1 @@ -2889,25 +2840,23 @@ ; ; GFX8-LABEL: saddsat_v2i16_sv: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_sext_i32_i16 s4, s0 -; GFX8-NEXT: s_sext_i32_i16 s5, 0 -; GFX8-NEXT: s_movk_i32 s3, 0x8000 -; GFX8-NEXT: s_max_i32 s6, s4, s5 -; GFX8-NEXT: s_min_i32 s4, s4, s5 +; GFX8-NEXT: s_sext_i32_i16 s2, s0 +; GFX8-NEXT: s_sext_i32_i16 s3, 0 +; GFX8-NEXT: s_max_i32 s4, s2, s3 +; GFX8-NEXT: s_min_i32 s2, s2, s3 ; GFX8-NEXT: s_lshr_b32 s1, s0, 16 -; GFX8-NEXT: s_movk_i32 s2, 
0x7fff -; GFX8-NEXT: s_sub_i32 s4, s3, s4 -; GFX8-NEXT: s_sub_i32 s6, s2, s6 -; GFX8-NEXT: v_max_i16_e32 v1, s4, v0 -; GFX8-NEXT: s_sext_i32_i16 s4, s1 -; GFX8-NEXT: v_min_i16_e32 v1, s6, v1 -; GFX8-NEXT: s_max_i32 s6, s4, s5 -; GFX8-NEXT: s_min_i32 s4, s4, s5 -; GFX8-NEXT: s_sub_i32 s3, s3, s4 -; GFX8-NEXT: v_mov_b32_e32 v2, s3 -; GFX8-NEXT: s_sub_i32 s2, s2, s6 +; GFX8-NEXT: s_sub_i32 s2, 0xffff8000, s2 +; GFX8-NEXT: s_sub_i32 s4, 0x7fff, s4 +; GFX8-NEXT: v_max_i16_e32 v1, s2, v0 +; GFX8-NEXT: s_sext_i32_i16 s2, s1 +; GFX8-NEXT: v_min_i16_e32 v1, s4, v1 +; GFX8-NEXT: s_max_i32 s4, s2, s3 +; GFX8-NEXT: s_min_i32 s2, s2, s3 +; GFX8-NEXT: s_sub_i32 s2, 0xffff8000, s2 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: s_sub_i32 s4, 0x7fff, s4 ; GFX8-NEXT: v_max_i16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_min_i16_e32 v0, s2, v0 +; GFX8-NEXT: v_min_i16_e32 v0, s4, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, s1 ; GFX8-NEXT: v_add_u16_e32 v1, s0, v1 ; GFX8-NEXT: v_add_u16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -3129,31 +3078,29 @@ ; GFX6-LABEL: s_saddsat_v4i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_lshl_b32 s0, s0, 16 -; GFX6-NEXT: s_brev_b32 s9, 1 -; GFX6-NEXT: s_min_i32 s11, s0, 0 +; GFX6-NEXT: s_min_i32 s9, s0, 0 ; GFX6-NEXT: s_lshl_b32 s4, s4, 16 -; GFX6-NEXT: s_brev_b32 s8, -2 -; GFX6-NEXT: s_max_i32 s10, s0, 0 -; GFX6-NEXT: s_sub_i32 s11, s9, s11 -; GFX6-NEXT: s_sub_i32 s10, s8, s10 -; GFX6-NEXT: s_max_i32 s4, s11, s4 +; GFX6-NEXT: s_max_i32 s8, s0, 0 +; GFX6-NEXT: s_sub_i32 s9, 0x80000000, s9 +; GFX6-NEXT: s_sub_i32 s8, 0x7fffffff, s8 +; GFX6-NEXT: s_max_i32 s4, s9, s4 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 -; GFX6-NEXT: s_min_i32 s4, s4, s10 -; GFX6-NEXT: s_min_i32 s10, s1, 0 +; GFX6-NEXT: s_min_i32 s4, s4, s8 +; GFX6-NEXT: s_min_i32 s8, s1, 0 ; GFX6-NEXT: s_add_i32 s0, s0, s4 ; GFX6-NEXT: s_lshl_b32 s4, s5, 16 ; GFX6-NEXT: s_max_i32 s5, s1, 0 -; GFX6-NEXT: s_sub_i32 s10, s9, s10 
-; GFX6-NEXT: s_sub_i32 s5, s8, s5 -; GFX6-NEXT: s_max_i32 s4, s10, s4 +; GFX6-NEXT: s_sub_i32 s8, 0x80000000, s8 +; GFX6-NEXT: s_sub_i32 s5, 0x7fffffff, s5 +; GFX6-NEXT: s_max_i32 s4, s8, s4 ; GFX6-NEXT: s_min_i32 s4, s4, s5 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 ; GFX6-NEXT: s_add_i32 s1, s1, s4 ; GFX6-NEXT: s_lshl_b32 s4, s6, 16 ; GFX6-NEXT: s_min_i32 s6, s2, 0 ; GFX6-NEXT: s_max_i32 s5, s2, 0 -; GFX6-NEXT: s_sub_i32 s6, s9, s6 -; GFX6-NEXT: s_sub_i32 s5, s8, s5 +; GFX6-NEXT: s_sub_i32 s6, 0x80000000, s6 +; GFX6-NEXT: s_sub_i32 s5, 0x7fffffff, s5 ; GFX6-NEXT: s_max_i32 s4, s6, s4 ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 ; GFX6-NEXT: s_min_i32 s4, s4, s5 @@ -3161,65 +3108,62 @@ ; GFX6-NEXT: s_add_i32 s2, s2, s4 ; GFX6-NEXT: s_lshl_b32 s4, s7, 16 ; GFX6-NEXT: s_max_i32 s5, s3, 0 -; GFX6-NEXT: s_sub_i32 s6, s9, s6 -; GFX6-NEXT: s_sub_i32 s5, s8, s5 +; GFX6-NEXT: s_sub_i32 s6, 0x80000000, s6 +; GFX6-NEXT: s_sub_i32 s5, 0x7fffffff, s5 ; GFX6-NEXT: s_max_i32 s4, s6, s4 -; GFX6-NEXT: s_min_i32 s4, s4, s5 ; GFX6-NEXT: s_ashr_i32 s1, s1, 16 -; GFX6-NEXT: s_add_i32 s3, s3, s4 -; GFX6-NEXT: s_mov_b32 s4, 0xffff +; GFX6-NEXT: s_min_i32 s4, s4, s5 ; GFX6-NEXT: s_ashr_i32 s0, s0, 16 -; GFX6-NEXT: s_and_b32 s1, s1, s4 +; GFX6-NEXT: s_add_i32 s3, s3, s4 +; GFX6-NEXT: s_and_b32 s1, s1, 0xffff ; GFX6-NEXT: s_ashr_i32 s2, s2, 16 ; GFX6-NEXT: s_ashr_i32 s3, s3, 16 -; GFX6-NEXT: s_and_b32 s0, s0, s4 +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s2, s4 -; GFX6-NEXT: s_and_b32 s2, s3, s4 +; GFX6-NEXT: s_and_b32 s1, s2, 0xffff +; GFX6-NEXT: s_and_b32 s2, s3, 0xffff ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 ; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_saddsat_v4i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_sext_i32_i16 s10, s0 -; GFX8-NEXT: s_sext_i32_i16 s11, 0 -; GFX8-NEXT: s_movk_i32 s9, 0x8000 -; GFX8-NEXT: s_max_i32 s12, s10, s11 -; GFX8-NEXT: s_min_i32 s10, s10, s11 -; 
GFX8-NEXT: s_sub_i32 s10, s9, s10 +; GFX8-NEXT: s_sext_i32_i16 s8, s0 +; GFX8-NEXT: s_sext_i32_i16 s9, 0 +; GFX8-NEXT: s_max_i32 s10, s8, s9 +; GFX8-NEXT: s_min_i32 s8, s8, s9 +; GFX8-NEXT: s_sub_i32 s8, 0xffff8000, s8 ; GFX8-NEXT: s_lshr_b32 s6, s2, 16 -; GFX8-NEXT: s_movk_i32 s8, 0x7fff -; GFX8-NEXT: s_sext_i32_i16 s10, s10 +; GFX8-NEXT: s_sext_i32_i16 s8, s8 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 -; GFX8-NEXT: s_sub_i32 s12, s8, s12 -; GFX8-NEXT: s_max_i32 s2, s10, s2 +; GFX8-NEXT: s_sub_i32 s10, 0x7fff, s10 +; GFX8-NEXT: s_max_i32 s2, s8, s2 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 -; GFX8-NEXT: s_sext_i32_i16 s10, s12 +; GFX8-NEXT: s_sext_i32_i16 s8, s10 ; GFX8-NEXT: s_lshr_b32 s4, s0, 16 -; GFX8-NEXT: s_min_i32 s2, s2, s10 +; GFX8-NEXT: s_min_i32 s2, s2, s8 ; GFX8-NEXT: s_add_i32 s0, s0, s2 ; GFX8-NEXT: s_sext_i32_i16 s2, s4 -; GFX8-NEXT: s_max_i32 s10, s2, s11 -; GFX8-NEXT: s_min_i32 s2, s2, s11 -; GFX8-NEXT: s_sub_i32 s2, s9, s2 +; GFX8-NEXT: s_max_i32 s8, s2, s9 +; GFX8-NEXT: s_min_i32 s2, s2, s9 +; GFX8-NEXT: s_sub_i32 s2, 0xffff8000, s2 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 -; GFX8-NEXT: s_sub_i32 s10, s8, s10 +; GFX8-NEXT: s_sub_i32 s8, 0x7fff, s8 ; GFX8-NEXT: s_max_i32 s2, s2, s6 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 -; GFX8-NEXT: s_sext_i32_i16 s6, s10 +; GFX8-NEXT: s_sext_i32_i16 s6, s8 ; GFX8-NEXT: s_min_i32 s2, s2, s6 ; GFX8-NEXT: s_add_i32 s4, s4, s2 ; GFX8-NEXT: s_sext_i32_i16 s2, s1 -; GFX8-NEXT: s_max_i32 s6, s2, s11 -; GFX8-NEXT: s_min_i32 s2, s2, s11 -; GFX8-NEXT: s_sub_i32 s2, s9, s2 +; GFX8-NEXT: s_max_i32 s6, s2, s9 +; GFX8-NEXT: s_min_i32 s2, s2, s9 +; GFX8-NEXT: s_sub_i32 s2, 0xffff8000, s2 ; GFX8-NEXT: s_lshr_b32 s7, s3, 16 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 -; GFX8-NEXT: s_sub_i32 s6, s8, s6 +; GFX8-NEXT: s_sub_i32 s6, 0x7fff, s6 ; GFX8-NEXT: s_max_i32 s2, s2, s3 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 ; GFX8-NEXT: s_sext_i32_i16 s3, s6 @@ -3227,12 +3171,12 @@ ; GFX8-NEXT: s_min_i32 s2, 
s2, s3 ; GFX8-NEXT: s_add_i32 s1, s1, s2 ; GFX8-NEXT: s_sext_i32_i16 s2, s5 -; GFX8-NEXT: s_max_i32 s3, s2, s11 -; GFX8-NEXT: s_min_i32 s2, s2, s11 -; GFX8-NEXT: s_sub_i32 s2, s9, s2 +; GFX8-NEXT: s_max_i32 s3, s2, s9 +; GFX8-NEXT: s_min_i32 s2, s2, s9 +; GFX8-NEXT: s_sub_i32 s2, 0xffff8000, s2 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 ; GFX8-NEXT: s_sext_i32_i16 s6, s7 -; GFX8-NEXT: s_sub_i32 s3, s8, s3 +; GFX8-NEXT: s_sub_i32 s3, 0x7fff, s3 ; GFX8-NEXT: s_max_i32 s2, s2, s6 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 @@ -3446,31 +3390,29 @@ ; GFX6-LABEL: s_saddsat_v6i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_lshl_b32 s0, s0, 16 -; GFX6-NEXT: s_brev_b32 s13, 1 -; GFX6-NEXT: s_min_i32 s15, s0, 0 +; GFX6-NEXT: s_min_i32 s13, s0, 0 ; GFX6-NEXT: s_lshl_b32 s6, s6, 16 -; GFX6-NEXT: s_brev_b32 s12, -2 -; GFX6-NEXT: s_max_i32 s14, s0, 0 -; GFX6-NEXT: s_sub_i32 s15, s13, s15 -; GFX6-NEXT: s_sub_i32 s14, s12, s14 -; GFX6-NEXT: s_max_i32 s6, s15, s6 +; GFX6-NEXT: s_max_i32 s12, s0, 0 +; GFX6-NEXT: s_sub_i32 s13, 0x80000000, s13 +; GFX6-NEXT: s_sub_i32 s12, 0x7fffffff, s12 +; GFX6-NEXT: s_max_i32 s6, s13, s6 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 -; GFX6-NEXT: s_min_i32 s6, s6, s14 -; GFX6-NEXT: s_min_i32 s14, s1, 0 +; GFX6-NEXT: s_min_i32 s6, s6, s12 +; GFX6-NEXT: s_min_i32 s12, s1, 0 ; GFX6-NEXT: s_add_i32 s0, s0, s6 ; GFX6-NEXT: s_lshl_b32 s6, s7, 16 ; GFX6-NEXT: s_max_i32 s7, s1, 0 -; GFX6-NEXT: s_sub_i32 s14, s13, s14 -; GFX6-NEXT: s_sub_i32 s7, s12, s7 -; GFX6-NEXT: s_max_i32 s6, s14, s6 +; GFX6-NEXT: s_sub_i32 s12, 0x80000000, s12 +; GFX6-NEXT: s_sub_i32 s7, 0x7fffffff, s7 +; GFX6-NEXT: s_max_i32 s6, s12, s6 ; GFX6-NEXT: s_min_i32 s6, s6, s7 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 ; GFX6-NEXT: s_add_i32 s1, s1, s6 ; GFX6-NEXT: s_lshl_b32 s6, s8, 16 ; GFX6-NEXT: s_min_i32 s8, s2, 0 ; GFX6-NEXT: s_max_i32 s7, s2, 0 -; GFX6-NEXT: s_sub_i32 s8, s13, s8 -; GFX6-NEXT: s_sub_i32 s7, s12, s7 +; GFX6-NEXT: s_sub_i32 s8, 0x80000000, s8 +; GFX6-NEXT: s_sub_i32 s7, 
0x7fffffff, s7 ; GFX6-NEXT: s_max_i32 s6, s8, s6 ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 ; GFX6-NEXT: s_min_i32 s6, s6, s7 @@ -3478,8 +3420,8 @@ ; GFX6-NEXT: s_add_i32 s2, s2, s6 ; GFX6-NEXT: s_lshl_b32 s6, s9, 16 ; GFX6-NEXT: s_max_i32 s7, s3, 0 -; GFX6-NEXT: s_sub_i32 s8, s13, s8 -; GFX6-NEXT: s_sub_i32 s7, s12, s7 +; GFX6-NEXT: s_sub_i32 s8, 0x80000000, s8 +; GFX6-NEXT: s_sub_i32 s7, 0x7fffffff, s7 ; GFX6-NEXT: s_max_i32 s6, s8, s6 ; GFX6-NEXT: s_lshl_b32 s4, s4, 16 ; GFX6-NEXT: s_min_i32 s6, s6, s7 @@ -3487,8 +3429,8 @@ ; GFX6-NEXT: s_add_i32 s3, s3, s6 ; GFX6-NEXT: s_lshl_b32 s6, s10, 16 ; GFX6-NEXT: s_max_i32 s7, s4, 0 -; GFX6-NEXT: s_sub_i32 s8, s13, s8 -; GFX6-NEXT: s_sub_i32 s7, s12, s7 +; GFX6-NEXT: s_sub_i32 s8, 0x80000000, s8 +; GFX6-NEXT: s_sub_i32 s7, 0x7fffffff, s7 ; GFX6-NEXT: s_max_i32 s6, s8, s6 ; GFX6-NEXT: s_lshl_b32 s5, s5, 16 ; GFX6-NEXT: s_min_i32 s6, s6, s7 @@ -3496,71 +3438,68 @@ ; GFX6-NEXT: s_add_i32 s4, s4, s6 ; GFX6-NEXT: s_lshl_b32 s6, s11, 16 ; GFX6-NEXT: s_max_i32 s7, s5, 0 -; GFX6-NEXT: s_sub_i32 s8, s13, s8 -; GFX6-NEXT: s_sub_i32 s7, s12, s7 -; GFX6-NEXT: s_max_i32 s6, s8, s6 -; GFX6-NEXT: s_min_i32 s6, s6, s7 +; GFX6-NEXT: s_sub_i32 s8, 0x80000000, s8 ; GFX6-NEXT: s_ashr_i32 s1, s1, 16 -; GFX6-NEXT: s_add_i32 s5, s5, s6 -; GFX6-NEXT: s_mov_b32 s6, 0xffff +; GFX6-NEXT: s_sub_i32 s7, 0x7fffffff, s7 +; GFX6-NEXT: s_max_i32 s6, s8, s6 ; GFX6-NEXT: s_ashr_i32 s0, s0, 16 -; GFX6-NEXT: s_and_b32 s1, s1, s6 +; GFX6-NEXT: s_min_i32 s6, s6, s7 +; GFX6-NEXT: s_and_b32 s1, s1, 0xffff ; GFX6-NEXT: s_ashr_i32 s2, s2, 16 ; GFX6-NEXT: s_ashr_i32 s3, s3, 16 -; GFX6-NEXT: s_and_b32 s0, s0, s6 +; GFX6-NEXT: s_add_i32 s5, s5, s6 +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_ashr_i32 s5, s5, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s2, s6 -; GFX6-NEXT: s_and_b32 s2, s3, s6 +; GFX6-NEXT: s_and_b32 s1, s2, 0xffff +; GFX6-NEXT: s_and_b32 s2, s3, 0xffff ; GFX6-NEXT: s_ashr_i32 s4, s4, 16 ; 
GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_and_b32 s3, s5, s6 +; GFX6-NEXT: s_and_b32 s3, s5, 0xffff ; GFX6-NEXT: s_or_b32 s1, s1, s2 -; GFX6-NEXT: s_and_b32 s2, s4, s6 +; GFX6-NEXT: s_and_b32 s2, s4, 0xffff ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 ; GFX6-NEXT: s_or_b32 s2, s2, s3 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_saddsat_v6i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_sext_i32_i16 s14, s0 -; GFX8-NEXT: s_sext_i32_i16 s15, 0 -; GFX8-NEXT: s_movk_i32 s13, 0x8000 -; GFX8-NEXT: s_max_i32 s16, s14, s15 -; GFX8-NEXT: s_min_i32 s14, s14, s15 -; GFX8-NEXT: s_sub_i32 s14, s13, s14 +; GFX8-NEXT: s_sext_i32_i16 s12, s0 +; GFX8-NEXT: s_sext_i32_i16 s13, 0 +; GFX8-NEXT: s_max_i32 s14, s12, s13 +; GFX8-NEXT: s_min_i32 s12, s12, s13 +; GFX8-NEXT: s_sub_i32 s12, 0xffff8000, s12 ; GFX8-NEXT: s_lshr_b32 s9, s3, 16 -; GFX8-NEXT: s_movk_i32 s12, 0x7fff -; GFX8-NEXT: s_sext_i32_i16 s14, s14 +; GFX8-NEXT: s_sext_i32_i16 s12, s12 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 -; GFX8-NEXT: s_sub_i32 s16, s12, s16 -; GFX8-NEXT: s_max_i32 s3, s14, s3 +; GFX8-NEXT: s_sub_i32 s14, 0x7fff, s14 +; GFX8-NEXT: s_max_i32 s3, s12, s3 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 -; GFX8-NEXT: s_sext_i32_i16 s14, s16 +; GFX8-NEXT: s_sext_i32_i16 s12, s14 ; GFX8-NEXT: s_lshr_b32 s6, s0, 16 -; GFX8-NEXT: s_min_i32 s3, s3, s14 +; GFX8-NEXT: s_min_i32 s3, s3, s12 ; GFX8-NEXT: s_add_i32 s0, s0, s3 ; GFX8-NEXT: s_sext_i32_i16 s3, s6 -; GFX8-NEXT: s_max_i32 s14, s3, s15 -; GFX8-NEXT: s_min_i32 s3, s3, s15 -; GFX8-NEXT: s_sub_i32 s3, s13, s3 +; GFX8-NEXT: s_max_i32 s12, s3, s13 +; GFX8-NEXT: s_min_i32 s3, s3, s13 +; GFX8-NEXT: s_sub_i32 s3, 0xffff8000, s3 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_sext_i32_i16 s9, s9 -; GFX8-NEXT: s_sub_i32 s14, s12, s14 +; GFX8-NEXT: s_sub_i32 s12, 0x7fff, s12 ; GFX8-NEXT: s_max_i32 s3, s3, s9 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 -; GFX8-NEXT: s_sext_i32_i16 s9, s14 +; GFX8-NEXT: s_sext_i32_i16 s9, s12 ; GFX8-NEXT: s_min_i32 s3, s3, s9 ; GFX8-NEXT: s_add_i32 s6, s6, s3 
; GFX8-NEXT: s_sext_i32_i16 s3, s1 -; GFX8-NEXT: s_max_i32 s9, s3, s15 -; GFX8-NEXT: s_min_i32 s3, s3, s15 -; GFX8-NEXT: s_sub_i32 s3, s13, s3 +; GFX8-NEXT: s_max_i32 s9, s3, s13 +; GFX8-NEXT: s_min_i32 s3, s3, s13 +; GFX8-NEXT: s_sub_i32 s3, 0xffff8000, s3 ; GFX8-NEXT: s_lshr_b32 s10, s4, 16 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 -; GFX8-NEXT: s_sub_i32 s9, s12, s9 +; GFX8-NEXT: s_sub_i32 s9, 0x7fff, s9 ; GFX8-NEXT: s_max_i32 s3, s3, s4 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_sext_i32_i16 s4, s9 @@ -3568,25 +3507,25 @@ ; GFX8-NEXT: s_min_i32 s3, s3, s4 ; GFX8-NEXT: s_add_i32 s1, s1, s3 ; GFX8-NEXT: s_sext_i32_i16 s3, s7 -; GFX8-NEXT: s_max_i32 s4, s3, s15 -; GFX8-NEXT: s_min_i32 s3, s3, s15 -; GFX8-NEXT: s_sub_i32 s3, s13, s3 +; GFX8-NEXT: s_max_i32 s4, s3, s13 +; GFX8-NEXT: s_min_i32 s3, s3, s13 +; GFX8-NEXT: s_sub_i32 s3, 0xffff8000, s3 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_sext_i32_i16 s9, s10 -; GFX8-NEXT: s_sub_i32 s4, s12, s4 +; GFX8-NEXT: s_sub_i32 s4, 0x7fff, s4 ; GFX8-NEXT: s_max_i32 s3, s3, s9 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_min_i32 s3, s3, s4 ; GFX8-NEXT: s_add_i32 s7, s7, s3 ; GFX8-NEXT: s_sext_i32_i16 s3, s2 -; GFX8-NEXT: s_max_i32 s4, s3, s15 -; GFX8-NEXT: s_min_i32 s3, s3, s15 -; GFX8-NEXT: s_sub_i32 s3, s13, s3 +; GFX8-NEXT: s_max_i32 s4, s3, s13 +; GFX8-NEXT: s_min_i32 s3, s3, s13 +; GFX8-NEXT: s_sub_i32 s3, 0xffff8000, s3 ; GFX8-NEXT: s_lshr_b32 s11, s5, 16 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 -; GFX8-NEXT: s_sub_i32 s4, s12, s4 +; GFX8-NEXT: s_sub_i32 s4, 0x7fff, s4 ; GFX8-NEXT: s_max_i32 s3, s3, s5 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 @@ -3594,12 +3533,12 @@ ; GFX8-NEXT: s_min_i32 s3, s3, s4 ; GFX8-NEXT: s_add_i32 s2, s2, s3 ; GFX8-NEXT: s_sext_i32_i16 s3, s8 -; GFX8-NEXT: s_max_i32 s4, s3, s15 -; GFX8-NEXT: s_min_i32 s3, s3, s15 -; GFX8-NEXT: s_sub_i32 s3, s13, s3 +; 
GFX8-NEXT: s_max_i32 s4, s3, s13 +; GFX8-NEXT: s_min_i32 s3, s3, s13 +; GFX8-NEXT: s_sub_i32 s3, 0xffff8000, s3 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_sext_i32_i16 s5, s11 -; GFX8-NEXT: s_sub_i32 s4, s12, s4 +; GFX8-NEXT: s_sub_i32 s4, 0x7fff, s4 ; GFX8-NEXT: s_max_i32 s3, s3, s5 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 @@ -3853,31 +3792,29 @@ ; GFX6-LABEL: s_saddsat_v8i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_lshl_b32 s0, s0, 16 -; GFX6-NEXT: s_brev_b32 s17, 1 -; GFX6-NEXT: s_min_i32 s19, s0, 0 +; GFX6-NEXT: s_min_i32 s17, s0, 0 ; GFX6-NEXT: s_lshl_b32 s8, s8, 16 -; GFX6-NEXT: s_brev_b32 s16, -2 -; GFX6-NEXT: s_max_i32 s18, s0, 0 -; GFX6-NEXT: s_sub_i32 s19, s17, s19 -; GFX6-NEXT: s_sub_i32 s18, s16, s18 -; GFX6-NEXT: s_max_i32 s8, s19, s8 +; GFX6-NEXT: s_max_i32 s16, s0, 0 +; GFX6-NEXT: s_sub_i32 s17, 0x80000000, s17 +; GFX6-NEXT: s_sub_i32 s16, 0x7fffffff, s16 +; GFX6-NEXT: s_max_i32 s8, s17, s8 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 -; GFX6-NEXT: s_min_i32 s8, s8, s18 -; GFX6-NEXT: s_min_i32 s18, s1, 0 +; GFX6-NEXT: s_min_i32 s8, s8, s16 +; GFX6-NEXT: s_min_i32 s16, s1, 0 ; GFX6-NEXT: s_add_i32 s0, s0, s8 ; GFX6-NEXT: s_lshl_b32 s8, s9, 16 ; GFX6-NEXT: s_max_i32 s9, s1, 0 -; GFX6-NEXT: s_sub_i32 s18, s17, s18 -; GFX6-NEXT: s_sub_i32 s9, s16, s9 -; GFX6-NEXT: s_max_i32 s8, s18, s8 +; GFX6-NEXT: s_sub_i32 s16, 0x80000000, s16 +; GFX6-NEXT: s_sub_i32 s9, 0x7fffffff, s9 +; GFX6-NEXT: s_max_i32 s8, s16, s8 ; GFX6-NEXT: s_min_i32 s8, s8, s9 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 ; GFX6-NEXT: s_add_i32 s1, s1, s8 ; GFX6-NEXT: s_lshl_b32 s8, s10, 16 ; GFX6-NEXT: s_min_i32 s10, s2, 0 ; GFX6-NEXT: s_max_i32 s9, s2, 0 -; GFX6-NEXT: s_sub_i32 s10, s17, s10 -; GFX6-NEXT: s_sub_i32 s9, s16, s9 +; GFX6-NEXT: s_sub_i32 s10, 0x80000000, s10 +; GFX6-NEXT: s_sub_i32 s9, 0x7fffffff, s9 ; GFX6-NEXT: s_max_i32 s8, s10, s8 ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 ; GFX6-NEXT: s_min_i32 s8, s8, s9 @@ -3885,8 +3822,8 @@ ; GFX6-NEXT: s_add_i32 s2, s2, s8 ; 
GFX6-NEXT: s_lshl_b32 s8, s11, 16 ; GFX6-NEXT: s_max_i32 s9, s3, 0 -; GFX6-NEXT: s_sub_i32 s10, s17, s10 -; GFX6-NEXT: s_sub_i32 s9, s16, s9 +; GFX6-NEXT: s_sub_i32 s10, 0x80000000, s10 +; GFX6-NEXT: s_sub_i32 s9, 0x7fffffff, s9 ; GFX6-NEXT: s_max_i32 s8, s10, s8 ; GFX6-NEXT: s_lshl_b32 s4, s4, 16 ; GFX6-NEXT: s_min_i32 s8, s8, s9 @@ -3894,8 +3831,8 @@ ; GFX6-NEXT: s_add_i32 s3, s3, s8 ; GFX6-NEXT: s_lshl_b32 s8, s12, 16 ; GFX6-NEXT: s_max_i32 s9, s4, 0 -; GFX6-NEXT: s_sub_i32 s10, s17, s10 -; GFX6-NEXT: s_sub_i32 s9, s16, s9 +; GFX6-NEXT: s_sub_i32 s10, 0x80000000, s10 +; GFX6-NEXT: s_sub_i32 s9, 0x7fffffff, s9 ; GFX6-NEXT: s_max_i32 s8, s10, s8 ; GFX6-NEXT: s_lshl_b32 s5, s5, 16 ; GFX6-NEXT: s_min_i32 s8, s8, s9 @@ -3903,8 +3840,8 @@ ; GFX6-NEXT: s_add_i32 s4, s4, s8 ; GFX6-NEXT: s_lshl_b32 s8, s13, 16 ; GFX6-NEXT: s_max_i32 s9, s5, 0 -; GFX6-NEXT: s_sub_i32 s10, s17, s10 -; GFX6-NEXT: s_sub_i32 s9, s16, s9 +; GFX6-NEXT: s_sub_i32 s10, 0x80000000, s10 +; GFX6-NEXT: s_sub_i32 s9, 0x7fffffff, s9 ; GFX6-NEXT: s_max_i32 s8, s10, s8 ; GFX6-NEXT: s_lshl_b32 s6, s6, 16 ; GFX6-NEXT: s_min_i32 s8, s8, s9 @@ -3912,86 +3849,83 @@ ; GFX6-NEXT: s_add_i32 s5, s5, s8 ; GFX6-NEXT: s_lshl_b32 s8, s14, 16 ; GFX6-NEXT: s_max_i32 s9, s6, 0 -; GFX6-NEXT: s_sub_i32 s10, s17, s10 -; GFX6-NEXT: s_sub_i32 s9, s16, s9 +; GFX6-NEXT: s_sub_i32 s10, 0x80000000, s10 +; GFX6-NEXT: s_sub_i32 s9, 0x7fffffff, s9 ; GFX6-NEXT: s_max_i32 s8, s10, s8 ; GFX6-NEXT: s_lshl_b32 s7, s7, 16 ; GFX6-NEXT: s_min_i32 s8, s8, s9 ; GFX6-NEXT: s_min_i32 s10, s7, 0 +; GFX6-NEXT: s_ashr_i32 s1, s1, 16 ; GFX6-NEXT: s_add_i32 s6, s6, s8 ; GFX6-NEXT: s_lshl_b32 s8, s15, 16 ; GFX6-NEXT: s_max_i32 s9, s7, 0 -; GFX6-NEXT: s_sub_i32 s10, s17, s10 -; GFX6-NEXT: s_sub_i32 s9, s16, s9 -; GFX6-NEXT: s_max_i32 s8, s10, s8 -; GFX6-NEXT: s_min_i32 s8, s8, s9 -; GFX6-NEXT: s_ashr_i32 s1, s1, 16 -; GFX6-NEXT: s_add_i32 s7, s7, s8 -; GFX6-NEXT: s_mov_b32 s8, 0xffff +; GFX6-NEXT: s_sub_i32 s10, 0x80000000, s10 ; GFX6-NEXT: 
s_ashr_i32 s0, s0, 16 -; GFX6-NEXT: s_and_b32 s1, s1, s8 +; GFX6-NEXT: s_sub_i32 s9, 0x7fffffff, s9 +; GFX6-NEXT: s_max_i32 s8, s10, s8 +; GFX6-NEXT: s_and_b32 s1, s1, 0xffff ; GFX6-NEXT: s_ashr_i32 s2, s2, 16 ; GFX6-NEXT: s_ashr_i32 s3, s3, 16 -; GFX6-NEXT: s_and_b32 s0, s0, s8 +; GFX6-NEXT: s_min_i32 s8, s8, s9 +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_ashr_i32 s5, s5, 16 +; GFX6-NEXT: s_add_i32 s7, s7, s8 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s2, s8 -; GFX6-NEXT: s_and_b32 s2, s3, s8 +; GFX6-NEXT: s_and_b32 s1, s2, 0xffff +; GFX6-NEXT: s_and_b32 s2, s3, 0xffff ; GFX6-NEXT: s_ashr_i32 s4, s4, 16 ; GFX6-NEXT: s_ashr_i32 s7, s7, 16 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_and_b32 s3, s5, s8 +; GFX6-NEXT: s_and_b32 s3, s5, 0xffff ; GFX6-NEXT: s_ashr_i32 s6, s6, 16 ; GFX6-NEXT: s_or_b32 s1, s1, s2 -; GFX6-NEXT: s_and_b32 s2, s4, s8 +; GFX6-NEXT: s_and_b32 s2, s4, 0xffff ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 -; GFX6-NEXT: s_and_b32 s4, s7, s8 +; GFX6-NEXT: s_and_b32 s4, s7, 0xffff ; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_and_b32 s3, s6, s8 +; GFX6-NEXT: s_and_b32 s3, s6, 0xffff ; GFX6-NEXT: s_lshl_b32 s4, s4, 16 ; GFX6-NEXT: s_or_b32 s3, s3, s4 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_saddsat_v8i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_sext_i32_i16 s18, s0 -; GFX8-NEXT: s_sext_i32_i16 s19, 0 -; GFX8-NEXT: s_movk_i32 s17, 0x8000 -; GFX8-NEXT: s_max_i32 s20, s18, s19 -; GFX8-NEXT: s_min_i32 s18, s18, s19 -; GFX8-NEXT: s_sub_i32 s18, s17, s18 +; GFX8-NEXT: s_sext_i32_i16 s16, s0 +; GFX8-NEXT: s_sext_i32_i16 s17, 0 +; GFX8-NEXT: s_max_i32 s18, s16, s17 +; GFX8-NEXT: s_min_i32 s16, s16, s17 +; GFX8-NEXT: s_sub_i32 s16, 0xffff8000, s16 ; GFX8-NEXT: s_lshr_b32 s12, s4, 16 -; GFX8-NEXT: s_movk_i32 s16, 0x7fff -; GFX8-NEXT: s_sext_i32_i16 s18, s18 +; GFX8-NEXT: s_sext_i32_i16 s16, s16 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 -; GFX8-NEXT: s_sub_i32 s20, s16, s20 -; GFX8-NEXT: 
s_max_i32 s4, s18, s4 +; GFX8-NEXT: s_sub_i32 s18, 0x7fff, s18 +; GFX8-NEXT: s_max_i32 s4, s16, s4 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 -; GFX8-NEXT: s_sext_i32_i16 s18, s20 +; GFX8-NEXT: s_sext_i32_i16 s16, s18 ; GFX8-NEXT: s_lshr_b32 s8, s0, 16 -; GFX8-NEXT: s_min_i32 s4, s4, s18 +; GFX8-NEXT: s_min_i32 s4, s4, s16 ; GFX8-NEXT: s_add_i32 s0, s0, s4 ; GFX8-NEXT: s_sext_i32_i16 s4, s8 -; GFX8-NEXT: s_max_i32 s18, s4, s19 -; GFX8-NEXT: s_min_i32 s4, s4, s19 -; GFX8-NEXT: s_sub_i32 s4, s17, s4 +; GFX8-NEXT: s_max_i32 s16, s4, s17 +; GFX8-NEXT: s_min_i32 s4, s4, s17 +; GFX8-NEXT: s_sub_i32 s4, 0xffff8000, s4 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s12, s12 -; GFX8-NEXT: s_sub_i32 s18, s16, s18 +; GFX8-NEXT: s_sub_i32 s16, 0x7fff, s16 ; GFX8-NEXT: s_max_i32 s4, s4, s12 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 -; GFX8-NEXT: s_sext_i32_i16 s12, s18 +; GFX8-NEXT: s_sext_i32_i16 s12, s16 ; GFX8-NEXT: s_min_i32 s4, s4, s12 ; GFX8-NEXT: s_add_i32 s8, s8, s4 ; GFX8-NEXT: s_sext_i32_i16 s4, s1 -; GFX8-NEXT: s_max_i32 s12, s4, s19 -; GFX8-NEXT: s_min_i32 s4, s4, s19 -; GFX8-NEXT: s_sub_i32 s4, s17, s4 +; GFX8-NEXT: s_max_i32 s12, s4, s17 +; GFX8-NEXT: s_min_i32 s4, s4, s17 +; GFX8-NEXT: s_sub_i32 s4, 0xffff8000, s4 ; GFX8-NEXT: s_lshr_b32 s13, s5, 16 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 -; GFX8-NEXT: s_sub_i32 s12, s16, s12 +; GFX8-NEXT: s_sub_i32 s12, 0x7fff, s12 ; GFX8-NEXT: s_max_i32 s4, s4, s5 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s5, s12 @@ -3999,25 +3933,25 @@ ; GFX8-NEXT: s_min_i32 s4, s4, s5 ; GFX8-NEXT: s_add_i32 s1, s1, s4 ; GFX8-NEXT: s_sext_i32_i16 s4, s9 -; GFX8-NEXT: s_max_i32 s5, s4, s19 -; GFX8-NEXT: s_min_i32 s4, s4, s19 -; GFX8-NEXT: s_sub_i32 s4, s17, s4 +; GFX8-NEXT: s_max_i32 s5, s4, s17 +; GFX8-NEXT: s_min_i32 s4, s4, s17 +; GFX8-NEXT: s_sub_i32 s4, 0xffff8000, s4 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s12, s13 -; GFX8-NEXT: s_sub_i32 s5, s16, s5 +; GFX8-NEXT: 
s_sub_i32 s5, 0x7fff, s5 ; GFX8-NEXT: s_max_i32 s4, s4, s12 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 ; GFX8-NEXT: s_min_i32 s4, s4, s5 ; GFX8-NEXT: s_add_i32 s9, s9, s4 ; GFX8-NEXT: s_sext_i32_i16 s4, s2 -; GFX8-NEXT: s_max_i32 s5, s4, s19 -; GFX8-NEXT: s_min_i32 s4, s4, s19 -; GFX8-NEXT: s_sub_i32 s4, s17, s4 +; GFX8-NEXT: s_max_i32 s5, s4, s17 +; GFX8-NEXT: s_min_i32 s4, s4, s17 +; GFX8-NEXT: s_sub_i32 s4, 0xffff8000, s4 ; GFX8-NEXT: s_lshr_b32 s14, s6, 16 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 -; GFX8-NEXT: s_sub_i32 s5, s16, s5 +; GFX8-NEXT: s_sub_i32 s5, 0x7fff, s5 ; GFX8-NEXT: s_max_i32 s4, s4, s6 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 @@ -4025,24 +3959,24 @@ ; GFX8-NEXT: s_min_i32 s4, s4, s5 ; GFX8-NEXT: s_add_i32 s2, s2, s4 ; GFX8-NEXT: s_sext_i32_i16 s4, s10 -; GFX8-NEXT: s_max_i32 s5, s4, s19 -; GFX8-NEXT: s_min_i32 s4, s4, s19 -; GFX8-NEXT: s_sub_i32 s4, s17, s4 +; GFX8-NEXT: s_max_i32 s5, s4, s17 +; GFX8-NEXT: s_min_i32 s4, s4, s17 +; GFX8-NEXT: s_sub_i32 s4, 0xffff8000, s4 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s6, s14 -; GFX8-NEXT: s_sub_i32 s5, s16, s5 +; GFX8-NEXT: s_sub_i32 s5, 0x7fff, s5 ; GFX8-NEXT: s_max_i32 s4, s4, s6 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 ; GFX8-NEXT: s_min_i32 s4, s4, s5 ; GFX8-NEXT: s_add_i32 s10, s10, s4 ; GFX8-NEXT: s_sext_i32_i16 s4, s3 -; GFX8-NEXT: s_max_i32 s5, s4, s19 -; GFX8-NEXT: s_min_i32 s4, s4, s19 -; GFX8-NEXT: s_sub_i32 s4, s17, s4 +; GFX8-NEXT: s_max_i32 s5, s4, s17 +; GFX8-NEXT: s_min_i32 s4, s4, s17 +; GFX8-NEXT: s_sub_i32 s4, 0xffff8000, s4 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s6, s7 -; GFX8-NEXT: s_sub_i32 s5, s16, s5 +; GFX8-NEXT: s_sub_i32 s5, 0x7fff, s5 ; GFX8-NEXT: s_max_i32 s4, s4, s6 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 @@ -4050,13 +3984,13 @@ ; GFX8-NEXT: s_min_i32 s4, s4, s5 ; GFX8-NEXT: s_add_i32 
s3, s3, s4 ; GFX8-NEXT: s_sext_i32_i16 s4, s11 -; GFX8-NEXT: s_max_i32 s5, s4, s19 -; GFX8-NEXT: s_min_i32 s4, s4, s19 +; GFX8-NEXT: s_max_i32 s5, s4, s17 +; GFX8-NEXT: s_min_i32 s4, s4, s17 ; GFX8-NEXT: s_lshr_b32 s15, s7, 16 -; GFX8-NEXT: s_sub_i32 s4, s17, s4 +; GFX8-NEXT: s_sub_i32 s4, 0xffff8000, s4 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s6, s15 -; GFX8-NEXT: s_sub_i32 s5, s16, s5 +; GFX8-NEXT: s_sub_i32 s5, 0x7fff, s5 ; GFX8-NEXT: s_max_i32 s4, s4, s6 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 @@ -4576,9 +4510,8 @@ ; GFX6-NEXT: s_add_u32 s0, s4, 0 ; GFX6-NEXT: s_cselect_b32 s1, 1, 0 ; GFX6-NEXT: s_and_b32 s1, s1, 1 -; GFX6-NEXT: s_brev_b32 s5, 1 ; GFX6-NEXT: s_cmp_lg_u32 s1, 0 -; GFX6-NEXT: s_addc_u32 s1, s4, s5 +; GFX6-NEXT: s_addc_u32 s1, s4, 0x80000000 ; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: s_add_u32 s0, s2, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s1 @@ -4601,7 +4534,7 @@ ; GFX6-NEXT: s_cselect_b32 s2, 1, 0 ; GFX6-NEXT: s_and_b32 s2, s2, 1 ; GFX6-NEXT: s_cmp_lg_u32 s2, 0 -; GFX6-NEXT: s_addc_u32 s3, s4, s5 +; GFX6-NEXT: s_addc_u32 s3, s4, 0x80000000 ; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: v_mov_b32_e32 v3, s3 ; GFX6-NEXT: v_mov_b32_e32 v5, s1 @@ -4629,9 +4562,8 @@ ; GFX8-NEXT: s_add_u32 s0, s4, 0 ; GFX8-NEXT: s_cselect_b32 s1, 1, 0 ; GFX8-NEXT: s_and_b32 s1, s1, 1 -; GFX8-NEXT: s_brev_b32 s5, 1 ; GFX8-NEXT: s_cmp_lg_u32 s1, 0 -; GFX8-NEXT: s_addc_u32 s1, s4, s5 +; GFX8-NEXT: s_addc_u32 s1, s4, 0x80000000 ; GFX8-NEXT: v_mov_b32_e32 v1, s0 ; GFX8-NEXT: s_add_u32 s0, s2, s6 ; GFX8-NEXT: v_mov_b32_e32 v2, s1 @@ -4654,7 +4586,7 @@ ; GFX8-NEXT: s_cselect_b32 s2, 1, 0 ; GFX8-NEXT: s_and_b32 s2, s2, 1 ; GFX8-NEXT: s_cmp_lg_u32 s2, 0 -; GFX8-NEXT: s_addc_u32 s3, s4, s5 +; GFX8-NEXT: s_addc_u32 s3, s4, 0x80000000 ; GFX8-NEXT: v_mov_b32_e32 v1, s0 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: v_mov_b32_e32 v5, s1 @@ -4682,9 +4614,8 @@ ; GFX9-NEXT: s_add_u32 s0, s4, 0 ; GFX9-NEXT: s_cselect_b32 s1, 1, 0 ; 
GFX9-NEXT: s_and_b32 s1, s1, 1 -; GFX9-NEXT: s_brev_b32 s5, 1 ; GFX9-NEXT: s_cmp_lg_u32 s1, 0 -; GFX9-NEXT: s_addc_u32 s1, s4, s5 +; GFX9-NEXT: s_addc_u32 s1, s4, 0x80000000 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: s_add_u32 s0, s2, s6 ; GFX9-NEXT: v_mov_b32_e32 v2, s1 @@ -4707,7 +4638,7 @@ ; GFX9-NEXT: s_cselect_b32 s2, 1, 0 ; GFX9-NEXT: s_and_b32 s2, s2, 1 ; GFX9-NEXT: s_cmp_lg_u32 s2, 0 -; GFX9-NEXT: s_addc_u32 s3, s4, s5 +; GFX9-NEXT: s_addc_u32 s3, s4, 0x80000000 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: v_mov_b32_e32 v5, s1 @@ -4727,7 +4658,6 @@ ; GFX10-NEXT: s_and_b32 s9, s9, 1 ; GFX10-NEXT: v_mov_b32_e32 v0, s8 ; GFX10-NEXT: s_cmp_lg_u32 s9, 0 -; GFX10-NEXT: s_brev_b32 s10, 1 ; GFX10-NEXT: s_addc_u32 s9, s1, s5 ; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[8:9], s[0:1] ; GFX10-NEXT: s_ashr_i32 s1, s9, 31 @@ -4738,7 +4668,7 @@ ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s0, s8 ; GFX10-NEXT: s_and_b32 s4, s4, 1 ; GFX10-NEXT: s_cmp_lg_u32 s4, 0 -; GFX10-NEXT: s_addc_u32 s1, s1, s10 +; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000 ; GFX10-NEXT: s_add_u32 s4, s2, s6 ; GFX10-NEXT: s_cselect_b32 s5, 1, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s1, s8 @@ -4757,7 +4687,7 @@ ; GFX10-NEXT: s_and_b32 s3, s3, 1 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: s_cmp_lg_u32 s3, 0 -; GFX10-NEXT: s_addc_u32 s1, s1, s10 +; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s1, s2 ; GFX10-NEXT: v_readfirstlane_b32 s1, v1 ; GFX10-NEXT: v_readfirstlane_b32 s2, v2 @@ -5595,12 +5525,11 @@ ; GFX6-NEXT: s_and_b32 s2, s2, 1 ; GFX6-NEXT: s_cmp_lg_u32 s2, 0 ; GFX6-NEXT: s_addc_u32 s2, s3, 0 -; GFX6-NEXT: s_cselect_b32 s11, 1, 0 -; GFX6-NEXT: s_and_b32 s11, s11, 1 -; GFX6-NEXT: s_brev_b32 s10, 1 -; GFX6-NEXT: s_cmp_lg_u32 s11, 0 +; GFX6-NEXT: s_cselect_b32 s10, 1, 0 +; GFX6-NEXT: s_and_b32 s10, s10, 1 +; GFX6-NEXT: s_cmp_lg_u32 s10, 0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX6-NEXT: s_addc_u32 s3, s3, s10 +; 
GFX6-NEXT: s_addc_u32 s3, s3, 0x80000000 ; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: s_add_u32 s0, s4, s12 @@ -5658,7 +5587,7 @@ ; GFX6-NEXT: s_and_b32 s8, s8, 1 ; GFX6-NEXT: s_cmp_lg_u32 s8, 0 ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX6-NEXT: s_addc_u32 s7, s7, s10 +; GFX6-NEXT: s_addc_u32 s7, s7, 0x80000000 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: v_mov_b32_e32 v3, s0 @@ -5726,12 +5655,11 @@ ; GFX8-NEXT: s_and_b32 s2, s2, 1 ; GFX8-NEXT: s_cmp_lg_u32 s2, 0 ; GFX8-NEXT: s_addc_u32 s2, s3, 0 -; GFX8-NEXT: s_cselect_b32 s11, 1, 0 -; GFX8-NEXT: s_and_b32 s11, s11, 1 -; GFX8-NEXT: s_brev_b32 s10, 1 -; GFX8-NEXT: s_cmp_lg_u32 s11, 0 +; GFX8-NEXT: s_cselect_b32 s10, 1, 0 +; GFX8-NEXT: s_and_b32 s10, s10, 1 +; GFX8-NEXT: s_cmp_lg_u32 s10, 0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX8-NEXT: s_addc_u32 s3, s3, s10 +; GFX8-NEXT: s_addc_u32 s3, s3, 0x80000000 ; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, s0 ; GFX8-NEXT: s_add_u32 s0, s4, s12 @@ -5795,7 +5723,7 @@ ; GFX8-NEXT: s_and_b32 s8, s8, 1 ; GFX8-NEXT: s_cmp_lg_u32 s8, 0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: s_addc_u32 s7, s7, s10 +; GFX8-NEXT: s_addc_u32 s7, s7, 0x80000000 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: v_mov_b32_e32 v2, s5 ; GFX8-NEXT: v_mov_b32_e32 v3, s0 @@ -5863,12 +5791,11 @@ ; GFX9-NEXT: s_and_b32 s2, s2, 1 ; GFX9-NEXT: s_cmp_lg_u32 s2, 0 ; GFX9-NEXT: s_addc_u32 s2, s3, 0 -; GFX9-NEXT: s_cselect_b32 s11, 1, 0 -; GFX9-NEXT: s_and_b32 s11, s11, 1 -; GFX9-NEXT: s_brev_b32 s10, 1 -; GFX9-NEXT: s_cmp_lg_u32 s11, 0 +; GFX9-NEXT: s_cselect_b32 s10, 1, 0 +; GFX9-NEXT: s_and_b32 s10, s10, 1 +; GFX9-NEXT: s_cmp_lg_u32 s10, 0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX9-NEXT: s_addc_u32 s3, s3, s10 +; GFX9-NEXT: s_addc_u32 s3, s3, 0x80000000 ; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: s_add_u32 s0, s4, s12 @@ -5932,7 +5859,7 @@ ; 
GFX9-NEXT: s_and_b32 s8, s8, 1 ; GFX9-NEXT: s_cmp_lg_u32 s8, 0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: s_addc_u32 s7, s7, s10 +; GFX9-NEXT: s_addc_u32 s7, s7, 0x80000000 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: v_mov_b32_e32 v3, s0 @@ -5985,29 +5912,28 @@ ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 ; GFX10-NEXT: s_cselect_b32 s0, 1, 0 ; GFX10-NEXT: s_ashr_i32 s3, s17, 31 -; GFX10-NEXT: s_and_b32 s0, 1, s0 -; GFX10-NEXT: s_brev_b32 s10, 1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s1 -; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0, s0 +; GFX10-NEXT: s_and_b32 s1, 1, s0 ; GFX10-NEXT: s_add_u32 s0, s3, 0 -; GFX10-NEXT: s_cselect_b32 s1, 1, 0 -; GFX10-NEXT: s_and_b32 s1, s1, 1 -; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX10-NEXT: s_cmp_lg_u32 s1, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, s8 +; GFX10-NEXT: v_cmp_ne_u32_e64 s1, 0, s1 +; GFX10-NEXT: s_cselect_b32 s2, 1, 0 +; GFX10-NEXT: s_and_b32 s2, s2, 1 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0, s1 +; GFX10-NEXT: s_cmp_lg_u32 s2, 0 ; GFX10-NEXT: s_addc_u32 s1, s3, 0 ; GFX10-NEXT: s_cselect_b32 s2, 1, 0 -; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX10-NEXT: s_and_b32 s2, s2, 1 +; GFX10-NEXT: v_mov_b32_e32 v1, s8 ; GFX10-NEXT: s_cmp_lg_u32 s2, 0 +; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_addc_u32 s2, s3, 0 -; GFX10-NEXT: s_cselect_b32 s11, 1, 0 +; GFX10-NEXT: s_cselect_b32 s10, 1, 0 +; GFX10-NEXT: s_and_b32 s10, s10, 1 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX10-NEXT: s_and_b32 s11, s11, 1 -; GFX10-NEXT: s_cmp_lg_u32 s11, 0 -; GFX10-NEXT: s_addc_u32 s3, s3, s10 +; GFX10-NEXT: s_cmp_lg_u32 s10, 0 +; GFX10-NEXT: s_addc_u32 s3, s3, 0x80000000 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s0, vcc_lo ; GFX10-NEXT: s_add_u32 s0, s4, s12 ; GFX10-NEXT: s_cselect_b32 s8, 1, 0 @@ -6063,7 +5989,7 @@ ; GFX10-NEXT: s_and_b32 s6, s6, 1 ; GFX10-NEXT: 
v_cmp_ne_u32_e32 vcc_lo, 0, v4 ; GFX10-NEXT: s_cmp_lg_u32 s6, 0 -; GFX10-NEXT: s_addc_u32 s1, s5, s10 +; GFX10-NEXT: s_addc_u32 s1, s5, 0x80000000 ; GFX10-NEXT: v_cndmask_b32_e64 v4, v5, s2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v5, v6, s3, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v6, v7, s4, vcc_lo diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll @@ -309,10 +309,10 @@ ; CHECK-NEXT: v_lshlrev_b32_e32 v3, 12, v2 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, 1, v2 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 -; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s6, v0 -; CHECK-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; CHECK-NEXT: v_subrev_i32_e64 v3, s[4:5], s6, v0 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; CHECK-NEXT: v_cmp_le_u32_e64 s[4:5], s6, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[4:5] +; CHECK-NEXT: v_subrev_i32_e32 v3, vcc, s6, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[4:5] ; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v2 ; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s6, v0 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc @@ -408,15 +408,15 @@ ; CGP-NEXT: v_add_i32_e32 v10, vcc, 1, v4 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v7 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v9 -; CGP-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 -; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc -; CGP-NEXT: v_subrev_i32_e64 v7, s[4:5], s8, v0 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v5 -; CGP-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[4:5] -; CGP-NEXT: v_sub_i32_e64 v8, s[6:7], v1, v5 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc +; CGP-NEXT: v_cmp_le_u32_e64 s[4:5], s8, v0 +; CGP-NEXT: v_cndmask_b32_e64 v3, v3, v8, s[4:5] +; CGP-NEXT: v_subrev_i32_e32 v7, vcc, s8, v0 +; CGP-NEXT: v_cmp_ge_u32_e64 s[6:7], v1, v5 +; CGP-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[6:7] +; CGP-NEXT: v_subrev_i32_e32 v8, vcc, 0x1000, v1 +; CGP-NEXT: v_cndmask_b32_e64 v0, v0, 
v7, s[4:5] ; CGP-NEXT: v_add_i32_e32 v7, vcc, 1, v3 -; CGP-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[6:7] ; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v4 ; CGP-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 ; CGP-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc @@ -450,10 +450,10 @@ ; CHECK-NEXT: v_mul_lo_u32 v3, v2, s6 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, 1, v2 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 -; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s6, v0 -; CHECK-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; CHECK-NEXT: v_subrev_i32_e64 v3, s[4:5], s6, v0 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; CHECK-NEXT: v_cmp_le_u32_e64 s[4:5], s6, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[4:5] +; CHECK-NEXT: v_subrev_i32_e32 v3, vcc, s6, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[4:5] ; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v2 ; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s6, v0 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc @@ -549,15 +549,15 @@ ; CGP-NEXT: v_add_i32_e32 v10, vcc, 1, v4 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v7 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v9 -; CGP-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 -; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc -; CGP-NEXT: v_subrev_i32_e64 v7, s[4:5], s8, v0 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v5 -; CGP-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[4:5] -; CGP-NEXT: v_sub_i32_e64 v8, s[6:7], v1, v5 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc +; CGP-NEXT: v_cmp_le_u32_e64 s[4:5], s8, v0 +; CGP-NEXT: v_cndmask_b32_e64 v3, v3, v8, s[4:5] +; CGP-NEXT: v_subrev_i32_e32 v7, vcc, s8, v0 +; CGP-NEXT: v_cmp_ge_u32_e64 s[6:7], v1, v5 +; CGP-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[6:7] +; CGP-NEXT: v_subrev_i32_e32 v8, vcc, 0x12d8fb, v1 +; CGP-NEXT: v_cndmask_b32_e64 v0, v0, v7, s[4:5] ; CGP-NEXT: v_add_i32_e32 v7, vcc, 1, v3 -; CGP-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[6:7] ; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v4 ; CGP-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 ; 
CGP-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll @@ -1194,8 +1194,7 @@ ; GISEL-LABEL: v_sdiv_v2i64_pow2k_denom: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_movk_i32 s10, 0x1000 -; GISEL-NEXT: s_add_u32 s4, s10, 0 +; GISEL-NEXT: s_add_u32 s4, 0x1000, 0 ; GISEL-NEXT: s_cselect_b32 s5, 1, 0 ; GISEL-NEXT: s_and_b32 s5, s5, 1 ; GISEL-NEXT: s_mov_b32 s6, 0 @@ -1326,7 +1325,7 @@ ; GISEL-NEXT: v_cndmask_b32_e64 v7, v7, v10, s[4:5] ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc ; GISEL-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 -; GISEL-NEXT: s_add_u32 s4, s10, 0 +; GISEL-NEXT: s_add_u32 s4, 0x1000, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, s9, v1 ; GISEL-NEXT: s_cselect_b32 s5, 1, 0 @@ -1907,8 +1906,7 @@ ; GISEL-LABEL: v_sdiv_v2i64_oddk_denom: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s10, 0x12d8fb -; GISEL-NEXT: s_add_u32 s4, s10, 0 +; GISEL-NEXT: s_add_u32 s4, 0x12d8fb, 0 ; GISEL-NEXT: s_cselect_b32 s5, 1, 0 ; GISEL-NEXT: s_and_b32 s5, s5, 1 ; GISEL-NEXT: s_mov_b32 s6, 0 @@ -2039,7 +2037,7 @@ ; GISEL-NEXT: v_cndmask_b32_e64 v7, v7, v10, s[4:5] ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc ; GISEL-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 -; GISEL-NEXT: s_add_u32 s4, s10, 0 +; GISEL-NEXT: s_add_u32 s4, 0x12d8fb, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, s9, v1 ; GISEL-NEXT: s_cselect_b32 s5, 1, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll @@ -1192,7 +1192,6 @@ ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx4 
s[8:11], s[4:5], 0x20 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 -; GFX10-NEXT: v_mov_b32_e32 v4, 0x4f7ffffe ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_ashr_i32 s12, s8, 31 ; GFX10-NEXT: s_ashr_i32 s14, s10, 31 @@ -1206,9 +1205,9 @@ ; GFX10-NEXT: s_add_i32 s9, s11, s15 ; GFX10-NEXT: s_xor_b32 s11, s7, s13 ; GFX10-NEXT: s_xor_b32 s8, s8, s14 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s11 ; GFX10-NEXT: v_cvt_f32_u32_e32 v2, s8 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: s_xor_b32 s9, s9, s15 ; GFX10-NEXT: s_sub_i32 s6, 0, s10 ; GFX10-NEXT: v_cvt_f32_u32_e32 v3, s9 @@ -1217,96 +1216,96 @@ ; GFX10-NEXT: s_sub_i32 s7, 0, s11 ; GFX10-NEXT: s_sub_i32 s19, 0, s8 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: s_ashr_i32 s16, s0, 31 -; GFX10-NEXT: s_ashr_i32 s17, s1, 31 +; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: s_add_i32 s0, s0, s16 -; GFX10-NEXT: s_ashr_i32 s18, s2, 31 -; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX10-NEXT: v_mul_f32_e32 v1, v1, v4 -; GFX10-NEXT: v_mul_f32_e32 v2, v2, v4 +; GFX10-NEXT: s_ashr_i32 s17, s1, 31 ; GFX10-NEXT: s_xor_b32 s0, s0, s16 -; GFX10-NEXT: v_mul_f32_e32 v3, v3, v4 -; GFX10-NEXT: v_mul_lo_u32 v4, s6, v0 +; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 +; GFX10-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 +; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX10-NEXT: s_ashr_i32 s18, s2, 31 +; GFX10-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX10-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GFX10-NEXT: v_mul_lo_u32 v4, s6, v0 ; GFX10-NEXT: s_sub_i32 s6, 0, s9 ; GFX10-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX10-NEXT: s_add_i32 s1, s1, s17 ; GFX10-NEXT: v_mul_lo_u32 v5, s7, v1 ; GFX10-NEXT: v_mul_lo_u32 v6, s19, v2 -; GFX10-NEXT: v_mul_hi_u32 v4, v0, v4 -; GFX10-NEXT: v_mul_lo_u32 v7, s6, v3 +; GFX10-NEXT: s_add_i32 s1, s1, s17 ; GFX10-NEXT: s_add_i32 s2, s2, s18 +; GFX10-NEXT: v_mul_lo_u32 v7, s6, v3 
+; GFX10-NEXT: v_mul_hi_u32 v4, v0, v4 ; GFX10-NEXT: s_ashr_i32 s19, s3, 31 ; GFX10-NEXT: s_xor_b32 s1, s1, s17 -; GFX10-NEXT: s_xor_b32 s2, s2, s18 ; GFX10-NEXT: v_mul_hi_u32 v5, v1, v5 ; GFX10-NEXT: v_mul_hi_u32 v6, v2, v6 -; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v4 -; GFX10-NEXT: v_mul_hi_u32 v7, v3, v7 +; GFX10-NEXT: s_xor_b32 s2, s2, s18 ; GFX10-NEXT: s_add_i32 s3, s3, s19 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX10-NEXT: v_mul_hi_u32 v7, v3, v7 +; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v4 ; GFX10-NEXT: s_xor_b32 s3, s3, s19 -; GFX10-NEXT: v_mul_hi_u32 v0, s0, v0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v5 ; GFX10-NEXT: v_add_nc_u32_e32 v2, v2, v6 +; GFX10-NEXT: v_mul_hi_u32 v0, s0, v0 ; GFX10-NEXT: s_xor_b32 s12, s16, s12 ; GFX10-NEXT: v_add_nc_u32_e32 v3, v3, v7 -; GFX10-NEXT: s_xor_b32 s13, s17, s13 ; GFX10-NEXT: v_mul_hi_u32 v1, s1, v1 ; GFX10-NEXT: v_mul_hi_u32 v2, s2, v2 -; GFX10-NEXT: v_mul_lo_u32 v4, v0, s10 +; GFX10-NEXT: s_xor_b32 s13, s17, s13 +; GFX10-NEXT: s_xor_b32 s14, s18, s14 ; GFX10-NEXT: v_mul_hi_u32 v3, s3, v3 +; GFX10-NEXT: v_mul_lo_u32 v4, v0, s10 ; GFX10-NEXT: v_add_nc_u32_e32 v8, 1, v0 -; GFX10-NEXT: s_xor_b32 s14, s18, s14 ; GFX10-NEXT: v_mul_lo_u32 v5, v1, s11 ; GFX10-NEXT: v_mul_lo_u32 v6, v2, s8 -; GFX10-NEXT: v_sub_nc_u32_e32 v4, s0, v4 -; GFX10-NEXT: v_mul_lo_u32 v7, v3, s9 ; GFX10-NEXT: v_add_nc_u32_e32 v9, 1, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v10, 1, v2 +; GFX10-NEXT: v_mul_lo_u32 v7, v3, s9 +; GFX10-NEXT: v_sub_nc_u32_e32 v4, s0, v4 ; GFX10-NEXT: v_add_nc_u32_e32 v11, 1, v3 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s10, v4 ; GFX10-NEXT: v_sub_nc_u32_e32 v5, s1, v5 ; GFX10-NEXT: v_sub_nc_u32_e32 v6, s2, v6 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s10, v4 ; GFX10-NEXT: v_sub_nc_u32_e32 v7, s3, v7 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo -; GFX10-NEXT: v_subrev_nc_u32_e32 v8, s10, v4 ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s11, v5 ; GFX10-NEXT: v_cmp_le_u32_e64 s1, s8, 
v6 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo +; GFX10-NEXT: v_subrev_nc_u32_e32 v8, s10, v4 ; GFX10-NEXT: v_cmp_le_u32_e64 s2, s9, v7 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v9, s0 ; GFX10-NEXT: v_subrev_nc_u32_e32 v9, s11, v5 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v10, s1 ; GFX10-NEXT: v_subrev_nc_u32_e32 v10, s8, v6 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v11, s2 ; GFX10-NEXT: v_subrev_nc_u32_e32 v11, s9, v7 ; GFX10-NEXT: v_add_nc_u32_e32 v8, 1, v0 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s10, v4 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v9, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v10, s1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s10, v4 ; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v11, s2 ; GFX10-NEXT: v_add_nc_u32_e32 v9, 1, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo -; GFX10-NEXT: v_subrev_nc_u32_e32 v8, s10, v4 ; GFX10-NEXT: v_add_nc_u32_e32 v10, 1, v2 ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s11, v5 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo +; GFX10-NEXT: v_subrev_nc_u32_e32 v8, s10, v4 ; GFX10-NEXT: v_cmp_le_u32_e64 s1, s8, v6 ; GFX10-NEXT: v_add_nc_u32_e32 v11, 1, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s9, v7 +; GFX10-NEXT: v_cmp_le_u32_e64 s2, s9, v7 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v9, s0 ; GFX10-NEXT: v_subrev_nc_u32_e32 v9, s11, v5 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v10, s1 ; GFX10-NEXT: v_subrev_nc_u32_e32 v10, s8, v6 -; GFX10-NEXT: v_subrev_nc_u32_e32 v12, s9, v7 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo +; GFX10-NEXT: v_subrev_nc_u32_e32 v8, s9, v7 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v11, s2 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v9, s0 -; GFX10-NEXT: s_xor_b32 s0, s19, s15 ; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v10, s1 -; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v12, vcc_lo +; GFX10-NEXT: 
s_xor_b32 s0, s19, s15 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v8, s2 ; GFX10-NEXT: v_xor_b32_e32 v0, s12, v0 ; GFX10-NEXT: v_xor_b32_e32 v1, s13, v1 ; GFX10-NEXT: v_xor_b32_e32 v2, s14, v2 @@ -2911,7 +2910,6 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x14 ; GFX8-NEXT: s_load_dword s8, s[4:5], 0x10 -; GFX8-NEXT: s_mov_b32 s9, 0x100010 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_sext_i32_i16 s1, s0 ; GFX8-NEXT: s_ashr_i32 s2, s1, 31 @@ -2920,18 +2918,18 @@ ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s3 ; GFX8-NEXT: s_sub_i32 s6, 0, s3 ; GFX8-NEXT: s_sext_i32_i16 s1, s8 -; GFX8-NEXT: s_bfe_i32 s0, s0, s9 +; GFX8-NEXT: s_bfe_i32 s0, s0, 0x100010 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX8-NEXT: s_ashr_i32 s10, s1, 31 -; GFX8-NEXT: s_ashr_i32 s11, s0, 31 -; GFX8-NEXT: s_add_i32 s1, s1, s10 +; GFX8-NEXT: s_ashr_i32 s9, s1, 31 +; GFX8-NEXT: s_ashr_i32 s10, s0, 31 +; GFX8-NEXT: s_add_i32 s1, s1, s9 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX8-NEXT: s_add_i32 s0, s0, s11 -; GFX8-NEXT: s_xor_b32 s1, s1, s10 -; GFX8-NEXT: s_xor_b32 s12, s0, s11 +; GFX8-NEXT: s_add_i32 s0, s0, s10 +; GFX8-NEXT: s_xor_b32 s1, s1, s9 +; GFX8-NEXT: s_xor_b32 s11, s0, s10 ; GFX8-NEXT: v_mul_lo_u32 v1, s6, v0 -; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s12 +; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s11 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 @@ -2950,11 +2948,11 @@ ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s3, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX8-NEXT: v_subrev_u32_e64 v3, s[0:1], s3, v2 -; GFX8-NEXT: s_sub_i32 s1, 0, s12 +; GFX8-NEXT: s_sub_i32 s1, 0, s11 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX8-NEXT: v_mul_lo_u32 v3, s1, v1 -; GFX8-NEXT: s_bfe_i32 s1, s8, s9 -; GFX8-NEXT: s_xor_b32 s0, s10, s2 +; GFX8-NEXT: s_bfe_i32 s1, s8, 0x100010 +; GFX8-NEXT: s_xor_b32 s0, s9, s2 ; GFX8-NEXT: s_ashr_i32 s2, s1, 31 ; GFX8-NEXT: v_mul_hi_u32 v3, v1, 
v3 ; GFX8-NEXT: s_add_i32 s1, s1, s2 @@ -2962,21 +2960,21 @@ ; GFX8-NEXT: v_xor_b32_e32 v0, s0, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 ; GFX8-NEXT: v_mul_hi_u32 v1, s1, v1 -; GFX8-NEXT: v_xor_b32_e32 v2, s10, v2 +; GFX8-NEXT: v_xor_b32_e32 v2, s9, v2 ; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s0, v0 -; GFX8-NEXT: v_mul_lo_u32 v3, v1, s12 -; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s10, v2 +; GFX8-NEXT: v_mul_lo_u32 v3, v1, s11 +; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s9, v2 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v1 ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s1, v3 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s12, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s12, v3 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s11, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v1 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s12, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s12, v3 -; GFX8-NEXT: s_xor_b32 s0, s2, s11 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s11, v3 +; GFX8-NEXT: s_xor_b32 s0, s2, s10 ; GFX8-NEXT: v_xor_b32_e32 v1, s0, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_subrev_u32_e32 v1, vcc, s0, v1 @@ -3000,77 +2998,75 @@ ; ; GFX9-LABEL: sdivrem_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s6, s[4:5], 0x14 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x14 +; GFX9-NEXT: s_load_dword s8, s[4:5], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sext_i32_i16 s0, s6 -; GFX9-NEXT: s_ashr_i32 s7, s0, 31 -; GFX9-NEXT: s_add_i32 s0, s0, s7 -; GFX9-NEXT: s_xor_b32 s8, s0, s7 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s9, s[4:5], 0x10 -; GFX9-NEXT: s_mov_b32 s4, 0x100010 -; GFX9-NEXT: s_bfe_i32 s6, s6, s4 +; GFX9-NEXT: s_sext_i32_i16 s1, s0 +; GFX9-NEXT: s_ashr_i32 s6, s1, 31 +; GFX9-NEXT: 
s_add_i32 s1, s1, s6 +; GFX9-NEXT: s_xor_b32 s7, s1, s6 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GFX9-NEXT: s_bfe_i32 s0, s0, 0x100010 +; GFX9-NEXT: s_ashr_i32 s9, s0, 31 +; GFX9-NEXT: s_add_i32 s0, s0, s9 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_ashr_i32 s10, s6, 31 -; GFX9-NEXT: s_add_i32 s6, s6, s10 -; GFX9-NEXT: s_sub_i32 s11, 0, s8 +; GFX9-NEXT: s_xor_b32 s10, s0, s9 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s10 +; GFX9-NEXT: s_sub_i32 s0, 0, s7 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: s_xor_b32 s6, s6, s10 -; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s6 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sext_i32_i16 s5, s9 -; GFX9-NEXT: v_mul_lo_u32 v1, s11, v0 -; GFX9-NEXT: s_ashr_i32 s11, s5, 31 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GFX9-NEXT: s_add_i32 s5, s5, s11 -; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX9-NEXT: s_xor_b32 s5, s5, s11 -; GFX9-NEXT: s_bfe_i32 s4, s9, s4 -; GFX9-NEXT: s_sub_i32 s9, 0, s6 -; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 -; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GFX9-NEXT: s_sext_i32_i16 s1, s8 +; GFX9-NEXT: s_ashr_i32 s11, s1, 31 +; GFX9-NEXT: v_mul_lo_u32 v2, s0, v0 +; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: v_mul_hi_u32 v0, s5, v0 -; GFX9-NEXT: s_xor_b32 s7, s11, s7 -; GFX9-NEXT: v_mul_lo_u32 v3, s9, v1 -; GFX9-NEXT: v_mul_lo_u32 v2, v0, s8 -; GFX9-NEXT: v_add_u32_e32 v4, 1, v0 -; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 -; GFX9-NEXT: v_sub_u32_e32 v2, s5, v2 +; GFX9-NEXT: s_add_i32 s1, s1, s11 +; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 +; GFX9-NEXT: s_xor_b32 s12, s1, s11 +; GFX9-NEXT: s_sub_i32 s0, 0, s10 +; GFX9-NEXT: s_xor_b32 s6, s11, s6 +; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v0, s12, v0 +; GFX9-NEXT: v_mul_lo_u32 v2, s0, v1 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_bfe_i32 s4, s8, 0x100010 +; GFX9-NEXT: v_mul_lo_u32 v3, 
v0, s7 +; GFX9-NEXT: v_mul_hi_u32 v2, v1, v2 ; GFX9-NEXT: s_ashr_i32 s5, s4, 31 ; GFX9-NEXT: s_add_i32 s4, s4, s5 +; GFX9-NEXT: v_sub_u32_e32 v3, s12, v3 +; GFX9-NEXT: v_add_u32_e32 v4, 1, v0 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 ; GFX9-NEXT: s_xor_b32 s4, s4, s5 -; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 -; GFX9-NEXT: v_mul_hi_u32 v1, s4, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v2 +; GFX9-NEXT: v_add_u32_e32 v1, v1, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX9-NEXT: v_subrev_u32_e32 v4, s8, v2 -; GFX9-NEXT: v_mul_lo_u32 v3, v1, s6 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX9-NEXT: v_subrev_u32_e32 v4, s7, v3 +; GFX9-NEXT: v_mul_hi_u32 v1, s4, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: v_add_u32_e32 v4, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX9-NEXT: v_subrev_u32_e32 v4, s8, v2 -; GFX9-NEXT: v_sub_u32_e32 v3, s4, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX9-NEXT: v_subrev_u32_e32 v4, s7, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_mul_lo_u32 v3, v1, s10 ; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 +; GFX9-NEXT: v_xor_b32_e32 v0, s6, v0 +; GFX9-NEXT: v_xor_b32_e32 v2, s11, v2 +; GFX9-NEXT: v_sub_u32_e32 v3, s4, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX9-NEXT: v_subrev_u32_e32 v4, s6, v3 +; GFX9-NEXT: v_subrev_u32_e32 v4, s10, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX9-NEXT: v_subrev_u32_e32 v4, s6, v3 +; GFX9-NEXT: v_subrev_u32_e32 v4, s10, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX9-NEXT: s_xor_b32 s4, s5, s10 -; GFX9-NEXT: v_xor_b32_e32 v0, s7, v0 -; GFX9-NEXT: v_xor_b32_e32 v2, s11, v2 
+; GFX9-NEXT: s_xor_b32 s4, s5, s9 ; GFX9-NEXT: v_xor_b32_e32 v1, s4, v1 ; GFX9-NEXT: v_xor_b32_e32 v3, s5, v3 -; GFX9-NEXT: v_subrev_u32_e32 v0, s7, v0 +; GFX9-NEXT: v_subrev_u32_e32 v0, s6, v0 ; GFX9-NEXT: v_subrev_u32_e32 v2, s11, v2 ; GFX9-NEXT: v_sub_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_sub_u32_sdwa v3, v3, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -3078,6 +3074,7 @@ ; GFX9-NEXT: v_and_or_b32 v0, v0, v4, v1 ; GFX9-NEXT: v_and_or_b32 v1, v2, v4, v3 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NEXT: global_store_dword v2, v1, s[2:3] ; GFX9-NEXT: s_endpgm @@ -3085,21 +3082,20 @@ ; GFX10-LABEL: sdivrem_v2i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dword s0, s[4:5], 0x14 -; GFX10-NEXT: s_mov_b32 s1, 0x100010 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_sext_i32_i16 s2, s0 -; GFX10-NEXT: s_bfe_i32 s0, s0, s1 -; GFX10-NEXT: s_ashr_i32 s3, s2, 31 -; GFX10-NEXT: s_ashr_i32 s8, s0, 31 -; GFX10-NEXT: s_add_i32 s2, s2, s3 -; GFX10-NEXT: s_add_i32 s0, s0, s8 -; GFX10-NEXT: s_xor_b32 s2, s2, s3 -; GFX10-NEXT: s_xor_b32 s9, s0, s8 -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s9 +; GFX10-NEXT: s_sext_i32_i16 s1, s0 +; GFX10-NEXT: s_bfe_i32 s0, s0, 0x100010 +; GFX10-NEXT: s_ashr_i32 s2, s1, 31 +; GFX10-NEXT: s_ashr_i32 s3, s0, 31 +; GFX10-NEXT: s_add_i32 s1, s1, s2 +; GFX10-NEXT: s_add_i32 s0, s0, s3 +; GFX10-NEXT: s_xor_b32 s1, s1, s2 +; GFX10-NEXT: s_xor_b32 s8, s0, s3 +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s1 +; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s8 ; GFX10-NEXT: s_load_dword s0, s[4:5], 0x10 -; GFX10-NEXT: s_sub_i32 s6, 0, s2 -; GFX10-NEXT: s_sub_i32 s7, 0, s9 +; GFX10-NEXT: s_sub_i32 s6, 0, s1 +; GFX10-NEXT: s_sub_i32 s7, 0, s8 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 @@ -3110,58 
+3106,57 @@ ; GFX10-NEXT: v_mul_lo_u32 v3, s7, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_sext_i32_i16 s6, s0 -; GFX10-NEXT: s_bfe_i32 s0, s0, s1 -; GFX10-NEXT: s_ashr_i32 s1, s6, 31 +; GFX10-NEXT: s_bfe_i32 s0, s0, 0x100010 +; GFX10-NEXT: s_ashr_i32 s9, s6, 31 ; GFX10-NEXT: s_ashr_i32 s10, s0, 31 -; GFX10-NEXT: s_add_i32 s6, s6, s1 +; GFX10-NEXT: s_add_i32 s6, s6, s9 ; GFX10-NEXT: s_add_i32 s0, s0, s10 ; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX10-NEXT: v_mul_hi_u32 v3, v1, v3 -; GFX10-NEXT: s_xor_b32 s6, s6, s1 +; GFX10-NEXT: s_xor_b32 s6, s6, s9 ; GFX10-NEXT: s_xor_b32 s0, s0, s10 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3 ; GFX10-NEXT: v_mul_hi_u32 v0, s6, v0 ; GFX10-NEXT: v_mul_hi_u32 v1, s0, v1 -; GFX10-NEXT: v_mul_lo_u32 v2, v0, s2 -; GFX10-NEXT: v_mul_lo_u32 v3, v1, s9 +; GFX10-NEXT: v_mul_lo_u32 v2, v0, s1 +; GFX10-NEXT: v_mul_lo_u32 v3, v1, s8 ; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0 ; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1 ; GFX10-NEXT: v_sub_nc_u32_e32 v2, s6, v2 ; GFX10-NEXT: v_sub_nc_u32_e32 v3, s0, v3 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 -; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s2, v2 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s9, v3 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s2, v2 -; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s9, v3 +; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s1, v2 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s8, v3 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s1, v2 +; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s8, v3 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo ; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s2, v2 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s9, v3 -; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s2, v2 -; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s9, v3 -; GFX10-NEXT: s_xor_b32 s2, s1, s3 +; 
GFX10-NEXT: v_cmp_le_u32_e64 s0, s1, v2 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s8, v3 +; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s1, v2 +; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s8, v3 +; GFX10-NEXT: s_xor_b32 s1, s9, s2 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo -; GFX10-NEXT: s_xor_b32 s0, s10, s8 -; GFX10-NEXT: v_xor_b32_e32 v0, s2, v0 +; GFX10-NEXT: s_xor_b32 s0, s10, s3 +; GFX10-NEXT: v_xor_b32_e32 v0, s1, v0 ; GFX10-NEXT: v_xor_b32_e32 v1, s0, v1 -; GFX10-NEXT: v_xor_b32_e32 v2, s1, v2 +; GFX10-NEXT: v_xor_b32_e32 v2, s9, v2 ; GFX10-NEXT: v_xor_b32_e32 v3, s10, v3 -; GFX10-NEXT: v_mov_b32_e32 v4, 0xffff -; GFX10-NEXT: v_subrev_nc_u32_e32 v0, s2, v0 +; GFX10-NEXT: v_subrev_nc_u32_e32 v0, s1, v0 ; GFX10-NEXT: v_sub_nc_u32_sdwa v1, v1, s0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_subrev_nc_u32_e32 v2, s1, v2 +; GFX10-NEXT: v_subrev_nc_u32_e32 v2, s9, v2 ; GFX10-NEXT: v_sub_nc_u32_sdwa v3, v3, s10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_and_or_b32 v0, v0, v4, v1 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_and_or_b32 v2, v2, v4, v3 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v2, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: global_store_dword v1, v2, s[6:7] @@ -3463,9 +3458,8 @@ ; GFX10-NEXT: v_xor_b32_e32 v1, s7, v1 ; GFX10-NEXT: v_subrev_nc_u32_e32 v0, s4, v0 ; GFX10-NEXT: v_subrev_nc_u32_e32 v1, s7, v1 -; GFX10-NEXT: s_mov_b32 s4, 0x7ffffff -; GFX10-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX10-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX10-NEXT: v_and_b32_e32 v0, 0x7ffffff, v0 +; GFX10-NEXT: v_and_b32_e32 v1, 0x7ffffff, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-NEXT: global_store_dword v2, 
v1, s[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll @@ -439,9 +439,8 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 -; GFX10-NEXT: s_brev_b32 s4, -4 -; GFX10-NEXT: v_and_b32_e32 v2, s4, v0 -; GFX10-NEXT: v_and_b32_e32 v4, s4, v1 +; GFX10-NEXT: v_and_b32_e32 v2, 0x3fffffff, v0 +; GFX10-NEXT: v_and_b32_e32 v4, 0x3fffffff, v1 ; GFX10-NEXT: v_mov_b32_e32 v5, v3 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], 2, v[2:3] ; GFX10-NEXT: v_lshlrev_b64 v[2:3], 2, v[4:5] @@ -523,9 +522,8 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_brev_b32 s4, -8 -; GFX10-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX10-NEXT: v_and_b32_e32 v2, s4, v1 +; GFX10-NEXT: v_and_b32_e32 v0, 0x1fffffff, v0 +; GFX10-NEXT: v_and_b32_e32 v2, 0x1fffffff, v1 ; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] @@ -608,13 +606,12 @@ define amdgpu_ps <2 x i32> @s_shl_v2i32_zext_v2i16(<2 x i16> inreg %x) { ; GFX7-LABEL: s_shl_v2i32_zext_v2i16: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_mov_b32 s2, 0xffff ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 -; GFX7-NEXT: s_and_b32 s0, s0, s2 +; GFX7-NEXT: s_and_b32 s0, s0, 0xffff ; GFX7-NEXT: s_or_b32 s0, s1, s0 ; GFX7-NEXT: s_and_b32 s0, s0, 0x3fff3fff ; GFX7-NEXT: s_lshr_b32 s1, s0, 16 -; GFX7-NEXT: s_and_b32 s0, s0, s2 +; GFX7-NEXT: s_and_b32 s0, s0, 0xffff ; GFX7-NEXT: s_lshl_b32 s0, s0, 2 ; GFX7-NEXT: s_lshl_b32 s1, s1, 2 ; GFX7-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll 
@@ -781,25 +781,23 @@ define amdgpu_ps i32 @s_shl_v2i16(<2 x i16> inreg %value, <2 x i16> inreg %amount) { ; GFX6-LABEL: s_shl_v2i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s4, 0xffff ; GFX6-NEXT: s_lshl_b32 s1, s1, s3 ; GFX6-NEXT: s_lshl_b32 s0, s0, s2 -; GFX6-NEXT: s_and_b32 s1, s1, s4 -; GFX6-NEXT: s_and_b32 s0, s0, s4 +; GFX6-NEXT: s_and_b32 s1, s1, 0xffff +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_shl_v2i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b32 s3, 0xffff ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, s3 -; GFX8-NEXT: s_lshr_b32 s4, s1, 16 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff +; GFX8-NEXT: s_lshr_b32 s3, s1, 16 ; GFX8-NEXT: s_lshl_b32 s0, s0, s1 -; GFX8-NEXT: s_lshl_b32 s1, s2, s4 +; GFX8-NEXT: s_lshl_b32 s1, s2, s3 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_and_b32 s0, s0, s3 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_or_b32 s0, s1, s0 ; GFX8-NEXT: ; return to shader part epilog ; @@ -865,10 +863,10 @@ define amdgpu_ps float @shl_v2i16_vs(<2 x i16> %value, <2 x i16> inreg %amount) { ; GFX6-LABEL: shl_v2i16_vs: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s2, 0xffff -; GFX6-NEXT: s_and_b32 s0, s0, s2 +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: v_lshlrev_b32_e32 v0, s0, v0 -; GFX6-NEXT: s_and_b32 s0, s1, s2 +; GFX6-NEXT: s_and_b32 s0, s1, 0xffff +; GFX6-NEXT: s_mov_b32 s2, 0xffff ; GFX6-NEXT: v_lshlrev_b32_e32 v1, s0, v1 ; GFX6-NEXT: v_and_b32_e32 v1, s2, v1 ; GFX6-NEXT: v_and_b32_e32 v0, s2, v0 @@ -966,39 +964,37 @@ define amdgpu_ps <2 x i32> @s_shl_v4i16(<4 x i16> inreg %value, <4 x i16> inreg %amount) { ; GFX6-LABEL: s_shl_v4i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s8, 0xffff ; GFX6-NEXT: s_lshl_b32 s1, s1, s5 ; GFX6-NEXT: s_lshl_b32 s0, s0, s4 -; GFX6-NEXT: s_and_b32 s1, s1, s8 +; GFX6-NEXT: s_and_b32 s1, s1, 0xffff ; GFX6-NEXT: s_lshl_b32 s2, s2, s6 ; GFX6-NEXT: s_lshl_b32 s3, 
s3, s7 -; GFX6-NEXT: s_and_b32 s0, s0, s8 +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s2, s8 -; GFX6-NEXT: s_and_b32 s2, s3, s8 +; GFX6-NEXT: s_and_b32 s1, s2, 0xffff +; GFX6-NEXT: s_and_b32 s2, s3, 0xffff ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 ; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_shl_v4i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b32 s6, 0xffff ; GFX8-NEXT: s_lshr_b32 s4, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, s6 -; GFX8-NEXT: s_lshr_b32 s7, s2, 16 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff +; GFX8-NEXT: s_lshr_b32 s6, s2, 16 ; GFX8-NEXT: s_lshr_b32 s5, s1, 16 -; GFX8-NEXT: s_and_b32 s1, s1, s6 -; GFX8-NEXT: s_lshr_b32 s8, s3, 16 +; GFX8-NEXT: s_and_b32 s1, s1, 0xffff +; GFX8-NEXT: s_lshr_b32 s7, s3, 16 ; GFX8-NEXT: s_lshl_b32 s0, s0, s2 -; GFX8-NEXT: s_lshl_b32 s2, s4, s7 +; GFX8-NEXT: s_lshl_b32 s2, s4, s6 ; GFX8-NEXT: s_lshl_b32 s1, s1, s3 -; GFX8-NEXT: s_lshl_b32 s3, s5, s8 +; GFX8-NEXT: s_lshl_b32 s3, s5, s7 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16 -; GFX8-NEXT: s_and_b32 s0, s0, s6 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_or_b32 s0, s2, s0 ; GFX8-NEXT: s_lshl_b32 s2, s3, 16 -; GFX8-NEXT: s_and_b32 s1, s1, s6 +; GFX8-NEXT: s_and_b32 s1, s1, 0xffff ; GFX8-NEXT: s_or_b32 s1, s2, s1 ; GFX8-NEXT: ; return to shader part epilog ; @@ -1140,67 +1136,65 @@ define amdgpu_ps <4 x i32> @s_shl_v8i16(<8 x i16> inreg %value, <8 x i16> inreg %amount) { ; GFX6-LABEL: s_shl_v8i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s16, 0xffff ; GFX6-NEXT: s_lshl_b32 s1, s1, s9 ; GFX6-NEXT: s_lshl_b32 s0, s0, s8 -; GFX6-NEXT: s_and_b32 s1, s1, s16 +; GFX6-NEXT: s_and_b32 s1, s1, 0xffff ; GFX6-NEXT: s_lshl_b32 s2, s2, s10 ; GFX6-NEXT: s_lshl_b32 s3, s3, s11 -; GFX6-NEXT: s_and_b32 s0, s0, s16 +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_lshl_b32 s5, s5, s13 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; 
GFX6-NEXT: s_and_b32 s1, s2, s16 -; GFX6-NEXT: s_and_b32 s2, s3, s16 +; GFX6-NEXT: s_and_b32 s1, s2, 0xffff +; GFX6-NEXT: s_and_b32 s2, s3, 0xffff ; GFX6-NEXT: s_lshl_b32 s4, s4, s12 ; GFX6-NEXT: s_lshl_b32 s7, s7, s15 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_and_b32 s3, s5, s16 +; GFX6-NEXT: s_and_b32 s3, s5, 0xffff ; GFX6-NEXT: s_lshl_b32 s6, s6, s14 ; GFX6-NEXT: s_or_b32 s1, s1, s2 -; GFX6-NEXT: s_and_b32 s2, s4, s16 +; GFX6-NEXT: s_and_b32 s2, s4, 0xffff ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 -; GFX6-NEXT: s_and_b32 s4, s7, s16 +; GFX6-NEXT: s_and_b32 s4, s7, 0xffff ; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_and_b32 s3, s6, s16 +; GFX6-NEXT: s_and_b32 s3, s6, 0xffff ; GFX6-NEXT: s_lshl_b32 s4, s4, 16 ; GFX6-NEXT: s_or_b32 s3, s3, s4 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_shl_v8i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b32 s12, 0xffff ; GFX8-NEXT: s_lshr_b32 s8, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, s12 -; GFX8-NEXT: s_lshr_b32 s13, s4, 16 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff +; GFX8-NEXT: s_lshr_b32 s12, s4, 16 ; GFX8-NEXT: s_lshr_b32 s9, s1, 16 -; GFX8-NEXT: s_and_b32 s1, s1, s12 -; GFX8-NEXT: s_lshr_b32 s14, s5, 16 +; GFX8-NEXT: s_and_b32 s1, s1, 0xffff +; GFX8-NEXT: s_lshr_b32 s13, s5, 16 ; GFX8-NEXT: s_lshl_b32 s0, s0, s4 -; GFX8-NEXT: s_lshl_b32 s4, s8, s13 +; GFX8-NEXT: s_lshl_b32 s4, s8, s12 ; GFX8-NEXT: s_lshr_b32 s10, s2, 16 -; GFX8-NEXT: s_and_b32 s2, s2, s12 -; GFX8-NEXT: s_lshr_b32 s15, s6, 16 +; GFX8-NEXT: s_and_b32 s2, s2, 0xffff +; GFX8-NEXT: s_lshr_b32 s14, s6, 16 ; GFX8-NEXT: s_lshl_b32 s1, s1, s5 -; GFX8-NEXT: s_lshl_b32 s5, s9, s14 +; GFX8-NEXT: s_lshl_b32 s5, s9, s13 ; GFX8-NEXT: s_lshl_b32 s4, s4, 16 -; GFX8-NEXT: s_and_b32 s0, s0, s12 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_lshr_b32 s11, s3, 16 -; GFX8-NEXT: s_and_b32 s3, s3, s12 -; GFX8-NEXT: s_lshr_b32 s16, s7, 16 +; GFX8-NEXT: s_and_b32 s3, s3, 0xffff +; GFX8-NEXT: s_lshr_b32 s15, s7, 16 ; GFX8-NEXT: s_lshl_b32 s2, s2, s6 -; GFX8-NEXT: 
s_lshl_b32 s6, s10, s15 +; GFX8-NEXT: s_lshl_b32 s6, s10, s14 ; GFX8-NEXT: s_or_b32 s0, s4, s0 ; GFX8-NEXT: s_lshl_b32 s4, s5, 16 -; GFX8-NEXT: s_and_b32 s1, s1, s12 +; GFX8-NEXT: s_and_b32 s1, s1, 0xffff ; GFX8-NEXT: s_lshl_b32 s3, s3, s7 -; GFX8-NEXT: s_lshl_b32 s7, s11, s16 +; GFX8-NEXT: s_lshl_b32 s7, s11, s15 ; GFX8-NEXT: s_or_b32 s1, s4, s1 ; GFX8-NEXT: s_lshl_b32 s4, s6, 16 -; GFX8-NEXT: s_and_b32 s2, s2, s12 +; GFX8-NEXT: s_and_b32 s2, s2, 0xffff ; GFX8-NEXT: s_or_b32 s2, s4, s2 ; GFX8-NEXT: s_lshl_b32 s4, s7, 16 -; GFX8-NEXT: s_and_b32 s3, s3, s12 +; GFX8-NEXT: s_and_b32 s3, s3, 0xffff ; GFX8-NEXT: s_or_b32 s3, s4, s3 ; GFX8-NEXT: ; return to shader part epilog ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll @@ -377,13 +377,13 @@ ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v4 ; CGP-NEXT: v_subrev_i32_e32 v3, vcc, s4, v0 -; CGP-NEXT: v_sub_i32_e32 v4, vcc, v1, v5 +; CGP-NEXT: v_subrev_i32_e32 v4, vcc, 0x1000, v1 ; CGP-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 ; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 ; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; CGP-NEXT: v_subrev_i32_e32 v3, vcc, s4, v0 -; CGP-NEXT: v_sub_i32_e32 v4, vcc, v1, v5 +; CGP-NEXT: v_subrev_i32_e32 v4, vcc, 0x1000, v1 ; CGP-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 ; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 @@ -508,13 +508,13 @@ ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v4 ; CGP-NEXT: v_subrev_i32_e32 v3, vcc, s4, v0 -; CGP-NEXT: v_sub_i32_e32 v4, vcc, v1, v5 +; CGP-NEXT: v_subrev_i32_e32 v4, vcc, 0x12d8fb, v1 ; CGP-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 ; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 ; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v4, 
vcc ; CGP-NEXT: v_subrev_i32_e32 v3, vcc, s4, v0 -; CGP-NEXT: v_sub_i32_e32 v4, vcc, v1, v5 +; CGP-NEXT: v_subrev_i32_e32 v4, vcc, 0x12d8fb, v1 ; CGP-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 ; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll @@ -1172,8 +1172,7 @@ ; GISEL-LABEL: v_srem_v2i64_pow2k_denom: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_movk_i32 s10, 0x1000 -; GISEL-NEXT: s_add_u32 s4, s10, 0 +; GISEL-NEXT: s_add_u32 s4, 0x1000, 0 ; GISEL-NEXT: s_cselect_b32 s5, 1, 0 ; GISEL-NEXT: s_and_b32 s5, s5, 1 ; GISEL-NEXT: s_mov_b32 s6, 0 @@ -1305,7 +1304,7 @@ ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], s9, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[4:5] -; GISEL-NEXT: s_add_u32 s4, s10, 0 +; GISEL-NEXT: s_add_u32 s4, 0x1000, 0 ; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc ; GISEL-NEXT: s_cselect_b32 s5, 1, 0 ; GISEL-NEXT: v_subrev_i32_e32 v9, vcc, s8, v7 @@ -1877,8 +1876,7 @@ ; GISEL-LABEL: v_srem_v2i64_oddk_denom: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s10, 0x12d8fb -; GISEL-NEXT: s_add_u32 s4, s10, 0 +; GISEL-NEXT: s_add_u32 s4, 0x12d8fb, 0 ; GISEL-NEXT: s_cselect_b32 s5, 1, 0 ; GISEL-NEXT: s_and_b32 s5, s5, 1 ; GISEL-NEXT: s_mov_b32 s6, 0 @@ -2010,7 +2008,7 @@ ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], s9, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[4:5] -; GISEL-NEXT: s_add_u32 s4, s10, 0 +; GISEL-NEXT: s_add_u32 s4, 0x12d8fb, 0 ; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc ; GISEL-NEXT: s_cselect_b32 s5, 1, 0 ; GISEL-NEXT: v_subrev_i32_e32 v9, vcc, s8, v7 diff --git 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll @@ -320,12 +320,11 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_mov_b32 s4, 8 -; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX10-NEXT: v_lshrrev_b32_sdwa v3, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_lshrrev_b32_sdwa v4, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_sdwa v3, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: s_movk_i32 s4, 0xff -; GFX10-NEXT: v_and_or_b32 v0, v0, v2, v3 -; GFX10-NEXT: v_and_or_b32 v1, v1, v2, v4 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v2 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v3 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_sub_i16 v0, v0, v1 clamp @@ -345,31 +344,28 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_lshr_b32 s2, s0, 8 ; GFX6-NEXT: s_lshl_b32 s0, s0, 24 -; GFX6-NEXT: s_brev_b32 s4, -2 -; GFX6-NEXT: s_max_i32 s6, s0, -1 +; GFX6-NEXT: s_max_i32 s4, s0, -1 ; GFX6-NEXT: s_lshr_b32 s3, s1, 8 ; GFX6-NEXT: s_lshl_b32 s1, s1, 24 -; GFX6-NEXT: s_brev_b32 s5, 1 -; GFX6-NEXT: s_sub_i32 s6, s6, s4 -; GFX6-NEXT: s_min_i32 s7, s0, -1 -; GFX6-NEXT: s_sub_i32 s7, s7, s5 -; GFX6-NEXT: s_max_i32 s1, s6, s1 -; GFX6-NEXT: s_min_i32 s1, s1, s7 +; GFX6-NEXT: s_sub_i32 s4, s4, 0x7fffffff +; GFX6-NEXT: s_min_i32 s5, s0, -1 +; GFX6-NEXT: s_sub_i32 s5, s5, 0x80000000 +; GFX6-NEXT: s_max_i32 s1, s4, s1 +; GFX6-NEXT: s_min_i32 s1, s1, s5 ; GFX6-NEXT: s_sub_i32 s0, s0, s1 ; GFX6-NEXT: s_lshl_b32 s1, s2, 24 ; GFX6-NEXT: s_lshl_b32 s2, s3, 24 ; 
GFX6-NEXT: s_max_i32 s3, s1, -1 -; GFX6-NEXT: s_sub_i32 s3, s3, s4 +; GFX6-NEXT: s_sub_i32 s3, s3, 0x7fffffff ; GFX6-NEXT: s_min_i32 s4, s1, -1 -; GFX6-NEXT: s_sub_i32 s4, s4, s5 +; GFX6-NEXT: s_sub_i32 s4, s4, 0x80000000 ; GFX6-NEXT: s_max_i32 s2, s3, s2 ; GFX6-NEXT: s_min_i32 s2, s2, s4 ; GFX6-NEXT: s_sub_i32 s1, s1, s2 ; GFX6-NEXT: s_ashr_i32 s1, s1, 24 -; GFX6-NEXT: s_movk_i32 s2, 0xff ; GFX6-NEXT: s_ashr_i32 s0, s0, 24 -; GFX6-NEXT: s_and_b32 s1, s1, s2 -; GFX6-NEXT: s_and_b32 s0, s0, s2 +; GFX6-NEXT: s_and_b32 s1, s1, 0xff +; GFX6-NEXT: s_and_b32 s0, s0, 0xff ; GFX6-NEXT: s_lshl_b32 s1, s1, 8 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: ; return to shader part epilog @@ -379,32 +375,30 @@ ; GFX8-NEXT: s_bfe_u32 s4, 8, 0x100000 ; GFX8-NEXT: s_lshr_b32 s2, s0, 8 ; GFX8-NEXT: s_lshl_b32 s0, s0, s4 -; GFX8-NEXT: s_sext_i32_i16 s7, s0 -; GFX8-NEXT: s_sext_i32_i16 s8, -1 -; GFX8-NEXT: s_movk_i32 s5, 0x7fff -; GFX8-NEXT: s_max_i32 s9, s7, s8 +; GFX8-NEXT: s_sext_i32_i16 s5, s0 +; GFX8-NEXT: s_sext_i32_i16 s6, -1 +; GFX8-NEXT: s_max_i32 s7, s5, s6 ; GFX8-NEXT: s_lshr_b32 s3, s1, 8 ; GFX8-NEXT: s_lshl_b32 s1, s1, s4 -; GFX8-NEXT: s_sub_i32 s9, s9, s5 -; GFX8-NEXT: s_movk_i32 s6, 0x8000 -; GFX8-NEXT: s_min_i32 s7, s7, s8 -; GFX8-NEXT: s_sext_i32_i16 s9, s9 +; GFX8-NEXT: s_sub_i32 s7, s7, 0x7fff +; GFX8-NEXT: s_min_i32 s5, s5, s6 +; GFX8-NEXT: s_sext_i32_i16 s7, s7 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_sub_i32 s7, s7, s6 -; GFX8-NEXT: s_max_i32 s1, s9, s1 +; GFX8-NEXT: s_sub_i32 s5, s5, 0xffff8000 +; GFX8-NEXT: s_max_i32 s1, s7, s1 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_sext_i32_i16 s7, s7 -; GFX8-NEXT: s_min_i32 s1, s1, s7 +; GFX8-NEXT: s_sext_i32_i16 s5, s5 +; GFX8-NEXT: s_min_i32 s1, s1, s5 ; GFX8-NEXT: s_sub_i32 s0, s0, s1 ; GFX8-NEXT: s_lshl_b32 s1, s2, s4 ; GFX8-NEXT: s_lshl_b32 s2, s3, s4 ; GFX8-NEXT: s_sext_i32_i16 s3, s1 -; GFX8-NEXT: s_max_i32 s7, s3, s8 -; GFX8-NEXT: s_sub_i32 s5, s7, s5 -; GFX8-NEXT: s_min_i32 s3, s3, s8 +; 
GFX8-NEXT: s_max_i32 s5, s3, s6 +; GFX8-NEXT: s_sub_i32 s5, s5, 0x7fff +; GFX8-NEXT: s_min_i32 s3, s3, s6 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 -; GFX8-NEXT: s_sub_i32 s3, s3, s6 +; GFX8-NEXT: s_sub_i32 s3, s3, 0xffff8000 ; GFX8-NEXT: s_max_i32 s2, s5, s2 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 @@ -413,10 +407,9 @@ ; GFX8-NEXT: s_sext_i32_i16 s1, s1 ; GFX8-NEXT: s_sext_i32_i16 s0, s0 ; GFX8-NEXT: s_ashr_i32 s1, s1, s4 -; GFX8-NEXT: s_movk_i32 s2, 0xff ; GFX8-NEXT: s_ashr_i32 s0, s0, s4 -; GFX8-NEXT: s_and_b32 s1, s1, s2 -; GFX8-NEXT: s_and_b32 s0, s0, s2 +; GFX8-NEXT: s_and_b32 s1, s1, 0xff +; GFX8-NEXT: s_and_b32 s0, s0, 0xff ; GFX8-NEXT: s_lshl_b32 s1, s1, s4 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog @@ -424,17 +417,16 @@ ; GFX9-LABEL: s_ssubsat_v2i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_lshr_b32 s2, s0, 8 -; GFX9-NEXT: s_lshr_b32 s3, s1, 8 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX9-NEXT: s_lshr_b32 s3, s1, 8 +; GFX9-NEXT: s_lshr_b32 s2, s0, 16 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s3 -; GFX9-NEXT: s_mov_b32 s2, 0x80008 -; GFX9-NEXT: s_lshr_b32 s3, s0, 16 -; GFX9-NEXT: s_lshl_b32 s0, s0, s2 -; GFX9-NEXT: s_lshl_b32 s3, s3, 8 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s3 -; GFX9-NEXT: s_lshr_b32 s3, s1, 16 -; GFX9-NEXT: s_lshl_b32 s1, s1, s2 -; GFX9-NEXT: s_lshl_b32 s2, s3, 8 +; GFX9-NEXT: s_lshl_b32 s0, s0, 0x80008 +; GFX9-NEXT: s_lshl_b32 s2, s2, 8 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX9-NEXT: s_lshr_b32 s2, s1, 16 +; GFX9-NEXT: s_lshl_b32 s1, s1, 0x80008 +; GFX9-NEXT: s_lshl_b32 s2, s2, 8 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s2 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: v_pk_sub_i16 v0, s0, v0 clamp @@ -451,15 +443,14 @@ ; GFX10-NEXT: s_lshr_b32 s3, s1, 8 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s3 -; GFX10-NEXT: s_mov_b32 s2, 0x80008 -; GFX10-NEXT: s_lshr_b32 s3, s0, 16 -; GFX10-NEXT: s_lshr_b32 s4, s1, 16 
-; GFX10-NEXT: s_lshl_b32 s0, s0, s2 +; GFX10-NEXT: s_lshr_b32 s2, s0, 16 +; GFX10-NEXT: s_lshr_b32 s3, s1, 16 +; GFX10-NEXT: s_lshl_b32 s0, s0, 0x80008 +; GFX10-NEXT: s_lshl_b32 s2, s2, 8 +; GFX10-NEXT: s_lshl_b32 s1, s1, 0x80008 ; GFX10-NEXT: s_lshl_b32 s3, s3, 8 -; GFX10-NEXT: s_lshl_b32 s1, s1, s2 -; GFX10-NEXT: s_lshl_b32 s2, s4, 8 -; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s3 -; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s2 +; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s3 ; GFX10-NEXT: v_pk_sub_i16 v0, s0, s1 clamp ; GFX10-NEXT: s_movk_i32 s0, 0xff ; GFX10-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] @@ -639,24 +630,22 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v1 ; GFX10-NEXT: s_mov_b32 s4, 8 -; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; GFX10-NEXT: v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: v_lshrrev_b32_sdwa v6, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff -; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX10-NEXT: s_movk_i32 s4, 0xff -; GFX10-NEXT: v_and_or_b32 v0, v0, v7, v2 -; GFX10-NEXT: v_and_or_b32 v1, v1, v7, v6 -; GFX10-NEXT: v_and_or_b32 v2, v3, v7, v4 -; GFX10-NEXT: v_and_or_b32 v3, v8, v7, v5 -; GFX10-NEXT: v_mov_b32_e32 v4, 24 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v2 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v6 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v5, v3 +; GFX10-NEXT: v_and_or_b32 v3, 0xffff, v7, v4 ; 
GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] +; GFX10-NEXT: v_mov_b32_e32 v4, 24 ; GFX10-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_sub_i16 v0, v0, v1 clamp @@ -665,9 +654,9 @@ ; GFX10-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_ashrrev_i16 v1, 8, v1 op_sel_hi:[0,1] ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_b32_e32 v3, s4, v1 +; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v0, v0, s4, v2 +; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -685,59 +674,56 @@ ; GFX6-NEXT: s_lshr_b32 s3, s0, 16 ; GFX6-NEXT: s_lshr_b32 s4, s0, 24 ; GFX6-NEXT: s_lshl_b32 s0, s0, 24 -; GFX6-NEXT: s_brev_b32 s8, -2 -; GFX6-NEXT: s_max_i32 s10, s0, -1 +; GFX6-NEXT: s_max_i32 s8, s0, -1 ; GFX6-NEXT: s_lshr_b32 s5, s1, 8 ; GFX6-NEXT: s_lshr_b32 s6, s1, 16 ; GFX6-NEXT: s_lshr_b32 s7, s1, 24 ; GFX6-NEXT: s_lshl_b32 s1, s1, 24 -; GFX6-NEXT: s_brev_b32 s9, 1 -; GFX6-NEXT: s_sub_i32 s10, s10, s8 -; GFX6-NEXT: s_min_i32 s11, s0, -1 -; GFX6-NEXT: s_sub_i32 s11, s11, s9 -; GFX6-NEXT: s_max_i32 s1, s10, s1 -; GFX6-NEXT: s_min_i32 s1, s1, s11 +; GFX6-NEXT: s_sub_i32 s8, s8, 0x7fffffff +; GFX6-NEXT: s_min_i32 s9, s0, -1 +; GFX6-NEXT: s_sub_i32 s9, s9, 0x80000000 +; GFX6-NEXT: s_max_i32 s1, s8, s1 +; GFX6-NEXT: s_min_i32 s1, s1, s9 ; GFX6-NEXT: s_sub_i32 s0, s0, s1 ; GFX6-NEXT: s_lshl_b32 s1, s2, 24 ; GFX6-NEXT: s_lshl_b32 s2, s5, 24 ; GFX6-NEXT: s_max_i32 s5, s1, -1 -; GFX6-NEXT: s_sub_i32 s5, s5, s8 -; GFX6-NEXT: s_min_i32 s10, s1, -1 -; GFX6-NEXT: s_sub_i32 s10, s10, s9 +; GFX6-NEXT: s_sub_i32 s5, s5, 
0x7fffffff +; GFX6-NEXT: s_min_i32 s8, s1, -1 +; GFX6-NEXT: s_sub_i32 s8, s8, 0x80000000 ; GFX6-NEXT: s_max_i32 s2, s5, s2 -; GFX6-NEXT: s_min_i32 s2, s2, s10 +; GFX6-NEXT: s_min_i32 s2, s2, s8 ; GFX6-NEXT: s_sub_i32 s1, s1, s2 ; GFX6-NEXT: s_lshl_b32 s2, s3, 24 ; GFX6-NEXT: s_max_i32 s5, s2, -1 ; GFX6-NEXT: s_lshl_b32 s3, s6, 24 -; GFX6-NEXT: s_sub_i32 s5, s5, s8 +; GFX6-NEXT: s_sub_i32 s5, s5, 0x7fffffff ; GFX6-NEXT: s_min_i32 s6, s2, -1 -; GFX6-NEXT: s_sub_i32 s6, s6, s9 +; GFX6-NEXT: s_sub_i32 s6, s6, 0x80000000 ; GFX6-NEXT: s_max_i32 s3, s5, s3 ; GFX6-NEXT: s_min_i32 s3, s3, s6 ; GFX6-NEXT: s_sub_i32 s2, s2, s3 ; GFX6-NEXT: s_lshl_b32 s3, s4, 24 ; GFX6-NEXT: s_max_i32 s5, s3, -1 +; GFX6-NEXT: s_ashr_i32 s1, s1, 24 ; GFX6-NEXT: s_lshl_b32 s4, s7, 24 -; GFX6-NEXT: s_sub_i32 s5, s5, s8 +; GFX6-NEXT: s_sub_i32 s5, s5, 0x7fffffff ; GFX6-NEXT: s_min_i32 s6, s3, -1 -; GFX6-NEXT: s_sub_i32 s6, s6, s9 -; GFX6-NEXT: s_max_i32 s4, s5, s4 -; GFX6-NEXT: s_min_i32 s4, s4, s6 -; GFX6-NEXT: s_ashr_i32 s1, s1, 24 -; GFX6-NEXT: s_sub_i32 s3, s3, s4 -; GFX6-NEXT: s_movk_i32 s4, 0xff ; GFX6-NEXT: s_ashr_i32 s0, s0, 24 -; GFX6-NEXT: s_and_b32 s1, s1, s4 +; GFX6-NEXT: s_sub_i32 s6, s6, 0x80000000 +; GFX6-NEXT: s_max_i32 s4, s5, s4 +; GFX6-NEXT: s_and_b32 s1, s1, 0xff ; GFX6-NEXT: s_ashr_i32 s2, s2, 24 -; GFX6-NEXT: s_and_b32 s0, s0, s4 +; GFX6-NEXT: s_min_i32 s4, s4, s6 +; GFX6-NEXT: s_and_b32 s0, s0, 0xff ; GFX6-NEXT: s_lshl_b32 s1, s1, 8 +; GFX6-NEXT: s_sub_i32 s3, s3, s4 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s2, s4 +; GFX6-NEXT: s_and_b32 s1, s2, 0xff ; GFX6-NEXT: s_ashr_i32 s3, s3, 24 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s3, s4 +; GFX6-NEXT: s_and_b32 s1, s3, 0xff ; GFX6-NEXT: s_lshl_b32 s1, s1, 24 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: ; return to shader part epilog @@ -749,35 +735,33 @@ ; GFX8-NEXT: s_lshr_b32 s3, s0, 16 ; GFX8-NEXT: s_lshr_b32 s4, s0, 24 ; GFX8-NEXT: s_lshl_b32 s0, s0, 
s8 -; GFX8-NEXT: s_sext_i32_i16 s11, s0 -; GFX8-NEXT: s_sext_i32_i16 s12, -1 -; GFX8-NEXT: s_movk_i32 s9, 0x7fff -; GFX8-NEXT: s_max_i32 s13, s11, s12 +; GFX8-NEXT: s_sext_i32_i16 s9, s0 +; GFX8-NEXT: s_sext_i32_i16 s10, -1 +; GFX8-NEXT: s_max_i32 s11, s9, s10 ; GFX8-NEXT: s_lshr_b32 s5, s1, 8 ; GFX8-NEXT: s_lshr_b32 s6, s1, 16 ; GFX8-NEXT: s_lshr_b32 s7, s1, 24 ; GFX8-NEXT: s_lshl_b32 s1, s1, s8 -; GFX8-NEXT: s_sub_i32 s13, s13, s9 -; GFX8-NEXT: s_movk_i32 s10, 0x8000 -; GFX8-NEXT: s_min_i32 s11, s11, s12 -; GFX8-NEXT: s_sext_i32_i16 s13, s13 +; GFX8-NEXT: s_sub_i32 s11, s11, 0x7fff +; GFX8-NEXT: s_min_i32 s9, s9, s10 +; GFX8-NEXT: s_sext_i32_i16 s11, s11 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_sub_i32 s11, s11, s10 -; GFX8-NEXT: s_max_i32 s1, s13, s1 +; GFX8-NEXT: s_sub_i32 s9, s9, 0xffff8000 +; GFX8-NEXT: s_max_i32 s1, s11, s1 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_sext_i32_i16 s11, s11 -; GFX8-NEXT: s_min_i32 s1, s1, s11 +; GFX8-NEXT: s_sext_i32_i16 s9, s9 +; GFX8-NEXT: s_min_i32 s1, s1, s9 ; GFX8-NEXT: s_sub_i32 s0, s0, s1 ; GFX8-NEXT: s_lshl_b32 s1, s2, s8 ; GFX8-NEXT: s_lshl_b32 s2, s5, s8 ; GFX8-NEXT: s_sext_i32_i16 s5, s1 -; GFX8-NEXT: s_max_i32 s11, s5, s12 -; GFX8-NEXT: s_sub_i32 s11, s11, s9 -; GFX8-NEXT: s_min_i32 s5, s5, s12 -; GFX8-NEXT: s_sext_i32_i16 s11, s11 +; GFX8-NEXT: s_max_i32 s9, s5, s10 +; GFX8-NEXT: s_sub_i32 s9, s9, 0x7fff +; GFX8-NEXT: s_min_i32 s5, s5, s10 +; GFX8-NEXT: s_sext_i32_i16 s9, s9 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 -; GFX8-NEXT: s_sub_i32 s5, s5, s10 -; GFX8-NEXT: s_max_i32 s2, s11, s2 +; GFX8-NEXT: s_sub_i32 s5, s5, 0xffff8000 +; GFX8-NEXT: s_max_i32 s2, s9, s2 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 ; GFX8-NEXT: s_min_i32 s2, s2, s5 @@ -785,12 +769,12 @@ ; GFX8-NEXT: s_lshl_b32 s2, s3, s8 ; GFX8-NEXT: s_sext_i32_i16 s5, s2 ; GFX8-NEXT: s_lshl_b32 s3, s6, s8 -; GFX8-NEXT: s_max_i32 s6, s5, s12 -; GFX8-NEXT: s_sub_i32 s6, s6, s9 -; GFX8-NEXT: s_min_i32 s5, s5, s12 +; 
GFX8-NEXT: s_max_i32 s6, s5, s10 +; GFX8-NEXT: s_sub_i32 s6, s6, 0x7fff +; GFX8-NEXT: s_min_i32 s5, s5, s10 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 -; GFX8-NEXT: s_sub_i32 s5, s5, s10 +; GFX8-NEXT: s_sub_i32 s5, s5, 0xffff8000 ; GFX8-NEXT: s_max_i32 s3, s6, s3 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 @@ -798,35 +782,34 @@ ; GFX8-NEXT: s_sub_i32 s2, s2, s3 ; GFX8-NEXT: s_lshl_b32 s3, s4, s8 ; GFX8-NEXT: s_sext_i32_i16 s5, s3 -; GFX8-NEXT: s_max_i32 s6, s5, s12 +; GFX8-NEXT: s_max_i32 s6, s5, s10 ; GFX8-NEXT: s_lshl_b32 s4, s7, s8 -; GFX8-NEXT: s_sub_i32 s6, s6, s9 -; GFX8-NEXT: s_min_i32 s5, s5, s12 +; GFX8-NEXT: s_sub_i32 s6, s6, 0x7fff +; GFX8-NEXT: s_min_i32 s5, s5, s10 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 -; GFX8-NEXT: s_sub_i32 s5, s5, s10 -; GFX8-NEXT: s_max_i32 s4, s6, s4 -; GFX8-NEXT: s_sext_i32_i16 s4, s4 -; GFX8-NEXT: s_sext_i32_i16 s5, s5 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_min_i32 s4, s4, s5 +; GFX8-NEXT: s_sub_i32 s5, s5, 0xffff8000 +; GFX8-NEXT: s_max_i32 s4, s6, s4 ; GFX8-NEXT: s_sext_i32_i16 s0, s0 ; GFX8-NEXT: s_ashr_i32 s1, s1, s8 -; GFX8-NEXT: s_sub_i32 s3, s3, s4 -; GFX8-NEXT: s_movk_i32 s4, 0xff +; GFX8-NEXT: s_sext_i32_i16 s4, s4 +; GFX8-NEXT: s_sext_i32_i16 s5, s5 ; GFX8-NEXT: s_ashr_i32 s0, s0, s8 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 -; GFX8-NEXT: s_and_b32 s1, s1, s4 +; GFX8-NEXT: s_min_i32 s4, s4, s5 +; GFX8-NEXT: s_and_b32 s1, s1, 0xff ; GFX8-NEXT: s_ashr_i32 s2, s2, s8 -; GFX8-NEXT: s_and_b32 s0, s0, s4 +; GFX8-NEXT: s_sub_i32 s3, s3, s4 +; GFX8-NEXT: s_and_b32 s0, s0, 0xff ; GFX8-NEXT: s_lshl_b32 s1, s1, 8 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s1, s2, s4 +; GFX8-NEXT: s_and_b32 s1, s2, 0xff ; GFX8-NEXT: s_ashr_i32 s3, s3, s8 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s1, s3, s4 +; GFX8-NEXT: s_and_b32 s1, s3, 0xff ; GFX8-NEXT: 
s_lshl_b32 s1, s1, 24 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog @@ -838,27 +821,26 @@ ; GFX9-NEXT: s_lshr_b32 s6, s0, 24 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s3 ; GFX9-NEXT: s_pack_ll_b32_b16 s3, s4, s6 -; GFX9-NEXT: s_mov_b32 s4, 0x80008 -; GFX9-NEXT: s_lshr_b32 s6, s0, 16 +; GFX9-NEXT: s_lshr_b32 s4, s0, 16 ; GFX9-NEXT: s_lshr_b32 s7, s1, 8 -; GFX9-NEXT: s_lshl_b32 s0, s0, s4 -; GFX9-NEXT: s_lshl_b32 s6, s6, 8 +; GFX9-NEXT: s_lshl_b32 s0, s0, 0x80008 +; GFX9-NEXT: s_lshl_b32 s4, s4, 8 ; GFX9-NEXT: s_lshr_b32 s8, s1, 16 ; GFX9-NEXT: s_lshr_b32 s9, s1, 24 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s6 -; GFX9-NEXT: s_lshr_b32 s6, s3, 16 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s4 +; GFX9-NEXT: s_lshr_b32 s4, s3, 16 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s7 -; GFX9-NEXT: s_lshl_b32 s3, s3, s4 +; GFX9-NEXT: s_lshl_b32 s3, s3, 0x80008 +; GFX9-NEXT: s_lshl_b32 s4, s4, 8 +; GFX9-NEXT: s_lshr_b32 s6, s1, 16 +; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s8, s9 +; GFX9-NEXT: s_lshl_b32 s1, s1, 0x80008 +; GFX9-NEXT: s_lshl_b32 s6, s6, 8 +; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s6 +; GFX9-NEXT: s_lshr_b32 s6, s4, 16 +; GFX9-NEXT: s_lshl_b32 s4, s4, 0x80008 ; GFX9-NEXT: s_lshl_b32 s6, s6, 8 -; GFX9-NEXT: s_lshr_b32 s7, s1, 16 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s6, s8, s9 -; GFX9-NEXT: s_lshl_b32 s1, s1, s4 -; GFX9-NEXT: s_lshl_b32 s7, s7, 8 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s7 -; GFX9-NEXT: s_lshr_b32 s7, s6, 16 -; GFX9-NEXT: s_lshl_b32 s4, s6, s4 -; GFX9-NEXT: s_lshl_b32 s6, s7, 8 ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s6 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: v_pk_sub_i16 v0, s0, v0 clamp @@ -885,39 +867,37 @@ ; GFX10-NEXT: s_lshr_b32 s4, s0, 24 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 ; GFX10-NEXT: s_pack_ll_b32_b16 s2, s3, s4 -; GFX10-NEXT: s_mov_b32 s3, 0x80008 -; GFX10-NEXT: s_lshr_b32 s4, s0, 16 +; GFX10-NEXT: s_lshr_b32 s3, s0, 16 ; GFX10-NEXT: 
s_lshr_b32 s5, s1, 8 ; GFX10-NEXT: s_lshr_b32 s6, s1, 16 ; GFX10-NEXT: s_lshr_b32 s7, s1, 24 -; GFX10-NEXT: s_lshl_b32 s0, s0, s3 +; GFX10-NEXT: s_lshl_b32 s0, s0, 0x80008 +; GFX10-NEXT: s_lshl_b32 s3, s3, 8 +; GFX10-NEXT: s_lshr_b32 s4, s2, 16 +; GFX10-NEXT: s_lshl_b32 s2, s2, 0x80008 ; GFX10-NEXT: s_lshl_b32 s4, s4, 8 +; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s3 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5 -; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s4 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, s6, s7 -; GFX10-NEXT: s_lshr_b32 s8, s2, 16 -; GFX10-NEXT: s_lshr_b32 s5, s1, 16 -; GFX10-NEXT: s_lshr_b32 s6, s4, 16 -; GFX10-NEXT: s_lshl_b32 s2, s2, s3 -; GFX10-NEXT: s_lshl_b32 s8, s8, 8 -; GFX10-NEXT: s_lshl_b32 s1, s1, s3 +; GFX10-NEXT: s_pack_ll_b32_b16 s3, s6, s7 +; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s4 +; GFX10-NEXT: s_lshr_b32 s4, s1, 16 +; GFX10-NEXT: s_lshr_b32 s5, s3, 16 +; GFX10-NEXT: s_lshl_b32 s1, s1, 0x80008 +; GFX10-NEXT: s_lshl_b32 s4, s4, 8 +; GFX10-NEXT: s_lshl_b32 s3, s3, 0x80008 ; GFX10-NEXT: s_lshl_b32 s5, s5, 8 -; GFX10-NEXT: s_lshl_b32 s3, s4, s3 -; GFX10-NEXT: s_lshl_b32 s4, s6, 8 -; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s8 -; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5 -; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s4 +; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s5 ; GFX10-NEXT: v_pk_sub_i16 v0, s0, s1 clamp ; GFX10-NEXT: v_pk_sub_i16 v1, s2, s3 clamp ; GFX10-NEXT: s_mov_b32 s0, 8 -; GFX10-NEXT: s_movk_i32 s1, 0xff ; GFX10-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_ashrrev_i16 v1, 8, v1 op_sel_hi:[0,1] ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_b32_e32 v3, s1, v1 +; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v1 ; GFX10-NEXT: s_mov_b32 s0, 24 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v0, v0, s1, v2 +; GFX10-NEXT: 
v_and_or_b32 v0, v0, 0xff, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 @@ -1255,19 +1235,17 @@ define amdgpu_ps <2 x i32> @s_ssubsat_v2i32(<2 x i32> inreg %lhs, <2 x i32> inreg %rhs) { ; GFX6-LABEL: s_ssubsat_v2i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_brev_b32 s4, -2 -; GFX6-NEXT: s_max_i32 s6, s0, -1 -; GFX6-NEXT: s_brev_b32 s5, 1 -; GFX6-NEXT: s_sub_i32 s6, s6, s4 -; GFX6-NEXT: s_min_i32 s7, s0, -1 -; GFX6-NEXT: s_sub_i32 s7, s7, s5 -; GFX6-NEXT: s_max_i32 s2, s6, s2 -; GFX6-NEXT: s_min_i32 s2, s2, s7 +; GFX6-NEXT: s_max_i32 s4, s0, -1 +; GFX6-NEXT: s_sub_i32 s4, s4, 0x7fffffff +; GFX6-NEXT: s_min_i32 s5, s0, -1 +; GFX6-NEXT: s_sub_i32 s5, s5, 0x80000000 +; GFX6-NEXT: s_max_i32 s2, s4, s2 +; GFX6-NEXT: s_min_i32 s2, s2, s5 ; GFX6-NEXT: s_sub_i32 s0, s0, s2 ; GFX6-NEXT: s_max_i32 s2, s1, -1 -; GFX6-NEXT: s_sub_i32 s2, s2, s4 +; GFX6-NEXT: s_sub_i32 s2, s2, 0x7fffffff ; GFX6-NEXT: s_min_i32 s4, s1, -1 -; GFX6-NEXT: s_sub_i32 s4, s4, s5 +; GFX6-NEXT: s_sub_i32 s4, s4, 0x80000000 ; GFX6-NEXT: s_max_i32 s2, s2, s3 ; GFX6-NEXT: s_min_i32 s2, s2, s4 ; GFX6-NEXT: s_sub_i32 s1, s1, s2 @@ -1275,19 +1253,17 @@ ; ; GFX8-LABEL: s_ssubsat_v2i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_brev_b32 s4, -2 -; GFX8-NEXT: s_max_i32 s6, s0, -1 -; GFX8-NEXT: s_brev_b32 s5, 1 -; GFX8-NEXT: s_sub_i32 s6, s6, s4 -; GFX8-NEXT: s_min_i32 s7, s0, -1 -; GFX8-NEXT: s_sub_i32 s7, s7, s5 -; GFX8-NEXT: s_max_i32 s2, s6, s2 -; GFX8-NEXT: s_min_i32 s2, s2, s7 +; GFX8-NEXT: s_max_i32 s4, s0, -1 +; GFX8-NEXT: s_sub_i32 s4, s4, 0x7fffffff +; GFX8-NEXT: s_min_i32 s5, s0, -1 +; GFX8-NEXT: s_sub_i32 s5, s5, 0x80000000 +; GFX8-NEXT: s_max_i32 s2, s4, s2 +; GFX8-NEXT: s_min_i32 s2, s2, s5 ; GFX8-NEXT: s_sub_i32 s0, s0, s2 ; GFX8-NEXT: s_max_i32 s2, s1, -1 -; GFX8-NEXT: s_sub_i32 s2, s2, s4 +; GFX8-NEXT: s_sub_i32 s2, s2, 0x7fffffff ; GFX8-NEXT: s_min_i32 s4, s1, -1 -; GFX8-NEXT: s_sub_i32 s4, s4, s5 +; GFX8-NEXT: s_sub_i32 s4, s4, 
0x80000000 ; GFX8-NEXT: s_max_i32 s2, s2, s3 ; GFX8-NEXT: s_min_i32 s2, s2, s4 ; GFX8-NEXT: s_sub_i32 s1, s1, s2 @@ -1394,26 +1370,24 @@ define amdgpu_ps <3 x i32> @s_ssubsat_v3i32(<3 x i32> inreg %lhs, <3 x i32> inreg %rhs) { ; GFX6-LABEL: s_ssubsat_v3i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_brev_b32 s6, -2 -; GFX6-NEXT: s_max_i32 s8, s0, -1 -; GFX6-NEXT: s_brev_b32 s7, 1 -; GFX6-NEXT: s_sub_i32 s8, s8, s6 -; GFX6-NEXT: s_min_i32 s9, s0, -1 -; GFX6-NEXT: s_sub_i32 s9, s9, s7 -; GFX6-NEXT: s_max_i32 s3, s8, s3 -; GFX6-NEXT: s_min_i32 s3, s3, s9 +; GFX6-NEXT: s_max_i32 s6, s0, -1 +; GFX6-NEXT: s_sub_i32 s6, s6, 0x7fffffff +; GFX6-NEXT: s_min_i32 s7, s0, -1 +; GFX6-NEXT: s_sub_i32 s7, s7, 0x80000000 +; GFX6-NEXT: s_max_i32 s3, s6, s3 +; GFX6-NEXT: s_min_i32 s3, s3, s7 ; GFX6-NEXT: s_sub_i32 s0, s0, s3 ; GFX6-NEXT: s_max_i32 s3, s1, -1 -; GFX6-NEXT: s_sub_i32 s3, s3, s6 -; GFX6-NEXT: s_min_i32 s8, s1, -1 -; GFX6-NEXT: s_sub_i32 s8, s8, s7 +; GFX6-NEXT: s_sub_i32 s3, s3, 0x7fffffff +; GFX6-NEXT: s_min_i32 s6, s1, -1 +; GFX6-NEXT: s_sub_i32 s6, s6, 0x80000000 ; GFX6-NEXT: s_max_i32 s3, s3, s4 -; GFX6-NEXT: s_min_i32 s3, s3, s8 +; GFX6-NEXT: s_min_i32 s3, s3, s6 ; GFX6-NEXT: s_sub_i32 s1, s1, s3 ; GFX6-NEXT: s_max_i32 s3, s2, -1 -; GFX6-NEXT: s_sub_i32 s3, s3, s6 +; GFX6-NEXT: s_sub_i32 s3, s3, 0x7fffffff ; GFX6-NEXT: s_min_i32 s4, s2, -1 -; GFX6-NEXT: s_sub_i32 s4, s4, s7 +; GFX6-NEXT: s_sub_i32 s4, s4, 0x80000000 ; GFX6-NEXT: s_max_i32 s3, s3, s5 ; GFX6-NEXT: s_min_i32 s3, s3, s4 ; GFX6-NEXT: s_sub_i32 s2, s2, s3 @@ -1421,26 +1395,24 @@ ; ; GFX8-LABEL: s_ssubsat_v3i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_brev_b32 s6, -2 -; GFX8-NEXT: s_max_i32 s8, s0, -1 -; GFX8-NEXT: s_brev_b32 s7, 1 -; GFX8-NEXT: s_sub_i32 s8, s8, s6 -; GFX8-NEXT: s_min_i32 s9, s0, -1 -; GFX8-NEXT: s_sub_i32 s9, s9, s7 -; GFX8-NEXT: s_max_i32 s3, s8, s3 -; GFX8-NEXT: s_min_i32 s3, s3, s9 +; GFX8-NEXT: s_max_i32 s6, s0, -1 +; GFX8-NEXT: s_sub_i32 s6, s6, 0x7fffffff +; GFX8-NEXT: s_min_i32 s7, s0, -1 +; 
GFX8-NEXT: s_sub_i32 s7, s7, 0x80000000 +; GFX8-NEXT: s_max_i32 s3, s6, s3 +; GFX8-NEXT: s_min_i32 s3, s3, s7 ; GFX8-NEXT: s_sub_i32 s0, s0, s3 ; GFX8-NEXT: s_max_i32 s3, s1, -1 -; GFX8-NEXT: s_sub_i32 s3, s3, s6 -; GFX8-NEXT: s_min_i32 s8, s1, -1 -; GFX8-NEXT: s_sub_i32 s8, s8, s7 +; GFX8-NEXT: s_sub_i32 s3, s3, 0x7fffffff +; GFX8-NEXT: s_min_i32 s6, s1, -1 +; GFX8-NEXT: s_sub_i32 s6, s6, 0x80000000 ; GFX8-NEXT: s_max_i32 s3, s3, s4 -; GFX8-NEXT: s_min_i32 s3, s3, s8 +; GFX8-NEXT: s_min_i32 s3, s3, s6 ; GFX8-NEXT: s_sub_i32 s1, s1, s3 ; GFX8-NEXT: s_max_i32 s3, s2, -1 -; GFX8-NEXT: s_sub_i32 s3, s3, s6 +; GFX8-NEXT: s_sub_i32 s3, s3, 0x7fffffff ; GFX8-NEXT: s_min_i32 s4, s2, -1 -; GFX8-NEXT: s_sub_i32 s4, s4, s7 +; GFX8-NEXT: s_sub_i32 s4, s4, 0x80000000 ; GFX8-NEXT: s_max_i32 s3, s3, s5 ; GFX8-NEXT: s_min_i32 s3, s3, s4 ; GFX8-NEXT: s_sub_i32 s2, s2, s3 @@ -1568,33 +1540,31 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v4i32(<4 x i32> inreg %lhs, <4 x i32> inreg %rhs) { ; GFX6-LABEL: s_ssubsat_v4i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_brev_b32 s8, -2 -; GFX6-NEXT: s_max_i32 s10, s0, -1 -; GFX6-NEXT: s_brev_b32 s9, 1 -; GFX6-NEXT: s_sub_i32 s10, s10, s8 -; GFX6-NEXT: s_min_i32 s11, s0, -1 -; GFX6-NEXT: s_sub_i32 s11, s11, s9 -; GFX6-NEXT: s_max_i32 s4, s10, s4 -; GFX6-NEXT: s_min_i32 s4, s4, s11 +; GFX6-NEXT: s_max_i32 s8, s0, -1 +; GFX6-NEXT: s_sub_i32 s8, s8, 0x7fffffff +; GFX6-NEXT: s_min_i32 s9, s0, -1 +; GFX6-NEXT: s_sub_i32 s9, s9, 0x80000000 +; GFX6-NEXT: s_max_i32 s4, s8, s4 +; GFX6-NEXT: s_min_i32 s4, s4, s9 ; GFX6-NEXT: s_sub_i32 s0, s0, s4 ; GFX6-NEXT: s_max_i32 s4, s1, -1 -; GFX6-NEXT: s_sub_i32 s4, s4, s8 -; GFX6-NEXT: s_min_i32 s10, s1, -1 -; GFX6-NEXT: s_sub_i32 s10, s10, s9 +; GFX6-NEXT: s_sub_i32 s4, s4, 0x7fffffff +; GFX6-NEXT: s_min_i32 s8, s1, -1 +; GFX6-NEXT: s_sub_i32 s8, s8, 0x80000000 ; GFX6-NEXT: s_max_i32 s4, s4, s5 -; GFX6-NEXT: s_min_i32 s4, s4, s10 +; GFX6-NEXT: s_min_i32 s4, s4, s8 ; GFX6-NEXT: s_sub_i32 s1, s1, s4 ; GFX6-NEXT: s_max_i32 s4, 
s2, -1 -; GFX6-NEXT: s_sub_i32 s4, s4, s8 +; GFX6-NEXT: s_sub_i32 s4, s4, 0x7fffffff ; GFX6-NEXT: s_min_i32 s5, s2, -1 -; GFX6-NEXT: s_sub_i32 s5, s5, s9 +; GFX6-NEXT: s_sub_i32 s5, s5, 0x80000000 ; GFX6-NEXT: s_max_i32 s4, s4, s6 ; GFX6-NEXT: s_min_i32 s4, s4, s5 ; GFX6-NEXT: s_sub_i32 s2, s2, s4 ; GFX6-NEXT: s_max_i32 s4, s3, -1 -; GFX6-NEXT: s_sub_i32 s4, s4, s8 +; GFX6-NEXT: s_sub_i32 s4, s4, 0x7fffffff ; GFX6-NEXT: s_min_i32 s5, s3, -1 -; GFX6-NEXT: s_sub_i32 s5, s5, s9 +; GFX6-NEXT: s_sub_i32 s5, s5, 0x80000000 ; GFX6-NEXT: s_max_i32 s4, s4, s7 ; GFX6-NEXT: s_min_i32 s4, s4, s5 ; GFX6-NEXT: s_sub_i32 s3, s3, s4 @@ -1602,33 +1572,31 @@ ; ; GFX8-LABEL: s_ssubsat_v4i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_brev_b32 s8, -2 -; GFX8-NEXT: s_max_i32 s10, s0, -1 -; GFX8-NEXT: s_brev_b32 s9, 1 -; GFX8-NEXT: s_sub_i32 s10, s10, s8 -; GFX8-NEXT: s_min_i32 s11, s0, -1 -; GFX8-NEXT: s_sub_i32 s11, s11, s9 -; GFX8-NEXT: s_max_i32 s4, s10, s4 -; GFX8-NEXT: s_min_i32 s4, s4, s11 +; GFX8-NEXT: s_max_i32 s8, s0, -1 +; GFX8-NEXT: s_sub_i32 s8, s8, 0x7fffffff +; GFX8-NEXT: s_min_i32 s9, s0, -1 +; GFX8-NEXT: s_sub_i32 s9, s9, 0x80000000 +; GFX8-NEXT: s_max_i32 s4, s8, s4 +; GFX8-NEXT: s_min_i32 s4, s4, s9 ; GFX8-NEXT: s_sub_i32 s0, s0, s4 ; GFX8-NEXT: s_max_i32 s4, s1, -1 -; GFX8-NEXT: s_sub_i32 s4, s4, s8 -; GFX8-NEXT: s_min_i32 s10, s1, -1 -; GFX8-NEXT: s_sub_i32 s10, s10, s9 +; GFX8-NEXT: s_sub_i32 s4, s4, 0x7fffffff +; GFX8-NEXT: s_min_i32 s8, s1, -1 +; GFX8-NEXT: s_sub_i32 s8, s8, 0x80000000 ; GFX8-NEXT: s_max_i32 s4, s4, s5 -; GFX8-NEXT: s_min_i32 s4, s4, s10 +; GFX8-NEXT: s_min_i32 s4, s4, s8 ; GFX8-NEXT: s_sub_i32 s1, s1, s4 ; GFX8-NEXT: s_max_i32 s4, s2, -1 -; GFX8-NEXT: s_sub_i32 s4, s4, s8 +; GFX8-NEXT: s_sub_i32 s4, s4, 0x7fffffff ; GFX8-NEXT: s_min_i32 s5, s2, -1 -; GFX8-NEXT: s_sub_i32 s5, s5, s9 +; GFX8-NEXT: s_sub_i32 s5, s5, 0x80000000 ; GFX8-NEXT: s_max_i32 s4, s4, s6 ; GFX8-NEXT: s_min_i32 s4, s4, s5 ; GFX8-NEXT: s_sub_i32 s2, s2, s4 ; GFX8-NEXT: s_max_i32 s4, s3, 
-1 -; GFX8-NEXT: s_sub_i32 s4, s4, s8 +; GFX8-NEXT: s_sub_i32 s4, s4, 0x7fffffff ; GFX8-NEXT: s_min_i32 s5, s3, -1 -; GFX8-NEXT: s_sub_i32 s5, s5, s9 +; GFX8-NEXT: s_sub_i32 s5, s5, 0x80000000 ; GFX8-NEXT: s_max_i32 s4, s4, s7 ; GFX8-NEXT: s_min_i32 s4, s4, s5 ; GFX8-NEXT: s_sub_i32 s3, s3, s4 @@ -1694,17 +1662,16 @@ ; GFX6-NEXT: v_bfrev_b32_e32 v11, -2 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v5 ; GFX6-NEXT: v_max_i32_e32 v5, -1, v3 -; GFX6-NEXT: v_bfrev_b32_e32 v13, 1 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v11 ; GFX6-NEXT: v_min_i32_e32 v6, -1, v3 -; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v13 +; GFX6-NEXT: v_subrev_i32_e32 v6, vcc, 0x80000000, v6 ; GFX6-NEXT: v_max_i32_e32 v5, v5, v8 ; GFX6-NEXT: v_min_i32_e32 v5, v5, v6 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v5 ; GFX6-NEXT: v_max_i32_e32 v5, -1, v4 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v11 +; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, 0x7fffffff, v5 ; GFX6-NEXT: v_min_i32_e32 v6, -1, v4 -; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v13 +; GFX6-NEXT: v_subrev_i32_e32 v6, vcc, 0x80000000, v6 ; GFX6-NEXT: v_max_i32_e32 v5, v5, v9 ; GFX6-NEXT: v_min_i32_e32 v5, v5, v6 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v5 @@ -1738,17 +1705,16 @@ ; GFX8-NEXT: v_bfrev_b32_e32 v11, -2 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v5 ; GFX8-NEXT: v_max_i32_e32 v5, -1, v3 -; GFX8-NEXT: v_bfrev_b32_e32 v13, 1 ; GFX8-NEXT: v_sub_u32_e32 v5, vcc, v5, v11 ; GFX8-NEXT: v_min_i32_e32 v6, -1, v3 -; GFX8-NEXT: v_sub_u32_e32 v6, vcc, v6, v13 +; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, 0x80000000, v6 ; GFX8-NEXT: v_max_i32_e32 v5, v5, v8 ; GFX8-NEXT: v_min_i32_e32 v5, v5, v6 ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, v3, v5 ; GFX8-NEXT: v_max_i32_e32 v5, -1, v4 -; GFX8-NEXT: v_sub_u32_e32 v5, vcc, v5, v11 +; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, 0x7fffffff, v5 ; GFX8-NEXT: v_min_i32_e32 v6, -1, v4 -; GFX8-NEXT: v_sub_u32_e32 v6, vcc, v6, v13 +; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, 0x80000000, v6 ; GFX8-NEXT: v_max_i32_e32 v5, v5, v9 ; GFX8-NEXT: v_min_i32_e32 v5, v5, 
v6 ; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v4, v5 @@ -1781,40 +1747,38 @@ define amdgpu_ps <5 x i32> @s_ssubsat_v5i32(<5 x i32> inreg %lhs, <5 x i32> inreg %rhs) { ; GFX6-LABEL: s_ssubsat_v5i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_brev_b32 s10, -2 -; GFX6-NEXT: s_max_i32 s12, s0, -1 -; GFX6-NEXT: s_brev_b32 s11, 1 -; GFX6-NEXT: s_sub_i32 s12, s12, s10 -; GFX6-NEXT: s_min_i32 s13, s0, -1 -; GFX6-NEXT: s_sub_i32 s13, s13, s11 -; GFX6-NEXT: s_max_i32 s5, s12, s5 -; GFX6-NEXT: s_min_i32 s5, s5, s13 +; GFX6-NEXT: s_max_i32 s10, s0, -1 +; GFX6-NEXT: s_sub_i32 s10, s10, 0x7fffffff +; GFX6-NEXT: s_min_i32 s11, s0, -1 +; GFX6-NEXT: s_sub_i32 s11, s11, 0x80000000 +; GFX6-NEXT: s_max_i32 s5, s10, s5 +; GFX6-NEXT: s_min_i32 s5, s5, s11 ; GFX6-NEXT: s_sub_i32 s0, s0, s5 ; GFX6-NEXT: s_max_i32 s5, s1, -1 -; GFX6-NEXT: s_sub_i32 s5, s5, s10 -; GFX6-NEXT: s_min_i32 s12, s1, -1 -; GFX6-NEXT: s_sub_i32 s12, s12, s11 +; GFX6-NEXT: s_sub_i32 s5, s5, 0x7fffffff +; GFX6-NEXT: s_min_i32 s10, s1, -1 +; GFX6-NEXT: s_sub_i32 s10, s10, 0x80000000 ; GFX6-NEXT: s_max_i32 s5, s5, s6 -; GFX6-NEXT: s_min_i32 s5, s5, s12 +; GFX6-NEXT: s_min_i32 s5, s5, s10 ; GFX6-NEXT: s_sub_i32 s1, s1, s5 ; GFX6-NEXT: s_max_i32 s5, s2, -1 -; GFX6-NEXT: s_sub_i32 s5, s5, s10 +; GFX6-NEXT: s_sub_i32 s5, s5, 0x7fffffff ; GFX6-NEXT: s_min_i32 s6, s2, -1 -; GFX6-NEXT: s_sub_i32 s6, s6, s11 +; GFX6-NEXT: s_sub_i32 s6, s6, 0x80000000 ; GFX6-NEXT: s_max_i32 s5, s5, s7 ; GFX6-NEXT: s_min_i32 s5, s5, s6 ; GFX6-NEXT: s_sub_i32 s2, s2, s5 ; GFX6-NEXT: s_max_i32 s5, s3, -1 -; GFX6-NEXT: s_sub_i32 s5, s5, s10 +; GFX6-NEXT: s_sub_i32 s5, s5, 0x7fffffff ; GFX6-NEXT: s_min_i32 s6, s3, -1 -; GFX6-NEXT: s_sub_i32 s6, s6, s11 +; GFX6-NEXT: s_sub_i32 s6, s6, 0x80000000 ; GFX6-NEXT: s_max_i32 s5, s5, s8 ; GFX6-NEXT: s_min_i32 s5, s5, s6 ; GFX6-NEXT: s_sub_i32 s3, s3, s5 ; GFX6-NEXT: s_max_i32 s5, s4, -1 -; GFX6-NEXT: s_sub_i32 s5, s5, s10 +; GFX6-NEXT: s_sub_i32 s5, s5, 0x7fffffff ; GFX6-NEXT: s_min_i32 s6, s4, -1 -; GFX6-NEXT: s_sub_i32 s6, 
s6, s11 +; GFX6-NEXT: s_sub_i32 s6, s6, 0x80000000 ; GFX6-NEXT: s_max_i32 s5, s5, s9 ; GFX6-NEXT: s_min_i32 s5, s5, s6 ; GFX6-NEXT: s_sub_i32 s4, s4, s5 @@ -1822,40 +1786,38 @@ ; ; GFX8-LABEL: s_ssubsat_v5i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_brev_b32 s10, -2 -; GFX8-NEXT: s_max_i32 s12, s0, -1 -; GFX8-NEXT: s_brev_b32 s11, 1 -; GFX8-NEXT: s_sub_i32 s12, s12, s10 -; GFX8-NEXT: s_min_i32 s13, s0, -1 -; GFX8-NEXT: s_sub_i32 s13, s13, s11 -; GFX8-NEXT: s_max_i32 s5, s12, s5 -; GFX8-NEXT: s_min_i32 s5, s5, s13 +; GFX8-NEXT: s_max_i32 s10, s0, -1 +; GFX8-NEXT: s_sub_i32 s10, s10, 0x7fffffff +; GFX8-NEXT: s_min_i32 s11, s0, -1 +; GFX8-NEXT: s_sub_i32 s11, s11, 0x80000000 +; GFX8-NEXT: s_max_i32 s5, s10, s5 +; GFX8-NEXT: s_min_i32 s5, s5, s11 ; GFX8-NEXT: s_sub_i32 s0, s0, s5 ; GFX8-NEXT: s_max_i32 s5, s1, -1 -; GFX8-NEXT: s_sub_i32 s5, s5, s10 -; GFX8-NEXT: s_min_i32 s12, s1, -1 -; GFX8-NEXT: s_sub_i32 s12, s12, s11 +; GFX8-NEXT: s_sub_i32 s5, s5, 0x7fffffff +; GFX8-NEXT: s_min_i32 s10, s1, -1 +; GFX8-NEXT: s_sub_i32 s10, s10, 0x80000000 ; GFX8-NEXT: s_max_i32 s5, s5, s6 -; GFX8-NEXT: s_min_i32 s5, s5, s12 +; GFX8-NEXT: s_min_i32 s5, s5, s10 ; GFX8-NEXT: s_sub_i32 s1, s1, s5 ; GFX8-NEXT: s_max_i32 s5, s2, -1 -; GFX8-NEXT: s_sub_i32 s5, s5, s10 +; GFX8-NEXT: s_sub_i32 s5, s5, 0x7fffffff ; GFX8-NEXT: s_min_i32 s6, s2, -1 -; GFX8-NEXT: s_sub_i32 s6, s6, s11 +; GFX8-NEXT: s_sub_i32 s6, s6, 0x80000000 ; GFX8-NEXT: s_max_i32 s5, s5, s7 ; GFX8-NEXT: s_min_i32 s5, s5, s6 ; GFX8-NEXT: s_sub_i32 s2, s2, s5 ; GFX8-NEXT: s_max_i32 s5, s3, -1 -; GFX8-NEXT: s_sub_i32 s5, s5, s10 +; GFX8-NEXT: s_sub_i32 s5, s5, 0x7fffffff ; GFX8-NEXT: s_min_i32 s6, s3, -1 -; GFX8-NEXT: s_sub_i32 s6, s6, s11 +; GFX8-NEXT: s_sub_i32 s6, s6, 0x80000000 ; GFX8-NEXT: s_max_i32 s5, s5, s8 ; GFX8-NEXT: s_min_i32 s5, s5, s6 ; GFX8-NEXT: s_sub_i32 s3, s3, s5 ; GFX8-NEXT: s_max_i32 s5, s4, -1 -; GFX8-NEXT: s_sub_i32 s5, s5, s10 +; GFX8-NEXT: s_sub_i32 s5, s5, 0x7fffffff ; GFX8-NEXT: s_min_i32 s6, s4, -1 -; 
GFX8-NEXT: s_sub_i32 s6, s6, s11 +; GFX8-NEXT: s_sub_i32 s6, s6, 0x80000000 ; GFX8-NEXT: s_max_i32 s5, s5, s9 ; GFX8-NEXT: s_min_i32 s5, s5, s6 ; GFX8-NEXT: s_sub_i32 s4, s4, s5 @@ -2189,117 +2151,115 @@ define amdgpu_ps <16 x i32> @s_ssubsat_v16i32(<16 x i32> inreg %lhs, <16 x i32> inreg %rhs) { ; GFX6-LABEL: s_ssubsat_v16i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_brev_b32 s32, -2 -; GFX6-NEXT: s_max_i32 s34, s0, -1 -; GFX6-NEXT: s_brev_b32 s33, 1 -; GFX6-NEXT: s_sub_i32 s34, s34, s32 -; GFX6-NEXT: s_min_i32 s35, s0, -1 -; GFX6-NEXT: s_sub_i32 s35, s35, s33 -; GFX6-NEXT: s_max_i32 s16, s34, s16 -; GFX6-NEXT: s_min_i32 s16, s16, s35 +; GFX6-NEXT: s_max_i32 s32, s0, -1 +; GFX6-NEXT: s_sub_i32 s32, s32, 0x7fffffff +; GFX6-NEXT: s_min_i32 s33, s0, -1 +; GFX6-NEXT: s_sub_i32 s33, s33, 0x80000000 +; GFX6-NEXT: s_max_i32 s16, s32, s16 +; GFX6-NEXT: s_min_i32 s16, s16, s33 ; GFX6-NEXT: s_sub_i32 s0, s0, s16 ; GFX6-NEXT: s_max_i32 s16, s1, -1 -; GFX6-NEXT: s_sub_i32 s16, s16, s32 -; GFX6-NEXT: s_min_i32 s34, s1, -1 -; GFX6-NEXT: s_sub_i32 s34, s34, s33 +; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff +; GFX6-NEXT: s_min_i32 s32, s1, -1 +; GFX6-NEXT: s_sub_i32 s32, s32, 0x80000000 ; GFX6-NEXT: s_max_i32 s16, s16, s17 -; GFX6-NEXT: s_min_i32 s16, s16, s34 +; GFX6-NEXT: s_min_i32 s16, s16, s32 ; GFX6-NEXT: s_sub_i32 s1, s1, s16 ; GFX6-NEXT: s_max_i32 s16, s2, -1 -; GFX6-NEXT: s_sub_i32 s16, s16, s32 +; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff ; GFX6-NEXT: s_min_i32 s17, s2, -1 -; GFX6-NEXT: s_sub_i32 s17, s17, s33 +; GFX6-NEXT: s_sub_i32 s17, s17, 0x80000000 ; GFX6-NEXT: s_max_i32 s16, s16, s18 ; GFX6-NEXT: s_min_i32 s16, s16, s17 ; GFX6-NEXT: s_sub_i32 s2, s2, s16 ; GFX6-NEXT: s_max_i32 s16, s3, -1 -; GFX6-NEXT: s_sub_i32 s16, s16, s32 +; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff ; GFX6-NEXT: s_min_i32 s17, s3, -1 -; GFX6-NEXT: s_sub_i32 s17, s17, s33 +; GFX6-NEXT: s_sub_i32 s17, s17, 0x80000000 ; GFX6-NEXT: s_max_i32 s16, s16, s19 ; GFX6-NEXT: s_min_i32 s16, s16, s17 ; GFX6-NEXT: 
s_sub_i32 s3, s3, s16 ; GFX6-NEXT: s_max_i32 s16, s4, -1 -; GFX6-NEXT: s_sub_i32 s16, s16, s32 +; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff ; GFX6-NEXT: s_min_i32 s17, s4, -1 -; GFX6-NEXT: s_sub_i32 s17, s17, s33 +; GFX6-NEXT: s_sub_i32 s17, s17, 0x80000000 ; GFX6-NEXT: s_max_i32 s16, s16, s20 ; GFX6-NEXT: s_min_i32 s16, s16, s17 ; GFX6-NEXT: s_sub_i32 s4, s4, s16 ; GFX6-NEXT: s_max_i32 s16, s5, -1 -; GFX6-NEXT: s_sub_i32 s16, s16, s32 +; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff ; GFX6-NEXT: s_min_i32 s17, s5, -1 -; GFX6-NEXT: s_sub_i32 s17, s17, s33 +; GFX6-NEXT: s_sub_i32 s17, s17, 0x80000000 ; GFX6-NEXT: s_max_i32 s16, s16, s21 ; GFX6-NEXT: s_min_i32 s16, s16, s17 ; GFX6-NEXT: s_sub_i32 s5, s5, s16 ; GFX6-NEXT: s_max_i32 s16, s6, -1 -; GFX6-NEXT: s_sub_i32 s16, s16, s32 +; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff ; GFX6-NEXT: s_min_i32 s17, s6, -1 -; GFX6-NEXT: s_sub_i32 s17, s17, s33 +; GFX6-NEXT: s_sub_i32 s17, s17, 0x80000000 ; GFX6-NEXT: s_max_i32 s16, s16, s22 ; GFX6-NEXT: s_min_i32 s16, s16, s17 ; GFX6-NEXT: s_sub_i32 s6, s6, s16 ; GFX6-NEXT: s_max_i32 s16, s7, -1 -; GFX6-NEXT: s_sub_i32 s16, s16, s32 +; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff ; GFX6-NEXT: s_min_i32 s17, s7, -1 -; GFX6-NEXT: s_sub_i32 s17, s17, s33 +; GFX6-NEXT: s_sub_i32 s17, s17, 0x80000000 ; GFX6-NEXT: s_max_i32 s16, s16, s23 ; GFX6-NEXT: s_min_i32 s16, s16, s17 ; GFX6-NEXT: s_sub_i32 s7, s7, s16 ; GFX6-NEXT: s_max_i32 s16, s8, -1 -; GFX6-NEXT: s_sub_i32 s16, s16, s32 +; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff ; GFX6-NEXT: s_min_i32 s17, s8, -1 -; GFX6-NEXT: s_sub_i32 s17, s17, s33 +; GFX6-NEXT: s_sub_i32 s17, s17, 0x80000000 ; GFX6-NEXT: s_max_i32 s16, s16, s24 ; GFX6-NEXT: s_min_i32 s16, s16, s17 ; GFX6-NEXT: s_sub_i32 s8, s8, s16 ; GFX6-NEXT: s_max_i32 s16, s9, -1 -; GFX6-NEXT: s_sub_i32 s16, s16, s32 +; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff ; GFX6-NEXT: s_min_i32 s17, s9, -1 -; GFX6-NEXT: s_sub_i32 s17, s17, s33 +; GFX6-NEXT: s_sub_i32 s17, s17, 0x80000000 ; GFX6-NEXT: 
s_max_i32 s16, s16, s25 ; GFX6-NEXT: s_min_i32 s16, s16, s17 ; GFX6-NEXT: s_sub_i32 s9, s9, s16 ; GFX6-NEXT: s_max_i32 s16, s10, -1 -; GFX6-NEXT: s_sub_i32 s16, s16, s32 +; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff ; GFX6-NEXT: s_min_i32 s17, s10, -1 -; GFX6-NEXT: s_sub_i32 s17, s17, s33 +; GFX6-NEXT: s_sub_i32 s17, s17, 0x80000000 ; GFX6-NEXT: s_max_i32 s16, s16, s26 ; GFX6-NEXT: s_min_i32 s16, s16, s17 ; GFX6-NEXT: s_sub_i32 s10, s10, s16 ; GFX6-NEXT: s_max_i32 s16, s11, -1 -; GFX6-NEXT: s_sub_i32 s16, s16, s32 +; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff ; GFX6-NEXT: s_min_i32 s17, s11, -1 -; GFX6-NEXT: s_sub_i32 s17, s17, s33 +; GFX6-NEXT: s_sub_i32 s17, s17, 0x80000000 ; GFX6-NEXT: s_max_i32 s16, s16, s27 ; GFX6-NEXT: s_min_i32 s16, s16, s17 ; GFX6-NEXT: s_sub_i32 s11, s11, s16 ; GFX6-NEXT: s_max_i32 s16, s12, -1 -; GFX6-NEXT: s_sub_i32 s16, s16, s32 +; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff ; GFX6-NEXT: s_min_i32 s17, s12, -1 -; GFX6-NEXT: s_sub_i32 s17, s17, s33 +; GFX6-NEXT: s_sub_i32 s17, s17, 0x80000000 ; GFX6-NEXT: s_max_i32 s16, s16, s28 ; GFX6-NEXT: s_min_i32 s16, s16, s17 ; GFX6-NEXT: s_sub_i32 s12, s12, s16 ; GFX6-NEXT: s_max_i32 s16, s13, -1 -; GFX6-NEXT: s_sub_i32 s16, s16, s32 +; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff ; GFX6-NEXT: s_min_i32 s17, s13, -1 -; GFX6-NEXT: s_sub_i32 s17, s17, s33 +; GFX6-NEXT: s_sub_i32 s17, s17, 0x80000000 ; GFX6-NEXT: s_max_i32 s16, s16, s29 ; GFX6-NEXT: s_min_i32 s16, s16, s17 ; GFX6-NEXT: s_sub_i32 s13, s13, s16 ; GFX6-NEXT: s_max_i32 s16, s14, -1 -; GFX6-NEXT: s_sub_i32 s16, s16, s32 +; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff ; GFX6-NEXT: s_min_i32 s17, s14, -1 -; GFX6-NEXT: s_sub_i32 s17, s17, s33 +; GFX6-NEXT: s_sub_i32 s17, s17, 0x80000000 ; GFX6-NEXT: s_max_i32 s16, s16, s30 ; GFX6-NEXT: s_min_i32 s16, s16, s17 ; GFX6-NEXT: s_sub_i32 s14, s14, s16 ; GFX6-NEXT: s_max_i32 s16, s15, -1 -; GFX6-NEXT: s_sub_i32 s16, s16, s32 +; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff ; GFX6-NEXT: s_min_i32 s17, s15, -1 -; 
GFX6-NEXT: s_sub_i32 s17, s17, s33 +; GFX6-NEXT: s_sub_i32 s17, s17, 0x80000000 ; GFX6-NEXT: s_max_i32 s16, s16, s31 ; GFX6-NEXT: s_min_i32 s16, s16, s17 ; GFX6-NEXT: s_sub_i32 s15, s15, s16 @@ -2307,117 +2267,115 @@ ; ; GFX8-LABEL: s_ssubsat_v16i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_brev_b32 s32, -2 -; GFX8-NEXT: s_max_i32 s34, s0, -1 -; GFX8-NEXT: s_brev_b32 s33, 1 -; GFX8-NEXT: s_sub_i32 s34, s34, s32 -; GFX8-NEXT: s_min_i32 s35, s0, -1 -; GFX8-NEXT: s_sub_i32 s35, s35, s33 -; GFX8-NEXT: s_max_i32 s16, s34, s16 -; GFX8-NEXT: s_min_i32 s16, s16, s35 +; GFX8-NEXT: s_max_i32 s32, s0, -1 +; GFX8-NEXT: s_sub_i32 s32, s32, 0x7fffffff +; GFX8-NEXT: s_min_i32 s33, s0, -1 +; GFX8-NEXT: s_sub_i32 s33, s33, 0x80000000 +; GFX8-NEXT: s_max_i32 s16, s32, s16 +; GFX8-NEXT: s_min_i32 s16, s16, s33 ; GFX8-NEXT: s_sub_i32 s0, s0, s16 ; GFX8-NEXT: s_max_i32 s16, s1, -1 -; GFX8-NEXT: s_sub_i32 s16, s16, s32 -; GFX8-NEXT: s_min_i32 s34, s1, -1 -; GFX8-NEXT: s_sub_i32 s34, s34, s33 +; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fffffff +; GFX8-NEXT: s_min_i32 s32, s1, -1 +; GFX8-NEXT: s_sub_i32 s32, s32, 0x80000000 ; GFX8-NEXT: s_max_i32 s16, s16, s17 -; GFX8-NEXT: s_min_i32 s16, s16, s34 +; GFX8-NEXT: s_min_i32 s16, s16, s32 ; GFX8-NEXT: s_sub_i32 s1, s1, s16 ; GFX8-NEXT: s_max_i32 s16, s2, -1 -; GFX8-NEXT: s_sub_i32 s16, s16, s32 +; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fffffff ; GFX8-NEXT: s_min_i32 s17, s2, -1 -; GFX8-NEXT: s_sub_i32 s17, s17, s33 +; GFX8-NEXT: s_sub_i32 s17, s17, 0x80000000 ; GFX8-NEXT: s_max_i32 s16, s16, s18 ; GFX8-NEXT: s_min_i32 s16, s16, s17 ; GFX8-NEXT: s_sub_i32 s2, s2, s16 ; GFX8-NEXT: s_max_i32 s16, s3, -1 -; GFX8-NEXT: s_sub_i32 s16, s16, s32 +; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fffffff ; GFX8-NEXT: s_min_i32 s17, s3, -1 -; GFX8-NEXT: s_sub_i32 s17, s17, s33 +; GFX8-NEXT: s_sub_i32 s17, s17, 0x80000000 ; GFX8-NEXT: s_max_i32 s16, s16, s19 ; GFX8-NEXT: s_min_i32 s16, s16, s17 ; GFX8-NEXT: s_sub_i32 s3, s3, s16 ; GFX8-NEXT: s_max_i32 s16, s4, -1 -; GFX8-NEXT: s_sub_i32 
s16, s16, s32 +; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fffffff ; GFX8-NEXT: s_min_i32 s17, s4, -1 -; GFX8-NEXT: s_sub_i32 s17, s17, s33 +; GFX8-NEXT: s_sub_i32 s17, s17, 0x80000000 ; GFX8-NEXT: s_max_i32 s16, s16, s20 ; GFX8-NEXT: s_min_i32 s16, s16, s17 ; GFX8-NEXT: s_sub_i32 s4, s4, s16 ; GFX8-NEXT: s_max_i32 s16, s5, -1 -; GFX8-NEXT: s_sub_i32 s16, s16, s32 +; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fffffff ; GFX8-NEXT: s_min_i32 s17, s5, -1 -; GFX8-NEXT: s_sub_i32 s17, s17, s33 +; GFX8-NEXT: s_sub_i32 s17, s17, 0x80000000 ; GFX8-NEXT: s_max_i32 s16, s16, s21 ; GFX8-NEXT: s_min_i32 s16, s16, s17 ; GFX8-NEXT: s_sub_i32 s5, s5, s16 ; GFX8-NEXT: s_max_i32 s16, s6, -1 -; GFX8-NEXT: s_sub_i32 s16, s16, s32 +; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fffffff ; GFX8-NEXT: s_min_i32 s17, s6, -1 -; GFX8-NEXT: s_sub_i32 s17, s17, s33 +; GFX8-NEXT: s_sub_i32 s17, s17, 0x80000000 ; GFX8-NEXT: s_max_i32 s16, s16, s22 ; GFX8-NEXT: s_min_i32 s16, s16, s17 ; GFX8-NEXT: s_sub_i32 s6, s6, s16 ; GFX8-NEXT: s_max_i32 s16, s7, -1 -; GFX8-NEXT: s_sub_i32 s16, s16, s32 +; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fffffff ; GFX8-NEXT: s_min_i32 s17, s7, -1 -; GFX8-NEXT: s_sub_i32 s17, s17, s33 +; GFX8-NEXT: s_sub_i32 s17, s17, 0x80000000 ; GFX8-NEXT: s_max_i32 s16, s16, s23 ; GFX8-NEXT: s_min_i32 s16, s16, s17 ; GFX8-NEXT: s_sub_i32 s7, s7, s16 ; GFX8-NEXT: s_max_i32 s16, s8, -1 -; GFX8-NEXT: s_sub_i32 s16, s16, s32 +; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fffffff ; GFX8-NEXT: s_min_i32 s17, s8, -1 -; GFX8-NEXT: s_sub_i32 s17, s17, s33 +; GFX8-NEXT: s_sub_i32 s17, s17, 0x80000000 ; GFX8-NEXT: s_max_i32 s16, s16, s24 ; GFX8-NEXT: s_min_i32 s16, s16, s17 ; GFX8-NEXT: s_sub_i32 s8, s8, s16 ; GFX8-NEXT: s_max_i32 s16, s9, -1 -; GFX8-NEXT: s_sub_i32 s16, s16, s32 +; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fffffff ; GFX8-NEXT: s_min_i32 s17, s9, -1 -; GFX8-NEXT: s_sub_i32 s17, s17, s33 +; GFX8-NEXT: s_sub_i32 s17, s17, 0x80000000 ; GFX8-NEXT: s_max_i32 s16, s16, s25 ; GFX8-NEXT: s_min_i32 s16, s16, s17 ; GFX8-NEXT: s_sub_i32 
s9, s9, s16 ; GFX8-NEXT: s_max_i32 s16, s10, -1 -; GFX8-NEXT: s_sub_i32 s16, s16, s32 +; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fffffff ; GFX8-NEXT: s_min_i32 s17, s10, -1 -; GFX8-NEXT: s_sub_i32 s17, s17, s33 +; GFX8-NEXT: s_sub_i32 s17, s17, 0x80000000 ; GFX8-NEXT: s_max_i32 s16, s16, s26 ; GFX8-NEXT: s_min_i32 s16, s16, s17 ; GFX8-NEXT: s_sub_i32 s10, s10, s16 ; GFX8-NEXT: s_max_i32 s16, s11, -1 -; GFX8-NEXT: s_sub_i32 s16, s16, s32 +; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fffffff ; GFX8-NEXT: s_min_i32 s17, s11, -1 -; GFX8-NEXT: s_sub_i32 s17, s17, s33 +; GFX8-NEXT: s_sub_i32 s17, s17, 0x80000000 ; GFX8-NEXT: s_max_i32 s16, s16, s27 ; GFX8-NEXT: s_min_i32 s16, s16, s17 ; GFX8-NEXT: s_sub_i32 s11, s11, s16 ; GFX8-NEXT: s_max_i32 s16, s12, -1 -; GFX8-NEXT: s_sub_i32 s16, s16, s32 +; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fffffff ; GFX8-NEXT: s_min_i32 s17, s12, -1 -; GFX8-NEXT: s_sub_i32 s17, s17, s33 +; GFX8-NEXT: s_sub_i32 s17, s17, 0x80000000 ; GFX8-NEXT: s_max_i32 s16, s16, s28 ; GFX8-NEXT: s_min_i32 s16, s16, s17 ; GFX8-NEXT: s_sub_i32 s12, s12, s16 ; GFX8-NEXT: s_max_i32 s16, s13, -1 -; GFX8-NEXT: s_sub_i32 s16, s16, s32 +; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fffffff ; GFX8-NEXT: s_min_i32 s17, s13, -1 -; GFX8-NEXT: s_sub_i32 s17, s17, s33 +; GFX8-NEXT: s_sub_i32 s17, s17, 0x80000000 ; GFX8-NEXT: s_max_i32 s16, s16, s29 ; GFX8-NEXT: s_min_i32 s16, s16, s17 ; GFX8-NEXT: s_sub_i32 s13, s13, s16 ; GFX8-NEXT: s_max_i32 s16, s14, -1 -; GFX8-NEXT: s_sub_i32 s16, s16, s32 +; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fffffff ; GFX8-NEXT: s_min_i32 s17, s14, -1 -; GFX8-NEXT: s_sub_i32 s17, s17, s33 +; GFX8-NEXT: s_sub_i32 s17, s17, 0x80000000 ; GFX8-NEXT: s_max_i32 s16, s16, s30 ; GFX8-NEXT: s_min_i32 s16, s16, s17 ; GFX8-NEXT: s_sub_i32 s14, s14, s16 ; GFX8-NEXT: s_max_i32 s16, s15, -1 -; GFX8-NEXT: s_sub_i32 s16, s16, s32 +; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fffffff ; GFX8-NEXT: s_min_i32 s17, s15, -1 -; GFX8-NEXT: s_sub_i32 s17, s17, s33 +; GFX8-NEXT: s_sub_i32 s17, s17, 0x80000000 ; 
GFX8-NEXT: s_max_i32 s16, s16, s31 ; GFX8-NEXT: s_min_i32 s16, s16, s17 ; GFX8-NEXT: s_sub_i32 s15, s15, s16 @@ -2759,60 +2717,55 @@ ; GFX6-LABEL: s_ssubsat_v2i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_lshl_b32 s0, s0, 16 -; GFX6-NEXT: s_brev_b32 s4, -2 -; GFX6-NEXT: s_max_i32 s6, s0, -1 +; GFX6-NEXT: s_max_i32 s4, s0, -1 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_brev_b32 s5, 1 -; GFX6-NEXT: s_sub_i32 s6, s6, s4 -; GFX6-NEXT: s_min_i32 s7, s0, -1 -; GFX6-NEXT: s_sub_i32 s7, s7, s5 -; GFX6-NEXT: s_max_i32 s2, s6, s2 -; GFX6-NEXT: s_min_i32 s2, s2, s7 +; GFX6-NEXT: s_sub_i32 s4, s4, 0x7fffffff +; GFX6-NEXT: s_min_i32 s5, s0, -1 +; GFX6-NEXT: s_sub_i32 s5, s5, 0x80000000 +; GFX6-NEXT: s_max_i32 s2, s4, s2 +; GFX6-NEXT: s_min_i32 s2, s2, s5 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_sub_i32 s0, s0, s2 ; GFX6-NEXT: s_lshl_b32 s2, s3, 16 ; GFX6-NEXT: s_max_i32 s3, s1, -1 -; GFX6-NEXT: s_sub_i32 s3, s3, s4 +; GFX6-NEXT: s_sub_i32 s3, s3, 0x7fffffff ; GFX6-NEXT: s_min_i32 s4, s1, -1 -; GFX6-NEXT: s_sub_i32 s4, s4, s5 +; GFX6-NEXT: s_sub_i32 s4, s4, 0x80000000 ; GFX6-NEXT: s_max_i32 s2, s3, s2 ; GFX6-NEXT: s_min_i32 s2, s2, s4 ; GFX6-NEXT: s_sub_i32 s1, s1, s2 ; GFX6-NEXT: s_ashr_i32 s1, s1, 16 -; GFX6-NEXT: s_mov_b32 s2, 0xffff ; GFX6-NEXT: s_ashr_i32 s0, s0, 16 -; GFX6-NEXT: s_and_b32 s1, s1, s2 -; GFX6-NEXT: s_and_b32 s0, s0, s2 +; GFX6-NEXT: s_and_b32 s1, s1, 0xffff +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_ssubsat_v2i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_sext_i32_i16 s6, s0 -; GFX8-NEXT: s_sext_i32_i16 s7, -1 -; GFX8-NEXT: s_movk_i32 s4, 0x7fff -; GFX8-NEXT: s_max_i32 s8, s6, s7 -; GFX8-NEXT: s_sub_i32 s8, s8, s4 +; GFX8-NEXT: s_sext_i32_i16 s4, s0 +; GFX8-NEXT: s_sext_i32_i16 s5, -1 +; GFX8-NEXT: s_max_i32 s6, s4, s5 +; GFX8-NEXT: s_sub_i32 s6, s6, 0x7fff ; GFX8-NEXT: s_lshr_b32 s3, s1, 16 -; GFX8-NEXT: s_movk_i32 s5, 0x8000 -; 
GFX8-NEXT: s_min_i32 s6, s6, s7 -; GFX8-NEXT: s_sext_i32_i16 s8, s8 +; GFX8-NEXT: s_min_i32 s4, s4, s5 +; GFX8-NEXT: s_sext_i32_i16 s6, s6 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_sub_i32 s6, s6, s5 -; GFX8-NEXT: s_max_i32 s1, s8, s1 +; GFX8-NEXT: s_sub_i32 s4, s4, 0xffff8000 +; GFX8-NEXT: s_max_i32 s1, s6, s1 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_sext_i32_i16 s6, s6 +; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 -; GFX8-NEXT: s_min_i32 s1, s1, s6 +; GFX8-NEXT: s_min_i32 s1, s1, s4 ; GFX8-NEXT: s_sub_i32 s0, s0, s1 ; GFX8-NEXT: s_sext_i32_i16 s1, s2 -; GFX8-NEXT: s_max_i32 s6, s1, s7 -; GFX8-NEXT: s_sub_i32 s4, s6, s4 -; GFX8-NEXT: s_min_i32 s1, s1, s7 +; GFX8-NEXT: s_max_i32 s4, s1, s5 +; GFX8-NEXT: s_sub_i32 s4, s4, 0x7fff +; GFX8-NEXT: s_min_i32 s1, s1, s5 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 -; GFX8-NEXT: s_sub_i32 s1, s1, s5 +; GFX8-NEXT: s_sub_i32 s1, s1, 0xffff8000 ; GFX8-NEXT: s_max_i32 s3, s4, s3 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 @@ -2845,22 +2798,20 @@ ; GFX6-LABEL: ssubsat_v2i16_sv: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_lshl_b32 s0, s0, 16 -; GFX6-NEXT: s_brev_b32 s2, -2 -; GFX6-NEXT: s_max_i32 s4, s0, -1 +; GFX6-NEXT: s_max_i32 s2, s0, -1 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: s_brev_b32 s3, 1 -; GFX6-NEXT: s_sub_i32 s4, s4, s2 -; GFX6-NEXT: s_min_i32 s5, s0, -1 -; GFX6-NEXT: s_sub_i32 s5, s5, s3 -; GFX6-NEXT: v_max_i32_e32 v0, s4, v0 -; GFX6-NEXT: v_min_i32_e32 v0, s5, v0 +; GFX6-NEXT: s_sub_i32 s2, s2, 0x7fffffff +; GFX6-NEXT: s_min_i32 s3, s0, -1 +; GFX6-NEXT: s_sub_i32 s3, s3, 0x80000000 +; GFX6-NEXT: v_max_i32_e32 v0, s2, v0 +; GFX6-NEXT: v_min_i32_e32 v0, s3, v0 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 ; GFX6-NEXT: s_lshl_b32 s0, s1, 16 ; GFX6-NEXT: s_max_i32 s1, s0, -1 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: s_sub_i32 s1, s1, s2 +; GFX6-NEXT: s_sub_i32 s1, s1, 0x7fffffff ; GFX6-NEXT: s_min_i32 s2, s0, -1 
-; GFX6-NEXT: s_sub_i32 s2, s2, s3 +; GFX6-NEXT: s_sub_i32 s2, s2, 0x80000000 ; GFX6-NEXT: v_max_i32_e32 v1, s1, v1 ; GFX6-NEXT: v_min_i32_e32 v1, s2, v1 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s0, v1 @@ -2875,25 +2826,23 @@ ; ; GFX8-LABEL: ssubsat_v2i16_sv: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_sext_i32_i16 s4, s0 -; GFX8-NEXT: s_sext_i32_i16 s5, -1 -; GFX8-NEXT: s_movk_i32 s2, 0x7fff -; GFX8-NEXT: s_max_i32 s6, s4, s5 -; GFX8-NEXT: s_movk_i32 s3, 0x8000 -; GFX8-NEXT: s_sub_i32 s6, s6, s2 -; GFX8-NEXT: s_min_i32 s4, s4, s5 +; GFX8-NEXT: s_sext_i32_i16 s2, s0 +; GFX8-NEXT: s_sext_i32_i16 s3, -1 +; GFX8-NEXT: s_max_i32 s4, s2, s3 +; GFX8-NEXT: s_sub_i32 s4, s4, 0x7fff +; GFX8-NEXT: s_min_i32 s2, s2, s3 ; GFX8-NEXT: s_lshr_b32 s1, s0, 16 -; GFX8-NEXT: s_sub_i32 s4, s4, s3 -; GFX8-NEXT: v_max_i16_e32 v1, s6, v0 -; GFX8-NEXT: v_min_i16_e32 v1, s4, v1 -; GFX8-NEXT: s_sext_i32_i16 s4, s1 -; GFX8-NEXT: s_max_i32 s6, s4, s5 -; GFX8-NEXT: s_sub_i32 s2, s6, s2 -; GFX8-NEXT: s_min_i32 s4, s4, s5 -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: s_sub_i32 s3, s4, s3 +; GFX8-NEXT: s_sub_i32 s2, s2, 0xffff8000 +; GFX8-NEXT: v_max_i16_e32 v1, s4, v0 +; GFX8-NEXT: v_min_i16_e32 v1, s2, v1 +; GFX8-NEXT: s_sext_i32_i16 s2, s1 +; GFX8-NEXT: s_max_i32 s4, s2, s3 +; GFX8-NEXT: s_sub_i32 s4, s4, 0x7fff +; GFX8-NEXT: s_min_i32 s2, s2, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: s_sub_i32 s2, s2, 0xffff8000 ; GFX8-NEXT: v_max_i16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_min_i16_e32 v0, s3, v0 +; GFX8-NEXT: v_min_i16_e32 v0, s2, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, s1 ; GFX8-NEXT: v_sub_u16_e32 v1, s0, v1 ; GFX8-NEXT: v_sub_u16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -3115,97 +3064,92 @@ ; GFX6-LABEL: s_ssubsat_v4i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_lshl_b32 s0, s0, 16 -; GFX6-NEXT: s_brev_b32 s8, -2 -; GFX6-NEXT: s_max_i32 s10, s0, -1 +; GFX6-NEXT: s_max_i32 s8, s0, -1 ; GFX6-NEXT: 
s_lshl_b32 s4, s4, 16 -; GFX6-NEXT: s_brev_b32 s9, 1 -; GFX6-NEXT: s_sub_i32 s10, s10, s8 -; GFX6-NEXT: s_min_i32 s11, s0, -1 -; GFX6-NEXT: s_sub_i32 s11, s11, s9 -; GFX6-NEXT: s_max_i32 s4, s10, s4 -; GFX6-NEXT: s_min_i32 s4, s4, s11 +; GFX6-NEXT: s_sub_i32 s8, s8, 0x7fffffff +; GFX6-NEXT: s_min_i32 s9, s0, -1 +; GFX6-NEXT: s_sub_i32 s9, s9, 0x80000000 +; GFX6-NEXT: s_max_i32 s4, s8, s4 +; GFX6-NEXT: s_min_i32 s4, s4, s9 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_sub_i32 s0, s0, s4 ; GFX6-NEXT: s_lshl_b32 s4, s5, 16 ; GFX6-NEXT: s_max_i32 s5, s1, -1 -; GFX6-NEXT: s_sub_i32 s5, s5, s8 -; GFX6-NEXT: s_min_i32 s10, s1, -1 -; GFX6-NEXT: s_sub_i32 s10, s10, s9 +; GFX6-NEXT: s_sub_i32 s5, s5, 0x7fffffff +; GFX6-NEXT: s_min_i32 s8, s1, -1 +; GFX6-NEXT: s_sub_i32 s8, s8, 0x80000000 ; GFX6-NEXT: s_max_i32 s4, s5, s4 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_min_i32 s4, s4, s10 +; GFX6-NEXT: s_min_i32 s4, s4, s8 ; GFX6-NEXT: s_max_i32 s5, s2, -1 ; GFX6-NEXT: s_sub_i32 s1, s1, s4 ; GFX6-NEXT: s_lshl_b32 s4, s6, 16 -; GFX6-NEXT: s_sub_i32 s5, s5, s8 +; GFX6-NEXT: s_sub_i32 s5, s5, 0x7fffffff ; GFX6-NEXT: s_min_i32 s6, s2, -1 -; GFX6-NEXT: s_sub_i32 s6, s6, s9 +; GFX6-NEXT: s_sub_i32 s6, s6, 0x80000000 ; GFX6-NEXT: s_max_i32 s4, s5, s4 ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 ; GFX6-NEXT: s_min_i32 s4, s4, s6 ; GFX6-NEXT: s_max_i32 s5, s3, -1 ; GFX6-NEXT: s_sub_i32 s2, s2, s4 ; GFX6-NEXT: s_lshl_b32 s4, s7, 16 -; GFX6-NEXT: s_sub_i32 s5, s5, s8 +; GFX6-NEXT: s_sub_i32 s5, s5, 0x7fffffff ; GFX6-NEXT: s_min_i32 s6, s3, -1 -; GFX6-NEXT: s_sub_i32 s6, s6, s9 +; GFX6-NEXT: s_sub_i32 s6, s6, 0x80000000 ; GFX6-NEXT: s_max_i32 s4, s5, s4 -; GFX6-NEXT: s_min_i32 s4, s4, s6 ; GFX6-NEXT: s_ashr_i32 s1, s1, 16 -; GFX6-NEXT: s_sub_i32 s3, s3, s4 -; GFX6-NEXT: s_mov_b32 s4, 0xffff +; GFX6-NEXT: s_min_i32 s4, s4, s6 ; GFX6-NEXT: s_ashr_i32 s0, s0, 16 -; GFX6-NEXT: s_and_b32 s1, s1, s4 +; GFX6-NEXT: s_sub_i32 s3, s3, s4 +; GFX6-NEXT: s_and_b32 s1, s1, 0xffff ; GFX6-NEXT: s_ashr_i32 
s2, s2, 16 ; GFX6-NEXT: s_ashr_i32 s3, s3, 16 -; GFX6-NEXT: s_and_b32 s0, s0, s4 +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s2, s4 -; GFX6-NEXT: s_and_b32 s2, s3, s4 +; GFX6-NEXT: s_and_b32 s1, s2, 0xffff +; GFX6-NEXT: s_and_b32 s2, s3, 0xffff ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 ; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_ssubsat_v4i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_sext_i32_i16 s10, s0 -; GFX8-NEXT: s_sext_i32_i16 s11, -1 -; GFX8-NEXT: s_movk_i32 s8, 0x7fff -; GFX8-NEXT: s_max_i32 s12, s10, s11 -; GFX8-NEXT: s_sub_i32 s12, s12, s8 +; GFX8-NEXT: s_sext_i32_i16 s8, s0 +; GFX8-NEXT: s_sext_i32_i16 s9, -1 +; GFX8-NEXT: s_max_i32 s10, s8, s9 +; GFX8-NEXT: s_sub_i32 s10, s10, 0x7fff ; GFX8-NEXT: s_lshr_b32 s6, s2, 16 -; GFX8-NEXT: s_movk_i32 s9, 0x8000 -; GFX8-NEXT: s_min_i32 s10, s10, s11 -; GFX8-NEXT: s_sext_i32_i16 s12, s12 +; GFX8-NEXT: s_min_i32 s8, s8, s9 +; GFX8-NEXT: s_sext_i32_i16 s10, s10 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 -; GFX8-NEXT: s_sub_i32 s10, s10, s9 -; GFX8-NEXT: s_max_i32 s2, s12, s2 +; GFX8-NEXT: s_sub_i32 s8, s8, 0xffff8000 +; GFX8-NEXT: s_max_i32 s2, s10, s2 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 -; GFX8-NEXT: s_sext_i32_i16 s10, s10 +; GFX8-NEXT: s_sext_i32_i16 s8, s8 ; GFX8-NEXT: s_lshr_b32 s4, s0, 16 -; GFX8-NEXT: s_min_i32 s2, s2, s10 +; GFX8-NEXT: s_min_i32 s2, s2, s8 ; GFX8-NEXT: s_sub_i32 s0, s0, s2 ; GFX8-NEXT: s_sext_i32_i16 s2, s4 -; GFX8-NEXT: s_max_i32 s10, s2, s11 -; GFX8-NEXT: s_sub_i32 s10, s10, s8 -; GFX8-NEXT: s_min_i32 s2, s2, s11 -; GFX8-NEXT: s_sext_i32_i16 s10, s10 +; GFX8-NEXT: s_max_i32 s8, s2, s9 +; GFX8-NEXT: s_sub_i32 s8, s8, 0x7fff +; GFX8-NEXT: s_min_i32 s2, s2, s9 +; GFX8-NEXT: s_sext_i32_i16 s8, s8 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 -; GFX8-NEXT: s_sub_i32 s2, s2, s9 -; GFX8-NEXT: s_max_i32 s6, s10, s6 +; GFX8-NEXT: s_sub_i32 s2, s2, 0xffff8000 +; GFX8-NEXT: s_max_i32 s6, s8, s6 
; GFX8-NEXT: s_sext_i32_i16 s6, s6 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 ; GFX8-NEXT: s_min_i32 s2, s6, s2 ; GFX8-NEXT: s_sub_i32 s2, s4, s2 ; GFX8-NEXT: s_sext_i32_i16 s4, s1 -; GFX8-NEXT: s_max_i32 s6, s4, s11 -; GFX8-NEXT: s_sub_i32 s6, s6, s8 +; GFX8-NEXT: s_max_i32 s6, s4, s9 +; GFX8-NEXT: s_sub_i32 s6, s6, 0x7fff ; GFX8-NEXT: s_lshr_b32 s7, s3, 16 -; GFX8-NEXT: s_min_i32 s4, s4, s11 +; GFX8-NEXT: s_min_i32 s4, s4, s9 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 -; GFX8-NEXT: s_sub_i32 s4, s4, s9 +; GFX8-NEXT: s_sub_i32 s4, s4, 0xffff8000 ; GFX8-NEXT: s_max_i32 s3, s6, s3 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 @@ -3213,12 +3157,12 @@ ; GFX8-NEXT: s_min_i32 s3, s3, s4 ; GFX8-NEXT: s_sub_i32 s1, s1, s3 ; GFX8-NEXT: s_sext_i32_i16 s3, s5 -; GFX8-NEXT: s_max_i32 s4, s3, s11 -; GFX8-NEXT: s_sub_i32 s4, s4, s8 -; GFX8-NEXT: s_min_i32 s3, s3, s11 +; GFX8-NEXT: s_max_i32 s4, s3, s9 +; GFX8-NEXT: s_sub_i32 s4, s4, 0x7fff +; GFX8-NEXT: s_min_i32 s3, s3, s9 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s6, s7 -; GFX8-NEXT: s_sub_i32 s3, s3, s9 +; GFX8-NEXT: s_sub_i32 s3, s3, 0xffff8000 ; GFX8-NEXT: s_max_i32 s4, s4, s6 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 @@ -3432,121 +3376,116 @@ ; GFX6-LABEL: s_ssubsat_v6i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_lshl_b32 s0, s0, 16 -; GFX6-NEXT: s_brev_b32 s12, -2 -; GFX6-NEXT: s_max_i32 s14, s0, -1 +; GFX6-NEXT: s_max_i32 s12, s0, -1 ; GFX6-NEXT: s_lshl_b32 s6, s6, 16 -; GFX6-NEXT: s_brev_b32 s13, 1 -; GFX6-NEXT: s_sub_i32 s14, s14, s12 -; GFX6-NEXT: s_min_i32 s15, s0, -1 -; GFX6-NEXT: s_sub_i32 s15, s15, s13 -; GFX6-NEXT: s_max_i32 s6, s14, s6 -; GFX6-NEXT: s_min_i32 s6, s6, s15 +; GFX6-NEXT: s_sub_i32 s12, s12, 0x7fffffff +; GFX6-NEXT: s_min_i32 s13, s0, -1 +; GFX6-NEXT: s_sub_i32 s13, s13, 0x80000000 +; GFX6-NEXT: s_max_i32 s6, s12, s6 +; GFX6-NEXT: s_min_i32 s6, s6, s13 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_sub_i32 s0, s0, 
s6 ; GFX6-NEXT: s_lshl_b32 s6, s7, 16 ; GFX6-NEXT: s_max_i32 s7, s1, -1 -; GFX6-NEXT: s_sub_i32 s7, s7, s12 -; GFX6-NEXT: s_min_i32 s14, s1, -1 -; GFX6-NEXT: s_sub_i32 s14, s14, s13 +; GFX6-NEXT: s_sub_i32 s7, s7, 0x7fffffff +; GFX6-NEXT: s_min_i32 s12, s1, -1 +; GFX6-NEXT: s_sub_i32 s12, s12, 0x80000000 ; GFX6-NEXT: s_max_i32 s6, s7, s6 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_min_i32 s6, s6, s14 +; GFX6-NEXT: s_min_i32 s6, s6, s12 ; GFX6-NEXT: s_max_i32 s7, s2, -1 ; GFX6-NEXT: s_sub_i32 s1, s1, s6 ; GFX6-NEXT: s_lshl_b32 s6, s8, 16 -; GFX6-NEXT: s_sub_i32 s7, s7, s12 +; GFX6-NEXT: s_sub_i32 s7, s7, 0x7fffffff ; GFX6-NEXT: s_min_i32 s8, s2, -1 -; GFX6-NEXT: s_sub_i32 s8, s8, s13 +; GFX6-NEXT: s_sub_i32 s8, s8, 0x80000000 ; GFX6-NEXT: s_max_i32 s6, s7, s6 ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 ; GFX6-NEXT: s_min_i32 s6, s6, s8 ; GFX6-NEXT: s_max_i32 s7, s3, -1 ; GFX6-NEXT: s_sub_i32 s2, s2, s6 ; GFX6-NEXT: s_lshl_b32 s6, s9, 16 -; GFX6-NEXT: s_sub_i32 s7, s7, s12 +; GFX6-NEXT: s_sub_i32 s7, s7, 0x7fffffff ; GFX6-NEXT: s_min_i32 s8, s3, -1 -; GFX6-NEXT: s_sub_i32 s8, s8, s13 +; GFX6-NEXT: s_sub_i32 s8, s8, 0x80000000 ; GFX6-NEXT: s_max_i32 s6, s7, s6 ; GFX6-NEXT: s_lshl_b32 s4, s4, 16 ; GFX6-NEXT: s_min_i32 s6, s6, s8 ; GFX6-NEXT: s_max_i32 s7, s4, -1 ; GFX6-NEXT: s_sub_i32 s3, s3, s6 ; GFX6-NEXT: s_lshl_b32 s6, s10, 16 -; GFX6-NEXT: s_sub_i32 s7, s7, s12 +; GFX6-NEXT: s_sub_i32 s7, s7, 0x7fffffff ; GFX6-NEXT: s_min_i32 s8, s4, -1 -; GFX6-NEXT: s_sub_i32 s8, s8, s13 +; GFX6-NEXT: s_sub_i32 s8, s8, 0x80000000 ; GFX6-NEXT: s_max_i32 s6, s7, s6 ; GFX6-NEXT: s_lshl_b32 s5, s5, 16 ; GFX6-NEXT: s_min_i32 s6, s6, s8 ; GFX6-NEXT: s_max_i32 s7, s5, -1 ; GFX6-NEXT: s_sub_i32 s4, s4, s6 ; GFX6-NEXT: s_lshl_b32 s6, s11, 16 -; GFX6-NEXT: s_sub_i32 s7, s7, s12 +; GFX6-NEXT: s_sub_i32 s7, s7, 0x7fffffff ; GFX6-NEXT: s_min_i32 s8, s5, -1 -; GFX6-NEXT: s_sub_i32 s8, s8, s13 -; GFX6-NEXT: s_max_i32 s6, s7, s6 -; GFX6-NEXT: s_min_i32 s6, s6, s8 ; GFX6-NEXT: s_ashr_i32 s1, s1, 16 
-; GFX6-NEXT: s_sub_i32 s5, s5, s6 -; GFX6-NEXT: s_mov_b32 s6, 0xffff +; GFX6-NEXT: s_sub_i32 s8, s8, 0x80000000 +; GFX6-NEXT: s_max_i32 s6, s7, s6 ; GFX6-NEXT: s_ashr_i32 s0, s0, 16 -; GFX6-NEXT: s_and_b32 s1, s1, s6 +; GFX6-NEXT: s_min_i32 s6, s6, s8 +; GFX6-NEXT: s_and_b32 s1, s1, 0xffff ; GFX6-NEXT: s_ashr_i32 s2, s2, 16 ; GFX6-NEXT: s_ashr_i32 s3, s3, 16 -; GFX6-NEXT: s_and_b32 s0, s0, s6 +; GFX6-NEXT: s_sub_i32 s5, s5, s6 +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_ashr_i32 s5, s5, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s2, s6 -; GFX6-NEXT: s_and_b32 s2, s3, s6 +; GFX6-NEXT: s_and_b32 s1, s2, 0xffff +; GFX6-NEXT: s_and_b32 s2, s3, 0xffff ; GFX6-NEXT: s_ashr_i32 s4, s4, 16 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_and_b32 s3, s5, s6 +; GFX6-NEXT: s_and_b32 s3, s5, 0xffff ; GFX6-NEXT: s_or_b32 s1, s1, s2 -; GFX6-NEXT: s_and_b32 s2, s4, s6 +; GFX6-NEXT: s_and_b32 s2, s4, 0xffff ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 ; GFX6-NEXT: s_or_b32 s2, s2, s3 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_ssubsat_v6i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_sext_i32_i16 s14, s0 -; GFX8-NEXT: s_sext_i32_i16 s15, -1 -; GFX8-NEXT: s_movk_i32 s12, 0x7fff -; GFX8-NEXT: s_max_i32 s16, s14, s15 -; GFX8-NEXT: s_sub_i32 s16, s16, s12 +; GFX8-NEXT: s_sext_i32_i16 s12, s0 +; GFX8-NEXT: s_sext_i32_i16 s13, -1 +; GFX8-NEXT: s_max_i32 s14, s12, s13 +; GFX8-NEXT: s_sub_i32 s14, s14, 0x7fff ; GFX8-NEXT: s_lshr_b32 s9, s3, 16 -; GFX8-NEXT: s_movk_i32 s13, 0x8000 -; GFX8-NEXT: s_min_i32 s14, s14, s15 -; GFX8-NEXT: s_sext_i32_i16 s16, s16 +; GFX8-NEXT: s_min_i32 s12, s12, s13 +; GFX8-NEXT: s_sext_i32_i16 s14, s14 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 -; GFX8-NEXT: s_sub_i32 s14, s14, s13 -; GFX8-NEXT: s_max_i32 s3, s16, s3 +; GFX8-NEXT: s_sub_i32 s12, s12, 0xffff8000 +; GFX8-NEXT: s_max_i32 s3, s14, s3 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 -; GFX8-NEXT: s_sext_i32_i16 s14, s14 +; GFX8-NEXT: s_sext_i32_i16 s12, s12 
; GFX8-NEXT: s_lshr_b32 s6, s0, 16 -; GFX8-NEXT: s_min_i32 s3, s3, s14 +; GFX8-NEXT: s_min_i32 s3, s3, s12 ; GFX8-NEXT: s_sub_i32 s0, s0, s3 ; GFX8-NEXT: s_sext_i32_i16 s3, s6 -; GFX8-NEXT: s_max_i32 s14, s3, s15 -; GFX8-NEXT: s_sub_i32 s14, s14, s12 -; GFX8-NEXT: s_min_i32 s3, s3, s15 -; GFX8-NEXT: s_sext_i32_i16 s14, s14 +; GFX8-NEXT: s_max_i32 s12, s3, s13 +; GFX8-NEXT: s_sub_i32 s12, s12, 0x7fff +; GFX8-NEXT: s_min_i32 s3, s3, s13 +; GFX8-NEXT: s_sext_i32_i16 s12, s12 ; GFX8-NEXT: s_sext_i32_i16 s9, s9 -; GFX8-NEXT: s_sub_i32 s3, s3, s13 -; GFX8-NEXT: s_max_i32 s9, s14, s9 +; GFX8-NEXT: s_sub_i32 s3, s3, 0xffff8000 +; GFX8-NEXT: s_max_i32 s9, s12, s9 ; GFX8-NEXT: s_sext_i32_i16 s9, s9 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_min_i32 s3, s9, s3 ; GFX8-NEXT: s_sub_i32 s3, s6, s3 ; GFX8-NEXT: s_sext_i32_i16 s6, s1 -; GFX8-NEXT: s_max_i32 s9, s6, s15 -; GFX8-NEXT: s_sub_i32 s9, s9, s12 +; GFX8-NEXT: s_max_i32 s9, s6, s13 +; GFX8-NEXT: s_sub_i32 s9, s9, 0x7fff ; GFX8-NEXT: s_lshr_b32 s10, s4, 16 -; GFX8-NEXT: s_min_i32 s6, s6, s15 +; GFX8-NEXT: s_min_i32 s6, s6, s13 ; GFX8-NEXT: s_sext_i32_i16 s9, s9 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 -; GFX8-NEXT: s_sub_i32 s6, s6, s13 +; GFX8-NEXT: s_sub_i32 s6, s6, 0xffff8000 ; GFX8-NEXT: s_max_i32 s4, s9, s4 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 @@ -3554,25 +3493,25 @@ ; GFX8-NEXT: s_min_i32 s4, s4, s6 ; GFX8-NEXT: s_sub_i32 s1, s1, s4 ; GFX8-NEXT: s_sext_i32_i16 s4, s7 -; GFX8-NEXT: s_max_i32 s6, s4, s15 -; GFX8-NEXT: s_sub_i32 s6, s6, s12 -; GFX8-NEXT: s_min_i32 s4, s4, s15 +; GFX8-NEXT: s_max_i32 s6, s4, s13 +; GFX8-NEXT: s_sub_i32 s6, s6, 0x7fff +; GFX8-NEXT: s_min_i32 s4, s4, s13 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 ; GFX8-NEXT: s_sext_i32_i16 s9, s10 -; GFX8-NEXT: s_sub_i32 s4, s4, s13 +; GFX8-NEXT: s_sub_i32 s4, s4, 0xffff8000 ; GFX8-NEXT: s_max_i32 s6, s6, s9 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_min_i32 s4, s6, s4 ; GFX8-NEXT: 
s_sext_i32_i16 s6, s2 ; GFX8-NEXT: s_sub_i32 s4, s7, s4 -; GFX8-NEXT: s_max_i32 s7, s6, s15 -; GFX8-NEXT: s_sub_i32 s7, s7, s12 +; GFX8-NEXT: s_max_i32 s7, s6, s13 +; GFX8-NEXT: s_sub_i32 s7, s7, 0x7fff ; GFX8-NEXT: s_lshr_b32 s11, s5, 16 -; GFX8-NEXT: s_min_i32 s6, s6, s15 +; GFX8-NEXT: s_min_i32 s6, s6, s13 ; GFX8-NEXT: s_sext_i32_i16 s7, s7 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 -; GFX8-NEXT: s_sub_i32 s6, s6, s13 +; GFX8-NEXT: s_sub_i32 s6, s6, 0xffff8000 ; GFX8-NEXT: s_max_i32 s5, s7, s5 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 @@ -3580,12 +3519,12 @@ ; GFX8-NEXT: s_min_i32 s5, s5, s6 ; GFX8-NEXT: s_sub_i32 s2, s2, s5 ; GFX8-NEXT: s_sext_i32_i16 s5, s8 -; GFX8-NEXT: s_max_i32 s6, s5, s15 -; GFX8-NEXT: s_sub_i32 s6, s6, s12 -; GFX8-NEXT: s_min_i32 s5, s5, s15 +; GFX8-NEXT: s_max_i32 s6, s5, s13 +; GFX8-NEXT: s_sub_i32 s6, s6, 0x7fff +; GFX8-NEXT: s_min_i32 s5, s5, s13 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 ; GFX8-NEXT: s_sext_i32_i16 s7, s11 -; GFX8-NEXT: s_sub_i32 s5, s5, s13 +; GFX8-NEXT: s_sub_i32 s5, s5, 0xffff8000 ; GFX8-NEXT: s_max_i32 s6, s6, s7 ; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 @@ -3839,145 +3778,140 @@ ; GFX6-LABEL: s_ssubsat_v8i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_lshl_b32 s0, s0, 16 -; GFX6-NEXT: s_brev_b32 s16, -2 -; GFX6-NEXT: s_max_i32 s18, s0, -1 +; GFX6-NEXT: s_max_i32 s16, s0, -1 ; GFX6-NEXT: s_lshl_b32 s8, s8, 16 -; GFX6-NEXT: s_brev_b32 s17, 1 -; GFX6-NEXT: s_sub_i32 s18, s18, s16 -; GFX6-NEXT: s_min_i32 s19, s0, -1 -; GFX6-NEXT: s_sub_i32 s19, s19, s17 -; GFX6-NEXT: s_max_i32 s8, s18, s8 -; GFX6-NEXT: s_min_i32 s8, s8, s19 +; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff +; GFX6-NEXT: s_min_i32 s17, s0, -1 +; GFX6-NEXT: s_sub_i32 s17, s17, 0x80000000 +; GFX6-NEXT: s_max_i32 s8, s16, s8 +; GFX6-NEXT: s_min_i32 s8, s8, s17 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_sub_i32 s0, s0, s8 ; GFX6-NEXT: s_lshl_b32 s8, s9, 16 ; GFX6-NEXT: s_max_i32 s9, s1, -1 -; GFX6-NEXT: s_sub_i32 s9, 
s9, s16 -; GFX6-NEXT: s_min_i32 s18, s1, -1 -; GFX6-NEXT: s_sub_i32 s18, s18, s17 +; GFX6-NEXT: s_sub_i32 s9, s9, 0x7fffffff +; GFX6-NEXT: s_min_i32 s16, s1, -1 +; GFX6-NEXT: s_sub_i32 s16, s16, 0x80000000 ; GFX6-NEXT: s_max_i32 s8, s9, s8 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_min_i32 s8, s8, s18 +; GFX6-NEXT: s_min_i32 s8, s8, s16 ; GFX6-NEXT: s_max_i32 s9, s2, -1 ; GFX6-NEXT: s_sub_i32 s1, s1, s8 ; GFX6-NEXT: s_lshl_b32 s8, s10, 16 -; GFX6-NEXT: s_sub_i32 s9, s9, s16 +; GFX6-NEXT: s_sub_i32 s9, s9, 0x7fffffff ; GFX6-NEXT: s_min_i32 s10, s2, -1 -; GFX6-NEXT: s_sub_i32 s10, s10, s17 +; GFX6-NEXT: s_sub_i32 s10, s10, 0x80000000 ; GFX6-NEXT: s_max_i32 s8, s9, s8 ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 ; GFX6-NEXT: s_min_i32 s8, s8, s10 ; GFX6-NEXT: s_max_i32 s9, s3, -1 ; GFX6-NEXT: s_sub_i32 s2, s2, s8 ; GFX6-NEXT: s_lshl_b32 s8, s11, 16 -; GFX6-NEXT: s_sub_i32 s9, s9, s16 +; GFX6-NEXT: s_sub_i32 s9, s9, 0x7fffffff ; GFX6-NEXT: s_min_i32 s10, s3, -1 -; GFX6-NEXT: s_sub_i32 s10, s10, s17 +; GFX6-NEXT: s_sub_i32 s10, s10, 0x80000000 ; GFX6-NEXT: s_max_i32 s8, s9, s8 ; GFX6-NEXT: s_lshl_b32 s4, s4, 16 ; GFX6-NEXT: s_min_i32 s8, s8, s10 ; GFX6-NEXT: s_max_i32 s9, s4, -1 ; GFX6-NEXT: s_sub_i32 s3, s3, s8 ; GFX6-NEXT: s_lshl_b32 s8, s12, 16 -; GFX6-NEXT: s_sub_i32 s9, s9, s16 +; GFX6-NEXT: s_sub_i32 s9, s9, 0x7fffffff ; GFX6-NEXT: s_min_i32 s10, s4, -1 -; GFX6-NEXT: s_sub_i32 s10, s10, s17 +; GFX6-NEXT: s_sub_i32 s10, s10, 0x80000000 ; GFX6-NEXT: s_max_i32 s8, s9, s8 ; GFX6-NEXT: s_lshl_b32 s5, s5, 16 ; GFX6-NEXT: s_min_i32 s8, s8, s10 ; GFX6-NEXT: s_max_i32 s9, s5, -1 ; GFX6-NEXT: s_sub_i32 s4, s4, s8 ; GFX6-NEXT: s_lshl_b32 s8, s13, 16 -; GFX6-NEXT: s_sub_i32 s9, s9, s16 +; GFX6-NEXT: s_sub_i32 s9, s9, 0x7fffffff ; GFX6-NEXT: s_min_i32 s10, s5, -1 -; GFX6-NEXT: s_sub_i32 s10, s10, s17 +; GFX6-NEXT: s_sub_i32 s10, s10, 0x80000000 ; GFX6-NEXT: s_max_i32 s8, s9, s8 ; GFX6-NEXT: s_lshl_b32 s6, s6, 16 ; GFX6-NEXT: s_min_i32 s8, s8, s10 ; GFX6-NEXT: s_max_i32 s9, s6, 
-1 ; GFX6-NEXT: s_sub_i32 s5, s5, s8 ; GFX6-NEXT: s_lshl_b32 s8, s14, 16 -; GFX6-NEXT: s_sub_i32 s9, s9, s16 +; GFX6-NEXT: s_sub_i32 s9, s9, 0x7fffffff ; GFX6-NEXT: s_min_i32 s10, s6, -1 -; GFX6-NEXT: s_sub_i32 s10, s10, s17 +; GFX6-NEXT: s_sub_i32 s10, s10, 0x80000000 ; GFX6-NEXT: s_max_i32 s8, s9, s8 ; GFX6-NEXT: s_lshl_b32 s7, s7, 16 ; GFX6-NEXT: s_min_i32 s8, s8, s10 ; GFX6-NEXT: s_max_i32 s9, s7, -1 +; GFX6-NEXT: s_ashr_i32 s1, s1, 16 ; GFX6-NEXT: s_sub_i32 s6, s6, s8 ; GFX6-NEXT: s_lshl_b32 s8, s15, 16 -; GFX6-NEXT: s_sub_i32 s9, s9, s16 +; GFX6-NEXT: s_sub_i32 s9, s9, 0x7fffffff ; GFX6-NEXT: s_min_i32 s10, s7, -1 -; GFX6-NEXT: s_sub_i32 s10, s10, s17 -; GFX6-NEXT: s_max_i32 s8, s9, s8 -; GFX6-NEXT: s_min_i32 s8, s8, s10 -; GFX6-NEXT: s_ashr_i32 s1, s1, 16 -; GFX6-NEXT: s_sub_i32 s7, s7, s8 -; GFX6-NEXT: s_mov_b32 s8, 0xffff ; GFX6-NEXT: s_ashr_i32 s0, s0, 16 -; GFX6-NEXT: s_and_b32 s1, s1, s8 +; GFX6-NEXT: s_sub_i32 s10, s10, 0x80000000 +; GFX6-NEXT: s_max_i32 s8, s9, s8 +; GFX6-NEXT: s_and_b32 s1, s1, 0xffff ; GFX6-NEXT: s_ashr_i32 s2, s2, 16 ; GFX6-NEXT: s_ashr_i32 s3, s3, 16 -; GFX6-NEXT: s_and_b32 s0, s0, s8 +; GFX6-NEXT: s_min_i32 s8, s8, s10 +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_ashr_i32 s5, s5, 16 +; GFX6-NEXT: s_sub_i32 s7, s7, s8 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s2, s8 -; GFX6-NEXT: s_and_b32 s2, s3, s8 +; GFX6-NEXT: s_and_b32 s1, s2, 0xffff +; GFX6-NEXT: s_and_b32 s2, s3, 0xffff ; GFX6-NEXT: s_ashr_i32 s4, s4, 16 ; GFX6-NEXT: s_ashr_i32 s7, s7, 16 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_and_b32 s3, s5, s8 +; GFX6-NEXT: s_and_b32 s3, s5, 0xffff ; GFX6-NEXT: s_ashr_i32 s6, s6, 16 ; GFX6-NEXT: s_or_b32 s1, s1, s2 -; GFX6-NEXT: s_and_b32 s2, s4, s8 +; GFX6-NEXT: s_and_b32 s2, s4, 0xffff ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 -; GFX6-NEXT: s_and_b32 s4, s7, s8 +; GFX6-NEXT: s_and_b32 s4, s7, 0xffff ; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_and_b32 s3, s6, s8 +; 
GFX6-NEXT: s_and_b32 s3, s6, 0xffff ; GFX6-NEXT: s_lshl_b32 s4, s4, 16 ; GFX6-NEXT: s_or_b32 s3, s3, s4 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_ssubsat_v8i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_sext_i32_i16 s18, s0 -; GFX8-NEXT: s_sext_i32_i16 s19, -1 -; GFX8-NEXT: s_movk_i32 s16, 0x7fff -; GFX8-NEXT: s_max_i32 s20, s18, s19 -; GFX8-NEXT: s_sub_i32 s20, s20, s16 +; GFX8-NEXT: s_sext_i32_i16 s16, s0 +; GFX8-NEXT: s_sext_i32_i16 s17, -1 +; GFX8-NEXT: s_max_i32 s18, s16, s17 +; GFX8-NEXT: s_sub_i32 s18, s18, 0x7fff ; GFX8-NEXT: s_lshr_b32 s12, s4, 16 -; GFX8-NEXT: s_movk_i32 s17, 0x8000 -; GFX8-NEXT: s_min_i32 s18, s18, s19 -; GFX8-NEXT: s_sext_i32_i16 s20, s20 +; GFX8-NEXT: s_min_i32 s16, s16, s17 +; GFX8-NEXT: s_sext_i32_i16 s18, s18 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 -; GFX8-NEXT: s_sub_i32 s18, s18, s17 -; GFX8-NEXT: s_max_i32 s4, s20, s4 +; GFX8-NEXT: s_sub_i32 s16, s16, 0xffff8000 +; GFX8-NEXT: s_max_i32 s4, s18, s4 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 -; GFX8-NEXT: s_sext_i32_i16 s18, s18 +; GFX8-NEXT: s_sext_i32_i16 s16, s16 ; GFX8-NEXT: s_lshr_b32 s8, s0, 16 -; GFX8-NEXT: s_min_i32 s4, s4, s18 +; GFX8-NEXT: s_min_i32 s4, s4, s16 ; GFX8-NEXT: s_sub_i32 s0, s0, s4 ; GFX8-NEXT: s_sext_i32_i16 s4, s8 -; GFX8-NEXT: s_max_i32 s18, s4, s19 -; GFX8-NEXT: s_sub_i32 s18, s18, s16 -; GFX8-NEXT: s_min_i32 s4, s4, s19 -; GFX8-NEXT: s_sext_i32_i16 s18, s18 +; GFX8-NEXT: s_max_i32 s16, s4, s17 +; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fff +; GFX8-NEXT: s_min_i32 s4, s4, s17 +; GFX8-NEXT: s_sext_i32_i16 s16, s16 ; GFX8-NEXT: s_sext_i32_i16 s12, s12 -; GFX8-NEXT: s_sub_i32 s4, s4, s17 -; GFX8-NEXT: s_max_i32 s12, s18, s12 +; GFX8-NEXT: s_sub_i32 s4, s4, 0xffff8000 +; GFX8-NEXT: s_max_i32 s12, s16, s12 ; GFX8-NEXT: s_sext_i32_i16 s12, s12 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_min_i32 s4, s12, s4 ; GFX8-NEXT: s_sub_i32 s4, s8, s4 ; GFX8-NEXT: s_sext_i32_i16 s8, s1 -; GFX8-NEXT: s_max_i32 s12, s8, s19 -; GFX8-NEXT: s_sub_i32 s12, s12, s16 +; GFX8-NEXT: 
s_max_i32 s12, s8, s17 +; GFX8-NEXT: s_sub_i32 s12, s12, 0x7fff ; GFX8-NEXT: s_lshr_b32 s13, s5, 16 -; GFX8-NEXT: s_min_i32 s8, s8, s19 +; GFX8-NEXT: s_min_i32 s8, s8, s17 ; GFX8-NEXT: s_sext_i32_i16 s12, s12 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 -; GFX8-NEXT: s_sub_i32 s8, s8, s17 +; GFX8-NEXT: s_sub_i32 s8, s8, 0xffff8000 ; GFX8-NEXT: s_max_i32 s5, s12, s5 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 ; GFX8-NEXT: s_sext_i32_i16 s8, s8 @@ -3985,25 +3919,25 @@ ; GFX8-NEXT: s_min_i32 s5, s5, s8 ; GFX8-NEXT: s_sub_i32 s1, s1, s5 ; GFX8-NEXT: s_sext_i32_i16 s5, s9 -; GFX8-NEXT: s_max_i32 s8, s5, s19 -; GFX8-NEXT: s_sub_i32 s8, s8, s16 -; GFX8-NEXT: s_min_i32 s5, s5, s19 +; GFX8-NEXT: s_max_i32 s8, s5, s17 +; GFX8-NEXT: s_sub_i32 s8, s8, 0x7fff +; GFX8-NEXT: s_min_i32 s5, s5, s17 ; GFX8-NEXT: s_sext_i32_i16 s8, s8 ; GFX8-NEXT: s_sext_i32_i16 s12, s13 -; GFX8-NEXT: s_sub_i32 s5, s5, s17 +; GFX8-NEXT: s_sub_i32 s5, s5, 0xffff8000 ; GFX8-NEXT: s_max_i32 s8, s8, s12 ; GFX8-NEXT: s_sext_i32_i16 s8, s8 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 ; GFX8-NEXT: s_min_i32 s5, s8, s5 ; GFX8-NEXT: s_sext_i32_i16 s8, s2 ; GFX8-NEXT: s_sub_i32 s5, s9, s5 -; GFX8-NEXT: s_max_i32 s9, s8, s19 -; GFX8-NEXT: s_sub_i32 s9, s9, s16 +; GFX8-NEXT: s_max_i32 s9, s8, s17 +; GFX8-NEXT: s_sub_i32 s9, s9, 0x7fff ; GFX8-NEXT: s_lshr_b32 s14, s6, 16 -; GFX8-NEXT: s_min_i32 s8, s8, s19 +; GFX8-NEXT: s_min_i32 s8, s8, s17 ; GFX8-NEXT: s_sext_i32_i16 s9, s9 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 -; GFX8-NEXT: s_sub_i32 s8, s8, s17 +; GFX8-NEXT: s_sub_i32 s8, s8, 0xffff8000 ; GFX8-NEXT: s_max_i32 s6, s9, s6 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 ; GFX8-NEXT: s_sext_i32_i16 s8, s8 @@ -4011,24 +3945,24 @@ ; GFX8-NEXT: s_min_i32 s6, s6, s8 ; GFX8-NEXT: s_sub_i32 s2, s2, s6 ; GFX8-NEXT: s_sext_i32_i16 s6, s10 -; GFX8-NEXT: s_max_i32 s8, s6, s19 -; GFX8-NEXT: s_sub_i32 s8, s8, s16 -; GFX8-NEXT: s_min_i32 s6, s6, s19 +; GFX8-NEXT: s_max_i32 s8, s6, s17 +; GFX8-NEXT: s_sub_i32 s8, s8, 0x7fff +; GFX8-NEXT: s_min_i32 s6, s6, s17 ; 
GFX8-NEXT: s_sext_i32_i16 s8, s8 ; GFX8-NEXT: s_sext_i32_i16 s9, s14 -; GFX8-NEXT: s_sub_i32 s6, s6, s17 +; GFX8-NEXT: s_sub_i32 s6, s6, 0xffff8000 ; GFX8-NEXT: s_max_i32 s8, s8, s9 ; GFX8-NEXT: s_sext_i32_i16 s8, s8 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 ; GFX8-NEXT: s_min_i32 s6, s8, s6 ; GFX8-NEXT: s_sext_i32_i16 s8, s3 -; GFX8-NEXT: s_max_i32 s9, s8, s19 -; GFX8-NEXT: s_sub_i32 s9, s9, s16 +; GFX8-NEXT: s_max_i32 s9, s8, s17 +; GFX8-NEXT: s_sub_i32 s9, s9, 0x7fff ; GFX8-NEXT: s_lshr_b32 s15, s7, 16 -; GFX8-NEXT: s_min_i32 s8, s8, s19 +; GFX8-NEXT: s_min_i32 s8, s8, s17 ; GFX8-NEXT: s_sext_i32_i16 s9, s9 ; GFX8-NEXT: s_sext_i32_i16 s7, s7 -; GFX8-NEXT: s_sub_i32 s8, s8, s17 +; GFX8-NEXT: s_sub_i32 s8, s8, 0xffff8000 ; GFX8-NEXT: s_max_i32 s7, s9, s7 ; GFX8-NEXT: s_sext_i32_i16 s7, s7 ; GFX8-NEXT: s_sext_i32_i16 s8, s8 @@ -4036,15 +3970,15 @@ ; GFX8-NEXT: s_min_i32 s7, s7, s8 ; GFX8-NEXT: s_sub_i32 s3, s3, s7 ; GFX8-NEXT: s_sext_i32_i16 s7, s11 -; GFX8-NEXT: s_max_i32 s8, s7, s19 -; GFX8-NEXT: s_sub_i32 s8, s8, s16 +; GFX8-NEXT: s_max_i32 s8, s7, s17 +; GFX8-NEXT: s_sub_i32 s8, s8, 0x7fff ; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX8-NEXT: s_min_i32 s7, s7, s19 +; GFX8-NEXT: s_min_i32 s7, s7, s17 ; GFX8-NEXT: s_sext_i32_i16 s8, s8 ; GFX8-NEXT: s_sext_i32_i16 s9, s15 ; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 ; GFX8-NEXT: s_lshl_b32 s4, s4, 16 -; GFX8-NEXT: s_sub_i32 s7, s7, s17 +; GFX8-NEXT: s_sub_i32 s7, s7, 0xffff8000 ; GFX8-NEXT: s_max_i32 s8, s8, s9 ; GFX8-NEXT: s_or_b32 s0, s0, s4 ; GFX8-NEXT: s_bfe_u32 s4, s5, 0x100000 @@ -4562,9 +4496,8 @@ ; GFX6-NEXT: s_add_u32 s0, s4, 0 ; GFX6-NEXT: s_cselect_b32 s1, 1, 0 ; GFX6-NEXT: s_and_b32 s1, s1, 1 -; GFX6-NEXT: s_brev_b32 s5, 1 ; GFX6-NEXT: s_cmp_lg_u32 s1, 0 -; GFX6-NEXT: s_addc_u32 s1, s4, s5 +; GFX6-NEXT: s_addc_u32 s1, s4, 0x80000000 ; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: s_sub_u32 s0, s2, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s1 @@ -4587,7 +4520,7 @@ ; GFX6-NEXT: s_cselect_b32 s2, 1, 0 ; GFX6-NEXT: s_and_b32 
s2, s2, 1 ; GFX6-NEXT: s_cmp_lg_u32 s2, 0 -; GFX6-NEXT: s_addc_u32 s3, s4, s5 +; GFX6-NEXT: s_addc_u32 s3, s4, 0x80000000 ; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: v_mov_b32_e32 v3, s3 ; GFX6-NEXT: v_mov_b32_e32 v5, s1 @@ -4615,9 +4548,8 @@ ; GFX8-NEXT: s_add_u32 s0, s4, 0 ; GFX8-NEXT: s_cselect_b32 s1, 1, 0 ; GFX8-NEXT: s_and_b32 s1, s1, 1 -; GFX8-NEXT: s_brev_b32 s5, 1 ; GFX8-NEXT: s_cmp_lg_u32 s1, 0 -; GFX8-NEXT: s_addc_u32 s1, s4, s5 +; GFX8-NEXT: s_addc_u32 s1, s4, 0x80000000 ; GFX8-NEXT: v_mov_b32_e32 v1, s0 ; GFX8-NEXT: s_sub_u32 s0, s2, s6 ; GFX8-NEXT: v_mov_b32_e32 v2, s1 @@ -4640,7 +4572,7 @@ ; GFX8-NEXT: s_cselect_b32 s2, 1, 0 ; GFX8-NEXT: s_and_b32 s2, s2, 1 ; GFX8-NEXT: s_cmp_lg_u32 s2, 0 -; GFX8-NEXT: s_addc_u32 s3, s4, s5 +; GFX8-NEXT: s_addc_u32 s3, s4, 0x80000000 ; GFX8-NEXT: v_mov_b32_e32 v1, s0 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: v_mov_b32_e32 v5, s1 @@ -4668,9 +4600,8 @@ ; GFX9-NEXT: s_add_u32 s0, s4, 0 ; GFX9-NEXT: s_cselect_b32 s1, 1, 0 ; GFX9-NEXT: s_and_b32 s1, s1, 1 -; GFX9-NEXT: s_brev_b32 s5, 1 ; GFX9-NEXT: s_cmp_lg_u32 s1, 0 -; GFX9-NEXT: s_addc_u32 s1, s4, s5 +; GFX9-NEXT: s_addc_u32 s1, s4, 0x80000000 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: s_sub_u32 s0, s2, s6 ; GFX9-NEXT: v_mov_b32_e32 v2, s1 @@ -4693,7 +4624,7 @@ ; GFX9-NEXT: s_cselect_b32 s2, 1, 0 ; GFX9-NEXT: s_and_b32 s2, s2, 1 ; GFX9-NEXT: s_cmp_lg_u32 s2, 0 -; GFX9-NEXT: s_addc_u32 s3, s4, s5 +; GFX9-NEXT: s_addc_u32 s3, s4, 0x80000000 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: v_mov_b32_e32 v5, s1 @@ -4713,7 +4644,6 @@ ; GFX10-NEXT: s_and_b32 s9, s9, 1 ; GFX10-NEXT: v_mov_b32_e32 v0, s8 ; GFX10-NEXT: s_cmp_lg_u32 s9, 0 -; GFX10-NEXT: s_brev_b32 s10, 1 ; GFX10-NEXT: s_subb_u32 s9, s1, s5 ; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[8:9], s[0:1] ; GFX10-NEXT: s_ashr_i32 s1, s9, 31 @@ -4724,7 +4654,7 @@ ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s0, s8 ; GFX10-NEXT: s_and_b32 s4, s4, 1 ; GFX10-NEXT: s_cmp_lg_u32 s4, 0 -; 
GFX10-NEXT: s_addc_u32 s1, s1, s10 +; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000 ; GFX10-NEXT: s_sub_u32 s4, s2, s6 ; GFX10-NEXT: s_cselect_b32 s5, 1, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s1, s8 @@ -4743,7 +4673,7 @@ ; GFX10-NEXT: s_and_b32 s3, s3, 1 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: s_cmp_lg_u32 s3, 0 -; GFX10-NEXT: s_addc_u32 s1, s1, s10 +; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s1, s2 ; GFX10-NEXT: v_readfirstlane_b32 s1, v1 ; GFX10-NEXT: v_readfirstlane_b32 s2, v2 @@ -5622,13 +5552,12 @@ ; GFX6-NEXT: s_and_b32 s2, s2, 1 ; GFX6-NEXT: s_cmp_lg_u32 s2, 0 ; GFX6-NEXT: s_addc_u32 s2, s3, 0 -; GFX6-NEXT: s_cselect_b32 s9, 1, 0 +; GFX6-NEXT: s_cselect_b32 s8, 1, 0 ; GFX6-NEXT: v_cmp_eq_u64_e64 vcc, s[10:11], 0 -; GFX6-NEXT: s_and_b32 s9, s9, 1 -; GFX6-NEXT: s_brev_b32 s8, 1 -; GFX6-NEXT: s_cmp_lg_u32 s9, 0 +; GFX6-NEXT: s_and_b32 s8, s8, 1 +; GFX6-NEXT: s_cmp_lg_u32 s8, 0 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX6-NEXT: s_addc_u32 s3, s3, s8 +; GFX6-NEXT: s_addc_u32 s3, s3, 0x80000000 ; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: s_sub_u32 s0, s4, s12 @@ -5683,12 +5612,12 @@ ; GFX6-NEXT: s_cmp_lg_u32 s6, 0 ; GFX6-NEXT: s_addc_u32 s6, s7, 0 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX6-NEXT: s_cselect_b32 s9, 1, 0 +; GFX6-NEXT: s_cselect_b32 s8, 1, 0 ; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX6-NEXT: s_and_b32 s9, s9, 1 -; GFX6-NEXT: s_cmp_lg_u32 s9, 0 +; GFX6-NEXT: s_and_b32 s8, s8, 1 +; GFX6-NEXT: s_cmp_lg_u32 s8, 0 ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX6-NEXT: s_addc_u32 s7, s7, s8 +; GFX6-NEXT: s_addc_u32 s7, s7, 0x80000000 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: v_mov_b32_e32 v3, s0 @@ -5758,12 +5687,11 @@ ; GFX8-NEXT: s_and_b32 s2, s2, 1 ; GFX8-NEXT: s_cmp_lg_u32 s2, 0 ; GFX8-NEXT: s_addc_u32 s2, s3, 0 -; GFX8-NEXT: s_cselect_b32 s9, 1, 0 -; GFX8-NEXT: s_and_b32 s9, s9, 1 -; GFX8-NEXT: 
s_brev_b32 s8, 1 -; GFX8-NEXT: s_cmp_lg_u32 s9, 0 +; GFX8-NEXT: s_cselect_b32 s8, 1, 0 +; GFX8-NEXT: s_and_b32 s8, s8, 1 +; GFX8-NEXT: s_cmp_lg_u32 s8, 0 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX8-NEXT: s_addc_u32 s3, s3, s8 +; GFX8-NEXT: s_addc_u32 s3, s3, 0x80000000 ; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, s0 ; GFX8-NEXT: s_sub_u32 s0, s4, s12 @@ -5824,12 +5752,12 @@ ; GFX8-NEXT: s_cmp_lg_u32 s6, 0 ; GFX8-NEXT: s_addc_u32 s6, s7, 0 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX8-NEXT: s_cselect_b32 s9, 1, 0 +; GFX8-NEXT: s_cselect_b32 s8, 1, 0 ; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX8-NEXT: s_and_b32 s9, s9, 1 -; GFX8-NEXT: s_cmp_lg_u32 s9, 0 +; GFX8-NEXT: s_and_b32 s8, s8, 1 +; GFX8-NEXT: s_cmp_lg_u32 s8, 0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: s_addc_u32 s7, s7, s8 +; GFX8-NEXT: s_addc_u32 s7, s7, 0x80000000 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: v_mov_b32_e32 v2, s5 ; GFX8-NEXT: v_mov_b32_e32 v3, s0 @@ -5899,12 +5827,11 @@ ; GFX9-NEXT: s_and_b32 s2, s2, 1 ; GFX9-NEXT: s_cmp_lg_u32 s2, 0 ; GFX9-NEXT: s_addc_u32 s2, s3, 0 -; GFX9-NEXT: s_cselect_b32 s9, 1, 0 -; GFX9-NEXT: s_and_b32 s9, s9, 1 -; GFX9-NEXT: s_brev_b32 s8, 1 -; GFX9-NEXT: s_cmp_lg_u32 s9, 0 +; GFX9-NEXT: s_cselect_b32 s8, 1, 0 +; GFX9-NEXT: s_and_b32 s8, s8, 1 +; GFX9-NEXT: s_cmp_lg_u32 s8, 0 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX9-NEXT: s_addc_u32 s3, s3, s8 +; GFX9-NEXT: s_addc_u32 s3, s3, 0x80000000 ; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: s_sub_u32 s0, s4, s12 @@ -5965,12 +5892,12 @@ ; GFX9-NEXT: s_cmp_lg_u32 s6, 0 ; GFX9-NEXT: s_addc_u32 s6, s7, 0 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX9-NEXT: s_cselect_b32 s9, 1, 0 +; GFX9-NEXT: s_cselect_b32 s8, 1, 0 ; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX9-NEXT: s_and_b32 s9, s9, 1 -; GFX9-NEXT: s_cmp_lg_u32 s9, 0 +; GFX9-NEXT: s_and_b32 s8, s8, 1 +; GFX9-NEXT: s_cmp_lg_u32 s8, 0 ; GFX9-NEXT: v_and_b32_e32 v0, 
1, v0 -; GFX9-NEXT: s_addc_u32 s7, s7, s8 +; GFX9-NEXT: s_addc_u32 s7, s7, 0x80000000 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: v_mov_b32_e32 v3, s0 @@ -6022,18 +5949,17 @@ ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 ; GFX10-NEXT: s_cselect_b32 s0, 1, 0 ; GFX10-NEXT: s_ashr_i32 s3, s19, 31 -; GFX10-NEXT: s_and_b32 s0, 1, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s1 ; GFX10-NEXT: v_cmp_gt_i64_e64 s1, s[10:11], 0 -; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 -; GFX10-NEXT: s_add_u32 s0, s3, 0 -; GFX10-NEXT: s_brev_b32 s10, 1 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s1 -; GFX10-NEXT: s_cselect_b32 s1, 1, 0 -; GFX10-NEXT: s_and_b32 s1, s1, 1 +; GFX10-NEXT: s_and_b32 s1, 1, s0 +; GFX10-NEXT: s_add_u32 s0, s3, 0 +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1 +; GFX10-NEXT: s_cselect_b32 s2, 1, 0 +; GFX10-NEXT: s_and_b32 s2, s2, 1 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo -; GFX10-NEXT: s_cmp_lg_u32 s1, 0 +; GFX10-NEXT: s_cmp_lg_u32 s2, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, s17 ; GFX10-NEXT: s_addc_u32 s1, s3, 0 ; GFX10-NEXT: s_cselect_b32 s2, 1, 0 @@ -6047,7 +5973,7 @@ ; GFX10-NEXT: s_and_b32 s8, s8, 1 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX10-NEXT: s_cmp_lg_u32 s8, 0 -; GFX10-NEXT: s_addc_u32 s3, s3, s10 +; GFX10-NEXT: s_addc_u32 s3, s3, 0x80000000 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s0, vcc_lo ; GFX10-NEXT: s_sub_u32 s0, s4, s12 ; GFX10-NEXT: s_cselect_b32 s8, 1, 0 @@ -6105,7 +6031,7 @@ ; GFX10-NEXT: s_and_b32 s6, s6, 1 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 ; GFX10-NEXT: s_cmp_lg_u32 s6, 0 -; GFX10-NEXT: s_addc_u32 s1, s5, s10 +; GFX10-NEXT: s_addc_u32 s1, s5, 0x80000000 ; GFX10-NEXT: v_cndmask_b32_e64 v4, v5, s2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v5, v6, s3, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v6, v7, s4, vcc_lo diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/trunc.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/trunc.ll --- 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/trunc.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/trunc.ll @@ -162,23 +162,21 @@ define amdgpu_ps <2 x i32> @s_trunc_v4i32_to_v4i16(<4 x i32> inreg %src) { ; GFX7-LABEL: s_trunc_v4i32_to_v4i16: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_mov_b32 s4, 0xffff ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 -; GFX7-NEXT: s_and_b32 s0, s0, s4 +; GFX7-NEXT: s_and_b32 s0, s0, 0xffff ; GFX7-NEXT: s_or_b32 s0, s1, s0 ; GFX7-NEXT: s_lshl_b32 s1, s3, 16 -; GFX7-NEXT: s_and_b32 s2, s2, s4 +; GFX7-NEXT: s_and_b32 s2, s2, 0xffff ; GFX7-NEXT: s_or_b32 s1, s1, s2 ; GFX7-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_trunc_v4i32_to_v4i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b32 s4, 0xffff ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_and_b32 s0, s0, s4 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_or_b32 s0, s1, s0 ; GFX8-NEXT: s_lshl_b32 s1, s3, 16 -; GFX8-NEXT: s_and_b32 s2, s2, s4 +; GFX8-NEXT: s_and_b32 s2, s2, 0xffff ; GFX8-NEXT: s_or_b32 s1, s1, s2 ; GFX8-NEXT: ; return to shader part epilog %trunc = trunc <4 x i32> %src to <4 x i16> diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll @@ -242,12 +242,11 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_mov_b32 s4, 8 -; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX10-NEXT: v_lshrrev_b32_sdwa v3, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_lshrrev_b32_sdwa v4, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_sdwa v3, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: s_movk_i32 s4, 0xff -; GFX10-NEXT: v_and_or_b32 v0, v0, v2, 
v3 -; GFX10-NEXT: v_and_or_b32 v1, v1, v2, v4 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v2 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v3 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_add_u16 v0, v0, v1 clamp @@ -305,17 +304,16 @@ ; GFX9-LABEL: s_uaddsat_v2i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_lshr_b32 s2, s0, 8 -; GFX9-NEXT: s_lshr_b32 s3, s1, 8 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX9-NEXT: s_lshr_b32 s3, s1, 8 +; GFX9-NEXT: s_lshr_b32 s2, s0, 16 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s3 -; GFX9-NEXT: s_mov_b32 s2, 0x80008 -; GFX9-NEXT: s_lshr_b32 s3, s0, 16 -; GFX9-NEXT: s_lshl_b32 s0, s0, s2 -; GFX9-NEXT: s_lshl_b32 s3, s3, 8 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s3 -; GFX9-NEXT: s_lshr_b32 s3, s1, 16 -; GFX9-NEXT: s_lshl_b32 s1, s1, s2 -; GFX9-NEXT: s_lshl_b32 s2, s3, 8 +; GFX9-NEXT: s_lshl_b32 s0, s0, 0x80008 +; GFX9-NEXT: s_lshl_b32 s2, s2, 8 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX9-NEXT: s_lshr_b32 s2, s1, 16 +; GFX9-NEXT: s_lshl_b32 s1, s1, 0x80008 +; GFX9-NEXT: s_lshl_b32 s2, s2, 8 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s2 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: v_pk_add_u16 v0, s0, v0 clamp @@ -332,15 +330,14 @@ ; GFX10-NEXT: s_lshr_b32 s3, s1, 8 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s3 -; GFX10-NEXT: s_mov_b32 s2, 0x80008 -; GFX10-NEXT: s_lshr_b32 s3, s0, 16 -; GFX10-NEXT: s_lshr_b32 s4, s1, 16 -; GFX10-NEXT: s_lshl_b32 s0, s0, s2 +; GFX10-NEXT: s_lshr_b32 s2, s0, 16 +; GFX10-NEXT: s_lshr_b32 s3, s1, 16 +; GFX10-NEXT: s_lshl_b32 s0, s0, 0x80008 +; GFX10-NEXT: s_lshl_b32 s2, s2, 8 +; GFX10-NEXT: s_lshl_b32 s1, s1, 0x80008 ; GFX10-NEXT: s_lshl_b32 s3, s3, 8 -; GFX10-NEXT: s_lshl_b32 s1, s1, s2 -; GFX10-NEXT: s_lshl_b32 s2, s4, 8 -; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s3 -; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s2 +; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, 
s3 ; GFX10-NEXT: v_pk_add_u16 v0, s0, s1 clamp ; GFX10-NEXT: s_movk_i32 s0, 0xff ; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] @@ -468,24 +465,22 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v1 ; GFX10-NEXT: s_mov_b32 s4, 8 -; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; GFX10-NEXT: v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: v_lshrrev_b32_sdwa v6, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff -; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX10-NEXT: s_movk_i32 s4, 0xff -; GFX10-NEXT: v_and_or_b32 v0, v0, v7, v2 -; GFX10-NEXT: v_and_or_b32 v1, v1, v7, v6 -; GFX10-NEXT: v_and_or_b32 v2, v3, v7, v4 -; GFX10-NEXT: v_and_or_b32 v3, v8, v7, v5 -; GFX10-NEXT: v_mov_b32_e32 v4, 24 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v2 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v6 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v5, v3 +; GFX10-NEXT: v_and_or_b32 v3, 0xffff, v7, v4 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] +; GFX10-NEXT: v_mov_b32_e32 v4, 24 ; GFX10-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_add_u16 v0, v0, v1 clamp @@ -494,9 +489,9 @@ ; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_b32_e32 v3, s4, v1 +; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v0, v0, s4, v2 +; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -592,27 +587,26 @@ ; GFX9-NEXT: s_lshr_b32 s6, s0, 24 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s3 ; GFX9-NEXT: s_pack_ll_b32_b16 s3, s4, s6 -; GFX9-NEXT: s_mov_b32 s4, 0x80008 -; GFX9-NEXT: s_lshr_b32 s6, s0, 16 +; GFX9-NEXT: s_lshr_b32 s4, s0, 16 ; GFX9-NEXT: s_lshr_b32 s7, s1, 8 -; GFX9-NEXT: s_lshl_b32 s0, s0, s4 -; GFX9-NEXT: s_lshl_b32 s6, s6, 8 +; GFX9-NEXT: s_lshl_b32 s0, s0, 0x80008 +; GFX9-NEXT: s_lshl_b32 s4, s4, 8 ; GFX9-NEXT: s_lshr_b32 s8, s1, 16 ; GFX9-NEXT: s_lshr_b32 s9, s1, 24 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s6 -; GFX9-NEXT: s_lshr_b32 s6, s3, 16 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s4 +; GFX9-NEXT: s_lshr_b32 s4, s3, 16 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s7 -; GFX9-NEXT: s_lshl_b32 s3, s3, s4 +; GFX9-NEXT: s_lshl_b32 s3, s3, 0x80008 +; GFX9-NEXT: s_lshl_b32 s4, s4, 8 +; GFX9-NEXT: s_lshr_b32 s6, s1, 16 +; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s8, s9 +; GFX9-NEXT: s_lshl_b32 s1, s1, 0x80008 +; GFX9-NEXT: s_lshl_b32 s6, s6, 8 +; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s6 +; GFX9-NEXT: s_lshr_b32 s6, s4, 16 +; GFX9-NEXT: s_lshl_b32 s4, s4, 0x80008 ; GFX9-NEXT: s_lshl_b32 s6, s6, 8 -; GFX9-NEXT: s_lshr_b32 s7, s1, 16 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s6, s8, s9 -; GFX9-NEXT: s_lshl_b32 s1, s1, s4 -; GFX9-NEXT: s_lshl_b32 s7, s7, 8 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s7 -; GFX9-NEXT: s_lshr_b32 s7, s6, 16 -; GFX9-NEXT: s_lshl_b32 s4, s6, s4 -; GFX9-NEXT: s_lshl_b32 s6, s7, 8 ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s6 ; GFX9-NEXT: 
v_mov_b32_e32 v0, s1 ; GFX9-NEXT: v_pk_add_u16 v0, s0, v0 clamp @@ -639,39 +633,37 @@ ; GFX10-NEXT: s_lshr_b32 s4, s0, 24 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 ; GFX10-NEXT: s_pack_ll_b32_b16 s2, s3, s4 -; GFX10-NEXT: s_mov_b32 s3, 0x80008 -; GFX10-NEXT: s_lshr_b32 s4, s0, 16 +; GFX10-NEXT: s_lshr_b32 s3, s0, 16 ; GFX10-NEXT: s_lshr_b32 s5, s1, 8 ; GFX10-NEXT: s_lshr_b32 s6, s1, 16 ; GFX10-NEXT: s_lshr_b32 s7, s1, 24 -; GFX10-NEXT: s_lshl_b32 s0, s0, s3 +; GFX10-NEXT: s_lshl_b32 s0, s0, 0x80008 +; GFX10-NEXT: s_lshl_b32 s3, s3, 8 +; GFX10-NEXT: s_lshr_b32 s4, s2, 16 +; GFX10-NEXT: s_lshl_b32 s2, s2, 0x80008 ; GFX10-NEXT: s_lshl_b32 s4, s4, 8 +; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s3 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5 -; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s4 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, s6, s7 -; GFX10-NEXT: s_lshr_b32 s8, s2, 16 -; GFX10-NEXT: s_lshr_b32 s5, s1, 16 -; GFX10-NEXT: s_lshr_b32 s6, s4, 16 -; GFX10-NEXT: s_lshl_b32 s2, s2, s3 -; GFX10-NEXT: s_lshl_b32 s8, s8, 8 -; GFX10-NEXT: s_lshl_b32 s1, s1, s3 +; GFX10-NEXT: s_pack_ll_b32_b16 s3, s6, s7 +; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s4 +; GFX10-NEXT: s_lshr_b32 s4, s1, 16 +; GFX10-NEXT: s_lshr_b32 s5, s3, 16 +; GFX10-NEXT: s_lshl_b32 s1, s1, 0x80008 +; GFX10-NEXT: s_lshl_b32 s4, s4, 8 +; GFX10-NEXT: s_lshl_b32 s3, s3, 0x80008 ; GFX10-NEXT: s_lshl_b32 s5, s5, 8 -; GFX10-NEXT: s_lshl_b32 s3, s4, s3 -; GFX10-NEXT: s_lshl_b32 s4, s6, 8 -; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s8 -; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5 -; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s4 +; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s5 ; GFX10-NEXT: v_pk_add_u16 v0, s0, s1 clamp ; GFX10-NEXT: v_pk_add_u16 v1, s2, s3 clamp ; GFX10-NEXT: s_mov_b32 s0, 8 -; GFX10-NEXT: s_movk_i32 s1, 0xff ; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s0, v0 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_b32_e32 v3, s1, v1 +; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v1 ; GFX10-NEXT: s_mov_b32 s0, 24 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v0, v0, s1, v2 +; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll @@ -926,7 +926,6 @@ ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x20 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 -; GFX10-NEXT: v_mov_b32_e32 v4, 0x4f7ffffe ; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s8 @@ -941,9 +940,9 @@ ; GFX10-NEXT: s_sub_i32 s7, 0, s9 ; GFX10-NEXT: s_sub_i32 s12, 0, s10 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 -; GFX10-NEXT: v_mul_f32_e32 v1, v1, v4 -; GFX10-NEXT: v_mul_f32_e32 v2, v2, v4 -; GFX10-NEXT: v_mul_f32_e32 v3, v3, v4 +; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 +; GFX10-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 +; GFX10-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX10-NEXT: v_cvt_u32_f32_e32 v2, v2 @@ -2308,47 +2307,47 @@ ; GFX8-LABEL: udivrem_v2i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x14 -; GFX8-NEXT: s_mov_b32 s2, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s6, s0, s2 -; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s6 +; GFX8-NEXT: s_and_b32 s2, s0, 0xffff +; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX8-NEXT: s_lshr_b32 s3, s0, 16 ; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s3 -; GFX8-NEXT: s_sub_i32 s1, 0, s6 +; GFX8-NEXT: s_sub_i32 s1, 0, s2 ; 
GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x10 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GFX8-NEXT: s_mov_b32 s4, 0xffff ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_lshr_b32 s4, s0, 16 +; GFX8-NEXT: s_lshr_b32 s5, s0, 16 ; GFX8-NEXT: v_mul_lo_u32 v2, s1, v0 ; GFX8-NEXT: s_sub_i32 s1, 0, s3 -; GFX8-NEXT: s_and_b32 s0, s0, s2 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: v_mul_lo_u32 v3, s1, v1 ; GFX8-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX8-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 -; GFX8-NEXT: v_mul_hi_u32 v1, s4, v1 -; GFX8-NEXT: v_mul_lo_u32 v2, v0, s6 +; GFX8-NEXT: v_mul_hi_u32 v1, s5, v1 +; GFX8-NEXT: v_mul_lo_u32 v2, v0, s2 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 1, v0 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s0, v2 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s6, v2 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s2, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX8-NEXT: v_subrev_u32_e64 v3, s[0:1], s6, v2 +; GFX8-NEXT: v_subrev_u32_e64 v3, s[0:1], s2, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 1, v0 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s6, v2 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s2, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX8-NEXT: v_mul_lo_u32 v3, v1, s3 -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s6, v2 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s2, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s4, v3 +; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s5, v3 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v1 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc @@ -2358,11 +2357,11 @@ ; GFX8-NEXT: 
v_cmp_le_u32_e32 vcc, s3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s3, v3 -; GFX8-NEXT: v_and_b32_e32 v1, s2, v1 +; GFX8-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_e32 v0, s2, v3 +; GFX8-NEXT: v_and_b32_e32 v0, s4, v3 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_or_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_mov_b32_e32 v0, s8 @@ -2375,31 +2374,30 @@ ; ; GFX9-LABEL: udivrem_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s1, s[4:5], 0x14 -; GFX9-NEXT: s_mov_b32 s0, 0xffff +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x14 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s7, s1, s0 +; GFX9-NEXT: s_and_b32 s7, s0, 0xffff ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 -; GFX9-NEXT: s_lshr_b32 s6, s1, 16 +; GFX9-NEXT: s_lshr_b32 s6, s0, 16 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 -; GFX9-NEXT: s_sub_i32 s1, 0, s7 +; GFX9-NEXT: s_sub_i32 s0, 0, s7 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_sub_i32 s2, 0, s6 +; GFX9-NEXT: s_sub_i32 s1, 0, s6 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: v_mul_lo_u32 v2, s1, v0 -; GFX9-NEXT: s_load_dword s1, s[4:5], 0x10 -; GFX9-NEXT: v_mul_lo_u32 v3, s2, v1 +; GFX9-NEXT: v_mul_lo_u32 v2, s0, v0 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x10 +; GFX9-NEXT: v_mul_lo_u32 v3, s1, v1 ; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s9, s1, s0 +; GFX9-NEXT: s_and_b32 s9, s0, 0xffff ; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 ; GFX9-NEXT: v_mul_hi_u32 v0, s9, v0 -; 
GFX9-NEXT: s_lshr_b32 s8, s1, 16 +; GFX9-NEXT: s_lshr_b32 s8, s0, 16 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 ; GFX9-NEXT: v_mul_hi_u32 v1, s8, v1 ; GFX9-NEXT: v_mul_lo_u32 v2, v0, s7 @@ -2441,43 +2439,42 @@ ; GFX10-LABEL: udivrem_v2i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dword s0, s[4:5], 0x14 -; GFX10-NEXT: s_mov_b32 s2, 0xffff ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_lshr_b32 s1, s0, 16 -; GFX10-NEXT: s_and_b32 s3, s0, s2 +; GFX10-NEXT: s_and_b32 s2, s0, 0xffff ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s1 -; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s3 -; GFX10-NEXT: s_sub_i32 s6, 0, s1 +; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s2 +; GFX10-NEXT: s_sub_i32 s3, 0, s1 ; GFX10-NEXT: s_load_dword s0, s[4:5], 0x10 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX10-NEXT: v_mul_lo_u32 v2, s6, v0 -; GFX10-NEXT: s_sub_i32 s6, 0, s3 -; GFX10-NEXT: v_mul_lo_u32 v3, s6, v1 +; GFX10-NEXT: v_mul_lo_u32 v2, s3, v0 +; GFX10-NEXT: s_sub_i32 s3, 0, s2 +; GFX10-NEXT: v_mul_lo_u32 v3, s3, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_lshr_b32 s6, s0, 16 -; GFX10-NEXT: s_and_b32 s0, s0, s2 +; GFX10-NEXT: s_lshr_b32 s3, s0, 16 +; GFX10-NEXT: s_and_b32 s0, s0, 0xffff ; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX10-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3 -; GFX10-NEXT: v_mul_hi_u32 v0, s6, v0 +; GFX10-NEXT: v_mul_hi_u32 v0, s3, v0 ; GFX10-NEXT: v_mul_hi_u32 v1, s0, v1 ; GFX10-NEXT: v_mul_lo_u32 v2, v0, s1 ; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0 -; GFX10-NEXT: v_mul_lo_u32 v3, v1, s3 +; GFX10-NEXT: v_mul_lo_u32 v3, v1, s2 ; GFX10-NEXT: v_add_nc_u32_e32 v6, 1, v1 -; GFX10-NEXT: v_sub_nc_u32_e32 v2, s6, v2 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; 
GFX10-NEXT: v_sub_nc_u32_e32 v2, s3, v2 ; GFX10-NEXT: v_sub_nc_u32_e32 v3, s0, v3 ; GFX10-NEXT: v_subrev_nc_u32_e32 v5, s1, v2 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s1, v2 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s3, v3 -; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s3, v3 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s2, v3 +; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s2, v3 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v6, s0 @@ -2486,19 +2483,17 @@ ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s1, v2 ; GFX10-NEXT: v_subrev_nc_u32_e32 v5, s1, v2 ; GFX10-NEXT: v_add_nc_u32_e32 v6, 1, v1 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s3, v3 -; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s3, v3 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s2, v3 +; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s2, v3 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v6, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v7, s0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX10-NEXT: v_and_or_b32 v0, v1, v4, v0 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v1, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_and_or_b32 v2, v3, v4, v2 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v3, v2 ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: global_store_dword v1, v2, s[6:7] ; GFX10-NEXT: s_endpgm @@ -2631,39 +2626,39 @@ ; GFX8-LABEL: udivrem_i27: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 -; GFX8-NEXT: s_mov_b32 s6, 0x7ffffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s7, s1, s6 -; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s7 -; GFX8-NEXT: s_sub_i32 s1, 0, s7 -; GFX8-NEXT: s_and_b32 s8, s0, s6 +; GFX8-NEXT: s_and_b32 s6, s1, 0x7ffffff +; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s6 +; GFX8-NEXT: s_sub_i32 s1, 0, s6 +; GFX8-NEXT: s_and_b32 s7, 
s0, 0x7ffffff ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_lo_u32 v1, s1, v0 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX8-NEXT: s_mov_b32 s4, 0x7ffffff ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 -; GFX8-NEXT: v_mul_hi_u32 v2, s8, v0 +; GFX8-NEXT: v_mul_hi_u32 v2, s7, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_mul_lo_u32 v3, v2, s7 +; GFX8-NEXT: v_mul_lo_u32 v3, v2, s6 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2 -; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s8, v3 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 +; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s7, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s7, v3 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s6, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v2 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s7, v3 -; GFX8-NEXT: v_and_b32_e32 v2, s6, v2 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s6, v3 +; GFX8-NEXT: v_and_b32_e32 v2, s4, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_and_b32_e32 v2, s6, v3 +; GFX8-NEXT: v_and_b32_e32 v2, s4, v3 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm @@ -2671,49 +2666,48 @@ ; GFX9-LABEL: udivrem_i27: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 -; GFX9-NEXT: s_mov_b32 s6, 0x7ffffff ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s7, s1, s6 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 -; GFX9-NEXT: s_sub_i32 s1, 0, s7 -; GFX9-NEXT: 
s_and_b32 s8, s0, s6 +; GFX9-NEXT: s_and_b32 s6, s1, 0x7ffffff +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 +; GFX9-NEXT: s_sub_i32 s1, 0, s6 +; GFX9-NEXT: s_and_b32 s7, s0, 0x7ffffff ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_lo_u32 v1, s1, v0 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_mov_b32 s4, 0x7ffffff ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 -; GFX9-NEXT: v_mul_hi_u32 v0, s8, v0 -; GFX9-NEXT: v_mul_lo_u32 v1, v0, s7 +; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 +; GFX9-NEXT: v_mul_lo_u32 v1, v0, s6 ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_sub_u32_e32 v1, s8, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 +; GFX9-NEXT: v_sub_u32_e32 v1, s7, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s7, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s6, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s7, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s6, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX9-NEXT: v_and_b32_e32 v0, s6, v0 +; GFX9-NEXT: v_and_b32_e32 v0, s4, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v2, v0, s[0:1] -; GFX9-NEXT: v_and_b32_e32 v0, s6, v1 +; GFX9-NEXT: v_and_b32_e32 v0, s4, v1 ; GFX9-NEXT: global_store_dword v2, v0, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: udivrem_i27: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 -; GFX10-NEXT: s_mov_b32 s6, 0x7ffffff ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_and_b32 s7, s1, s6 -; GFX10-NEXT: s_and_b32 s0, s0, s6 -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s7 -; GFX10-NEXT: s_sub_i32 s1, 0, s7 +; GFX10-NEXT: s_and_b32 s6, s1, 0x7ffffff 
+; GFX10-NEXT: s_and_b32 s0, s0, 0x7ffffff +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s6 +; GFX10-NEXT: s_sub_i32 s1, 0, s6 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -2721,22 +2715,22 @@ ; GFX10-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1 ; GFX10-NEXT: v_mul_hi_u32 v0, s0, v0 -; GFX10-NEXT: v_mul_lo_u32 v1, v0, s7 +; GFX10-NEXT: v_mul_lo_u32 v1, v0, s6 ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, s0, v1 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s7, v1 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s7, v1 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s6, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s7, v1 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s7, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v1 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s6, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-NEXT: v_and_b32_e32 v0, s6, v0 -; GFX10-NEXT: v_and_b32_e32 v1, s6, v1 +; GFX10-NEXT: v_and_b32_e32 v0, 0x7ffffff, v0 +; GFX10-NEXT: v_and_b32_e32 v1, 0x7ffffff, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-NEXT: global_store_dword v2, v1, s[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll @@ -272,13 +272,13 @@ ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 ; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v3 ; GISEL-NEXT: v_subrev_i32_e32 v3, vcc, s4, v0 -; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v1, v2 +; GISEL-NEXT: 
v_subrev_i32_e32 v4, vcc, 0x12d8fb, v1 ; GISEL-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 ; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 ; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GISEL-NEXT: v_subrev_i32_e32 v3, vcc, s4, v0 -; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v1, v2 +; GISEL-NEXT: v_subrev_i32_e32 v4, vcc, 0x12d8fb, v1 ; GISEL-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 ; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll @@ -1081,13 +1081,13 @@ ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; CHECK-NEXT: v_subrev_i32_e32 v6, vcc, s6, v3 -; CHECK-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v1, vcc +; CHECK-NEXT: v_subrev_i32_e32 v7, vcc, s6, v3 +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 +; CHECK-NEXT: v_cndmask_b32_e64 v4, v4, v6, s[4:5] +; CHECK-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v1, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; CHECK-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; CHECK-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc @@ -1101,232 +1101,232 @@ ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: s_mov_b32 s8, 0x12d8fb -; GISEL-NEXT: v_cvt_f32_u32_e32 v4, s8 -; GISEL-NEXT: s_sub_u32 s6, 0, s8 +; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v4, 0 +; GISEL-NEXT: s_sub_u32 s6, 0, 0x12d8fb ; GISEL-NEXT: s_cselect_b32 s4, 1, 0 -; GISEL-NEXT: 
v_cvt_f32_ubyte0_e32 v5, 0 -; GISEL-NEXT: v_mov_b32_e32 v6, v4 +; GISEL-NEXT: v_cvt_f32_u32_e32 v5, s8 ; GISEL-NEXT: s_and_b32 s4, s4, 1 -; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 -; GISEL-NEXT: v_mac_f32_e32 v6, 0x4f800000, v5 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v6 +; GISEL-NEXT: v_mov_b32_e32 v6, v5 +; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v4 +; GISEL-NEXT: v_mac_f32_e32 v6, 0x4f800000, v4 ; GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; GISEL-NEXT: s_subb_u32 s7, 0, 0 ; GISEL-NEXT: s_bfe_i32 s4, -1, 0x10000 ; GISEL-NEXT: s_bfe_i32 s5, -1, 0x10000 -; GISEL-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v4 -; GISEL-NEXT: v_mul_f32_e32 v7, 0x5f7ffffc, v5 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v7, v5 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v6, v6 ; GISEL-NEXT: v_mov_b32_e32 v5, s4 ; GISEL-NEXT: v_mov_b32_e32 v4, s5 -; GISEL-NEXT: v_mul_f32_e32 v8, 0x2f800000, v6 -; GISEL-NEXT: s_sub_u32 s9, 0, s8 +; GISEL-NEXT: v_mul_f32_e32 v7, 0x5f7ffffc, v7 +; GISEL-NEXT: s_sub_u32 s9, 0, 0x12d8fb ; GISEL-NEXT: s_cselect_b32 s4, 1, 0 -; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v7 -; GISEL-NEXT: v_trunc_f32_e32 v8, v8 +; GISEL-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6 +; GISEL-NEXT: v_mul_f32_e32 v8, 0x2f800000, v7 ; GISEL-NEXT: s_and_b32 s4, s4, 1 +; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v6 +; GISEL-NEXT: v_trunc_f32_e32 v8, v8 ; GISEL-NEXT: v_trunc_f32_e32 v9, v9 -; GISEL-NEXT: v_mac_f32_e32 v6, 0xcf800000, v8 +; GISEL-NEXT: v_mac_f32_e32 v7, 0xcf800000, v8 ; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v8 -; GISEL-NEXT: v_mac_f32_e32 v7, 0xcf800000, v9 -; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v9 -; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6 ; GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; GISEL-NEXT: s_subb_u32 s10, 0, 0 -; GISEL-NEXT: v_mul_lo_u32 v10, s9, v8 ; GISEL-NEXT: s_bfe_i32 s4, -1, 0x10000 ; GISEL-NEXT: s_bfe_i32 s11, -1, 0x10000 +; GISEL-NEXT: v_mac_f32_e32 v6, 0xcf800000, v9 +; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v9 ; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v7 -; GISEL-NEXT: 
v_mul_lo_u32 v11, s6, v9 -; GISEL-NEXT: v_mul_lo_u32 v12, s9, v6 -; GISEL-NEXT: v_mul_lo_u32 v13, s10, v6 -; GISEL-NEXT: v_mul_hi_u32 v14, s9, v6 -; GISEL-NEXT: v_mov_b32_e32 v15, s4 -; GISEL-NEXT: v_mul_lo_u32 v16, s6, v7 -; GISEL-NEXT: v_mul_lo_u32 v17, s7, v7 -; GISEL-NEXT: v_mul_hi_u32 v18, s6, v7 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v13, v10 -; GISEL-NEXT: v_mul_lo_u32 v13, v8, v12 -; GISEL-NEXT: v_mul_hi_u32 v19, v6, v12 -; GISEL-NEXT: v_mul_hi_u32 v12, v8, v12 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v17, v11 +; GISEL-NEXT: v_mul_lo_u32 v10, s9, v8 +; GISEL-NEXT: v_mov_b32_e32 v11, s4 +; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6 +; GISEL-NEXT: v_mul_lo_u32 v12, s6, v9 +; GISEL-NEXT: v_mul_lo_u32 v13, s9, v7 +; GISEL-NEXT: v_mul_lo_u32 v14, s10, v7 +; GISEL-NEXT: v_mul_hi_u32 v15, s9, v7 +; GISEL-NEXT: v_mul_lo_u32 v16, s6, v6 +; GISEL-NEXT: v_mul_lo_u32 v17, s7, v6 +; GISEL-NEXT: v_mul_hi_u32 v18, s6, v6 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v14, v10 +; GISEL-NEXT: v_mul_lo_u32 v14, v8, v13 +; GISEL-NEXT: v_mul_hi_u32 v19, v7, v13 +; GISEL-NEXT: v_mul_hi_u32 v13, v8, v13 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v17, v12 ; GISEL-NEXT: v_mul_lo_u32 v17, v9, v16 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v14 -; GISEL-NEXT: v_mul_hi_u32 v14, v7, v16 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v15 +; GISEL-NEXT: v_mul_hi_u32 v15, v6, v16 ; GISEL-NEXT: v_mul_hi_u32 v16, v9, v16 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v18 -; GISEL-NEXT: v_mul_lo_u32 v18, v7, v11 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v18 +; GISEL-NEXT: v_mul_lo_u32 v18, v6, v12 ; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v18 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v17, v14 -; GISEL-NEXT: v_mul_lo_u32 v14, v6, v10 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v17, v15 +; GISEL-NEXT: v_mul_lo_u32 v15, v7, v10 ; GISEL-NEXT: v_mul_lo_u32 v17, v8, v10 -; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v13, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; 
GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v13, v19 -; GISEL-NEXT: v_mul_hi_u32 v13, v6, v10 +; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v19 +; GISEL-NEXT: v_mul_hi_u32 v14, v7, v10 ; GISEL-NEXT: v_mul_hi_u32 v10, v8, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v19 -; GISEL-NEXT: v_mul_lo_u32 v19, v9, v11 -; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v17, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v15, v19 +; GISEL-NEXT: v_mul_lo_u32 v19, v9, v12 ; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v17, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v13, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v17, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v17, vcc, v18, v17 -; GISEL-NEXT: v_mul_hi_u32 v18, v7, v11 -; GISEL-NEXT: v_mul_hi_u32 v11, v9, v11 +; GISEL-NEXT: v_mul_hi_u32 v18, v6, v12 +; GISEL-NEXT: v_mul_hi_u32 v12, v9, v12 ; GISEL-NEXT: v_add_i32_e32 v16, vcc, v19, v16 ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v18 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v18, vcc, v19, v18 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v18, v17 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v13 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v14 -; 
GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v12 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v15 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v18, v17 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v14 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v13 ; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v8, v10, vcc -; GISEL-NEXT: v_mul_lo_u32 v10, s9, v6 -; GISEL-NEXT: v_mul_lo_u32 v12, s10, v6 -; GISEL-NEXT: v_mul_hi_u32 v13, s9, v6 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v16 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v11, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, s6, v7 -; GISEL-NEXT: v_mul_lo_u32 v14, s7, v7 -; GISEL-NEXT: v_mul_hi_u32 v16, s6, v7 +; GISEL-NEXT: v_mul_lo_u32 v10, s9, v7 +; GISEL-NEXT: v_mul_lo_u32 v13, s10, v7 +; GISEL-NEXT: v_mul_hi_u32 v14, s9, v7 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v16 +; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v12, vcc +; GISEL-NEXT: v_mul_lo_u32 v12, s6, v6 +; GISEL-NEXT: v_mul_lo_u32 v15, s7, v6 +; GISEL-NEXT: v_mul_hi_u32 v16, s6, v6 ; GISEL-NEXT: v_mul_lo_u32 v17, s9, v8 ; GISEL-NEXT: v_mul_lo_u32 v18, v8, v10 -; GISEL-NEXT: v_mul_hi_u32 v19, v6, v10 +; GISEL-NEXT: v_mul_hi_u32 v19, v7, v10 ; GISEL-NEXT: v_mul_hi_u32 v10, v8, v10 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v17 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v17 ; GISEL-NEXT: v_mul_lo_u32 v17, s6, v9 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v17 -; GISEL-NEXT: v_mul_lo_u32 v17, v9, v11 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 -; GISEL-NEXT: v_mul_hi_u32 v13, v7, v11 -; GISEL-NEXT: v_mul_hi_u32 v11, v9, v11 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v16 -; GISEL-NEXT: v_mul_lo_u32 v16, v7, v14 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v17 +; GISEL-NEXT: v_mul_lo_u32 v17, v9, v12 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; GISEL-NEXT: v_mul_hi_u32 v14, v6, v12 +; GISEL-NEXT: v_mul_hi_u32 v12, v9, v12 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v16 +; GISEL-NEXT: v_mul_lo_u32 v16, v6, v15 ; GISEL-NEXT: v_add_i32_e32 v16, vcc, 
v17, v16 ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v16, v13 -; GISEL-NEXT: v_mul_lo_u32 v13, v6, v12 -; GISEL-NEXT: v_mul_lo_u32 v16, v8, v12 -; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v18, v13 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v16, v14 +; GISEL-NEXT: v_mul_lo_u32 v14, v7, v13 +; GISEL-NEXT: v_mul_lo_u32 v16, v8, v13 +; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v18, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v13, v19 -; GISEL-NEXT: v_mul_hi_u32 v13, v6, v12 +; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v19 +; GISEL-NEXT: v_mul_hi_u32 v14, v7, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v18, s[4:5], v18, v19 -; GISEL-NEXT: v_mul_lo_u32 v19, v9, v14 +; GISEL-NEXT: v_mul_lo_u32 v19, v9, v15 ; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v16, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v16, v13 +; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v16, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v16, vcc, v17, v16 -; GISEL-NEXT: v_mul_hi_u32 v17, v7, v14 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v19, v11 +; GISEL-NEXT: v_mul_hi_u32 v17, v6, v15 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v19, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v17 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v17, vcc, v19, v17 ; GISEL-NEXT: v_mov_b32_e32 v19, s11 -; GISEL-NEXT: v_mul_hi_u32 v12, v8, v12 -; GISEL-NEXT: v_mul_hi_u32 v14, v9, v14 +; GISEL-NEXT: v_mul_hi_u32 v13, v8, v13 +; GISEL-NEXT: v_mul_hi_u32 v15, v9, v15 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v18 ; 
GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v16 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v16 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v18 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v18 ; GISEL-NEXT: v_add_i32_e32 v16, vcc, v17, v16 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v16 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10 -; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v8, v12, vcc -; GISEL-NEXT: v_mul_lo_u32 v10, v3, v6 -; GISEL-NEXT: v_mul_hi_u32 v12, v2, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, v3, v6 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v11 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v13, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v1, v7 -; GISEL-NEXT: v_mul_hi_u32 v13, v0, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, v1, v7 -; GISEL-NEXT: v_mul_lo_u32 v14, v2, v8 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v16 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 +; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v8, v13, vcc +; GISEL-NEXT: v_mul_lo_u32 v10, v3, v7 +; GISEL-NEXT: v_mul_hi_u32 v13, v2, v7 +; GISEL-NEXT: v_mul_hi_u32 v7, v3, v7 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v12 +; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v14, vcc +; GISEL-NEXT: v_mul_lo_u32 v12, v1, v6 +; GISEL-NEXT: v_mul_hi_u32 v14, v0, v6 +; GISEL-NEXT: v_mul_hi_u32 v6, v1, v6 +; GISEL-NEXT: v_mul_lo_u32 v15, v2, v8 ; GISEL-NEXT: v_mul_lo_u32 v16, v3, v8 ; GISEL-NEXT: v_mul_hi_u32 v17, v2, v8 ; GISEL-NEXT: v_mul_hi_u32 v8, v3, v8 ; GISEL-NEXT: v_mul_lo_u32 v18, v0, v9 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v18 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v18 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 -; GISEL-NEXT: v_mul_lo_u32 v11, v1, v9 -; GISEL-NEXT: v_mul_hi_u32 v13, v0, v9 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14 +; GISEL-NEXT: v_mul_lo_u32 v12, v1, v9 +; GISEL-NEXT: 
v_mul_hi_u32 v14, v0, v9 ; GISEL-NEXT: v_mul_hi_u32 v9, v1, v9 -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v16, v6 +; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v16, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v11, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v17 +; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v12, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v7, v17 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v14, v10 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v16, v12 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v18, v17 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v15, v10 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v16, v13 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v18, v17 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10 -; GISEL-NEXT: v_mul_lo_u32 v12, s8, v6 -; GISEL-NEXT: v_mul_lo_u32 v14, 0, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, s8, v6 -; GISEL-NEXT: 
v_add_i32_e32 v11, vcc, v11, v13 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v13, v10 ; GISEL-NEXT: v_mul_lo_u32 v13, s8, v7 -; GISEL-NEXT: v_mul_lo_u32 v16, 0, v7 +; GISEL-NEXT: v_mul_lo_u32 v15, 0, v7 ; GISEL-NEXT: v_mul_hi_u32 v7, s8, v7 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14 +; GISEL-NEXT: v_mul_lo_u32 v14, s8, v6 +; GISEL-NEXT: v_mul_lo_u32 v16, 0, v6 +; GISEL-NEXT: v_mul_hi_u32 v6, s8, v6 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v12 ; GISEL-NEXT: v_mul_lo_u32 v8, s8, v8 ; GISEL-NEXT: v_mul_lo_u32 v9, s8, v9 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v14, v8 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v15, v8 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v16, v9 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v12 -; GISEL-NEXT: v_subb_u32_e64 v8, s[4:5], v3, v6, vcc -; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v6 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v13 +; GISEL-NEXT: v_subb_u32_e64 v8, s[4:5], v3, v7, vcc +; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v7 ; GISEL-NEXT: v_cmp_le_u32_e64 s[4:5], s8, v2 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] -; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v13 -; GISEL-NEXT: v_subb_u32_e64 v9, s[6:7], v1, v7, s[4:5] -; GISEL-NEXT: v_sub_i32_e64 v1, s[6:7], v1, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] +; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v14 +; GISEL-NEXT: v_subb_u32_e64 v9, s[6:7], v1, v6, s[4:5] +; GISEL-NEXT: v_sub_i32_e64 v1, s[6:7], v1, v6 ; GISEL-NEXT: v_cmp_le_u32_e64 s[6:7], s8, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[6:7] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v8 -; GISEL-NEXT: 
v_cndmask_b32_e64 v6, v15, v6, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v7, v11, v7, s[6:7] ; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 -; GISEL-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc ; GISEL-NEXT: v_subbrev_u32_e64 v1, vcc, 0, v1, s[4:5] -; GISEL-NEXT: v_subrev_i32_e32 v7, vcc, s8, v2 +; GISEL-NEXT: v_subrev_i32_e32 v6, vcc, s8, v2 ; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; GISEL-NEXT: v_cmp_le_u32_e32 vcc, s8, v7 +; GISEL-NEXT: v_cmp_le_u32_e32 vcc, s8, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc ; GISEL-NEXT: v_subrev_i32_e32 v11, vcc, s8, v0 ; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc @@ -1334,19 +1334,19 @@ ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GISEL-NEXT: v_cndmask_b32_e32 v10, v19, v10, vcc -; GISEL-NEXT: v_subrev_i32_e32 v13, vcc, s8, v7 +; GISEL-NEXT: v_subrev_i32_e32 v13, vcc, s8, v6 ; GISEL-NEXT: v_subbrev_u32_e32 v14, vcc, 0, v3, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GISEL-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc ; GISEL-NEXT: v_subrev_i32_e32 v12, vcc, s8, v11 ; GISEL-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v1, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; GISEL-NEXT: v_cndmask_b32_e32 v7, v7, v13, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v6, v6, v13, vcc ; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v4, v11, v12, s[4:5] ; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v14, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v15, s[4:5] ; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll --- 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll @@ -236,12 +236,11 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_mov_b32 s4, 8 -; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX10-NEXT: v_lshrrev_b32_sdwa v3, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_lshrrev_b32_sdwa v4, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_sdwa v3, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: s_movk_i32 s4, 0xff -; GFX10-NEXT: v_and_or_b32 v0, v0, v2, v3 -; GFX10-NEXT: v_and_or_b32 v1, v1, v2, v4 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v2 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v3 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_sub_u16 v0, v0, v1 clamp @@ -297,17 +296,16 @@ ; GFX9-LABEL: s_usubsat_v2i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_lshr_b32 s2, s0, 8 -; GFX9-NEXT: s_lshr_b32 s3, s1, 8 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX9-NEXT: s_lshr_b32 s3, s1, 8 +; GFX9-NEXT: s_lshr_b32 s2, s0, 16 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s3 -; GFX9-NEXT: s_mov_b32 s2, 0x80008 -; GFX9-NEXT: s_lshr_b32 s3, s0, 16 -; GFX9-NEXT: s_lshl_b32 s0, s0, s2 -; GFX9-NEXT: s_lshl_b32 s3, s3, 8 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s3 -; GFX9-NEXT: s_lshr_b32 s3, s1, 16 -; GFX9-NEXT: s_lshl_b32 s1, s1, s2 -; GFX9-NEXT: s_lshl_b32 s2, s3, 8 +; GFX9-NEXT: s_lshl_b32 s0, s0, 0x80008 +; GFX9-NEXT: s_lshl_b32 s2, s2, 8 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX9-NEXT: s_lshr_b32 s2, s1, 16 +; GFX9-NEXT: s_lshl_b32 s1, s1, 0x80008 +; GFX9-NEXT: s_lshl_b32 s2, s2, 8 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s2 ; GFX9-NEXT: v_mov_b32_e32 v0, 
s1 ; GFX9-NEXT: v_pk_sub_u16 v0, s0, v0 clamp @@ -324,15 +322,14 @@ ; GFX10-NEXT: s_lshr_b32 s3, s1, 8 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s3 -; GFX10-NEXT: s_mov_b32 s2, 0x80008 -; GFX10-NEXT: s_lshr_b32 s3, s0, 16 -; GFX10-NEXT: s_lshr_b32 s4, s1, 16 -; GFX10-NEXT: s_lshl_b32 s0, s0, s2 +; GFX10-NEXT: s_lshr_b32 s2, s0, 16 +; GFX10-NEXT: s_lshr_b32 s3, s1, 16 +; GFX10-NEXT: s_lshl_b32 s0, s0, 0x80008 +; GFX10-NEXT: s_lshl_b32 s2, s2, 8 +; GFX10-NEXT: s_lshl_b32 s1, s1, 0x80008 ; GFX10-NEXT: s_lshl_b32 s3, s3, 8 -; GFX10-NEXT: s_lshl_b32 s1, s1, s2 -; GFX10-NEXT: s_lshl_b32 s2, s4, 8 -; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s3 -; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s2 +; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s3 ; GFX10-NEXT: v_pk_sub_u16 v0, s0, s1 clamp ; GFX10-NEXT: s_movk_i32 s0, 0xff ; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] @@ -456,24 +453,22 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v1 ; GFX10-NEXT: s_mov_b32 s4, 8 -; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; GFX10-NEXT: v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: v_lshrrev_b32_sdwa v6, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff -; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX10-NEXT: s_movk_i32 s4, 0xff -; GFX10-NEXT: v_and_or_b32 v0, v0, v7, v2 -; GFX10-NEXT: v_and_or_b32 v1, v1, v7, v6 -; GFX10-NEXT: 
v_and_or_b32 v2, v3, v7, v4 -; GFX10-NEXT: v_and_or_b32 v3, v8, v7, v5 -; GFX10-NEXT: v_mov_b32_e32 v4, 24 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v2 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v6 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v5, v3 +; GFX10-NEXT: v_and_or_b32 v3, 0xffff, v7, v4 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] +; GFX10-NEXT: v_mov_b32_e32 v4, 24 ; GFX10-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_sub_u16 v0, v0, v1 clamp @@ -482,9 +477,9 @@ ; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_b32_e32 v3, s4, v1 +; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v0, v0, s4, v2 +; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -576,27 +571,26 @@ ; GFX9-NEXT: s_lshr_b32 s6, s0, 24 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s3 ; GFX9-NEXT: s_pack_ll_b32_b16 s3, s4, s6 -; GFX9-NEXT: s_mov_b32 s4, 0x80008 -; GFX9-NEXT: s_lshr_b32 s6, s0, 16 +; GFX9-NEXT: s_lshr_b32 s4, s0, 16 ; GFX9-NEXT: s_lshr_b32 s7, s1, 8 -; GFX9-NEXT: s_lshl_b32 s0, s0, s4 -; GFX9-NEXT: s_lshl_b32 s6, s6, 8 +; GFX9-NEXT: s_lshl_b32 s0, s0, 0x80008 +; GFX9-NEXT: s_lshl_b32 s4, s4, 8 ; GFX9-NEXT: s_lshr_b32 s8, s1, 16 ; GFX9-NEXT: s_lshr_b32 s9, s1, 24 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s6 -; GFX9-NEXT: s_lshr_b32 s6, s3, 16 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s4 +; GFX9-NEXT: s_lshr_b32 s4, s3, 16 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s7 -; GFX9-NEXT: s_lshl_b32 s3, s3, s4 +; GFX9-NEXT: 
s_lshl_b32 s3, s3, 0x80008 +; GFX9-NEXT: s_lshl_b32 s4, s4, 8 +; GFX9-NEXT: s_lshr_b32 s6, s1, 16 +; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s8, s9 +; GFX9-NEXT: s_lshl_b32 s1, s1, 0x80008 +; GFX9-NEXT: s_lshl_b32 s6, s6, 8 +; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s6 +; GFX9-NEXT: s_lshr_b32 s6, s4, 16 +; GFX9-NEXT: s_lshl_b32 s4, s4, 0x80008 ; GFX9-NEXT: s_lshl_b32 s6, s6, 8 -; GFX9-NEXT: s_lshr_b32 s7, s1, 16 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s6, s8, s9 -; GFX9-NEXT: s_lshl_b32 s1, s1, s4 -; GFX9-NEXT: s_lshl_b32 s7, s7, 8 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s7 -; GFX9-NEXT: s_lshr_b32 s7, s6, 16 -; GFX9-NEXT: s_lshl_b32 s4, s6, s4 -; GFX9-NEXT: s_lshl_b32 s6, s7, 8 ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s6 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: v_pk_sub_u16 v0, s0, v0 clamp @@ -623,39 +617,37 @@ ; GFX10-NEXT: s_lshr_b32 s4, s0, 24 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 ; GFX10-NEXT: s_pack_ll_b32_b16 s2, s3, s4 -; GFX10-NEXT: s_mov_b32 s3, 0x80008 -; GFX10-NEXT: s_lshr_b32 s4, s0, 16 +; GFX10-NEXT: s_lshr_b32 s3, s0, 16 ; GFX10-NEXT: s_lshr_b32 s5, s1, 8 ; GFX10-NEXT: s_lshr_b32 s6, s1, 16 ; GFX10-NEXT: s_lshr_b32 s7, s1, 24 -; GFX10-NEXT: s_lshl_b32 s0, s0, s3 +; GFX10-NEXT: s_lshl_b32 s0, s0, 0x80008 +; GFX10-NEXT: s_lshl_b32 s3, s3, 8 +; GFX10-NEXT: s_lshr_b32 s4, s2, 16 +; GFX10-NEXT: s_lshl_b32 s2, s2, 0x80008 ; GFX10-NEXT: s_lshl_b32 s4, s4, 8 +; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s3 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5 -; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s4 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, s6, s7 -; GFX10-NEXT: s_lshr_b32 s8, s2, 16 -; GFX10-NEXT: s_lshr_b32 s5, s1, 16 -; GFX10-NEXT: s_lshr_b32 s6, s4, 16 -; GFX10-NEXT: s_lshl_b32 s2, s2, s3 -; GFX10-NEXT: s_lshl_b32 s8, s8, 8 -; GFX10-NEXT: s_lshl_b32 s1, s1, s3 +; GFX10-NEXT: s_pack_ll_b32_b16 s3, s6, s7 +; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s4 +; GFX10-NEXT: s_lshr_b32 s4, s1, 16 +; 
GFX10-NEXT: s_lshr_b32 s5, s3, 16 +; GFX10-NEXT: s_lshl_b32 s1, s1, 0x80008 +; GFX10-NEXT: s_lshl_b32 s4, s4, 8 +; GFX10-NEXT: s_lshl_b32 s3, s3, 0x80008 ; GFX10-NEXT: s_lshl_b32 s5, s5, 8 -; GFX10-NEXT: s_lshl_b32 s3, s4, s3 -; GFX10-NEXT: s_lshl_b32 s4, s6, 8 -; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s8 -; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5 -; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s4 +; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s5 ; GFX10-NEXT: v_pk_sub_u16 v0, s0, s1 clamp ; GFX10-NEXT: v_pk_sub_u16 v1, s2, s3 clamp ; GFX10-NEXT: s_mov_b32 s0, 8 -; GFX10-NEXT: s_movk_i32 s1, 0xff ; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_b32_e32 v3, s1, v1 +; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v1 ; GFX10-NEXT: s_mov_b32 s0, 24 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v0, v0, s1, v2 +; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll @@ -25,12 +25,11 @@ define amdgpu_ps i32 @scalar_xnor_v2i16_one_use(<2 x i16> inreg %a, <2 x i16> inreg %b) { ; GFX7-LABEL: scalar_xnor_v2i16_one_use: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 s4, 0xffff ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 -; GFX7-NEXT: s_and_b32 s0, s0, s4 +; GFX7-NEXT: s_and_b32 s0, s0, 0xffff ; GFX7-NEXT: s_or_b32 s0, s1, s0 ; GFX7-NEXT: s_lshl_b32 s1, s3, 16 -; GFX7-NEXT: s_and_b32 s2, s2, s4 +; GFX7-NEXT: s_and_b32 s2, s2, 0xffff ; GFX7-NEXT: s_or_b32 s1, s1, s2 ; 
GFX7-NEXT: s_xor_b32 s0, s0, s1 ; GFX7-NEXT: s_xor_b32 s0, s0, -1 @@ -42,10 +41,10 @@ ; GFX8-NEXT: s_xor_b32 s0, s0, s1 ; GFX8-NEXT: s_mov_b32 s3, s2 ; GFX8-NEXT: s_lshr_b32 s1, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, s2 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_and_b32 s0, s0, s2 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_or_b32 s0, s1, s0 ; GFX8-NEXT: ; return to shader part epilog ; @@ -117,18 +116,17 @@ define amdgpu_ps i64 @scalar_xnor_v4i16_one_use(<4 x i16> inreg %a, <4 x i16> inreg %b) { ; GFX7-LABEL: scalar_xnor_v4i16_one_use: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_mov_b32 s8, 0xffff ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 -; GFX7-NEXT: s_and_b32 s0, s0, s8 +; GFX7-NEXT: s_and_b32 s0, s0, 0xffff ; GFX7-NEXT: s_or_b32 s0, s1, s0 ; GFX7-NEXT: s_lshl_b32 s1, s3, 16 -; GFX7-NEXT: s_and_b32 s2, s2, s8 +; GFX7-NEXT: s_and_b32 s2, s2, 0xffff ; GFX7-NEXT: s_or_b32 s1, s1, s2 ; GFX7-NEXT: s_lshl_b32 s2, s5, 16 -; GFX7-NEXT: s_and_b32 s3, s4, s8 +; GFX7-NEXT: s_and_b32 s3, s4, 0xffff ; GFX7-NEXT: s_or_b32 s2, s2, s3 ; GFX7-NEXT: s_lshl_b32 s3, s7, 16 -; GFX7-NEXT: s_and_b32 s4, s6, s8 +; GFX7-NEXT: s_and_b32 s4, s6, 0xffff ; GFX7-NEXT: s_or_b32 s3, s3, s4 ; GFX7-NEXT: s_mov_b32 s4, -1 ; GFX7-NEXT: s_mov_b32 s5, s4 @@ -142,16 +140,16 @@ ; GFX8-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] ; GFX8-NEXT: s_mov_b32 s5, s4 ; GFX8-NEXT: s_lshr_b32 s3, s0, 16 -; GFX8-NEXT: s_and_b32 s2, s0, s4 +; GFX8-NEXT: s_and_b32 s2, s0, 0xffff ; GFX8-NEXT: s_lshr_b32 s7, s1, 16 -; GFX8-NEXT: s_and_b32 s6, s1, s4 +; GFX8-NEXT: s_and_b32 s6, s1, 0xffff ; GFX8-NEXT: s_xor_b64 s[0:1], s[2:3], s[4:5] ; GFX8-NEXT: s_xor_b64 s[2:3], s[6:7], s[4:5] ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_and_b32 s0, s0, s4 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_or_b32 s0, s1, s0 ; GFX8-NEXT: s_lshl_b32 s1, s3, 16 -; GFX8-NEXT: s_and_b32 s2, s2, s4 +; GFX8-NEXT: s_and_b32 s2, s2, 0xffff ; GFX8-NEXT: 
s_or_b32 s1, s1, s2 ; GFX8-NEXT: ; return to shader part epilog ; diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll @@ -150,7 +150,7 @@ ; GCN-NEXT: v_mul_hi_u32 v2, v2, s4 ; GCN-NEXT: v_mul_lo_u32 v3, v2, v0 ; GCN-NEXT: v_add_u32_e32 v4, vcc, 1, v2 -; GCN-NEXT: v_sub_u32_e32 v3, vcc, s4, v3 +; GCN-NEXT: v_sub_u32_e32 v3, vcc, 0xf4240, v3 ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v3, v0 ; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GCN-NEXT: v_sub_u32_e64 v4, s[4:5], v3, v0 @@ -232,7 +232,7 @@ ; GCN-NEXT: v_mul_hi_u32 v2, v2, s4 ; GCN-NEXT: v_mul_lo_u32 v3, v2, v0 ; GCN-NEXT: v_add_u32_e32 v4, vcc, 1, v2 -; GCN-NEXT: v_sub_u32_e32 v3, vcc, s4, v3 +; GCN-NEXT: v_sub_u32_e32 v3, vcc, 0xf4240, v3 ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v3, v0 ; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GCN-NEXT: v_sub_u32_e64 v4, s[4:5], v3, v0 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll @@ -3240,16 +3240,16 @@ ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_and_b32 s9, s2, s8 +; GFX6-NEXT: s_and_b32 s9, s2, 0xffff ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s9 ; GFX6-NEXT: s_lshr_b32 s9, s0, 16 -; GFX6-NEXT: s_and_b32 s0, s0, s8 +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: s_lshr_b32 s2, s2, 16 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s2 ; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s9 -; GFX6-NEXT: s_and_b32 s2, s3, s8 +; GFX6-NEXT: s_and_b32 s2, s3, 0xffff ; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX6-NEXT: 
v_rcp_iflag_f32_e32 v5, v3 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2 @@ -3263,7 +3263,7 @@ ; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s2 ; GFX6-NEXT: s_lshr_b32 s0, s1, 16 ; GFX6-NEXT: s_lshr_b32 s10, s3, 16 -; GFX6-NEXT: s_and_b32 s1, s1, s8 +; GFX6-NEXT: s_and_b32 s1, s1, 0xffff ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s1 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v4 @@ -3297,19 +3297,18 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX9-NEXT: s_mov_b32 s8, 0xffff ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s1, s6, s8 +; GFX9-NEXT: s_and_b32 s1, s6, 0xffff ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s1 ; GFX9-NEXT: s_lshr_b32 s0, s4, 16 -; GFX9-NEXT: s_and_b32 s4, s4, s8 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s4 ; GFX9-NEXT: s_lshr_b32 s4, s6, 16 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 ; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s4 ; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s0 -; GFX9-NEXT: s_and_b32 s0, s7, s8 +; GFX9-NEXT: s_and_b32 s0, s7, 0xffff ; GFX9-NEXT: v_mul_f32_e32 v3, v1, v3 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v4 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3 @@ -3322,7 +3321,7 @@ ; GFX9-NEXT: v_mad_f32 v3, -v1, v4, v5 ; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s0 ; GFX9-NEXT: s_lshr_b32 s6, s7, 16 -; GFX9-NEXT: s_and_b32 s0, s5, s8 +; GFX9-NEXT: s_and_b32 s0, s5, 0xffff ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v5 @@ -3356,19 +3355,18 @@ ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX90A-NEXT: s_mov_b32 s8, 0xffff ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_and_b32 s1, s6, s8 +; GFX90A-NEXT: s_and_b32 s1, s6, 0xffff ; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s1 ; GFX90A-NEXT: s_lshr_b32 s0, s4, 16 -; GFX90A-NEXT: s_and_b32 s4, s4, s8 +; 
GFX90A-NEXT: s_and_b32 s4, s4, 0xffff ; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s4 ; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 ; GFX90A-NEXT: v_rcp_iflag_f32_e32 v3, v0 ; GFX90A-NEXT: v_cvt_f32_u32_e32 v4, s4 ; GFX90A-NEXT: v_cvt_f32_u32_e32 v5, s0 -; GFX90A-NEXT: s_and_b32 s0, s7, s8 +; GFX90A-NEXT: s_and_b32 s0, s7, 0xffff ; GFX90A-NEXT: v_mul_f32_e32 v3, v1, v3 ; GFX90A-NEXT: v_rcp_iflag_f32_e32 v6, v4 ; GFX90A-NEXT: v_trunc_f32_e32 v3, v3 @@ -3381,7 +3379,7 @@ ; GFX90A-NEXT: v_mad_f32 v3, -v1, v4, v5 ; GFX90A-NEXT: v_cvt_f32_u32_e32 v5, s0 ; GFX90A-NEXT: s_lshr_b32 s6, s7, 16 -; GFX90A-NEXT: s_and_b32 s0, s5, s8 +; GFX90A-NEXT: s_and_b32 s0, s5, 0xffff ; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX90A-NEXT: v_cvt_f32_u32_e32 v6, s0 ; GFX90A-NEXT: v_rcp_iflag_f32_e32 v7, v5 @@ -3516,9 +3514,9 @@ ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_and_b32 s9, s2, s8 +; GFX6-NEXT: s_and_b32 s9, s2, 0xffff ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s9 -; GFX6-NEXT: s_and_b32 s10, s0, s8 +; GFX6-NEXT: s_and_b32 s10, s0, 0xffff ; GFX6-NEXT: s_lshr_b32 s11, s2, 16 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s10 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 @@ -3538,10 +3536,10 @@ ; GFX6-NEXT: v_mad_f32 v1, -v1, v3, v4 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, s2 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v3 -; GFX6-NEXT: s_and_b32 s2, s3, s8 +; GFX6-NEXT: s_and_b32 s2, s3, 0xffff ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s2 -; GFX6-NEXT: s_and_b32 s2, s1, s8 +; GFX6-NEXT: s_and_b32 s2, s1, 0xffff ; GFX6-NEXT: v_mul_lo_u32 v1, v1, s11 ; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s2 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2 @@ -3581,16 +3579,15 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX9-NEXT: s_mov_b32 s8, 0xffff ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s1, s6, s8 +; GFX9-NEXT: s_and_b32 s1, s6, 
0xffff ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s1 -; GFX9-NEXT: s_and_b32 s9, s4, s8 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9 -; GFX9-NEXT: s_lshr_b32 s9, s6, 16 +; GFX9-NEXT: s_and_b32 s8, s4, 0xffff +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s8 +; GFX9-NEXT: s_lshr_b32 s8, s6, 16 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 -; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s9 +; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s8 ; GFX9-NEXT: s_lshr_b32 s0, s4, 16 ; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s0 ; GFX9-NEXT: v_mul_f32_e32 v3, v1, v3 @@ -3599,19 +3596,19 @@ ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v4 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 -; GFX9-NEXT: s_lshr_b32 s10, s7, 16 +; GFX9-NEXT: s_lshr_b32 s9, s7, 16 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc ; GFX9-NEXT: v_mul_f32_e32 v1, v5, v6 ; GFX9-NEXT: v_mul_lo_u32 v0, v0, s6 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 -; GFX9-NEXT: s_and_b32 s6, s7, s8 +; GFX9-NEXT: s_and_b32 s6, s7, 0xffff ; GFX9-NEXT: v_mad_f32 v3, -v1, v4, v5 ; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s6 -; GFX9-NEXT: s_and_b32 s6, s5, s8 +; GFX9-NEXT: s_and_b32 s6, s5, 0xffff ; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s6 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v5 -; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s10 +; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s9 ; GFX9-NEXT: s_lshr_b32 s1, s5, 16 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_mul_f32_e32 v3, v6, v7 @@ -3629,9 +3626,9 @@ ; GFX9-NEXT: v_mad_f32 v5, -v5, v4, v7 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v6, vcc -; GFX9-NEXT: v_mul_lo_u32 v1, v1, s9 +; GFX9-NEXT: v_mul_lo_u32 v1, v1, s8 ; GFX9-NEXT: v_mul_lo_u32 v3, v3, s7 -; GFX9-NEXT: v_mul_lo_u32 v4, v4, s10 +; GFX9-NEXT: v_mul_lo_u32 v4, v4, s9 ; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 ; GFX9-NEXT: v_sub_u32_e32 v5, s0, v1 ; GFX9-NEXT: v_sub_u32_e32 v1, s5, v3 @@ -3648,16 +3645,15 @@ ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dwordx4 
s[4:7], s[0:1], 0x2c -; GFX90A-NEXT: s_mov_b32 s8, 0xffff ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_and_b32 s1, s6, s8 +; GFX90A-NEXT: s_and_b32 s1, s6, 0xffff ; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s1 -; GFX90A-NEXT: s_and_b32 s9, s4, s8 -; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s9 -; GFX90A-NEXT: s_lshr_b32 s9, s6, 16 +; GFX90A-NEXT: s_and_b32 s8, s4, 0xffff +; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s8 +; GFX90A-NEXT: s_lshr_b32 s8, s6, 16 ; GFX90A-NEXT: v_rcp_iflag_f32_e32 v3, v0 -; GFX90A-NEXT: v_cvt_f32_u32_e32 v4, s9 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v4, s8 ; GFX90A-NEXT: s_lshr_b32 s0, s4, 16 ; GFX90A-NEXT: v_cvt_f32_u32_e32 v5, s0 ; GFX90A-NEXT: v_mul_f32_e32 v3, v1, v3 @@ -3666,24 +3662,24 @@ ; GFX90A-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX90A-NEXT: v_rcp_iflag_f32_e32 v6, v4 ; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 -; GFX90A-NEXT: s_lshr_b32 s10, s7, 16 +; GFX90A-NEXT: s_lshr_b32 s9, s7, 16 ; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc ; GFX90A-NEXT: v_mul_lo_u32 v0, v0, s6 ; GFX90A-NEXT: v_mul_f32_e32 v1, v5, v6 ; GFX90A-NEXT: v_sub_u32_e32 v0, s4, v0 ; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 -; GFX90A-NEXT: s_and_b32 s4, s7, s8 +; GFX90A-NEXT: s_and_b32 s4, s7, 0xffff ; GFX90A-NEXT: v_mad_f32 v3, -v1, v4, v5 ; GFX90A-NEXT: v_cvt_f32_u32_e32 v5, s4 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX90A-NEXT: s_and_b32 s4, s5, s8 +; GFX90A-NEXT: s_and_b32 s4, s5, 0xffff ; GFX90A-NEXT: v_cvt_f32_u32_e32 v6, s4 ; GFX90A-NEXT: v_rcp_iflag_f32_e32 v7, v5 ; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 -; GFX90A-NEXT: v_cvt_f32_u32_e32 v4, s10 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v4, s9 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX90A-NEXT: s_lshr_b32 s1, s5, 16 -; GFX90A-NEXT: v_mul_lo_u32 v1, v1, s9 +; GFX90A-NEXT: v_mul_lo_u32 v1, v1, s8 ; GFX90A-NEXT: v_sub_u32_e32 v3, s0, v1 ; GFX90A-NEXT: v_mul_f32_e32 v1, v6, v7 ; GFX90A-NEXT: v_cvt_f32_u32_e32 v7, s1 @@ -3701,7 +3697,7 @@ ; GFX90A-NEXT: v_mul_lo_u32 v1, v1, 
s7 ; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v6, vcc ; GFX90A-NEXT: v_sub_u32_e32 v1, s5, v1 -; GFX90A-NEXT: v_mul_lo_u32 v4, v4, s10 +; GFX90A-NEXT: v_mul_lo_u32 v4, v4, s9 ; GFX90A-NEXT: v_mov_b32_e32 v5, 0xffff ; GFX90A-NEXT: v_sub_u32_e32 v4, s1, v4 ; GFX90A-NEXT: v_and_b32_e32 v1, v5, v1 @@ -4923,31 +4919,30 @@ ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; GFX6-NEXT: s_mov_b32 s8, 0xffff ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_and_b32 s6, s0, s8 +; GFX6-NEXT: s_and_b32 s8, s2, 0xffff +; GFX6-NEXT: s_and_b32 s6, s0, 0xffff ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s6 -; GFX6-NEXT: s_and_b32 s6, s2, s8 ; GFX6-NEXT: s_lshr_b32 s0, s0, 16 -; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s6 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s8 ; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s0 -; GFX6-NEXT: s_lshr_b32 s0, s2, 16 -; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s0 -; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GFX6-NEXT: s_lshr_b32 s2, s2, 16 +; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s2 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v3 +; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2 ; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, v4, v5 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 -; GFX6-NEXT: s_and_b32 s0, s1, s8 +; GFX6-NEXT: s_and_b32 s0, s1, 0xffff ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc ; GFX6-NEXT: v_mad_f32 v2, -v1, v3, v4 ; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s0 -; GFX6-NEXT: s_and_b32 s0, s3, s8 +; GFX6-NEXT: s_and_b32 s0, s3, 0xffff ; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v4 @@ -4961,7 +4956,7 @@ ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: 
v_addc_u32_e32 v2, vcc, 0, v3, vcc -; GFX6-NEXT: v_and_b32_e32 v0, s8, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: buffer_store_short v2, off, s[4:7], 0 offset:4 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -4972,46 +4967,45 @@ ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 -; GFX9-NEXT: s_mov_b32 s8, 0xffff -; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s0, s6, s8 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GFX9-NEXT: s_and_b32 s0, s4, s8 +; GFX9-NEXT: s_and_b32 s0, s4, 0xffff +; GFX9-NEXT: s_and_b32 s1, s6, 0xffff +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s1 ; GFX9-NEXT: s_lshr_b32 s1, s6, 16 +; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s1 ; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s0 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 -; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s1 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v1 ; GFX9-NEXT: s_lshr_b32 s0, s4, 16 ; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s0 -; GFX9-NEXT: v_mul_f32_e32 v3, v2, v3 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v4 -; GFX9-NEXT: v_trunc_f32_e32 v3, v3 -; GFX9-NEXT: v_mad_f32 v2, -v3, v0, v2 -; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v3 +; GFX9-NEXT: v_mul_f32_e32 v4, v2, v4 +; GFX9-NEXT: v_trunc_f32_e32 v4, v4 +; GFX9-NEXT: v_mad_f32 v2, -v4, v1, v2 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v1 ; GFX9-NEXT: v_mul_f32_e32 v2, v5, v6 ; GFX9-NEXT: v_trunc_f32_e32 v2, v2 -; GFX9-NEXT: s_and_b32 s0, s7, s8 -; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc -; GFX9-NEXT: v_mad_f32 v3, -v2, v4, v5 +; GFX9-NEXT: s_and_b32 s0, s7, 0xffff +; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v4 +; GFX9-NEXT: v_mad_f32 v4, -v2, v3, v5 ; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s0 -; GFX9-NEXT: s_and_b32 s0, s5, s8 +; GFX9-NEXT: s_and_b32 s0, s5, 0xffff +; GFX9-NEXT: 
v_addc_co_u32_e32 v1, vcc, 0, v7, vcc ; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v5 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc +; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v3 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: v_mul_f32_e32 v3, v6, v7 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3 ; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v3 +; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc ; GFX9-NEXT: v_mad_f32 v3, -v3, v5, v6 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5 -; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc -; GFX9-NEXT: global_store_short v1, v3, s[2:3] offset:4 -; GFX9-NEXT: global_store_dword v1, v0, s[2:3] +; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX9-NEXT: global_store_short v0, v3, s[2:3] offset:4 +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX90A-LABEL: udiv_v3i16: @@ -5019,46 +5013,45 @@ ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 -; GFX90A-NEXT: s_mov_b32 s8, 0xffff -; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_and_b32 s0, s6, s8 -; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GFX90A-NEXT: s_and_b32 s0, s4, s8 +; GFX90A-NEXT: s_and_b32 s0, s4, 0xffff +; GFX90A-NEXT: s_and_b32 s1, s6, 0xffff +; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s1 ; GFX90A-NEXT: s_lshr_b32 s1, s6, 16 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v3, s1 ; GFX90A-NEXT: v_cvt_f32_u32_e32 v2, s0 -; GFX90A-NEXT: v_rcp_iflag_f32_e32 v3, v0 -; GFX90A-NEXT: v_cvt_f32_u32_e32 v4, s1 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v4, v1 ; GFX90A-NEXT: s_lshr_b32 s0, s4, 16 ; GFX90A-NEXT: v_cvt_f32_u32_e32 v5, s0 -; GFX90A-NEXT: v_mul_f32_e32 v3, v2, v3 -; 
GFX90A-NEXT: v_rcp_iflag_f32_e32 v6, v4 -; GFX90A-NEXT: v_trunc_f32_e32 v3, v3 -; GFX90A-NEXT: v_mad_f32 v2, -v3, v0, v2 -; GFX90A-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v6, v3 +; GFX90A-NEXT: v_mul_f32_e32 v4, v2, v4 +; GFX90A-NEXT: v_trunc_f32_e32 v4, v4 +; GFX90A-NEXT: v_mad_f32 v2, -v4, v1, v2 +; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v1 ; GFX90A-NEXT: v_mul_f32_e32 v2, v5, v6 ; GFX90A-NEXT: v_trunc_f32_e32 v2, v2 -; GFX90A-NEXT: s_and_b32 s0, s7, s8 -; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc -; GFX90A-NEXT: v_mad_f32 v3, -v2, v4, v5 +; GFX90A-NEXT: s_and_b32 s0, s7, 0xffff +; GFX90A-NEXT: v_cvt_u32_f32_e32 v7, v4 +; GFX90A-NEXT: v_mad_f32 v4, -v2, v3, v5 ; GFX90A-NEXT: v_cvt_f32_u32_e32 v5, s0 -; GFX90A-NEXT: s_and_b32 s0, s5, s8 +; GFX90A-NEXT: s_and_b32 s0, s5, 0xffff +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v7, vcc ; GFX90A-NEXT: v_cvt_f32_u32_e32 v6, s0 -; GFX90A-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX90A-NEXT: v_rcp_iflag_f32_e32 v7, v5 -; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 -; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc +; GFX90A-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v3 +; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX90A-NEXT: v_mul_f32_e32 v3, v6, v7 ; GFX90A-NEXT: v_trunc_f32_e32 v3, v3 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v4, v3 +; GFX90A-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc ; GFX90A-NEXT: v_mad_f32 v3, -v3, v5, v6 ; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5 -; GFX90A-NEXT: v_lshl_or_b32 v0, v2, 16, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc -; GFX90A-NEXT: global_store_short v1, v3, s[2:3] offset:4 -; GFX90A-NEXT: global_store_dword v1, v0, s[2:3] +; GFX90A-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX90A-NEXT: global_store_short v0, v3, s[2:3] offset:4 +; GFX90A-NEXT: global_store_dword v0, v1, s[2:3] ; GFX90A-NEXT: s_endpgm %r = udiv <3 x i16> %x, 
%y store <3 x i16> %r, <3 x i16> addrspace(1)* %out @@ -5145,9 +5138,9 @@ ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: s_and_b32 s6, s0, s8 +; GFX6-NEXT: s_and_b32 s6, s0, 0xffff ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s6 -; GFX6-NEXT: s_and_b32 s6, s2, s8 +; GFX6-NEXT: s_and_b32 s6, s2, 0xffff ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s6 ; GFX6-NEXT: v_mov_b32_e32 v4, s0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v0 @@ -5165,9 +5158,9 @@ ; GFX6-NEXT: v_mul_lo_u32 v0, v0, s0 ; GFX6-NEXT: v_cvt_f32_u32_e32 v3, v3 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2 -; GFX6-NEXT: s_and_b32 s0, s1, s8 +; GFX6-NEXT: s_and_b32 s0, s1, 0xffff ; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s0 -; GFX6-NEXT: s_and_b32 s0, s3, s8 +; GFX6-NEXT: s_and_b32 s0, s3, 0xffff ; GFX6-NEXT: v_mul_f32_e32 v5, v3, v5 ; GFX6-NEXT: v_trunc_f32_e32 v5, v5 ; GFX6-NEXT: v_cvt_f32_u32_e32 v7, s0 @@ -5197,55 +5190,55 @@ ; ; GFX9-LABEL: urem_v3i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 -; GFX9-NEXT: s_mov_b32 s8, 0xffff +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s0, s4, s8 -; GFX9-NEXT: s_and_b32 s1, s6, s8 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s1 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s0 -; GFX9-NEXT: s_lshr_b32 s6, s6, 16 -; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s6 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 +; GFX9-NEXT: s_and_b32 s6, s2, 0xffff +; GFX9-NEXT: s_and_b32 s7, s4, 0xffff +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 ; GFX9-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s4 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v2 -; GFX9-NEXT: v_mul_f32_e32 v3, v1, v3 -; GFX9-NEXT: v_trunc_f32_e32 v3, v3 -; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v3 -; GFX9-NEXT: v_mad_f32 v1, -v3, v0, v1 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 -; 
GFX9-NEXT: v_mul_f32_e32 v1, v4, v5 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s4 +; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s6 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 +; GFX9-NEXT: s_lshr_b32 s2, s2, 16 +; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s2 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 +; GFX9-NEXT: v_mul_f32_e32 v4, v2, v4 +; GFX9-NEXT: v_trunc_f32_e32 v4, v4 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v4 +; GFX9-NEXT: v_mad_f32 v2, -v4, v0, v2 +; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s5 +; GFX9-NEXT: v_mul_f32_e32 v5, v3, v5 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 +; GFX9-NEXT: v_trunc_f32_e32 v2, v5 +; GFX9-NEXT: s_and_b32 s3, s3, 0xffff ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v6, vcc -; GFX9-NEXT: v_mul_lo_u32 v0, v0, s1 -; GFX9-NEXT: v_trunc_f32_e32 v1, v1 -; GFX9-NEXT: s_and_b32 s1, s7, s8 -; GFX9-NEXT: v_mad_f32 v3, -v1, v2, v4 -; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s1 -; GFX9-NEXT: s_and_b32 s5, s5, s8 -; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s5 -; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX9-NEXT: v_mad_f32 v3, -v2, v1, v3 +; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s3 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v4 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v2 -; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc ; GFX9-NEXT: v_mul_f32_e32 v2, v5, v6 ; GFX9-NEXT: v_trunc_f32_e32 v2, v2 ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v2 ; GFX9-NEXT: v_mad_f32 v2, -v2, v4, v5 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4 -; GFX9-NEXT: v_mul_lo_u32 v1, v1, s6 +; GFX9-NEXT: v_mul_lo_u32 v0, v0, s7 ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v3, vcc -; GFX9-NEXT: v_mul_lo_u32 v2, v2, s1 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: v_mul_lo_u32 v1, v1, s4 +; GFX9-NEXT: v_mul_lo_u32 v2, v2, s5 +; GFX9-NEXT: v_sub_u32_e32 v0, s6, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: v_sub_u32_e32 v1, 
s4, v1 +; GFX9-NEXT: v_sub_u32_e32 v1, s2, v1 +; GFX9-NEXT: v_sub_u32_e32 v2, s3, v2 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_sub_u32_e32 v2, s5, v2 ; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX9-NEXT: global_store_short v3, v2, s[2:3] offset:4 -; GFX9-NEXT: global_store_dword v3, v0, s[2:3] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_short v3, v2, s[0:1] offset:4 +; GFX9-NEXT: global_store_dword v3, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX90A-LABEL: urem_v3i16: @@ -5253,52 +5246,51 @@ ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 -; GFX90A-NEXT: s_mov_b32 s8, 0xffff -; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_and_b32 s1, s4, s8 -; GFX90A-NEXT: s_and_b32 s0, s6, s8 -; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GFX90A-NEXT: v_cvt_f32_u32_e32 v2, s1 +; GFX90A-NEXT: s_and_b32 s0, s4, 0xffff +; GFX90A-NEXT: s_and_b32 s1, s6, 0xffff +; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s1 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v2, s0 ; GFX90A-NEXT: s_lshr_b32 s6, s6, 16 -; GFX90A-NEXT: v_cvt_f32_u32_e32 v4, s6 -; GFX90A-NEXT: v_rcp_iflag_f32_e32 v3, v0 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v3, s6 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v4, v1 ; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 ; GFX90A-NEXT: v_cvt_f32_u32_e32 v5, s4 -; GFX90A-NEXT: v_rcp_iflag_f32_e32 v6, v4 -; GFX90A-NEXT: v_mul_f32_e32 v3, v2, v3 -; GFX90A-NEXT: v_trunc_f32_e32 v3, v3 -; GFX90A-NEXT: v_mad_f32 v2, -v3, v0, v2 -; GFX90A-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v6, v3 +; GFX90A-NEXT: v_mul_f32_e32 v4, v2, v4 +; GFX90A-NEXT: v_trunc_f32_e32 v4, v4 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v7, v4 +; GFX90A-NEXT: v_mad_f32 v2, -v4, v1, v2 +; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v1 ; GFX90A-NEXT: v_mul_f32_e32 v2, v5, v6 +; GFX90A-NEXT: 
v_addc_co_u32_e32 v1, vcc, 0, v7, vcc +; GFX90A-NEXT: v_mul_lo_u32 v1, v1, s1 +; GFX90A-NEXT: v_sub_u32_e32 v1, s0, v1 ; GFX90A-NEXT: v_trunc_f32_e32 v2, v2 -; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc -; GFX90A-NEXT: v_mul_lo_u32 v0, v0, s0 -; GFX90A-NEXT: s_and_b32 s0, s7, s8 -; GFX90A-NEXT: v_mad_f32 v3, -v2, v4, v5 +; GFX90A-NEXT: s_and_b32 s0, s7, 0xffff +; GFX90A-NEXT: v_mad_f32 v4, -v2, v3, v5 ; GFX90A-NEXT: v_cvt_f32_u32_e32 v5, s0 -; GFX90A-NEXT: v_sub_u32_e32 v0, s1, v0 -; GFX90A-NEXT: s_and_b32 s1, s5, s8 +; GFX90A-NEXT: s_and_b32 s1, s5, 0xffff ; GFX90A-NEXT: v_cvt_f32_u32_e32 v6, s1 -; GFX90A-NEXT: v_rcp_iflag_f32_e32 v7, v5 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 -; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v7, v5 +; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v3 +; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX90A-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc ; GFX90A-NEXT: v_mul_f32_e32 v3, v6, v7 ; GFX90A-NEXT: v_trunc_f32_e32 v3, v3 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v4, v3 -; GFX90A-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc ; GFX90A-NEXT: v_mad_f32 v3, -v3, v5, v6 ; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5 -; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc ; GFX90A-NEXT: v_mul_lo_u32 v2, v2, s6 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc ; GFX90A-NEXT: v_mul_lo_u32 v3, v3, s0 ; GFX90A-NEXT: v_sub_u32_e32 v2, s4, v2 ; GFX90A-NEXT: v_sub_u32_e32 v3, s1, v3 -; GFX90A-NEXT: v_lshl_or_b32 v0, v2, 16, v0 -; GFX90A-NEXT: global_store_short v1, v3, s[2:3] offset:4 -; GFX90A-NEXT: global_store_dword v1, v0, s[2:3] +; GFX90A-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX90A-NEXT: global_store_short v0, v3, s[2:3] offset:4 +; GFX90A-NEXT: global_store_dword v0, v1, s[2:3] ; GFX90A-NEXT: s_endpgm %r = urem <3 x i16> %x, %y store <3 x i16> %r, <3 x i16> addrspace(1)* %out @@ -5937,18 +5929,18 @@ ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; 
GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s2 -; GFX6-NEXT: v_alignbit_b32 v0, s3, v0, 30 -; GFX6-NEXT: s_movk_i32 s3, 0x7fff -; GFX6-NEXT: s_and_b32 s9, s0, s3 +; GFX6-NEXT: s_and_b32 s8, s2, 0x7fff +; GFX6-NEXT: s_and_b32 s9, s0, 0x7fff ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9 -; GFX6-NEXT: s_and_b32 s8, s2, s3 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 ; GFX6-NEXT: s_bfe_u32 s0, s0, 0xf000f ; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s8 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v1 ; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: s_bfe_u32 s2, s2, 0xf000f +; GFX6-NEXT: v_alignbit_b32 v0, s3, v0, 30 +; GFX6-NEXT: s_movk_i32 s3, 0x7fff ; GFX6-NEXT: v_alignbit_b32 v2, s1, v2, 30 ; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 ; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s2 @@ -5995,8 +5987,8 @@ ; GFX9-NEXT: s_movk_i32 s8, 0x7fff ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s0, s4, s8 -; GFX9-NEXT: s_and_b32 s1, s6, s8 +; GFX9-NEXT: s_and_b32 s0, s4, 0x7fff +; GFX9-NEXT: s_and_b32 s1, s6, 0x7fff ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s1 ; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s0 ; GFX9-NEXT: s_bfe_u32 s0, s6, 0xf000f @@ -6051,8 +6043,8 @@ ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_and_b32 s0, s4, s8 -; GFX90A-NEXT: s_and_b32 s1, s6, s8 +; GFX90A-NEXT: s_and_b32 s0, s4, 0x7fff +; GFX90A-NEXT: s_and_b32 s1, s6, 0x7fff ; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s1 ; GFX90A-NEXT: v_cvt_f32_u32_e32 v4, s0 ; GFX90A-NEXT: s_bfe_u32 s0, s6, 0xf000f @@ -6182,45 +6174,45 @@ ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s2 -; GFX6-NEXT: v_alignbit_b32 v0, s3, v0, 30 -; GFX6-NEXT: s_movk_i32 s3, 0x7fff -; GFX6-NEXT: s_and_b32 s10, s0, s3 +; GFX6-NEXT: s_and_b32 s9, s2, 0x7fff +; GFX6-NEXT: s_and_b32 s10, s0, 0x7fff ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s10 -; GFX6-NEXT: 
s_and_b32 s9, s2, s3 ; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s9 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v1 ; GFX6-NEXT: v_alignbit_b32 v2, s1, v2, 30 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v1 ; GFX6-NEXT: s_bfe_u32 s1, s0, 0xf000f ; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s1 +; GFX6-NEXT: s_bfe_u32 s10, s2, 0xf000f ; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 ; GFX6-NEXT: v_trunc_f32_e32 v4, v4 ; GFX6-NEXT: v_mad_f32 v3, -v4, v1, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 -; GFX6-NEXT: s_bfe_u32 s10, s2, 0xf000f ; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s10 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v4, vcc ; GFX6-NEXT: v_mul_lo_u32 v1, v1, s0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v5 +; GFX6-NEXT: v_alignbit_b32 v0, s3, v0, 30 +; GFX6-NEXT: s_movk_i32 s3, 0x7fff ; GFX6-NEXT: v_and_b32_e32 v2, s3, v2 -; GFX6-NEXT: v_and_b32_e32 v0, s3, v0 ; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s2, v1 ; GFX6-NEXT: v_mul_f32_e32 v1, v3, v4 ; GFX6-NEXT: v_cvt_f32_u32_e32 v4, v2 +; GFX6-NEXT: v_and_b32_e32 v0, s3, v0 ; GFX6-NEXT: v_cvt_f32_u32_e32 v7, v0 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 -; GFX6-NEXT: v_mad_f32 v3, -v1, v5, v3 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v8, v4 +; GFX6-NEXT: v_mad_f32 v3, -v1, v5, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5 -; GFX6-NEXT: s_lshr_b32 s0, s0, 15 ; GFX6-NEXT: v_mul_f32_e32 v3, v7, v8 ; GFX6-NEXT: v_trunc_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v5, v3 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX6-NEXT: v_mad_f32 v3, -v3, v4, v7 +; GFX6-NEXT: s_lshr_b32 s0, s0, 15 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 ; GFX6-NEXT: v_mul_lo_u32 v1, v1, s0 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc @@ -6250,9 +6242,9 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_alignbit_b32 v0, s5, v0, 30 -; GFX9-NEXT: s_and_b32 s5, s6, s8 +; GFX9-NEXT: s_and_b32 s5, s6, 0x7fff ; GFX9-NEXT: v_cvt_f32_u32_e32 
v1, s5 -; GFX9-NEXT: s_and_b32 s0, s4, s8 +; GFX9-NEXT: s_and_b32 s0, s4, 0x7fff ; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s0 ; GFX9-NEXT: s_bfe_u32 s5, s6, 0xf000f ; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 @@ -6312,8 +6304,8 @@ ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_and_b32 s1, s4, s8 -; GFX90A-NEXT: s_and_b32 s9, s6, s8 +; GFX90A-NEXT: s_and_b32 s1, s4, 0x7fff +; GFX90A-NEXT: s_and_b32 s9, s6, 0x7fff ; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s9 ; GFX90A-NEXT: v_cvt_f32_u32_e32 v4, s1 ; GFX90A-NEXT: v_mov_b32_e32 v3, s6 @@ -6756,24 +6748,21 @@ ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s2 -; GFX6-NEXT: v_alignbit_b32 v0, s3, v0, 30 -; GFX6-NEXT: s_movk_i32 s3, 0x7fff -; GFX6-NEXT: s_and_b32 s11, s0, s3 +; GFX6-NEXT: s_and_b32 s9, s2, 0x7fff +; GFX6-NEXT: s_and_b32 s11, s0, 0x7fff ; GFX6-NEXT: s_bfe_i32 s11, s11, 0xf0000 ; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s11 -; GFX6-NEXT: s_and_b32 s9, s2, s3 ; GFX6-NEXT: s_bfe_i32 s9, s9, 0xf0000 ; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s9 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2 ; GFX6-NEXT: s_xor_b32 s9, s9, s11 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2 ; GFX6-NEXT: s_ashr_i32 s9, s9, 30 ; GFX6-NEXT: s_or_b32 s9, s9, 1 +; GFX6-NEXT: v_mov_b32_e32 v5, s9 ; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 ; GFX6-NEXT: v_trunc_f32_e32 v4, v4 ; GFX6-NEXT: v_mad_f32 v3, -v4, v2, v3 ; GFX6-NEXT: v_cvt_i32_f32_e32 v4, v4 -; GFX6-NEXT: v_mov_b32_e32 v5, s9 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2| ; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc ; GFX6-NEXT: v_mov_b32_e32 v1, s0 @@ -6785,6 +6774,7 @@ ; GFX6-NEXT: s_bfe_i32 s0, s12, 0xf0000 ; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s0 ; GFX6-NEXT: s_bfe_u32 s10, s2, 0xf000f +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: s_lshr_b32 s8, s2, 15 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s2, v2 ; GFX6-NEXT: s_bfe_i32 s2, s10, 0xf0000 @@ -6792,11 +6782,13 @@ ; 
GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v3 ; GFX6-NEXT: s_xor_b32 s0, s2, s0 ; GFX6-NEXT: s_ashr_i32 s0, s0, 30 -; GFX6-NEXT: s_or_b32 s0, s0, 1 +; GFX6-NEXT: v_alignbit_b32 v0, s3, v0, 30 ; GFX6-NEXT: v_mul_f32_e32 v5, v4, v5 ; GFX6-NEXT: v_trunc_f32_e32 v5, v5 ; GFX6-NEXT: v_mad_f32 v4, -v5, v3, v4 ; GFX6-NEXT: v_cvt_i32_f32_e32 v5, v5 +; GFX6-NEXT: s_movk_i32 s3, 0x7fff +; GFX6-NEXT: s_or_b32 s0, s0, 1 ; GFX6-NEXT: v_and_b32_e32 v1, s3, v1 ; GFX6-NEXT: v_mov_b32_e32 v6, s0 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v3| @@ -6841,8 +6833,8 @@ ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 ; GFX9-NEXT: s_movk_i32 s8, 0x7fff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s0, s4, s8 -; GFX9-NEXT: s_and_b32 s1, s6, s8 +; GFX9-NEXT: s_and_b32 s0, s4, 0x7fff +; GFX9-NEXT: s_and_b32 s1, s6, 0x7fff ; GFX9-NEXT: s_bfe_i32 s1, s1, 0xf0000 ; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s1 ; GFX9-NEXT: s_bfe_i32 s0, s0, 0xf0000 @@ -6926,8 +6918,8 @@ ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_and_b32 s0, s4, s8 -; GFX90A-NEXT: s_and_b32 s1, s6, s8 +; GFX90A-NEXT: s_and_b32 s0, s4, 0x7fff +; GFX90A-NEXT: s_and_b32 s1, s6, 0x7fff ; GFX90A-NEXT: s_bfe_i32 s1, s1, 0xf0000 ; GFX90A-NEXT: v_cvt_f32_i32_e32 v3, s1 ; GFX90A-NEXT: s_bfe_i32 s0, s0, 0xf0000 @@ -7350,13 +7342,12 @@ ; GFX6-LABEL: udiv_v2i32_pow2_shl_denom: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; GFX6-NEXT: s_movk_i32 s4, 0x1000 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b32 s8, s4, s2 +; GFX6-NEXT: s_lshl_b32 s8, 0x1000, s2 ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GFX6-NEXT: s_lshl_b32 s9, s4, s3 +; GFX6-NEXT: s_lshl_b32 s9, 0x1000, s3 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb @@ -7404,106 +7395,105 @@ ; GFX9-LABEL: udiv_v2i32_pow2_shl_denom: ; GFX9: ; 
%bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_movk_i32 s4, 0x1000 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s5, s4, s3 -; GFX9-NEXT: s_lshl_b32 s4, s4, s2 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s5 +; GFX9-NEXT: s_lshl_b32 s7, 0x1000, s2 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GFX9-NEXT: s_lshl_b32 s6, 0x1000, s3 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 ; GFX9-NEXT: s_mov_b32 s2, 0x4f7ffffe -; GFX9-NEXT: s_sub_i32 s3, 0, s5 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: s_sub_i32 s3, 0, s6 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX9-NEXT: v_mul_f32_e32 v0, s2, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, s2, v1 -; GFX9-NEXT: s_sub_i32 s2, 0, s4 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX9-NEXT: s_sub_i32 s2, 0, s7 ; GFX9-NEXT: v_mul_lo_u32 v2, s2, v0 ; GFX9-NEXT: v_mul_lo_u32 v3, s3, v1 -; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mul_hi_u32 v0, s2, v0 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 ; GFX9-NEXT: v_mul_hi_u32 v1, s3, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: v_mul_lo_u32 v3, v0, s4 +; GFX9-NEXT: v_mul_lo_u32 v3, v0, s7 ; GFX9-NEXT: v_add_u32_e32 v5, 1, v0 -; GFX9-NEXT: v_mul_lo_u32 v4, v1, s5 +; GFX9-NEXT: v_mul_lo_u32 v4, v1, s6 ; GFX9-NEXT: v_add_u32_e32 v6, 1, v1 ; GFX9-NEXT: v_sub_u32_e32 v3, s2, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc -; GFX9-NEXT: v_subrev_u32_e32 v5, s4, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GFX9-NEXT: v_subrev_u32_e32 v5, s7, v3 ; GFX9-NEXT: v_sub_u32_e32 v4, s3, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, 
v5, vcc +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v4 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 +; GFX9-NEXT: v_subrev_u32_e32 v3, s6, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[0:1] ; GFX9-NEXT: v_add_u32_e32 v5, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v3, v4, v3, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v4 -; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc ; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX90A-LABEL: udiv_v2i32_pow2_shl_denom: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX90A-NEXT: s_movk_i32 s8, 0x1000 -; GFX90A-NEXT: s_mov_b32 s9, 0x4f7ffffe -; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x2c ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_lshl_b32 s2, s8, s2 -; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX90A-NEXT: s_lshl_b32 s0, s8, s3 -; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s0 -; GFX90A-NEXT: s_sub_i32 s1, 0, s2 +; GFX90A-NEXT: s_lshl_b32 s6, 0x1000, s2 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s6 +; GFX90A-NEXT: s_lshl_b32 s7, 0x1000, s3 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GFX90A-NEXT: s_mov_b32 s1, 0x4f7ffffe ; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX90A-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX90A-NEXT: v_mul_f32_e32 v0, s9, v0 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s7 +; GFX90A-NEXT: s_sub_i32 s0, 0, s6 +; GFX90A-NEXT: v_mul_f32_e32 v0, s1, v0 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX90A-NEXT: v_mul_f32_e32 v1, 
s9, v1 -; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX90A-NEXT: v_mul_lo_u32 v3, s1, v0 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GFX90A-NEXT: v_mul_lo_u32 v3, s0, v0 ; GFX90A-NEXT: v_mul_hi_u32 v3, v0, v3 ; GFX90A-NEXT: v_add_u32_e32 v0, v0, v3 -; GFX90A-NEXT: v_mul_hi_u32 v0, s6, v0 -; GFX90A-NEXT: v_mul_lo_u32 v3, v0, s2 -; GFX90A-NEXT: v_sub_u32_e32 v3, s6, v3 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_mul_hi_u32 v0, s4, v0 +; GFX90A-NEXT: v_mul_f32_e32 v1, s1, v1 +; GFX90A-NEXT: v_mul_lo_u32 v3, v0, s6 +; GFX90A-NEXT: v_sub_u32_e32 v3, s4, v3 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX90A-NEXT: v_add_u32_e32 v4, 1, v0 -; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s2, v3 +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 ; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX90A-NEXT: v_subrev_u32_e32 v4, s2, v3 +; GFX90A-NEXT: v_subrev_u32_e32 v4, s6, v3 ; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX90A-NEXT: s_sub_i32 s1, 0, s0 -; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s2, v3 -; GFX90A-NEXT: v_mul_lo_u32 v3, s1, v1 +; GFX90A-NEXT: s_sub_i32 s0, 0, s7 +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 +; GFX90A-NEXT: v_mul_lo_u32 v3, s0, v1 ; GFX90A-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX90A-NEXT: v_add_u32_e32 v1, v1, v3 -; GFX90A-NEXT: v_mul_hi_u32 v1, s7, v1 -; GFX90A-NEXT: v_mul_lo_u32 v3, v1, s0 +; GFX90A-NEXT: v_mul_hi_u32 v1, s5, v1 +; GFX90A-NEXT: v_mul_lo_u32 v3, v1, s7 ; GFX90A-NEXT: v_add_u32_e32 v4, 1, v0 -; GFX90A-NEXT: v_sub_u32_e32 v3, s7, v3 +; GFX90A-NEXT: v_sub_u32_e32 v3, s5, v3 ; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX90A-NEXT: v_add_u32_e32 v4, 1, v1 -; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s0, v3 +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 ; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX90A-NEXT: v_subrev_u32_e32 v4, s0, v3 +; GFX90A-NEXT: v_subrev_u32_e32 v4, s7, v3 ; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX90A-NEXT: v_add_u32_e32 v4, 1, v1 -; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s0, v3 +; GFX90A-NEXT: 
v_cmp_le_u32_e32 vcc, s7, v3 ; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX90A-NEXT: s_endpgm %shl.y = shl <2 x i32> , %y %r = udiv <2 x i32> %x, %shl.y @@ -7685,12 +7675,11 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; GFX6-NEXT: s_movk_i32 s2, 0xfff ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_and_b32 s0, s0, s2 -; GFX6-NEXT: s_and_b32 s1, s1, s2 +; GFX6-NEXT: s_and_b32 s0, s0, 0xfff +; GFX6-NEXT: s_and_b32 s1, s1, 0xfff ; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 @@ -7700,13 +7689,12 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c -; GFX9-NEXT: s_movk_i32 s0, 0xfff ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s1, s4, s0 -; GFX9-NEXT: s_and_b32 s0, s5, s0 -; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_and_b32 s0, s4, 0xfff +; GFX9-NEXT: s_and_b32 s1, s5, 0xfff +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; @@ -7714,13 +7702,12 @@ ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c -; GFX90A-NEXT: s_movk_i32 s0, 0xfff ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_and_b32 s1, s4, s0 -; GFX90A-NEXT: s_and_b32 s0, s5, s0 -; GFX90A-NEXT: v_mov_b32_e32 v0, s1 -; GFX90A-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-NEXT: s_and_b32 s0, s4, 0xfff +; GFX90A-NEXT: s_and_b32 s1, s5, 0xfff +; GFX90A-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NEXT: v_mov_b32_e32 v1, s1 ; GFX90A-NEXT: 
global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX90A-NEXT: s_endpgm %r = urem <2 x i32> %x, @@ -7797,14 +7784,13 @@ ; GFX6-LABEL: urem_v2i32_pow2_shl_denom: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; GFX6-NEXT: s_movk_i32 s4, 0x1000 ; GFX6-NEXT: s_mov_b32 s5, 0x4f7ffffe ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b32 s2, s4, s2 +; GFX6-NEXT: s_lshl_b32 s2, 0x1000, s2 ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX6-NEXT: s_lshl_b32 s3, s4, s3 +; GFX6-NEXT: s_lshl_b32 s3, 0x1000, s3 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s3 ; GFX6-NEXT: s_sub_i32 s4, 0, s2 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 @@ -7847,21 +7833,20 @@ ; GFX9-LABEL: urem_v2i32_pow2_shl_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_movk_i32 s4, 0x1000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s5, s4, s3 -; GFX9-NEXT: s_lshl_b32 s4, s4, s2 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s5 +; GFX9-NEXT: s_lshl_b32 s5, 0x1000, s2 +; GFX9-NEXT: s_lshl_b32 s4, 0x1000, s3 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s5 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s4 ; GFX9-NEXT: s_mov_b32 s2, 0x4f7ffffe -; GFX9-NEXT: s_sub_i32 s3, 0, s5 +; GFX9-NEXT: s_sub_i32 s3, 0, s4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX9-NEXT: v_mul_f32_e32 v0, s2, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, s2, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: s_sub_i32 s2, 0, s4 +; GFX9-NEXT: s_sub_i32 s2, 0, s5 ; GFX9-NEXT: v_mul_lo_u32 v2, s2, v0 ; GFX9-NEXT: v_mul_lo_u32 v3, s3, v1 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c @@ -7874,71 +7859,70 @@ ; GFX9-NEXT: v_mul_hi_u32 v0, s2, v0 ; GFX9-NEXT: v_mul_hi_u32 v1, s3, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: v_mul_lo_u32 v0, v0, s4 -; GFX9-NEXT: v_mul_lo_u32 v1, v1, s5 +; GFX9-NEXT: v_mul_lo_u32 v0, v0, s5 +; GFX9-NEXT: 
v_mul_lo_u32 v1, v1, s4 ; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 ; GFX9-NEXT: v_sub_u32_e32 v1, s3, v1 -; GFX9-NEXT: v_subrev_u32_e32 v3, s4, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 -; GFX9-NEXT: v_subrev_u32_e32 v4, s5, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v0 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v0 +; GFX9-NEXT: v_subrev_u32_e32 v4, s4, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s4, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 -; GFX9-NEXT: v_subrev_u32_e32 v4, s5, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v0 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v0 +; GFX9-NEXT: v_subrev_u32_e32 v4, s4, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX90A-LABEL: urem_v2i32_pow2_shl_denom: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX90A-NEXT: s_movk_i32 s8, 0x1000 -; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x2c +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 +; GFX90A-NEXT: s_mov_b32 s8, 0x4f7ffffe ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_lshl_b32 s2, s8, s2 -; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX90A-NEXT: s_lshl_b32 s0, s8, s3 -; GFX90A-NEXT: s_mov_b32 s3, 0x4f7ffffe -; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s0 +; GFX90A-NEXT: s_lshl_b32 s0, 0x1000, s6 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GFX90A-NEXT: s_lshl_b32 s1, 0x1000, s7 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s1 +; GFX90A-NEXT: s_sub_i32 s6, 0, s0 ; GFX90A-NEXT: 
v_rcp_iflag_f32_e32 v0, v0 -; GFX90A-NEXT: s_sub_i32 s1, 0, s2 +; GFX90A-NEXT: s_sub_i32 s7, 0, s1 ; GFX90A-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX90A-NEXT: v_mul_f32_e32 v0, s3, v0 +; GFX90A-NEXT: v_mul_f32_e32 v0, s8, v0 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX90A-NEXT: v_mul_f32_e32 v1, s3, v1 +; GFX90A-NEXT: v_mul_f32_e32 v1, s8, v1 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX90A-NEXT: v_mul_lo_u32 v3, s1, v0 +; GFX90A-NEXT: v_mul_lo_u32 v3, s6, v0 ; GFX90A-NEXT: v_mul_hi_u32 v3, v0, v3 ; GFX90A-NEXT: v_add_u32_e32 v0, v0, v3 -; GFX90A-NEXT: v_mul_hi_u32 v0, s6, v0 -; GFX90A-NEXT: v_mul_lo_u32 v0, v0, s2 -; GFX90A-NEXT: v_sub_u32_e32 v0, s6, v0 -; GFX90A-NEXT: v_subrev_u32_e32 v3, s2, v0 -; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s2, v0 +; GFX90A-NEXT: v_mul_hi_u32 v0, s4, v0 +; GFX90A-NEXT: v_mul_lo_u32 v0, v0, s0 +; GFX90A-NEXT: v_sub_u32_e32 v0, s4, v0 +; GFX90A-NEXT: v_subrev_u32_e32 v3, s0, v0 +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s0, v0 ; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX90A-NEXT: v_subrev_u32_e32 v3, s2, v0 -; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s2, v0 -; GFX90A-NEXT: s_sub_i32 s1, 0, s0 +; GFX90A-NEXT: v_subrev_u32_e32 v3, s0, v0 +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s0, v0 ; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX90A-NEXT: v_mul_lo_u32 v3, s1, v1 +; GFX90A-NEXT: v_mul_lo_u32 v3, s7, v1 ; GFX90A-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX90A-NEXT: v_add_u32_e32 v1, v1, v3 -; GFX90A-NEXT: v_mul_hi_u32 v1, s7, v1 -; GFX90A-NEXT: v_mul_lo_u32 v1, v1, s0 -; GFX90A-NEXT: v_sub_u32_e32 v1, s7, v1 -; GFX90A-NEXT: v_subrev_u32_e32 v3, s0, v1 -; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s0, v1 +; GFX90A-NEXT: v_mul_hi_u32 v1, s5, v1 +; GFX90A-NEXT: v_mul_lo_u32 v1, v1, s1 +; GFX90A-NEXT: v_sub_u32_e32 v1, s5, v1 +; GFX90A-NEXT: v_subrev_u32_e32 v3, s1, v1 +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s1, v1 ; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX90A-NEXT: v_subrev_u32_e32 v3, s0, v1 -; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s0, v1 +; 
GFX90A-NEXT: v_subrev_u32_e32 v3, s1, v1 +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s1, v1 ; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX90A-NEXT: s_endpgm %shl.y = shl <2 x i32> , %y %r = urem <2 x i32> %x, %shl.y @@ -8423,34 +8407,33 @@ ; GFX6-LABEL: sdiv_v2i32_pow2_shl_denom: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; GFX6-NEXT: s_movk_i32 s10, 0x1000 ; GFX6-NEXT: s_mov_b32 s12, 0x4f7ffffe ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb ; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b32 s2, s10, s2 -; GFX6-NEXT: s_ashr_i32 s11, s2, 31 -; GFX6-NEXT: s_add_i32 s2, s2, s11 -; GFX6-NEXT: s_xor_b32 s2, s2, s11 +; GFX6-NEXT: s_lshl_b32 s2, 0x1000, s2 +; GFX6-NEXT: s_ashr_i32 s10, s2, 31 +; GFX6-NEXT: s_add_i32 s2, s2, s10 +; GFX6-NEXT: s_xor_b32 s2, s2, s10 ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX6-NEXT: s_lshl_b32 s0, s10, s3 -; GFX6-NEXT: s_sub_i32 s10, 0, s2 +; GFX6-NEXT: s_sub_i32 s11, 0, s2 +; GFX6-NEXT: s_lshl_b32 s0, 0x1000, s3 ; GFX6-NEXT: s_ashr_i32 s3, s0, 31 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: s_add_i32 s0, s0, s3 ; GFX6-NEXT: s_ashr_i32 s1, s8, 31 -; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: v_mul_f32_e32 v0, s12, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, s10, v0 -; GFX6-NEXT: s_xor_b32 s10, s0, s3 -; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s10 +; GFX6-NEXT: v_mul_lo_u32 v1, s11, v0 +; GFX6-NEXT: s_xor_b32 s11, s0, s3 +; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s11 ; GFX6-NEXT: s_add_i32 s0, s8, s1 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: s_xor_b32 s0, s0, s1 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GFX6-NEXT: s_xor_b32 s8, s1, s11 +; GFX6-NEXT: s_xor_b32 s8, s1, s10 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GFX6-NEXT: v_mul_hi_u32 v0, s0, v0 ; 
GFX6-NEXT: v_mul_f32_e32 v1, s12, v2 @@ -8462,7 +8445,7 @@ ; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] ; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s2, v2 ; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] -; GFX6-NEXT: s_sub_i32 s0, 0, s10 +; GFX6-NEXT: s_sub_i32 s0, 0, s11 ; GFX6-NEXT: v_mul_lo_u32 v3, s0, v1 ; GFX6-NEXT: s_ashr_i32 s0, s9, 31 ; GFX6-NEXT: s_add_i32 s1, s9, s0 @@ -8474,17 +8457,17 @@ ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s2, v2 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX6-NEXT: s_xor_b32 s2, s0, s3 -; GFX6-NEXT: v_mul_lo_u32 v2, v1, s10 +; GFX6-NEXT: v_mul_lo_u32 v2, v1, s11 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v1 ; GFX6-NEXT: v_xor_b32_e32 v0, s8, v0 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s1, v2 -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v2 +; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v2 ; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s10, v2 +; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s11, v2 ; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s8, v0 ; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] ; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v1 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s11, v2 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX6-NEXT: v_xor_b32_e32 v1, s2, v1 ; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s2, v1 @@ -8493,96 +8476,94 @@ ; ; GFX9-LABEL: sdiv_v2i32_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX9-NEXT: s_movk_i32 s8, 0x1000 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 ; GFX9-NEXT: s_mov_b32 s10, 0x4f7ffffe ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s2, s8, s2 -; GFX9-NEXT: s_ashr_i32 s9, s2, 31 -; GFX9-NEXT: s_add_i32 s2, s2, s9 -; GFX9-NEXT: s_xor_b32 s2, s2, s9 -; 
GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX9-NEXT: s_lshl_b32 s0, s8, s3 +; GFX9-NEXT: s_lshl_b32 s0, 0x1000, s6 ; GFX9-NEXT: s_ashr_i32 s1, s0, 31 ; GFX9-NEXT: s_add_i32 s0, s0, s1 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: s_xor_b32 s0, s0, s1 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s0 -; GFX9-NEXT: s_sub_i32 s3, 0, s2 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GFX9-NEXT: s_lshl_b32 s6, 0x1000, s7 +; GFX9-NEXT: s_ashr_i32 s8, s6, 31 +; GFX9-NEXT: s_add_i32 s6, s6, s8 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: s_xor_b32 s6, s6, s8 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 +; GFX9-NEXT: s_sub_i32 s9, 0, s0 ; GFX9-NEXT: v_mul_f32_e32 v0, s10, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX9-NEXT: s_sub_i32 s8, 0, s0 -; GFX9-NEXT: v_mul_lo_u32 v3, s3, v0 +; GFX9-NEXT: s_ashr_i32 s7, s4, 31 +; GFX9-NEXT: s_add_i32 s4, s4, s7 +; GFX9-NEXT: v_mul_lo_u32 v3, s9, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, s10, v1 -; GFX9-NEXT: s_ashr_i32 s3, s6, 31 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX9-NEXT: s_xor_b32 s4, s4, s7 ; GFX9-NEXT: v_mul_hi_u32 v3, v0, v3 -; GFX9-NEXT: s_add_i32 s6, s6, s3 -; GFX9-NEXT: s_xor_b32 s6, s6, s3 -; GFX9-NEXT: s_xor_b32 s3, s3, s9 +; GFX9-NEXT: s_sub_i32 s10, 0, s6 +; GFX9-NEXT: s_ashr_i32 s9, s5, 31 +; GFX9-NEXT: s_add_i32 s5, s5, s9 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v3 -; GFX9-NEXT: v_mul_hi_u32 v0, s6, v0 -; GFX9-NEXT: v_mul_lo_u32 v3, s8, v1 -; GFX9-NEXT: s_ashr_i32 s8, s7, 31 -; GFX9-NEXT: s_xor_b32 s1, s8, s1 -; GFX9-NEXT: v_mul_lo_u32 v4, v0, s2 +; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 +; GFX9-NEXT: v_mul_lo_u32 v3, s10, v1 +; GFX9-NEXT: s_xor_b32 s1, s7, s1 +; GFX9-NEXT: v_mul_lo_u32 v4, v0, s0 ; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX9-NEXT: v_add_u32_e32 v5, 1, v0 -; GFX9-NEXT: v_sub_u32_e32 v4, s6, v4 -; GFX9-NEXT: s_add_i32 s6, s7, s8 -; GFX9-NEXT: s_xor_b32 s6, s6, s8 +; GFX9-NEXT: v_sub_u32_e32 v4, s4, v4 +; GFX9-NEXT: s_xor_b32 s4, s5, s9 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 -; 
GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v4 -; GFX9-NEXT: v_mul_hi_u32 v1, s6, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s0, v4 +; GFX9-NEXT: v_mul_hi_u32 v1, s4, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc -; GFX9-NEXT: v_subrev_u32_e32 v5, s2, v4 +; GFX9-NEXT: v_subrev_u32_e32 v5, s0, v4 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v4 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s0, v4 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_mul_lo_u32 v3, v1, s0 +; GFX9-NEXT: v_mul_lo_u32 v3, v1, s6 ; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 -; GFX9-NEXT: v_xor_b32_e32 v0, s3, v0 -; GFX9-NEXT: v_subrev_u32_e32 v0, s3, v0 -; GFX9-NEXT: v_sub_u32_e32 v3, s6, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s0, v3 +; GFX9-NEXT: s_xor_b32 s0, s9, s8 +; GFX9-NEXT: v_xor_b32_e32 v0, s1, v0 +; GFX9-NEXT: v_sub_u32_e32 v3, s4, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX9-NEXT: v_subrev_u32_e32 v4, s0, v3 +; GFX9-NEXT: v_subrev_u32_e32 v4, s6, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s0, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX9-NEXT: v_xor_b32_e32 v1, s1, v1 -; GFX9-NEXT: v_subrev_u32_e32 v1, s1, v1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: v_xor_b32_e32 v1, s0, v1 +; GFX9-NEXT: v_subrev_u32_e32 v0, s1, v0 +; GFX9-NEXT: v_subrev_u32_e32 v1, s0, v1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX90A-LABEL: sdiv_v2i32_pow2_shl_denom: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX90A-NEXT: s_movk_i32 s8, 0x1000 ; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x2c -; GFX90A-NEXT: s_mov_b32 s10, 0x4f7ffffe +; GFX90A-NEXT: s_mov_b32 s9, 0x4f7ffffe ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; 
GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_lshl_b32 s2, s8, s2 -; GFX90A-NEXT: s_ashr_i32 s9, s2, 31 -; GFX90A-NEXT: s_add_i32 s2, s2, s9 -; GFX90A-NEXT: s_xor_b32 s2, s2, s9 +; GFX90A-NEXT: s_lshl_b32 s2, 0x1000, s2 +; GFX90A-NEXT: s_ashr_i32 s8, s2, 31 +; GFX90A-NEXT: s_add_i32 s2, s2, s8 +; GFX90A-NEXT: s_xor_b32 s2, s2, s8 ; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX90A-NEXT: s_ashr_i32 s1, s6, 31 -; GFX90A-NEXT: s_lshl_b32 s0, s8, s3 +; GFX90A-NEXT: s_lshl_b32 s0, 0x1000, s3 ; GFX90A-NEXT: s_add_i32 s3, s6, s1 ; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX90A-NEXT: s_xor_b32 s6, s1, s9 +; GFX90A-NEXT: s_xor_b32 s6, s1, s8 ; GFX90A-NEXT: s_xor_b32 s1, s3, s1 ; GFX90A-NEXT: s_sub_i32 s3, 0, s2 -; GFX90A-NEXT: v_mul_f32_e32 v0, s10, v0 +; GFX90A-NEXT: v_mul_f32_e32 v0, s9, v0 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX90A-NEXT: v_mul_lo_u32 v1, s3, v0 ; GFX90A-NEXT: v_mul_hi_u32 v1, v0, v1 @@ -8604,7 +8585,7 @@ ; GFX90A-NEXT: s_ashr_i32 s2, s7, 31 ; GFX90A-NEXT: s_add_i32 s3, s7, s2 ; GFX90A-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX90A-NEXT: v_mul_f32_e32 v1, s10, v1 +; GFX90A-NEXT: v_mul_f32_e32 v1, s9, v1 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX90A-NEXT: s_xor_b32 s1, s2, s1 ; GFX90A-NEXT: s_xor_b32 s2, s3, s2 @@ -8889,19 +8870,18 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; GFX6-NEXT: s_movk_i32 s2, 0xf000 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_ashr_i32 s3, s0, 31 -; GFX6-NEXT: s_lshr_b32 s3, s3, 20 -; GFX6-NEXT: s_add_i32 s3, s0, s3 -; GFX6-NEXT: s_and_b32 s3, s3, s2 -; GFX6-NEXT: s_sub_i32 s0, s0, s3 +; GFX6-NEXT: s_ashr_i32 s2, s0, 31 +; GFX6-NEXT: s_lshr_b32 s2, s2, 20 +; GFX6-NEXT: s_add_i32 s2, s0, s2 ; GFX6-NEXT: s_ashr_i32 s3, s1, 31 -; GFX6-NEXT: s_lshr_b32 s3, s3, 20 -; GFX6-NEXT: s_add_i32 s3, s1, s3 -; GFX6-NEXT: s_and_b32 s2, s3, s2 +; GFX6-NEXT: s_and_b32 s2, s2, 0xfffff000 +; 
GFX6-NEXT: s_sub_i32 s0, s0, s2 +; GFX6-NEXT: s_lshr_b32 s2, s3, 20 +; GFX6-NEXT: s_add_i32 s2, s1, s2 +; GFX6-NEXT: s_and_b32 s2, s2, 0xfffff000 ; GFX6-NEXT: s_sub_i32 s1, s1, s2 ; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 @@ -8912,7 +8892,6 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c -; GFX9-NEXT: s_movk_i32 s6, 0xf000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s0, s4, 31 @@ -8921,8 +8900,8 @@ ; GFX9-NEXT: s_lshr_b32 s1, s1, 20 ; GFX9-NEXT: s_add_i32 s0, s4, s0 ; GFX9-NEXT: s_add_i32 s1, s5, s1 -; GFX9-NEXT: s_and_b32 s0, s0, s6 -; GFX9-NEXT: s_and_b32 s1, s1, s6 +; GFX9-NEXT: s_and_b32 s0, s0, 0xfffff000 +; GFX9-NEXT: s_and_b32 s1, s1, 0xfffff000 ; GFX9-NEXT: s_sub_i32 s0, s4, s0 ; GFX9-NEXT: s_sub_i32 s1, s5, s1 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -8934,7 +8913,6 @@ ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c -; GFX90A-NEXT: s_movk_i32 s6, 0xf000 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_ashr_i32 s0, s4, 31 @@ -8943,8 +8921,8 @@ ; GFX90A-NEXT: s_lshr_b32 s1, s1, 20 ; GFX90A-NEXT: s_add_i32 s0, s4, s0 ; GFX90A-NEXT: s_add_i32 s1, s5, s1 -; GFX90A-NEXT: s_and_b32 s0, s0, s6 -; GFX90A-NEXT: s_and_b32 s1, s1, s6 +; GFX90A-NEXT: s_and_b32 s0, s0, 0xfffff000 +; GFX90A-NEXT: s_and_b32 s1, s1, 0xfffff000 ; GFX90A-NEXT: s_sub_i32 s0, s4, s0 ; GFX90A-NEXT: s_sub_i32 s1, s5, s1 ; GFX90A-NEXT: v_mov_b32_e32 v0, s0 @@ -9041,66 +9019,65 @@ ; GFX6-LABEL: srem_v2i32_pow2_shl_denom: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; GFX6-NEXT: s_movk_i32 s6, 0x1000 -; GFX6-NEXT: s_mov_b32 s10, 0x4f7ffffe +; GFX6-NEXT: s_mov_b32 s11, 0x4f7ffffe ; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b32 s2, s6, s2 +; GFX6-NEXT: 
s_lshl_b32 s2, 0x1000, s2 ; GFX6-NEXT: s_ashr_i32 s4, s2, 31 ; GFX6-NEXT: s_add_i32 s2, s2, s4 ; GFX6-NEXT: s_xor_b32 s2, s2, s4 ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX6-NEXT: s_lshl_b32 s3, s6, s3 -; GFX6-NEXT: s_ashr_i32 s6, s3, 31 -; GFX6-NEXT: s_add_i32 s3, s3, s6 +; GFX6-NEXT: s_lshl_b32 s3, 0x1000, s3 +; GFX6-NEXT: s_ashr_i32 s9, s3, 31 +; GFX6-NEXT: s_add_i32 s3, s3, s9 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX6-NEXT: s_sub_i32 s9, 0, s2 -; GFX6-NEXT: s_xor_b32 s3, s3, s6 +; GFX6-NEXT: s_sub_i32 s10, 0, s2 +; GFX6-NEXT: s_xor_b32 s3, s3, s9 ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s3 -; GFX6-NEXT: v_mul_f32_e32 v0, s10, v0 +; GFX6-NEXT: v_mul_f32_e32 v0, s11, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: v_mul_lo_u32 v1, s9, v0 -; GFX6-NEXT: s_sub_i32 s9, 0, s3 +; GFX6-NEXT: v_mul_lo_u32 v1, s10, v0 +; GFX6-NEXT: s_sub_i32 s10, 0, s3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_ashr_i32 s8, s0, 31 ; GFX6-NEXT: s_add_i32 s0, s0, s8 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: s_xor_b32 s0, s0, s8 +; GFX6-NEXT: s_ashr_i32 s9, s1, 31 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; GFX6-NEXT: v_mul_f32_e32 v1, s10, v2 +; GFX6-NEXT: v_mul_f32_e32 v1, s11, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_mul_hi_u32 v0, s0, v0 -; GFX6-NEXT: v_mul_lo_u32 v2, s9, v1 +; GFX6-NEXT: v_mul_lo_u32 v2, s10, v1 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, s2 ; GFX6-NEXT: v_mul_hi_u32 v2, v1, v2 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 ; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s2, v0 -; GFX6-NEXT: s_ashr_i32 s0, s1, 31 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s2, v0 -; GFX6-NEXT: s_add_i32 s1, s1, s0 +; GFX6-NEXT: s_add_i32 s0, s1, s9 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX6-NEXT: s_xor_b32 s1, s1, s0 +; GFX6-NEXT: s_xor_b32 s0, s0, s9 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 -; 
GFX6-NEXT: v_mul_hi_u32 v1, s1, v1 +; GFX6-NEXT: v_mul_hi_u32 v1, s0, v1 ; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s2, v0 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s2, v0 ; GFX6-NEXT: v_mul_lo_u32 v1, v1, s3 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX6-NEXT: v_xor_b32_e32 v0, s8, v0 ; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s8, v0 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s1, v1 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s0, v1 ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s3, v1 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s3, v1 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX6-NEXT: v_xor_b32_e32 v1, s0, v1 -; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s0, v1 +; GFX6-NEXT: v_xor_b32_e32 v1, s9, v1 +; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s9, v1 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -9109,16 +9086,15 @@ ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 -; GFX9-NEXT: s_movk_i32 s8, 0x1000 -; GFX9-NEXT: s_mov_b32 s9, 0x4f7ffffe +; GFX9-NEXT: s_mov_b32 s8, 0x4f7ffffe ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s0, s8, s6 +; GFX9-NEXT: s_lshl_b32 s0, 0x1000, s6 ; GFX9-NEXT: s_ashr_i32 s1, s0, 31 ; GFX9-NEXT: s_add_i32 s0, s0, s1 ; GFX9-NEXT: s_xor_b32 s0, s0, s1 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GFX9-NEXT: s_lshl_b32 s1, s8, s7 +; GFX9-NEXT: s_lshl_b32 s1, 0x1000, s7 ; GFX9-NEXT: s_ashr_i32 s6, s1, 31 ; GFX9-NEXT: s_add_i32 s1, s1, s6 ; GFX9-NEXT: s_xor_b32 s1, s1, s6 @@ -9126,12 +9102,12 @@ ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s1 ; GFX9-NEXT: s_sub_i32 s7, 0, s0 ; GFX9-NEXT: s_ashr_i32 s6, s4, 31 -; GFX9-NEXT: v_mul_f32_e32 v0, s9, v0 +; GFX9-NEXT: v_mul_f32_e32 v0, s8, v0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: s_add_i32 
s4, s4, s6 ; GFX9-NEXT: s_xor_b32 s4, s4, s6 -; GFX9-NEXT: v_mul_f32_e32 v1, s9, v1 +; GFX9-NEXT: v_mul_f32_e32 v1, s8, v1 ; GFX9-NEXT: v_mul_lo_u32 v3, s7, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: s_sub_i32 s7, 0, s1 @@ -9173,27 +9149,26 @@ ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 -; GFX90A-NEXT: s_movk_i32 s8, 0x1000 -; GFX90A-NEXT: s_mov_b32 s9, 0x4f7ffffe +; GFX90A-NEXT: s_mov_b32 s8, 0x4f7ffffe ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_lshl_b32 s0, s8, s6 +; GFX90A-NEXT: s_lshl_b32 s0, 0x1000, s6 ; GFX90A-NEXT: s_ashr_i32 s1, s0, 31 ; GFX90A-NEXT: s_add_i32 s0, s0, s1 ; GFX90A-NEXT: s_xor_b32 s0, s0, s1 ; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GFX90A-NEXT: s_lshl_b32 s1, s8, s7 -; GFX90A-NEXT: s_sub_i32 s8, 0, s0 -; GFX90A-NEXT: s_ashr_i32 s6, s4, 31 -; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX90A-NEXT: s_add_i32 s4, s4, s6 -; GFX90A-NEXT: s_xor_b32 s4, s4, s6 +; GFX90A-NEXT: s_lshl_b32 s1, 0x1000, s7 ; GFX90A-NEXT: s_ashr_i32 s7, s1, 31 -; GFX90A-NEXT: v_mul_f32_e32 v0, s9, v0 -; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX90A-NEXT: s_add_i32 s1, s1, s7 +; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX90A-NEXT: s_xor_b32 s1, s1, s7 -; GFX90A-NEXT: v_mul_lo_u32 v1, s8, v0 +; GFX90A-NEXT: s_sub_i32 s7, 0, s0 +; GFX90A-NEXT: s_ashr_i32 s6, s4, 31 +; GFX90A-NEXT: v_mul_f32_e32 v0, s8, v0 +; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX90A-NEXT: s_add_i32 s4, s4, s6 +; GFX90A-NEXT: s_xor_b32 s4, s4, s6 +; GFX90A-NEXT: v_mul_lo_u32 v1, s7, v0 ; GFX90A-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX90A-NEXT: v_add_u32_e32 v0, v0, v1 ; GFX90A-NEXT: v_mul_hi_u32 v0, s4, v0 @@ -9210,7 +9185,7 @@ ; GFX90A-NEXT: s_add_i32 s4, s5, s0 ; GFX90A-NEXT: s_sub_i32 s5, 0, s1 ; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX90A-NEXT: v_mul_f32_e32 v1, s9, v1 +; GFX90A-NEXT: v_mul_f32_e32 v1, s8, v1 ; GFX90A-NEXT: 
v_cvt_u32_f32_e32 v1, v1 ; GFX90A-NEXT: s_xor_b32 s4, s4, s0 ; GFX90A-NEXT: v_xor_b32_e32 v0, s6, v0 @@ -9251,36 +9226,35 @@ ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 ; GFX6-NEXT: s_movk_i32 s4, 0xfee0 ; GFX6-NEXT: s_mov_b32 s5, 0x68958c89 -; GFX6-NEXT: v_mov_b32_e32 v7, 0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 ; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_movk_i32 s8, 0x11f +; GFX6-NEXT: s_mov_b32 s9, 0x976a7377 ; GFX6-NEXT: v_mul_lo_u32 v2, v0, s4 ; GFX6-NEXT: v_mul_hi_u32 v3, v0, s5 ; GFX6-NEXT: v_mul_lo_u32 v4, v1, s5 -; GFX6-NEXT: s_mov_b32 s9, 0x976a7377 +; GFX6-NEXT: v_mul_lo_u32 v5, v0, s5 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GFX6-NEXT: v_mul_lo_u32 v3, v0, s5 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 -; GFX6-NEXT: v_mul_lo_u32 v4, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v5, v0, v3 +; GFX6-NEXT: v_mul_lo_u32 v3, v0, v2 +; GFX6-NEXT: v_mul_hi_u32 v4, v0, v5 ; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v8, v1, v2 +; GFX6-NEXT: v_mul_hi_u32 v7, v1, v2 ; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc -; GFX6-NEXT: v_mul_lo_u32 v6, v1, v3 -; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v6, vcc +; GFX6-NEXT: v_mul_lo_u32 v6, v1, v5 +; GFX6-NEXT: v_mul_hi_u32 v5, v1, v5 ; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v8, v7, vcc +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v4, v5, vcc +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc ; 
GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 @@ -9296,7 +9270,7 @@ ; GFX6-NEXT: v_mul_lo_u32 v4, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v5, v0, v3 ; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v8, v1, v2 +; GFX6-NEXT: v_mul_hi_u32 v7, v1, v2 ; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc @@ -9305,7 +9279,7 @@ ; GFX6-NEXT: s_mov_b32 s5, s1 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v8, v7, vcc +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 @@ -9321,13 +9295,13 @@ ; GFX6-NEXT: v_mul_hi_u32 v0, s3, v0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc -; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v5, v7, vcc +; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc ; GFX6-NEXT: v_mul_lo_u32 v2, v0, s8 ; GFX6-NEXT: v_mul_hi_u32 v3, v0, s9 ; GFX6-NEXT: v_mul_lo_u32 v4, v1, s9 -; GFX6-NEXT: v_mov_b32_e32 v5, s8 +; GFX6-NEXT: v_mov_b32_e32 v5, 0x11f ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_mul_lo_u32 v3, v0, s9 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 @@ -9373,14 +9347,13 @@ ; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: s_movk_i32 s2, 0xfee0 ; GFX9-NEXT: s_mov_b32 s3, 0x68958c89 -; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 ; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: s_load_dwordx4 s[4:7], 
s[0:1], 0x24 ; GFX9-NEXT: v_mul_lo_u32 v2, v0, s2 ; GFX9-NEXT: v_mul_hi_u32 v3, v0, s3 ; GFX9-NEXT: v_mul_lo_u32 v5, v1, s3 @@ -9389,16 +9362,16 @@ ; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 ; GFX9-NEXT: v_mul_hi_u32 v3, v0, v4 ; GFX9-NEXT: v_mul_lo_u32 v5, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v7, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v8, v1, v2 +; GFX9-NEXT: v_mul_hi_u32 v6, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v7, v1, v2 ; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc -; GFX9-NEXT: v_mul_lo_u32 v7, v1, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v6, vcc +; GFX9-NEXT: v_mul_lo_u32 v6, v1, v4 ; GFX9-NEXT: v_mul_hi_u32 v4, v1, v4 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v4, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v6, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v7, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 @@ -9412,17 +9385,17 @@ ; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 ; GFX9-NEXT: v_mul_lo_u32 v3, v0, v2 ; GFX9-NEXT: v_mul_hi_u32 v4, v0, v5 -; GFX9-NEXT: v_mul_hi_u32 v7, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v8, v1, v2 +; GFX9-NEXT: v_mul_hi_u32 v6, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v7, v1, v2 ; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v7, vcc -; GFX9-NEXT: v_mul_lo_u32 v7, v1, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v6, vcc +; GFX9-NEXT: v_mul_lo_u32 v6, v1, v5 ; GFX9-NEXT: v_mul_hi_u32 v5, v1, v5 ; GFX9-NEXT: s_mov_b32 s3, 0x976a7377 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v4, v5, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v6, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v7, vcc ; GFX9-NEXT: 
v_add_co_u32_e32 v2, vcc, v3, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 @@ -9437,31 +9410,32 @@ ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc ; GFX9-NEXT: v_mul_lo_u32 v4, s7, v0 ; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 +; GFX9-NEXT: v_mov_b32_e32 v6, 0x11f ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v5, v6, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v5, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc ; GFX9-NEXT: v_mul_lo_u32 v2, v0, s2 ; GFX9-NEXT: v_mul_hi_u32 v3, v0, s3 ; GFX9-NEXT: v_mul_lo_u32 v4, v1, s3 -; GFX9-NEXT: v_mov_b32_e32 v5, s2 +; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 ; GFX9-NEXT: v_mul_lo_u32 v3, v0, s3 ; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 ; GFX9-NEXT: v_sub_u32_e32 v4, s7, v2 ; GFX9-NEXT: v_sub_co_u32_e32 v3, vcc, s6, v3 -; GFX9-NEXT: v_subb_co_u32_e64 v4, s[0:1], v4, v5, vcc -; GFX9-NEXT: v_subrev_co_u32_e64 v5, s[0:1], s3, v3 +; GFX9-NEXT: v_subb_co_u32_e64 v4, s[0:1], v4, v6, vcc +; GFX9-NEXT: v_subrev_co_u32_e64 v6, s[0:1], s3, v3 ; GFX9-NEXT: v_subbrev_co_u32_e64 v4, s[0:1], 0, v4, s[0:1] ; GFX9-NEXT: s_movk_i32 s3, 0x11e ; GFX9-NEXT: v_cmp_lt_u32_e64 s[0:1], s3, v4 ; GFX9-NEXT: s_mov_b32 s6, 0x976a7376 ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_lt_u32_e64 s[0:1], s6, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_lt_u32_e64 s[0:1], s6, v6 +; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v4, v7, v5, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v4, v7, v6, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v7, s7 ; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v7, v2, vcc ; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s3, v2 @@ -9473,11 +9447,11 @@ ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 ; GFX9-NEXT: 
v_add_co_u32_e64 v4, s[0:1], v0, v4 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc -; GFX9-NEXT: v_addc_co_u32_e64 v5, s[0:1], 0, v1, s[0:1] +; GFX9-NEXT: v_addc_co_u32_e64 v6, s[0:1], 0, v1, s[0:1] ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; GFX9-NEXT: global_store_dwordx2 v5, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX90A-LABEL: udiv_i64_oddk_denom: @@ -9495,8 +9469,8 @@ ; GFX90A-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_mov_b32 s3, 0x976a7377 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: v_mul_lo_u32 v3, v0, s2 ; GFX90A-NEXT: v_mul_hi_u32 v4, v0, s0 ; GFX90A-NEXT: v_add_u32_e32 v3, v4, v3 @@ -9513,7 +9487,7 @@ ; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6 ; GFX90A-NEXT: v_mul_hi_u32 v7, v1, v3 ; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v8, vcc -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v2, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc ; GFX90A-NEXT: v_mul_lo_u32 v3, v1, v3 ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 ; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc @@ -9535,7 +9509,7 @@ ; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6 ; GFX90A-NEXT: v_mul_hi_u32 v7, v1, v3 ; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v8, vcc -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v2, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc ; GFX90A-NEXT: v_mul_lo_u32 v3, v1, v3 ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 ; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc @@ -9552,7 +9526,7 @@ ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 ; GFX90A-NEXT: v_mul_hi_u32 v5, s7, v1 ; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v6, vcc -; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v2, vcc +; GFX90A-NEXT: 
v_addc_co_u32_e32 v3, vcc, 0, v5, vcc ; GFX90A-NEXT: v_mul_lo_u32 v1, s7, v1 ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 ; GFX90A-NEXT: s_movk_i32 s2, 0x11f @@ -9564,7 +9538,7 @@ ; GFX90A-NEXT: v_add_u32_e32 v3, v3, v4 ; GFX90A-NEXT: v_mul_lo_u32 v5, v0, s3 ; GFX90A-NEXT: v_sub_u32_e32 v4, s7, v3 -; GFX90A-NEXT: v_mov_b32_e32 v6, s2 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0x11f ; GFX90A-NEXT: v_sub_co_u32_e32 v5, vcc, s6, v5 ; GFX90A-NEXT: v_subb_co_u32_e64 v4, s[0:1], v4, v6, vcc ; GFX90A-NEXT: v_subrev_co_u32_e64 v6, s[0:1], s3, v5 @@ -9775,9 +9749,9 @@ ; GFX6-NEXT: v_madak_f32 v0, 0, v0, 0x457ff000 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 ; GFX6-NEXT: s_movk_i32 s6, 0xf001 -; GFX6-NEXT: v_mov_b32_e32 v7, 0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd +; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 @@ -9788,23 +9762,22 @@ ; GFX6-NEXT: s_lshr_b64 s[8:9], s[0:1], 12 ; GFX6-NEXT: s_movk_i32 s0, 0xfff ; GFX6-NEXT: v_mul_hi_u32 v2, v0, s6 -; GFX6-NEXT: v_mul_lo_u32 v3, v1, s6 -; GFX6-NEXT: v_mul_lo_u32 v4, v0, s6 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: v_mul_lo_u32 v4, v1, s6 +; GFX6-NEXT: v_mul_lo_u32 v3, v0, s6 ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GFX6-NEXT: v_mul_lo_u32 v3, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v5, v0, v4 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GFX6-NEXT: v_mul_hi_u32 v5, v0, v3 +; GFX6-NEXT: v_mul_lo_u32 v4, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v8, v1, v2 +; GFX6-NEXT: v_mul_hi_u32 v7, v1, v2 ; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc -; GFX6-NEXT: v_mul_lo_u32 v6, v1, v4 -; GFX6-NEXT: v_mul_hi_u32 v4, v1, v4 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GFX6-NEXT: 
v_addc_u32_e32 v3, vcc, v5, v4, vcc -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v8, v7, vcc +; GFX6-NEXT: v_mul_lo_u32 v6, v1, v3 +; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 @@ -9818,7 +9791,7 @@ ; GFX6-NEXT: v_mul_lo_u32 v3, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v5, v0, v4 ; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v8, v1, v2 +; GFX6-NEXT: v_mul_hi_u32 v7, v1, v2 ; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v5, v3 ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc @@ -9826,7 +9799,7 @@ ; GFX6-NEXT: v_mul_hi_u32 v4, v1, v4 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v8, v7, vcc +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 @@ -9842,7 +9815,7 @@ ; GFX6-NEXT: v_mul_hi_u32 v0, s3, v0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc -; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v5, v7, vcc +; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc ; GFX6-NEXT: v_mul_lo_u32 v4, v1, s0 @@ -9884,33 +9857,32 @@ ; GFX9-NEXT: v_madak_f32 v0, 0, v0, 0x457ff000 ; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: s_movk_i32 s2, 0xf001 -; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX9-NEXT: s_movk_i32 s8, 0xfff ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 ; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX9-NEXT: 
v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: s_movk_i32 s8, 0xfff ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mul_hi_u32 v2, v0, s2 ; GFX9-NEXT: v_mul_lo_u32 v4, v1, s2 ; GFX9-NEXT: v_mul_lo_u32 v3, v0, s2 ; GFX9-NEXT: v_sub_u32_e32 v2, v2, v0 ; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 -; GFX9-NEXT: v_mul_hi_u32 v6, v0, v3 +; GFX9-NEXT: v_mul_hi_u32 v5, v0, v3 ; GFX9-NEXT: v_mul_lo_u32 v4, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v8, v0, v2 -; GFX9-NEXT: v_mul_lo_u32 v7, v1, v3 +; GFX9-NEXT: v_mul_hi_u32 v7, v0, v2 +; GFX9-NEXT: v_mul_lo_u32 v6, v1, v3 ; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v6, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v8, vcc ; GFX9-NEXT: v_mul_hi_u32 v8, v1, v2 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v5, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc ; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v7 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v3, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v3, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v8, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 @@ -9923,18 +9895,18 @@ ; GFX9-NEXT: v_sub_u32_e32 v2, v2, v0 ; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 ; GFX9-NEXT: v_mul_lo_u32 v3, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v6, v0, v4 -; GFX9-NEXT: v_mul_hi_u32 v7, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v8, v1, v2 +; GFX9-NEXT: v_mul_hi_u32 v5, v0, v4 +; GFX9-NEXT: v_mul_hi_u32 v6, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v7, v1, v2 ; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v6, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v7, vcc -; GFX9-NEXT: v_mul_lo_u32 v7, v1, v4 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v5, v3 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v6, vcc +; 
GFX9-NEXT: v_mul_lo_u32 v6, v1, v4 ; GFX9-NEXT: v_mul_hi_u32 v4, v1, v4 ; GFX9-NEXT: s_movk_i32 s4, 0xffe -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v4, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v4, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v7, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 @@ -9942,7 +9914,7 @@ ; GFX9-NEXT: v_mul_lo_u32 v2, s6, v1 ; GFX9-NEXT: v_mul_hi_u32 v3, s6, v0 ; GFX9-NEXT: v_mul_hi_u32 v4, s6, v1 -; GFX9-NEXT: v_mul_hi_u32 v6, s7, v1 +; GFX9-NEXT: v_mul_hi_u32 v5, s7, v1 ; GFX9-NEXT: v_mul_lo_u32 v1, s7, v1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc @@ -9950,17 +9922,18 @@ ; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v6, v5, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v5, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc ; GFX9-NEXT: v_mul_lo_u32 v2, v1, s8 ; GFX9-NEXT: v_mul_hi_u32 v3, v0, s8 -; GFX9-NEXT: v_mul_lo_u32 v4, v0, s8 +; GFX9-NEXT: v_mul_lo_u32 v5, v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, s6, v4 +; GFX9-NEXT: v_sub_co_u32_e32 v5, vcc, s6, v5 ; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v3, v2, vcc -; GFX9-NEXT: v_subrev_co_u32_e32 v3, vcc, s8, v4 +; GFX9-NEXT: v_subrev_co_u32_e32 v3, vcc, s8, v5 ; GFX9-NEXT: v_subbrev_co_u32_e32 v6, vcc, 0, v2, vcc ; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s4, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc @@ -9970,16 +9943,16 @@ ; GFX9-NEXT: v_cndmask_b32_e64 v3, 1, 2, vcc ; GFX9-NEXT: 
v_add_co_u32_e32 v3, vcc, v0, v3 ; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v1, vcc -; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s4, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc +; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s4, v5 +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, -1, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v2, -1, v5, vcc ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v0, v3, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v6, vcc ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX90A-LABEL: udiv_v2i64_mixed_pow2k_denom: @@ -10014,7 +9987,7 @@ ; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6 ; GFX90A-NEXT: v_mul_hi_u32 v7, v1, v2 ; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v8, vcc -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v4, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc ; GFX90A-NEXT: v_mul_lo_u32 v2, v1, v2 ; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 ; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc @@ -10035,7 +10008,7 @@ ; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6 ; GFX90A-NEXT: v_mul_hi_u32 v7, v1, v2 ; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v8, vcc -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v4, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc ; GFX90A-NEXT: v_mul_lo_u32 v2, v1, v2 ; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 ; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc @@ -10051,7 +10024,7 @@ ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 ; GFX90A-NEXT: v_mul_hi_u32 v5, s7, v1 ; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, v2, v6, vcc -; GFX90A-NEXT: v_addc_co_u32_e32 v2, vcc, v5, v4, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v5, vcc ; GFX90A-NEXT: v_mul_lo_u32 v1, s7, v1 ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 ; GFX90A-NEXT: 
v_addc_co_u32_e32 v1, vcc, 0, v2, vcc @@ -10178,38 +10151,37 @@ ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 ; GFX6-NEXT: s_movk_i32 s2, 0xfee0 ; GFX6-NEXT: s_mov_b32 s3, 0x689e0837 -; GFX6-NEXT: v_mov_b32_e32 v7, 0 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 ; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX6-NEXT: s_mov_b32 s12, 0x9761f7c9 -; GFX6-NEXT: v_mul_lo_u32 v2, v0, s2 -; GFX6-NEXT: v_mul_hi_u32 v3, v0, s3 -; GFX6-NEXT: v_mul_lo_u32 v4, v1, s3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s8, s4 ; GFX6-NEXT: s_movk_i32 s4, 0x11f +; GFX6-NEXT: v_mul_lo_u32 v2, v0, s2 +; GFX6-NEXT: v_mul_hi_u32 v3, v0, s3 +; GFX6-NEXT: v_mul_lo_u32 v4, v1, s3 +; GFX6-NEXT: v_mul_lo_u32 v5, v0, s3 +; GFX6-NEXT: s_mov_b32 s12, 0x9761f7c9 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GFX6-NEXT: v_mul_lo_u32 v3, v0, s3 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 -; GFX6-NEXT: v_mul_lo_u32 v4, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v5, v0, v3 +; GFX6-NEXT: v_mul_lo_u32 v3, v0, v2 +; GFX6-NEXT: v_mul_hi_u32 v4, v0, v5 ; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v8, v1, v2 +; GFX6-NEXT: v_mul_hi_u32 v7, v1, v2 ; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc -; GFX6-NEXT: v_mul_lo_u32 v6, v1, v3 -; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v6, vcc +; GFX6-NEXT: v_mul_lo_u32 v6, v1, v5 +; GFX6-NEXT: v_mul_hi_u32 v5, v1, v5 ; GFX6-NEXT: s_mov_b32 s9, s5 ; GFX6-NEXT: s_movk_i32 s5, 0x11e -; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v8, v7, vcc +; GFX6-NEXT: v_add_i32_e32 
v3, vcc, v3, v6 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v4, v5, vcc +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 @@ -10225,7 +10197,7 @@ ; GFX6-NEXT: v_mul_lo_u32 v4, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v5, v0, v3 ; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v8, v1, v2 +; GFX6-NEXT: v_mul_hi_u32 v7, v1, v2 ; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc @@ -10233,7 +10205,7 @@ ; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v8, v7, vcc +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 @@ -10249,7 +10221,7 @@ ; GFX6-NEXT: v_mul_hi_u32 v0, s7, v0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc -; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v5, v7, vcc +; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc ; GFX6-NEXT: v_mul_lo_u32 v2, v0, s4 @@ -10259,7 +10231,7 @@ ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s7, v1 -; GFX6-NEXT: v_mov_b32_e32 v3, s4 +; GFX6-NEXT: v_mov_b32_e32 v3, 0x11f ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 ; GFX6-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, vcc ; GFX6-NEXT: v_subrev_i32_e64 v4, s[0:1], s12, v0 @@ -10299,35 +10271,34 @@ ; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: s_movk_i32 s2, 0xfee0 ; GFX9-NEXT: s_mov_b32 s3, 0x689e0837 -; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; 
GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 ; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_movk_i32 s8, 0x11f +; GFX9-NEXT: s_mov_b32 s9, 0x9761f7c9 ; GFX9-NEXT: v_mul_lo_u32 v2, v0, s2 ; GFX9-NEXT: v_mul_hi_u32 v3, v0, s3 ; GFX9-NEXT: v_mul_lo_u32 v5, v1, s3 ; GFX9-NEXT: v_mul_lo_u32 v4, v0, s3 -; GFX9-NEXT: s_mov_b32 s9, 0x9761f7c9 +; GFX9-NEXT: s_mov_b32 s10, 0x9761f7c8 ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 ; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 ; GFX9-NEXT: v_mul_hi_u32 v3, v0, v4 ; GFX9-NEXT: v_mul_lo_u32 v5, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v7, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v8, v1, v2 +; GFX9-NEXT: v_mul_hi_u32 v6, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v7, v1, v2 ; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc -; GFX9-NEXT: v_mul_lo_u32 v7, v1, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v6, vcc +; GFX9-NEXT: v_mul_lo_u32 v6, v1, v4 ; GFX9-NEXT: v_mul_hi_u32 v4, v1, v4 -; GFX9-NEXT: s_mov_b32 s10, 0x9761f7c8 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v4, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v6, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v7, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 @@ -10340,16 +10311,16 @@ ; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 ; GFX9-NEXT: v_mul_lo_u32 v3, v0, v2 ; GFX9-NEXT: v_mul_hi_u32 v4, v0, v5 -; GFX9-NEXT: v_mul_hi_u32 v7, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v8, v1, v2 +; GFX9-NEXT: v_mul_hi_u32 v6, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v7, v1, v2 ; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v4, 
vcc, 0, v7, vcc -; GFX9-NEXT: v_mul_lo_u32 v7, v1, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v6, vcc +; GFX9-NEXT: v_mul_lo_u32 v6, v1, v5 ; GFX9-NEXT: v_mul_hi_u32 v5, v1, v5 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v4, v5, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v6, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v7, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 @@ -10366,46 +10337,47 @@ ; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v5, v6, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v5, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc ; GFX9-NEXT: v_mul_lo_u32 v2, v0, s8 ; GFX9-NEXT: v_mul_hi_u32 v3, v0, s9 ; GFX9-NEXT: v_mul_lo_u32 v1, v1, s9 ; GFX9-NEXT: v_mul_lo_u32 v0, v0, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 ; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 ; GFX9-NEXT: v_sub_u32_e32 v2, s7, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, 0x11f ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s6, v0 ; GFX9-NEXT: v_subb_co_u32_e64 v2, s[0:1], v2, v3, vcc -; GFX9-NEXT: v_subrev_co_u32_e64 v4, s[0:1], s9, v0 -; GFX9-NEXT: v_subbrev_co_u32_e64 v5, s[2:3], 0, v2, s[0:1] +; GFX9-NEXT: v_subrev_co_u32_e64 v5, s[0:1], s9, v0 +; GFX9-NEXT: v_subbrev_co_u32_e64 v6, s[2:3], 0, v2, s[0:1] ; GFX9-NEXT: s_movk_i32 s6, 0x11e -; GFX9-NEXT: v_cmp_lt_u32_e64 s[2:3], s6, v5 +; GFX9-NEXT: v_cmp_lt_u32_e64 s[2:3], s6, v6 ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[2:3] -; GFX9-NEXT: v_cmp_lt_u32_e64 s[2:3], s10, v4 +; GFX9-NEXT: v_cmp_lt_u32_e64 s[2:3], s10, v5 ; GFX9-NEXT: v_subb_co_u32_e64 v2, s[0:1], v2, v3, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v8, 
0, -1, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s8, v5 -; GFX9-NEXT: v_subrev_co_u32_e64 v3, s[0:1], s9, v4 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s8, v6 +; GFX9-NEXT: v_subrev_co_u32_e64 v3, s[0:1], s9, v5 ; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[2:3] ; GFX9-NEXT: v_subbrev_co_u32_e64 v2, s[0:1], 0, v2, s[0:1] ; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v3, v4, v3, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v4, s7 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v4, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v5, s7 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v5, v1, vcc ; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s6, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc -; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s10, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s10, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s8, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[4:5] +; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX90A-LABEL: urem_i64_oddk_denom: @@ -10423,8 +10395,8 @@ ; GFX90A-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_movk_i32 s8, 0x11f +; GFX90A-NEXT: s_mov_b32 s9, 0x9761f7c9 ; GFX90A-NEXT: v_mul_lo_u32 v3, v0, s2 ; GFX90A-NEXT: v_mul_hi_u32 v4, v0, s0 ; GFX90A-NEXT: v_add_u32_e32 v3, v4, v3 @@ -10441,7 +10413,7 @@ ; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6 ; GFX90A-NEXT: v_mul_hi_u32 v7, v1, v3 ; 
GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v8, vcc -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v2, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc ; GFX90A-NEXT: v_mul_lo_u32 v3, v1, v3 ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 ; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc @@ -10463,7 +10435,7 @@ ; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6 ; GFX90A-NEXT: v_mul_hi_u32 v7, v1, v3 ; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v8, vcc -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v2, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc ; GFX90A-NEXT: v_mul_lo_u32 v3, v1, v3 ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 ; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc @@ -10480,10 +10452,9 @@ ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 ; GFX90A-NEXT: v_mul_hi_u32 v5, s7, v1 ; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v6, vcc -; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v2, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc ; GFX90A-NEXT: v_mul_lo_u32 v1, s7, v1 ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 -; GFX90A-NEXT: s_mov_b32 s9, 0x9761f7c9 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX90A-NEXT: v_mul_lo_u32 v3, v0, s8 ; GFX90A-NEXT: v_mul_hi_u32 v4, v0, s9 @@ -10492,7 +10463,7 @@ ; GFX90A-NEXT: v_add_u32_e32 v1, v3, v1 ; GFX90A-NEXT: v_mul_lo_u32 v0, v0, s9 ; GFX90A-NEXT: v_sub_u32_e32 v3, s7, v1 -; GFX90A-NEXT: v_mov_b32_e32 v4, s8 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0x11f ; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, s6, v0 ; GFX90A-NEXT: v_subb_co_u32_e64 v3, s[0:1], v3, v4, vcc ; GFX90A-NEXT: v_subrev_co_u32_e64 v5, s[0:1], s9, v0 @@ -10520,6 +10491,7 @@ ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, s8, v1 ; GFX90A-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] @@ -10650,16 +10622,15 
@@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd -; GFX6-NEXT: s_movk_i32 s8, 0xfff ; GFX6-NEXT: v_mov_b32_e32 v1, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: v_mov_b32_e32 v3, v1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_and_b32 s0, s0, s8 -; GFX6-NEXT: s_and_b32 s1, s2, s8 +; GFX6-NEXT: s_and_b32 s0, s0, 0xfff +; GFX6-NEXT: s_and_b32 s1, s2, 0xfff ; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: v_mov_b32_e32 v2, s1 -; GFX6-NEXT: v_mov_b32_e32 v3, v1 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -10667,14 +10638,13 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX9-NEXT: s_movk_i32 s0, 0xfff ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s1, s4, s0 -; GFX9-NEXT: s_and_b32 s0, s6, s0 -; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: s_and_b32 s0, s4, 0xfff +; GFX9-NEXT: s_and_b32 s1, s6, 0xfff +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: global_store_dwordx4 v1, v[0:3], s[2:3] ; GFX9-NEXT: s_endpgm ; @@ -10682,14 +10652,13 @@ ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX90A-NEXT: s_movk_i32 s0, 0xfff ; GFX90A-NEXT: v_mov_b32_e32 v1, 0 ; GFX90A-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_and_b32 s1, s4, s0 -; GFX90A-NEXT: s_and_b32 s0, s6, s0 -; GFX90A-NEXT: v_mov_b32_e32 v0, s1 -; GFX90A-NEXT: v_mov_b32_e32 v2, s0 +; GFX90A-NEXT: s_and_b32 s0, s4, 0xfff +; GFX90A-NEXT: s_and_b32 s1, s6, 0xfff +; GFX90A-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NEXT: v_mov_b32_e32 v2, s1 ; GFX90A-NEXT: global_store_dwordx4 v1, v[0:3], s[2:3] ; GFX90A-NEXT: s_endpgm %r = urem <2 x i64> %x, @@ -10798,8 
+10767,8 @@ ; GFX6-NEXT: v_madak_f32 v0, 0, v0, 0x4996c7d8 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 ; GFX6-NEXT: s_mov_b32 s5, 0xffed2705 -; GFX6-NEXT: v_mov_b32_e32 v7, 0 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 @@ -10816,12 +10785,12 @@ ; GFX6-NEXT: s_addc_u32 s3, s3, s8 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 -; GFX6-NEXT: v_mul_lo_u32 v3, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v5, v0, v4 +; GFX6-NEXT: v_mul_hi_u32 v3, v0, v4 +; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v8, v1, v2 +; GFX6-NEXT: v_mul_hi_u32 v7, v1, v2 ; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc ; GFX6-NEXT: v_mul_lo_u32 v6, v1, v4 ; GFX6-NEXT: v_mul_hi_u32 v4, v1, v4 @@ -10829,7 +10798,7 @@ ; GFX6-NEXT: s_mov_b32 s4, s0 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v8, v7, vcc +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 @@ -10837,23 +10806,22 @@ ; GFX6-NEXT: v_mul_lo_u32 v2, v1, s5 ; GFX6-NEXT: v_mul_hi_u32 v3, v0, s5 ; GFX6-NEXT: s_mov_b32 s0, 0x12d8fb -; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_mul_lo_u32 v3, v0, s5 ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 ; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v8, v0, v3 -; GFX6-NEXT: v_mul_hi_u32 v9, v0, v2 +; GFX6-NEXT: v_mul_hi_u32 v7, v0, v3 +; GFX6-NEXT: v_mul_hi_u32 v8, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v5, v1, v3 ; GFX6-NEXT: v_mul_lo_u32 v3, 
v1, v3 ; GFX6-NEXT: v_mul_hi_u32 v4, v1, v2 -; GFX6-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; GFX6-NEXT: v_addc_u32_e32 v8, vcc, 0, v9, vcc +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc ; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v6, v3 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v5, vcc -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v4, v7, vcc +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 @@ -10870,7 +10838,7 @@ ; GFX6-NEXT: s_mov_b32 s5, s1 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc -; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v5, v7, vcc +; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc ; GFX6-NEXT: v_mul_lo_u32 v4, v1, s0 @@ -10915,7 +10883,6 @@ ; GFX9-NEXT: v_madak_f32 v0, 0, v0, 0x4996c7d8 ; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: s_mov_b32 s4, 0xffed2705 -; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 @@ -10930,16 +10897,16 @@ ; GFX9-NEXT: v_sub_u32_e32 v2, v2, v0 ; GFX9-NEXT: v_mul_hi_u32 v3, v0, v4 ; GFX9-NEXT: v_mul_lo_u32 v6, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v8, v0, v2 -; GFX9-NEXT: v_mul_lo_u32 v7, v1, v4 +; GFX9-NEXT: v_mul_hi_u32 v7, v0, v2 +; GFX9-NEXT: v_mul_lo_u32 v5, v1, v4 ; GFX9-NEXT: v_mul_hi_u32 v4, v1, v4 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v8, vcc ; GFX9-NEXT: v_mul_hi_u32 v8, v1, v2 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v7, vcc ; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7 +; GFX9-NEXT: 
v_add_co_u32_e32 v3, vcc, v3, v5 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v4, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v5, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v8, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 @@ -10952,18 +10919,18 @@ ; GFX9-NEXT: s_add_u32 s2, s2, s4 ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 ; GFX9-NEXT: v_sub_u32_e32 v2, v2, v0 -; GFX9-NEXT: v_mul_lo_u32 v7, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v8, v0, v4 -; GFX9-NEXT: v_mul_hi_u32 v9, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v6, v1, v4 +; GFX9-NEXT: v_mul_lo_u32 v6, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v7, v0, v4 +; GFX9-NEXT: v_mul_hi_u32 v8, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v5, v1, v4 ; GFX9-NEXT: v_mul_lo_u32 v4, v1, v4 ; GFX9-NEXT: v_mul_hi_u32 v3, v1, v2 -; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v8, v7 -; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v7, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v8, vcc ; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v7, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v6, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v6, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v5, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v4, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 @@ -10974,7 +10941,7 @@ ; GFX9-NEXT: v_mul_lo_u32 v2, s2, v1 ; GFX9-NEXT: v_mul_hi_u32 v3, s2, v0 ; GFX9-NEXT: v_mul_hi_u32 v4, s2, v1 -; GFX9-NEXT: v_mul_hi_u32 v6, s3, v1 +; GFX9-NEXT: v_mul_hi_u32 v5, s3, v1 ; GFX9-NEXT: v_mul_lo_u32 v1, s3, v1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc @@ -10983,17 +10950,18 @@ ; GFX9-NEXT: s_mov_b32 s5, 0x12d8fb ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 ; GFX9-NEXT: 
v_addc_co_u32_e32 v0, vcc, v3, v0, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v6, v5, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v5, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc ; GFX9-NEXT: v_mul_lo_u32 v2, v1, s5 ; GFX9-NEXT: v_mul_hi_u32 v3, v0, s5 -; GFX9-NEXT: v_mul_lo_u32 v4, v0, s5 +; GFX9-NEXT: v_mul_lo_u32 v5, v0, s5 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, s2, v4 +; GFX9-NEXT: v_sub_co_u32_e32 v5, vcc, s2, v5 ; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v3, v2, vcc -; GFX9-NEXT: v_subrev_co_u32_e32 v3, vcc, s5, v4 +; GFX9-NEXT: v_subrev_co_u32_e32 v3, vcc, s5, v5 ; GFX9-NEXT: v_subbrev_co_u32_e32 v6, vcc, 0, v2, vcc ; GFX9-NEXT: s_mov_b32 s2, 0x12d8fa ; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s2, v3 @@ -11004,10 +10972,10 @@ ; GFX9-NEXT: v_cndmask_b32_e64 v3, 1, 2, vcc ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v0, v3 ; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v1, vcc -; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s2, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc +; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s2, v5 +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, -1, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v2, -1, v5, vcc ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc @@ -11016,7 +10984,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s4, v0 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v2, vcc -; GFX9-NEXT: global_store_dwordx2 v5, v[0:1], s[0:1] +; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX90A-LABEL: sdiv_i64_oddk_denom: @@ -11025,8 +10993,8 @@ ; GFX90A-NEXT: v_madak_f32 v0, 0, v0, 0x4996c7d8 ; GFX90A-NEXT: v_rcp_f32_e32 v0, v0 ; GFX90A-NEXT: s_mov_b32 s4, 0xffed2705 -; GFX90A-NEXT: 
v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX90A-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 @@ -11048,7 +11016,7 @@ ; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6 ; GFX90A-NEXT: v_mul_hi_u32 v7, v1, v3 ; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v8, vcc -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v2, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc ; GFX90A-NEXT: v_mul_lo_u32 v3, v1, v3 ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 ; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc @@ -11069,7 +11037,7 @@ ; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v7 ; GFX90A-NEXT: v_mul_hi_u32 v4, v1, v3 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v8, v6, vcc -; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v2, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc ; GFX90A-NEXT: v_mul_lo_u32 v3, v1, v3 ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v5, v3 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) @@ -11091,7 +11059,7 @@ ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 ; GFX90A-NEXT: v_mul_hi_u32 v5, s3, v1 ; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v6, vcc -; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v2, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc ; GFX90A-NEXT: v_mul_lo_u32 v1, s3, v1 ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc @@ -11246,48 +11214,47 @@ ; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[12:13] ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc -; GFX6-NEXT: v_mov_b32_e32 v4, 0 -; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v7, v4, vcc +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc ; 
GFX6-NEXT: v_mul_lo_u32 v2, s4, v1 ; GFX6-NEXT: v_mul_hi_u32 v3, s4, v0 -; GFX6-NEXT: v_mul_lo_u32 v5, s5, v0 +; GFX6-NEXT: v_mul_lo_u32 v4, s5, v0 ; GFX6-NEXT: s_mov_b32 s5, s1 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_mul_lo_u32 v3, s4, v0 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; GFX6-NEXT: v_mul_lo_u32 v7, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v8, v0, v3 -; GFX6-NEXT: v_mul_hi_u32 v9, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v6, v1, v3 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2 +; GFX6-NEXT: v_mul_hi_u32 v7, v0, v3 +; GFX6-NEXT: v_mul_hi_u32 v8, v0, v2 +; GFX6-NEXT: v_mul_hi_u32 v5, v1, v3 ; GFX6-NEXT: v_mul_lo_u32 v3, v1, v3 -; GFX6-NEXT: v_mul_hi_u32 v5, v1, v2 -; GFX6-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GFX6-NEXT: v_addc_u32_e32 v8, vcc, 0, v9, vcc +; GFX6-NEXT: v_mul_hi_u32 v4, v1, v2 +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc ; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v6, vcc -; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v5, v4, vcc +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v6, v3 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc ; GFX6-NEXT: v_mul_lo_u32 v2, s2, v1 ; GFX6-NEXT: v_mul_hi_u32 v3, s2, v0 -; GFX6-NEXT: v_mul_hi_u32 v5, s2, v1 -; GFX6-NEXT: v_mul_hi_u32 v6, s3, v1 +; GFX6-NEXT: v_mul_hi_u32 v4, s2, v1 +; GFX6-NEXT: v_mul_hi_u32 v5, s3, v1 ; GFX6-NEXT: v_mul_lo_u32 v1, s3, v1 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc -; GFX6-NEXT: v_mul_lo_u32 v5, s3, v0 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GFX6-NEXT: v_mul_lo_u32 v4, s3, v0 ; GFX6-NEXT: 
v_mul_hi_u32 v0, s3, v0 ; GFX6-NEXT: s_mov_b32 s4, s0 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc -; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v6, v4, vcc +; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc ; GFX6-NEXT: v_mul_lo_u32 v2, s10, v1 @@ -11339,7 +11306,6 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x34 ; GFX9-NEXT: s_mov_b64 s[2:3], 0x1000 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[4:5], s[2:3], s4 ; GFX9-NEXT: s_ashr_i32 s2, s5, 31 @@ -11359,111 +11325,112 @@ ; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_mul_lo_u32 v3, s10, v1 -; GFX9-NEXT: v_mul_hi_u32 v4, s10, v0 -; GFX9-NEXT: v_mul_lo_u32 v6, s4, v0 -; GFX9-NEXT: v_mul_lo_u32 v5, s10, v0 -; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v6 -; GFX9-NEXT: v_mul_hi_u32 v4, v0, v5 -; GFX9-NEXT: v_mul_lo_u32 v6, v0, v3 -; GFX9-NEXT: v_mul_hi_u32 v8, v0, v3 -; GFX9-NEXT: v_mul_hi_u32 v7, v1, v5 -; GFX9-NEXT: v_mul_lo_u32 v5, v1, v5 -; GFX9-NEXT: v_mul_hi_u32 v9, v1, v3 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v8, vcc -; GFX9-NEXT: v_mul_lo_u32 v3, v1, v3 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v6, v7, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v9, v2, vcc -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc -; GFX9-NEXT: v_mul_lo_u32 v3, s10, v1 -; GFX9-NEXT: v_mul_hi_u32 v4, s10, v0 +; GFX9-NEXT: v_mul_lo_u32 v2, s10, v1 +; GFX9-NEXT: v_mul_hi_u32 v3, s10, v0 ; GFX9-NEXT: v_mul_lo_u32 v5, s4, v0 -; GFX9-NEXT: 
v_mul_lo_u32 v6, s10, v0 +; GFX9-NEXT: v_mul_lo_u32 v4, s10, v0 +; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 +; GFX9-NEXT: v_mul_hi_u32 v3, v0, v4 +; GFX9-NEXT: v_mul_lo_u32 v5, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v7, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v6, v1, v4 +; GFX9-NEXT: v_mul_lo_u32 v4, v1, v4 +; GFX9-NEXT: v_mul_hi_u32 v8, v1, v2 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc +; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v6, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v8, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX9-NEXT: v_mul_lo_u32 v2, s10, v1 +; GFX9-NEXT: v_mul_hi_u32 v3, s10, v0 +; GFX9-NEXT: v_mul_lo_u32 v4, s4, v0 +; GFX9-NEXT: v_mul_lo_u32 v5, s10, v0 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v5 -; GFX9-NEXT: v_mul_lo_u32 v7, v0, v3 -; GFX9-NEXT: v_mul_hi_u32 v8, v0, v6 -; GFX9-NEXT: v_mul_hi_u32 v9, v0, v3 -; GFX9-NEXT: v_mul_hi_u32 v5, v1, v6 -; GFX9-NEXT: v_mul_lo_u32 v6, v1, v6 -; GFX9-NEXT: v_mul_hi_u32 v4, v1, v3 -; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v8, v7 -; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v9, vcc -; GFX9-NEXT: v_mul_lo_u32 v3, v1, v3 +; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 +; GFX9-NEXT: v_mul_lo_u32 v6, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v7, v0, v5 +; GFX9-NEXT: v_mul_hi_u32 v8, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v4, v1, v5 +; GFX9-NEXT: v_mul_lo_u32 v5, v1, v5 +; GFX9-NEXT: v_mul_hi_u32 v3, v1, v2 ; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v7, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v8, v5, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v2, vcc -; GFX9-NEXT: v_add_co_u32_e32 v3, 
vcc, v5, v3 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v8, vcc +; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v6, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v4, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v4, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s10, s7, 31 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_add_u32 s0, s6, s10 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 ; GFX9-NEXT: s_mov_b32 s11, s10 ; GFX9-NEXT: s_addc_u32 s1, s7, s10 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc ; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], s[10:11] -; GFX9-NEXT: v_mul_lo_u32 v3, s6, v1 -; GFX9-NEXT: v_mul_hi_u32 v4, s6, v0 -; GFX9-NEXT: v_mul_hi_u32 v5, s6, v1 -; GFX9-NEXT: v_mul_hi_u32 v6, s7, v1 +; GFX9-NEXT: v_mul_lo_u32 v2, s6, v1 +; GFX9-NEXT: v_mul_hi_u32 v3, s6, v0 +; GFX9-NEXT: v_mul_hi_u32 v4, s6, v1 +; GFX9-NEXT: v_mul_hi_u32 v5, s7, v1 ; GFX9-NEXT: v_mul_lo_u32 v1, s7, v1 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc -; GFX9-NEXT: v_mul_lo_u32 v5, s7, v0 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc +; GFX9-NEXT: v_mul_lo_u32 v4, s7, v0 ; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 -; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v4, v0, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v2, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc -; GFX9-NEXT: v_mul_lo_u32 v3, s8, v1 -; GFX9-NEXT: v_mul_hi_u32 v4, s8, v0 -; GFX9-NEXT: v_mul_lo_u32 v5, s9, v0 ; GFX9-NEXT: v_mov_b32_e32 v6, s9 -; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 -; GFX9-NEXT: v_mul_lo_u32 v4, s8, v0 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v5 -; 
GFX9-NEXT: v_sub_u32_e32 v5, s7, v3 -; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, s6, v4 -; GFX9-NEXT: v_subb_co_u32_e64 v5, s[0:1], v5, v6, vcc -; GFX9-NEXT: v_subrev_co_u32_e64 v6, s[0:1], s8, v4 -; GFX9-NEXT: v_subbrev_co_u32_e64 v5, s[0:1], 0, v5, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v5 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc +; GFX9-NEXT: v_mul_lo_u32 v2, s8, v1 +; GFX9-NEXT: v_mul_hi_u32 v3, s8, v0 +; GFX9-NEXT: v_mul_lo_u32 v4, s9, v0 +; GFX9-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-NEXT: v_mul_lo_u32 v3, s8, v0 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 +; GFX9-NEXT: v_sub_u32_e32 v4, s7, v2 +; GFX9-NEXT: v_sub_co_u32_e32 v3, vcc, s6, v3 +; GFX9-NEXT: v_subb_co_u32_e64 v4, s[0:1], v4, v6, vcc +; GFX9-NEXT: v_subrev_co_u32_e64 v6, s[0:1], s8, v3 +; GFX9-NEXT: v_subbrev_co_u32_e64 v4, s[0:1], 0, v4, s[0:1] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v6 ; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v5, v7, v6, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v4, v7, v6, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v7, s7 -; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v7, v3, vcc -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v3 -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v5 +; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v7, v2, vcc +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v2 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 1, 2, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s9, v3 -; 
GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], v0, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v4, vcc +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 1, 2, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s9, v2 +; GFX9-NEXT: v_add_co_u32_e64 v4, s[0:1], v0, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc ; GFX9-NEXT: v_addc_co_u32_e64 v6, s[0:1], 0, v1, s[0:1] -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX9-NEXT: s_xor_b64 s[0:1], s[10:11], s[2:3] ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, s0, v0 ; GFX9-NEXT: v_xor_b32_e32 v1, s1, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s0, v0 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v2, vcc +; GFX9-NEXT: global_store_dwordx2 v5, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX90A-LABEL: sdiv_i64_pow2_shl_denom: @@ -11510,7 +11477,7 @@ ; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6 ; GFX90A-NEXT: v_mul_hi_u32 v7, v1, v3 ; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v8, vcc -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v2, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc ; GFX90A-NEXT: v_mul_lo_u32 v3, v1, v3 ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 ; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc @@ -11532,7 +11499,7 @@ ; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v7 ; GFX90A-NEXT: v_mul_hi_u32 v4, v1, v3 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v8, v6, vcc -; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v2, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc ; GFX90A-NEXT: v_mul_lo_u32 v3, v1, v3 ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v5, v3 ; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, 
0, v4, vcc @@ -11551,7 +11518,7 @@ ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 ; GFX90A-NEXT: v_mul_hi_u32 v5, s7, v1 ; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v6, vcc -; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v2, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc ; GFX90A-NEXT: v_mul_lo_u32 v1, s7, v1 ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc @@ -11738,10 +11705,9 @@ ; GFX6-NEXT: s_add_u32 s0, s2, s10 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc -; GFX6-NEXT: v_mov_b32_e32 v4, 0 -; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v7, v4, vcc +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc ; GFX6-NEXT: v_mul_lo_u32 v2, v1, s6 @@ -11752,36 +11718,36 @@ ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_mul_lo_u32 v3, v0, s6 ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 -; GFX6-NEXT: v_mul_lo_u32 v7, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v8, v0, v3 -; GFX6-NEXT: v_mul_hi_u32 v9, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v6, v1, v3 +; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2 +; GFX6-NEXT: v_mul_hi_u32 v7, v0, v3 +; GFX6-NEXT: v_mul_hi_u32 v8, v0, v2 +; GFX6-NEXT: v_mul_hi_u32 v5, v1, v3 ; GFX6-NEXT: v_mul_lo_u32 v3, v1, v3 -; GFX6-NEXT: v_mul_hi_u32 v5, v1, v2 -; GFX6-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GFX6-NEXT: v_addc_u32_e32 v8, vcc, 0, v9, vcc +; GFX6-NEXT: v_mul_hi_u32 v4, v1, v2 +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc ; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v6, vcc -; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v5, v4, vcc +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v6, v3 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 
v7, v5, vcc +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc ; GFX6-NEXT: v_mul_lo_u32 v2, s0, v1 ; GFX6-NEXT: v_mul_hi_u32 v3, s0, v0 -; GFX6-NEXT: v_mul_hi_u32 v5, s0, v1 -; GFX6-NEXT: v_mul_hi_u32 v6, s1, v1 +; GFX6-NEXT: v_mul_hi_u32 v4, s0, v1 +; GFX6-NEXT: v_mul_hi_u32 v5, s1, v1 ; GFX6-NEXT: v_mul_lo_u32 v1, s1, v1 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc -; GFX6-NEXT: v_mul_lo_u32 v5, s1, v0 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GFX6-NEXT: v_mul_lo_u32 v4, s1, v0 ; GFX6-NEXT: v_mul_hi_u32 v0, s1, v0 ; GFX6-NEXT: s_movk_i32 s2, 0xfff ; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc -; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v6, v4, vcc +; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc ; GFX6-NEXT: v_mul_lo_u32 v4, v1, s2 @@ -11829,7 +11795,6 @@ ; GFX9-NEXT: v_mac_f32_e32 v0, 0, v1 ; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: s_movk_i32 s8, 0xf001 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 @@ -11840,51 +11805,51 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s2, s5, 31 ; GFX9-NEXT: s_lshr_b32 s2, s2, 20 -; GFX9-NEXT: v_mul_hi_u32 v2, v0, s8 -; GFX9-NEXT: v_mul_lo_u32 v3, v1, s8 -; GFX9-NEXT: v_mul_lo_u32 v5, v0, s8 +; GFX9-NEXT: v_mul_hi_u32 v3, v0, s8 +; GFX9-NEXT: v_mul_lo_u32 v2, v1, s8 +; GFX9-NEXT: v_mul_lo_u32 v4, v0, s8 ; GFX9-NEXT: s_add_u32 s2, s4, s2 ; GFX9-NEXT: s_addc_u32 s3, s5, 0 -; GFX9-NEXT: v_add_u32_e32 
v2, v2, v3 +; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 ; GFX9-NEXT: v_sub_u32_e32 v2, v2, v0 ; GFX9-NEXT: v_mul_lo_u32 v3, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v6, v0, v5 -; GFX9-NEXT: v_mul_hi_u32 v7, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v8, v1, v2 +; GFX9-NEXT: v_mul_hi_u32 v5, v0, v4 +; GFX9-NEXT: v_mul_hi_u32 v6, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v7, v1, v2 ; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v6, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v7, vcc -; GFX9-NEXT: v_mul_lo_u32 v7, v1, v5 -; GFX9-NEXT: v_mul_hi_u32 v5, v1, v5 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v5, v3 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v6, vcc +; GFX9-NEXT: v_mul_lo_u32 v6, v1, v4 +; GFX9-NEXT: v_mul_hi_u32 v4, v1, v4 ; GFX9-NEXT: s_ashr_i64 s[2:3], s[2:3], 12 ; GFX9-NEXT: s_ashr_i32 s4, s7, 31 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v5, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v8, v4, vcc +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v4, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v7, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc ; GFX9-NEXT: v_mul_lo_u32 v2, v1, s8 ; GFX9-NEXT: v_mul_hi_u32 v3, v0, s8 -; GFX9-NEXT: v_mul_lo_u32 v5, v0, s8 +; GFX9-NEXT: v_mul_lo_u32 v4, v0, s8 ; GFX9-NEXT: s_add_u32 s6, s6, s4 ; GFX9-NEXT: s_mov_b32 s5, s4 ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 ; GFX9-NEXT: v_sub_u32_e32 v2, v2, v0 -; GFX9-NEXT: v_mul_lo_u32 v7, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v8, v0, v5 -; GFX9-NEXT: v_mul_hi_u32 v9, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v6, v1, v5 -; GFX9-NEXT: v_mul_lo_u32 v5, v1, v5 +; GFX9-NEXT: v_mul_lo_u32 v6, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v7, v0, v4 +; GFX9-NEXT: v_mul_hi_u32 v8, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v5, v1, 
v4 +; GFX9-NEXT: v_mul_lo_u32 v4, v1, v4 ; GFX9-NEXT: v_mul_hi_u32 v3, v1, v2 -; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v8, v7 -; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v7, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v8, vcc ; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v8, v6, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v4, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v5, v2 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v6, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v5, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v4, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 ; GFX9-NEXT: s_addc_u32 s7, s7, s4 @@ -11892,23 +11857,24 @@ ; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5] ; GFX9-NEXT: v_mul_lo_u32 v2, s6, v1 ; GFX9-NEXT: v_mul_hi_u32 v3, s6, v0 -; GFX9-NEXT: v_mul_hi_u32 v5, s6, v1 -; GFX9-NEXT: v_mul_hi_u32 v6, s7, v1 +; GFX9-NEXT: v_mul_hi_u32 v4, s6, v1 +; GFX9-NEXT: v_mul_hi_u32 v5, s7, v1 ; GFX9-NEXT: v_mul_lo_u32 v1, s7, v1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc -; GFX9-NEXT: v_mul_lo_u32 v5, s7, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc +; GFX9-NEXT: v_mul_lo_u32 v4, s7, v0 ; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 ; GFX9-NEXT: s_movk_i32 s5, 0xfff ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v6, v4, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v5, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc ; GFX9-NEXT: v_mul_lo_u32 v2, v1, s5 ; GFX9-NEXT: v_mul_hi_u32 v3, v0, s5 ; GFX9-NEXT: v_mul_lo_u32 v5, v0, s5 +; GFX9-NEXT: 
v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: v_sub_co_u32_e32 v5, vcc, s6, v5 @@ -11977,7 +11943,7 @@ ; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6 ; GFX90A-NEXT: v_mul_hi_u32 v7, v1, v2 ; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v8, vcc -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v4, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc ; GFX90A-NEXT: v_mul_lo_u32 v2, v1, v2 ; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 ; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc @@ -11999,7 +11965,7 @@ ; GFX90A-NEXT: v_mul_hi_u32 v3, v1, v2 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v8, v6, vcc ; GFX90A-NEXT: s_addc_u32 s1, s5, 0 -; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v4, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX90A-NEXT: v_mul_lo_u32 v2, v1, v2 ; GFX90A-NEXT: s_ashr_i64 s[0:1], s[0:1], 12 ; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v5, v2 @@ -12021,7 +11987,7 @@ ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 ; GFX90A-NEXT: v_mul_hi_u32 v5, s7, v1 ; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, v2, v6, vcc -; GFX90A-NEXT: v_addc_co_u32_e32 v2, vcc, v5, v4, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v5, vcc ; GFX90A-NEXT: v_mul_lo_u32 v1, s7, v1 ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc @@ -12108,204 +12074,203 @@ ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 ; GFX6-NEXT: v_mac_f32_e32 v0, s21, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v0 +; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_ashr_i32 s14, s1, 31 ; GFX6-NEXT: s_add_u32 s0, s0, s14 -; GFX6-NEXT: v_mul_lo_u32 v0, s6, v1 -; GFX6-NEXT: v_mul_hi_u32 v3, s6, v2 -; GFX6-NEXT: v_mul_lo_u32 v4, s7, v2 -; GFX6-NEXT: v_mul_lo_u32 v5, s6, v2 +; GFX6-NEXT: v_mul_lo_u32 v2, s6, v1 +; GFX6-NEXT: v_mul_hi_u32 v3, s6, v0 +; GFX6-NEXT: v_mul_lo_u32 v4, s7, v0 +; GFX6-NEXT: v_mul_lo_u32 v5, s6, v0 ; 
GFX6-NEXT: s_mov_b32 s15, s14 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v3, v0 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v0, v4 -; GFX6-NEXT: v_mul_lo_u32 v0, v2, v3 -; GFX6-NEXT: v_mul_hi_u32 v4, v2, v5 -; GFX6-NEXT: v_mul_hi_u32 v6, v2, v3 -; GFX6-NEXT: v_mul_hi_u32 v7, v1, v3 -; GFX6-NEXT: v_mul_lo_u32 v3, v1, v3 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v4, v0 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GFX6-NEXT: v_mul_lo_u32 v3, v0, v2 +; GFX6-NEXT: v_mul_hi_u32 v4, v0, v5 +; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 +; GFX6-NEXT: v_mul_hi_u32 v7, v1, v2 +; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v6, vcc ; GFX6-NEXT: v_mul_lo_u32 v6, v1, v5 ; GFX6-NEXT: v_mul_hi_u32 v5, v1, v5 ; GFX6-NEXT: s_addc_u32 s1, s1, s14 ; GFX6-NEXT: s_xor_b64 s[16:17], s[0:1], s[14:15] -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v4, v5, vcc -; GFX6-NEXT: v_mov_b32_e32 v0, 0 -; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v7, v0, vcc -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc -; GFX6-NEXT: v_mul_lo_u32 v3, s6, v1 -; GFX6-NEXT: v_mul_hi_u32 v4, s6, v2 -; GFX6-NEXT: v_mul_lo_u32 v5, s7, v2 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v4, v5, vcc +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GFX6-NEXT: v_mul_lo_u32 v2, s6, v1 +; GFX6-NEXT: v_mul_hi_u32 v3, s6, v0 +; GFX6-NEXT: v_mul_lo_u32 v4, s7, v0 ; GFX6-NEXT: s_xor_b64 s[14:15], s[14:15], s[12:13] ; GFX6-NEXT: s_ashr_i32 s12, s9, 31 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GFX6-NEXT: v_mul_lo_u32 v4, s6, v2 -; GFX6-NEXT: 
v_add_i32_e32 v3, vcc, v5, v3 -; GFX6-NEXT: v_mul_lo_u32 v7, v2, v3 -; GFX6-NEXT: v_mul_hi_u32 v8, v2, v4 -; GFX6-NEXT: v_mul_hi_u32 v9, v2, v3 -; GFX6-NEXT: v_mul_hi_u32 v6, v1, v4 -; GFX6-NEXT: v_mul_lo_u32 v4, v1, v4 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GFX6-NEXT: v_mul_lo_u32 v3, s6, v0 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2 +; GFX6-NEXT: v_mul_hi_u32 v7, v0, v3 +; GFX6-NEXT: v_mul_hi_u32 v8, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v5, v1, v3 -; GFX6-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GFX6-NEXT: v_addc_u32_e32 v8, vcc, 0, v9, vcc ; GFX6-NEXT: v_mul_lo_u32 v3, v1, v3 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v8, v6, vcc -; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v5, v0, vcc -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc -; GFX6-NEXT: v_mul_lo_u32 v3, s16, v1 -; GFX6-NEXT: v_mul_hi_u32 v4, s16, v2 -; GFX6-NEXT: v_mul_hi_u32 v5, s16, v1 -; GFX6-NEXT: v_mul_hi_u32 v6, s17, v1 +; GFX6-NEXT: v_mul_hi_u32 v4, v1, v2 +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc +; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v6, v3 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GFX6-NEXT: v_mul_lo_u32 v2, s16, v1 +; GFX6-NEXT: v_mul_hi_u32 v3, s16, v0 +; GFX6-NEXT: v_mul_hi_u32 v4, s16, v1 +; GFX6-NEXT: v_mul_hi_u32 v5, s17, v1 ; GFX6-NEXT: v_mul_lo_u32 v1, s17, v1 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc -; GFX6-NEXT: v_mul_lo_u32 v5, s17, v2 -; GFX6-NEXT: v_mul_hi_u32 v2, s17, v2 +; GFX6-NEXT: 
v_add_i32_e32 v2, vcc, v3, v2 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GFX6-NEXT: v_mul_lo_u32 v4, s17, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s17, v0 ; GFX6-NEXT: s_add_u32 s8, s8, s12 ; GFX6-NEXT: s_mov_b32 s13, s12 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v4, v2, vcc -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v6, v0, vcc -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 -; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc -; GFX6-NEXT: v_mul_lo_u32 v3, s10, v2 -; GFX6-NEXT: v_mul_hi_u32 v4, s10, v1 -; GFX6-NEXT: v_mul_lo_u32 v5, s11, v1 -; GFX6-NEXT: v_mov_b32_e32 v6, s11 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc +; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc +; GFX6-NEXT: v_mul_lo_u32 v2, s10, v1 +; GFX6-NEXT: v_mul_hi_u32 v3, s10, v0 +; GFX6-NEXT: v_mul_lo_u32 v4, s11, v0 +; GFX6-NEXT: v_mov_b32_e32 v5, s11 ; GFX6-NEXT: s_addc_u32 s9, s9, s12 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GFX6-NEXT: v_mul_lo_u32 v4, s10, v1 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s17, v3 -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s16, v4 -; GFX6-NEXT: v_subb_u32_e64 v5, s[0:1], v5, v6, vcc -; GFX6-NEXT: v_subrev_i32_e64 v6, s[0:1], s10, v4 -; GFX6-NEXT: v_subbrev_u32_e64 v5, s[0:1], 0, v5, s[0:1] -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v5 -; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v6 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GFX6-NEXT: v_mul_lo_u32 v3, s10, v0 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s17, v2 +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s16, v3 +; GFX6-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc +; GFX6-NEXT: v_subrev_i32_e64 v5, s[0:1], s10, v3 +; GFX6-NEXT: v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1] +; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v4 ; 
GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] -; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v5 -; GFX6-NEXT: v_cndmask_b32_e64 v5, v7, v6, s[0:1] -; GFX6-NEXT: v_add_i32_e64 v6, s[0:1], 2, v1 -; GFX6-NEXT: v_addc_u32_e64 v7, s[0:1], 0, v2, s[0:1] -; GFX6-NEXT: v_add_i32_e64 v8, s[0:1], 1, v1 -; GFX6-NEXT: v_addc_u32_e64 v9, s[0:1], 0, v2, s[0:1] -; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v5 +; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v5 +; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] +; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v4 +; GFX6-NEXT: v_cndmask_b32_e64 v4, v6, v5, s[0:1] +; GFX6-NEXT: v_add_i32_e64 v5, s[0:1], 2, v0 +; GFX6-NEXT: v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1] +; GFX6-NEXT: v_add_i32_e64 v7, s[0:1], 1, v0 +; GFX6-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1] +; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 ; GFX6-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] -; GFX6-NEXT: v_cndmask_b32_e64 v5, v9, v7, s[0:1] -; GFX6-NEXT: v_mov_b32_e32 v7, s17 -; GFX6-NEXT: v_cvt_f32_u32_e32 v9, s8 -; GFX6-NEXT: v_cvt_f32_u32_e32 v10, s9 -; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v7, v3, vcc -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 -; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s10, v4 -; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s11, v3 -; GFX6-NEXT: v_mac_f32_e32 v9, s18, v10 -; GFX6-NEXT: v_cndmask_b32_e32 v3, v7, v4, vcc -; GFX6-NEXT: v_rcp_f32_e32 v4, v9 -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v3, v8, v6, s[0:1] -; GFX6-NEXT: v_mul_f32_e32 v4, s19, v4 -; GFX6-NEXT: v_mul_f32_e32 v5, s20, v4 -; GFX6-NEXT: v_trunc_f32_e32 v5, v5 -; GFX6-NEXT: v_mac_f32_e32 v4, s21, v5 -; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_u32_f32_e32 v5, v5 -; GFX6-NEXT: s_sub_u32 s0, 0, s8 -; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX6-NEXT: v_mul_hi_u32 v3, s0, v4 -; GFX6-NEXT: v_mul_lo_u32 v6, s0, v5 -; GFX6-NEXT: 
s_subb_u32 s1, 0, s9 -; GFX6-NEXT: v_mul_lo_u32 v7, s1, v4 -; GFX6-NEXT: s_ashr_i32 s10, s3, 31 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GFX6-NEXT: v_mul_lo_u32 v6, s0, v4 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v7 -; GFX6-NEXT: v_mul_lo_u32 v7, v4, v3 -; GFX6-NEXT: v_mul_hi_u32 v8, v4, v6 -; GFX6-NEXT: v_mul_hi_u32 v9, v4, v3 -; GFX6-NEXT: v_mul_hi_u32 v10, v5, v3 -; GFX6-NEXT: v_mul_lo_u32 v3, v5, v3 -; GFX6-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GFX6-NEXT: v_addc_u32_e32 v8, vcc, 0, v9, vcc -; GFX6-NEXT: v_mul_lo_u32 v9, v5, v6 -; GFX6-NEXT: v_mul_hi_u32 v6, v5, v6 -; GFX6-NEXT: s_mov_b32 s11, s10 -; GFX6-NEXT: v_xor_b32_e32 v1, s14, v1 -; GFX6-NEXT: v_add_i32_e32 v7, vcc, v7, v9 -; GFX6-NEXT: v_addc_u32_e32 v6, vcc, v8, v6, vcc -; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v10, v0, vcc -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v6, v3 -; GFX6-NEXT: v_addc_u32_e32 v6, vcc, 0, v7, vcc -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v5, v6, vcc -; GFX6-NEXT: v_mul_lo_u32 v5, s0, v4 -; GFX6-NEXT: v_mul_hi_u32 v6, s0, v3 -; GFX6-NEXT: v_mul_lo_u32 v7, s1, v3 -; GFX6-NEXT: v_xor_b32_e32 v2, s15, v2 +; GFX6-NEXT: v_cndmask_b32_e64 v4, v8, v6, s[0:1] +; GFX6-NEXT: v_mov_b32_e32 v6, s17 +; GFX6-NEXT: v_cvt_f32_u32_e32 v8, s8 +; GFX6-NEXT: v_cvt_f32_u32_e32 v9, s9 +; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v6, v2, vcc +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s11, v2 +; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s10, v3 +; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s11, v2 +; GFX6-NEXT: v_mac_f32_e32 v8, s18, v9 +; GFX6-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc +; GFX6-NEXT: v_rcp_f32_e32 v3, v8 +; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v2, v7, v5, s[0:1] +; GFX6-NEXT: v_mul_f32_e32 v3, s19, v3 +; GFX6-NEXT: v_mul_f32_e32 v4, s20, v3 +; GFX6-NEXT: v_trunc_f32_e32 v4, v4 +; GFX6-NEXT: v_mac_f32_e32 v3, s21, 
v4 +; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 +; GFX6-NEXT: s_sub_u32 s0, 0, s8 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX6-NEXT: v_mul_hi_u32 v2, s0, v3 +; GFX6-NEXT: v_mul_lo_u32 v5, s0, v4 +; GFX6-NEXT: s_subb_u32 s1, 0, s9 +; GFX6-NEXT: v_mul_lo_u32 v6, s1, v3 +; GFX6-NEXT: s_ashr_i32 s10, s3, 31 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; GFX6-NEXT: v_mul_lo_u32 v5, s0, v3 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; GFX6-NEXT: v_mul_lo_u32 v6, v3, v2 +; GFX6-NEXT: v_mul_hi_u32 v7, v3, v5 +; GFX6-NEXT: v_mul_hi_u32 v8, v3, v2 +; GFX6-NEXT: v_mul_hi_u32 v9, v4, v2 +; GFX6-NEXT: v_mul_lo_u32 v2, v4, v2 +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc +; GFX6-NEXT: v_mul_lo_u32 v8, v4, v5 +; GFX6-NEXT: v_mul_hi_u32 v5, v4, v5 +; GFX6-NEXT: s_mov_b32 s11, s10 +; GFX6-NEXT: v_xor_b32_e32 v0, s14, v0 +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v7, v5, vcc +; GFX6-NEXT: v_addc_u32_e32 v6, vcc, 0, v9, vcc +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v5, v2 +; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v4, v5, vcc +; GFX6-NEXT: v_mul_lo_u32 v4, s0, v3 +; GFX6-NEXT: v_mul_hi_u32 v5, s0, v2 +; GFX6-NEXT: v_mul_lo_u32 v6, s1, v2 +; GFX6-NEXT: v_xor_b32_e32 v1, s15, v1 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GFX6-NEXT: v_mul_lo_u32 v6, s0, v3 -; GFX6-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; GFX6-NEXT: v_mul_lo_u32 v9, v3, v5 -; GFX6-NEXT: v_mul_hi_u32 v10, v3, v6 -; GFX6-NEXT: v_mul_hi_u32 v11, v3, v5 -; GFX6-NEXT: v_mul_hi_u32 v8, v4, v6 -; GFX6-NEXT: v_mul_lo_u32 v6, v4, v6 -; GFX6-NEXT: v_mul_hi_u32 v7, v4, v5 -; GFX6-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; GFX6-NEXT: v_addc_u32_e32 v10, vcc, 0, v11, vcc -; GFX6-NEXT: v_mul_lo_u32 v5, v4, v5 -; GFX6-NEXT: v_add_i32_e32 v6, vcc, v9, v6 -; GFX6-NEXT: v_addc_u32_e32 v6, vcc, 
v10, v8, vcc -; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v7, v0, vcc -; GFX6-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GFX6-NEXT: v_addc_u32_e32 v6, vcc, 0, v7, vcc +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; GFX6-NEXT: v_mul_lo_u32 v5, s0, v2 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; GFX6-NEXT: v_mul_lo_u32 v8, v2, v4 +; GFX6-NEXT: v_mul_hi_u32 v9, v2, v5 +; GFX6-NEXT: v_mul_hi_u32 v10, v2, v4 +; GFX6-NEXT: v_mul_hi_u32 v7, v3, v5 +; GFX6-NEXT: v_mul_lo_u32 v5, v3, v5 +; GFX6-NEXT: v_mul_hi_u32 v6, v3, v4 +; GFX6-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GFX6-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc +; GFX6-NEXT: v_mul_lo_u32 v4, v3, v4 +; GFX6-NEXT: v_add_i32_e32 v5, vcc, v8, v5 +; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v9, v7, vcc +; GFX6-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc ; GFX6-NEXT: s_add_u32 s0, s2, s10 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: s_addc_u32 s1, s3, s10 -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc ; GFX6-NEXT: s_xor_b64 s[2:3], s[0:1], s[10:11] -; GFX6-NEXT: v_mul_lo_u32 v5, s2, v4 -; GFX6-NEXT: v_mul_hi_u32 v6, s2, v3 -; GFX6-NEXT: v_mul_hi_u32 v8, s2, v4 -; GFX6-NEXT: v_mul_hi_u32 v9, s3, v4 -; GFX6-NEXT: v_mul_lo_u32 v4, s3, v4 -; GFX6-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GFX6-NEXT: v_addc_u32_e32 v6, vcc, 0, v8, vcc -; GFX6-NEXT: v_mul_lo_u32 v8, s3, v3 -; GFX6-NEXT: v_mul_hi_u32 v3, s3, v3 -; GFX6-NEXT: v_mov_b32_e32 v7, s15 +; GFX6-NEXT: v_mul_lo_u32 v4, s2, v3 +; GFX6-NEXT: v_mul_hi_u32 v5, s2, v2 +; GFX6-NEXT: v_mul_hi_u32 v7, s2, v3 +; GFX6-NEXT: v_mul_hi_u32 v8, s3, v3 +; GFX6-NEXT: v_mul_lo_u32 v3, s3, v3 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; GFX6-NEXT: v_mul_lo_u32 v7, s3, v2 +; GFX6-NEXT: v_mul_hi_u32 v2, s3, v2 +; GFX6-NEXT: v_mov_b32_e32 v6, s15 ; GFX6-NEXT: s_mov_b32 
s6, -1 -; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v6, v3, vcc -; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v9, v0, vcc -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v0, vcc -; GFX6-NEXT: v_mul_lo_u32 v5, s8, v4 -; GFX6-NEXT: v_mul_hi_u32 v6, s8, v3 -; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s14, v1 -; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v2, v7, vcc -; GFX6-NEXT: v_mul_lo_u32 v2, s9, v3 -; GFX6-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v5, v2, vcc +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GFX6-NEXT: v_mul_lo_u32 v4, s8, v3 +; GFX6-NEXT: v_mul_hi_u32 v5, s8, v2 +; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s14, v0 +; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc +; GFX6-NEXT: v_mul_lo_u32 v6, s9, v2 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; GFX6-NEXT: v_mul_lo_u32 v5, s8, v2 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 +; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s3, v4 ; GFX6-NEXT: v_mov_b32_e32 v7, s9 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; GFX6-NEXT: v_mul_lo_u32 v5, s8, v3 -; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s3, v2 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s2, v5 ; GFX6-NEXT: v_subb_u32_e64 v6, s[0:1], v6, v7, vcc ; GFX6-NEXT: v_subrev_i32_e64 v7, s[0:1], s8, v5 @@ -12316,30 +12281,30 @@ ; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] ; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v6 ; GFX6-NEXT: v_cndmask_b32_e64 v6, v8, v7, s[0:1] -; GFX6-NEXT: v_add_i32_e64 v7, s[0:1], 2, v3 -; GFX6-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v4, s[0:1] -; GFX6-NEXT: v_add_i32_e64 v9, s[0:1], 1, v3 -; GFX6-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v4, s[0:1] +; GFX6-NEXT: v_add_i32_e64 v7, s[0:1], 2, v2 +; GFX6-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v3, s[0:1] +; GFX6-NEXT: v_add_i32_e64 v9, s[0:1], 1, v2 +; GFX6-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v3, s[0:1] ; 
GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 ; GFX6-NEXT: v_cndmask_b32_e64 v6, v10, v8, s[0:1] ; GFX6-NEXT: v_mov_b32_e32 v8, s3 -; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v8, v2, vcc -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v2 +; GFX6-NEXT: v_subb_u32_e32 v4, vcc, v8, v4, vcc +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v4 ; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v5 ; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s9, v2 -; GFX6-NEXT: v_cndmask_b32_e32 v2, v8, v5, vcc -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GFX6-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s9, v4 +; GFX6-NEXT: v_cndmask_b32_e32 v4, v8, v5, vcc +; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; GFX6-NEXT: v_cndmask_b32_e64 v4, v9, v7, s[0:1] -; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX6-NEXT: s_xor_b64 s[0:1], s[10:11], s[12:13] -; GFX6-NEXT: v_xor_b32_e32 v3, s0, v3 -; GFX6-NEXT: v_xor_b32_e32 v4, s1, v2 -; GFX6-NEXT: v_mov_b32_e32 v5, s1 -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s0, v3 -; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v4, v5, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; GFX6-NEXT: v_xor_b32_e32 v2, s0, v2 +; GFX6-NEXT: v_xor_b32_e32 v3, s1, v3 +; GFX6-NEXT: v_mov_b32_e32 v4, s1 +; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s0, v2 +; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -12371,77 +12336,77 @@ ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 ; GFX9-NEXT: v_mac_f32_e32 v0, s19, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s14, s5, 31 ; GFX9-NEXT: s_mov_b32 s15, s14 -; GFX9-NEXT: v_mul_lo_u32 v0, s2, v1 -; GFX9-NEXT: v_mul_hi_u32 v3, s2, v2 -; GFX9-NEXT: v_mul_lo_u32 v5, s3, v2 -; GFX9-NEXT: v_mul_lo_u32 v4, s2, v2 -; GFX9-NEXT: 
v_add_u32_e32 v0, v3, v0 -; GFX9-NEXT: v_add_u32_e32 v5, v0, v5 -; GFX9-NEXT: v_mul_hi_u32 v3, v2, v4 -; GFX9-NEXT: v_mul_lo_u32 v6, v2, v5 -; GFX9-NEXT: v_mul_hi_u32 v7, v2, v5 -; GFX9-NEXT: v_mul_hi_u32 v8, v1, v5 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v7, vcc -; GFX9-NEXT: v_mul_lo_u32 v7, v1, v4 +; GFX9-NEXT: v_mul_lo_u32 v2, s2, v1 +; GFX9-NEXT: v_mul_hi_u32 v3, s2, v0 +; GFX9-NEXT: v_mul_lo_u32 v5, s3, v0 +; GFX9-NEXT: v_mul_lo_u32 v4, s2, v0 +; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 +; GFX9-NEXT: v_mul_hi_u32 v3, v0, v4 +; GFX9-NEXT: v_mul_lo_u32 v5, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v6, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v7, v1, v2 +; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v6, vcc +; GFX9-NEXT: v_mul_lo_u32 v6, v1, v4 ; GFX9-NEXT: v_mul_hi_u32 v4, v1, v4 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v4, vcc -; GFX9-NEXT: v_mul_lo_u32 v4, v1, v5 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v8, v0, vcc -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc -; GFX9-NEXT: v_mul_lo_u32 v3, s2, v1 -; GFX9-NEXT: v_mul_hi_u32 v4, s2, v2 -; GFX9-NEXT: v_mul_lo_u32 v5, s3, v2 -; GFX9-NEXT: v_mul_lo_u32 v6, s2, v2 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v4, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX9-NEXT: v_mul_lo_u32 v2, s2, v1 +; GFX9-NEXT: v_mul_hi_u32 v3, s2, v0 +; GFX9-NEXT: v_mul_lo_u32 v4, s3, v0 +; 
GFX9-NEXT: v_mul_lo_u32 v5, s2, v0 ; GFX9-NEXT: s_add_u32 s2, s4, s14 -; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v5 -; GFX9-NEXT: v_mul_lo_u32 v7, v2, v3 -; GFX9-NEXT: v_mul_hi_u32 v8, v2, v6 -; GFX9-NEXT: v_mul_hi_u32 v9, v2, v3 -; GFX9-NEXT: v_mul_hi_u32 v5, v1, v6 -; GFX9-NEXT: v_mul_lo_u32 v6, v1, v6 -; GFX9-NEXT: v_mul_hi_u32 v4, v1, v3 -; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v8, v7 -; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v9, vcc -; GFX9-NEXT: v_mul_lo_u32 v3, v1, v3 +; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 +; GFX9-NEXT: v_mul_lo_u32 v6, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v7, v0, v5 +; GFX9-NEXT: v_mul_hi_u32 v8, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v4, v1, v5 +; GFX9-NEXT: v_mul_lo_u32 v5, v1, v5 +; GFX9-NEXT: v_mul_hi_u32 v3, v1, v2 ; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v7, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v8, v5, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v0, vcc -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v5, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v8, vcc +; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v6, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v4, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v4, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 ; GFX9-NEXT: s_addc_u32 s3, s5, s14 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc ; GFX9-NEXT: s_xor_b64 s[4:5], s[2:3], s[14:15] -; GFX9-NEXT: v_mul_lo_u32 v3, s4, v1 -; GFX9-NEXT: v_mul_hi_u32 v4, s4, v2 -; GFX9-NEXT: v_mul_hi_u32 v5, s4, v1 -; GFX9-NEXT: v_mul_hi_u32 v6, s5, v1 +; GFX9-NEXT: v_mul_lo_u32 v2, s4, v1 +; GFX9-NEXT: v_mul_hi_u32 v3, s4, v0 +; GFX9-NEXT: v_mul_hi_u32 v4, s4, v1 +; GFX9-NEXT: v_mul_hi_u32 v5, s5, v1 
; GFX9-NEXT: v_mul_lo_u32 v1, s5, v1 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc -; GFX9-NEXT: v_mul_lo_u32 v5, s5, v2 -; GFX9-NEXT: v_mul_hi_u32 v2, s5, v2 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc +; GFX9-NEXT: v_mul_lo_u32 v4, s5, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, s5, v0 +; GFX9-NEXT: v_mov_b32_e32 v6, s11 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v4, v2, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v0, vcc -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v2, v1 -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v0, v1 +; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc ; GFX9-NEXT: v_mul_lo_u32 v3, s10, v2 ; GFX9-NEXT: v_mul_hi_u32 v4, s10, v1 ; GFX9-NEXT: v_mul_lo_u32 v5, s11, v1 -; GFX9-NEXT: v_mov_b32_e32 v6, s11 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 ; GFX9-NEXT: v_mul_lo_u32 v4, s10, v1 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v5 @@ -12508,7 +12473,7 @@ ; GFX9-NEXT: v_xor_b32_e32 v2, s1, v2 ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v9 ; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v8, v6, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v10, v0, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v10, vcc ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v6, v5 ; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v7, vcc ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 @@ -12531,7 +12496,7 @@ ; GFX9-NEXT: v_mul_lo_u32 v5, v4, v5 ; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v9, v8 ; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v10, v7, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v6, v0, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 ; 
GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc ; GFX9-NEXT: s_add_u32 s6, s6, s10 @@ -12552,7 +12517,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v7, s1 ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v8 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v3, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v9, v0, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v9, vcc ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc ; GFX9-NEXT: v_mul_lo_u32 v5, s8, v4 @@ -12624,16 +12589,16 @@ ; GFX90A-NEXT: v_mac_f32_e32 v0, s16, v1 ; GFX90A-NEXT: v_rcp_f32_e32 v0, v0 ; GFX90A-NEXT: s_subb_u32 s1, 0, s13 -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_ashr_i32 s14, s5, 31 +; GFX90A-NEXT: s_mov_b32 s15, s14 ; GFX90A-NEXT: v_mul_f32_e32 v0, s17, v0 ; GFX90A-NEXT: v_mul_f32_e32 v1, s18, v0 ; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 ; GFX90A-NEXT: v_mac_f32_e32 v0, s19, v1 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX90A-NEXT: s_mov_b32 s15, s14 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: v_mul_hi_u32 v3, s0, v0 ; GFX90A-NEXT: v_mul_lo_u32 v5, s0, v1 ; GFX90A-NEXT: v_mul_lo_u32 v2, s1, v0 @@ -12650,7 +12615,7 @@ ; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6 ; GFX90A-NEXT: v_mul_hi_u32 v7, v1, v2 ; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v8, vcc -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v4, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc ; GFX90A-NEXT: v_mul_lo_u32 v2, v1, v2 ; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 ; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc @@ -12672,7 +12637,7 @@ ; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v7 ; GFX90A-NEXT: v_mul_hi_u32 v3, v1, v2 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v8, v6, vcc -; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v4, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX90A-NEXT: v_mul_lo_u32 v2, v1, v2 ; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v5, v2 ; GFX90A-NEXT: 
v_addc_co_u32_e32 v3, vcc, 0, v3, vcc @@ -12691,7 +12656,7 @@ ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 ; GFX90A-NEXT: v_mul_hi_u32 v5, s5, v1 ; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, v2, v6, vcc -; GFX90A-NEXT: v_addc_co_u32_e32 v2, vcc, v5, v4, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v5, vcc ; GFX90A-NEXT: v_mul_lo_u32 v1, s5, v1 ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc @@ -12767,7 +12732,7 @@ ; GFX90A-NEXT: v_add_co_u32_e32 v7, vcc, v7, v8 ; GFX90A-NEXT: v_mul_hi_u32 v9, v3, v5 ; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, v6, v10, vcc -; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, v9, v4, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v9, vcc ; GFX90A-NEXT: v_mul_lo_u32 v5, v3, v5 ; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v6, v5 ; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v7, vcc @@ -12789,7 +12754,7 @@ ; GFX90A-NEXT: v_add_co_u32_e32 v7, vcc, v7, v9 ; GFX90A-NEXT: v_mul_hi_u32 v6, v3, v5 ; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, v10, v8, vcc -; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, v6, v4, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc ; GFX90A-NEXT: v_mul_lo_u32 v5, v3, v5 ; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 ; GFX90A-NEXT: s_ashr_i32 s10, s7, 31 @@ -12810,7 +12775,7 @@ ; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v6, v2 ; GFX90A-NEXT: v_mul_hi_u32 v7, s7, v3 ; GFX90A-NEXT: v_addc_co_u32_e32 v2, vcc, v5, v8, vcc -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v4, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc ; GFX90A-NEXT: v_mul_lo_u32 v3, s7, v3 ; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 ; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc @@ -12873,8 +12838,8 @@ ; GFX6-NEXT: v_madak_f32 v0, 0, v0, 0x4996c7d8 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 ; GFX6-NEXT: s_mov_b32 s4, 0xffed2705 -; GFX6-NEXT: v_mov_b32_e32 v5, 0 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX6-NEXT: 
v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 @@ -12892,42 +12857,41 @@ ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v3, v0, v4 -; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v7, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v8, v1, v2 +; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2 +; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 +; GFX6-NEXT: v_mul_hi_u32 v7, v1, v2 ; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GFX6-NEXT: v_addc_u32_e32 v6, vcc, 0, v7, vcc -; GFX6-NEXT: v_mul_lo_u32 v7, v1, v4 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc +; GFX6-NEXT: v_mul_lo_u32 v6, v1, v4 ; GFX6-NEXT: v_mul_hi_u32 v4, v1, v4 ; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[8:9] ; GFX6-NEXT: s_mov_b32 s5, s1 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v7 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v6, v4, vcc -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v8, v5, vcc +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc ; GFX6-NEXT: v_mul_lo_u32 v2, v1, s4 ; GFX6-NEXT: v_mul_hi_u32 v3, v0, s4 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_mul_lo_u32 v3, v0, s4 ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 -; GFX6-NEXT: v_mul_lo_u32 v7, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v8, v0, v3 -; GFX6-NEXT: v_mul_hi_u32 v9, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v6, v1, v3 +; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2 +; GFX6-NEXT: v_mul_hi_u32 v7, v0, v3 +; GFX6-NEXT: v_mul_hi_u32 v8, v0, v2 +; GFX6-NEXT: v_mul_hi_u32 v5, v1, v3 ; GFX6-NEXT: v_mul_lo_u32 v3, v1, v3 ; GFX6-NEXT: v_mul_hi_u32 v4, v1, v2 -; GFX6-NEXT: 
v_add_i32_e32 v7, vcc, v8, v7 -; GFX6-NEXT: v_addc_u32_e32 v8, vcc, 0, v9, vcc +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc ; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v6, vcc -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v4, v5, vcc +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v6, v3 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 @@ -12935,7 +12899,7 @@ ; GFX6-NEXT: v_mul_lo_u32 v2, s2, v1 ; GFX6-NEXT: v_mul_hi_u32 v3, s2, v0 ; GFX6-NEXT: v_mul_hi_u32 v4, s2, v1 -; GFX6-NEXT: v_mul_hi_u32 v6, s3, v1 +; GFX6-NEXT: v_mul_hi_u32 v5, s3, v1 ; GFX6-NEXT: v_mul_lo_u32 v1, s3, v1 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc @@ -12945,7 +12909,7 @@ ; GFX6-NEXT: s_mov_b32 s0, 0x12d8fb ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc -; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v6, v5, vcc +; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc ; GFX6-NEXT: v_mul_lo_u32 v1, v1, s0 @@ -12988,7 +12952,6 @@ ; GFX9-NEXT: v_madak_f32 v0, 0, v0, 0x4996c7d8 ; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: s_mov_b32 s4, 0xffed2705 -; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 @@ -13003,16 +12966,16 @@ ; GFX9-NEXT: v_sub_u32_e32 v2, v2, v0 ; GFX9-NEXT: v_mul_hi_u32 v3, v0, v4 ; GFX9-NEXT: v_mul_lo_u32 v6, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v8, v0, v2 -; GFX9-NEXT: v_mul_lo_u32 v7, v1, v4 +; GFX9-NEXT: v_mul_hi_u32 v7, v0, v2 +; GFX9-NEXT: v_mul_lo_u32 v5, v1, v4 ; GFX9-NEXT: v_mul_hi_u32 v4, v1, v4 -; 
GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v8, vcc ; GFX9-NEXT: v_mul_hi_u32 v8, v1, v2 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v7, vcc ; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v4, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v5, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v8, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 @@ -13025,18 +12988,18 @@ ; GFX9-NEXT: s_add_u32 s2, s2, s4 ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 ; GFX9-NEXT: v_sub_u32_e32 v2, v2, v0 -; GFX9-NEXT: v_mul_lo_u32 v7, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v8, v0, v4 -; GFX9-NEXT: v_mul_hi_u32 v9, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v6, v1, v4 +; GFX9-NEXT: v_mul_lo_u32 v6, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v7, v0, v4 +; GFX9-NEXT: v_mul_hi_u32 v8, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v5, v1, v4 ; GFX9-NEXT: v_mul_lo_u32 v4, v1, v4 ; GFX9-NEXT: v_mul_hi_u32 v3, v1, v2 -; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v8, v7 -; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v7, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v8, vcc ; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v7, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v6, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v6, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v5, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v4, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 @@ -13047,7 +13010,7 @@ ; GFX9-NEXT: v_mul_lo_u32 v2, s2, v1 ; GFX9-NEXT: v_mul_hi_u32 v3, s2, v0 ; GFX9-NEXT: v_mul_hi_u32 v4, s2, v1 -; 
GFX9-NEXT: v_mul_hi_u32 v6, s3, v1 +; GFX9-NEXT: v_mul_hi_u32 v5, s3, v1 ; GFX9-NEXT: v_mul_lo_u32 v1, s3, v1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc @@ -13056,41 +13019,42 @@ ; GFX9-NEXT: s_mov_b32 s5, 0x12d8fb ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v6, v5, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v5, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc ; GFX9-NEXT: v_mul_lo_u32 v1, v1, s5 ; GFX9-NEXT: v_mul_hi_u32 v2, v0, s5 ; GFX9-NEXT: v_mul_lo_u32 v0, v0, s5 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, s3 ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s2, v0 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc ; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s5, v0 -; GFX9-NEXT: v_subbrev_co_u32_e32 v3, vcc, 0, v1, vcc -; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s5, v2 -; GFX9-NEXT: v_subbrev_co_u32_e32 v6, vcc, 0, v3, vcc +; GFX9-NEXT: v_subbrev_co_u32_e32 v4, vcc, 0, v1, vcc +; GFX9-NEXT: v_subrev_co_u32_e32 v5, vcc, s5, v2 +; GFX9-NEXT: v_subbrev_co_u32_e32 v6, vcc, 0, v4, vcc ; GFX9-NEXT: s_mov_b32 s2, 0x12d8fa ; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s2, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 ; GFX9-NEXT: v_cndmask_b32_e32 v7, -1, v7, vcc ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc ; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s2, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc -; GFX9-NEXT: v_cmp_ne_u32_e32 
vcc, 0, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v5, -1, v5, vcc +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0 ; GFX9-NEXT: v_xor_b32_e32 v1, s4, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s4, v0 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v2, vcc -; GFX9-NEXT: global_store_dwordx2 v5, v[0:1], s[0:1] +; GFX9-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX90A-LABEL: srem_i64_oddk_denom: @@ -13099,8 +13063,8 @@ ; GFX90A-NEXT: v_madak_f32 v0, 0, v0, 0x4996c7d8 ; GFX90A-NEXT: v_rcp_f32_e32 v0, v0 ; GFX90A-NEXT: s_mov_b32 s4, 0xffed2705 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX90A-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 @@ -13122,7 +13086,7 @@ ; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6 ; GFX90A-NEXT: v_mul_hi_u32 v7, v1, v3 ; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v8, vcc -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v2, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc ; GFX90A-NEXT: v_mul_lo_u32 v3, v1, v3 ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 ; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc @@ -13143,7 +13107,7 @@ ; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v7 ; GFX90A-NEXT: v_mul_hi_u32 v4, v1, v3 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v8, v6, vcc -; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v2, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc ; GFX90A-NEXT: v_mul_lo_u32 v3, v1, v3 ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v5, v3 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) @@ -13165,7 +13129,7 @@ ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 ; GFX90A-NEXT: v_mul_hi_u32 v5, s3, v1 ; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, v3, 
v6, vcc -; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v2, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc ; GFX90A-NEXT: v_mul_lo_u32 v1, s3, v1 ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc @@ -13327,48 +13291,47 @@ ; GFX6-NEXT: s_xor_b64 s[12:13], s[2:3], s[10:11] ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc -; GFX6-NEXT: v_mov_b32_e32 v4, 0 -; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v7, v4, vcc +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc ; GFX6-NEXT: v_mul_lo_u32 v2, s4, v1 ; GFX6-NEXT: v_mul_hi_u32 v3, s4, v0 -; GFX6-NEXT: v_mul_lo_u32 v5, s5, v0 +; GFX6-NEXT: v_mul_lo_u32 v4, s5, v0 ; GFX6-NEXT: s_mov_b32 s5, s1 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_mul_lo_u32 v3, s4, v0 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; GFX6-NEXT: v_mul_lo_u32 v7, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v8, v0, v3 -; GFX6-NEXT: v_mul_hi_u32 v9, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v6, v1, v3 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2 +; GFX6-NEXT: v_mul_hi_u32 v7, v0, v3 +; GFX6-NEXT: v_mul_hi_u32 v8, v0, v2 +; GFX6-NEXT: v_mul_hi_u32 v5, v1, v3 ; GFX6-NEXT: v_mul_lo_u32 v3, v1, v3 -; GFX6-NEXT: v_mul_hi_u32 v5, v1, v2 -; GFX6-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GFX6-NEXT: v_addc_u32_e32 v8, vcc, 0, v9, vcc +; GFX6-NEXT: v_mul_hi_u32 v4, v1, v2 +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc ; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v6, vcc -; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v5, v4, vcc +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v6, v3 +; GFX6-NEXT: v_addc_u32_e32 v3, 
vcc, v7, v5, vcc +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc ; GFX6-NEXT: v_mul_lo_u32 v2, s12, v1 ; GFX6-NEXT: v_mul_hi_u32 v3, s12, v0 -; GFX6-NEXT: v_mul_hi_u32 v5, s12, v1 -; GFX6-NEXT: v_mul_hi_u32 v6, s13, v1 +; GFX6-NEXT: v_mul_hi_u32 v4, s12, v1 +; GFX6-NEXT: v_mul_hi_u32 v5, s13, v1 ; GFX6-NEXT: v_mul_lo_u32 v1, s13, v1 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc -; GFX6-NEXT: v_mul_lo_u32 v5, s13, v0 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GFX6-NEXT: v_mul_lo_u32 v4, s13, v0 ; GFX6-NEXT: v_mul_hi_u32 v0, s13, v0 ; GFX6-NEXT: s_mov_b32 s4, s0 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc -; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v6, v4, vcc +; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc ; GFX6-NEXT: v_mul_lo_u32 v1, s8, v1 @@ -13418,7 +13381,6 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x34 ; GFX9-NEXT: s_mov_b64 s[2:3], 0x1000 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 ; GFX9-NEXT: s_ashr_i32 s4, s3, 31 @@ -13443,107 +13405,108 @@ ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: s_add_u32 s0, s6, s10 ; GFX9-NEXT: s_mov_b32 s11, s10 -; GFX9-NEXT: v_mul_lo_u32 v3, s2, v1 -; GFX9-NEXT: v_mul_hi_u32 v4, s2, v0 -; GFX9-NEXT: v_mul_lo_u32 v6, s3, v0 -; GFX9-NEXT: v_mul_lo_u32 v5, s2, v0 -; GFX9-NEXT: s_addc_u32 s1, s7, s10 -; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v6 -; GFX9-NEXT: v_mul_hi_u32 v4, v0, v5 -; GFX9-NEXT: v_mul_lo_u32 v6, v0, v3 -; GFX9-NEXT: v_mul_hi_u32 v8, v0, 
v3 -; GFX9-NEXT: v_mul_hi_u32 v7, v1, v5 -; GFX9-NEXT: v_mul_lo_u32 v5, v1, v5 -; GFX9-NEXT: v_mul_hi_u32 v9, v1, v3 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v8, vcc -; GFX9-NEXT: v_mul_lo_u32 v3, v1, v3 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v6, v7, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v9, v2, vcc -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc -; GFX9-NEXT: v_mul_lo_u32 v3, s2, v1 -; GFX9-NEXT: v_mul_hi_u32 v4, s2, v0 +; GFX9-NEXT: v_mul_lo_u32 v2, s2, v1 +; GFX9-NEXT: v_mul_hi_u32 v3, s2, v0 ; GFX9-NEXT: v_mul_lo_u32 v5, s3, v0 -; GFX9-NEXT: v_mul_lo_u32 v6, s2, v0 +; GFX9-NEXT: v_mul_lo_u32 v4, s2, v0 +; GFX9-NEXT: s_addc_u32 s1, s7, s10 +; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 +; GFX9-NEXT: v_mul_hi_u32 v3, v0, v4 +; GFX9-NEXT: v_mul_lo_u32 v5, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v7, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v6, v1, v4 +; GFX9-NEXT: v_mul_lo_u32 v4, v1, v4 +; GFX9-NEXT: v_mul_hi_u32 v8, v1, v2 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc +; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v6, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v8, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX9-NEXT: v_mul_lo_u32 v2, s2, v1 +; GFX9-NEXT: v_mul_hi_u32 v3, s2, v0 +; GFX9-NEXT: v_mul_lo_u32 v4, s3, v0 +; GFX9-NEXT: v_mul_lo_u32 v5, s2, v0 ; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], s[10:11] -; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v5 -; GFX9-NEXT: 
v_mul_lo_u32 v7, v0, v3 -; GFX9-NEXT: v_mul_hi_u32 v8, v0, v6 -; GFX9-NEXT: v_mul_hi_u32 v9, v0, v3 -; GFX9-NEXT: v_mul_hi_u32 v5, v1, v6 -; GFX9-NEXT: v_mul_lo_u32 v6, v1, v6 -; GFX9-NEXT: v_mul_hi_u32 v4, v1, v3 -; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v8, v7 -; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v9, vcc -; GFX9-NEXT: v_mul_lo_u32 v3, v1, v3 +; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 +; GFX9-NEXT: v_mul_lo_u32 v6, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v7, v0, v5 +; GFX9-NEXT: v_mul_hi_u32 v8, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v4, v1, v5 +; GFX9-NEXT: v_mul_lo_u32 v5, v1, v5 +; GFX9-NEXT: v_mul_hi_u32 v3, v1, v2 ; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v7, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v8, v5, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v2, vcc -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v5, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc -; GFX9-NEXT: v_mul_lo_u32 v3, s6, v1 -; GFX9-NEXT: v_mul_hi_u32 v4, s6, v0 -; GFX9-NEXT: v_mul_hi_u32 v5, s6, v1 -; GFX9-NEXT: v_mul_hi_u32 v6, s7, v1 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v8, vcc +; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v6, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v4, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v4, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX9-NEXT: v_mul_lo_u32 v2, s6, v1 +; GFX9-NEXT: v_mul_hi_u32 v3, s6, v0 +; GFX9-NEXT: v_mul_hi_u32 v4, s6, v1 +; GFX9-NEXT: v_mul_hi_u32 v5, s7, v1 ; GFX9-NEXT: v_mul_lo_u32 v1, s7, v1 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc -; GFX9-NEXT: v_mul_lo_u32 v5, s7, v0 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 +; GFX9-NEXT: 
v_addc_co_u32_e32 v3, vcc, 0, v4, vcc +; GFX9-NEXT: v_mul_lo_u32 v4, s7, v0 ; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 -; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v4, v0, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v2, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v5, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc ; GFX9-NEXT: v_mul_lo_u32 v1, s8, v1 -; GFX9-NEXT: v_mul_hi_u32 v3, s8, v0 -; GFX9-NEXT: v_mul_lo_u32 v4, s9, v0 +; GFX9-NEXT: v_mul_hi_u32 v2, s8, v0 +; GFX9-NEXT: v_mul_lo_u32 v3, s9, v0 ; GFX9-NEXT: v_mul_lo_u32 v0, s8, v0 -; GFX9-NEXT: v_add_u32_e32 v1, v3, v1 -; GFX9-NEXT: v_add_u32_e32 v1, v1, v4 -; GFX9-NEXT: v_sub_u32_e32 v3, s7, v1 -; GFX9-NEXT: v_mov_b32_e32 v4, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 +; GFX9-NEXT: v_sub_u32_e32 v2, s7, v1 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s6, v0 -; GFX9-NEXT: v_subb_co_u32_e64 v3, s[0:1], v3, v4, vcc +; GFX9-NEXT: v_subb_co_u32_e64 v2, s[0:1], v2, v3, vcc ; GFX9-NEXT: v_subrev_co_u32_e64 v5, s[0:1], s8, v0 -; GFX9-NEXT: v_subbrev_co_u32_e64 v6, s[2:3], 0, v3, s[0:1] +; GFX9-NEXT: v_subbrev_co_u32_e64 v6, s[2:3], 0, v2, s[0:1] ; GFX9-NEXT: v_cmp_le_u32_e64 s[2:3], s9, v6 ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[2:3] ; GFX9-NEXT: v_cmp_le_u32_e64 s[2:3], s8, v5 -; GFX9-NEXT: v_subb_co_u32_e64 v3, s[0:1], v3, v4, s[0:1] +; GFX9-NEXT: v_subb_co_u32_e64 v2, s[0:1], v2, v3, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[2:3] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s9, v6 -; GFX9-NEXT: v_subrev_co_u32_e64 v4, s[0:1], s8, v5 +; GFX9-NEXT: v_subrev_co_u32_e64 v3, s[0:1], s8, v5 ; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[2:3] -; GFX9-NEXT: 
v_subbrev_co_u32_e64 v3, s[0:1], 0, v3, s[0:1] +; GFX9-NEXT: v_subbrev_co_u32_e64 v2, s[0:1], 0, v2, s[0:1] ; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v4, v5, v4, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v5, s7 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v5, v1, vcc ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v3, v6, v3, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s9, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, s10, v0 ; GFX9-NEXT: v_xor_b32_e32 v1, s10, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, s10 +; GFX9-NEXT: v_mov_b32_e32 v2, s10 ; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s10, v0 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v2, vcc +; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX90A-LABEL: srem_i64_pow2_shl_denom: @@ -13590,7 +13553,7 @@ ; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6 ; GFX90A-NEXT: v_mul_hi_u32 v7, v1, v3 ; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v8, vcc -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v2, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc ; GFX90A-NEXT: v_mul_lo_u32 v3, v1, v3 ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 ; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc @@ -13612,7 +13575,7 @@ ; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v7 ; GFX90A-NEXT: v_mul_hi_u32 v4, v1, v3 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v8, v6, vcc 
-; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v2, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc ; GFX90A-NEXT: v_mul_lo_u32 v3, v1, v3 ; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v5, v3 ; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc @@ -13631,7 +13594,7 @@ ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 ; GFX90A-NEXT: v_mul_hi_u32 v5, s7, v1 ; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v6, vcc -; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v2, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc ; GFX90A-NEXT: v_mul_lo_u32 v1, s7, v1 ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc @@ -13698,24 +13661,23 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd -; GFX6-NEXT: s_movk_i32 s8, 0xf000 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_ashr_i32 s9, s1, 31 -; GFX6-NEXT: s_lshr_b32 s9, s9, 20 -; GFX6-NEXT: s_add_u32 s9, s0, s9 -; GFX6-NEXT: s_addc_u32 s10, s1, 0 -; GFX6-NEXT: s_and_b32 s9, s9, s8 -; GFX6-NEXT: s_sub_u32 s0, s0, s9 -; GFX6-NEXT: s_subb_u32 s1, s1, s10 -; GFX6-NEXT: s_ashr_i32 s9, s3, 31 -; GFX6-NEXT: s_lshr_b32 s9, s9, 20 -; GFX6-NEXT: s_add_u32 s9, s2, s9 -; GFX6-NEXT: s_addc_u32 s10, s3, 0 -; GFX6-NEXT: s_and_b32 s8, s9, s8 +; GFX6-NEXT: s_ashr_i32 s8, s1, 31 +; GFX6-NEXT: s_lshr_b32 s8, s8, 20 +; GFX6-NEXT: s_add_u32 s8, s0, s8 +; GFX6-NEXT: s_addc_u32 s9, s1, 0 +; GFX6-NEXT: s_and_b32 s8, s8, 0xfffff000 +; GFX6-NEXT: s_sub_u32 s0, s0, s8 +; GFX6-NEXT: s_subb_u32 s1, s1, s9 +; GFX6-NEXT: s_ashr_i32 s8, s3, 31 +; GFX6-NEXT: s_lshr_b32 s8, s8, 20 +; GFX6-NEXT: s_add_u32 s8, s2, s8 +; GFX6-NEXT: s_addc_u32 s9, s3, 0 +; GFX6-NEXT: s_and_b32 s8, s8, 0xfffff000 ; GFX6-NEXT: s_sub_u32 s2, s2, s8 -; GFX6-NEXT: s_subb_u32 s3, s3, s10 +; GFX6-NEXT: s_subb_u32 s3, s3, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: 
v_mov_b32_e32 v2, s2 @@ -13727,21 +13689,20 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX9-NEXT: s_movk_i32 s8, 0xf000 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s0, s5, 31 ; GFX9-NEXT: s_lshr_b32 s0, s0, 20 ; GFX9-NEXT: s_add_u32 s0, s4, s0 ; GFX9-NEXT: s_addc_u32 s1, s5, 0 -; GFX9-NEXT: s_and_b32 s0, s0, s8 +; GFX9-NEXT: s_and_b32 s0, s0, 0xfffff000 ; GFX9-NEXT: s_sub_u32 s0, s4, s0 ; GFX9-NEXT: s_subb_u32 s1, s5, s1 ; GFX9-NEXT: s_ashr_i32 s4, s7, 31 ; GFX9-NEXT: s_lshr_b32 s4, s4, 20 ; GFX9-NEXT: s_add_u32 s4, s6, s4 ; GFX9-NEXT: s_addc_u32 s5, s7, 0 -; GFX9-NEXT: s_and_b32 s4, s4, s8 +; GFX9-NEXT: s_and_b32 s4, s4, 0xfffff000 ; GFX9-NEXT: s_sub_u32 s4, s6, s4 ; GFX9-NEXT: s_subb_u32 s5, s7, s5 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -13755,21 +13716,20 @@ ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX90A-NEXT: s_movk_i32 s8, 0xf000 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_ashr_i32 s0, s5, 31 ; GFX90A-NEXT: s_lshr_b32 s0, s0, 20 ; GFX90A-NEXT: s_add_u32 s0, s4, s0 ; GFX90A-NEXT: s_addc_u32 s1, s5, 0 -; GFX90A-NEXT: s_and_b32 s0, s0, s8 +; GFX90A-NEXT: s_and_b32 s0, s0, 0xfffff000 ; GFX90A-NEXT: s_sub_u32 s0, s4, s0 ; GFX90A-NEXT: s_subb_u32 s1, s5, s1 ; GFX90A-NEXT: s_ashr_i32 s4, s7, 31 ; GFX90A-NEXT: s_lshr_b32 s4, s4, 20 ; GFX90A-NEXT: s_add_u32 s4, s6, s4 ; GFX90A-NEXT: s_addc_u32 s5, s7, 0 -; GFX90A-NEXT: s_and_b32 s4, s4, s8 +; GFX90A-NEXT: s_and_b32 s4, s4, 0xfffff000 ; GFX90A-NEXT: s_sub_u32 s4, s6, s4 ; GFX90A-NEXT: s_subb_u32 s5, s7, s5 ; GFX90A-NEXT: v_mov_b32_e32 v0, s0 @@ -13828,203 +13788,202 @@ ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 ; GFX6-NEXT: v_mac_f32_e32 v0, s21, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v0 +; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_ashr_i32 s12, s9, 31 ; GFX6-NEXT: s_add_u32 s0, s8, s12 -; GFX6-NEXT: v_mul_lo_u32 v0, s2, v1 -; GFX6-NEXT: v_mul_hi_u32 v3, s2, v2 -; GFX6-NEXT: v_mul_lo_u32 v4, s3, v2 -; GFX6-NEXT: v_mul_lo_u32 v5, s2, v2 +; GFX6-NEXT: v_mul_lo_u32 v2, s2, v1 +; GFX6-NEXT: v_mul_hi_u32 v3, s2, v0 +; GFX6-NEXT: v_mul_lo_u32 v4, s3, v0 +; GFX6-NEXT: v_mul_lo_u32 v5, s2, v0 ; GFX6-NEXT: s_mov_b32 s13, s12 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v3, v0 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v0, v4 -; GFX6-NEXT: v_mul_lo_u32 v0, v2, v3 -; GFX6-NEXT: v_mul_hi_u32 v4, v2, v5 -; GFX6-NEXT: v_mul_hi_u32 v6, v2, v3 -; GFX6-NEXT: v_mul_hi_u32 v7, v1, v3 -; GFX6-NEXT: v_mul_lo_u32 v3, v1, v3 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v4, v0 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GFX6-NEXT: v_mul_lo_u32 v3, v0, v2 +; GFX6-NEXT: v_mul_hi_u32 v4, v0, v5 +; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 +; GFX6-NEXT: v_mul_hi_u32 v7, v1, v2 +; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v6, vcc ; GFX6-NEXT: v_mul_lo_u32 v6, v1, v5 ; GFX6-NEXT: v_mul_hi_u32 v5, v1, v5 ; GFX6-NEXT: s_addc_u32 s1, s9, s12 ; GFX6-NEXT: s_xor_b64 s[8:9], s[0:1], s[12:13] -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v4, v5, vcc -; GFX6-NEXT: v_mov_b32_e32 v0, 0 -; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v7, v0, vcc -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc -; GFX6-NEXT: v_mul_lo_u32 v3, s2, v1 -; GFX6-NEXT: v_mul_hi_u32 v4, s2, v2 -; GFX6-NEXT: v_mul_lo_u32 v5, s3, v2 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GFX6-NEXT: v_mul_lo_u32 v4, s2, v2 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GFX6-NEXT: v_mul_lo_u32 v7, v2, v3 -; GFX6-NEXT: v_mul_hi_u32 v8, v2, v4 -; GFX6-NEXT: v_mul_hi_u32 v9, v2, v3 -; 
GFX6-NEXT: v_mul_hi_u32 v6, v1, v4 -; GFX6-NEXT: v_mul_lo_u32 v4, v1, v4 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v4, v5, vcc +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GFX6-NEXT: v_mul_lo_u32 v2, s2, v1 +; GFX6-NEXT: v_mul_hi_u32 v3, s2, v0 +; GFX6-NEXT: v_mul_lo_u32 v4, s3, v0 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GFX6-NEXT: v_mul_lo_u32 v3, s2, v0 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2 +; GFX6-NEXT: v_mul_hi_u32 v7, v0, v3 +; GFX6-NEXT: v_mul_hi_u32 v8, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v5, v1, v3 -; GFX6-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GFX6-NEXT: v_addc_u32_e32 v8, vcc, 0, v9, vcc ; GFX6-NEXT: v_mul_lo_u32 v3, v1, v3 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v8, v6, vcc -; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v5, v0, vcc -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc -; GFX6-NEXT: v_mul_lo_u32 v3, s8, v1 -; GFX6-NEXT: v_mul_hi_u32 v4, s8, v2 -; GFX6-NEXT: v_mul_hi_u32 v5, s8, v1 -; GFX6-NEXT: v_mul_hi_u32 v6, s9, v1 +; GFX6-NEXT: v_mul_hi_u32 v4, v1, v2 +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc +; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v6, v3 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GFX6-NEXT: v_mul_lo_u32 v2, s8, v1 +; GFX6-NEXT: v_mul_hi_u32 v3, s8, v0 +; GFX6-NEXT: 
v_mul_hi_u32 v4, s8, v1 +; GFX6-NEXT: v_mul_hi_u32 v5, s9, v1 ; GFX6-NEXT: v_mul_lo_u32 v1, s9, v1 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc -; GFX6-NEXT: v_mul_lo_u32 v5, s9, v2 -; GFX6-NEXT: v_mul_hi_u32 v2, s9, v2 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v4, v2, vcc -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v6, v0, vcc -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 -; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc -; GFX6-NEXT: v_mul_lo_u32 v2, s16, v2 -; GFX6-NEXT: v_mul_hi_u32 v3, s16, v1 -; GFX6-NEXT: v_mul_lo_u32 v4, s17, v1 -; GFX6-NEXT: v_mul_lo_u32 v1, s16, v1 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GFX6-NEXT: v_mul_lo_u32 v4, s9, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s9, v0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s9, v2 -; GFX6-NEXT: v_mov_b32_e32 v4, s17 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s8, v1 -; GFX6-NEXT: v_subb_u32_e64 v3, s[0:1], v3, v4, vcc -; GFX6-NEXT: v_subrev_i32_e64 v5, s[0:1], s16, v1 -; GFX6-NEXT: v_subbrev_u32_e64 v6, s[2:3], 0, v3, s[0:1] -; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s17, v6 +; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc +; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc +; GFX6-NEXT: v_mul_lo_u32 v1, s16, v1 +; GFX6-NEXT: v_mul_hi_u32 v2, s16, v0 +; GFX6-NEXT: v_mul_lo_u32 v3, s17, v0 +; GFX6-NEXT: v_mul_lo_u32 v0, s16, v0 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s9, v1 +; GFX6-NEXT: v_mov_b32_e32 v3, s17 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s8, v0 +; GFX6-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, vcc +; GFX6-NEXT: v_subrev_i32_e64 v4, s[0:1], s16, v0 +; GFX6-NEXT: v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1] +; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s17, v5 +; GFX6-NEXT: 
v_cndmask_b32_e64 v6, 0, -1, s[2:3] +; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s16, v4 +; GFX6-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1] ; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[2:3] -; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s16, v5 -; GFX6-NEXT: v_subb_u32_e64 v3, s[0:1], v3, v4, s[0:1] -; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[2:3] -; GFX6-NEXT: v_cmp_eq_u32_e64 s[2:3], s17, v6 -; GFX6-NEXT: v_subrev_i32_e64 v4, s[0:1], s16, v5 -; GFX6-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[2:3] -; GFX6-NEXT: v_subbrev_u32_e64 v3, s[0:1], 0, v3, s[0:1] +; GFX6-NEXT: v_cmp_eq_u32_e64 s[2:3], s17, v5 +; GFX6-NEXT: v_subrev_i32_e64 v3, s[0:1], s16, v4 +; GFX6-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[2:3] +; GFX6-NEXT: v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1] ; GFX6-NEXT: s_ashr_i32 s2, s15, 31 -; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v7 +; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 ; GFX6-NEXT: s_add_u32 s8, s14, s2 -; GFX6-NEXT: v_cndmask_b32_e64 v3, v6, v3, s[0:1] -; GFX6-NEXT: v_mov_b32_e32 v6, s9 +; GFX6-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[0:1] +; GFX6-NEXT: v_mov_b32_e32 v5, s9 ; GFX6-NEXT: s_mov_b32 s3, s2 ; GFX6-NEXT: s_addc_u32 s9, s15, s2 ; GFX6-NEXT: s_xor_b64 s[8:9], s[8:9], s[2:3] -; GFX6-NEXT: v_cvt_f32_u32_e32 v7, s8 -; GFX6-NEXT: v_cvt_f32_u32_e32 v8, s9 -; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v6, v2, vcc -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s17, v2 -; GFX6-NEXT: v_mac_f32_e32 v7, s18, v8 -; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s16, v1 -; GFX6-NEXT: v_rcp_f32_e32 v7, v7 -; GFX6-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s17, v2 -; GFX6-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v3, v5, v4, s[0:1] -; GFX6-NEXT: v_mul_f32_e32 v4, s19, v7 -; GFX6-NEXT: v_mul_f32_e32 v5, s20, v4 -; GFX6-NEXT: v_trunc_f32_e32 v5, v5 -; GFX6-NEXT: v_mac_f32_e32 v4, s21, v5 +; GFX6-NEXT: 
v_cvt_f32_u32_e32 v6, s8 +; GFX6-NEXT: v_cvt_f32_u32_e32 v7, s9 +; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v5, v1, vcc +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s17, v1 +; GFX6-NEXT: v_mac_f32_e32 v6, s18, v7 +; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s16, v0 +; GFX6-NEXT: v_rcp_f32_e32 v6, v6 +; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s17, v1 +; GFX6-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc +; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v2, v4, v3, s[0:1] +; GFX6-NEXT: v_mul_f32_e32 v3, s19, v6 +; GFX6-NEXT: v_mul_f32_e32 v4, s20, v3 +; GFX6-NEXT: v_trunc_f32_e32 v4, v4 +; GFX6-NEXT: v_mac_f32_e32 v3, s21, v4 +; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_u32_f32_e32 v5, v5 ; GFX6-NEXT: s_sub_u32 s0, 0, s8 -; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX6-NEXT: v_mul_hi_u32 v3, s0, v4 -; GFX6-NEXT: v_mul_lo_u32 v6, s0, v5 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX6-NEXT: v_mul_hi_u32 v2, s0, v3 +; GFX6-NEXT: v_mul_lo_u32 v5, s0, v4 ; GFX6-NEXT: s_subb_u32 s1, 0, s9 -; GFX6-NEXT: v_mul_lo_u32 v7, s1, v4 +; GFX6-NEXT: v_mul_lo_u32 v6, s1, v3 ; GFX6-NEXT: s_ashr_i32 s14, s11, 31 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GFX6-NEXT: v_mul_lo_u32 v6, s0, v4 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v7 -; GFX6-NEXT: v_mul_lo_u32 v7, v4, v3 -; GFX6-NEXT: v_mul_hi_u32 v8, v4, v6 -; GFX6-NEXT: v_mul_hi_u32 v9, v4, v3 -; GFX6-NEXT: v_mul_hi_u32 v10, v5, v3 -; GFX6-NEXT: v_mul_lo_u32 v3, v5, v3 -; GFX6-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GFX6-NEXT: v_addc_u32_e32 v8, vcc, 0, v9, vcc -; GFX6-NEXT: v_mul_lo_u32 v9, v5, v6 -; GFX6-NEXT: v_mul_hi_u32 v6, v5, v6 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; GFX6-NEXT: v_mul_lo_u32 v5, s0, v3 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; GFX6-NEXT: v_mul_lo_u32 v6, v3, v2 +; GFX6-NEXT: v_mul_hi_u32 v7, v3, v5 +; 
GFX6-NEXT: v_mul_hi_u32 v8, v3, v2 +; GFX6-NEXT: v_mul_hi_u32 v9, v4, v2 +; GFX6-NEXT: v_mul_lo_u32 v2, v4, v2 +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc +; GFX6-NEXT: v_mul_lo_u32 v8, v4, v5 +; GFX6-NEXT: v_mul_hi_u32 v5, v4, v5 ; GFX6-NEXT: s_mov_b32 s15, s14 +; GFX6-NEXT: v_xor_b32_e32 v0, s12, v0 +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v7, v5, vcc +; GFX6-NEXT: v_addc_u32_e32 v6, vcc, 0, v9, vcc +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v5, v2 +; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v4, v5, vcc +; GFX6-NEXT: v_mul_lo_u32 v4, s0, v3 +; GFX6-NEXT: v_mul_hi_u32 v5, s0, v2 +; GFX6-NEXT: v_mul_lo_u32 v6, s1, v2 ; GFX6-NEXT: v_xor_b32_e32 v1, s12, v1 -; GFX6-NEXT: v_add_i32_e32 v7, vcc, v7, v9 -; GFX6-NEXT: v_addc_u32_e32 v6, vcc, v8, v6, vcc -; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v10, v0, vcc -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v6, v3 -; GFX6-NEXT: v_addc_u32_e32 v6, vcc, 0, v7, vcc -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v5, v6, vcc -; GFX6-NEXT: v_mul_lo_u32 v5, s0, v4 -; GFX6-NEXT: v_mul_hi_u32 v6, s0, v3 -; GFX6-NEXT: v_mul_lo_u32 v7, s1, v3 -; GFX6-NEXT: v_xor_b32_e32 v2, s12, v2 -; GFX6-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GFX6-NEXT: v_mul_lo_u32 v6, s0, v3 -; GFX6-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; GFX6-NEXT: v_mul_lo_u32 v9, v3, v5 -; GFX6-NEXT: v_mul_hi_u32 v10, v3, v6 -; GFX6-NEXT: v_mul_hi_u32 v11, v3, v5 -; GFX6-NEXT: v_mul_hi_u32 v8, v4, v6 -; GFX6-NEXT: v_mul_lo_u32 v6, v4, v6 -; GFX6-NEXT: v_mul_hi_u32 v7, v4, v5 -; GFX6-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; GFX6-NEXT: v_addc_u32_e32 v10, vcc, 0, v11, vcc -; GFX6-NEXT: v_mul_lo_u32 v5, v4, v5 -; GFX6-NEXT: v_add_i32_e32 v6, vcc, v9, v6 -; GFX6-NEXT: v_addc_u32_e32 v6, vcc, v10, v8, vcc -; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v7, v0, vcc -; GFX6-NEXT: v_add_i32_e32 v5, vcc, v6, v5 
-; GFX6-NEXT: v_addc_u32_e32 v6, vcc, 0, v7, vcc +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; GFX6-NEXT: v_mul_lo_u32 v5, s0, v2 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; GFX6-NEXT: v_mul_lo_u32 v8, v2, v4 +; GFX6-NEXT: v_mul_hi_u32 v9, v2, v5 +; GFX6-NEXT: v_mul_hi_u32 v10, v2, v4 +; GFX6-NEXT: v_mul_hi_u32 v7, v3, v5 +; GFX6-NEXT: v_mul_lo_u32 v5, v3, v5 +; GFX6-NEXT: v_mul_hi_u32 v6, v3, v4 +; GFX6-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GFX6-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc +; GFX6-NEXT: v_mul_lo_u32 v4, v3, v4 +; GFX6-NEXT: v_add_i32_e32 v5, vcc, v8, v5 +; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v9, v7, vcc +; GFX6-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc ; GFX6-NEXT: s_add_u32 s0, s10, s14 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: s_addc_u32 s1, s11, s14 -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc ; GFX6-NEXT: s_xor_b64 s[10:11], s[0:1], s[14:15] -; GFX6-NEXT: v_mul_lo_u32 v5, s10, v4 -; GFX6-NEXT: v_mul_hi_u32 v6, s10, v3 -; GFX6-NEXT: v_mul_hi_u32 v8, s10, v4 -; GFX6-NEXT: v_mul_hi_u32 v9, s11, v4 -; GFX6-NEXT: v_mul_lo_u32 v4, s11, v4 -; GFX6-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GFX6-NEXT: v_addc_u32_e32 v6, vcc, 0, v8, vcc -; GFX6-NEXT: v_mul_lo_u32 v8, s11, v3 -; GFX6-NEXT: v_mul_hi_u32 v3, s11, v3 -; GFX6-NEXT: v_mov_b32_e32 v7, s12 -; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v6, v3, vcc -; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v9, v0, vcc -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v0, vcc -; GFX6-NEXT: v_mul_lo_u32 v4, s8, v0 -; GFX6-NEXT: v_mul_hi_u32 v5, s8, v3 -; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s12, v1 -; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v2, v7, vcc -; GFX6-NEXT: v_mul_lo_u32 v2, s9, v3 -; GFX6-NEXT: v_mul_lo_u32 v3, s8, v3 +; GFX6-NEXT: v_mul_lo_u32 
v4, s10, v3 +; GFX6-NEXT: v_mul_hi_u32 v5, s10, v2 +; GFX6-NEXT: v_mul_hi_u32 v7, s10, v3 +; GFX6-NEXT: v_mul_hi_u32 v8, s11, v3 +; GFX6-NEXT: v_mul_lo_u32 v3, s11, v3 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s11, v2 +; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; GFX6-NEXT: v_mul_lo_u32 v7, s11, v2 +; GFX6-NEXT: v_mul_hi_u32 v2, s11, v2 +; GFX6-NEXT: v_mov_b32_e32 v6, s12 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v5, v2, vcc +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GFX6-NEXT: v_mul_lo_u32 v3, s8, v3 +; GFX6-NEXT: v_mul_hi_u32 v4, s8, v2 +; GFX6-NEXT: v_mul_lo_u32 v5, s9, v2 +; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s12, v0 +; GFX6-NEXT: v_mul_lo_u32 v2, s8, v2 +; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s11, v3 ; GFX6-NEXT: v_mov_b32_e32 v5, s9 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s10, v3 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s10, v2 ; GFX6-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc -; GFX6-NEXT: v_subrev_i32_e64 v6, s[0:1], s8, v3 +; GFX6-NEXT: v_subrev_i32_e64 v6, s[0:1], s8, v2 ; GFX6-NEXT: v_subbrev_u32_e64 v7, s[2:3], 0, v4, s[0:1] ; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s9, v7 ; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[2:3] @@ -14038,22 +13997,22 @@ ; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v8 ; GFX6-NEXT: v_cndmask_b32_e64 v4, v7, v4, s[0:1] ; GFX6-NEXT: v_mov_b32_e32 v7, s11 -; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v7, v2, vcc -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v2 +; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v7, v3, vcc +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v3 ; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v2 ; GFX6-NEXT: 
v_cndmask_b32_e64 v8, 0, -1, vcc -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s9, v2 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s9, v3 ; GFX6-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v4, v6, v5, s[0:1] ; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v4, v6, v5, s[0:1] +; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX6-NEXT: v_xor_b32_e32 v2, s14, v2 ; GFX6-NEXT: v_xor_b32_e32 v3, s14, v3 -; GFX6-NEXT: v_xor_b32_e32 v4, s14, v2 -; GFX6-NEXT: v_mov_b32_e32 v5, s14 -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s14, v3 -; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v4, v5, vcc +; GFX6-NEXT: v_mov_b32_e32 v4, s14 +; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s14, v2 +; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -14085,77 +14044,77 @@ ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 ; GFX9-NEXT: v_mac_f32_e32 v0, s19, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s8, s5, 31 ; GFX9-NEXT: s_mov_b32 s9, s8 -; GFX9-NEXT: v_mul_lo_u32 v0, s2, v1 -; GFX9-NEXT: v_mul_hi_u32 v3, s2, v2 -; GFX9-NEXT: v_mul_lo_u32 v5, s3, v2 -; GFX9-NEXT: v_mul_lo_u32 v4, s2, v2 -; GFX9-NEXT: v_add_u32_e32 v0, v3, v0 -; GFX9-NEXT: v_add_u32_e32 v5, v0, v5 -; GFX9-NEXT: v_mul_hi_u32 v3, v2, v4 -; GFX9-NEXT: v_mul_lo_u32 v6, v2, v5 -; GFX9-NEXT: v_mul_hi_u32 v7, v2, v5 -; GFX9-NEXT: v_mul_hi_u32 v8, v1, v5 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v7, vcc -; GFX9-NEXT: v_mul_lo_u32 v7, v1, v4 +; GFX9-NEXT: v_mul_lo_u32 v2, s2, v1 +; GFX9-NEXT: v_mul_hi_u32 v3, s2, v0 +; GFX9-NEXT: v_mul_lo_u32 v5, s3, v0 +; GFX9-NEXT: v_mul_lo_u32 v4, s2, v0 +; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-NEXT: 
v_add_u32_e32 v2, v2, v5 +; GFX9-NEXT: v_mul_hi_u32 v3, v0, v4 +; GFX9-NEXT: v_mul_lo_u32 v5, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v6, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v7, v1, v2 +; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v6, vcc +; GFX9-NEXT: v_mul_lo_u32 v6, v1, v4 ; GFX9-NEXT: v_mul_hi_u32 v4, v1, v4 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v4, vcc -; GFX9-NEXT: v_mul_lo_u32 v4, v1, v5 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v8, v0, vcc -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc -; GFX9-NEXT: v_mul_lo_u32 v3, s2, v1 -; GFX9-NEXT: v_mul_hi_u32 v4, s2, v2 -; GFX9-NEXT: v_mul_lo_u32 v5, s3, v2 -; GFX9-NEXT: v_mul_lo_u32 v6, s2, v2 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v4, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX9-NEXT: v_mul_lo_u32 v2, s2, v1 +; GFX9-NEXT: v_mul_hi_u32 v3, s2, v0 +; GFX9-NEXT: v_mul_lo_u32 v4, s3, v0 +; GFX9-NEXT: v_mul_lo_u32 v5, s2, v0 ; GFX9-NEXT: s_add_u32 s2, s4, s8 -; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v5 -; GFX9-NEXT: v_mul_lo_u32 v7, v2, v3 -; GFX9-NEXT: v_mul_hi_u32 v8, v2, v6 -; GFX9-NEXT: v_mul_hi_u32 v9, v2, v3 -; GFX9-NEXT: v_mul_hi_u32 v5, v1, v6 -; GFX9-NEXT: v_mul_lo_u32 v6, v1, v6 -; GFX9-NEXT: v_mul_hi_u32 v4, v1, v3 -; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v8, v7 -; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v9, vcc -; GFX9-NEXT: v_mul_lo_u32 v3, v1, v3 +; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 +; 
GFX9-NEXT: v_mul_lo_u32 v6, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v7, v0, v5 +; GFX9-NEXT: v_mul_hi_u32 v8, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v4, v1, v5 +; GFX9-NEXT: v_mul_lo_u32 v5, v1, v5 +; GFX9-NEXT: v_mul_hi_u32 v3, v1, v2 ; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v7, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v8, v5, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v0, vcc -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v5, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v8, vcc +; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v6, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v4, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v4, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 ; GFX9-NEXT: s_addc_u32 s3, s5, s8 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc ; GFX9-NEXT: s_xor_b64 s[14:15], s[2:3], s[8:9] -; GFX9-NEXT: v_mul_lo_u32 v3, s14, v1 -; GFX9-NEXT: v_mul_hi_u32 v4, s14, v2 -; GFX9-NEXT: v_mul_hi_u32 v5, s14, v1 -; GFX9-NEXT: v_mul_hi_u32 v6, s15, v1 +; GFX9-NEXT: v_mul_lo_u32 v2, s14, v1 +; GFX9-NEXT: v_mul_hi_u32 v3, s14, v0 +; GFX9-NEXT: v_mul_hi_u32 v4, s14, v1 +; GFX9-NEXT: v_mul_hi_u32 v5, s15, v1 ; GFX9-NEXT: v_mul_lo_u32 v1, s15, v1 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc -; GFX9-NEXT: v_mul_lo_u32 v5, s15, v2 -; GFX9-NEXT: v_mul_hi_u32 v2, s15, v2 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc +; GFX9-NEXT: v_mul_lo_u32 v4, s15, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, s15, v0 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v4, v2, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, 
v0, vcc -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v2, v1 -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v3, vcc -; GFX9-NEXT: v_mul_lo_u32 v2, s12, v2 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v0, v1 +; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v2, vcc +; GFX9-NEXT: v_mul_lo_u32 v2, s12, v0 ; GFX9-NEXT: v_mul_hi_u32 v3, s12, v1 ; GFX9-NEXT: v_mul_lo_u32 v4, s13, v1 ; GFX9-NEXT: v_mul_lo_u32 v1, s12, v1 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 ; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 ; GFX9-NEXT: v_sub_u32_e32 v3, s15, v2 @@ -14223,7 +14182,7 @@ ; GFX9-NEXT: s_mov_b32 s13, s12 ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v9 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v8, v3, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v10, v0, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v10, vcc ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 ; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v7, vcc ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 @@ -14246,7 +14205,7 @@ ; GFX9-NEXT: v_mul_lo_u32 v5, v4, v5 ; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v9, v8 ; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v10, v7, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v6, v0, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 ; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 @@ -14266,7 +14225,7 @@ ; GFX9-NEXT: v_xor_b32_e32 v2, s8, v2 ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v8 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v3, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v9, v0, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v9, vcc ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc ; GFX9-NEXT: v_mul_lo_u32 v4, s10, v4 @@ -14340,16 +14299,16 @@ ; GFX90A-NEXT: v_mac_f32_e32 v0, s16, v1 ; GFX90A-NEXT: 
v_rcp_f32_e32 v0, v0 ; GFX90A-NEXT: s_subb_u32 s1, 0, s13 -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_ashr_i32 s14, s5, 31 +; GFX90A-NEXT: s_mov_b32 s15, s14 ; GFX90A-NEXT: v_mul_f32_e32 v0, s17, v0 ; GFX90A-NEXT: v_mul_f32_e32 v1, s18, v0 ; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 ; GFX90A-NEXT: v_mac_f32_e32 v0, s19, v1 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX90A-NEXT: s_mov_b32 s15, s14 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: v_mul_hi_u32 v3, s0, v0 ; GFX90A-NEXT: v_mul_lo_u32 v5, s0, v1 ; GFX90A-NEXT: v_mul_lo_u32 v2, s1, v0 @@ -14366,7 +14325,7 @@ ; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6 ; GFX90A-NEXT: v_mul_hi_u32 v7, v1, v2 ; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v8, vcc -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v4, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc ; GFX90A-NEXT: v_mul_lo_u32 v2, v1, v2 ; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 ; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc @@ -14388,7 +14347,7 @@ ; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v7 ; GFX90A-NEXT: v_mul_hi_u32 v3, v1, v2 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v8, v6, vcc -; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v4, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX90A-NEXT: v_mul_lo_u32 v2, v1, v2 ; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v5, v2 ; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc @@ -14407,7 +14366,7 @@ ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 ; GFX90A-NEXT: v_mul_hi_u32 v5, s5, v1 ; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, v2, v6, vcc -; GFX90A-NEXT: v_addc_co_u32_e32 v2, vcc, v5, v4, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v5, vcc ; GFX90A-NEXT: v_mul_lo_u32 v1, s5, v1 ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc @@ -14484,7 +14443,7 @@ ; GFX90A-NEXT: v_add_co_u32_e32 v7, vcc, v7, v8 ; GFX90A-NEXT: v_mul_hi_u32 v9, v3, v5 ; GFX90A-NEXT: 
v_addc_co_u32_e32 v6, vcc, v6, v10, vcc -; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, v9, v4, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v9, vcc ; GFX90A-NEXT: v_mul_lo_u32 v5, v3, v5 ; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v6, v5 ; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v7, vcc @@ -14506,7 +14465,7 @@ ; GFX90A-NEXT: v_add_co_u32_e32 v7, vcc, v7, v9 ; GFX90A-NEXT: v_mul_hi_u32 v6, v3, v5 ; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, v10, v8, vcc -; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, v6, v4, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc ; GFX90A-NEXT: v_mul_lo_u32 v5, v3, v5 ; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 ; GFX90A-NEXT: s_ashr_i32 s10, s7, 31 @@ -14527,7 +14486,7 @@ ; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v6, v2 ; GFX90A-NEXT: v_mul_hi_u32 v7, s7, v3 ; GFX90A-NEXT: v_addc_co_u32_e32 v2, vcc, v5, v8, vcc -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v4, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc ; GFX90A-NEXT: v_mul_lo_u32 v3, s7, v3 ; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 ; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc diff --git a/llvm/test/CodeGen/AMDGPU/and.ll b/llvm/test/CodeGen/AMDGPU/and.ll --- a/llvm/test/CodeGen/AMDGPU/and.ll +++ b/llvm/test/CodeGen/AMDGPU/and.ll @@ -63,9 +63,8 @@ ; Second use is a VGPR use of the constant. ; FUNC-LABEL: {{^}}s_and_multi_use_constant_i32_0: -; SI: s_mov_b32 [[K:s[0-9]+]], 0x12d687 -; SI-DAG: s_and_b32 [[AND:s[0-9]+]], s{{[0-9]+}}, [[K]] -; SI-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], [[K]] +; SI-DAG: s_and_b32 [[AND:s[0-9]+]], s{{[0-9]+}}, 0x12d687 +; SI-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x12d687 ; SI: buffer_store_dword [[VK]] define amdgpu_kernel void @s_and_multi_use_constant_i32_0(i32 addrspace(1)* %out, i32 %a, i32 %b) { %and = and i32 %a, 1234567 @@ -79,10 +78,9 @@ ; Second use is another SGPR use of the constant. 
; FUNC-LABEL: {{^}}s_and_multi_use_constant_i32_1: -; SI: s_mov_b32 [[K:s[0-9]+]], 0x12d687 -; SI: s_and_b32 [[AND:s[0-9]+]], s{{[0-9]+}}, [[K]] +; SI: s_and_b32 [[AND:s[0-9]+]], s{{[0-9]+}}, 0x12d687 ; SI: s_add_i32 -; SI: s_add_i32 [[ADD:s[0-9]+]], s{{[0-9]+}}, [[K]] +; SI: s_add_i32 [[ADD:s[0-9]+]], s{{[0-9]+}}, 0x12d687 ; SI: v_mov_b32_e32 [[VADD:v[0-9]+]], [[ADD]] ; SI: buffer_store_dword [[VADD]] define amdgpu_kernel void @s_and_multi_use_constant_i32_1(i32 addrspace(1)* %out, i32 %a, i32 %b) { diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -3073,7 +3073,7 @@ ; GFX8-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX8-NEXT: s_mov_b64 exec, s[2:3] ; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v2, v1 +; GFX8-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX8-NEXT: s_not_b64 exec, exec ; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf @@ -3124,7 +3124,7 @@ ; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX9-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX9-NEXT: s_not_b64 exec, exec ; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf @@ -3166,41 +3166,39 @@ ; ; GFX1064-LABEL: max_i32_varying: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: v_mov_b32_e32 v2, v0 -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: v_mov_b32_e32 v2, v1 +; GFX1064-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX1064-NEXT: s_not_b64 exec, exec ; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_max_i32_dpp 
v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_mov_b32_e32 v3, v2 -; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 -; GFX1064-NEXT: v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v2, 31 -; GFX1064-NEXT: v_mov_b32_e32 v3, s4 -; GFX1064-NEXT: v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v2, 15 -; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1064-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: v_mov_b32_e32 v2, v1 +; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1064-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1064-NEXT: v_mov_b32_e32 v2, s4 +; GFX1064-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_readlane_b32 s5, v2, 31 -; GFX1064-NEXT: v_writelane_b32 v1, s4, 16 +; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 ; GFX1064-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: 
s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_readlane_b32 s7, v2, 63 -; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 -; GFX1064-NEXT: v_writelane_b32 v1, s5, 32 +; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 +; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 +; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 ; GFX1064-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 +; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 ; GFX1064-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_mov_b32 s2, -1 @@ -3220,7 +3218,7 @@ ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1064-NEXT: v_mov_b32_e32 v0, v1 +; GFX1064-NEXT: v_mov_b32_e32 v0, v3 ; GFX1064-NEXT: v_max_i32_e32 v0, s3, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -3229,31 +3227,29 @@ ; ; GFX1032-LABEL: max_i32_varying: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: v_mov_b32_e32 v2, v1 +; GFX1032-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_mov_b32_e32 v3, v2 -; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 +; GFX1032-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 
row_mask:0xf bank_mask:0xf +; GFX1032-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032-NEXT: v_mov_b32_e32 v2, v1 +; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1032-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032-NEXT: v_readlane_b32 s3, v2, 15 -; GFX1032-NEXT: v_readlane_b32 s4, v2, 31 -; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 +; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_writelane_b32 v1, s3, 16 +; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 ; GFX1032-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_mov_b32 s2, -1 @@ -3272,7 +3268,7 @@ ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1032-NEXT: v_mov_b32_e32 v0, v1 +; GFX1032-NEXT: v_mov_b32_e32 v0, v3 ; GFX1032-NEXT: v_max_i32_e32 v0, s3, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -3490,7 +3486,7 @@ ; GFX8-NEXT: v_bfrev_b32_e32 v1, -2 ; GFX8-NEXT: s_mov_b64 exec, s[2:3] ; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v2, v1 +; GFX8-NEXT: v_bfrev_b32_e32 v2, -2 ; GFX8-NEXT: s_not_b64 exec, exec ; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf @@ -3541,7 +3537,7 @@ ; 
GFX9-NEXT: v_bfrev_b32_e32 v1, -2 ; GFX9-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-NEXT: v_bfrev_b32_e32 v2, -2 ; GFX9-NEXT: s_not_b64 exec, exec ; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf @@ -3583,41 +3579,39 @@ ; ; GFX1064-LABEL: min_i32_varying: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: v_mov_b32_e32 v2, v0 -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_bfrev_b32_e32 v1, -2 -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 ; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: v_mov_b32_e32 v2, v1 +; GFX1064-NEXT: v_bfrev_b32_e32 v1, -2 ; GFX1064-NEXT: s_not_b64 exec, exec ; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_mov_b32_e32 v3, v2 -; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 -; GFX1064-NEXT: v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v2, 31 -; GFX1064-NEXT: v_mov_b32_e32 v3, s4 -; GFX1064-NEXT: v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v2, 15 -; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: v_bfrev_b32_e32 v3, -2 +; GFX1064-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: v_mov_b32_e32 v2, v1 +; 
GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1064-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1064-NEXT: v_mov_b32_e32 v2, s4 +; GFX1064-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_readlane_b32 s5, v2, 31 -; GFX1064-NEXT: v_writelane_b32 v1, s4, 16 +; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 ; GFX1064-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_readlane_b32 s7, v2, 63 -; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 -; GFX1064-NEXT: v_writelane_b32 v1, s5, 32 +; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 +; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 +; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 ; GFX1064-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 +; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 ; GFX1064-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_mov_b32 s2, -1 @@ -3637,7 +3631,7 @@ ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1064-NEXT: v_mov_b32_e32 v0, v1 +; GFX1064-NEXT: v_mov_b32_e32 v0, v3 ; GFX1064-NEXT: v_min_i32_e32 v0, s3, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -3646,31 +3640,29 @@ ; ; GFX1032-LABEL: min_i32_varying: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: 
v_bfrev_b32_e32 v1, -2 -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 ; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: v_mov_b32_e32 v2, v1 +; GFX1032-NEXT: v_bfrev_b32_e32 v1, -2 ; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_mov_b32_e32 v3, v2 -; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 +; GFX1032-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032-NEXT: v_mov_b32_e32 v2, v1 +; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1032-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032-NEXT: v_readlane_b32 s3, v2, 15 -; GFX1032-NEXT: v_readlane_b32 s4, v2, 31 -; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032-NEXT: v_bfrev_b32_e32 v3, -2 +; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 +; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_writelane_b32 v1, s3, 16 +; GFX1032-NEXT: v_writelane_b32 v3, s3, 
16 ; GFX1032-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_mov_b32 s2, -1 @@ -3689,7 +3681,7 @@ ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1032-NEXT: v_mov_b32_e32 v0, v1 +; GFX1032-NEXT: v_mov_b32_e32 v0, v3 ; GFX1032-NEXT: v_min_i32_e32 v0, s3, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/bypass-div.ll b/llvm/test/CodeGen/AMDGPU/bypass-div.ll --- a/llvm/test/CodeGen/AMDGPU/bypass-div.ll +++ b/llvm/test/CodeGen/AMDGPU/bypass-div.ll @@ -27,7 +27,6 @@ ; GFX9-NEXT: v_subb_co_u32_e32 v8, vcc, 0, v10, vcc ; GFX9-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 ; GFX9-NEXT: v_rcp_f32_e32 v2, v2 -; GFX9-NEXT: v_mov_b32_e32 v14, 0 ; GFX9-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; GFX9-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3 @@ -42,11 +41,11 @@ ; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v5, 0 ; GFX9-NEXT: v_add_co_u32_e32 v13, vcc, v13, v3 ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v2, 0 -; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v4, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v14, vcc, 0, v4, vcc ; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v5, 0 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v13, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v15, v3, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v14, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v14, v3, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v13, vcc, v6, v2 @@ -57,13 +56,13 @@ ; GFX9-NEXT: v_add3_u32 v5, v3, v4, v5 ; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v12, v5, 0 ; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v5, 0 -; GFX9-NEXT: v_mul_hi_u32 v15, v13, v2 +; GFX9-NEXT: v_mul_hi_u32 v14, v13, v2 ; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, 
v2, 0 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v15, v5 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v14, v5 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v6, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v7 ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v5, v8, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v14, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v13, v2 @@ -81,7 +80,7 @@ ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v5, v3, 0 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v7, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v8, v1, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v14, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc ; GFX9-NEXT: v_mul_lo_u32 v7, v10, v2 @@ -175,7 +174,6 @@ ; GFX9-NEXT: v_subb_co_u32_e32 v11, vcc, 0, v3, vcc ; GFX9-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 ; GFX9-NEXT: v_rcp_f32_e32 v4, v4 -; GFX9-NEXT: v_mov_b32_e32 v13, 0 ; GFX9-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; GFX9-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5 @@ -190,32 +188,32 @@ ; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v7, 0 ; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v12, v5 ; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v4, 0 -; GFX9-NEXT: v_addc_co_u32_e32 v14, vcc, 0, v6, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v6, vcc ; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v8, v7, 0 ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v12, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v14, v5, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v13, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v13, v5, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc ; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v9, v4 -; GFX9-NEXT: v_addc_co_u32_e32 
v14, vcc, v8, v5, vcc -; GFX9-NEXT: v_mul_lo_u32 v6, v10, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, v8, v5, vcc +; GFX9-NEXT: v_mul_lo_u32 v6, v10, v13 ; GFX9-NEXT: v_mul_lo_u32 v7, v11, v12 ; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v12, 0 ; GFX9-NEXT: v_add3_u32 v7, v5, v6, v7 -; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v14, v7, 0 +; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v7, 0 ; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v7, 0 ; GFX9-NEXT: v_mul_hi_u32 v11, v12, v4 -; GFX9-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v14, v4, 0 +; GFX9-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v4, 0 ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v11, v7 ; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v8, vcc ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v9 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v10, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v6, v13, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v6, vcc ; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v12, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v14, v5, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v13, v5, vcc ; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v0, v7, 0 ; GFX9-NEXT: v_mul_hi_u32 v8, v0, v6 ; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v8, v4 @@ -224,7 +222,7 @@ ; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v1, v7, 0 ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v8, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v9, v5, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v13, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc ; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v4, v6 ; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v5, vcc ; GFX9-NEXT: v_mul_lo_u32 v8, v3, v6 @@ -318,7 +316,6 @@ ; GFX9-NEXT: v_subb_co_u32_e32 v8, vcc, 0, v9, vcc ; GFX9-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 ; GFX9-NEXT: v_rcp_f32_e32 v2, v2 -; GFX9-NEXT: v_mov_b32_e32 v13, 0 ; GFX9-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; GFX9-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; GFX9-NEXT: 
v_trunc_f32_e32 v3, v3 @@ -333,11 +330,11 @@ ; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v5, 0 ; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v12, v3 ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v11, v2, 0 -; GFX9-NEXT: v_addc_co_u32_e32 v14, vcc, 0, v4, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v4, vcc ; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v11, v5, 0 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v12, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v14, v3, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v13, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v13, v3, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v6, v2 @@ -348,13 +345,13 @@ ; GFX9-NEXT: v_add3_u32 v5, v3, v4, v5 ; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v5, 0 ; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v5, 0 -; GFX9-NEXT: v_mul_hi_u32 v14, v12, v2 +; GFX9-NEXT: v_mul_hi_u32 v13, v12, v2 ; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v2, 0 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v14, v5 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v13, v5 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v6, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v7 ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v5, v8, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v13, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v12, v2 @@ -372,7 +369,7 @@ ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v3, 0 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v7, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v8, v1, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v13, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: v_mul_lo_u32 v2, v9, v0 @@ -462,7 +459,6 @@ ; 
GFX9-NEXT: v_subb_co_u32_e32 v11, vcc, 0, v3, vcc ; GFX9-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 ; GFX9-NEXT: v_rcp_f32_e32 v4, v4 -; GFX9-NEXT: v_mov_b32_e32 v13, 0 ; GFX9-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; GFX9-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5 @@ -477,32 +473,32 @@ ; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v7, 0 ; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v12, v5 ; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v4, 0 -; GFX9-NEXT: v_addc_co_u32_e32 v14, vcc, 0, v6, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v6, vcc ; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v8, v7, 0 ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v12, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v14, v5, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v13, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v13, v5, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc ; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v9, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v14, vcc, v8, v5, vcc -; GFX9-NEXT: v_mul_lo_u32 v6, v10, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, v8, v5, vcc +; GFX9-NEXT: v_mul_lo_u32 v6, v10, v13 ; GFX9-NEXT: v_mul_lo_u32 v7, v11, v12 ; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v12, 0 ; GFX9-NEXT: v_add3_u32 v7, v5, v6, v7 -; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v14, v7, 0 +; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v7, 0 ; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v7, 0 ; GFX9-NEXT: v_mul_hi_u32 v11, v12, v4 -; GFX9-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v14, v4, 0 +; GFX9-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v4, 0 ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v11, v7 ; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v8, vcc ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v9 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v10, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v6, v13, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc ; GFX9-NEXT: 
v_add_co_u32_e32 v4, vcc, v4, v5 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v6, vcc ; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v12, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v14, v5, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v13, v5, vcc ; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v0, v7, 0 ; GFX9-NEXT: v_mul_hi_u32 v8, v0, v6 ; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v8, v4 @@ -511,7 +507,7 @@ ; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v1, v7, 0 ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v8, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v9, v5, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v13, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc ; GFX9-NEXT: v_mul_lo_u32 v6, v3, v4 @@ -728,7 +724,6 @@ ; GFX9-NEXT: v_subb_co_u32_e32 v8, vcc, 0, v10, vcc ; GFX9-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 ; GFX9-NEXT: v_rcp_f32_e32 v2, v2 -; GFX9-NEXT: v_mov_b32_e32 v14, 0 ; GFX9-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; GFX9-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3 @@ -743,11 +738,11 @@ ; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v5, 0 ; GFX9-NEXT: v_add_co_u32_e32 v13, vcc, v13, v3 ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v2, 0 -; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v4, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v14, vcc, 0, v4, vcc ; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v5, 0 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v13, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v15, v3, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v14, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v14, v3, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v13, vcc, v6, v2 @@ -758,13 +753,13 @@ ; GFX9-NEXT: v_add3_u32 v5, v3, v4, v5 ; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v12, v5, 0 ; GFX9-NEXT: v_mad_u64_u32 v[5:6], 
s[4:5], v13, v5, 0 -; GFX9-NEXT: v_mul_hi_u32 v15, v13, v2 +; GFX9-NEXT: v_mul_hi_u32 v14, v13, v2 ; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v2, 0 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v15, v5 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v14, v5 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v6, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v7 ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v5, v8, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v14, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v13, v2 @@ -782,7 +777,7 @@ ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v3, 0 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v8, v1, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v14, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc ; GFX9-NEXT: v_mul_lo_u32 v6, v10, v2 @@ -896,7 +891,6 @@ ; GFX9-NEXT: v_subb_co_u32_e32 v11, vcc, 0, v3, vcc ; GFX9-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 ; GFX9-NEXT: v_rcp_f32_e32 v4, v4 -; GFX9-NEXT: v_mov_b32_e32 v13, 0 ; GFX9-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; GFX9-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5 @@ -911,32 +905,32 @@ ; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v7, 0 ; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v12, v5 ; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v4, 0 -; GFX9-NEXT: v_addc_co_u32_e32 v14, vcc, 0, v6, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v6, vcc ; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v8, v7, 0 ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v12, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v14, v5, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v13, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v13, v5, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc ; GFX9-NEXT: 
v_add_co_u32_e32 v4, vcc, v4, v6 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc ; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v9, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v14, vcc, v8, v5, vcc -; GFX9-NEXT: v_mul_lo_u32 v6, v10, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, v8, v5, vcc +; GFX9-NEXT: v_mul_lo_u32 v6, v10, v13 ; GFX9-NEXT: v_mul_lo_u32 v7, v11, v12 ; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v12, 0 ; GFX9-NEXT: v_add3_u32 v7, v5, v6, v7 -; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v14, v7, 0 +; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v7, 0 ; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v7, 0 ; GFX9-NEXT: v_mul_hi_u32 v11, v12, v4 -; GFX9-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v14, v4, 0 +; GFX9-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v4, 0 ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v11, v7 ; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v8, vcc ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v9 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v10, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v6, v13, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v6, vcc ; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v12, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v14, v5, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v13, v5, vcc ; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v0, v7, 0 ; GFX9-NEXT: v_mul_hi_u32 v8, v0, v6 ; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v8, v4 @@ -945,7 +939,7 @@ ; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v1, v7, 0 ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v8, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v9, v5, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v13, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc ; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v4, v6 ; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v5, vcc ; GFX9-NEXT: v_mul_lo_u32 v8, v3, v6 diff --git a/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll b/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll --- 
a/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll +++ b/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll @@ -5,7 +5,7 @@ ; CHECK-LABEL: _Z11test_kernelPii: ; CHECK: s_mul_i32 ; CHECK: s_sub_i32 -; CHECK: s_and_b32 [[S1:s[0-9]+]], {{s[0-9]+}}, {{s[0-9]+}} +; CHECK: s_and_b32 [[S1:s[0-9]+]], {{s[0-9]+}}, 0xffff ; CHECK: s_add_i32 [[S2:s[0-9]+]], {{s[0-9]+}}, [[S1]] ; CHECK: s_or_b32 {{s[0-9]+}}, [[S2]], 0xc0 diff --git a/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll b/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll --- a/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll +++ b/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll @@ -226,7 +226,6 @@ ; GCN-LABEL: {{^}}load_sampler ; GCN: v_readfirstlane_b32 -; GCN-NEXT: v_readfirstlane_b32 ; SI: s_nop ; GCN: s_load_dwordx8 ; GCN-NEXT: s_load_dwordx4 @@ -260,7 +259,6 @@ ; GCN-LABEL: {{^}}load_sampler_nouniform ; GCN: v_readfirstlane_b32 -; GCN-NEXT: v_readfirstlane_b32 ; SI: s_nop ; GCN: s_load_dwordx8 ; GCN-NEXT: s_load_dwordx4 diff --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll --- a/llvm/test/CodeGen/AMDGPU/ctlz.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll @@ -1498,15 +1498,13 @@ ; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_ushort v1, v0, s[2:3] -; GFX10-GISEL-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-GISEL-NEXT: s_mov_b32 s2, 0xffff ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v2, v1 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX10-GISEL-NEXT: v_min_u32_e32 v2, 32, v2 ; GFX10-GISEL-NEXT: v_subrev_nc_u32_e32 v2, 16, v2 -; GFX10-GISEL-NEXT: v_and_b32_e32 v2, s2, v2 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v2, s2, vcc_lo +; GFX10-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v2, 0xffff, vcc_lo ; GFX10-GISEL-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm %val = load i16, i16 
addrspace(1)* %valptr @@ -1602,19 +1600,18 @@ ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s3 -; GFX10-GISEL-NEXT: s_movk_i32 s2, 0x7f ; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0 ; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v2, v3, vcc_lo ; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX10-GISEL-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0x7f, v0 ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX10-GISEL-NEXT: v_subrev_nc_u32_e32 v1, 25, v1 ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 0x7f, vcc_lo ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-GISEL-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0x7f, v0 ; GFX10-GISEL-NEXT: global_store_byte v1, v0, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/cttz.ll b/llvm/test/CodeGen/AMDGPU/cttz.ll --- a/llvm/test/CodeGen/AMDGPU/cttz.ll +++ b/llvm/test/CodeGen/AMDGPU/cttz.ll @@ -1490,14 +1490,12 @@ ; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_ushort v1, v0, s[2:3] -; GFX10-GISEL-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-GISEL-NEXT: s_mov_b32 s2, 0xffff ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_or_b32_e32 v2, 0x10000, v1 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v2, v2 -; GFX10-GISEL-NEXT: v_and_b32_e32 v2, s2, v2 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v2, s2, vcc_lo +; GFX10-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v2, 0xffff, vcc_lo ; GFX10-GISEL-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm %val = load i16, i16 addrspace(1)* %valptr @@ -1593,18 +1591,17 @@ ; 
GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s3 -; GFX10-GISEL-NEXT: s_movk_i32 s2, 0x7f ; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0 ; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v2, v3, vcc_lo ; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_or_b32_e32 v1, 0x80, v0 -; GFX10-GISEL-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0x7f, v0 ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v1, v1 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 0x7f, vcc_lo ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-GISEL-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0x7f, v0 ; GFX10-GISEL-NEXT: global_store_byte v1, v0, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll --- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -960,7 +960,6 @@ ; VI-NEXT: flat_load_dword v4, v[0:1] ; VI-NEXT: s_mov_b32 s10, s6 ; VI-NEXT: s_mov_b32 s11, s7 -; VI-NEXT: s_movk_i32 s0, 0x900 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v6, 24, v4 ; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v4 @@ -975,8 +974,8 @@ ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v6 ; VI-NEXT: v_or_b32_sdwa v0, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_add_u16_e32 v0, s0, v0 +; VI-NEXT: v_mov_b32_e32 v2, 0x900 +; VI-NEXT: v_add_u16_e32 v0, 0x900, v0 ; VI-NEXT: v_add_u16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v0, v1 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -993,7 +992,6 @@ 
; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c -; GFX10-NEXT: s_movk_i32 s0, 0x900 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX10-NEXT: v_lshrrev_b32_sdwa v1, v1, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -1004,8 +1002,8 @@ ; GFX10-NEXT: v_or_b32_sdwa v2, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 -; GFX10-NEXT: v_add_nc_u16 v1, v1, s0 -; GFX10-NEXT: v_add_nc_u16 v5, v2, s0 +; GFX10-NEXT: v_add_nc_u16 v1, 0x900, v1 +; GFX10-NEXT: v_add_nc_u16 v5, 0x900, v2 ; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v1 ; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 diff --git a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll --- a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll @@ -42,9 +42,8 @@ ; CI: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2 ; GFX89: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8 -; GCN: s_mov_b32 [[MASK:s[0-9]+]], 0x7fff7fff -; GCN-DAG: s_and_b32 s{{[0-9]+}}, s[[LO]], [[MASK]] -; GCN-DAG: s_and_b32 s{{[0-9]+}}, s[[HI]], [[MASK]] +; GCN-DAG: s_and_b32 s{{[0-9]+}}, s[[LO]], 0x7fff7fff +; GCN-DAG: s_and_b32 s{{[0-9]+}}, s[[HI]], 0x7fff7fff ; GCN: {{flat|global}}_store_dwordx2 define amdgpu_kernel void @s_fabs_v4f16(<4 x half> addrspace(1)* %out, <4 x half> %in) { %fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %in) diff --git a/llvm/test/CodeGen/AMDGPU/fabs.f64.ll b/llvm/test/CodeGen/AMDGPU/fabs.f64.ll --- a/llvm/test/CodeGen/AMDGPU/fabs.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fabs.f64.ll @@ -30,8 +30,8 @@ } ; FUNC-LABEL: {{^}}fabs_v2f64: -; SI: s_and_b32 -; SI: s_and_b32 +; SI: s_bitset0_b32 +; SI: s_bitset0_b32 ; SI: s_endpgm 
define amdgpu_kernel void @fabs_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) { %fabs = call <2 x double> @llvm.fabs.v2f64(<2 x double> %in) @@ -40,10 +40,10 @@ } ; FUNC-LABEL: {{^}}fabs_v4f64: -; SI: s_and_b32 -; SI: s_and_b32 -; SI: s_and_b32 -; SI: s_and_b32 +; SI: s_bitset0_b32 +; SI: s_bitset0_b32 +; SI: s_bitset0_b32 +; SI: s_bitset0_b32 ; SI: s_endpgm define amdgpu_kernel void @fabs_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) { %fabs = call <4 x double> @llvm.fabs.v4f64(<4 x double> %in) diff --git a/llvm/test/CodeGen/AMDGPU/fabs.ll b/llvm/test/CodeGen/AMDGPU/fabs.ll --- a/llvm/test/CodeGen/AMDGPU/fabs.ll +++ b/llvm/test/CodeGen/AMDGPU/fabs.ll @@ -48,8 +48,8 @@ ; R600: |{{(PV|T[0-9])\.[XYZW]}}| ; R600: |{{(PV|T[0-9])\.[XYZW]}}| -; GCN: s_and_b32 -; GCN: s_and_b32 +; GCN: s_bitset0_b32 +; GCN: s_bitset0_b32 define amdgpu_kernel void @fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) { %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in) store <2 x float> %fabs, <2 x float> addrspace(1)* %out @@ -62,10 +62,10 @@ ; R600: |{{(PV|T[0-9])\.[XYZW]}}| ; R600: |{{(PV|T[0-9])\.[XYZW]}}| -; GCN: s_and_b32 -; GCN: s_and_b32 -; GCN: s_and_b32 -; GCN: s_and_b32 +; GCN: s_bitset0_b32 +; GCN: s_bitset0_b32 +; GCN: s_bitset0_b32 +; GCN: s_bitset0_b32 define amdgpu_kernel void @fabs_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) { %fabs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %in) store <4 x float> %fabs, <4 x float> addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/fexp.ll b/llvm/test/CodeGen/AMDGPU/fexp.ll --- a/llvm/test/CodeGen/AMDGPU/fexp.ll +++ b/llvm/test/CodeGen/AMDGPU/fexp.ll @@ -107,10 +107,9 @@ ; VI-LABEL: v_exp_v2f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: s_movk_i32 s4, 0x3dc5 -; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_mov_b32_e32 v1, 0x3dc5 ; VI-NEXT: v_mul_f16_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: 
v_mul_f16_e32 v0, s4, v0 +; VI-NEXT: v_mul_f16_e32 v0, 0x3dc5, v0 ; VI-NEXT: v_exp_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD ; VI-NEXT: v_exp_f16_e32 v0, v0 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -161,7 +160,7 @@ ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_movk_i32 s4, 0x3dc5 -; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_mov_b32_e32 v3, 0x3dc5 ; VI-NEXT: v_mul_f16_e32 v2, s4, v1 ; VI-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_mul_f16_e32 v4, s4, v0 diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll --- a/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll @@ -11,26 +11,24 @@ ; FLAT_SCR_OPT-NEXT: s_addc_u32 s1, s1, 0 ; FLAT_SCR_OPT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 ; FLAT_SCR_OPT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 -; FLAT_SCR_OPT-NEXT: v_mov_b32_e32 v0, 4 -; FLAT_SCR_OPT-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 0, 16) +; FLAT_SCR_OPT-NEXT: v_cmp_ne_u32_e64 s0, 4, -1 +; FLAT_SCR_OPT-NEXT: s_getreg_b32 s1, hwreg(HW_REG_SH_MEM_BASES, 0, 16) ; FLAT_SCR_OPT-NEXT: v_mov_b32_e32 v2, 0 -; FLAT_SCR_OPT-NEXT: s_lshl_b32 s0, s0, 16 -; FLAT_SCR_OPT-NEXT: v_cmp_ne_u32_e32 vcc_lo, -1, v0 -; FLAT_SCR_OPT-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc_lo -; FLAT_SCR_OPT-NEXT: v_cndmask_b32_e64 v1, 0, s0, vcc_lo +; FLAT_SCR_OPT-NEXT: s_lshl_b32 s1, s1, 16 +; FLAT_SCR_OPT-NEXT: v_cndmask_b32_e64 v0, 0, 4, s0 +; FLAT_SCR_OPT-NEXT: v_cndmask_b32_e64 v1, 0, s1, s0 ; FLAT_SCR_OPT-NEXT: flat_store_dword v[0:1], v2 ; FLAT_SCR_OPT-NEXT: s_waitcnt_vscnt null, 0x0 ; FLAT_SCR_OPT-NEXT: s_endpgm ; ; FLAT_SCR_ARCH-LABEL: stack_object_addrspacecast_in_kernel_no_calls: ; FLAT_SCR_ARCH: ; %bb.0: -; FLAT_SCR_ARCH-NEXT: v_mov_b32_e32 v0, 4 -; FLAT_SCR_ARCH-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 0, 16) +; FLAT_SCR_ARCH-NEXT: 
v_cmp_ne_u32_e64 s0, 4, -1 +; FLAT_SCR_ARCH-NEXT: s_getreg_b32 s1, hwreg(HW_REG_SH_MEM_BASES, 0, 16) ; FLAT_SCR_ARCH-NEXT: v_mov_b32_e32 v2, 0 -; FLAT_SCR_ARCH-NEXT: s_lshl_b32 s0, s0, 16 -; FLAT_SCR_ARCH-NEXT: v_cmp_ne_u32_e32 vcc_lo, -1, v0 -; FLAT_SCR_ARCH-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc_lo -; FLAT_SCR_ARCH-NEXT: v_cndmask_b32_e64 v1, 0, s0, vcc_lo +; FLAT_SCR_ARCH-NEXT: s_lshl_b32 s1, s1, 16 +; FLAT_SCR_ARCH-NEXT: v_cndmask_b32_e64 v0, 0, 4, s0 +; FLAT_SCR_ARCH-NEXT: v_cndmask_b32_e64 v1, 0, s1, s0 ; FLAT_SCR_ARCH-NEXT: flat_store_dword v[0:1], v2 ; FLAT_SCR_ARCH-NEXT: s_waitcnt_vscnt null, 0x0 ; FLAT_SCR_ARCH-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll --- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll @@ -428,12 +428,11 @@ ; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 -; GFX10-NEXT: v_mov_b32_e32 v1, 4 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: v_mov_b32_e32 v3, 15 -; GFX10-NEXT: v_add_nc_u32_e32 v2, v1, v0 -; GFX10-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GFX10-NEXT: scratch_store_dword v2, v3, off +; GFX10-NEXT: v_mov_b32_e32 v2, 15 +; GFX10-NEXT: v_add_nc_u32_e32 v1, 4, v0 +; GFX10-NEXT: v_sub_nc_u32_e32 v0, 4, v0 +; GFX10-NEXT: scratch_store_dword v1, v2, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -470,12 +469,11 @@ ; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0 ; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 ; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 -; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 4 ; GFX10-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-PAL-NEXT: v_mov_b32_e32 v3, 15 -; GFX10-PAL-NEXT: v_add_nc_u32_e32 v2, v1, v0 -; GFX10-PAL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GFX10-PAL-NEXT: scratch_store_dword 
v2, v3, off +; GFX10-PAL-NEXT: v_mov_b32_e32 v2, 15 +; GFX10-PAL-NEXT: v_add_nc_u32_e32 v1, 4, v0 +; GFX10-PAL-NEXT: v_sub_nc_u32_e32 v0, 4, v0 +; GFX10-PAL-NEXT: scratch_store_dword v1, v2, off ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) @@ -514,12 +512,11 @@ ; GFX10: ; %bb.0: ; %bb ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v1, s32 -; GFX10-NEXT: v_and_b32_e32 v2, 15, v0 -; GFX10-NEXT: v_mov_b32_e32 v3, 15 -; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, v1 -; GFX10-NEXT: v_lshl_add_u32 v1, v2, 2, v1 -; GFX10-NEXT: scratch_store_dword v0, v3, off +; GFX10-NEXT: v_and_b32_e32 v1, 15, v0 +; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, s32 +; GFX10-NEXT: v_mov_b32_e32 v2, 15 +; GFX10-NEXT: v_lshl_add_u32 v1, v1, 2, s32 +; GFX10-NEXT: scratch_store_dword v0, v2, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, v1, off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -543,12 +540,11 @@ ; GFX10-PAL: ; %bb.0: ; %bb ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-PAL-NEXT: v_mov_b32_e32 v1, s32 -; GFX10-PAL-NEXT: v_and_b32_e32 v2, 15, v0 -; GFX10-PAL-NEXT: v_mov_b32_e32 v3, 15 -; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1 -; GFX10-PAL-NEXT: v_lshl_add_u32 v1, v2, 2, v1 -; GFX10-PAL-NEXT: scratch_store_dword v0, v3, off +; GFX10-PAL-NEXT: v_and_b32_e32 v1, 15, v0 +; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, s32 +; GFX10-PAL-NEXT: v_mov_b32_e32 v2, 15 +; GFX10-PAL-NEXT: v_lshl_add_u32 v1, v1, 2, s32 +; GFX10-PAL-NEXT: scratch_store_dword v0, v2, off ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-PAL-NEXT: scratch_load_dword v0, v1, off glc dlc ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) @@ -1137,14 +1133,13 @@ ; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 ; GFX10-NEXT: 
s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 -; GFX10-NEXT: v_mov_b32_e32 v1, 0x104 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: v_mov_b32_e32 v3, 15 -; GFX10-NEXT: v_add_nc_u32_e32 v2, v1, v0 -; GFX10-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GFX10-NEXT: scratch_load_dword v1, off, off offset:4 glc dlc +; GFX10-NEXT: v_mov_b32_e32 v2, 15 +; GFX10-NEXT: scratch_load_dword v3, off, off offset:4 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: scratch_store_dword v2, v3, off +; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x104, v0 +; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0x104, v0 +; GFX10-NEXT: scratch_store_dword v1, v2, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -1184,15 +1179,14 @@ ; GFX1010-PAL-NEXT: s_addc_u32 s3, s3, 0 ; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 ; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 -; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, 0x104 ; GFX1010-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX1010-PAL-NEXT: v_mov_b32_e32 v3, 15 +; GFX1010-PAL-NEXT: v_mov_b32_e32 v2, 15 ; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 -; GFX1010-PAL-NEXT: v_add_nc_u32_e32 v2, v1, v0 -; GFX1010-PAL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GFX1010-PAL-NEXT: scratch_load_dword v1, off, vcc_lo offset:4 glc dlc +; GFX1010-PAL-NEXT: scratch_load_dword v3, off, vcc_lo offset:4 glc dlc ; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX1010-PAL-NEXT: scratch_store_dword v2, v3, off +; GFX1010-PAL-NEXT: v_add_nc_u32_e32 v1, 0x104, v0 +; GFX1010-PAL-NEXT: v_sub_nc_u32_e32 v0, 0x104, v0 +; GFX1010-PAL-NEXT: scratch_store_dword v1, v2, off ; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1010-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc ; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) @@ -1209,14 +1203,13 @@ ; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0 ; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 ; GFX1030-PAL-NEXT: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_HI), s3 -; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, 0x104 ; GFX1030-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX1030-PAL-NEXT: v_mov_b32_e32 v3, 15 -; GFX1030-PAL-NEXT: v_add_nc_u32_e32 v2, v1, v0 -; GFX1030-PAL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GFX1030-PAL-NEXT: scratch_load_dword v1, off, off offset:4 glc dlc +; GFX1030-PAL-NEXT: v_mov_b32_e32 v2, 15 +; GFX1030-PAL-NEXT: scratch_load_dword v3, off, off offset:4 glc dlc ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX1030-PAL-NEXT: scratch_store_dword v2, v3, off +; GFX1030-PAL-NEXT: v_add_nc_u32_e32 v1, 0x104, v0 +; GFX1030-PAL-NEXT: v_sub_nc_u32_e32 v0, 0x104, v0 +; GFX1030-PAL-NEXT: scratch_store_dword v1, v2, off ; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1030-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) @@ -1261,15 +1254,15 @@ ; GFX10: ; %bb.0: ; %bb ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_and_b32_e32 v1, 15, v0 ; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x100 -; GFX10-NEXT: v_and_b32_e32 v2, 15, v0 -; GFX10-NEXT: v_mov_b32_e32 v1, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v3, 15 -; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, v1 -; GFX10-NEXT: v_lshl_add_u32 v1, v2, 2, v1 -; GFX10-NEXT: scratch_load_dword v2, off, s32 glc dlc +; GFX10-NEXT: v_mov_b32_e32 v2, 15 +; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, vcc_lo +; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x100 +; GFX10-NEXT: scratch_load_dword v3, off, s32 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: scratch_store_dword v0, v3, off +; GFX10-NEXT: v_lshl_add_u32 v1, v1, 2, vcc_lo +; GFX10-NEXT: scratch_store_dword v0, v2, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, v1, off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -1296,15 +1289,15 @@ ; GFX10-PAL: ; %bb.0: ; %bb ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-PAL-NEXT: 
v_and_b32_e32 v1, 15, v0 +; GFX10-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x100 +; GFX10-PAL-NEXT: v_mov_b32_e32 v2, 15 +; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, vcc_lo ; GFX10-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x100 -; GFX10-PAL-NEXT: v_and_b32_e32 v2, 15, v0 -; GFX10-PAL-NEXT: v_mov_b32_e32 v1, vcc_lo -; GFX10-PAL-NEXT: v_mov_b32_e32 v3, 15 -; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1 -; GFX10-PAL-NEXT: v_lshl_add_u32 v1, v2, 2, v1 -; GFX10-PAL-NEXT: scratch_load_dword v2, off, s32 glc dlc +; GFX10-PAL-NEXT: scratch_load_dword v3, off, s32 glc dlc ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX10-PAL-NEXT: scratch_store_dword v0, v3, off +; GFX10-PAL-NEXT: v_lshl_add_u32 v1, v1, 2, vcc_lo +; GFX10-PAL-NEXT: scratch_store_dword v0, v2, off ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-PAL-NEXT: scratch_load_dword v0, v1, off glc dlc ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) @@ -1909,14 +1902,13 @@ ; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 -; GFX10-NEXT: v_mov_b32_e32 v1, 0x4004 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: v_mov_b32_e32 v3, 15 -; GFX10-NEXT: v_add_nc_u32_e32 v2, v1, v0 -; GFX10-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GFX10-NEXT: scratch_load_dword v1, off, off offset:4 glc dlc +; GFX10-NEXT: v_mov_b32_e32 v2, 15 +; GFX10-NEXT: scratch_load_dword v3, off, off offset:4 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: scratch_store_dword v2, v3, off +; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x4004, v0 +; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0x4004, v0 +; GFX10-NEXT: scratch_store_dword v1, v2, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -1956,15 +1948,14 @@ ; GFX1010-PAL-NEXT: s_addc_u32 s3, s3, 0 ; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 ; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 -; GFX1010-PAL-NEXT: 
v_mov_b32_e32 v1, 0x4004 ; GFX1010-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX1010-PAL-NEXT: v_mov_b32_e32 v3, 15 +; GFX1010-PAL-NEXT: v_mov_b32_e32 v2, 15 ; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 -; GFX1010-PAL-NEXT: v_add_nc_u32_e32 v2, v1, v0 -; GFX1010-PAL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GFX1010-PAL-NEXT: scratch_load_dword v1, off, vcc_lo offset:4 glc dlc +; GFX1010-PAL-NEXT: scratch_load_dword v3, off, vcc_lo offset:4 glc dlc ; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX1010-PAL-NEXT: scratch_store_dword v2, v3, off +; GFX1010-PAL-NEXT: v_add_nc_u32_e32 v1, 0x4004, v0 +; GFX1010-PAL-NEXT: v_sub_nc_u32_e32 v0, 0x4004, v0 +; GFX1010-PAL-NEXT: scratch_store_dword v1, v2, off ; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1010-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc ; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) @@ -1981,14 +1972,13 @@ ; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0 ; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 ; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 -; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, 0x4004 ; GFX1030-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX1030-PAL-NEXT: v_mov_b32_e32 v3, 15 -; GFX1030-PAL-NEXT: v_add_nc_u32_e32 v2, v1, v0 -; GFX1030-PAL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GFX1030-PAL-NEXT: scratch_load_dword v1, off, off offset:4 glc dlc +; GFX1030-PAL-NEXT: v_mov_b32_e32 v2, 15 +; GFX1030-PAL-NEXT: scratch_load_dword v3, off, off offset:4 glc dlc ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX1030-PAL-NEXT: scratch_store_dword v2, v3, off +; GFX1030-PAL-NEXT: v_add_nc_u32_e32 v1, 0x4004, v0 +; GFX1030-PAL-NEXT: v_sub_nc_u32_e32 v0, 0x4004, v0 +; GFX1030-PAL-NEXT: scratch_store_dword v1, v2, off ; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1030-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) @@ -2033,15 +2023,15 @@ ; GFX10: ; %bb.0: ; %bb ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 
0x0 +; GFX10-NEXT: v_and_b32_e32 v1, 15, v0 ; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x4004 -; GFX10-NEXT: v_and_b32_e32 v2, 15, v0 -; GFX10-NEXT: v_mov_b32_e32 v1, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v3, 15 -; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, v1 -; GFX10-NEXT: v_lshl_add_u32 v1, v2, 2, v1 -; GFX10-NEXT: scratch_load_dword v2, off, s32 offset:4 glc dlc +; GFX10-NEXT: v_mov_b32_e32 v2, 15 +; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, vcc_lo +; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x4004 +; GFX10-NEXT: scratch_load_dword v3, off, s32 offset:4 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: scratch_store_dword v0, v3, off +; GFX10-NEXT: v_lshl_add_u32 v1, v1, 2, vcc_lo +; GFX10-NEXT: scratch_store_dword v0, v2, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, v1, off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -2068,15 +2058,15 @@ ; GFX10-PAL: ; %bb.0: ; %bb ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-PAL-NEXT: v_and_b32_e32 v1, 15, v0 +; GFX10-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4004 +; GFX10-PAL-NEXT: v_mov_b32_e32 v2, 15 +; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, vcc_lo ; GFX10-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4004 -; GFX10-PAL-NEXT: v_and_b32_e32 v2, 15, v0 -; GFX10-PAL-NEXT: v_mov_b32_e32 v1, vcc_lo -; GFX10-PAL-NEXT: v_mov_b32_e32 v3, 15 -; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1 -; GFX10-PAL-NEXT: v_lshl_add_u32 v1, v2, 2, v1 -; GFX10-PAL-NEXT: scratch_load_dword v2, off, s32 offset:4 glc dlc +; GFX10-PAL-NEXT: scratch_load_dword v3, off, s32 offset:4 glc dlc ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX10-PAL-NEXT: scratch_store_dword v0, v3, off +; GFX10-PAL-NEXT: v_lshl_add_u32 v1, v1, 2, vcc_lo +; GFX10-PAL-NEXT: scratch_store_dword v0, v2, off ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-PAL-NEXT: scratch_load_dword v0, v1, off glc dlc ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/fmed3.ll 
b/llvm/test/CodeGen/AMDGPU/fmed3.ll --- a/llvm/test/CodeGen/AMDGPU/fmed3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmed3.ll @@ -970,7 +970,7 @@ ; FIXME: Simple stores do not work as a multiple use because they are bitcasted to integer constants. ; GCN-LABEL: {{^}}one_non_inline_constant: -; GCN-DAG: s_mov_b32 [[K1:s[0-9]+]], 0x41800000 +; GCN-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0x41800000 ; GCN-DAG: v_add_f32_e32 [[ADD:v[0-9]+]], 0.5, ; GCN: v_med3_f32 v{{[0-9]+}}, [[ADD]], 1.0, [[K1]] define amdgpu_kernel void @one_non_inline_constant(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 { @@ -990,9 +990,8 @@ } ; GCN-LABEL: {{^}}two_non_inline_constant_multi_use: -; GCN-DAG: s_mov_b32 [[K1:s[0-9]+]], 0x41800000 ; GCN-DAG: s_mov_b32 [[K0:s[0-9]+]], 0x41000000 -; GCN-DAG: v_mov_b32_e32 [[VK1:v[0-9]+]], [[K1]] +; GCN-DAG: v_mov_b32_e32 [[VK1:v[0-9]+]], 0x41800000 ; GCN-DAG: v_add_f32_e32 [[ADD:v[0-9]+]], 0.5, ; GCN: v_med3_f32 v{{[0-9]+}}, [[ADD]], [[K0]], [[VK1]] define amdgpu_kernel void @two_non_inline_constant_multi_use(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 { diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll --- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll @@ -97,9 +97,8 @@ } ; GCN-LABEL: {{^}}fneg_fabs_v4f16: -; GCN: s_mov_b32 [[MASK:s[0-9]+]], 0x80008000 -; GCN: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[MASK]] -; GCN: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[MASK]] +; GCN: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80008000 +; GCN: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80008000 ; GCN: {{flat|global}}_store_dwordx2 define amdgpu_kernel void @fneg_fabs_v4f16(<4 x half> addrspace(1)* %out, <4 x half> %in) { %fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %in) diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll --- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll @@ -69,10 +69,9 @@ } 
; GCN-LABEL: {{^}}fneg_fabs_v2f64: -; GCN: s_brev_b32 [[IMMREG:s[0-9]+]], 1{{$}} ; GCN-NOT: 0x80000000 -; GCN: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]] -; GCN: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]] +; GCN: s_bitset1_b32 s{{[0-9]+}}, 31 +; GCN: s_bitset1_b32 s{{[0-9]+}}, 31 define amdgpu_kernel void @fneg_fabs_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) { %fabs = call <2 x double> @llvm.fabs.v2f64(<2 x double> %in) %fsub = fsub <2 x double> , %fabs @@ -81,12 +80,11 @@ } ; GCN-LABEL: {{^}}fneg_fabs_v4f64: -; GCN: s_brev_b32 [[IMMREG:s[0-9]+]], 1{{$}} ; GCN-NOT: 0x80000000 -; GCN: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]] -; GCN: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]] -; GCN: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]] -; GCN: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]] +; GCN: s_bitset1_b32 s{{[0-9]+}}, 31 +; GCN: s_bitset1_b32 s{{[0-9]+}}, 31 +; GCN: s_bitset1_b32 s{{[0-9]+}}, 31 +; GCN: s_bitset1_b32 s{{[0-9]+}}, 31 define amdgpu_kernel void @fneg_fabs_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) { %fabs = call <4 x double> @llvm.fabs.v4f64(<4 x double> %in) %fsub = fsub <4 x double> , %fabs diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll --- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll @@ -84,9 +84,8 @@ ; R600: -PV ; FIXME: In this case two uses of the constant should be folded -; SI: s_brev_b32 [[SIGNBITK:s[0-9]+]], 1{{$}} -; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[SIGNBITK]] -; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[SIGNBITK]] +; SI: s_bitset1_b32 s{{[0-9]+}}, 31 +; SI: s_bitset1_b32 s{{[0-9]+}}, 31 define amdgpu_kernel void @fneg_fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) { %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in) %fsub = fsub <2 x float> , %fabs @@ -95,11 +94,10 @@ } ; FUNC-LABEL: {{^}}fneg_fabs_v4f32: -; SI: s_brev_b32 [[SIGNBITK:s[0-9]+]], 1{{$}} -; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 
[[SIGNBITK]] -; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[SIGNBITK]] -; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[SIGNBITK]] -; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[SIGNBITK]] +; SI: s_bitset1_b32 s{{[0-9]+}}, 31 +; SI: s_bitset1_b32 s{{[0-9]+}}, 31 +; SI: s_bitset1_b32 s{{[0-9]+}}, 31 +; SI: s_bitset1_b32 s{{[0-9]+}}, 31 define amdgpu_kernel void @fneg_fabs_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) { %fabs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %in) %fsub = fsub <4 x float> , %fabs diff --git a/llvm/test/CodeGen/AMDGPU/fneg.ll b/llvm/test/CodeGen/AMDGPU/fneg.ll --- a/llvm/test/CodeGen/AMDGPU/fneg.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg.ll @@ -18,9 +18,8 @@ ; R600: -PV ; R600: -PV -; GCN: s_brev_b32 [[SIGNBIT:s[0-9]+]], 1 -; GCN: s_xor_b32 -; GCN: s_xor_b32 +; GCN: s_xor_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x80000000 +; GCN: s_xor_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x80000000 define amdgpu_kernel void @s_fneg_v2f32(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) { %fneg = fsub <2 x float> , %in store <2 x float> %fneg, <2 x float> addrspace(1)* %out @@ -33,10 +32,10 @@ ; R600: -PV ; R600: -PV -; GCN: s_xor_b32 -; GCN: s_xor_b32 -; GCN: s_xor_b32 -; GCN: s_xor_b32 +; GCN: s_xor_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x80000000 +; GCN: s_xor_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x80000000 +; GCN: s_xor_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x80000000 +; GCN: s_xor_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x80000000 define amdgpu_kernel void @s_fneg_v4f32(<4 x float> addrspace(1)* nocapture %out, <4 x float> %in) { %fneg = fsub <4 x float> , %in store <4 x float> %fneg, <4 x float> addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/fold-immediate-operand-shrink-with-carry.mir b/llvm/test/CodeGen/AMDGPU/fold-immediate-operand-shrink-with-carry.mir --- a/llvm/test/CodeGen/AMDGPU/fold-immediate-operand-shrink-with-carry.mir +++ b/llvm/test/CodeGen/AMDGPU/fold-immediate-operand-shrink-with-carry.mir @@ -36,9 +36,10 @@ ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 12345 
; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; GCN: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; GCN: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[S_MOV_B32_]], [[DEF]], 0, implicit $exec - ; GCN: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[S_MOV_B32_]], [[DEF1]], 0, implicit $exec - ; GCN: S_ENDPGM 0, implicit [[V_ADD_CO_U32_e64_1]], implicit [[V_ADD_CO_U32_e64_2]] + ; GCN: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[S_MOV_B32_]], [[DEF]], implicit-def $vcc, implicit $exec + ; GCN: [[COPY:%[0-9]+]]:sreg_64_xexec = COPY killed $vcc + ; GCN: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[S_MOV_B32_]], [[DEF1]], implicit-def $vcc, implicit $exec + ; GCN: S_ENDPGM 0, implicit [[COPY]], implicit [[V_ADD_CO_U32_e32_1]] %0:sreg_32_xm0 = S_MOV_B32 12345 %1:vgpr_32 = IMPLICIT_DEF %2:vgpr_32 = IMPLICIT_DEF diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll --- a/llvm/test/CodeGen/AMDGPU/frem.ll +++ b/llvm/test/CodeGen/AMDGPU/frem.ll @@ -1344,16 +1344,14 @@ ; SI-NEXT: v_div_scale_f32 v4, vcc, v0, v2, v0 ; SI-NEXT: v_div_scale_f32 v5, s[4:5], v2, v2, v0 ; SI-NEXT: v_rcp_f32_e32 v6, v5 -; SI-NEXT: s_mov_b32 s6, 3 -; SI-NEXT: s_mov_b32 s7, 0 -; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; SI-NEXT: v_fma_f32 v7, -v5, v6, 1.0 ; SI-NEXT: v_fma_f32 v6, v7, v6, v6 ; SI-NEXT: v_mul_f32_e32 v7, v4, v6 ; SI-NEXT: v_fma_f32 v8, -v5, v7, v4 ; SI-NEXT: v_fma_f32 v7, v8, v6, v7 ; SI-NEXT: v_fma_f32 v4, -v5, v7, v4 -; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; SI-NEXT: v_div_fmas_f32 v4, v4, v6, v7 ; SI-NEXT: v_div_fixup_f32 v4, v4, v2, v0 ; SI-NEXT: v_trunc_f32_e32 v4, v4 @@ -1364,14 +1362,14 @@ ; SI-NEXT: v_div_scale_f32 v2, vcc, v1, v3, v1 ; SI-NEXT: v_div_scale_f32 v4, s[4:5], 
v3, v3, v1 ; SI-NEXT: v_rcp_f32_e32 v5, v4 -; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; SI-NEXT: v_fma_f32 v6, -v4, v5, 1.0 ; SI-NEXT: v_fma_f32 v5, v6, v5, v5 ; SI-NEXT: v_mul_f32_e32 v6, v2, v5 ; SI-NEXT: v_fma_f32 v7, -v4, v6, v2 ; SI-NEXT: v_fma_f32 v6, v7, v5, v6 ; SI-NEXT: v_fma_f32 v2, -v4, v6, v2 -; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; SI-NEXT: v_div_fmas_f32 v2, v2, v5, v6 ; SI-NEXT: v_div_fixup_f32 v2, v2, v3, v1 ; SI-NEXT: v_trunc_f32_e32 v2, v2 @@ -1398,8 +1396,6 @@ ; CI-NEXT: s_mov_b32 s11, s3 ; CI-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; CI-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:16 -; CI-NEXT: s_mov_b32 s6, 3 -; CI-NEXT: s_mov_b32 s7, 0 ; CI-NEXT: s_waitcnt vmcnt(1) ; CI-NEXT: v_cvt_f32_f16_e32 v1, v0 ; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -1411,14 +1407,14 @@ ; CI-NEXT: v_div_scale_f32 v5, s[4:5], v2, v2, v0 ; CI-NEXT: v_div_scale_f32 v4, vcc, v0, v2, v0 ; CI-NEXT: v_rcp_f32_e32 v6, v5 -; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; CI-NEXT: v_fma_f32 v7, -v5, v6, 1.0 ; CI-NEXT: v_fma_f32 v6, v7, v6, v6 ; CI-NEXT: v_mul_f32_e32 v7, v4, v6 ; CI-NEXT: v_fma_f32 v8, -v5, v7, v4 ; CI-NEXT: v_fma_f32 v7, v8, v6, v7 ; CI-NEXT: v_fma_f32 v4, -v5, v7, v4 -; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; CI-NEXT: v_div_fmas_f32 v4, v4, v6, v7 ; CI-NEXT: v_div_fixup_f32 v4, v4, v2, v0 ; CI-NEXT: v_trunc_f32_e32 v4, v4 @@ -1429,14 +1425,14 @@ ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; CI-NEXT: v_rcp_f32_e32 v5, v4 -; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; CI-NEXT: v_fma_f32 v6, -v4, v5, 1.0 ; CI-NEXT: v_fma_f32 v5, v6, v5, v5 ; CI-NEXT: v_mul_f32_e32 v6, v2, v5 ; 
CI-NEXT: v_fma_f32 v7, -v4, v6, v2 ; CI-NEXT: v_fma_f32 v6, v7, v5, v6 ; CI-NEXT: v_fma_f32 v2, -v4, v6, v2 -; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; CI-NEXT: v_div_fmas_f32 v2, v2, v5, v6 ; CI-NEXT: v_div_fixup_f32 v2, v2, v3, v1 ; CI-NEXT: v_trunc_f32_e32 v2, v2 @@ -1595,16 +1591,14 @@ ; SI-NEXT: v_div_scale_f32 v8, vcc, v5, v1, v5 ; SI-NEXT: v_div_scale_f32 v9, s[4:5], v1, v1, v5 ; SI-NEXT: v_rcp_f32_e32 v10, v9 -; SI-NEXT: s_mov_b32 s6, 3 -; SI-NEXT: s_mov_b32 s7, 0 -; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; SI-NEXT: v_fma_f32 v11, -v9, v10, 1.0 ; SI-NEXT: v_fma_f32 v10, v11, v10, v10 ; SI-NEXT: v_mul_f32_e32 v11, v8, v10 ; SI-NEXT: v_fma_f32 v12, -v9, v11, v8 ; SI-NEXT: v_fma_f32 v11, v12, v10, v11 ; SI-NEXT: v_fma_f32 v8, -v9, v11, v8 -; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; SI-NEXT: v_div_fmas_f32 v8, v8, v10, v11 ; SI-NEXT: v_div_fixup_f32 v8, v8, v1, v5 ; SI-NEXT: v_trunc_f32_e32 v8, v8 @@ -1615,14 +1609,14 @@ ; SI-NEXT: v_div_scale_f32 v5, vcc, v4, v7, v4 ; SI-NEXT: v_div_scale_f32 v8, s[4:5], v7, v7, v4 ; SI-NEXT: v_rcp_f32_e32 v9, v8 -; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; SI-NEXT: v_fma_f32 v10, -v8, v9, 1.0 ; SI-NEXT: v_fma_f32 v9, v10, v9, v9 ; SI-NEXT: v_mul_f32_e32 v10, v5, v9 ; SI-NEXT: v_fma_f32 v11, -v8, v10, v5 ; SI-NEXT: v_fma_f32 v10, v11, v9, v10 ; SI-NEXT: v_fma_f32 v5, -v8, v10, v5 -; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; SI-NEXT: v_div_fmas_f32 v5, v5, v9, v10 ; SI-NEXT: v_div_fixup_f32 v5, v5, v7, v4 ; SI-NEXT: v_trunc_f32_e32 v5, v5 @@ -1632,14 +1626,14 @@ ; SI-NEXT: v_div_scale_f32 v4, vcc, v3, v0, v3 ; SI-NEXT: v_div_scale_f32 v5, s[4:5], v0, v0, v3 ; SI-NEXT: v_rcp_f32_e32 
v7, v5 -; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; SI-NEXT: v_fma_f32 v8, -v5, v7, 1.0 ; SI-NEXT: v_fma_f32 v7, v8, v7, v7 ; SI-NEXT: v_mul_f32_e32 v8, v4, v7 ; SI-NEXT: v_fma_f32 v9, -v5, v8, v4 ; SI-NEXT: v_fma_f32 v8, v9, v7, v8 ; SI-NEXT: v_fma_f32 v4, -v5, v8, v4 -; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; SI-NEXT: v_div_fmas_f32 v4, v4, v7, v8 ; SI-NEXT: v_div_fixup_f32 v4, v4, v0, v3 ; SI-NEXT: v_trunc_f32_e32 v4, v4 @@ -1649,14 +1643,14 @@ ; SI-NEXT: v_div_scale_f32 v3, vcc, v2, v6, v2 ; SI-NEXT: v_div_scale_f32 v4, s[4:5], v6, v6, v2 ; SI-NEXT: v_rcp_f32_e32 v5, v4 -; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; SI-NEXT: v_fma_f32 v7, -v4, v5, 1.0 ; SI-NEXT: v_fma_f32 v5, v7, v5, v5 ; SI-NEXT: v_mul_f32_e32 v7, v3, v5 ; SI-NEXT: v_fma_f32 v8, -v4, v7, v3 ; SI-NEXT: v_fma_f32 v7, v8, v5, v7 ; SI-NEXT: v_fma_f32 v3, -v4, v7, v3 -; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; SI-NEXT: v_div_fmas_f32 v3, v3, v5, v7 ; SI-NEXT: v_div_fixup_f32 v3, v3, v6, v2 ; SI-NEXT: v_trunc_f32_e32 v3, v3 @@ -1682,8 +1676,6 @@ ; CI-NEXT: s_mov_b32 s7, s3 ; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 ; CI-NEXT: s_mov_b32 s11, s3 -; CI-NEXT: s_mov_b32 s6, 3 -; CI-NEXT: s_mov_b32 s7, 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e32 v2, v0 ; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -1702,14 +1694,14 @@ ; CI-NEXT: v_div_scale_f32 v9, s[4:5], v1, v1, v5 ; CI-NEXT: v_div_scale_f32 v8, vcc, v5, v1, v5 ; CI-NEXT: v_rcp_f32_e32 v10, v9 -; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; CI-NEXT: v_fma_f32 v11, -v9, v10, 1.0 ; CI-NEXT: v_fma_f32 v10, v11, v10, v10 ; CI-NEXT: v_mul_f32_e32 v11, v8, v10 ; CI-NEXT: v_fma_f32 v12, -v9, 
v11, v8 ; CI-NEXT: v_fma_f32 v11, v12, v10, v11 ; CI-NEXT: v_fma_f32 v8, -v9, v11, v8 -; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; CI-NEXT: v_div_fmas_f32 v8, v8, v10, v11 ; CI-NEXT: v_div_fixup_f32 v8, v8, v1, v5 ; CI-NEXT: v_trunc_f32_e32 v8, v8 @@ -1720,14 +1712,14 @@ ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; CI-NEXT: v_rcp_f32_e32 v9, v8 -; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; CI-NEXT: v_fma_f32 v10, -v8, v9, 1.0 ; CI-NEXT: v_fma_f32 v9, v10, v9, v9 ; CI-NEXT: v_mul_f32_e32 v10, v5, v9 ; CI-NEXT: v_fma_f32 v11, -v8, v10, v5 ; CI-NEXT: v_fma_f32 v10, v11, v9, v10 ; CI-NEXT: v_fma_f32 v5, -v8, v10, v5 -; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; CI-NEXT: v_div_fmas_f32 v5, v5, v9, v10 ; CI-NEXT: v_div_fixup_f32 v5, v5, v7, v4 ; CI-NEXT: v_trunc_f32_e32 v5, v5 @@ -1737,14 +1729,14 @@ ; CI-NEXT: v_or_b32_e32 v1, v4, v1 ; CI-NEXT: v_div_scale_f32 v4, vcc, v3, v0, v3 ; CI-NEXT: v_rcp_f32_e32 v7, v5 -; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; CI-NEXT: v_fma_f32 v8, -v5, v7, 1.0 ; CI-NEXT: v_fma_f32 v7, v8, v7, v7 ; CI-NEXT: v_mul_f32_e32 v8, v4, v7 ; CI-NEXT: v_fma_f32 v9, -v5, v8, v4 ; CI-NEXT: v_fma_f32 v8, v9, v7, v8 ; CI-NEXT: v_fma_f32 v4, -v5, v8, v4 -; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; CI-NEXT: v_div_fmas_f32 v4, v4, v7, v8 ; CI-NEXT: v_div_fixup_f32 v4, v4, v0, v3 ; CI-NEXT: v_trunc_f32_e32 v4, v4 @@ -1754,14 +1746,14 @@ ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; CI-NEXT: v_rcp_f32_e32 v5, v4 -; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; CI-NEXT: v_fma_f32 v7, -v4, 
v5, 1.0 ; CI-NEXT: v_fma_f32 v5, v7, v5, v5 ; CI-NEXT: v_mul_f32_e32 v7, v3, v5 ; CI-NEXT: v_fma_f32 v8, -v4, v7, v3 ; CI-NEXT: v_fma_f32 v7, v8, v5, v7 ; CI-NEXT: v_fma_f32 v3, -v4, v7, v3 -; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; CI-NEXT: v_div_fmas_f32 v3, v3, v5, v7 ; CI-NEXT: v_div_fixup_f32 v3, v3, v6, v2 ; CI-NEXT: v_trunc_f32_e32 v3, v3 @@ -1965,16 +1957,14 @@ ; SI-NEXT: v_div_scale_f32 v4, vcc, v1, v3, v1 ; SI-NEXT: v_div_scale_f32 v5, s[4:5], v3, v3, v1 ; SI-NEXT: v_rcp_f32_e32 v6, v5 -; SI-NEXT: s_mov_b32 s6, 3 -; SI-NEXT: s_mov_b32 s7, 0 -; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; SI-NEXT: v_fma_f32 v7, -v5, v6, 1.0 ; SI-NEXT: v_fma_f32 v6, v7, v6, v6 ; SI-NEXT: v_mul_f32_e32 v7, v4, v6 ; SI-NEXT: v_fma_f32 v8, -v5, v7, v4 ; SI-NEXT: v_fma_f32 v7, v8, v6, v7 ; SI-NEXT: v_fma_f32 v4, -v5, v7, v4 -; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; SI-NEXT: v_div_fmas_f32 v4, v4, v6, v7 ; SI-NEXT: v_div_fixup_f32 v4, v4, v3, v1 ; SI-NEXT: v_trunc_f32_e32 v4, v4 @@ -1982,14 +1972,14 @@ ; SI-NEXT: v_div_scale_f32 v3, vcc, v0, v2, v0 ; SI-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0 ; SI-NEXT: v_rcp_f32_e32 v5, v4 -; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; SI-NEXT: v_fma_f32 v6, -v4, v5, 1.0 ; SI-NEXT: v_fma_f32 v5, v6, v5, v5 ; SI-NEXT: v_mul_f32_e32 v6, v3, v5 ; SI-NEXT: v_fma_f32 v7, -v4, v6, v3 ; SI-NEXT: v_fma_f32 v6, v7, v5, v6 ; SI-NEXT: v_fma_f32 v3, -v4, v6, v3 -; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; SI-NEXT: v_div_fmas_f32 v3, v3, v5, v6 ; SI-NEXT: v_div_fixup_f32 v3, v3, v2, v0 ; SI-NEXT: v_trunc_f32_e32 v3, v3 @@ -2014,20 +2004,18 @@ ; CI-NEXT: s_mov_b32 s11, s3 ; CI-NEXT: buffer_load_dwordx2 v[0:1], 
off, s[4:7], 0 ; CI-NEXT: buffer_load_dwordx2 v[2:3], off, s[8:11], 0 offset:32 -; CI-NEXT: s_mov_b32 s6, 3 -; CI-NEXT: s_mov_b32 s7, 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_div_scale_f32 v5, s[4:5], v3, v3, v1 ; CI-NEXT: v_div_scale_f32 v4, vcc, v1, v3, v1 ; CI-NEXT: v_rcp_f32_e32 v6, v5 -; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; CI-NEXT: v_fma_f32 v7, -v5, v6, 1.0 ; CI-NEXT: v_fma_f32 v6, v7, v6, v6 ; CI-NEXT: v_mul_f32_e32 v7, v4, v6 ; CI-NEXT: v_fma_f32 v8, -v5, v7, v4 ; CI-NEXT: v_fma_f32 v7, v8, v6, v7 ; CI-NEXT: v_fma_f32 v4, -v5, v7, v4 -; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; CI-NEXT: v_div_fmas_f32 v4, v4, v6, v7 ; CI-NEXT: v_div_fixup_f32 v4, v4, v3, v1 ; CI-NEXT: v_trunc_f32_e32 v4, v4 @@ -2035,14 +2023,14 @@ ; CI-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0 ; CI-NEXT: v_div_scale_f32 v3, vcc, v0, v2, v0 ; CI-NEXT: v_rcp_f32_e32 v5, v4 -; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; CI-NEXT: v_fma_f32 v6, -v4, v5, 1.0 ; CI-NEXT: v_fma_f32 v5, v6, v5, v5 ; CI-NEXT: v_mul_f32_e32 v6, v3, v5 ; CI-NEXT: v_fma_f32 v7, -v4, v6, v3 ; CI-NEXT: v_fma_f32 v6, v7, v5, v6 ; CI-NEXT: v_fma_f32 v3, -v4, v6, v3 -; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; CI-NEXT: v_div_fmas_f32 v3, v3, v5, v6 ; CI-NEXT: v_div_fixup_f32 v3, v3, v2, v0 ; CI-NEXT: v_trunc_f32_e32 v3, v3 @@ -2054,8 +2042,6 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s2, 3 -; VI-NEXT: s_mov_b32 s3, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: s_add_u32 s0, s0, 32 @@ -2071,14 +2057,14 @@ ; VI-NEXT: v_div_scale_f32 v7, s[0:1], v5, v5, v3 ; VI-NEXT: v_div_scale_f32 v6, vcc, v3, v5, v3 ; VI-NEXT: 
v_rcp_f32_e32 v8, v7 -; VI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; VI-NEXT: v_fma_f32 v9, -v7, v8, 1.0 ; VI-NEXT: v_fma_f32 v8, v9, v8, v8 ; VI-NEXT: v_mul_f32_e32 v9, v6, v8 ; VI-NEXT: v_fma_f32 v10, -v7, v9, v6 ; VI-NEXT: v_fma_f32 v9, v10, v8, v9 ; VI-NEXT: v_fma_f32 v6, -v7, v9, v6 -; VI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; VI-NEXT: v_div_fmas_f32 v6, v6, v8, v9 ; VI-NEXT: v_div_fixup_f32 v6, v6, v5, v3 ; VI-NEXT: v_trunc_f32_e32 v6, v6 @@ -2086,14 +2072,14 @@ ; VI-NEXT: v_div_scale_f32 v6, s[0:1], v4, v4, v2 ; VI-NEXT: v_div_scale_f32 v5, vcc, v2, v4, v2 ; VI-NEXT: v_rcp_f32_e32 v7, v6 -; VI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; VI-NEXT: v_fma_f32 v8, -v6, v7, 1.0 ; VI-NEXT: v_fma_f32 v7, v8, v7, v7 ; VI-NEXT: v_mul_f32_e32 v8, v5, v7 ; VI-NEXT: v_fma_f32 v9, -v6, v8, v5 ; VI-NEXT: v_fma_f32 v8, v9, v7, v8 ; VI-NEXT: v_fma_f32 v5, -v6, v8, v5 -; VI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; VI-NEXT: v_div_fmas_f32 v5, v5, v7, v8 ; VI-NEXT: v_div_fixup_f32 v5, v5, v4, v2 ; VI-NEXT: v_trunc_f32_e32 v5, v5 @@ -2109,20 +2095,18 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] ; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:32 -; GFX9-NEXT: s_mov_b32 s2, 3 -; GFX9-NEXT: s_mov_b32 s3, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_div_scale_f32 v6, s[0:1], v3, v3, v1 ; GFX9-NEXT: v_div_scale_f32 v5, vcc, v1, v3, v1 ; GFX9-NEXT: v_rcp_f32_e32 v7, v6 -; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2 +; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX9-NEXT: v_fma_f32 v8, -v6, v7, 1.0 ; GFX9-NEXT: v_fma_f32 v7, v8, v7, v7 ; GFX9-NEXT: v_mul_f32_e32 v8, v5, v7 ; GFX9-NEXT: v_fma_f32 v9, -v6, v8, v5 ; GFX9-NEXT: v_fma_f32 v8, 
v9, v7, v8 ; GFX9-NEXT: v_fma_f32 v5, -v6, v8, v5 -; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3 +; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GFX9-NEXT: v_div_fmas_f32 v5, v5, v7, v8 ; GFX9-NEXT: v_div_fixup_f32 v5, v5, v3, v1 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5 @@ -2130,14 +2114,14 @@ ; GFX9-NEXT: v_div_scale_f32 v5, s[0:1], v2, v2, v0 ; GFX9-NEXT: v_div_scale_f32 v3, vcc, v0, v2, v0 ; GFX9-NEXT: v_rcp_f32_e32 v6, v5 -; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2 +; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX9-NEXT: v_fma_f32 v7, -v5, v6, 1.0 ; GFX9-NEXT: v_fma_f32 v6, v7, v6, v6 ; GFX9-NEXT: v_mul_f32_e32 v7, v3, v6 ; GFX9-NEXT: v_fma_f32 v8, -v5, v7, v3 ; GFX9-NEXT: v_fma_f32 v7, v8, v6, v7 ; GFX9-NEXT: v_fma_f32 v3, -v5, v7, v3 -; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3 +; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GFX9-NEXT: v_div_fmas_f32 v3, v3, v6, v7 ; GFX9-NEXT: v_div_fixup_f32 v3, v3, v2, v0 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3 @@ -2219,16 +2203,14 @@ ; SI-NEXT: v_div_scale_f32 v8, vcc, v3, v7, v3 ; SI-NEXT: v_div_scale_f32 v9, s[4:5], v7, v7, v3 ; SI-NEXT: v_rcp_f32_e32 v10, v9 -; SI-NEXT: s_mov_b32 s6, 3 -; SI-NEXT: s_mov_b32 s7, 0 -; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; SI-NEXT: v_fma_f32 v11, -v9, v10, 1.0 ; SI-NEXT: v_fma_f32 v10, v11, v10, v10 ; SI-NEXT: v_mul_f32_e32 v11, v8, v10 ; SI-NEXT: v_fma_f32 v12, -v9, v11, v8 ; SI-NEXT: v_fma_f32 v11, v12, v10, v11 ; SI-NEXT: v_fma_f32 v8, -v9, v11, v8 -; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; SI-NEXT: v_div_fmas_f32 v8, v8, v10, v11 ; SI-NEXT: v_div_fixup_f32 v8, v8, v7, v3 ; SI-NEXT: v_trunc_f32_e32 v8, v8 @@ -2236,14 +2218,14 @@ ; SI-NEXT: v_div_scale_f32 v7, vcc, v2, v6, v2 ; SI-NEXT: v_div_scale_f32 v8, s[4:5], v6, v6, v2 ; SI-NEXT: v_rcp_f32_e32 v9, v8 -; SI-NEXT: 
s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; SI-NEXT: v_fma_f32 v10, -v8, v9, 1.0 ; SI-NEXT: v_fma_f32 v9, v10, v9, v9 ; SI-NEXT: v_mul_f32_e32 v10, v7, v9 ; SI-NEXT: v_fma_f32 v11, -v8, v10, v7 ; SI-NEXT: v_fma_f32 v10, v11, v9, v10 ; SI-NEXT: v_fma_f32 v7, -v8, v10, v7 -; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; SI-NEXT: v_div_fmas_f32 v7, v7, v9, v10 ; SI-NEXT: v_div_fixup_f32 v7, v7, v6, v2 ; SI-NEXT: v_trunc_f32_e32 v7, v7 @@ -2251,14 +2233,14 @@ ; SI-NEXT: v_div_scale_f32 v6, vcc, v1, v5, v1 ; SI-NEXT: v_div_scale_f32 v7, s[4:5], v5, v5, v1 ; SI-NEXT: v_rcp_f32_e32 v8, v7 -; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; SI-NEXT: v_fma_f32 v9, -v7, v8, 1.0 ; SI-NEXT: v_fma_f32 v8, v9, v8, v8 ; SI-NEXT: v_mul_f32_e32 v9, v6, v8 ; SI-NEXT: v_fma_f32 v10, -v7, v9, v6 ; SI-NEXT: v_fma_f32 v9, v10, v8, v9 ; SI-NEXT: v_fma_f32 v6, -v7, v9, v6 -; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; SI-NEXT: v_div_fmas_f32 v6, v6, v8, v9 ; SI-NEXT: v_div_fixup_f32 v6, v6, v5, v1 ; SI-NEXT: v_trunc_f32_e32 v6, v6 @@ -2266,14 +2248,14 @@ ; SI-NEXT: v_div_scale_f32 v5, vcc, v0, v4, v0 ; SI-NEXT: v_div_scale_f32 v6, s[4:5], v4, v4, v0 ; SI-NEXT: v_rcp_f32_e32 v7, v6 -; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; SI-NEXT: v_fma_f32 v8, -v6, v7, 1.0 ; SI-NEXT: v_fma_f32 v7, v8, v7, v7 ; SI-NEXT: v_mul_f32_e32 v8, v5, v7 ; SI-NEXT: v_fma_f32 v9, -v6, v8, v5 ; SI-NEXT: v_fma_f32 v8, v9, v7, v8 ; SI-NEXT: v_fma_f32 v5, -v6, v8, v5 -; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; SI-NEXT: v_div_fmas_f32 v5, v5, v7, v8 ; SI-NEXT: v_div_fixup_f32 v5, v5, v4, v0 ; SI-NEXT: v_trunc_f32_e32 v5, v5 @@ 
-2298,20 +2280,18 @@ ; CI-NEXT: s_mov_b32 s11, s3 ; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 ; CI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:64 -; CI-NEXT: s_mov_b32 s6, 3 -; CI-NEXT: s_mov_b32 s7, 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_div_scale_f32 v9, s[4:5], v7, v7, v3 ; CI-NEXT: v_div_scale_f32 v8, vcc, v3, v7, v3 ; CI-NEXT: v_rcp_f32_e32 v10, v9 -; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; CI-NEXT: v_fma_f32 v11, -v9, v10, 1.0 ; CI-NEXT: v_fma_f32 v10, v11, v10, v10 ; CI-NEXT: v_mul_f32_e32 v11, v8, v10 ; CI-NEXT: v_fma_f32 v12, -v9, v11, v8 ; CI-NEXT: v_fma_f32 v11, v12, v10, v11 ; CI-NEXT: v_fma_f32 v8, -v9, v11, v8 -; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; CI-NEXT: v_div_fmas_f32 v8, v8, v10, v11 ; CI-NEXT: v_div_fixup_f32 v8, v8, v7, v3 ; CI-NEXT: v_trunc_f32_e32 v8, v8 @@ -2319,14 +2299,14 @@ ; CI-NEXT: v_div_scale_f32 v8, s[4:5], v6, v6, v2 ; CI-NEXT: v_div_scale_f32 v7, vcc, v2, v6, v2 ; CI-NEXT: v_rcp_f32_e32 v9, v8 -; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; CI-NEXT: v_fma_f32 v10, -v8, v9, 1.0 ; CI-NEXT: v_fma_f32 v9, v10, v9, v9 ; CI-NEXT: v_mul_f32_e32 v10, v7, v9 ; CI-NEXT: v_fma_f32 v11, -v8, v10, v7 ; CI-NEXT: v_fma_f32 v10, v11, v9, v10 ; CI-NEXT: v_fma_f32 v7, -v8, v10, v7 -; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; CI-NEXT: v_div_fmas_f32 v7, v7, v9, v10 ; CI-NEXT: v_div_fixup_f32 v7, v7, v6, v2 ; CI-NEXT: v_trunc_f32_e32 v7, v7 @@ -2334,14 +2314,14 @@ ; CI-NEXT: v_div_scale_f32 v7, s[4:5], v5, v5, v1 ; CI-NEXT: v_div_scale_f32 v6, vcc, v1, v5, v1 ; CI-NEXT: v_rcp_f32_e32 v8, v7 -; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; CI-NEXT: v_fma_f32 v9, -v7, v8, 1.0 ; 
CI-NEXT: v_fma_f32 v8, v9, v8, v8 ; CI-NEXT: v_mul_f32_e32 v9, v6, v8 ; CI-NEXT: v_fma_f32 v10, -v7, v9, v6 ; CI-NEXT: v_fma_f32 v9, v10, v8, v9 ; CI-NEXT: v_fma_f32 v6, -v7, v9, v6 -; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; CI-NEXT: v_div_fmas_f32 v6, v6, v8, v9 ; CI-NEXT: v_div_fixup_f32 v6, v6, v5, v1 ; CI-NEXT: v_trunc_f32_e32 v6, v6 @@ -2349,14 +2329,14 @@ ; CI-NEXT: v_div_scale_f32 v6, s[4:5], v4, v4, v0 ; CI-NEXT: v_div_scale_f32 v5, vcc, v0, v4, v0 ; CI-NEXT: v_rcp_f32_e32 v7, v6 -; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; CI-NEXT: v_fma_f32 v8, -v6, v7, 1.0 ; CI-NEXT: v_fma_f32 v7, v8, v7, v7 ; CI-NEXT: v_mul_f32_e32 v8, v5, v7 ; CI-NEXT: v_fma_f32 v9, -v6, v8, v5 ; CI-NEXT: v_fma_f32 v8, v9, v7, v8 ; CI-NEXT: v_fma_f32 v5, -v6, v8, v5 -; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; CI-NEXT: v_div_fmas_f32 v5, v5, v7, v8 ; CI-NEXT: v_div_fixup_f32 v5, v5, v4, v0 ; CI-NEXT: v_trunc_f32_e32 v5, v5 @@ -2368,8 +2348,6 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s2, 3 -; VI-NEXT: s_mov_b32 s3, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_add_u32 s0, s0, 64 @@ -2385,14 +2363,14 @@ ; VI-NEXT: v_div_scale_f32 v11, s[0:1], v7, v7, v3 ; VI-NEXT: v_div_scale_f32 v10, vcc, v3, v7, v3 ; VI-NEXT: v_rcp_f32_e32 v12, v11 -; VI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; VI-NEXT: v_fma_f32 v13, -v11, v12, 1.0 ; VI-NEXT: v_fma_f32 v12, v13, v12, v12 ; VI-NEXT: v_mul_f32_e32 v13, v10, v12 ; VI-NEXT: v_fma_f32 v14, -v11, v13, v10 ; VI-NEXT: v_fma_f32 v13, v14, v12, v13 ; VI-NEXT: v_fma_f32 v10, -v11, v13, v10 -; VI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3 +; VI-NEXT: 
s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; VI-NEXT: v_div_fmas_f32 v10, v10, v12, v13 ; VI-NEXT: v_div_fixup_f32 v10, v10, v7, v3 ; VI-NEXT: v_trunc_f32_e32 v10, v10 @@ -2400,14 +2378,14 @@ ; VI-NEXT: v_div_scale_f32 v10, s[0:1], v6, v6, v2 ; VI-NEXT: v_div_scale_f32 v7, vcc, v2, v6, v2 ; VI-NEXT: v_rcp_f32_e32 v11, v10 -; VI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; VI-NEXT: v_fma_f32 v12, -v10, v11, 1.0 ; VI-NEXT: v_fma_f32 v11, v12, v11, v11 ; VI-NEXT: v_mul_f32_e32 v12, v7, v11 ; VI-NEXT: v_fma_f32 v13, -v10, v12, v7 ; VI-NEXT: v_fma_f32 v12, v13, v11, v12 ; VI-NEXT: v_fma_f32 v7, -v10, v12, v7 -; VI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; VI-NEXT: v_div_fmas_f32 v7, v7, v11, v12 ; VI-NEXT: v_div_fixup_f32 v7, v7, v6, v2 ; VI-NEXT: v_trunc_f32_e32 v7, v7 @@ -2415,14 +2393,14 @@ ; VI-NEXT: v_div_scale_f32 v7, s[0:1], v5, v5, v1 ; VI-NEXT: v_div_scale_f32 v6, vcc, v1, v5, v1 ; VI-NEXT: v_rcp_f32_e32 v10, v7 -; VI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; VI-NEXT: v_fma_f32 v11, -v7, v10, 1.0 ; VI-NEXT: v_fma_f32 v10, v11, v10, v10 ; VI-NEXT: v_mul_f32_e32 v11, v6, v10 ; VI-NEXT: v_fma_f32 v12, -v7, v11, v6 ; VI-NEXT: v_fma_f32 v11, v12, v10, v11 ; VI-NEXT: v_fma_f32 v6, -v7, v11, v6 -; VI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; VI-NEXT: v_div_fmas_f32 v6, v6, v10, v11 ; VI-NEXT: v_div_fixup_f32 v6, v6, v5, v1 ; VI-NEXT: v_trunc_f32_e32 v6, v6 @@ -2430,14 +2408,14 @@ ; VI-NEXT: v_div_scale_f32 v6, s[0:1], v4, v4, v0 ; VI-NEXT: v_div_scale_f32 v5, vcc, v0, v4, v0 ; VI-NEXT: v_rcp_f32_e32 v7, v6 -; VI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; VI-NEXT: v_fma_f32 v10, -v6, v7, 1.0 ; VI-NEXT: v_fma_f32 v7, v10, v7, v7 ; VI-NEXT: 
v_mul_f32_e32 v10, v5, v7 ; VI-NEXT: v_fma_f32 v11, -v6, v10, v5 ; VI-NEXT: v_fma_f32 v10, v11, v7, v10 ; VI-NEXT: v_fma_f32 v5, -v6, v10, v5 -; VI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; VI-NEXT: v_div_fmas_f32 v5, v5, v7, v10 ; VI-NEXT: v_div_fixup_f32 v5, v5, v4, v0 ; VI-NEXT: v_trunc_f32_e32 v5, v5 @@ -2453,20 +2431,18 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[6:7] ; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:64 -; GFX9-NEXT: s_mov_b32 s2, 3 -; GFX9-NEXT: s_mov_b32 s3, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_div_scale_f32 v10, s[0:1], v7, v7, v3 ; GFX9-NEXT: v_div_scale_f32 v9, vcc, v3, v7, v3 ; GFX9-NEXT: v_rcp_f32_e32 v11, v10 -; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2 +; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX9-NEXT: v_fma_f32 v12, -v10, v11, 1.0 ; GFX9-NEXT: v_fma_f32 v11, v12, v11, v11 ; GFX9-NEXT: v_mul_f32_e32 v12, v9, v11 ; GFX9-NEXT: v_fma_f32 v13, -v10, v12, v9 ; GFX9-NEXT: v_fma_f32 v12, v13, v11, v12 ; GFX9-NEXT: v_fma_f32 v9, -v10, v12, v9 -; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3 +; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GFX9-NEXT: v_div_fmas_f32 v9, v9, v11, v12 ; GFX9-NEXT: v_div_fixup_f32 v9, v9, v7, v3 ; GFX9-NEXT: v_trunc_f32_e32 v9, v9 @@ -2474,14 +2450,14 @@ ; GFX9-NEXT: v_div_scale_f32 v9, s[0:1], v6, v6, v2 ; GFX9-NEXT: v_div_scale_f32 v7, vcc, v2, v6, v2 ; GFX9-NEXT: v_rcp_f32_e32 v10, v9 -; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2 +; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX9-NEXT: v_fma_f32 v11, -v9, v10, 1.0 ; GFX9-NEXT: v_fma_f32 v10, v11, v10, v10 ; GFX9-NEXT: v_mul_f32_e32 v11, v7, v10 ; GFX9-NEXT: v_fma_f32 v12, -v9, v11, v7 ; GFX9-NEXT: v_fma_f32 v11, v12, v10, v11 ; GFX9-NEXT: v_fma_f32 v7, -v9, v11, v7 -; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3 +; GFX9-NEXT: s_setreg_imm32_b32 
hwreg(HW_REG_MODE, 4, 2), 0 ; GFX9-NEXT: v_div_fmas_f32 v7, v7, v10, v11 ; GFX9-NEXT: v_div_fixup_f32 v7, v7, v6, v2 ; GFX9-NEXT: v_trunc_f32_e32 v7, v7 @@ -2489,14 +2465,14 @@ ; GFX9-NEXT: v_div_scale_f32 v7, s[0:1], v5, v5, v1 ; GFX9-NEXT: v_div_scale_f32 v6, vcc, v1, v5, v1 ; GFX9-NEXT: v_rcp_f32_e32 v9, v7 -; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2 +; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX9-NEXT: v_fma_f32 v10, -v7, v9, 1.0 ; GFX9-NEXT: v_fma_f32 v9, v10, v9, v9 ; GFX9-NEXT: v_mul_f32_e32 v10, v6, v9 ; GFX9-NEXT: v_fma_f32 v11, -v7, v10, v6 ; GFX9-NEXT: v_fma_f32 v10, v11, v9, v10 ; GFX9-NEXT: v_fma_f32 v6, -v7, v10, v6 -; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3 +; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GFX9-NEXT: v_div_fmas_f32 v6, v6, v9, v10 ; GFX9-NEXT: v_div_fixup_f32 v6, v6, v5, v1 ; GFX9-NEXT: v_trunc_f32_e32 v6, v6 @@ -2504,14 +2480,14 @@ ; GFX9-NEXT: v_div_scale_f32 v6, s[0:1], v4, v4, v0 ; GFX9-NEXT: v_div_scale_f32 v5, vcc, v0, v4, v0 ; GFX9-NEXT: v_rcp_f32_e32 v7, v6 -; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2 +; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX9-NEXT: v_fma_f32 v9, -v6, v7, 1.0 ; GFX9-NEXT: v_fma_f32 v7, v9, v7, v7 ; GFX9-NEXT: v_mul_f32_e32 v9, v5, v7 ; GFX9-NEXT: v_fma_f32 v10, -v6, v9, v5 ; GFX9-NEXT: v_fma_f32 v9, v10, v7, v9 ; GFX9-NEXT: v_fma_f32 v5, -v6, v9, v5 -; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3 +; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GFX9-NEXT: v_div_fmas_f32 v5, v5, v7, v9 ; GFX9-NEXT: v_div_fixup_f32 v5, v5, v4, v0 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5 @@ -2636,16 +2612,15 @@ ; SI-NEXT: v_div_fmas_f64 v[8:9], v[16:17], v[10:11], v[14:15] ; SI-NEXT: v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3] ; SI-NEXT: v_bfe_u32 v10, v9, 20, 11 -; SI-NEXT: s_movk_i32 s8, 0xfc01 -; SI-NEXT: v_add_i32_e32 v12, vcc, s8, v10 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0xfffffc01, v10 ; SI-NEXT: s_mov_b32 s3, 
0xfffff ; SI-NEXT: v_lshr_b64 v[10:11], s[2:3], v12 ; SI-NEXT: v_not_b32_e32 v10, v10 ; SI-NEXT: v_and_b32_e32 v10, v8, v10 ; SI-NEXT: v_not_b32_e32 v11, v11 ; SI-NEXT: v_and_b32_e32 v11, v9, v11 -; SI-NEXT: s_brev_b32 s9, 1 -; SI-NEXT: v_and_b32_e32 v13, s9, v9 +; SI-NEXT: s_brev_b32 s8, 1 +; SI-NEXT: v_and_b32_e32 v13, s8, v9 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v12 ; SI-NEXT: v_cndmask_b32_e32 v11, v11, v13, vcc ; SI-NEXT: v_cmp_lt_i32_e64 s[0:1], 51, v12 @@ -2669,13 +2644,13 @@ ; SI-NEXT: v_div_fmas_f64 v[6:7], v[14:15], v[8:9], v[12:13] ; SI-NEXT: v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1] ; SI-NEXT: v_bfe_u32 v8, v7, 20, 11 -; SI-NEXT: v_add_i32_e32 v10, vcc, s8, v8 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0xfffffc01, v8 ; SI-NEXT: v_lshr_b64 v[8:9], s[2:3], v10 ; SI-NEXT: v_not_b32_e32 v8, v8 ; SI-NEXT: v_and_b32_e32 v8, v6, v8 ; SI-NEXT: v_not_b32_e32 v9, v9 ; SI-NEXT: v_and_b32_e32 v9, v7, v9 -; SI-NEXT: v_and_b32_e32 v11, s9, v7 +; SI-NEXT: v_and_b32_e32 v11, s8, v7 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v10 ; SI-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc ; SI-NEXT: v_cmp_lt_i32_e64 s[0:1], 51, v10 diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll --- a/llvm/test/CodeGen/AMDGPU/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/fshr.ll @@ -732,10 +732,9 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_xor_b32_e32 v3, -1, v2 -; GFX10-NEXT: s_mov_b32 s4, 0xf000f ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1] -; GFX10-NEXT: v_and_b32_e32 v2, s4, v2 -; GFX10-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX10-NEXT: v_and_b32_e32 v2, 0xf000f, v2 +; GFX10-NEXT: v_and_b32_e32 v3, 0xf000f, v3 ; GFX10-NEXT: v_pk_lshrrev_b16 v1, v2, v1 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, v3, v0 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 @@ -945,37 +944,36 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; 
GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v3 ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v3 -; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v4 +; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v4 +; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v0 ; GFX10-NEXT: v_lshlrev_b16 v1, 1, v1 -; GFX10-NEXT: v_lshlrev_b16 v6, 1, v6 -; GFX10-NEXT: v_xor_b32_e32 v9, -1, v7 -; GFX10-NEXT: v_lshrrev_b16 v7, v7, v8 -; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; GFX10-NEXT: v_xor_b32_e32 v11, -1, v5 ; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0 -; GFX10-NEXT: v_xor_b32_e32 v12, -1, v5 -; GFX10-NEXT: v_lshlrev_b16 v6, v9, v6 -; GFX10-NEXT: v_xor_b32_e32 v9, -1, v4 -; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; GFX10-NEXT: v_xor_b32_e32 v12, -1, v4 +; GFX10-NEXT: v_lshrrev_b16 v6, v7, v6 ; GFX10-NEXT: v_lshlrev_b16 v8, 1, v8 -; GFX10-NEXT: v_xor_b32_e32 v13, -1, v11 +; GFX10-NEXT: v_xor_b32_e32 v7, -1, v7 +; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v2 +; GFX10-NEXT: v_lshlrev_b16 v10, 1, v10 +; GFX10-NEXT: v_xor_b32_e32 v14, -1, v9 +; GFX10-NEXT: v_lshlrev_b16 v1, v11, v1 +; GFX10-NEXT: v_lshlrev_b16 v0, v12, v0 ; GFX10-NEXT: v_lshrrev_b16 v2, v4, v2 -; GFX10-NEXT: v_lshlrev_b16 v0, v9, v0 -; GFX10-NEXT: v_lshlrev_b16 v1, v12, v1 ; GFX10-NEXT: v_lshrrev_b16 v3, v5, v3 -; GFX10-NEXT: v_lshrrev_b16 v4, v11, v10 -; GFX10-NEXT: v_lshlrev_b16 v5, v13, v8 +; GFX10-NEXT: v_lshlrev_b16 v4, v7, v8 +; GFX10-NEXT: v_lshrrev_b16 v5, v9, v13 +; GFX10-NEXT: v_lshlrev_b16 v7, v14, v10 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff ; GFX10-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX10-NEXT: v_or_b32_e32 v3, v6, v7 -; GFX10-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX10-NEXT: v_and_b32_e32 v0, v2, v0 -; GFX10-NEXT: v_and_b32_e32 v1, v2, v1 -; GFX10-NEXT: v_lshl_or_b32 v0, v4, 16, v0 -; GFX10-NEXT: v_lshl_or_b32 v1, v3, 16, v1 +; GFX10-NEXT: v_or_b32_e32 v2, v4, v6 +; GFX10-NEXT: v_or_b32_e32 v3, v7, v5 +; GFX10-NEXT: v_and_b32_e32 v0, 
0xffff, v0 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10-NEXT: v_lshl_or_b32 v0, v3, 16, v0 +; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] %ret = call <4 x i16> @llvm.fshr.v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2) ret <4 x i16> %ret @@ -1244,10 +1242,9 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_mov_b32 s4, 0xaaaaaaab +; GFX10-NEXT: v_mul_hi_u32 v6, 0xaaaaaaab, v4 +; GFX10-NEXT: v_mul_hi_u32 v7, 0xaaaaaaab, v5 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GFX10-NEXT: v_mul_hi_u32 v6, v4, s4 -; GFX10-NEXT: v_mul_hi_u32 v7, v5, s4 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 4, v6 ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 4, v7 diff --git a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll --- a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll +++ b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll @@ -406,18 +406,17 @@ ; GFX9-LABEL: udiv16_invariant_denom: ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX9-NEXT: s_mov_b32 s4, 0xffff ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_movk_i32 s5, 0x400 +; GFX9-NEXT: s_movk_i32 s4, 0x400 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s2, s4, s2 +; GFX9-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s2 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v2 ; GFX9-NEXT: .LBB4_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_and_b32_e32 v0, s4, v4 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 ; GFX9-NEXT: v_cvt_f32_u32_e32 v8, v0 ; GFX9-NEXT: v_lshlrev_b64 v[5:6], 1, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -429,7 +428,7 @@ ; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v0 ; GFX9-NEXT: v_add_u16_e32 v4, 1, v4 ; GFX9-NEXT: v_mad_f32 v0, -v0, v2, v8 -; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s5, v4 +; GFX9-NEXT: v_cmp_eq_u16_e32 
vcc, s4, v4 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v0|, v2 ; GFX9-NEXT: s_and_b64 vcc, exec, vcc ; GFX9-NEXT: v_addc_co_u32_e64 v0, s[0:1], 0, v7, s[0:1] @@ -443,16 +442,15 @@ ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX10-NEXT: s_mov_b32 s1, 0xffff ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_and_b32 s0, s1, s4 +; GFX10-NEXT: s_and_b32 s0, 0xffff, s4 ; GFX10-NEXT: v_cvt_f32_u32_e32 v2, s0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v3, v2 ; GFX10-NEXT: .LBB4_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_and_b32_e32 v0, s1, v4 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v4 ; GFX10-NEXT: v_add_nc_u16 v4, v4, 1 ; GFX10-NEXT: v_cvt_f32_u32_e32 v7, v0 ; GFX10-NEXT: v_lshlrev_b64 v[5:6], 1, v[0:1] @@ -492,17 +490,16 @@ ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s6, 0xffff ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_movk_i32 s8, 0x400 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s7, s6, s2 -; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s7 +; GFX9-NEXT: s_movk_i32 s7, 0x400 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_and_b32 s6, 0xffff, s2 +; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s6 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v2 ; GFX9-NEXT: .LBB5_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_and_b32_e32 v0, s6, v4 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 ; GFX9-NEXT: v_cvt_f32_u32_e32 v8, v0 ; GFX9-NEXT: v_lshlrev_b64 v[5:6], 1, v[0:1] ; GFX9-NEXT: v_add_u16_e32 v4, 1, v4 @@ -512,9 +509,9 @@ ; GFX9-NEXT: v_cvt_u32_f32_e32 v10, v9 ; GFX9-NEXT: v_mad_f32 v8, -v9, v2, v8 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v8|, v2 -; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s8, v4 +; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s7, v4 ; GFX9-NEXT: v_addc_co_u32_e64 v8, 
s[2:3], 0, v10, s[2:3] -; GFX9-NEXT: v_mul_lo_u32 v8, v8, s7 +; GFX9-NEXT: v_mul_lo_u32 v8, v8, s6 ; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], s4, v5 ; GFX9-NEXT: s_and_b64 vcc, exec, vcc ; GFX9-NEXT: v_addc_co_u32_e64 v6, s[0:1], v7, v6, s[0:1] @@ -529,16 +526,15 @@ ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX10-NEXT: s_mov_b32 s1, 0xffff ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_and_b32 s4, s1, s4 -; GFX10-NEXT: v_cvt_f32_u32_e32 v2, s4 +; GFX10-NEXT: s_and_b32 s1, 0xffff, s4 +; GFX10-NEXT: v_cvt_f32_u32_e32 v2, s1 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v3, v2 ; GFX10-NEXT: .LBB5_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_and_b32_e32 v0, s1, v4 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v4 ; GFX10-NEXT: v_add_nc_u16 v4, v4, 1 ; GFX10-NEXT: v_cvt_f32_u32_e32 v7, v0 ; GFX10-NEXT: v_lshlrev_b64 v[5:6], 1, v[0:1] @@ -551,7 +547,7 @@ ; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v7|, v2 ; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v8, vcc_lo ; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v4 -; GFX10-NEXT: v_mul_lo_u32 v7, v7, s4 +; GFX10-NEXT: v_mul_lo_u32 v7, v7, s1 ; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, vcc_lo ; GFX10-NEXT: v_sub_nc_u32_e32 v0, v0, v7 ; GFX10-NEXT: global_store_short v[5:6], v0, off diff --git a/llvm/test/CodeGen/AMDGPU/idot2.ll b/llvm/test/CodeGen/AMDGPU/idot2.ll --- a/llvm/test/CodeGen/AMDGPU/idot2.ll +++ b/llvm/test/CodeGen/AMDGPU/idot2.ll @@ -2104,16 +2104,15 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: s_mov_b32 s3, 0xffff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; 
GFX10-DL-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-NEXT: v_and_b32_e32 v0, s3, v1 +; GFX10-DL-NEXT: v_and_b32_e32 v0, 0xffff, v1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_and_b32_e32 v3, s3, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v3, 0xffff, v2 ; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX10-DL-NEXT: v_mul_u32_u24_e32 v2, v3, v0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll --- a/llvm/test/CodeGen/AMDGPU/idot4u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll @@ -213,6 +213,7 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX8-NEXT: v_mov_b32_e32 v5, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 @@ -226,7 +227,6 @@ ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: flat_load_ushort v1, v[2:3] ; GFX8-NEXT: s_movk_i32 s0, 0xff -; GFX8-NEXT: v_mov_b32_e32 v5, s0 ; GFX8-NEXT: s_waitcnt vmcnt(2) ; GFX8-NEXT: v_and_b32_e32 v6, s0, v4 ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v4 @@ -315,7 +315,7 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NEXT: s_movk_i32 s0, 0xff +; GFX10-DL-NEXT: v_mov_b32_e32 v8, 0xff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -323,17 +323,17 @@ ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-NEXT: global_load_ushort v3, v0, s[2:3] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) -; GFX10-DL-NEXT: v_and_b32_e32 v4, s0, v1 +; GFX10-DL-NEXT: v_and_b32_e32 v4, 0xff, v1 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v7, s0, v2 -; GFX10-DL-NEXT: 
v_and_b32_e32 v5, s0, v5 -; GFX10-DL-NEXT: v_and_b32_e32 v6, s0, v6 +; GFX10-DL-NEXT: v_and_b32_e32 v7, 0xff, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX10-DL-NEXT: v_and_b32_e32 v6, 0xff, v6 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_mad_u16 v3, v4, v7, v3 -; GFX10-DL-NEXT: v_and_b32_sdwa v4, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-DL-NEXT: v_and_b32_sdwa v7, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-DL-NEXT: v_and_b32_sdwa v4, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-DL-NEXT: v_and_b32_sdwa v7, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 24, v1 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 24, v2 ; GFX10-DL-NEXT: v_mad_u16 v3, v5, v6, v3 @@ -1205,16 +1205,15 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: s_movk_i32 s3, 0xff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-NEXT: v_and_b32_e32 v0, s3, v1 +; GFX10-DL-NEXT: v_and_b32_e32 v0, 0xff, v1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_and_b32_e32 v3, s3, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v3, 0xff, v2 ; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 ; GFX10-DL-NEXT: v_mul_u32_u24_e32 v5, v0, v3 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) @@ -1509,7 +1508,7 @@ ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_movk_i32 s2, 0xff -; GFX8-NEXT: v_mov_b32_e32 v5, s2 +; GFX8-NEXT: v_mov_b32_e32 v5, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: 
v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 @@ -1610,7 +1609,7 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NEXT: s_movk_i32 s0, 0xff +; GFX10-DL-NEXT: v_mov_b32_e32 v7, 0xff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -1622,16 +1621,16 @@ ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v2 ; GFX10-DL-NEXT: v_bfe_i32 v6, v1, 0, 8 -; GFX10-DL-NEXT: v_bfe_i32 v7, v2, 0, 8 -; GFX10-DL-NEXT: v_and_b32_e32 v4, s0, v4 -; GFX10-DL-NEXT: v_and_b32_e32 v5, s0, v5 +; GFX10-DL-NEXT: v_bfe_i32 v8, v2, 0, 8 +; GFX10-DL-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX10-DL-NEXT: v_and_b32_e32 v5, 0xff, v5 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_mad_u16 v3, v4, v5, v3 -; GFX10-DL-NEXT: v_and_b32_sdwa v4, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-DL-NEXT: v_and_b32_sdwa v5, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-DL-NEXT: v_and_b32_sdwa v4, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-DL-NEXT: v_and_b32_sdwa v5, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 24, v1 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 24, v2 -; GFX10-DL-NEXT: v_mad_u16 v3, v6, v7, v3 +; GFX10-DL-NEXT: v_mad_u16 v3, v6, v8, v3 ; GFX10-DL-NEXT: v_mad_u16 v3, v4, v5, v3 ; GFX10-DL-NEXT: v_mad_u16 v1, v1, v2, v3 ; GFX10-DL-NEXT: global_store_short v0, v1, s[2:3] @@ -1799,18 +1798,18 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: s_mov_b32 s3, 0xffff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, 
s[4:5] ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] +; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-NEXT: v_and_b32_sdwa v0, s3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-DL-NEXT: v_and_b32_sdwa v3, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_and_b32_sdwa v3, s3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-DL-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 -; GFX10-DL-NEXT: v_mul_u32_u24_e32 v0, v0, v3 +; GFX10-DL-NEXT: v_mul_u32_u24_e32 v0, v3, v0 ; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 ; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 @@ -1899,7 +1898,7 @@ ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_movk_i32 s2, 0xff -; GFX8-NEXT: v_mov_b32_e32 v5, s2 +; GFX8-NEXT: v_mov_b32_e32 v5, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 @@ -2012,7 +2011,7 @@ ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: v_mov_b32_e32 v4, 0xffff -; GFX10-DL-NEXT: s_movk_i32 s2, 0xff +; GFX10-DL-NEXT: v_mov_b32_e32 v5, 0xff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -2020,27 +2019,27 @@ ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-NEXT: global_load_ushort v3, v0, s[0:1] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) -; GFX10-DL-NEXT: 
v_lshrrev_b16 v5, 8, v1 +; GFX10-DL-NEXT: v_lshrrev_b16 v6, 8, v1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-NEXT: v_lshrrev_b16 v6, 8, v2 -; GFX10-DL-NEXT: v_and_b32_sdwa v7, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-DL-NEXT: v_and_b32_sdwa v8, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-DL-NEXT: v_and_b32_sdwa v9, v1, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-DL-NEXT: v_and_b32_sdwa v10, v2, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-DL-NEXT: v_lshrrev_b16 v7, 8, v2 +; GFX10-DL-NEXT: v_and_b32_sdwa v8, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-DL-NEXT: v_and_b32_sdwa v4, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-DL-NEXT: v_and_b32_sdwa v9, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-DL-NEXT: v_and_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 24, v1 -; GFX10-DL-NEXT: v_lshl_or_b32 v6, v6, 16, v7 -; GFX10-DL-NEXT: v_lshl_or_b32 v5, v5, 16, v8 +; GFX10-DL-NEXT: v_lshl_or_b32 v7, v7, 16, v8 +; GFX10-DL-NEXT: v_lshl_or_b32 v4, v6, 16, v4 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 24, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v7, v4, v10 -; GFX10-DL-NEXT: v_and_b32_e32 v4, v4, v9 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v5, v5, v6 -; GFX10-DL-NEXT: v_lshl_or_b32 v2, v2, 16, v7 -; GFX10-DL-NEXT: v_lshl_or_b32 v1, v1, 16, v4 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 16, v5 +; GFX10-DL-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX10-DL-NEXT: v_and_b32_e32 v6, 0xffff, v9 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v7 +; GFX10-DL-NEXT: v_lshl_or_b32 v2, v2, 16, v5 +; GFX10-DL-NEXT: v_lshl_or_b32 v1, v1, 16, v6 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_add_nc_u16 v3, v5, v3 +; 
GFX10-DL-NEXT: v_add_nc_u16 v3, v4, v3 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 -; GFX10-DL-NEXT: v_add_nc_u16 v2, v3, v4 +; GFX10-DL-NEXT: v_add_nc_u16 v2, v3, v5 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX10-DL-NEXT: v_add_nc_u16 v1, v2, v1 ; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v3 diff --git a/llvm/test/CodeGen/AMDGPU/idot8s.ll b/llvm/test/CodeGen/AMDGPU/idot8s.ll --- a/llvm/test/CodeGen/AMDGPU/idot8s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8s.ll @@ -2515,7 +2515,6 @@ ; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10-DL-XNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s10, -1 @@ -2529,66 +2528,66 @@ ; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-XNACK-NEXT: global_load_ushort v3, v0, s[0:1] ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(2) -; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v11, 15, v1 +; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v10, 15, v1 ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v13, 15, v2 -; GFX10-DL-XNACK-NEXT: v_bfe_u32 v5, v1, 24, 4 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v6, 28, v1 -; GFX10-DL-XNACK-NEXT: v_bfe_u32 v7, v1, 16, 4 -; GFX10-DL-XNACK-NEXT: v_bfe_u32 v8, v1, 20, 4 -; GFX10-DL-XNACK-NEXT: v_bfe_u32 v9, v1, 8, 4 -; GFX10-DL-XNACK-NEXT: v_bfe_u32 v10, v1, 12, 4 +; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v12, 15, v2 +; GFX10-DL-XNACK-NEXT: v_bfe_u32 v4, v1, 24, 4 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v5, 28, v1 +; GFX10-DL-XNACK-NEXT: v_bfe_u32 v6, v1, 16, 4 +; GFX10-DL-XNACK-NEXT: v_bfe_u32 v7, v1, 20, 4 +; GFX10-DL-XNACK-NEXT: v_bfe_u32 v8, v1, 8, 4 +; GFX10-DL-XNACK-NEXT: v_bfe_u32 v9, v1, 12, 4 ; GFX10-DL-XNACK-NEXT: v_bfe_u32 v1, v1, 4, 4 -; GFX10-DL-XNACK-NEXT: v_bfe_u32 v16, v2, 4, 4 -; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v11, v4, v11 -; 
GFX10-DL-XNACK-NEXT: v_and_b32_e32 v13, v4, v13 -; GFX10-DL-XNACK-NEXT: v_bfe_u32 v18, v2, 8, 4 -; GFX10-DL-XNACK-NEXT: v_bfe_u32 v12, v2, 24, 4 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v14, 28, v2 -; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v1, v1, 16, v11 -; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v11, v16, 16, v13 -; GFX10-DL-XNACK-NEXT: v_bfe_u32 v15, v2, 16, 4 -; GFX10-DL-XNACK-NEXT: v_bfe_u32 v17, v2, 20, 4 +; GFX10-DL-XNACK-NEXT: v_bfe_u32 v15, v2, 4, 4 +; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX10-DL-XNACK-NEXT: v_bfe_u32 v17, v2, 8, 4 +; GFX10-DL-XNACK-NEXT: v_bfe_u32 v11, v2, 24, 4 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v13, 28, v2 +; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v1, v1, 16, v10 +; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v10, v15, 16, v12 +; GFX10-DL-XNACK-NEXT: v_bfe_u32 v14, v2, 16, 4 +; GFX10-DL-XNACK-NEXT: v_bfe_u32 v16, v2, 20, 4 ; GFX10-DL-XNACK-NEXT: v_bfe_u32 v2, v2, 12, 4 -; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v9, v4, v9 -; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v7, v4, v7 -; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v13, v4, v18 +; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v12, 0xffff, v17 ; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v1, 12, v1 op_sel_hi:[0,1] -; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v11, 12, v11 op_sel_hi:[0,1] -; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v9, v10, 16, v9 -; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v7, v8, 16, v7 -; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v2, v2, 16, v13 +; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v10, 12, v10 op_sel_hi:[0,1] +; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v8, v9, 16, v8 +; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v6, v7, 16, v6 +; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v2, v2, 16, v12 ; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1] -; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v8, 12, v11 op_sel_hi:[0,1] -; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v10, v4, v15 -; 
GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v9, 12, v9 op_sel_hi:[0,1] +; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v7, 12, v10 op_sel_hi:[0,1] +; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v9, 0xffff, v14 +; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v8, 12, v8 op_sel_hi:[0,1] ; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v2, 12, v2 op_sel_hi:[0,1] -; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v7, 12, v7 op_sel_hi:[0,1] -; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v1, v1, v8 -; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v8, v17, 16, v10 -; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v9, 12, v9 op_sel_hi:[0,1] +; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v6, 12, v6 op_sel_hi:[0,1] +; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v1, v1, v7 +; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v7, v16, 16, v9 +; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v8, 12, v8 op_sel_hi:[0,1] ; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1] -; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1] -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v10, 16, v1 +; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v6, 12, v6 op_sel_hi:[0,1] +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v9, 16, v1 ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v1, v3 -; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v3, v4, v5 -; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v5, 12, v8 op_sel_hi:[0,1] -; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v4, v4, v12 -; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v2, v9, v2 -; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v1, v10 -; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v3, v6, 16, v3 -; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1] -; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v4, v14, 16, v4 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v3, 0xffff, v4 +; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v4, 12, v7 op_sel_hi:[0,1] +; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v7, 0xffff, v11 +; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v2, v8, v2 +; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v1, v9 +; GFX10-DL-XNACK-NEXT: 
v_lshl_or_b32 v3, v5, 16, v3 +; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1] +; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v5, v13, 16, v7 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v7, 16, v2 ; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v1, v2 ; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v2, 12, v3 op_sel_hi:[0,1] -; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v3, 12, v4 op_sel_hi:[0,1] -; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v4, v7, v5 -; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v1, v6 +; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v4, v6, v4 +; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v3, 12, v5 op_sel_hi:[0,1] +; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v1, v7 ; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1] -; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1] ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1] ; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v1, v4 ; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v2, v2, v3 ; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v1, v5 @@ -2604,7 +2603,6 @@ ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s10, -1 @@ -2617,66 +2615,66 @@ ; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-DL-NOXNACK-NEXT: global_load_ushort v3, v2, s[0:1] ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(2) -; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v11, 15, v1 +; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v10, 15, v1 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v13, 15, v0 -; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v5, v1, 24, 4 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v6, 28, v1 -; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v7, v1, 16, 4 -; 
GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v8, v1, 20, 4 -; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v9, v1, 8, 4 -; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v10, v1, 12, 4 +; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v12, 15, v0 +; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v4, v1, 24, 4 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v5, 28, v1 +; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v6, v1, 16, 4 +; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v7, v1, 20, 4 +; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v8, v1, 8, 4 +; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v9, v1, 12, 4 ; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v1, v1, 4, 4 -; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v16, v0, 4, 4 -; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v11, v4, v11 -; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v13, v4, v13 -; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v18, v0, 8, 4 -; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v12, v0, 24, 4 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v14, 28, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v1, v1, 16, v11 -; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v11, v16, 16, v13 -; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v15, v0, 16, 4 -; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v17, v0, 20, 4 +; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v15, v0, 4, 4 +; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v17, v0, 8, 4 +; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v11, v0, 24, 4 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v13, 28, v0 +; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v1, v1, 16, v10 +; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v10, v15, 16, v12 +; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v14, v0, 16, 4 +; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v16, v0, 20, 4 ; GFX10-DL-NOXNACK-NEXT: v_bfe_u32 v0, v0, 12, 4 -; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v9, v4, v9 -; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v7, v4, v7 -; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v13, v4, v18 +; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v12, 0xffff, v17 ; 
GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v1, 12, v1 op_sel_hi:[0,1] -; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v11, 12, v11 op_sel_hi:[0,1] -; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v9, v10, 16, v9 -; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v7, v8, 16, v7 -; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v0, v0, 16, v13 +; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v10, 12, v10 op_sel_hi:[0,1] +; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v8, v9, 16, v8 +; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v6, v7, 16, v6 +; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v0, v0, 16, v12 ; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1] -; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v8, 12, v11 op_sel_hi:[0,1] -; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v10, v4, v15 -; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v9, 12, v9 op_sel_hi:[0,1] +; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v7, 12, v10 op_sel_hi:[0,1] +; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v9, 0xffff, v14 +; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v8, 12, v8 op_sel_hi:[0,1] ; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v0, 12, v0 op_sel_hi:[0,1] -; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v7, 12, v7 op_sel_hi:[0,1] -; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v1, v1, v8 -; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v8, v17, 16, v10 -; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v9, 12, v9 op_sel_hi:[0,1] +; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v6, 12, v6 op_sel_hi:[0,1] +; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v1, v1, v7 +; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v7, v16, 16, v9 +; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v8, 12, v8 op_sel_hi:[0,1] ; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v0, 12, v0 op_sel_hi:[0,1] -; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1] -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v10, 16, v1 +; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v6, 12, v6 op_sel_hi:[0,1] +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v9, 16, v1 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v1, v1, v3 -; 
GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v3, v4, v5 -; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v5, 12, v8 op_sel_hi:[0,1] -; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v4, v4, v12 -; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v0, v9, v0 -; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v1, v1, v10 -; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v3, v6, 16, v3 -; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1] -; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v4, v14, 16, v4 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v3, 0xffff, v4 +; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v4, 12, v7 op_sel_hi:[0,1] +; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v7, 0xffff, v11 +; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v0, v8, v0 +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v1, v1, v9 +; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v3, v5, 16, v3 +; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1] +; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v5, v13, 16, v7 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v7, 16, v0 ; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v1, v0 ; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v1, 12, v3 op_sel_hi:[0,1] -; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v3, 12, v4 op_sel_hi:[0,1] -; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v4, v7, v5 -; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v0, v6 +; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v4, v6, v4 +; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v3, 12, v5 op_sel_hi:[0,1] +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v0, v7 ; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1] -; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1] ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1] ; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v0, v4 ; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v1, v1, v3 ; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v0, v5 diff --git a/llvm/test/CodeGen/AMDGPU/idot8u.ll b/llvm/test/CodeGen/AMDGPU/idot8u.ll --- 
a/llvm/test/CodeGen/AMDGPU/idot8u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8u.ll @@ -2355,7 +2355,6 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 ; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX10-DL-NEXT: s_mov_b32 s10, -1 @@ -2369,49 +2368,49 @@ ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-NEXT: global_load_ushort v3, v0, s[0:1] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) -; GFX10-DL-NEXT: v_and_b32_e32 v7, 15, v1 +; GFX10-DL-NEXT: v_and_b32_e32 v6, 15, v1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-NEXT: v_and_b32_e32 v6, 15, v2 -; GFX10-DL-NEXT: v_bfe_u32 v9, v1, 4, 4 -; GFX10-DL-NEXT: v_bfe_u32 v10, v2, 4, 4 -; GFX10-DL-NEXT: v_bfe_u32 v12, v1, 8, 4 -; GFX10-DL-NEXT: v_and_b32_e32 v7, v4, v7 -; GFX10-DL-NEXT: v_and_b32_e32 v6, v4, v6 -; GFX10-DL-NEXT: v_bfe_u32 v13, v2, 8, 4 -; GFX10-DL-NEXT: v_bfe_u32 v11, v1, 16, 4 -; GFX10-DL-NEXT: v_and_b32_e32 v12, v4, v12 -; GFX10-DL-NEXT: v_lshl_or_b32 v7, v9, 16, v7 -; GFX10-DL-NEXT: v_lshl_or_b32 v6, v10, 16, v6 -; GFX10-DL-NEXT: v_bfe_u32 v9, v1, 12, 4 -; GFX10-DL-NEXT: v_bfe_u32 v10, v2, 12, 4 -; GFX10-DL-NEXT: v_and_b32_e32 v13, v4, v13 -; GFX10-DL-NEXT: v_bfe_u32 v5, v1, 24, 4 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v6, v7, v6 -; GFX10-DL-NEXT: v_bfe_u32 v7, v2, 16, 4 +; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v2 +; GFX10-DL-NEXT: v_bfe_u32 v8, v1, 4, 4 +; GFX10-DL-NEXT: v_bfe_u32 v9, v2, 4, 4 +; GFX10-DL-NEXT: v_bfe_u32 v11, v1, 8, 4 +; GFX10-DL-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX10-DL-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX10-DL-NEXT: v_bfe_u32 v12, v2, 8, 4 +; GFX10-DL-NEXT: v_bfe_u32 v10, v1, 16, 4 +; GFX10-DL-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX10-DL-NEXT: v_lshl_or_b32 v6, v8, 16, v6 +; GFX10-DL-NEXT: v_lshl_or_b32 v5, v9, 16, v5 +; GFX10-DL-NEXT: v_bfe_u32 v8, v1, 12, 4 +; GFX10-DL-NEXT: v_bfe_u32 
v9, v2, 12, 4 +; GFX10-DL-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX10-DL-NEXT: v_bfe_u32 v4, v1, 24, 4 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v5, v6, v5 +; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 16, 4 +; GFX10-DL-NEXT: v_lshl_or_b32 v8, v8, 16, v11 ; GFX10-DL-NEXT: v_lshl_or_b32 v9, v9, 16, v12 -; GFX10-DL-NEXT: v_lshl_or_b32 v10, v10, 16, v13 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 28, v1 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v12, 16, v6 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 28, v1 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v11, 16, v5 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_add_nc_u16 v3, v6, v3 +; GFX10-DL-NEXT: v_add_nc_u16 v3, v5, v3 ; GFX10-DL-NEXT: v_bfe_u32 v1, v1, 20, 4 -; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 20, 4 -; GFX10-DL-NEXT: v_and_b32_e32 v7, v4, v7 -; GFX10-DL-NEXT: v_and_b32_e32 v11, v4, v11 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v9, v9, v10 -; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v12 -; GFX10-DL-NEXT: v_bfe_u32 v10, v2, 24, 4 -; GFX10-DL-NEXT: v_lshl_or_b32 v6, v6, 16, v7 -; GFX10-DL-NEXT: v_lshl_or_b32 v1, v1, 16, v11 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 16, v9 -; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v9 +; GFX10-DL-NEXT: v_bfe_u32 v5, v2, 20, 4 +; GFX10-DL-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX10-DL-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v8, v8, v9 +; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v11 +; GFX10-DL-NEXT: v_bfe_u32 v9, v2, 24, 4 +; GFX10-DL-NEXT: v_lshl_or_b32 v5, v5, 16, v6 +; GFX10-DL-NEXT: v_lshl_or_b32 v1, v1, 16, v10 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v6, 16, v8 +; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v8 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v9, v4, v10 -; GFX10-DL-NEXT: v_and_b32_e32 v4, v4, v5 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v6 -; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v7 -; GFX10-DL-NEXT: v_lshl_or_b32 v2, v2, 16, v9 -; GFX10-DL-NEXT: v_lshl_or_b32 v4, v8, 16, v4 +; GFX10-DL-NEXT: v_and_b32_e32 v8, 0xffff, v9 +; GFX10-DL-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; 
GFX10-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v5 +; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v6 +; GFX10-DL-NEXT: v_lshl_or_b32 v2, v2, 16, v8 +; GFX10-DL-NEXT: v_lshl_or_b32 v4, v7, 16, v4 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; GFX10-DL-NEXT: v_add_nc_u16 v1, v3, v1 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v2, v4, v2 diff --git a/llvm/test/CodeGen/AMDGPU/immv216.ll b/llvm/test/CodeGen/AMDGPU/immv216.ll --- a/llvm/test/CodeGen/AMDGPU/immv216.ll +++ b/llvm/test/CodeGen/AMDGPU/immv216.ll @@ -341,12 +341,11 @@ ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], [[K]] op_sel_hi:[1,0] ; encoding: [0x00,0x40,0x8f,0xd3,0x00,0x09,0x00,0x08] ; GFX9: buffer_store_dword [[REG]] -; VI-DAG: s_movk_i32 [[K:s[0-9]+]], 0x6400 ; encoding ; VI-DAG: buffer_load_dword ; VI-NOT: and -; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, [[K]], v{{[0-9]+}} +; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0x6400, v{{[0-9]+}} ; gfx8 does not support sreg or imm in sdwa - this will be move then -; VI-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], [[K]] +; VI-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x6400 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[VK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; VI: buffer_store_dword diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll --- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll @@ -236,9 +236,8 @@ ; GCN-NOT: buffer_ ; GCN: s_lshl_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, 3 ; GCN: s_lshl_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], [[SEL]] -; GCN: s_mov_b32 [[K:s[0-9]+]], 0x1010101 -; GCN: s_and_b32 s3, s1, [[K]] -; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[K]] +; GCN: s_and_b32 s3, s1, 0x1010101 +; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x1010101 ; GCN: s_andn2_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] ; GCN: s_or_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] define amdgpu_kernel void 
@byte8_inselt(<8 x i8> addrspace(1)* %out, <8 x i8> %vec, i32 %sel) { diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll @@ -1072,9 +1072,8 @@ ; SI-NEXT: s_lshl_b32 s8, s4, 4 ; SI-NEXT: s_mov_b64 s[4:5], 0xffff ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], s8 -; SI-NEXT: s_mov_b32 s8, 0x50005 -; SI-NEXT: s_and_b32 s9, s5, s8 -; SI-NEXT: s_and_b32 s8, s4, s8 +; SI-NEXT: s_and_b32 s9, s5, 0x50005 +; SI-NEXT: s_and_b32 s8, s4, 0x50005 ; SI-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5] ; SI-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] ; SI-NEXT: v_mov_b32_e32 v0, s5 @@ -1247,10 +1246,9 @@ ; SI-NEXT: s_lshl_b32 s8, s6, 3 ; SI-NEXT: s_mov_b64 s[6:7], 0xffff ; SI-NEXT: s_lshl_b64 s[6:7], s[6:7], s8 -; SI-NEXT: s_mov_b32 s8, 0x5050505 ; SI-NEXT: s_mov_b32 s1, s9 -; SI-NEXT: s_and_b32 s9, s7, s8 -; SI-NEXT: s_and_b32 s8, s6, s8 +; SI-NEXT: s_and_b32 s9, s7, 0x5050505 +; SI-NEXT: s_and_b32 s8, s6, 0x5050505 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] ; SI-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] @@ -1271,10 +1269,9 @@ ; VI-NEXT: s_lshl_b32 s8, s6, 3 ; VI-NEXT: s_mov_b64 s[6:7], 0xffff ; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], s8 -; VI-NEXT: s_mov_b32 s8, 0x5050505 ; VI-NEXT: s_mov_b32 s1, s9 -; VI-NEXT: s_and_b32 s9, s7, s8 -; VI-NEXT: s_and_b32 s8, s6, s8 +; VI-NEXT: s_and_b32 s9, s7, 0x5050505 +; VI-NEXT: s_and_b32 s8, s6, 0x5050505 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] ; VI-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -1608,7 +1608,7 @@ ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: s_mov_b64 s[2:3], 0xffff ; VI-NEXT: v_mov_b32_e32 
v3, s1 -; VI-NEXT: s_and_b32 s1, s4, s2 +; VI-NEXT: s_and_b32 s1, s4, 0xffff ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; VI-NEXT: s_lshl_b32 s0, s1, 16 ; VI-NEXT: s_or_b32 s0, s1, s0 @@ -1636,7 +1636,7 @@ ; CI-NEXT: s_mov_b64 s[2:3], 0xffff ; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: s_lshl_b32 s1, s4, 16 -; CI-NEXT: s_and_b32 s4, s4, s2 +; CI-NEXT: s_and_b32 s4, s4, 0xffff ; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 ; CI-NEXT: s_or_b32 s0, s4, s1 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc @@ -1693,7 +1693,7 @@ ; VI-NEXT: s_mov_b64 s[2:3], 0xffff ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: s_lshl_b32 s1, s5, 4 -; VI-NEXT: s_and_b32 s4, s4, s2 +; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], s1 ; VI-NEXT: s_lshl_b32 s2, s4, 16 @@ -1719,7 +1719,7 @@ ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; CI-NEXT: s_mov_b64 s[2:3], 0xffff ; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: s_and_b32 s6, s4, s2 +; CI-NEXT: s_and_b32 s6, s4, 0xffff ; CI-NEXT: s_lshl_b32 s1, s5, 4 ; CI-NEXT: s_lshl_b32 s4, s4, 16 ; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.d16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.d16.ll @@ -37,11 +37,10 @@ ; GCN-LABEL: {{^}}buffer_store_format_d16_xyzw: ; GCN-DAG: s_load_dwordx2 s{{\[}}[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x10 -; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}} ; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16 -; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], [[K]] +; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], 0xffff{{$}} ; UNPACKED-DAG: s_lshr_b32 [[SHR1:s[0-9]+]], s[[S_DATA_1]], 16 -; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], [[K]] +; UNPACKED-DAG: s_and_b32 
[[MASKED1:s[0-9]+]], s[[S_DATA_1]], 0xffff{{$}} ; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]] ; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR1]] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll @@ -566,10 +566,9 @@ ; ; GFX10-LABEL: sample_d_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v6, 0xffff -; GFX10-NEXT: v_and_b32_e32 v4, v6, v4 -; GFX10-NEXT: v_and_b32_e32 v2, v6, v2 -; GFX10-NEXT: v_and_b32_e32 v0, v6, v0 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4 ; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2 ; GFX10-NEXT: v_lshl_or_b32 v2, v1, 16, v0 @@ -602,12 +601,11 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v12, v8 ; GFX10-NEXT: v_mov_b32_e32 v8, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v6 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_mov_b32_e32 v10, v5 -; GFX10-NEXT: v_and_b32_e32 v5, v2, v6 -; GFX10-NEXT: v_and_b32_e32 v3, v2, v3 -; GFX10-NEXT: v_and_b32_e32 v0, v2, v0 -; GFX10-NEXT: v_lshl_or_b32 v11, v7, 16, v5 +; GFX10-NEXT: v_lshl_or_b32 v11, v7, 16, v2 ; GFX10-NEXT: v_lshl_or_b32 v9, v4, 16, v3 ; GFX10-NEXT: v_lshl_or_b32 v7, v1, 16, v0 ; GFX10-NEXT: image_sample_d_g16 v[0:3], v[7:12], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D a16 @@ -653,10 +651,9 @@ ; ; GFX10-LABEL: sample_c_d_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff -; GFX10-NEXT: v_and_b32_e32 v5, v7, v5 -; GFX10-NEXT: v_and_b32_e32 v3, v7, v3 -; GFX10-NEXT: v_and_b32_e32 v1, v7, v1 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; 
GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-NEXT: v_lshl_or_b32 v5, v6, 16, v5 ; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 @@ -705,10 +702,9 @@ ; ; GFX10-LABEL: sample_d_cl_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff -; GFX10-NEXT: v_and_b32_e32 v4, v7, v4 -; GFX10-NEXT: v_and_b32_e32 v2, v7, v2 -; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4 ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 @@ -759,10 +755,9 @@ ; ; GFX10-LABEL: sample_c_d_cl_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v8, 0xffff -; GFX10-NEXT: v_and_b32_e32 v5, v8, v5 -; GFX10-NEXT: v_and_b32_e32 v3, v8, v3 -; GFX10-NEXT: v_and_b32_e32 v1, v8, v1 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-NEXT: v_lshl_or_b32 v5, v6, 16, v5 ; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 @@ -807,10 +802,9 @@ ; ; GFX10-LABEL: sample_cd_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v6, 0xffff -; GFX10-NEXT: v_and_b32_e32 v4, v6, v4 -; GFX10-NEXT: v_and_b32_e32 v2, v6, v2 -; GFX10-NEXT: v_and_b32_e32 v0, v6, v0 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4 ; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2 ; GFX10-NEXT: v_lshl_or_b32 v2, v1, 16, v0 @@ -857,10 +851,9 @@ ; ; GFX10-LABEL: sample_c_cd_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff -; GFX10-NEXT: v_and_b32_e32 v5, v7, v5 -; GFX10-NEXT: v_and_b32_e32 v3, v7, v3 -; GFX10-NEXT: v_and_b32_e32 v1, v7, v1 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX10-NEXT: 
v_and_b32_e32 v3, 0xffff, v3 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-NEXT: v_lshl_or_b32 v5, v6, 16, v5 ; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 @@ -909,10 +902,9 @@ ; ; GFX10-LABEL: sample_cd_cl_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff -; GFX10-NEXT: v_and_b32_e32 v4, v7, v4 -; GFX10-NEXT: v_and_b32_e32 v2, v7, v2 -; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4 ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 @@ -963,10 +955,9 @@ ; ; GFX10-LABEL: sample_c_cd_cl_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v8, 0xffff -; GFX10-NEXT: v_and_b32_e32 v5, v8, v5 -; GFX10-NEXT: v_and_b32_e32 v3, v8, v3 -; GFX10-NEXT: v_and_b32_e32 v1, v8, v1 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-NEXT: v_lshl_or_b32 v5, v6, 16, v5 ; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 @@ -1160,15 +1151,14 @@ ; GFX10-LABEL: sample_c_d_o_2darray_V1: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v13, v8 -; GFX10-NEXT: v_mov_b32_e32 v8, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX10-NEXT: v_mov_b32_e32 v9, v1 -; GFX10-NEXT: v_and_b32_e32 v1, v0, v6 -; GFX10-NEXT: v_and_b32_e32 v4, v0, v4 -; GFX10-NEXT: v_and_b32_e32 v0, v0, v2 -; GFX10-NEXT: v_lshl_or_b32 v12, v7, 16, v1 -; GFX10-NEXT: v_lshl_or_b32 v11, v5, 16, v4 -; GFX10-NEXT: v_lshl_or_b32 v10, v3, 16, v0 +; GFX10-NEXT: v_mov_b32_e32 v8, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-NEXT: v_lshl_or_b32 v12, v7, 16, v0 +; GFX10-NEXT: v_lshl_or_b32 v11, v5, 16, v1 +; GFX10-NEXT: 
v_lshl_or_b32 v10, v3, 16, v2 ; GFX10-NEXT: image_sample_c_d_o_g16 v0, v[8:13], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -1197,15 +1187,14 @@ ; GFX10-LABEL: sample_c_d_o_2darray_V2: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v13, v8 -; GFX10-NEXT: v_mov_b32_e32 v8, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX10-NEXT: v_mov_b32_e32 v9, v1 -; GFX10-NEXT: v_and_b32_e32 v1, v0, v6 -; GFX10-NEXT: v_and_b32_e32 v4, v0, v4 -; GFX10-NEXT: v_and_b32_e32 v0, v0, v2 -; GFX10-NEXT: v_lshl_or_b32 v12, v7, 16, v1 -; GFX10-NEXT: v_lshl_or_b32 v11, v5, 16, v4 -; GFX10-NEXT: v_lshl_or_b32 v10, v3, 16, v0 +; GFX10-NEXT: v_mov_b32_e32 v8, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-NEXT: v_lshl_or_b32 v12, v7, 16, v0 +; GFX10-NEXT: v_lshl_or_b32 v11, v5, 16, v1 +; GFX10-NEXT: v_lshl_or_b32 v10, v3, 16, v2 ; GFX10-NEXT: image_sample_c_d_o_g16 v[0:1], v[8:13], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.a16.dim.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.a16.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.a16.dim.ll @@ -62,11 +62,10 @@ ; ; GFX10GISEL-LABEL: sample_d_3d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_mov_b32_e32 v9, 0xffff ; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10GISEL-NEXT: v_and_or_b32 v6, v6, v9, v7 -; GFX10GISEL-NEXT: v_and_or_b32 v7, v8, v9, s12 +; GFX10GISEL-NEXT: v_and_or_b32 v6, 0xffff, v6, v7 +; GFX10GISEL-NEXT: v_and_or_b32 v7, 0xffff, v8, s12 ; GFX10GISEL-NEXT: image_sample_d v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf 
dim:SQ_RSRC_IMG_3D a16 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -152,11 +151,10 @@ ; ; GFX10GISEL-LABEL: sample_d_cl_2d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_mov_b32_e32 v7, 0xffff ; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10GISEL-NEXT: v_and_or_b32 v4, v4, v7, v5 -; GFX10GISEL-NEXT: v_and_or_b32 v5, v6, v7, s12 +; GFX10GISEL-NEXT: v_and_or_b32 v4, 0xffff, v4, v5 +; GFX10GISEL-NEXT: v_and_or_b32 v5, 0xffff, v6, s12 ; GFX10GISEL-NEXT: image_sample_d_cl v[0:3], v[0:5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -203,11 +201,10 @@ ; ; GFX10GISEL-LABEL: sample_c_d_cl_2d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_mov_b32_e32 v8, 0xffff ; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10GISEL-NEXT: v_and_or_b32 v5, v5, v8, v6 -; GFX10GISEL-NEXT: v_and_or_b32 v6, v7, v8, s12 +; GFX10GISEL-NEXT: v_and_or_b32 v5, 0xffff, v5, v6 +; GFX10GISEL-NEXT: v_and_or_b32 v6, 0xffff, v7, s12 ; GFX10GISEL-NEXT: image_sample_c_d_cl v[0:3], v[0:6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -333,11 +330,10 @@ ; ; GFX10GISEL-LABEL: sample_cd_cl_2d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_mov_b32_e32 v7, 0xffff ; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10GISEL-NEXT: v_and_or_b32 v4, v4, v7, v5 -; GFX10GISEL-NEXT: v_and_or_b32 v5, v6, v7, s12 +; GFX10GISEL-NEXT: v_and_or_b32 v4, 0xffff, v4, v5 +; GFX10GISEL-NEXT: v_and_or_b32 v5, 0xffff, v6, s12 ; GFX10GISEL-NEXT: image_sample_cd_cl v[0:3], v[0:5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -384,11 
+380,10 @@ ; ; GFX10GISEL-LABEL: sample_c_cd_cl_2d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_mov_b32_e32 v8, 0xffff ; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10GISEL-NEXT: v_and_or_b32 v5, v5, v8, v6 -; GFX10GISEL-NEXT: v_and_or_b32 v6, v7, v8, s12 +; GFX10GISEL-NEXT: v_and_or_b32 v5, 0xffff, v5, v6 +; GFX10GISEL-NEXT: v_and_or_b32 v6, 0xffff, v7, s12 ; GFX10GISEL-NEXT: image_sample_c_cd_cl v[0:3], v[0:6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -415,11 +410,10 @@ ; ; GFX10GISEL-LABEL: sample_c_d_o_2darray_V1: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_mov_b32_e32 v9, 0xffff ; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10GISEL-NEXT: v_and_or_b32 v6, v6, v9, v7 -; GFX10GISEL-NEXT: v_and_or_b32 v7, v8, v9, s12 +; GFX10GISEL-NEXT: v_and_or_b32 v6, 0xffff, v6, v7 +; GFX10GISEL-NEXT: v_and_or_b32 v7, 0xffff, v8, s12 ; GFX10GISEL-NEXT: image_sample_c_d_o v0, v[0:7], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY a16 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -446,11 +440,10 @@ ; ; GFX10GISEL-LABEL: sample_c_d_o_2darray_V2: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_mov_b32_e32 v9, 0xffff ; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10GISEL-NEXT: v_and_or_b32 v6, v6, v9, v7 -; GFX10GISEL-NEXT: v_and_or_b32 v7, v8, v9, s12 +; GFX10GISEL-NEXT: v_and_or_b32 v6, 0xffff, v6, v7 +; GFX10GISEL-NEXT: v_and_or_b32 v7, 0xffff, v8, s12 ; GFX10GISEL-NEXT: image_sample_c_d_o v[0:1], v[0:7], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY a16 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -490,10 +483,9 @@ ; ; GFX10GISEL-LABEL: sample_g16_noa16_d_1d: ; GFX10GISEL: ; %bb.0: ; %main_body -; 
GFX10GISEL-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10GISEL-NEXT: v_and_or_b32 v0, v0, v3, s12 -; GFX10GISEL-NEXT: v_and_or_b32 v1, v1, v3, s12 +; GFX10GISEL-NEXT: v_and_or_b32 v0, 0xffff, v0, s12 +; GFX10GISEL-NEXT: v_and_or_b32 v1, 0xffff, v1, s12 ; GFX10GISEL-NEXT: image_sample_d_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -505,9 +497,8 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { ; GFX10-LABEL: sample_g16_noa16_d_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v6, 0xffff -; GFX10-NEXT: v_and_b32_e32 v2, v6, v2 -; GFX10-NEXT: v_and_b32_e32 v0, v6, v0 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX10-NEXT: image_sample_d_g16 v[0:3], [v0, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D @@ -516,11 +507,10 @@ ; ; GFX10GISEL-LABEL: sample_g16_noa16_d_2d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_mov_b32_e32 v6, 0xffff ; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX10GISEL-NEXT: v_and_or_b32 v0, v0, v6, v1 -; GFX10GISEL-NEXT: v_and_or_b32 v1, v2, v6, v3 +; GFX10GISEL-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX10GISEL-NEXT: v_and_or_b32 v1, 0xffff, v2, v3 ; GFX10GISEL-NEXT: image_sample_d_g16 v[0:3], [v0, v1, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -534,10 +524,9 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v9, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX10-NEXT: v_and_b32_e32 v9, v2, v9 -; GFX10-NEXT: 
v_and_b32_e32 v0, v2, v0 -; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v9 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v9 +; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v2 ; GFX10-NEXT: v_lshl_or_b32 v2, v1, 16, v0 ; GFX10-NEXT: image_sample_d_g16 v[0:3], v[2:8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -547,14 +536,13 @@ ; GFX10GISEL: ; %bb.0: ; %main_body ; GFX10GISEL-NEXT: v_mov_b32_e32 v9, v2 ; GFX10GISEL-NEXT: v_mov_b32_e32 v10, v3 -; GFX10GISEL-NEXT: v_mov_b32_e32 v11, 0xffff ; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10GISEL-NEXT: v_and_or_b32 v3, v9, v11, s12 -; GFX10GISEL-NEXT: v_and_or_b32 v2, v0, v11, v1 -; GFX10GISEL-NEXT: v_and_or_b32 v4, v10, v11, v4 -; GFX10GISEL-NEXT: v_and_or_b32 v5, v5, v11, s12 +; GFX10GISEL-NEXT: v_and_or_b32 v3, 0xffff, v9, s12 +; GFX10GISEL-NEXT: v_and_or_b32 v2, 0xffff, v0, v1 +; GFX10GISEL-NEXT: v_and_or_b32 v4, 0xffff, v10, v4 +; GFX10GISEL-NEXT: v_and_or_b32 v5, 0xffff, v5, s12 ; GFX10GISEL-NEXT: image_sample_d_g16 v[0:3], v[2:8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -572,10 +560,9 @@ ; ; GFX10GISEL-LABEL: sample_g16_noa16_c_d_1d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10GISEL-NEXT: v_and_or_b32 v1, v1, v4, s12 -; GFX10GISEL-NEXT: v_and_or_b32 v2, v2, v4, s12 +; GFX10GISEL-NEXT: v_and_or_b32 v1, 0xffff, v1, s12 +; GFX10GISEL-NEXT: v_and_or_b32 v2, 0xffff, v2, s12 ; GFX10GISEL-NEXT: image_sample_c_d_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -587,9 +574,8 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, 
float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { ; GFX10-LABEL: sample_g16_noa16_c_d_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff -; GFX10-NEXT: v_and_b32_e32 v3, v7, v3 -; GFX10-NEXT: v_and_b32_e32 v1, v7, v1 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX10-NEXT: image_sample_c_d_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D @@ -598,11 +584,10 @@ ; ; GFX10GISEL-LABEL: sample_g16_noa16_c_d_2d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_mov_b32_e32 v7, 0xffff ; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX10GISEL-NEXT: v_and_or_b32 v1, v1, v7, v2 -; GFX10GISEL-NEXT: v_and_or_b32 v2, v3, v7, v4 +; GFX10GISEL-NEXT: v_and_or_b32 v1, 0xffff, v1, v2 +; GFX10GISEL-NEXT: v_and_or_b32 v2, 0xffff, v3, v4 ; GFX10GISEL-NEXT: image_sample_c_d_g16 v[0:3], [v0, v1, v2, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -620,10 +605,9 @@ ; ; GFX10GISEL-LABEL: sample_g16_noa16_d_cl_1d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10GISEL-NEXT: v_and_or_b32 v0, v0, v4, s12 -; GFX10GISEL-NEXT: v_and_or_b32 v1, v1, v4, s12 +; GFX10GISEL-NEXT: v_and_or_b32 v0, 0xffff, v0, s12 +; GFX10GISEL-NEXT: v_and_or_b32 v1, 0xffff, v1, s12 ; GFX10GISEL-NEXT: image_sample_d_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -635,9 +619,8 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) 
{ ; GFX10-LABEL: sample_g16_noa16_d_cl_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff -; GFX10-NEXT: v_and_b32_e32 v2, v7, v2 -; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D @@ -646,11 +629,10 @@ ; ; GFX10GISEL-LABEL: sample_g16_noa16_d_cl_2d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_mov_b32_e32 v7, 0xffff ; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX10GISEL-NEXT: v_and_or_b32 v0, v0, v7, v1 -; GFX10GISEL-NEXT: v_and_or_b32 v1, v2, v7, v3 +; GFX10GISEL-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX10GISEL-NEXT: v_and_or_b32 v1, 0xffff, v2, v3 ; GFX10GISEL-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v1, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -668,10 +650,9 @@ ; ; GFX10GISEL-LABEL: sample_g16_noa16_c_d_cl_1d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_mov_b32_e32 v5, 0xffff ; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10GISEL-NEXT: v_and_or_b32 v1, v1, v5, s12 -; GFX10GISEL-NEXT: v_and_or_b32 v2, v2, v5, s12 +; GFX10GISEL-NEXT: v_and_or_b32 v1, 0xffff, v1, s12 +; GFX10GISEL-NEXT: v_and_or_b32 v2, 0xffff, v2, s12 ; GFX10GISEL-NEXT: image_sample_c_d_cl_g16 v[0:3], v[0:4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -685,11 +666,10 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v8, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff -; GFX10-NEXT: v_and_b32_e32 v3, v0, v3 -; GFX10-NEXT: v_and_b32_e32 v0, v0, v1 -; GFX10-NEXT: v_lshl_or_b32 v4, v4, 
16, v3 -; GFX10-NEXT: v_lshl_or_b32 v3, v8, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v0 +; GFX10-NEXT: v_lshl_or_b32 v3, v8, 16, v1 ; GFX10-NEXT: image_sample_c_d_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -699,11 +679,10 @@ ; GFX10GISEL-NEXT: v_mov_b32_e32 v8, v2 ; GFX10GISEL-NEXT: v_mov_b32_e32 v9, v3 ; GFX10GISEL-NEXT: v_mov_b32_e32 v2, v0 -; GFX10GISEL-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v8 -; GFX10GISEL-NEXT: v_and_or_b32 v4, v9, v0, v4 -; GFX10GISEL-NEXT: v_and_or_b32 v3, v1, v0, v3 +; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v8 +; GFX10GISEL-NEXT: v_and_or_b32 v4, 0xffff, v9, v4 +; GFX10GISEL-NEXT: v_and_or_b32 v3, 0xffff, v1, v0 ; GFX10GISEL-NEXT: image_sample_c_d_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -721,10 +700,9 @@ ; ; GFX10GISEL-LABEL: sample_g16_noa16_cd_1d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10GISEL-NEXT: v_and_or_b32 v0, v0, v3, s12 -; GFX10GISEL-NEXT: v_and_or_b32 v1, v1, v3, s12 +; GFX10GISEL-NEXT: v_and_or_b32 v0, 0xffff, v0, s12 +; GFX10GISEL-NEXT: v_and_or_b32 v1, 0xffff, v1, s12 ; GFX10GISEL-NEXT: image_sample_cd_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -736,9 +714,8 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { ; GFX10-LABEL: sample_g16_noa16_cd_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 
v6, 0xffff -; GFX10-NEXT: v_and_b32_e32 v2, v6, v2 -; GFX10-NEXT: v_and_b32_e32 v0, v6, v0 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX10-NEXT: image_sample_cd_g16 v[0:3], [v0, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D @@ -747,11 +724,10 @@ ; ; GFX10GISEL-LABEL: sample_g16_noa16_cd_2d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_mov_b32_e32 v6, 0xffff ; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX10GISEL-NEXT: v_and_or_b32 v0, v0, v6, v1 -; GFX10GISEL-NEXT: v_and_or_b32 v1, v2, v6, v3 +; GFX10GISEL-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX10GISEL-NEXT: v_and_or_b32 v1, 0xffff, v2, v3 ; GFX10GISEL-NEXT: image_sample_cd_g16 v[0:3], [v0, v1, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -769,10 +745,9 @@ ; ; GFX10GISEL-LABEL: sample_g16_noa16_c_cd_1d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10GISEL-NEXT: v_and_or_b32 v1, v1, v4, s12 -; GFX10GISEL-NEXT: v_and_or_b32 v2, v2, v4, s12 +; GFX10GISEL-NEXT: v_and_or_b32 v1, 0xffff, v1, s12 +; GFX10GISEL-NEXT: v_and_or_b32 v2, 0xffff, v2, s12 ; GFX10GISEL-NEXT: image_sample_c_cd_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -784,9 +759,8 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { ; GFX10-LABEL: sample_g16_noa16_c_cd_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff -; GFX10-NEXT: v_and_b32_e32 v3, v7, v3 -; GFX10-NEXT: v_and_b32_e32 v1, v7, v1 
+; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D @@ -795,11 +769,10 @@ ; ; GFX10GISEL-LABEL: sample_g16_noa16_c_cd_2d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_mov_b32_e32 v7, 0xffff ; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX10GISEL-NEXT: v_and_or_b32 v1, v1, v7, v2 -; GFX10GISEL-NEXT: v_and_or_b32 v2, v3, v7, v4 +; GFX10GISEL-NEXT: v_and_or_b32 v1, 0xffff, v1, v2 +; GFX10GISEL-NEXT: v_and_or_b32 v2, 0xffff, v3, v4 ; GFX10GISEL-NEXT: image_sample_c_cd_g16 v[0:3], [v0, v1, v2, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -817,10 +790,9 @@ ; ; GFX10GISEL-LABEL: sample_g16_noa16_cd_cl_1d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10GISEL-NEXT: v_and_or_b32 v0, v0, v4, s12 -; GFX10GISEL-NEXT: v_and_or_b32 v1, v1, v4, s12 +; GFX10GISEL-NEXT: v_and_or_b32 v0, 0xffff, v0, s12 +; GFX10GISEL-NEXT: v_and_or_b32 v1, 0xffff, v1, s12 ; GFX10GISEL-NEXT: image_sample_cd_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -832,9 +804,8 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) { ; GFX10-LABEL: sample_g16_noa16_cd_cl_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff -; GFX10-NEXT: v_and_b32_e32 v2, v7, v2 -; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-NEXT: v_and_b32_e32 v0, 
0xffff, v0 ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v0, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D @@ -843,11 +814,10 @@ ; ; GFX10GISEL-LABEL: sample_g16_noa16_cd_cl_2d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_mov_b32_e32 v7, 0xffff ; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX10GISEL-NEXT: v_and_or_b32 v0, v0, v7, v1 -; GFX10GISEL-NEXT: v_and_or_b32 v1, v2, v7, v3 +; GFX10GISEL-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX10GISEL-NEXT: v_and_or_b32 v1, 0xffff, v2, v3 ; GFX10GISEL-NEXT: image_sample_cd_cl_g16 v[0:3], [v0, v1, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -865,10 +835,9 @@ ; ; GFX10GISEL-LABEL: sample_g16_noa16_c_cd_cl_1d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_mov_b32_e32 v5, 0xffff ; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10GISEL-NEXT: v_and_or_b32 v1, v1, v5, s12 -; GFX10GISEL-NEXT: v_and_or_b32 v2, v2, v5, s12 +; GFX10GISEL-NEXT: v_and_or_b32 v1, 0xffff, v1, s12 +; GFX10GISEL-NEXT: v_and_or_b32 v2, 0xffff, v2, s12 ; GFX10GISEL-NEXT: image_sample_c_cd_cl_g16 v[0:3], v[0:4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -882,11 +851,10 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v8, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff -; GFX10-NEXT: v_and_b32_e32 v3, v0, v3 -; GFX10-NEXT: v_and_b32_e32 v0, v0, v1 -; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v3 -; GFX10-NEXT: v_lshl_or_b32 v3, v8, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v0 +; GFX10-NEXT: v_lshl_or_b32 v3, v8, 16, v1 ; GFX10-NEXT: image_sample_c_cd_cl_g16 
v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -896,11 +864,10 @@ ; GFX10GISEL-NEXT: v_mov_b32_e32 v8, v2 ; GFX10GISEL-NEXT: v_mov_b32_e32 v9, v3 ; GFX10GISEL-NEXT: v_mov_b32_e32 v2, v0 -; GFX10GISEL-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v8 -; GFX10GISEL-NEXT: v_and_or_b32 v4, v9, v0, v4 -; GFX10GISEL-NEXT: v_and_or_b32 v3, v1, v0, v3 +; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v8 +; GFX10GISEL-NEXT: v_and_or_b32 v4, 0xffff, v9, v4 +; GFX10GISEL-NEXT: v_and_or_b32 v3, 0xffff, v1, v0 ; GFX10GISEL-NEXT: image_sample_c_cd_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -913,14 +880,13 @@ ; GFX10-LABEL: sample_g16_noa16_c_d_o_2darray_V1: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v9, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX10-NEXT: v_mov_b32_e32 v10, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: v_and_b32_e32 v1, v0, v4 -; GFX10-NEXT: v_and_b32_e32 v0, v0, v9 -; GFX10-NEXT: v_lshl_or_b32 v5, v5, 16, v1 -; GFX10-NEXT: v_lshl_or_b32 v4, v10, 16, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; GFX10-NEXT: v_lshl_or_b32 v5, v5, 16, v0 +; GFX10-NEXT: v_lshl_or_b32 v4, v10, 16, v1 ; GFX10-NEXT: image_sample_c_d_o_g16 v0, v[2:8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -932,11 +898,10 @@ ; GFX10GISEL-NEXT: v_mov_b32_e32 v11, v4 ; GFX10GISEL-NEXT: v_mov_b32_e32 v2, v0 ; GFX10GISEL-NEXT: v_mov_b32_e32 v3, v1 -; GFX10GISEL-NEXT: v_mov_b32_e32 v0, 0xffff -; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v9 -; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; 
GFX10GISEL-NEXT: v_and_or_b32 v4, v10, v0, v1 -; GFX10GISEL-NEXT: v_and_or_b32 v5, v11, v0, v5 +; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v9 +; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX10GISEL-NEXT: v_and_or_b32 v4, 0xffff, v10, v0 +; GFX10GISEL-NEXT: v_and_or_b32 v5, 0xffff, v11, v1 ; GFX10GISEL-NEXT: image_sample_c_d_o_g16 v0, v[2:8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -949,14 +914,13 @@ ; GFX10-LABEL: sample_g16_noa16_c_d_o_2darray_V2: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v9, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX10-NEXT: v_mov_b32_e32 v10, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: v_and_b32_e32 v1, v0, v4 -; GFX10-NEXT: v_and_b32_e32 v0, v0, v9 -; GFX10-NEXT: v_lshl_or_b32 v5, v5, 16, v1 -; GFX10-NEXT: v_lshl_or_b32 v4, v10, 16, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; GFX10-NEXT: v_lshl_or_b32 v5, v5, 16, v0 +; GFX10-NEXT: v_lshl_or_b32 v4, v10, 16, v1 ; GFX10-NEXT: image_sample_c_d_o_g16 v[0:1], v[2:8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -968,11 +932,10 @@ ; GFX10GISEL-NEXT: v_mov_b32_e32 v11, v4 ; GFX10GISEL-NEXT: v_mov_b32_e32 v2, v0 ; GFX10GISEL-NEXT: v_mov_b32_e32 v3, v1 -; GFX10GISEL-NEXT: v_mov_b32_e32 v0, 0xffff -; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v9 -; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX10GISEL-NEXT: v_and_or_b32 v4, v10, v0, v1 -; GFX10GISEL-NEXT: v_and_or_b32 v5, v11, v0, v5 +; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v9 +; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX10GISEL-NEXT: v_and_or_b32 v4, 0xffff, v10, v0 +; GFX10GISEL-NEXT: v_and_or_b32 v5, 0xffff, v11, v1 ; GFX10GISEL-NEXT: image_sample_c_d_o_g16 v[0:1], v[2:8], s[0:7], 
s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll @@ -15,9 +15,8 @@ define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { ; GFX10-LABEL: sample_d_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v6, 0xffff ; encoding: [0xff,0x02,0x0c,0x7e,0xff,0xff,0x00,0x00] -; GFX10-NEXT: v_and_b32_e32 v2, v6, v2 ; encoding: [0x06,0x05,0x04,0x36] -; GFX10-NEXT: v_and_b32_e32 v0, v6, v0 ; encoding: [0x06,0x01,0x00,0x36] +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; encoding: [0xff,0x04,0x04,0x36,0xff,0xff,0x00,0x00] +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; encoding: [0xff,0x00,0x00,0x36,0xff,0xff,0x00,0x00] ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; encoding: [0x02,0x00,0x6f,0xd7,0x03,0x21,0x09,0x04] ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; encoding: [0x00,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04] ; GFX10-NEXT: image_sample_d_g16 v[0:3], [v0, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0x88,0xf0,0x00,0x00,0x40,0x00,0x02,0x04,0x05,0x00] @@ -33,10 +32,9 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v9, v3 ; encoding: [0x03,0x03,0x12,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; encoding: [0x02,0x03,0x06,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff ; encoding: [0xff,0x02,0x04,0x7e,0xff,0xff,0x00,0x00] -; GFX10-NEXT: v_and_b32_e32 v9, v2, v9 ; encoding: [0x02,0x13,0x12,0x36] -; GFX10-NEXT: v_and_b32_e32 v0, v2, v0 ; encoding: [0x02,0x01,0x00,0x36] -; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v9 ; encoding: [0x04,0x00,0x6f,0xd7,0x04,0x21,0x25,0x04] +; 
GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; encoding: [0xff,0x00,0x00,0x36,0xff,0xff,0x00,0x00] +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v9 ; encoding: [0xff,0x12,0x04,0x36,0xff,0xff,0x00,0x00] +; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v2 ; encoding: [0x04,0x00,0x6f,0xd7,0x04,0x21,0x09,0x04] ; GFX10-NEXT: v_lshl_or_b32 v2, v1, 16, v0 ; encoding: [0x02,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04] ; GFX10-NEXT: image_sample_d_g16 v[0:3], v[2:8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D ; encoding: [0x11,0x0f,0x88,0xf0,0x02,0x00,0x40,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] @@ -60,9 +58,8 @@ define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { ; GFX10-LABEL: sample_c_d_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff ; encoding: [0xff,0x02,0x0e,0x7e,0xff,0xff,0x00,0x00] -; GFX10-NEXT: v_and_b32_e32 v3, v7, v3 ; encoding: [0x07,0x07,0x06,0x36] -; GFX10-NEXT: v_and_b32_e32 v1, v7, v1 ; encoding: [0x07,0x03,0x02,0x36] +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; encoding: [0xff,0x06,0x06,0x36,0xff,0xff,0x00,0x00] +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; encoding: [0xff,0x02,0x02,0x36,0xff,0xff,0x00,0x00] ; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; encoding: [0x03,0x00,0x6f,0xd7,0x04,0x21,0x0d,0x04] ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; encoding: [0x01,0x00,0x6f,0xd7,0x02,0x21,0x05,0x04] ; GFX10-NEXT: image_sample_c_d_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0xa8,0xf0,0x00,0x00,0x40,0x00,0x01,0x03,0x05,0x06] @@ -87,9 +84,8 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) { ; GFX10-LABEL: sample_d_cl_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff ; encoding: 
[0xff,0x02,0x0e,0x7e,0xff,0xff,0x00,0x00] -; GFX10-NEXT: v_and_b32_e32 v2, v7, v2 ; encoding: [0x07,0x05,0x04,0x36] -; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 ; encoding: [0x07,0x01,0x00,0x36] +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; encoding: [0xff,0x04,0x04,0x36,0xff,0xff,0x00,0x00] +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; encoding: [0xff,0x00,0x00,0x36,0xff,0xff,0x00,0x00] ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; encoding: [0x02,0x00,0x6f,0xd7,0x03,0x21,0x09,0x04] ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; encoding: [0x00,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04] ; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0x8c,0xf0,0x00,0x00,0x40,0x00,0x02,0x04,0x05,0x06] @@ -116,11 +112,10 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v8, v2 ; encoding: [0x02,0x03,0x10,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff ; encoding: [0xff,0x02,0x00,0x7e,0xff,0xff,0x00,0x00] -; GFX10-NEXT: v_and_b32_e32 v3, v0, v3 ; encoding: [0x00,0x07,0x06,0x36] -; GFX10-NEXT: v_and_b32_e32 v0, v0, v1 ; encoding: [0x00,0x03,0x00,0x36] -; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v3 ; encoding: [0x04,0x00,0x6f,0xd7,0x04,0x21,0x0d,0x04] -; GFX10-NEXT: v_lshl_or_b32 v3, v8, 16, v0 ; encoding: [0x03,0x00,0x6f,0xd7,0x08,0x21,0x01,0x04] +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v3 ; encoding: [0xff,0x06,0x00,0x36,0xff,0xff,0x00,0x00] +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; encoding: [0xff,0x02,0x02,0x36,0xff,0xff,0x00,0x00] +; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v0 ; encoding: [0x04,0x00,0x6f,0xd7,0x04,0x21,0x01,0x04] +; GFX10-NEXT: v_lshl_or_b32 v3, v8, 16, v1 ; encoding: [0x03,0x00,0x6f,0xd7,0x08,0x21,0x05,0x04] ; GFX10-NEXT: image_sample_c_d_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x09,0x0f,0xac,0xf0,0x02,0x00,0x40,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: 
[0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog @@ -143,9 +138,8 @@ define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { ; GFX10-LABEL: sample_cd_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v6, 0xffff ; encoding: [0xff,0x02,0x0c,0x7e,0xff,0xff,0x00,0x00] -; GFX10-NEXT: v_and_b32_e32 v2, v6, v2 ; encoding: [0x06,0x05,0x04,0x36] -; GFX10-NEXT: v_and_b32_e32 v0, v6, v0 ; encoding: [0x06,0x01,0x00,0x36] +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; encoding: [0xff,0x04,0x04,0x36,0xff,0xff,0x00,0x00] +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; encoding: [0xff,0x00,0x00,0x36,0xff,0xff,0x00,0x00] ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; encoding: [0x02,0x00,0x6f,0xd7,0x03,0x21,0x09,0x04] ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; encoding: [0x00,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04] ; GFX10-NEXT: image_sample_cd_g16 v[0:3], [v0, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0xa0,0xf1,0x00,0x00,0x40,0x00,0x02,0x04,0x05,0x00] @@ -170,9 +164,8 @@ define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { ; GFX10-LABEL: sample_c_cd_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff ; encoding: [0xff,0x02,0x0e,0x7e,0xff,0xff,0x00,0x00] -; GFX10-NEXT: v_and_b32_e32 v3, v7, v3 ; encoding: [0x07,0x07,0x06,0x36] -; GFX10-NEXT: v_and_b32_e32 v1, v7, v1 ; encoding: [0x07,0x03,0x02,0x36] +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; encoding: [0xff,0x06,0x06,0x36,0xff,0xff,0x00,0x00] +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; encoding: [0xff,0x02,0x02,0x36,0xff,0xff,0x00,0x00] ; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; encoding: [0x03,0x00,0x6f,0xd7,0x04,0x21,0x0d,0x04] ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; encoding: 
[0x01,0x00,0x6f,0xd7,0x02,0x21,0x05,0x04] ; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0xa8,0xf1,0x00,0x00,0x40,0x00,0x01,0x03,0x05,0x06] @@ -197,9 +190,8 @@ define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) { ; GFX10-LABEL: sample_cd_cl_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff ; encoding: [0xff,0x02,0x0e,0x7e,0xff,0xff,0x00,0x00] -; GFX10-NEXT: v_and_b32_e32 v2, v7, v2 ; encoding: [0x07,0x05,0x04,0x36] -; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 ; encoding: [0x07,0x01,0x00,0x36] +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; encoding: [0xff,0x04,0x04,0x36,0xff,0xff,0x00,0x00] +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; encoding: [0xff,0x00,0x00,0x36,0xff,0xff,0x00,0x00] ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; encoding: [0x02,0x00,0x6f,0xd7,0x03,0x21,0x09,0x04] ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; encoding: [0x00,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04] ; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v0, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0xa4,0xf1,0x00,0x00,0x40,0x00,0x02,0x04,0x05,0x06] @@ -226,11 +218,10 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v8, v2 ; encoding: [0x02,0x03,0x10,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff ; encoding: [0xff,0x02,0x00,0x7e,0xff,0xff,0x00,0x00] -; GFX10-NEXT: v_and_b32_e32 v3, v0, v3 ; encoding: [0x00,0x07,0x06,0x36] -; GFX10-NEXT: v_and_b32_e32 v0, v0, v1 ; encoding: [0x00,0x03,0x00,0x36] -; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v3 ; encoding: [0x04,0x00,0x6f,0xd7,0x04,0x21,0x0d,0x04] -; GFX10-NEXT: v_lshl_or_b32 v3, v8, 16, v0 ; encoding: [0x03,0x00,0x6f,0xd7,0x08,0x21,0x01,0x04] +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v3 ; encoding: 
[0xff,0x06,0x00,0x36,0xff,0xff,0x00,0x00] +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; encoding: [0xff,0x02,0x02,0x36,0xff,0xff,0x00,0x00] +; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v0 ; encoding: [0x04,0x00,0x6f,0xd7,0x04,0x21,0x01,0x04] +; GFX10-NEXT: v_lshl_or_b32 v3, v8, 16, v1 ; encoding: [0x03,0x00,0x6f,0xd7,0x08,0x21,0x05,0x04] ; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x09,0x0f,0xac,0xf1,0x02,0x00,0x40,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog @@ -243,14 +234,13 @@ ; GFX10-LABEL: sample_c_d_o_2darray_V1: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v9, v2 ; encoding: [0x02,0x03,0x12,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff ; encoding: [0xff,0x02,0x00,0x7e,0xff,0xff,0x00,0x00] ; GFX10-NEXT: v_mov_b32_e32 v10, v3 ; encoding: [0x03,0x03,0x14,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; encoding: [0x01,0x03,0x06,0x7e] -; GFX10-NEXT: v_and_b32_e32 v1, v0, v4 ; encoding: [0x00,0x09,0x02,0x36] -; GFX10-NEXT: v_and_b32_e32 v0, v0, v9 ; encoding: [0x00,0x13,0x00,0x36] -; GFX10-NEXT: v_lshl_or_b32 v5, v5, 16, v1 ; encoding: [0x05,0x00,0x6f,0xd7,0x05,0x21,0x05,0x04] -; GFX10-NEXT: v_lshl_or_b32 v4, v10, 16, v0 ; encoding: [0x04,0x00,0x6f,0xd7,0x0a,0x21,0x01,0x04] +; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v4 ; encoding: [0xff,0x08,0x00,0x36,0xff,0xff,0x00,0x00] +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v9 ; encoding: [0xff,0x12,0x02,0x36,0xff,0xff,0x00,0x00] +; GFX10-NEXT: v_lshl_or_b32 v5, v5, 16, v0 ; encoding: [0x05,0x00,0x6f,0xd7,0x05,0x21,0x01,0x04] +; GFX10-NEXT: v_lshl_or_b32 v4, v10, 16, v1 ; encoding: [0x04,0x00,0x6f,0xd7,0x0a,0x21,0x05,0x04] ; GFX10-NEXT: image_sample_c_d_o_g16 v0, v[2:8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY ; encoding: 
[0x29,0x04,0xe8,0xf0,0x02,0x00,0x40,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog @@ -263,14 +253,13 @@ ; GFX10-LABEL: sample_c_d_o_2darray_V2: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v9, v2 ; encoding: [0x02,0x03,0x12,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff ; encoding: [0xff,0x02,0x00,0x7e,0xff,0xff,0x00,0x00] ; GFX10-NEXT: v_mov_b32_e32 v10, v3 ; encoding: [0x03,0x03,0x14,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; encoding: [0x01,0x03,0x06,0x7e] -; GFX10-NEXT: v_and_b32_e32 v1, v0, v4 ; encoding: [0x00,0x09,0x02,0x36] -; GFX10-NEXT: v_and_b32_e32 v0, v0, v9 ; encoding: [0x00,0x13,0x00,0x36] -; GFX10-NEXT: v_lshl_or_b32 v5, v5, 16, v1 ; encoding: [0x05,0x00,0x6f,0xd7,0x05,0x21,0x05,0x04] -; GFX10-NEXT: v_lshl_or_b32 v4, v10, 16, v0 ; encoding: [0x04,0x00,0x6f,0xd7,0x0a,0x21,0x01,0x04] +; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v4 ; encoding: [0xff,0x08,0x00,0x36,0xff,0xff,0x00,0x00] +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v9 ; encoding: [0xff,0x12,0x02,0x36,0xff,0xff,0x00,0x00] +; GFX10-NEXT: v_lshl_or_b32 v5, v5, 16, v0 ; encoding: [0x05,0x00,0x6f,0xd7,0x05,0x21,0x01,0x04] +; GFX10-NEXT: v_lshl_or_b32 v4, v10, 16, v1 ; encoding: [0x04,0x00,0x6f,0xd7,0x0a,0x21,0x05,0x04] ; GFX10-NEXT: image_sample_c_d_o_g16 v[0:1], v[2:8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY ; encoding: [0x29,0x06,0xe8,0xf0,0x02,0x00,0x40,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll @@ -15,9 +15,8 @@ define amdgpu_ps <4 x float> 
@sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { ; GFX10-LABEL: sample_d_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v6, 0xffff -; GFX10-NEXT: v_and_b32_e32 v2, v6, v2 -; GFX10-NEXT: v_and_b32_e32 v0, v6, v0 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX10-NEXT: image_sample_d_g16 v[0:3], [v0, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D @@ -33,10 +32,9 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v9, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX10-NEXT: v_and_b32_e32 v9, v2, v9 -; GFX10-NEXT: v_and_b32_e32 v0, v2, v0 -; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v9 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v9 +; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v2 ; GFX10-NEXT: v_lshl_or_b32 v2, v1, 16, v0 ; GFX10-NEXT: image_sample_d_g16 v[0:3], v[2:8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -60,9 +58,8 @@ define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { ; GFX10-LABEL: sample_c_d_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff -; GFX10-NEXT: v_and_b32_e32 v3, v7, v3 -; GFX10-NEXT: v_and_b32_e32 v1, v7, v1 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX10-NEXT: image_sample_c_d_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D @@ -87,9 +84,8 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, 
float %t, float %clamp) { ; GFX10-LABEL: sample_d_cl_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff -; GFX10-NEXT: v_and_b32_e32 v2, v7, v2 -; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D @@ -116,11 +112,10 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v8, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff -; GFX10-NEXT: v_and_b32_e32 v3, v0, v3 -; GFX10-NEXT: v_and_b32_e32 v0, v0, v1 -; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v3 -; GFX10-NEXT: v_lshl_or_b32 v3, v8, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v0 +; GFX10-NEXT: v_lshl_or_b32 v3, v8, 16, v1 ; GFX10-NEXT: image_sample_c_d_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -143,9 +138,8 @@ define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { ; GFX10-LABEL: sample_cd_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v6, 0xffff -; GFX10-NEXT: v_and_b32_e32 v2, v6, v2 -; GFX10-NEXT: v_and_b32_e32 v0, v6, v0 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX10-NEXT: image_sample_cd_g16 v[0:3], [v0, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D @@ -170,9 +164,8 @@ define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float 
%s, float %t) { ; GFX10-LABEL: sample_c_cd_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff -; GFX10-NEXT: v_and_b32_e32 v3, v7, v3 -; GFX10-NEXT: v_and_b32_e32 v1, v7, v1 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D @@ -197,9 +190,8 @@ define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) { ; GFX10-LABEL: sample_cd_cl_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff -; GFX10-NEXT: v_and_b32_e32 v2, v7, v2 -; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v0, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D @@ -226,11 +218,10 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v8, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff -; GFX10-NEXT: v_and_b32_e32 v3, v0, v3 -; GFX10-NEXT: v_and_b32_e32 v0, v0, v1 -; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v3 -; GFX10-NEXT: v_lshl_or_b32 v3, v8, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v0 +; GFX10-NEXT: v_lshl_or_b32 v3, v8, 16, v1 ; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -243,14 +234,13 @@ ; GFX10-LABEL: sample_c_d_o_2darray_V1: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v9, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; 
GFX10-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX10-NEXT: v_mov_b32_e32 v10, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: v_and_b32_e32 v1, v0, v4 -; GFX10-NEXT: v_and_b32_e32 v0, v0, v9 -; GFX10-NEXT: v_lshl_or_b32 v5, v5, 16, v1 -; GFX10-NEXT: v_lshl_or_b32 v4, v10, 16, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; GFX10-NEXT: v_lshl_or_b32 v5, v5, 16, v0 +; GFX10-NEXT: v_lshl_or_b32 v4, v10, 16, v1 ; GFX10-NEXT: image_sample_c_d_o_g16 v0, v[2:8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -263,14 +253,13 @@ ; GFX10-LABEL: sample_c_d_o_2darray_V2: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v9, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX10-NEXT: v_mov_b32_e32 v10, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: v_and_b32_e32 v1, v0, v4 -; GFX10-NEXT: v_and_b32_e32 v0, v0, v9 -; GFX10-NEXT: v_lshl_or_b32 v5, v5, 16, v1 -; GFX10-NEXT: v_lshl_or_b32 v4, v10, 16, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; GFX10-NEXT: v_lshl_or_b32 v5, v5, 16, v0 +; GFX10-NEXT: v_lshl_or_b32 v4, v10, 16, v1 ; GFX10-NEXT: image_sample_c_d_o_g16 v[0:1], v[2:8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.format.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.format.d16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.format.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.format.d16.ll @@ -31,10 +31,9 @@ ; GCN-LABEL: {{^}}buffer_store_format_d16_xyz: ; GCN-DAG: s_load_dwordx2 s{{\[}}[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x10 -; UNPACKED-DAG: 
s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}} ; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16 -; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], [[K]] -; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], [[K]] +; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], 0xffff{{$}} +; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], 0xffff{{$}} ; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]] ; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[MASKED1]] @@ -56,11 +55,10 @@ ; GCN-LABEL: {{^}}buffer_store_format_d16_xyzw: ; GCN-DAG: s_load_dwordx2 s{{\[}}[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x10 -; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}} ; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16 -; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], [[K]] +; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], 0xffff{{$}} ; UNPACKED-DAG: s_lshr_b32 [[SHR1:s[0-9]+]], s[[S_DATA_1]], 16 -; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], [[K]] +; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], 0xffff{{$}} ; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]] ; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR1]] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll @@ -35,10 +35,9 @@ ; GCN-LABEL: {{^}}tbuffer_store_d16_xyz: ; GCN-DAG: s_load_dwordx2 s{{\[}}[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, -; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}} ; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16 -; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], [[K]] -; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], [[K]] +; UNPACKED-DAG: s_and_b32 
[[MASKED0:s[0-9]+]], s[[S_DATA_0]], 0xffff{{$}} +; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], 0xffff{{$}} ; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]] ; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[MASKED1]] @@ -60,11 +59,10 @@ ; GCN-LABEL: {{^}}tbuffer_store_d16_xyzw: ; GCN-DAG: s_load_dwordx2 s{{\[}}[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, -; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}} ; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16 -; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], [[K]] +; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], 0xffff{{$}} ; UNPACKED-DAG: s_lshr_b32 [[SHR1:s[0-9]+]], s[[S_DATA_1]], 16 -; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], [[K]] +; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], 0xffff{{$}} ; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]] ; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR1]] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.format.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.format.d16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.format.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.format.d16.ll @@ -31,10 +31,9 @@ ; GCN-LABEL: {{^}}buffer_store_format_d16_xyz: ; GCN-DAG: s_load_dwordx2 s{{\[}}[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x10 -; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}} ; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16 -; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], [[K]] -; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], [[K]] +; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], 0xffff{{$}} +; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], 0xffff{{$}} ; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]] ; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[MASKED1]] @@ 
-56,11 +55,10 @@ ; GCN-LABEL: {{^}}buffer_store_format_d16_xyzw: ; GCN-DAG: s_load_dwordx2 s{{\[}}[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x10 -; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}} ; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16 -; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], [[K]] +; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], 0xffff{{$}} ; UNPACKED-DAG: s_lshr_b32 [[SHR1:s[0-9]+]], s[[S_DATA_1]], 16 -; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], [[K]] +; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], 0xffff{{$}} ; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]] ; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR1]] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll @@ -35,10 +35,9 @@ ; GCN-LABEL: {{^}}tbuffer_store_d16_xyz: ; GCN-DAG: s_load_dwordx2 s{{\[}}[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x10 -; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}} ; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16 -; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], [[K]] -; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], [[K]] +; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], 0xffff{{$}} +; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], 0xffff{{$}} ; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]] ; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[MASKED1]] @@ -59,11 +58,10 @@ ; GCN-LABEL: {{^}}tbuffer_store_d16_xyzw: ; GCN-DAG: s_load_dwordx2 s{{\[}}[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x10 -; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}} ; UNPACKED-DAG: s_lshr_b32 
[[SHR0:s[0-9]+]], s[[S_DATA_0]], 16 -; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], [[K]] +; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], 0xffff{{$}} ; UNPACKED-DAG: s_lshr_b32 [[SHR1:s[0-9]+]], s[[S_DATA_1]], 16 -; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], [[K]] +; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], 0xffff{{$}} ; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]] ; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR1]] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.d16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.d16.ll @@ -31,10 +31,9 @@ ; GCN-LABEL: {{^}}tbuffer_store_d16_xyz: ; GCN-DAG: s_load_dwordx2 s{{\[}}[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x10 -; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}} ; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16 -; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], [[K]] -; UNPACKED-DAG: s_and_b32 [[SHR1:s[0-9]+]], s[[S_DATA_1]], [[K]] +; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], 0xffff{{$}} +; UNPACKED-DAG: s_and_b32 [[SHR1:s[0-9]+]], s[[S_DATA_1]], 0xffff{{$}} ; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]] ; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR1]] @@ -53,11 +52,10 @@ ; GCN-LABEL: {{^}}tbuffer_store_d16_xyzw: ; GCN-DAG: s_load_dwordx2 s{{\[}}[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x10 -; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}} ; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16 -; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], [[K]] +; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], 0xffff{{$}} ; UNPACKED-DAG: s_lshr_b32 [[SHR1:s[0-9]+]], s[[S_DATA_1]], 16 -; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], 
[[K]] +; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], 0xffff{{$}} ; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]] ; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR1]] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.log.f16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.log.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log.f16.ll @@ -33,8 +33,8 @@ ; VI: flat_load_dword v[[A_F16_0:[0-9]+]] ; GFX9: global_load_dword v[[A_F16_0:[0-9]+]] ; SI: s_mov_b32 [[A_F32_2:s[0-9]+]], 0x3f317218 -; VIGFX9: s_movk_i32 [[A_F32_2:s[0-9]+]], 0x398c -; VI: v_mov_b32_e32 [[A_F32_2_V:v[0-9]+]], [[A_F32_2]] +; GFX9: s_movk_i32 [[A_F32_2:s[0-9]+]], 0x398c +; VI: v_mov_b32_e32 [[A_F32_2_V:v[0-9]+]], 0x398c ; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_F16_0]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_F16_1]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_0]] @@ -49,7 +49,8 @@ ; VI: v_log_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_F16_0]] ; VI: v_mul_f16_sdwa v[[R_F16_2:[0-9]+]], v[[R_F16_1]], [[A_F32_2_V]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9: v_mul_f16_e32 v[[R_F32_3:[0-9]+]], [[A_F32_2]], v[[R_F16_2]] -; VIGFX9: v_mul_f16_e32 v[[R_F32_2:[0-9]+]], [[A_F32_2]], v[[R_F16_0]] +; VI: v_mul_f16_e32 v[[R_F32_2:[0-9]+]], 0x398c, v[[R_F16_0]] +; GFX9: v_mul_f16_e32 v[[R_F32_2:[0-9]+]], [[A_F32_2]], v[[R_F16_0]] ; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_0]] ; SI-NOT: v_and_b32_e32 ; SI: v_or_b32_e32 v[[R_F32_5:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log10.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.log10.f16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.log10.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log10.f16.ll @@ -33,8 +33,8 @@ ; VI: flat_load_dword v[[A_F16_0:[0-9]+]] ; GFX9: global_load_dword v[[A_F16_0:[0-9]+]] ; SI: s_mov_b32 [[A_F32_2:s[0-9]+]], 0x3e9a209a -; VIGFX9: s_movk_i32 [[A_F32_2:s[0-9]+]], 0x34d1 -; VI: v_mov_b32_e32 
[[A_F32_2_V:v[0-9]+]], [[A_F32_2]] +; GFX9: s_movk_i32 [[A_F32_2:s[0-9]+]], 0x34d1 +; VI: v_mov_b32_e32 [[A_F32_2_V:v[0-9]+]], 0x34d1 ; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_F16_0]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_F16_1]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_0]] @@ -48,8 +48,9 @@ ; VIGFX9: v_log_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_F16_0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI: v_log_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_F16_0]] ; VI: v_mul_f16_sdwa v[[R_F16_2:[0-9]+]], v[[R_F16_1]], [[A_F32_2_V]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI: v_mul_f16_e32 v[[R_F32_3:[0-9]+]], 0x34d1, v[[R_F16_0]] ; GFX9: v_mul_f16_e32 v[[R_F32_3:[0-9]+]], [[A_F32_2]], v[[R_F16_2]] -; VIGFX9: v_mul_f16_e32 v[[R_F32_2:[0-9]+]], [[A_F32_2]], v[[R_F16_0]] +; GFX9: v_mul_f16_e32 v[[R_F32_2:[0-9]+]], [[A_F32_2]], v[[R_F16_0]] ; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_0]] ; SI-NOT: v_and_b32_e32 ; SI: v_or_b32_e32 v[[R_F32_5:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll @@ -147,21 +147,20 @@ ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_movk_i32 s7, 0xfc01 ; SI-NEXT: s_mov_b32 s3, 0xfffff ; SI-NEXT: s_mov_b32 s2, s6 +; SI-NEXT: v_mov_b32_e32 v6, 0x3ff00000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bfe_u32 s0, s11, 0xb0014 -; SI-NEXT: s_add_i32 s14, s0, s7 -; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s14 -; SI-NEXT: s_brev_b32 s15, 1 +; SI-NEXT: s_add_i32 s7, s0, 0xfffffc01 +; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s7 ; SI-NEXT: s_andn2_b64 s[12:13], s[10:11], s[0:1] -; SI-NEXT: s_and_b32 s0, s11, s15 -; SI-NEXT: s_cmp_lt_i32 s14, 0 +; SI-NEXT: s_and_b32 s0, s11, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s7, 0 ; SI-NEXT: 
v_mov_b32_e32 v0, s13 ; SI-NEXT: v_mov_b32_e32 v1, s0 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_gt_i32 s14, 51 +; SI-NEXT: s_cmp_gt_i32 s7, 51 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; SI-NEXT: v_mov_b32_e32 v1, s11 ; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 @@ -172,23 +171,22 @@ ; SI-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; SI-NEXT: v_add_f64 v[2:3], s[10:11], -v[0:1] ; SI-NEXT: s_bfe_u32 s0, s9, 0xb0014 -; SI-NEXT: s_add_i32 s7, s0, s7 -; SI-NEXT: s_brev_b32 s10, -2 -; SI-NEXT: v_mov_b32_e32 v6, 0x3ff00000 +; SI-NEXT: s_add_i32 s10, s0, 0xfffffc01 +; SI-NEXT: s_brev_b32 s7, -2 ; SI-NEXT: v_mov_b32_e32 v4, s11 ; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5 -; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s7 -; SI-NEXT: v_bfi_b32 v4, s10, v6, v4 +; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s10 +; SI-NEXT: v_bfi_b32 v4, s7, v6, v4 ; SI-NEXT: s_andn2_b64 s[2:3], s[8:9], s[0:1] -; SI-NEXT: s_and_b32 s0, s9, s15 +; SI-NEXT: s_and_b32 s0, s9, 0x80000000 ; SI-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc ; SI-NEXT: v_mov_b32_e32 v2, 0 -; SI-NEXT: s_cmp_lt_i32 s7, 0 +; SI-NEXT: s_cmp_lt_i32 s10, 0 ; SI-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] ; SI-NEXT: v_mov_b32_e32 v0, s3 ; SI-NEXT: v_mov_b32_e32 v1, s0 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_gt_i32 s7, 51 +; SI-NEXT: s_cmp_gt_i32 s10, 51 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; SI-NEXT: v_mov_b32_e32 v1, s9 ; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 @@ -200,7 +198,7 @@ ; SI-NEXT: v_add_f64 v[4:5], s[8:9], -v[0:1] ; SI-NEXT: v_mov_b32_e32 v7, s9 ; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5 -; SI-NEXT: v_bfi_b32 v6, s10, v6, v7 +; SI-NEXT: v_bfi_b32 v6, s7, v6, v7 ; SI-NEXT: v_cndmask_b32_e32 v5, 0, v6, vcc ; SI-NEXT: v_mov_b32_e32 v4, 0 ; SI-NEXT: v_add_f64 v[0:1], v[0:1], v[4:5] @@ -246,21 +244,19 @@ ; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x11 ; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s14, -1 -; SI-NEXT: s_movk_i32 s18, 0xfc01 ; SI-NEXT: s_mov_b32 s3, 0xfffff +; SI-NEXT: 
s_mov_b32 s2, s14 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bfe_u32 s0, s7, 0xb0014 -; SI-NEXT: s_add_i32 s19, s0, s18 -; SI-NEXT: s_mov_b32 s2, s14 -; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s19 -; SI-NEXT: s_brev_b32 s20, 1 +; SI-NEXT: s_add_i32 s18, s0, 0xfffffc01 +; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s18 ; SI-NEXT: s_andn2_b64 s[16:17], s[6:7], s[0:1] -; SI-NEXT: s_and_b32 s0, s7, s20 -; SI-NEXT: s_cmp_lt_i32 s19, 0 +; SI-NEXT: s_and_b32 s0, s7, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s18, 0 ; SI-NEXT: v_mov_b32_e32 v0, s17 ; SI-NEXT: v_mov_b32_e32 v1, s0 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_gt_i32 s19, 51 +; SI-NEXT: s_cmp_gt_i32 s18, 51 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; SI-NEXT: v_mov_b32_e32 v1, s7 ; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 @@ -271,7 +267,7 @@ ; SI-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; SI-NEXT: v_add_f64 v[2:3], s[6:7], -v[0:1] ; SI-NEXT: s_bfe_u32 s0, s5, 0xb0014 -; SI-NEXT: s_add_i32 s17, s0, s18 +; SI-NEXT: s_add_i32 s17, s0, 0xfffffc01 ; SI-NEXT: s_brev_b32 s16, -2 ; SI-NEXT: v_mov_b32_e32 v12, 0x3ff00000 ; SI-NEXT: v_mov_b32_e32 v4, s7 @@ -279,7 +275,7 @@ ; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s17 ; SI-NEXT: v_bfi_b32 v4, s16, v12, v4 ; SI-NEXT: s_andn2_b64 s[6:7], s[4:5], s[0:1] -; SI-NEXT: s_and_b32 s0, s5, s20 +; SI-NEXT: s_and_b32 s0, s5, 0x80000000 ; SI-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc ; SI-NEXT: v_mov_b32_e32 v2, 0 ; SI-NEXT: s_cmp_lt_i32 s17, 0 @@ -298,12 +294,12 @@ ; SI-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] ; SI-NEXT: s_bfe_u32 s0, s11, 0xb0014 ; SI-NEXT: v_add_f64 v[4:5], s[4:5], -v[0:1] -; SI-NEXT: s_add_i32 s6, s0, s18 +; SI-NEXT: s_add_i32 s6, s0, 0xfffffc01 ; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s6 ; SI-NEXT: v_mov_b32_e32 v6, s5 ; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5 ; SI-NEXT: s_andn2_b64 s[4:5], s[10:11], s[0:1] -; SI-NEXT: s_and_b32 s0, s11, s20 +; SI-NEXT: s_and_b32 s0, s11, 0x80000000 ; SI-NEXT: v_bfi_b32 v6, s16, v12, v6 ; SI-NEXT: s_cmp_lt_i32 s6, 0 ; SI-NEXT: 
v_cndmask_b32_e32 v9, 0, v6, vcc @@ -321,13 +317,13 @@ ; SI-NEXT: v_cndmask_b32_e64 v4, v4, v6, s[0:1] ; SI-NEXT: v_add_f64 v[6:7], s[10:11], -v[4:5] ; SI-NEXT: s_bfe_u32 s0, s9, 0xb0014 -; SI-NEXT: s_add_i32 s4, s0, s18 +; SI-NEXT: s_add_i32 s4, s0, 0xfffffc01 ; SI-NEXT: v_mov_b32_e32 v10, s11 ; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5 ; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s4 ; SI-NEXT: v_bfi_b32 v10, s16, v12, v10 ; SI-NEXT: s_andn2_b64 s[2:3], s[8:9], s[0:1] -; SI-NEXT: s_and_b32 s0, s9, s20 +; SI-NEXT: s_and_b32 s0, s9, 0x80000000 ; SI-NEXT: v_cndmask_b32_e32 v7, 0, v10, vcc ; SI-NEXT: v_mov_b32_e32 v6, 0 ; SI-NEXT: s_cmp_lt_i32 s4, 0 @@ -412,21 +408,20 @@ ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x19 ; SI-NEXT: s_mov_b32 s22, -1 -; SI-NEXT: s_movk_i32 s28, 0xfc01 ; SI-NEXT: s_mov_b32 s21, 0xfffff ; SI-NEXT: s_mov_b32 s20, s22 +; SI-NEXT: v_mov_b32_e32 v8, 0x3ff00000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bfe_u32 s2, s7, 0xb0014 -; SI-NEXT: s_add_i32 s23, s2, s28 -; SI-NEXT: s_lshr_b64 s[2:3], s[20:21], s23 -; SI-NEXT: s_brev_b32 s29, 1 +; SI-NEXT: s_add_i32 s26, s2, 0xfffffc01 +; SI-NEXT: s_lshr_b64 s[2:3], s[20:21], s26 +; SI-NEXT: s_and_b32 s23, s7, 0x80000000 ; SI-NEXT: s_andn2_b64 s[24:25], s[6:7], s[2:3] -; SI-NEXT: s_and_b32 s2, s7, s29 -; SI-NEXT: s_cmp_lt_i32 s23, 0 +; SI-NEXT: s_cmp_lt_i32 s26, 0 ; SI-NEXT: v_mov_b32_e32 v0, s25 -; SI-NEXT: v_mov_b32_e32 v1, s2 +; SI-NEXT: v_mov_b32_e32 v1, s23 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_gt_i32 s23, 51 +; SI-NEXT: s_cmp_gt_i32 s26, 51 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; SI-NEXT: v_mov_b32_e32 v1, s7 ; SI-NEXT: s_cselect_b64 s[2:3], -1, 0 @@ -437,15 +432,14 @@ ; SI-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[2:3] ; SI-NEXT: v_add_f64 v[2:3], s[6:7], -v[0:1] ; SI-NEXT: s_bfe_u32 s2, s5, 0xb0014 -; SI-NEXT: s_add_i32 s24, s2, s28 +; SI-NEXT: s_add_i32 s24, s2, 0xfffffc01 ; SI-NEXT: s_brev_b32 s23, -2 -; SI-NEXT: v_mov_b32_e32 v8, 0x3ff00000 ; SI-NEXT: 
v_mov_b32_e32 v4, s7 ; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5 ; SI-NEXT: s_lshr_b64 s[2:3], s[20:21], s24 ; SI-NEXT: v_bfi_b32 v4, s23, v8, v4 ; SI-NEXT: s_andn2_b64 s[6:7], s[4:5], s[2:3] -; SI-NEXT: s_and_b32 s2, s5, s29 +; SI-NEXT: s_and_b32 s2, s5, 0x80000000 ; SI-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc ; SI-NEXT: v_mov_b32_e32 v2, 0 ; SI-NEXT: s_cmp_lt_i32 s24, 0 @@ -464,13 +458,13 @@ ; SI-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[2:3] ; SI-NEXT: v_add_f64 v[4:5], s[4:5], -v[0:1] ; SI-NEXT: s_bfe_u32 s2, s11, 0xb0014 -; SI-NEXT: s_add_i32 s6, s2, s28 +; SI-NEXT: s_add_i32 s6, s2, 0xfffffc01 ; SI-NEXT: v_mov_b32_e32 v6, s5 ; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5 ; SI-NEXT: s_lshr_b64 s[2:3], s[20:21], s6 ; SI-NEXT: v_bfi_b32 v6, s23, v8, v6 ; SI-NEXT: s_andn2_b64 s[4:5], s[10:11], s[2:3] -; SI-NEXT: s_and_b32 s2, s11, s29 +; SI-NEXT: s_and_b32 s2, s11, 0x80000000 ; SI-NEXT: v_cndmask_b32_e32 v5, 0, v6, vcc ; SI-NEXT: v_mov_b32_e32 v4, 0 ; SI-NEXT: s_cmp_lt_i32 s6, 0 @@ -489,13 +483,13 @@ ; SI-NEXT: v_cndmask_b32_e64 v4, v4, v6, s[2:3] ; SI-NEXT: v_add_f64 v[6:7], s[10:11], -v[4:5] ; SI-NEXT: s_bfe_u32 s2, s9, 0xb0014 -; SI-NEXT: s_add_i32 s6, s2, s28 +; SI-NEXT: s_add_i32 s6, s2, 0xfffffc01 ; SI-NEXT: v_mov_b32_e32 v9, s11 ; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5 ; SI-NEXT: s_lshr_b64 s[2:3], s[20:21], s6 ; SI-NEXT: v_bfi_b32 v9, s23, v8, v9 ; SI-NEXT: s_andn2_b64 s[4:5], s[8:9], s[2:3] -; SI-NEXT: s_and_b32 s2, s9, s29 +; SI-NEXT: s_and_b32 s2, s9, 0x80000000 ; SI-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc ; SI-NEXT: v_mov_b32_e32 v6, 0 ; SI-NEXT: s_cmp_lt_i32 s6, 0 @@ -514,12 +508,12 @@ ; SI-NEXT: v_cndmask_b32_e64 v4, v4, v9, s[2:3] ; SI-NEXT: s_bfe_u32 s2, s15, 0xb0014 ; SI-NEXT: v_add_f64 v[9:10], s[8:9], -v[4:5] -; SI-NEXT: s_add_i32 s4, s2, s28 +; SI-NEXT: s_add_i32 s4, s2, 0xfffffc01 ; SI-NEXT: s_lshr_b64 s[2:3], s[20:21], s4 ; SI-NEXT: v_mov_b32_e32 v11, s9 ; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[9:10]|, 0.5 ; SI-NEXT: s_andn2_b64 s[24:25], 
s[14:15], s[2:3] -; SI-NEXT: s_and_b32 s2, s15, s29 +; SI-NEXT: s_and_b32 s2, s15, 0x80000000 ; SI-NEXT: v_bfi_b32 v11, s23, v8, v11 ; SI-NEXT: s_cmp_lt_i32 s4, 0 ; SI-NEXT: v_cndmask_b32_e32 v10, 0, v11, vcc @@ -530,10 +524,10 @@ ; SI-NEXT: v_mov_b32_e32 v10, s2 ; SI-NEXT: s_cselect_b64 s[2:3], -1, 0 ; SI-NEXT: s_bfe_u32 s4, s13, 0xb0014 -; SI-NEXT: s_add_i32 s6, s4, s28 +; SI-NEXT: s_add_i32 s6, s4, 0xfffffc01 ; SI-NEXT: s_lshr_b64 s[4:5], s[20:21], s6 ; SI-NEXT: s_andn2_b64 s[26:27], s[12:13], s[4:5] -; SI-NEXT: s_and_b32 s4, s13, s29 +; SI-NEXT: s_and_b32 s4, s13, 0x80000000 ; SI-NEXT: v_mov_b32_e32 v9, s25 ; SI-NEXT: s_cmp_lt_i32 s6, 0 ; SI-NEXT: v_cndmask_b32_e32 v15, v9, v10, vcc @@ -542,30 +536,30 @@ ; SI-NEXT: s_cmp_gt_i32 s6, 51 ; SI-NEXT: s_cselect_b64 s[6:7], -1, 0 ; SI-NEXT: s_bfe_u32 s8, s19, 0xb0014 -; SI-NEXT: s_add_i32 s25, s8, s28 -; SI-NEXT: s_lshr_b64 s[8:9], s[20:21], s25 -; SI-NEXT: s_andn2_b64 s[10:11], s[18:19], s[8:9] -; SI-NEXT: s_and_b32 s8, s19, s29 +; SI-NEXT: s_add_i32 s10, s8, 0xfffffc01 +; SI-NEXT: s_lshr_b64 s[8:9], s[20:21], s10 +; SI-NEXT: s_andn2_b64 s[28:29], s[18:19], s[8:9] +; SI-NEXT: s_and_b32 s8, s19, 0x80000000 ; SI-NEXT: v_mov_b32_e32 v9, s27 -; SI-NEXT: s_cmp_lt_i32 s25, 0 +; SI-NEXT: s_cmp_lt_i32 s10, 0 ; SI-NEXT: v_cndmask_b32_e64 v17, v9, v10, s[4:5] -; SI-NEXT: v_mov_b32_e32 v9, s11 +; SI-NEXT: v_mov_b32_e32 v9, s29 ; SI-NEXT: v_mov_b32_e32 v10, s8 ; SI-NEXT: s_cselect_b64 s[8:9], -1, 0 -; SI-NEXT: s_cmp_gt_i32 s25, 51 +; SI-NEXT: s_cmp_gt_i32 s10, 51 ; SI-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[8:9] ; SI-NEXT: v_mov_b32_e32 v10, s19 -; SI-NEXT: v_mov_b32_e32 v11, s10 ; SI-NEXT: s_cselect_b64 s[10:11], -1, 0 ; SI-NEXT: v_cndmask_b32_e64 v10, v9, v10, s[10:11] -; SI-NEXT: v_cndmask_b32_e64 v9, v11, 0, s[8:9] +; SI-NEXT: v_mov_b32_e32 v9, s28 +; SI-NEXT: v_cndmask_b32_e64 v9, v9, 0, s[8:9] ; SI-NEXT: v_mov_b32_e32 v11, s18 ; SI-NEXT: s_bfe_u32 s8, s17, 0xb0014 ; SI-NEXT: v_cndmask_b32_e64 v9, v9, v11, s[10:11] -; 
SI-NEXT: s_add_i32 s10, s8, s28 +; SI-NEXT: s_add_i32 s10, s8, 0xfffffc01 ; SI-NEXT: s_lshr_b64 s[8:9], s[20:21], s10 ; SI-NEXT: s_andn2_b64 s[20:21], s[16:17], s[8:9] -; SI-NEXT: s_and_b32 s8, s17, s29 +; SI-NEXT: s_and_b32 s8, s17, 0x80000000 ; SI-NEXT: s_cmp_lt_i32 s10, 0 ; SI-NEXT: v_mov_b32_e32 v11, s21 ; SI-NEXT: v_mov_b32_e32 v12, s8 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll --- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll @@ -1033,31 +1033,29 @@ ; GCN-NOHSA-SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, 0xffff ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s7, s4, 16 -; GCN-NOHSA-SI-NEXT: s_and_b32 s5, s5, s6 -; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, s6 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s6, s4, 16 +; GCN-NOHSA-SI-NEXT: s_and_b32 s5, s5, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, 0xffff ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s5 ; GCN-NOHSA-SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s7 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s6 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: constant_zextload_v3i16_to_v3i32: ; GCN-HSA: ; %bb.0: ; %entry ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-HSA-NEXT: s_mov_b32 s6, 0xffff ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s1 ; GCN-HSA-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_lshr_b32 s2, s0, 16 -; GCN-HSA-NEXT: s_and_b32 s1, s1, s6 -; GCN-HSA-NEXT: s_and_b32 s0, s0, s6 +; GCN-HSA-NEXT: s_and_b32 s1, s1, 0xffff +; GCN-HSA-NEXT: s_and_b32 s0, 
s0, 0xffff ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s1 @@ -1067,7 +1065,6 @@ ; GCN-NOHSA-VI-LABEL: constant_zextload_v3i16_to_v3i32: ; GCN-NOHSA-VI: ; %bb.0: ; %entry ; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, 0xffff ; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1075,9 +1072,9 @@ ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, s8 +; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, 0xffff ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s4, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, s8 +; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xffff ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s6 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5 @@ -1210,12 +1207,11 @@ ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, 0xffff ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s6, s5, 16 ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s7, s4, 16 -; GCN-NOHSA-SI-NEXT: s_and_b32 s5, s5, s2 -; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, s2 +; GCN-NOHSA-SI-NEXT: s_and_b32 s5, s5, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, 0xffff ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s7 @@ -1227,7 +1223,6 @@ ; GCN-HSA-LABEL: constant_zextload_v4i16_to_v4i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-HSA-NEXT: s_mov_b32 s6, 0xffff ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 @@ -1235,8 +1230,8 @@ ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_lshr_b32 s2, s1, 16 ; 
GCN-HSA-NEXT: s_lshr_b32 s3, s0, 16 -; GCN-HSA-NEXT: s_and_b32 s1, s1, s6 -; GCN-HSA-NEXT: s_and_b32 s0, s0, s6 +; GCN-HSA-NEXT: s_and_b32 s1, s1, 0xffff +; GCN-HSA-NEXT: s_and_b32 s0, s0, 0xffff ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s1 @@ -1247,7 +1242,6 @@ ; GCN-NOHSA-VI-LABEL: constant_zextload_v4i16_to_v4i32: ; GCN-NOHSA-VI: ; %bb.0: ; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, 0xffff ; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1256,9 +1250,9 @@ ; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s5, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, s8 +; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, 0xffff ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s7, s4, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, s8 +; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xffff ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s7 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5 @@ -1406,44 +1400,42 @@ ; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, 0xffff ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s9, s5, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s10, s4, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s11, s7, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s12, s6, 16 -; GCN-NOHSA-SI-NEXT: s_and_b32 s5, s5, s8 -; GCN-NOHSA-SI-NEXT: s_and_b32 s7, s7, s8 -; GCN-NOHSA-SI-NEXT: s_and_b32 s6, s6, s8 -; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, s8 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s8, s5, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s9, s4, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s10, s7, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s11, s6, 16 +; GCN-NOHSA-SI-NEXT: s_and_b32 s5, s5, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 
s7, s7, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s6, s6, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, 0xffff ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s12 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s11 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s7 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s11 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s10 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s10 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s9 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s5 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s9 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s8 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: constant_zextload_v8i16_to_v8i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-HSA-NEXT: s_mov_b32 s8, 0xffff ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_lshr_b32 s9, s5, 16 -; GCN-HSA-NEXT: s_lshr_b32 s10, s4, 16 +; GCN-HSA-NEXT: s_lshr_b32 s8, s5, 16 +; GCN-HSA-NEXT: s_lshr_b32 s9, s4, 16 ; GCN-HSA-NEXT: s_lshr_b32 s2, s7, 16 ; GCN-HSA-NEXT: s_lshr_b32 s3, s6, 16 -; GCN-HSA-NEXT: s_and_b32 s5, s5, s8 -; GCN-HSA-NEXT: s_and_b32 s4, s4, s8 -; GCN-HSA-NEXT: s_and_b32 s7, s7, s8 -; GCN-HSA-NEXT: s_and_b32 s6, s6, s8 +; GCN-HSA-NEXT: s_and_b32 s5, s5, 0xffff +; GCN-HSA-NEXT: s_and_b32 s4, s4, 0xffff +; GCN-HSA-NEXT: s_and_b32 s7, s7, 0xffff +; GCN-HSA-NEXT: s_and_b32 s6, s6, 0xffff ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1455,9 +1447,9 @@ ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s10 +; GCN-HSA-NEXT: 
v_mov_b32_e32 v1, s9 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s9 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s8 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_endpgm @@ -1465,7 +1457,6 @@ ; GCN-NOHSA-VI-LABEL: constant_zextload_v8i16_to_v8i32: ; GCN-NOHSA-VI: ; %bb.0: ; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, 0xffff ; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1473,24 +1464,24 @@ ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s11, s7, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s7, s8 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s12, s6, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, s8 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s9, s5, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, s8 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s4, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, s8 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s7, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s7, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s11, s6, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s5, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s9, s4, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xffff ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s12 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s11 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s11 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s10 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-VI-NEXT: s_nop 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s10 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s9 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5 -; 
GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s9 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s8 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; @@ -1674,72 +1665,70 @@ ; GCN-NOHSA-SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s12, 0xffff ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s13, s5, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s4, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s15, s7, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s16, s6, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s17, s9, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s8, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s19, s11, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s10, 16 -; GCN-NOHSA-SI-NEXT: s_and_b32 s5, s5, s12 -; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, s12 -; GCN-NOHSA-SI-NEXT: s_and_b32 s7, s7, s12 -; GCN-NOHSA-SI-NEXT: s_and_b32 s6, s6, s12 -; GCN-NOHSA-SI-NEXT: s_and_b32 s9, s9, s12 -; GCN-NOHSA-SI-NEXT: s_and_b32 s11, s11, s12 -; GCN-NOHSA-SI-NEXT: s_and_b32 s10, s10, s12 -; GCN-NOHSA-SI-NEXT: s_and_b32 s8, s8, s12 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s12, s5, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s13, s4, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s7, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s15, s6, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s16, s9, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s17, s8, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s11, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s19, s10, 16 +; GCN-NOHSA-SI-NEXT: s_and_b32 s5, s5, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s7, s7, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s6, s6, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s9, s9, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s11, s11, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s10, s10, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s8, s8, 0xffff ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s10 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s20 +; GCN-NOHSA-SI-NEXT: 
v_mov_b32_e32 v1, s19 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s11 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s19 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s18 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s18 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s17 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s17 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s16 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s15 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s7 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s15 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s14 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s14 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s13 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s5 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s13 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s12 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: constant_zextload_v16i16_to_v16i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-HSA-NEXT: s_mov_b32 s12, 0xffff ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_lshr_b32 s13, s5, 16 -; GCN-HSA-NEXT: s_lshr_b32 s14, s4, 16 -; GCN-HSA-NEXT: s_lshr_b32 s15, s7, 16 -; GCN-HSA-NEXT: s_lshr_b32 s16, s6, 16 -; GCN-HSA-NEXT: s_lshr_b32 s17, s9, 16 -; GCN-HSA-NEXT: s_lshr_b32 s18, s8, 16 +; GCN-HSA-NEXT: s_lshr_b32 s12, s5, 16 +; GCN-HSA-NEXT: s_lshr_b32 s13, s4, 16 +; 
GCN-HSA-NEXT: s_lshr_b32 s14, s7, 16 +; GCN-HSA-NEXT: s_lshr_b32 s15, s6, 16 +; GCN-HSA-NEXT: s_lshr_b32 s16, s9, 16 +; GCN-HSA-NEXT: s_lshr_b32 s17, s8, 16 ; GCN-HSA-NEXT: s_lshr_b32 s2, s11, 16 ; GCN-HSA-NEXT: s_lshr_b32 s3, s10, 16 -; GCN-HSA-NEXT: s_and_b32 s5, s5, s12 -; GCN-HSA-NEXT: s_and_b32 s4, s4, s12 -; GCN-HSA-NEXT: s_and_b32 s7, s7, s12 -; GCN-HSA-NEXT: s_and_b32 s6, s6, s12 -; GCN-HSA-NEXT: s_and_b32 s9, s9, s12 -; GCN-HSA-NEXT: s_and_b32 s8, s8, s12 -; GCN-HSA-NEXT: s_and_b32 s11, s11, s12 -; GCN-HSA-NEXT: s_and_b32 s10, s10, s12 +; GCN-HSA-NEXT: s_and_b32 s5, s5, 0xffff +; GCN-HSA-NEXT: s_and_b32 s4, s4, 0xffff +; GCN-HSA-NEXT: s_and_b32 s7, s7, 0xffff +; GCN-HSA-NEXT: s_and_b32 s6, s6, 0xffff +; GCN-HSA-NEXT: s_and_b32 s9, s9, 0xffff +; GCN-HSA-NEXT: s_and_b32 s8, s8, 0xffff +; GCN-HSA-NEXT: s_and_b32 s11, s11, 0xffff +; GCN-HSA-NEXT: s_and_b32 s10, s10, 0xffff ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 @@ -1755,23 +1744,23 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s18 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s17 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s9 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s17 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s16 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s15 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s15 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s14 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s14 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s13 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s13 
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s12 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_endpgm @@ -1779,7 +1768,6 @@ ; GCN-NOHSA-VI-LABEL: constant_zextload_v16i16_to_v16i32: ; GCN-NOHSA-VI: ; %bb.0: ; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s12, 0xffff ; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1787,43 +1775,43 @@ ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-VI-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s19, s11, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s11, s11, s12 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s20, s10, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s10, s10, s12 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s17, s9, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s9, s9, s12 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s18, s8, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s8, s8, s12 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s18, s11, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s11, s11, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s19, s10, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s10, s10, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s16, s9, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s9, s9, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s17, s8, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s8, s8, 0xffff ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s20 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s19 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s11 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s19 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s15, s7, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s7, s12 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s16, s6, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, s12 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s18 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s7, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s7, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s15, s6, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, 0xffff 
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s13, s5, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s12, s5, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s18 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s17 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s9 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s17 -; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, s12 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s4, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, s12 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s13, s4, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xffff ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GCN-NOHSA-VI-NEXT: s_nop 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s15 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s15 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s14 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-VI-NEXT: s_nop 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s14 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s13 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s13 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s12 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; @@ -2111,132 +2099,130 @@ ; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s18, 0xffff ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s19, s1, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s0, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s21, s3, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s2, 16 -; GCN-NOHSA-SI-NEXT: 
s_lshr_b32 s23, s5, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s4, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s25, s7, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s6, 16 -; GCN-NOHSA-SI-NEXT: s_and_b32 s27, s1, s18 -; GCN-NOHSA-SI-NEXT: s_and_b32 s28, s0, s18 -; GCN-NOHSA-SI-NEXT: s_and_b32 s29, s3, s18 -; GCN-NOHSA-SI-NEXT: s_and_b32 s30, s2, s18 -; GCN-NOHSA-SI-NEXT: s_and_b32 s5, s5, s18 -; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, s18 -; GCN-NOHSA-SI-NEXT: s_and_b32 s7, s7, s18 -; GCN-NOHSA-SI-NEXT: s_and_b32 s6, s6, s18 -; GCN-NOHSA-SI-NEXT: s_and_b32 s31, s9, s18 -; GCN-NOHSA-SI-NEXT: s_and_b32 s33, s8, s18 -; GCN-NOHSA-SI-NEXT: s_and_b32 s34, s11, s18 -; GCN-NOHSA-SI-NEXT: s_and_b32 s35, s10, s18 -; GCN-NOHSA-SI-NEXT: s_and_b32 s36, s13, s18 -; GCN-NOHSA-SI-NEXT: s_and_b32 s37, s12, s18 -; GCN-NOHSA-SI-NEXT: s_and_b32 s38, s15, s18 -; GCN-NOHSA-SI-NEXT: s_and_b32 s18, s14, s18 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s9, s9, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s8, s8, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s11, s11, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s10, s10, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s13, s13, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s12, s12, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s15, s15, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s14, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s1, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s19, s0, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s3, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s21, s2, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s5, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s23, s4, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s7, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s25, s6, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s9, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s27, s8, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s11, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s29, s10, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s13, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s31, s12, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s33, s15, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s34, s14, 16 +; GCN-NOHSA-SI-NEXT: 
s_and_b32 s35, s1, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s36, s0, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s37, s3, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s38, s2, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s5, s5, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s7, s7, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s6, s6, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s9, s9, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s8, s8, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s11, s11, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s10, s10, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s13, s13, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s12, s12, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s15, s15, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s14, s14, 0xffff ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s16 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s17 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s18 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s14 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s38 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s15 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s14 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s34 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s15 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s33 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s37 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s12 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s36 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s13 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s12 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s31 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s13 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s30 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s35 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s10 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s34 -; GCN-NOHSA-SI-NEXT: 
v_mov_b32_e32 v3, s11 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s10 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s29 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s11 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s28 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s33 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s31 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s9 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s27 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s9 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s26 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s26 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s25 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s7 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s25 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s24 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s23 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s5 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s23 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s22 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s30 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s22 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s29 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s21 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s38 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s21 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s37 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s20 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) 
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s28 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s20 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s27 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s19 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s36 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s19 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s35 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s18 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: constant_zextload_v32i16_to_v32i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-HSA-NEXT: s_mov_b32 s20, 0xffff ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_and_b32 s21, s5, s20 -; GCN-HSA-NEXT: s_and_b32 s22, s4, s20 -; GCN-HSA-NEXT: s_and_b32 s23, s7, s20 -; GCN-HSA-NEXT: s_and_b32 s24, s6, s20 -; GCN-HSA-NEXT: s_and_b32 s25, s9, s20 -; GCN-HSA-NEXT: s_and_b32 s26, s8, s20 -; GCN-HSA-NEXT: s_and_b32 s27, s11, s20 -; GCN-HSA-NEXT: s_and_b32 s28, s10, s20 -; GCN-HSA-NEXT: s_and_b32 s29, s13, s20 -; GCN-HSA-NEXT: s_and_b32 s30, s12, s20 -; GCN-HSA-NEXT: s_and_b32 s31, s15, s20 -; GCN-HSA-NEXT: s_and_b32 s33, s14, s20 -; GCN-HSA-NEXT: s_and_b32 s34, s17, s20 -; GCN-HSA-NEXT: s_and_b32 s35, s16, s20 -; GCN-HSA-NEXT: s_and_b32 s36, s19, s20 -; GCN-HSA-NEXT: s_and_b32 s20, s18, s20 -; GCN-HSA-NEXT: s_lshr_b32 s5, s5, 16 -; GCN-HSA-NEXT: s_lshr_b32 s4, s4, 16 -; GCN-HSA-NEXT: s_lshr_b32 s7, s7, 16 -; GCN-HSA-NEXT: s_lshr_b32 s6, s6, 16 -; GCN-HSA-NEXT: s_lshr_b32 s9, s9, 16 -; GCN-HSA-NEXT: s_lshr_b32 s8, s8, 16 -; GCN-HSA-NEXT: s_lshr_b32 s11, s11, 16 -; GCN-HSA-NEXT: s_lshr_b32 s10, s10, 16 -; GCN-HSA-NEXT: s_lshr_b32 s13, s13, 16 -; GCN-HSA-NEXT: s_lshr_b32 s12, s12, 16 -; GCN-HSA-NEXT: s_lshr_b32 s15, s15, 16 -; GCN-HSA-NEXT: s_lshr_b32 s14, s14, 16 -; GCN-HSA-NEXT: s_lshr_b32 s17, s17, 16 -; GCN-HSA-NEXT: s_lshr_b32 s16, s16, 16 -; GCN-HSA-NEXT: s_lshr_b32 s19, s19, 16 -; 
GCN-HSA-NEXT: s_lshr_b32 s18, s18, 16 +; GCN-HSA-NEXT: s_lshr_b32 s20, s5, 16 +; GCN-HSA-NEXT: s_lshr_b32 s21, s4, 16 +; GCN-HSA-NEXT: s_lshr_b32 s22, s7, 16 +; GCN-HSA-NEXT: s_lshr_b32 s23, s6, 16 +; GCN-HSA-NEXT: s_lshr_b32 s24, s9, 16 +; GCN-HSA-NEXT: s_lshr_b32 s25, s8, 16 +; GCN-HSA-NEXT: s_lshr_b32 s26, s11, 16 +; GCN-HSA-NEXT: s_lshr_b32 s27, s10, 16 +; GCN-HSA-NEXT: s_lshr_b32 s28, s13, 16 +; GCN-HSA-NEXT: s_lshr_b32 s29, s12, 16 +; GCN-HSA-NEXT: s_lshr_b32 s30, s15, 16 +; GCN-HSA-NEXT: s_lshr_b32 s31, s14, 16 +; GCN-HSA-NEXT: s_lshr_b32 s33, s17, 16 +; GCN-HSA-NEXT: s_lshr_b32 s34, s16, 16 +; GCN-HSA-NEXT: s_lshr_b32 s35, s19, 16 +; GCN-HSA-NEXT: s_lshr_b32 s36, s18, 16 +; GCN-HSA-NEXT: s_and_b32 s5, s5, 0xffff +; GCN-HSA-NEXT: s_and_b32 s4, s4, 0xffff +; GCN-HSA-NEXT: s_and_b32 s7, s7, 0xffff +; GCN-HSA-NEXT: s_and_b32 s6, s6, 0xffff +; GCN-HSA-NEXT: s_and_b32 s9, s9, 0xffff +; GCN-HSA-NEXT: s_and_b32 s8, s8, 0xffff +; GCN-HSA-NEXT: s_and_b32 s11, s11, 0xffff +; GCN-HSA-NEXT: s_and_b32 s10, s10, 0xffff +; GCN-HSA-NEXT: s_and_b32 s13, s13, 0xffff +; GCN-HSA-NEXT: s_and_b32 s12, s12, 0xffff +; GCN-HSA-NEXT: s_and_b32 s15, s15, 0xffff +; GCN-HSA-NEXT: s_and_b32 s14, s14, 0xffff +; GCN-HSA-NEXT: s_and_b32 s17, s17, 0xffff +; GCN-HSA-NEXT: s_and_b32 s16, s16, 0xffff +; GCN-HSA-NEXT: s_and_b32 s19, s19, 0xffff +; GCN-HSA-NEXT: s_and_b32 s18, s18, 0xffff ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 @@ -2246,154 +2232,153 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s20 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s18 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s36 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s19 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s35 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s16 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s18 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s36 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s19 +; 
GCN-HSA-NEXT: v_mov_b32_e32 v3, s35 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s34 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s34 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s17 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s17 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s33 ; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7] -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s33 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s14 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s31 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s15 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s31 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s15 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s30 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s30 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s12 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s29 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s12 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s29 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s28 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s28 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s10 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s27 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s11 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s27 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s11 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s26 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: 
v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s26 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s8 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s25 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s9 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s25 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s9 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s24 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s24 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s6 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s23 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s23 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s22 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s22 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s21 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s21 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s20 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: constant_zextload_v32i16_to_v32i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s20, 0xffff -; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s19, 0xf000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s18, -1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[4:19], s[6:7], 0x0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s16, s0 +; GCN-NOHSA-VI-NEXT: 
s_mov_b32 s17, s1 +; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x0 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s36, s19, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s19, s19, s20 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s37, s18, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s18, s18, s20 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s34, s17, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s17, s17, s20 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s35, s16, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s16, s16, s20 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s18 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s37 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s19 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s36 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s31, s15, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s15, s15, s20 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s33, s14, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s14, s14, s20 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s29, s13, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s35 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s17 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s34 -; GCN-NOHSA-VI-NEXT: s_and_b32 s13, s13, s20 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s30, s12, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s12, s12, s20 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s27, s11, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s35, s15, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s15, s15, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s36, s14, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s14, s14, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s33, s13, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s13, s13, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s34, s12, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s12, s12, 0xffff ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s14 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s33 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s36 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s15 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, 
s31 -; GCN-NOHSA-VI-NEXT: s_and_b32 s11, s11, s20 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s28, s10, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s10, s10, s20 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s25, s9, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s35 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s30, s11, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s11, s11, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s31, s10, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s10, s10, 0xffff +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:112 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s28, s9, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s12 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s30 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s34 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s13 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s29 -; GCN-NOHSA-VI-NEXT: s_and_b32 s9, s9, s20 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s26, s8, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s8, s8, s20 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s23, s7, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s33 +; GCN-NOHSA-VI-NEXT: s_and_b32 s9, s9, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s29, s8, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s8, s8, 0xffff +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:96 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s26, s7, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s28 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s31 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s11 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s27 -; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s7, s20 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s24, s6, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, s20 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s21, s5, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s30 +; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s7, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s27, 
s6, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, 0xffff +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:80 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s24, s5, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s26 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s29 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s9 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s25 -; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, s20 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s22, s4, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, s20 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 -; GCN-NOHSA-VI-NEXT: s_nop 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s28 +; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s25, s4, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xffff +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:64 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s22, s3, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s24 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s27 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s23 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 -; GCN-NOHSA-VI-NEXT: s_nop 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s26 +; GCN-NOHSA-VI-NEXT: s_and_b32 s3, s3, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s23, s2, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s2, s2, 0xffff +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:48 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s20, s1, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s22 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s25 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s21 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s24 +; GCN-NOHSA-VI-NEXT: s_and_b32 s1, s1, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s21, s0, 16 +; 
GCN-NOHSA-VI-NEXT: s_and_b32 s0, s0, 0xffff +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:32 +; GCN-NOHSA-VI-NEXT: s_nop 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s23 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s3 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s22 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:16 +; GCN-NOHSA-VI-NEXT: s_nop 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s21 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s20 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: constant_zextload_v32i16_to_v32i32: @@ -2900,101 +2885,100 @@ ; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s20, 0xffff ; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[36:51], s[18:19], 0x10 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s1, 16 ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s19, s0, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s21, s3, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s2, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s23, s5, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s4, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s25, s7, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s6, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s27, s9, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s8, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s29, s11, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s10, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s31, s13, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s33, s12, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s34, s15, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s35, s14, 16 -; GCN-NOHSA-SI-NEXT: s_and_b32 s52, s1, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s53, s0, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s54, s3, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s55, s2, s20 -; 
GCN-NOHSA-SI-NEXT: s_and_b32 s5, s5, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s7, s7, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s6, s6, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s9, s9, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s8, s8, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s11, s11, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s10, s10, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s13, s13, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s12, s12, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s15, s15, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s14, s14, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s56, s37, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s57, s36, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s58, s39, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s59, s38, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s60, s41, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s61, s40, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s62, s43, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s63, s42, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s64, s45, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s65, s44, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s66, s47, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s67, s46, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s68, s49, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s69, s48, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s70, s51, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s20, s50, s20 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s37, s37, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s36, s36, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s39, s39, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s38, s38, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s41, s41, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s40, s40, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s42, s42, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s45, s45, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s44, s44, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s47, s47, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s46, s46, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s49, s49, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s48, s48, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s51, s51, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s50, s50, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 
s43, s43, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s3, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s21, s2, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s5, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s23, s4, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s7, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s25, s6, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s9, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s27, s8, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s11, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s29, s10, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s13, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s31, s12, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s33, s15, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s34, s14, 16 +; GCN-NOHSA-SI-NEXT: s_and_b32 s35, s1, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s52, s0, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s53, s3, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s54, s2, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s5, s5, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s7, s7, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s6, s6, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s9, s9, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s8, s8, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s11, s11, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s10, s10, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s13, s13, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s12, s12, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s15, s15, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s14, s14, 0xffff +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s55, s37, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s56, s36, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s57, s39, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s58, s38, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s59, s41, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s60, s40, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s61, s43, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s62, s42, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s63, s45, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s64, s44, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s65, s47, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s66, s46, 16 +; GCN-NOHSA-SI-NEXT: 
s_lshr_b32 s67, s49, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s68, s48, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s69, s51, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s70, s50, 16 +; GCN-NOHSA-SI-NEXT: s_and_b32 s37, s37, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s36, s36, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s39, s39, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s38, s38, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s40, s40, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s43, s43, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s42, s42, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s45, s45, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s44, s44, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s47, s47, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s46, s46, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s49, s49, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s48, s48, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s51, s51, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s50, s50, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s41, s41, 0xffff ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s16 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s17 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s20 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s50 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s70 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s51 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s69 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s48 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s68 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s49 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s67 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s46 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s66 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s47 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s65 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s44 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s64 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s45 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s63 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s42 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s62 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, s61 -; 
GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s43 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, s40 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, s60 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, s41 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s50 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s70 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s51 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s69 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s48 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s68 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s49 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s67 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s46 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s66 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s47 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s65 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s44 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s64 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s45 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s63 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s42 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s62 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s43 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, s40 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s61 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, s60 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, s41 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, s59 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208 @@ -3002,63 +2986,63 @@ ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:176 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:160 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(5) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s59 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s38 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s58 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s39 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s38 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s58 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 
v2, s39 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s57 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s57 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s36 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s56 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s37 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s36 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s56 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s37 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s55 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s14 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s35 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s34 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s15 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s34 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s33 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s12 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s33 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s31 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s13 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s31 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s30 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s10 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s30 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s29 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s11 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s29 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s28 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s28 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s27 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s27 +; 
GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s26 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s26 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s25 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s7 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s25 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s24 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s23 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s5 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s23 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s22 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s55 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s22 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s54 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s21 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s54 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s21 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s53 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s20 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s53 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s52 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s19 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s52 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s35 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s18 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm @@ -3066,7 +3050,6 @@ ; GCN-HSA-LABEL: constant_zextload_v64i16_to_v64i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x0 -; GCN-HSA-NEXT: s_mov_b32 s53, 0xffff ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], 
s[18:19], 0x0 ; GCN-HSA-NEXT: s_load_dwordx16 s[36:51], s[18:19], 0x10 @@ -3087,54 +3070,54 @@ ; GCN-HSA-NEXT: s_lshr_b32 s34, s12, 16 ; GCN-HSA-NEXT: s_lshr_b32 s35, s15, 16 ; GCN-HSA-NEXT: s_lshr_b32 s52, s14, 16 -; GCN-HSA-NEXT: s_and_b32 s1, s1, s53 -; GCN-HSA-NEXT: s_and_b32 s0, s0, s53 -; GCN-HSA-NEXT: s_and_b32 s3, s3, s53 -; GCN-HSA-NEXT: s_and_b32 s2, s2, s53 -; GCN-HSA-NEXT: s_and_b32 s5, s5, s53 -; GCN-HSA-NEXT: s_and_b32 s4, s4, s53 -; GCN-HSA-NEXT: s_and_b32 s54, s7, s53 -; GCN-HSA-NEXT: s_and_b32 s55, s6, s53 -; GCN-HSA-NEXT: s_and_b32 s9, s9, s53 -; GCN-HSA-NEXT: s_and_b32 s8, s8, s53 -; GCN-HSA-NEXT: s_and_b32 s11, s11, s53 -; GCN-HSA-NEXT: s_and_b32 s10, s10, s53 -; GCN-HSA-NEXT: s_and_b32 s13, s13, s53 -; GCN-HSA-NEXT: s_and_b32 s12, s12, s53 -; GCN-HSA-NEXT: s_and_b32 s15, s15, s53 -; GCN-HSA-NEXT: s_and_b32 s14, s14, s53 -; GCN-HSA-NEXT: s_and_b32 s18, s37, s53 -; GCN-HSA-NEXT: s_and_b32 s19, s36, s53 -; GCN-HSA-NEXT: s_and_b32 s56, s39, s53 -; GCN-HSA-NEXT: s_and_b32 s57, s38, s53 -; GCN-HSA-NEXT: s_and_b32 s58, s41, s53 -; GCN-HSA-NEXT: s_and_b32 s59, s40, s53 -; GCN-HSA-NEXT: s_and_b32 s60, s43, s53 -; GCN-HSA-NEXT: s_and_b32 s61, s42, s53 -; GCN-HSA-NEXT: s_and_b32 s62, s45, s53 -; GCN-HSA-NEXT: s_and_b32 s63, s44, s53 -; GCN-HSA-NEXT: s_and_b32 s64, s47, s53 -; GCN-HSA-NEXT: s_and_b32 s65, s46, s53 -; GCN-HSA-NEXT: s_and_b32 s66, s49, s53 -; GCN-HSA-NEXT: s_and_b32 s67, s48, s53 -; GCN-HSA-NEXT: s_and_b32 s68, s51, s53 -; GCN-HSA-NEXT: s_and_b32 s53, s50, s53 -; GCN-HSA-NEXT: s_lshr_b32 s37, s37, 16 -; GCN-HSA-NEXT: s_lshr_b32 s36, s36, 16 -; GCN-HSA-NEXT: s_lshr_b32 s39, s39, 16 -; GCN-HSA-NEXT: s_lshr_b32 s38, s38, 16 -; GCN-HSA-NEXT: s_lshr_b32 s41, s41, 16 -; GCN-HSA-NEXT: s_lshr_b32 s40, s40, 16 -; GCN-HSA-NEXT: s_lshr_b32 s43, s43, 16 -; GCN-HSA-NEXT: s_lshr_b32 s42, s42, 16 -; GCN-HSA-NEXT: s_lshr_b32 s45, s45, 16 -; GCN-HSA-NEXT: s_lshr_b32 s44, s44, 16 -; GCN-HSA-NEXT: s_lshr_b32 s47, s47, 16 -; GCN-HSA-NEXT: s_lshr_b32 s46, s46, 
16 -; GCN-HSA-NEXT: s_lshr_b32 s49, s49, 16 -; GCN-HSA-NEXT: s_lshr_b32 s48, s48, 16 -; GCN-HSA-NEXT: s_lshr_b32 s51, s51, 16 -; GCN-HSA-NEXT: s_lshr_b32 s50, s50, 16 +; GCN-HSA-NEXT: s_and_b32 s1, s1, 0xffff +; GCN-HSA-NEXT: s_and_b32 s0, s0, 0xffff +; GCN-HSA-NEXT: s_and_b32 s3, s3, 0xffff +; GCN-HSA-NEXT: s_and_b32 s2, s2, 0xffff +; GCN-HSA-NEXT: s_and_b32 s5, s5, 0xffff +; GCN-HSA-NEXT: s_and_b32 s4, s4, 0xffff +; GCN-HSA-NEXT: s_and_b32 s53, s7, 0xffff +; GCN-HSA-NEXT: s_and_b32 s54, s6, 0xffff +; GCN-HSA-NEXT: s_and_b32 s9, s9, 0xffff +; GCN-HSA-NEXT: s_and_b32 s8, s8, 0xffff +; GCN-HSA-NEXT: s_and_b32 s11, s11, 0xffff +; GCN-HSA-NEXT: s_and_b32 s10, s10, 0xffff +; GCN-HSA-NEXT: s_and_b32 s13, s13, 0xffff +; GCN-HSA-NEXT: s_and_b32 s12, s12, 0xffff +; GCN-HSA-NEXT: s_and_b32 s15, s15, 0xffff +; GCN-HSA-NEXT: s_and_b32 s14, s14, 0xffff +; GCN-HSA-NEXT: s_lshr_b32 s18, s37, 16 +; GCN-HSA-NEXT: s_lshr_b32 s19, s36, 16 +; GCN-HSA-NEXT: s_lshr_b32 s55, s39, 16 +; GCN-HSA-NEXT: s_lshr_b32 s56, s38, 16 +; GCN-HSA-NEXT: s_lshr_b32 s57, s41, 16 +; GCN-HSA-NEXT: s_lshr_b32 s58, s40, 16 +; GCN-HSA-NEXT: s_lshr_b32 s59, s43, 16 +; GCN-HSA-NEXT: s_lshr_b32 s60, s42, 16 +; GCN-HSA-NEXT: s_lshr_b32 s61, s45, 16 +; GCN-HSA-NEXT: s_lshr_b32 s62, s44, 16 +; GCN-HSA-NEXT: s_lshr_b32 s63, s47, 16 +; GCN-HSA-NEXT: s_lshr_b32 s64, s46, 16 +; GCN-HSA-NEXT: s_lshr_b32 s65, s49, 16 +; GCN-HSA-NEXT: s_lshr_b32 s66, s48, 16 +; GCN-HSA-NEXT: s_lshr_b32 s67, s51, 16 +; GCN-HSA-NEXT: s_lshr_b32 s68, s50, 16 +; GCN-HSA-NEXT: s_and_b32 s37, s37, 0xffff +; GCN-HSA-NEXT: s_and_b32 s36, s36, 0xffff +; GCN-HSA-NEXT: s_and_b32 s39, s39, 0xffff +; GCN-HSA-NEXT: s_and_b32 s38, s38, 0xffff +; GCN-HSA-NEXT: s_and_b32 s41, s41, 0xffff +; GCN-HSA-NEXT: s_and_b32 s40, s40, 0xffff +; GCN-HSA-NEXT: s_and_b32 s43, s43, 0xffff +; GCN-HSA-NEXT: s_and_b32 s42, s42, 0xffff +; GCN-HSA-NEXT: s_and_b32 s45, s45, 0xffff +; GCN-HSA-NEXT: s_and_b32 s44, s44, 0xffff +; GCN-HSA-NEXT: s_and_b32 s47, s47, 0xffff +; 
GCN-HSA-NEXT: s_and_b32 s46, s46, 0xffff +; GCN-HSA-NEXT: s_and_b32 s49, s49, 0xffff +; GCN-HSA-NEXT: s_and_b32 s48, s48, 0xffff +; GCN-HSA-NEXT: s_and_b32 s51, s51, 0xffff +; GCN-HSA-NEXT: s_and_b32 s50, s50, 0xffff ; GCN-HSA-NEXT: s_add_u32 s6, s16, 0xf0 ; GCN-HSA-NEXT: s_addc_u32 s7, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v22, s7 @@ -3160,10 +3143,10 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v33, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v32, s6 ; GCN-HSA-NEXT: s_add_u32 s6, s16, 0x90 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s67 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s48 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s66 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s49 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s48 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s66 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s49 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s65 ; GCN-HSA-NEXT: s_addc_u32 s7, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] ; GCN-HSA-NEXT: v_mov_b32_e32 v25, s7 @@ -3173,47 +3156,47 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v35, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v34, s6 ; GCN-HSA-NEXT: s_add_u32 s6, s16, 0x70 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s61 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s42 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s60 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s43 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s42 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s60 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s43 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s59 ; GCN-HSA-NEXT: s_addc_u32 s7, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19] -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s53 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s50 ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s6 ; GCN-HSA-NEXT: s_add_u32 s6, s16, 0x60 ; GCN-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s50 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s68 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s51 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s65 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s46 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s64 -; GCN-HSA-NEXT: 
v_mov_b32_e32 v11, s47 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s63 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s44 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s62 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s45 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s59 -; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s40 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s57 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s58 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s41 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s38 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s56 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s19 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s39 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s36 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s68 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s51 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s67 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s46 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s64 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s47 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s63 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s44 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s62 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s45 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s61 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s40 +; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s58 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s38 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s41 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s57 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s56 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s39 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s36 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s55 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s19 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s6 ; GCN-HSA-NEXT: s_add_u32 s6, s16, 0x50 ; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s18 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s37 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s14 ; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15] -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s37 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s18 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s12 ; GCN-HSA-NEXT: v_mov_b32_e32 
v9, s52 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s15 @@ -3246,9 +3229,9 @@ ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_addc_u32 s7, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s55 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s54 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s27 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s54 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s53 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s26 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s7 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -3283,173 +3266,160 @@ ; ; GCN-NOHSA-VI-LABEL: constant_zextload_v64i16_to_v64i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x24 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s26, 0xffff +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[36:39], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x40 -; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[36:51], s[18:19], 0x0 +; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x40 +; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x0 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_and_b32 s53, s1, s26 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s18, s37, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s19, s37, s26 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s20, s36, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s21, s36, s26 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s22, s39, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s23, s39, s26 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s24, s38, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s25, s38, s26 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s27, s41, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s28, s41, s26 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s29, s40, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s30, s40, s26 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s31, s43, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s33, s43, s26 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s34, s42, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s35, s42, s26 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s36, s45, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s37, s45, s26 -; 
GCN-NOHSA-VI-NEXT: s_lshr_b32 s38, s44, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s39, s44, s26 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s40, s47, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s41, s47, s26 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s42, s46, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s43, s46, s26 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s44, s49, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s45, s49, s26 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s46, s48, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s47, s48, s26 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s48, s51, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s49, s51, s26 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s51, s50, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s50, s50, s26 -; GCN-NOHSA-VI-NEXT: s_and_b32 s55, s0, s26 -; GCN-NOHSA-VI-NEXT: s_and_b32 s57, s3, s26 -; GCN-NOHSA-VI-NEXT: s_and_b32 s59, s2, s26 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s60, s5, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, s26 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s61, s4, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, s26 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s62, s7, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s7, s26 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s63, s6, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, s26 -; GCN-NOHSA-VI-NEXT: s_and_b32 s64, s9, s26 -; GCN-NOHSA-VI-NEXT: s_and_b32 s65, s8, s26 -; GCN-NOHSA-VI-NEXT: s_and_b32 s66, s11, s26 -; GCN-NOHSA-VI-NEXT: s_and_b32 s67, s10, s26 -; GCN-NOHSA-VI-NEXT: s_and_b32 s68, s13, s26 -; GCN-NOHSA-VI-NEXT: s_and_b32 s69, s12, s26 -; GCN-NOHSA-VI-NEXT: s_and_b32 s70, s15, s26 -; GCN-NOHSA-VI-NEXT: s_and_b32 s26, s14, s26 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s15, s15, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s14, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s52, s1, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s54, s0, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s56, s3, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s58, s2, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s13, s13, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s12, s12, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s69, s15, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s15, s15, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s70, s14, 
16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s14, s14, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s51, s1, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s52, s1, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s53, s0, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s54, s0, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s55, s3, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s56, s3, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s57, s2, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s58, s2, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s67, s13, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s13, s13, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s68, s12, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s12, s12, 0xffff ; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s16 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s17 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s26 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s14 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s70 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s15 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s11, s11, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s10, 16 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s36 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s37 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s14 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s70 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s15 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s69 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s65, s11, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s11, s11, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s66, s10, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s10, s10, 0xffff ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s9, s9, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s69 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s12 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s68 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s13 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s8, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s63, s9, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s12 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s68 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 
v2, s13 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s67 +; GCN-NOHSA-VI-NEXT: s_and_b32 s9, s9, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s64, s8, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s8, s8, 0xffff ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 -; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s67 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s10 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s66 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s11 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s61, s7, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s66 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s11 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s65 +; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s7, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s62, s6, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, 0xffff ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208 -; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s65 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s8 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s64 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s9 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s59, s5, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s64 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s9 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s63 +; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s60, s4, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xffff ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192 -; GCN-NOHSA-VI-NEXT: s_nop 0 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s49, s31, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s63 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s62 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s62 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s61 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176 -; GCN-NOHSA-VI-NEXT: s_nop 0 +; 
GCN-NOHSA-VI-NEXT: s_and_b32 s31, s31, 0xffff ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s61 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s60 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s60 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s59 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 -; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s59 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s58 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s57 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s56 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s50, s30, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s58 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s57 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s56 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s55 +; GCN-NOHSA-VI-NEXT: s_and_b32 s30, s30, 0xffff ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 -; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s55 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s54 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s53 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s52 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s47, s29, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s54 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s53 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s52 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s51 +; GCN-NOHSA-VI-NEXT: s_and_b32 s29, s29, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s48, s28, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s28, s28, 0xffff ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 -; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s50 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s51 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s49 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s48 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s45, s27, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s30 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s50 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s31 +; GCN-NOHSA-VI-NEXT: 
v_mov_b32_e32 v3, s49 +; GCN-NOHSA-VI-NEXT: s_and_b32 s27, s27, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s46, s26, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s26, s26, 0xffff ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 -; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s47 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s46 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s45 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s44 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s43, s25, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s28 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s48 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s29 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s47 +; GCN-NOHSA-VI-NEXT: s_and_b32 s25, s25, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s44, s24, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s24, s24, 0xffff ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 -; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s43 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s42 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s41 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s40 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s41, s23, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s26 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s46 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s27 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s45 +; GCN-NOHSA-VI-NEXT: s_and_b32 s23, s23, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s42, s22, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s22, s22, 0xffff ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 -; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s39 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s38 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s37 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s36 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s39, s21, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s24 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s44 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s25 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s43 +; GCN-NOHSA-VI-NEXT: s_and_b32 
s21, s21, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s40, s20, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s20, s20, 0xffff ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 -; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s35 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s34 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s33 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s31 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s35, s19, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s22 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s42 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s23 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s41 +; GCN-NOHSA-VI-NEXT: s_and_b32 s19, s19, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s38, s18, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s18, s18, 0xffff ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s30 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s29 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s28 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s27 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s33, s17, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s20 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s40 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s21 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s39 +; GCN-NOHSA-VI-NEXT: s_and_b32 s17, s17, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s34, s16, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s16, s16, 0xffff ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s25 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s24 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s23 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s22 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s18 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s38 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s19 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s35 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: 
v_mov_b32_e32 v0, s21 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s20 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s19 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s18 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s34 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s17 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s33 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; @@ -4865,33 +4835,29 @@ ; GCN-NOHSA-SI-LABEL: constant_zextload_v4i16_to_v4i64: ; GCN-NOHSA-SI: ; %bb.0: ; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 +; GCN-NOHSA-SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, 0xffff +; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, v1 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s0, s3, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s1, s2, 16 -; GCN-NOHSA-SI-NEXT: s_and_b32 s3, s3, s8 -; GCN-NOHSA-SI-NEXT: s_and_b32 s2, s2, s8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s3 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s0 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s6, s5, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s7, s4, 16 +; GCN-NOHSA-SI-NEXT: s_and_b32 s5, s5, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, 0xffff +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s6 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s1 -; GCN-NOHSA-SI-NEXT: 
buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s7 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: constant_zextload_v4i16_to_v4i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-HSA-NEXT: s_mov_b32 s6, 0xffff ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -4899,8 +4865,8 @@ ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_lshr_b32 s4, s3, 16 ; GCN-HSA-NEXT: s_lshr_b32 s5, s2, 16 -; GCN-HSA-NEXT: s_and_b32 s7, s2, s6 -; GCN-HSA-NEXT: s_and_b32 s2, s3, s6 +; GCN-HSA-NEXT: s_and_b32 s6, s2, 0xffff +; GCN-HSA-NEXT: s_and_b32 s2, s3, 0xffff ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 @@ -4909,7 +4875,7 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -4918,19 +4884,18 @@ ; GCN-NOHSA-VI-LABEL: constant_zextload_v4i16_to_v4i64: ; GCN-NOHSA-VI: ; %bb.0: ; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, 0xffff ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s5, s8 +; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s5, 0xffff ; GCN-NOHSA-VI-NEXT: s_lshr_b32 
s5, s5, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s4, s8 +; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s4, 0xffff ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s4, s4, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s7 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5 @@ -5110,59 +5075,55 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(4)* %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_zextload_v8i16_to_v8i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, 0xffff +; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, v1 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s4, s9, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s5, s11, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s7, s10, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s12, s8, 16 -; GCN-NOHSA-SI-NEXT: s_and_b32 s8, s8, s6 -; GCN-NOHSA-SI-NEXT: s_and_b32 s10, s10, s6 -; GCN-NOHSA-SI-NEXT: s_and_b32 s11, s11, s6 -; GCN-NOHSA-SI-NEXT: s_and_b32 s6, s9, s6 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s11 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s5 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s8, s5, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s9, s7, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s10, s6, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s11, s4, 16 +; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s6, s6, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s7, s7, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s5, s5, 0xffff +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s7 +; 
GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s9 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s8 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s10 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s7 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s10 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s12 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s11 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: constant_zextload_v8i16_to_v8i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-HSA-NEXT: s_mov_b32 s8, 0xffff ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_lshr_b32 s9, s5, 16 +; GCN-HSA-NEXT: s_lshr_b32 s8, s5, 16 ; GCN-HSA-NEXT: s_lshr_b32 s2, s7, 16 -; GCN-HSA-NEXT: s_lshr_b32 s10, s6, 16 -; GCN-HSA-NEXT: s_lshr_b32 s11, s4, 16 -; GCN-HSA-NEXT: s_and_b32 s4, s4, s8 -; GCN-HSA-NEXT: s_and_b32 s6, s6, s8 -; GCN-HSA-NEXT: s_and_b32 s5, s5, s8 -; GCN-HSA-NEXT: s_and_b32 s3, s7, s8 +; GCN-HSA-NEXT: s_lshr_b32 s9, s6, 16 +; GCN-HSA-NEXT: s_lshr_b32 s10, s4, 16 +; GCN-HSA-NEXT: s_and_b32 s4, s4, 0xffff +; GCN-HSA-NEXT: s_and_b32 s6, s6, 0xffff +; GCN-HSA-NEXT: s_and_b32 s5, s5, 0xffff +; GCN-HSA-NEXT: s_and_b32 s3, s7, 0xffff ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 ; 
GCN-HSA-NEXT: s_add_u32 s2, s0, 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s3 @@ -5176,17 +5137,17 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s9 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s8 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s9 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s11 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_endpgm @@ -5194,36 +5155,35 @@ ; GCN-NOHSA-VI-LABEL: constant_zextload_v8i16_to_v8i64: ; GCN-NOHSA-VI: ; %bb.0: ; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, 0xffff ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_and_b32 s9, s4, s8 -; GCN-NOHSA-VI-NEXT: s_and_b32 s10, s5, s8 -; GCN-NOHSA-VI-NEXT: s_and_b32 s11, s6, s8 -; GCN-NOHSA-VI-NEXT: s_and_b32 s8, s7, s8 +; GCN-NOHSA-VI-NEXT: s_and_b32 s11, s7, 0xffff ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s7, s7, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s10, s6, 0xffff ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s6, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s11 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7 +; 
GCN-NOHSA-VI-NEXT: s_and_b32 s9, s5, 0xffff ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s5, s5, 16 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s4, s4, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s11 +; GCN-NOHSA-VI-NEXT: s_and_b32 s8, s4, 0xffff +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s6 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s4, s4, 16 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s9 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s9 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s4 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm @@ -5485,39 +5445,36 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(4)* %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_zextload_v16i16_to_v16i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x9 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_load_dwordx8 s[4:11], s[14:15], 0x0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 +; GCN-NOHSA-SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s14, 0xffff +; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, v1 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s12 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s13 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s12, s5, 16 ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s13, s7, 16 
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s15, s11, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s16, s9, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s17, s8, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s10, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s19, s6, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s4, 16 -; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, s14 -; GCN-NOHSA-SI-NEXT: s_and_b32 s6, s6, s14 -; GCN-NOHSA-SI-NEXT: s_and_b32 s10, s10, s14 -; GCN-NOHSA-SI-NEXT: s_and_b32 s8, s8, s14 -; GCN-NOHSA-SI-NEXT: s_and_b32 s5, s5, s14 -; GCN-NOHSA-SI-NEXT: s_and_b32 s7, s7, s14 -; GCN-NOHSA-SI-NEXT: s_and_b32 s9, s9, s14 -; GCN-NOHSA-SI-NEXT: s_and_b32 s11, s11, s14 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s11, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s15, s9, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s16, s8, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s17, s10, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s6, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s19, s4, 16 +; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s6, s6, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s10, s10, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s8, s8, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s5, s5, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s7, s7, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s9, s9, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s11, s11, 0xffff ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s15 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s11 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s15 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s14 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s7 @@ -5529,47 +5486,46 @@ ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s8 
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s17 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s16 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s10 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s18 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s17 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s19 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s18 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s20 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s19 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: constant_zextload_v16i16_to_v16i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-HSA-NEXT: s_mov_b32 s12, 0xffff ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_lshr_b32 s13, s5, 16 -; GCN-HSA-NEXT: s_lshr_b32 s14, s7, 16 -; GCN-HSA-NEXT: s_lshr_b32 s15, s11, 16 +; GCN-HSA-NEXT: s_lshr_b32 s12, s5, 16 +; GCN-HSA-NEXT: s_lshr_b32 s13, s7, 16 +; GCN-HSA-NEXT: s_lshr_b32 s14, s11, 16 ; GCN-HSA-NEXT: s_lshr_b32 s2, s9, 16 -; GCN-HSA-NEXT: s_lshr_b32 s16, s8, 16 -; GCN-HSA-NEXT: s_lshr_b32 s17, s10, 16 -; GCN-HSA-NEXT: s_lshr_b32 s18, s6, 16 -; GCN-HSA-NEXT: s_lshr_b32 s19, s4, 16 -; GCN-HSA-NEXT: s_and_b32 s4, s4, s12 -; GCN-HSA-NEXT: s_and_b32 s6, s6, s12 -; GCN-HSA-NEXT: s_and_b32 s10, s10, s12 -; GCN-HSA-NEXT: s_and_b32 s8, s8, s12 -; GCN-HSA-NEXT: s_and_b32 s5, s5, s12 -; GCN-HSA-NEXT: s_and_b32 s7, s7, s12 -; GCN-HSA-NEXT: s_and_b32 s11, s11, 
s12 -; GCN-HSA-NEXT: s_and_b32 s3, s9, s12 +; GCN-HSA-NEXT: s_lshr_b32 s15, s8, 16 +; GCN-HSA-NEXT: s_lshr_b32 s16, s10, 16 +; GCN-HSA-NEXT: s_lshr_b32 s17, s6, 16 +; GCN-HSA-NEXT: s_lshr_b32 s18, s4, 16 +; GCN-HSA-NEXT: s_and_b32 s4, s4, 0xffff +; GCN-HSA-NEXT: s_and_b32 s6, s6, 0xffff +; GCN-HSA-NEXT: s_and_b32 s10, s10, 0xffff +; GCN-HSA-NEXT: s_and_b32 s8, s8, 0xffff +; GCN-HSA-NEXT: s_and_b32 s5, s5, 0xffff +; GCN-HSA-NEXT: s_and_b32 s7, s7, 0xffff +; GCN-HSA-NEXT: s_and_b32 s11, s11, 0xffff +; GCN-HSA-NEXT: s_and_b32 s3, s9, 0xffff ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s3 @@ -5583,45 +5539,45 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s11 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s15 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s14 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s14 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s13 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s12 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s16 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s15 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 
s2, s0, 32 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s17 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s18 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s17 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s19 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s18 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_endpgm @@ -5629,56 +5585,55 @@ ; GCN-NOHSA-VI-LABEL: constant_zextload_v16i16_to_v16i64: ; GCN-NOHSA-VI: ; %bb.0: ; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s12, 0xffff ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-VI-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_and_b32 s13, s4, s12 -; GCN-NOHSA-VI-NEXT: s_and_b32 s14, s5, s12 -; GCN-NOHSA-VI-NEXT: s_and_b32 s15, s6, s12 -; GCN-NOHSA-VI-NEXT: s_and_b32 s16, s7, s12 -; GCN-NOHSA-VI-NEXT: s_and_b32 s17, s8, s12 -; GCN-NOHSA-VI-NEXT: s_and_b32 s18, s9, s12 -; GCN-NOHSA-VI-NEXT: s_and_b32 s19, s10, s12 -; GCN-NOHSA-VI-NEXT: s_and_b32 s12, s11, s12 +; GCN-NOHSA-VI-NEXT: s_and_b32 s19, s11, 0xffff ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s11, s11, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s18, s10, 0xffff ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s10, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s12 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s19 ; 
GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s11 +; GCN-NOHSA-VI-NEXT: s_and_b32 s17, s9, 0xffff ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s9, s9, 16 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s8, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s19 +; GCN-NOHSA-VI-NEXT: s_and_b32 s16, s8, 0xffff +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s18 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s10 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s8, 16 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s7, s7, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s18 +; GCN-NOHSA-VI-NEXT: s_and_b32 s15, s7, 0xffff +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s17 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s9 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s7, s7, 16 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s6, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s17 +; GCN-NOHSA-VI-NEXT: s_and_b32 s14, s6, 0xffff +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s8 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s6, 16 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s5, s5, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s13, s5, 0xffff +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s15 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s5, s5, 16 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s4, s4, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s15 +; GCN-NOHSA-VI-NEXT: s_and_b32 s12, s4, 0xffff +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s14 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s6 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s4, s4, 16 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s14 +; 
GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s13 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s13 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s12 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s4 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm @@ -6132,362 +6087,357 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(4)* %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_zextload_v32i16_to_v32i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s18, 0xffff +; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s19, s1, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s3, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s21, s5, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s7, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s23, s9, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s11, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s25, s13, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s15, 16 -; GCN-NOHSA-SI-NEXT: s_and_b32 s27, s0, s18 -; GCN-NOHSA-SI-NEXT: s_and_b32 s28, s2, s18 -; GCN-NOHSA-SI-NEXT: s_and_b32 s29, s4, s18 -; GCN-NOHSA-SI-NEXT: s_and_b32 s30, s6, s18 -; GCN-NOHSA-SI-NEXT: s_and_b32 s31, s8, s18 -; GCN-NOHSA-SI-NEXT: s_and_b32 s33, s10, s18 -; GCN-NOHSA-SI-NEXT: s_and_b32 s34, s12, s18 -; GCN-NOHSA-SI-NEXT: s_and_b32 s35, s14, s18 -; GCN-NOHSA-SI-NEXT: s_and_b32 s36, s1, s18 -; GCN-NOHSA-SI-NEXT: s_and_b32 s37, s3, s18 -; GCN-NOHSA-SI-NEXT: s_and_b32 s5, s5, s18 -; GCN-NOHSA-SI-NEXT: s_and_b32 s7, s7, s18 -; GCN-NOHSA-SI-NEXT: s_and_b32 s9, s9, s18 -; GCN-NOHSA-SI-NEXT: s_and_b32 s11, 
s11, s18 -; GCN-NOHSA-SI-NEXT: s_and_b32 s13, s13, s18 -; GCN-NOHSA-SI-NEXT: s_and_b32 s15, s15, s18 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s14, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s12, s12, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s10, s10, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s8, s8, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s6, s6, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s4, s4, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s2, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s38, s0, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s5, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s21, s7, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s9, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s23, s11, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s13, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s25, s15, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s17, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s27, s19, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s18, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s29, s16, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s14, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s31, s12, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s33, s10, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s34, s8, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s35, s6, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s36, s4, 16 +; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s6, s6, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s8, s8, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s10, s10, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s12, s12, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s14, s14, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s16, s16, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s18, s18, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s5, s5, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s7, s7, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s9, s9, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s11, s11, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s13, s13, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s15, s15, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s17, s17, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s19, s19, 0xffff ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 
0xf000 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, v1 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s16 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s17 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s15 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s26 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s19 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s27 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s13 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s25 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s17 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s26 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s11 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s15 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s25 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s23 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s13 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s24 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s7 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s22 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s11 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s23 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s5 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s21 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s9 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s22 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 
v0, s37 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s20 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s7 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s21 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s36 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s19 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s20 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s35 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s14 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s18 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s28 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s34 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s12 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s29 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s33 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s10 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s14 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s30 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s31 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s12 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s31 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s30 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s6 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s10 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s33 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 ; GCN-NOHSA-SI-NEXT: s_waitcnt 
expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s29 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s34 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s28 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s18 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s35 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s27 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s38 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s36 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: constant_zextload_v32i16_to_v32i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-HSA-NEXT: s_mov_b32 s20, 0xffff ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_and_b32 s21, s4, s20 -; GCN-HSA-NEXT: s_and_b32 s22, s6, s20 -; GCN-HSA-NEXT: s_and_b32 s23, s8, s20 -; GCN-HSA-NEXT: s_and_b32 s24, s10, s20 -; GCN-HSA-NEXT: s_and_b32 s25, s12, s20 -; GCN-HSA-NEXT: s_and_b32 s26, s14, s20 -; GCN-HSA-NEXT: s_and_b32 s27, s16, s20 -; GCN-HSA-NEXT: s_and_b32 s28, s18, s20 -; GCN-HSA-NEXT: s_and_b32 s29, s5, s20 -; GCN-HSA-NEXT: s_and_b32 s30, s7, s20 -; GCN-HSA-NEXT: s_and_b32 s31, s9, s20 -; GCN-HSA-NEXT: s_and_b32 s33, s11, s20 -; GCN-HSA-NEXT: s_and_b32 s34, s13, s20 -; GCN-HSA-NEXT: s_and_b32 s35, s15, s20 -; GCN-HSA-NEXT: s_and_b32 s36, s17, s20 -; GCN-HSA-NEXT: s_and_b32 s20, s19, s20 -; GCN-HSA-NEXT: s_lshr_b32 s5, s5, 16 -; GCN-HSA-NEXT: s_lshr_b32 s7, s7, 16 -; GCN-HSA-NEXT: s_lshr_b32 s9, s9, 16 -; 
GCN-HSA-NEXT: s_lshr_b32 s11, s11, 16 -; GCN-HSA-NEXT: s_lshr_b32 s13, s13, 16 -; GCN-HSA-NEXT: s_lshr_b32 s15, s15, 16 -; GCN-HSA-NEXT: s_lshr_b32 s17, s17, 16 -; GCN-HSA-NEXT: s_lshr_b32 s19, s19, 16 -; GCN-HSA-NEXT: s_lshr_b32 s18, s18, 16 -; GCN-HSA-NEXT: s_lshr_b32 s16, s16, 16 -; GCN-HSA-NEXT: s_lshr_b32 s14, s14, 16 -; GCN-HSA-NEXT: s_lshr_b32 s12, s12, 16 -; GCN-HSA-NEXT: s_lshr_b32 s10, s10, 16 -; GCN-HSA-NEXT: s_lshr_b32 s8, s8, 16 -; GCN-HSA-NEXT: s_lshr_b32 s6, s6, 16 -; GCN-HSA-NEXT: s_lshr_b32 s4, s4, 16 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xf0 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xd0 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xb0 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s20 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s19 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 +; GCN-HSA-NEXT: s_lshr_b32 s3, s5, 16 +; GCN-HSA-NEXT: s_lshr_b32 s20, s7, 16 +; GCN-HSA-NEXT: s_lshr_b32 s21, s9, 16 +; GCN-HSA-NEXT: s_lshr_b32 s22, s11, 16 +; GCN-HSA-NEXT: s_lshr_b32 s23, s13, 16 +; GCN-HSA-NEXT: s_lshr_b32 s24, s15, 16 +; GCN-HSA-NEXT: s_lshr_b32 s25, s17, 16 +; GCN-HSA-NEXT: s_lshr_b32 s26, s19, 16 +; GCN-HSA-NEXT: s_lshr_b32 s27, s18, 16 +; GCN-HSA-NEXT: s_lshr_b32 s28, s16, 16 +; GCN-HSA-NEXT: s_lshr_b32 s29, s14, 16 +; GCN-HSA-NEXT: s_lshr_b32 s30, s12, 16 +; GCN-HSA-NEXT: s_lshr_b32 s31, s10, 16 +; GCN-HSA-NEXT: s_lshr_b32 s33, s8, 16 +; GCN-HSA-NEXT: s_lshr_b32 s34, s6, 16 +; GCN-HSA-NEXT: s_lshr_b32 s2, s4, 16 +; GCN-HSA-NEXT: s_and_b32 s35, s4, 0xffff +; GCN-HSA-NEXT: s_and_b32 s6, s6, 0xffff +; 
GCN-HSA-NEXT: s_and_b32 s8, s8, 0xffff +; GCN-HSA-NEXT: s_and_b32 s10, s10, 0xffff +; GCN-HSA-NEXT: s_and_b32 s12, s12, 0xffff +; GCN-HSA-NEXT: s_and_b32 s14, s14, 0xffff +; GCN-HSA-NEXT: s_and_b32 s16, s16, 0xffff +; GCN-HSA-NEXT: s_and_b32 s18, s18, 0xffff +; GCN-HSA-NEXT: s_and_b32 s36, s5, 0xffff +; GCN-HSA-NEXT: s_and_b32 s7, s7, 0xffff +; GCN-HSA-NEXT: s_and_b32 s9, s9, 0xffff +; GCN-HSA-NEXT: s_and_b32 s11, s11, 0xffff +; GCN-HSA-NEXT: s_and_b32 s13, s13, 0xffff +; GCN-HSA-NEXT: s_and_b32 s15, s15, 0xffff +; GCN-HSA-NEXT: s_and_b32 s17, s17, 0xffff +; GCN-HSA-NEXT: s_and_b32 s19, s19, 0xffff +; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xf0 +; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xd0 +; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s4 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xb0 +; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s4 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x90 +; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s19 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s26 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s5 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s36 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s17 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s17 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s25 ; GCN-HSA-NEXT: flat_store_dwordx4 v[6:7], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s35 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s15 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x70 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s15 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s24 ; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s34 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s13 -; 
GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 +; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s23 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3] -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s33 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s11 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s11 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s22 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x50 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s31 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s9 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s9 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s21 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 48 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s30 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s7 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s20 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 16 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xe0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s29 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s5 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s36 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, 
s5 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xe0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xc0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s28 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s18 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s18 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s27 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xc0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xa0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s27 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s16 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s16 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s28 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xa0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s26 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s14 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s29 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x80 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s25 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s12 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s12 +; GCN-HSA-NEXT: 
v_mov_b32_e32 v2, s30 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x60 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s24 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s10 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s31 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 64 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s23 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s8 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s33 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 32 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s22 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s6 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s34 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s21 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s35 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: constant_zextload_v32i16_to_v32i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], 
s[0:1], 0x24 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s20, 0xffff +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[4:19], s[6:7], 0x0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s19, 0xf000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s18, -1 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_and_b32 s21, s4, s20 -; GCN-NOHSA-VI-NEXT: s_and_b32 s22, s5, s20 -; GCN-NOHSA-VI-NEXT: s_and_b32 s23, s6, s20 -; GCN-NOHSA-VI-NEXT: s_and_b32 s24, s7, s20 -; GCN-NOHSA-VI-NEXT: s_and_b32 s25, s8, s20 -; GCN-NOHSA-VI-NEXT: s_and_b32 s26, s9, s20 -; GCN-NOHSA-VI-NEXT: s_and_b32 s27, s10, s20 -; GCN-NOHSA-VI-NEXT: s_and_b32 s28, s11, s20 -; GCN-NOHSA-VI-NEXT: s_and_b32 s29, s12, s20 -; GCN-NOHSA-VI-NEXT: s_and_b32 s30, s13, s20 -; GCN-NOHSA-VI-NEXT: s_and_b32 s31, s14, s20 -; GCN-NOHSA-VI-NEXT: s_and_b32 s33, s15, s20 -; GCN-NOHSA-VI-NEXT: s_and_b32 s34, s16, s20 -; GCN-NOHSA-VI-NEXT: s_and_b32 s35, s17, s20 -; GCN-NOHSA-VI-NEXT: s_and_b32 s36, s18, s20 -; GCN-NOHSA-VI-NEXT: s_and_b32 s20, s19, s20 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s19, s19, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s18, s18, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s20 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s19 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s17, s17, 16 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s16, s16, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s36 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s18 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s16, s0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s17, s1 +; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x0 +; GCN-NOHSA-VI-NEXT: 
s_waitcnt lgkmcnt(0) +; GCN-NOHSA-VI-NEXT: s_and_b32 s36, s15, 0xffff ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s15, s15, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s35 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s17 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208 +; GCN-NOHSA-VI-NEXT: s_and_b32 s35, s14, 0xffff ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s14, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s34 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s16 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s13, s13, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s33 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s36 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s15 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s12, s12, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s31 +; GCN-NOHSA-VI-NEXT: s_and_b32 s34, s13, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s13, s13, 16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:240 +; GCN-NOHSA-VI-NEXT: s_and_b32 s33, s12, 0xffff +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s35 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s14 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s11, s11, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s30 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s12, s12, 16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:224 +; GCN-NOHSA-VI-NEXT: s_and_b32 s31, s11, 0xffff +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s34 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s13 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s10, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s29 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s11, s11, 16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:208 +; GCN-NOHSA-VI-NEXT: s_and_b32 s30, s10, 0xffff +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 
v0, s33 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s12 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s9, s9, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s28 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s10, 16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:192 +; GCN-NOHSA-VI-NEXT: s_and_b32 s29, s9, 0xffff +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s31 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s11 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s8, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s27 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s9, s9, 16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:176 +; GCN-NOHSA-VI-NEXT: s_and_b32 s28, s8, 0xffff +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s30 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s10 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s7, s7, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s26 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s8, 16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:160 +; GCN-NOHSA-VI-NEXT: s_and_b32 s27, s7, 0xffff +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s29 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s9 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s6, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s25 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s7, s7, 16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:144 +; GCN-NOHSA-VI-NEXT: s_and_b32 s26, s6, 0xffff +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s28 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s8 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s5, s5, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s24 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s6, 16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], 
off, s[16:19], 0 offset:128 +; GCN-NOHSA-VI-NEXT: s_and_b32 s25, s5, 0xffff +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s27 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s5, s5, 16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:112 +; GCN-NOHSA-VI-NEXT: s_and_b32 s24, s4, 0xffff +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s26 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s6 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s4, s4, 16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:96 +; GCN-NOHSA-VI-NEXT: s_and_b32 s23, s3, 0xffff +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s25 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s3, s3, 16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:80 +; GCN-NOHSA-VI-NEXT: s_and_b32 s22, s2, 0xffff +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s24 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s2, s2, 16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:64 +; GCN-NOHSA-VI-NEXT: s_and_b32 s21, s1, 0xffff ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s23 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s6 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 -; GCN-NOHSA-VI-NEXT: s_nop 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s3 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s1, s1, 16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:48 +; GCN-NOHSA-VI-NEXT: s_and_b32 s20, s0, 0xffff ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s22 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s0, s0, 16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:32 ; GCN-NOHSA-VI-NEXT: s_nop 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s21 -; 
GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s4 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s1 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:16 +; GCN-NOHSA-VI-NEXT: s_nop 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s20 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: constant_zextload_v32i16_to_v32i64: diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll --- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll @@ -3621,24 +3621,21 @@ ; GCN-HSA-LABEL: global_zextload_v64i16_to_v64i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-HSA-NEXT: s_movk_i32 s6, 0x50 -; GCN-HSA-NEXT: s_movk_i32 s7, 0x60 -; GCN-HSA-NEXT: s_movk_i32 s8, 0x70 -; GCN-HSA-NEXT: s_mov_b32 s9, 0xffff +; GCN-HSA-NEXT: s_mov_b32 s14, 0xffff ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_add_u32 s4, s2, s6 +; GCN-HSA-NEXT: s_add_u32 s4, s2, 0x50 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 -; GCN-HSA-NEXT: s_add_u32 s4, s2, s7 +; GCN-HSA-NEXT: s_add_u32 s4, s2, 0x60 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s5 -; GCN-HSA-NEXT: s_add_u32 s4, s2, s8 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: s_add_u32 s4, s2, 0x70 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 +; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s5 -; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] -; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[2:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s4 ; GCN-HSA-NEXT: s_add_u32 s4, s2, 16 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 @@ -3646,19 
+3643,19 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s4 ; GCN-HSA-NEXT: s_add_u32 s4, s2, 32 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCN-HSA-NEXT: s_add_u32 s10, s2, 48 +; GCN-HSA-NEXT: s_add_u32 s6, s2, 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v33, s3 -; GCN-HSA-NEXT: s_addc_u32 s11, s3, 0 +; GCN-HSA-NEXT: s_addc_u32 s7, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v32, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s2, 64 ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9] ; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[12:13] ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s11 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v29, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s6 ; GCN-HSA-NEXT: v_mov_b32_e32 v28, s2 ; GCN-HSA-NEXT: flat_load_dwordx4 v[16:19], v[16:17] ; GCN-HSA-NEXT: flat_load_dwordx4 v[20:23], v[20:21] @@ -3668,129 +3665,130 @@ ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xe0 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: s_add_u32 s10, s0, 0xf0 +; GCN-HSA-NEXT: s_add_u32 s6, s0, 0xf0 +; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GCN-HSA-NEXT: s_add_u32 s8, s0, 0xc0 +; GCN-HSA-NEXT: s_addc_u32 s9, s1, 0 +; GCN-HSA-NEXT: s_add_u32 s10, s0, 0xd0 ; GCN-HSA-NEXT: s_addc_u32 s11, s1, 0 -; GCN-HSA-NEXT: s_add_u32 s12, s0, 0xc0 +; GCN-HSA-NEXT: s_add_u32 s12, s0, 0xa0 ; GCN-HSA-NEXT: s_addc_u32 s13, s1, 0 -; GCN-HSA-NEXT: s_add_u32 s14, s0, 0xd0 -; GCN-HSA-NEXT: s_addc_u32 s15, s1, 0 -; GCN-HSA-NEXT: s_add_u32 s16, s0, 0xa0 -; GCN-HSA-NEXT: s_addc_u32 s17, s1, 0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(7) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v27, 16, v5 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v25, 16, v4 -; GCN-HSA-NEXT: v_and_b32_e32 v26, s9, v5 -; GCN-HSA-NEXT: v_and_b32_e32 v24, s9, v4 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s17 -; GCN-HSA-NEXT: s_add_u32 s16, s0, 0xb0 -; GCN-HSA-NEXT: 
flat_store_dwordx4 v[4:5], v[24:27] -; GCN-HSA-NEXT: s_addc_u32 s17, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v27, 16, v7 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v25, 16, v6 -; GCN-HSA-NEXT: v_and_b32_e32 v26, s9, v7 -; GCN-HSA-NEXT: v_and_b32_e32 v24, s9, v6 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s17 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[24:27] -; GCN-HSA-NEXT: s_waitcnt vmcnt(8) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GCN-HSA-NEXT: v_and_b32_e32 v6, s9, v1 -; GCN-HSA-NEXT: v_and_b32_e32 v4, s9, v0 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v27, 16, v1 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v25, 16, v0 +; GCN-HSA-NEXT: v_and_b32_e32 v26, s14, v1 +; GCN-HSA-NEXT: v_and_b32_e32 v24, s14, v0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s12 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s13 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GCN-HSA-NEXT: v_and_b32_e32 v5, s9, v3 -; GCN-HSA-NEXT: v_and_b32_e32 v3, s9, v2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s15 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[3:6] -; GCN-HSA-NEXT: s_waitcnt vmcnt(9) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v8 +; GCN-HSA-NEXT: s_add_u32 s12, s0, 0xb0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[24:27] +; GCN-HSA-NEXT: s_addc_u32 s13, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s12 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v27, 16, v3 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v25, 16, v2 +; GCN-HSA-NEXT: v_and_b32_e32 v26, s14, v3 +; GCN-HSA-NEXT: v_and_b32_e32 v24, s14, v2 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s13 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[24:27] +; GCN-HSA-NEXT: s_waitcnt vmcnt(8) +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; GCN-HSA-NEXT: v_and_b32_e32 v2, s14, v5 +; GCN-HSA-NEXT: v_and_b32_e32 v0, s14, v4 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8 +; 
GCN-HSA-NEXT: v_mov_b32_e32 v5, s9 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s10 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v7 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; GCN-HSA-NEXT: v_and_b32_e32 v2, s14, v7 +; GCN-HSA-NEXT: v_and_b32_e32 v0, s14, v6 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s11 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 +; GCN-HSA-NEXT: s_waitcnt vmcnt(9) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v9 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s9, v9 -; GCN-HSA-NEXT: v_and_b32_e32 v0, s9, v8 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v8 +; GCN-HSA-NEXT: v_and_b32_e32 v2, s14, v9 +; GCN-HSA-NEXT: v_and_b32_e32 v0, s14, v8 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s10 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v11 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v10 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s9, v11 -; GCN-HSA-NEXT: v_and_b32_e32 v0, s9, v10 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s11 +; GCN-HSA-NEXT: v_and_b32_e32 v2, s14, v11 +; GCN-HSA-NEXT: v_and_b32_e32 v0, s14, v10 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90 ; GCN-HSA-NEXT: v_mov_b32_e32 v24, s0 -; GCN-HSA-NEXT: v_mov_b32_e32 v27, s5 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s4 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v26, s4 ; GCN-HSA-NEXT: s_waitcnt vmcnt(6) +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v35 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v33 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v32 -; GCN-HSA-NEXT: v_and_b32_e32 v2, 
s9, v33 -; GCN-HSA-NEXT: v_and_b32_e32 v0, s9, v32 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v35 +; GCN-HSA-NEXT: v_and_b32_e32 v2, s14, v33 +; GCN-HSA-NEXT: v_and_b32_e32 v0, s14, v32 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v34 -; GCN-HSA-NEXT: v_and_b32_e32 v6, s9, v35 -; GCN-HSA-NEXT: v_and_b32_e32 v4, s9, v34 -; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[0:3] -; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[4:7] -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90 +; GCN-HSA-NEXT: v_and_b32_e32 v6, s14, v35 +; GCN-HSA-NEXT: v_and_b32_e32 v4, s14, v34 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[0:3] +; GCN-HSA-NEXT: s_nop 0 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v29 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v28 +; GCN-HSA-NEXT: v_and_b32_e32 v2, s14, v29 +; GCN-HSA-NEXT: v_and_b32_e32 v0, s14, v28 +; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] +; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v29 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v28 -; GCN-HSA-NEXT: v_and_b32_e32 v10, s9, v29 -; GCN-HSA-NEXT: v_and_b32_e32 v8, s9, v28 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, s7 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11] +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v31 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v30 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s9, v31 -; GCN-HSA-NEXT: v_and_b32_e32 v0, s9, v30 +; GCN-HSA-NEXT: v_and_b32_e32 v2, s14, v31 +; GCN-HSA-NEXT: v_and_b32_e32 v0, s14, v30 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, s8 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; 
GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v21 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v20 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s9, v21 -; GCN-HSA-NEXT: v_and_b32_e32 v0, s9, v20 +; GCN-HSA-NEXT: v_and_b32_e32 v2, s14, v21 +; GCN-HSA-NEXT: v_and_b32_e32 v0, s14, v20 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v23 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v22 -; GCN-HSA-NEXT: v_and_b32_e32 v6, s9, v23 -; GCN-HSA-NEXT: v_and_b32_e32 v4, s9, v22 +; GCN-HSA-NEXT: v_and_b32_e32 v6, s14, v23 +; GCN-HSA-NEXT: v_and_b32_e32 v4, s14, v22 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v15 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v14 ; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] -; GCN-HSA-NEXT: v_and_b32_e32 v2, s9, v15 +; GCN-HSA-NEXT: v_and_b32_e32 v2, s14, v15 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v13 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v12 -; GCN-HSA-NEXT: v_and_b32_e32 v0, s9, v14 -; GCN-HSA-NEXT: v_and_b32_e32 v6, s9, v13 -; GCN-HSA-NEXT: v_and_b32_e32 v4, s9, v12 +; GCN-HSA-NEXT: v_and_b32_e32 v0, s14, v14 +; GCN-HSA-NEXT: v_and_b32_e32 v6, s14, v13 +; GCN-HSA-NEXT: v_and_b32_e32 v4, s14, v12 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v15, 16, v17 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v16 -; GCN-HSA-NEXT: v_and_b32_e32 v14, s9, v17 -; GCN-HSA-NEXT: v_and_b32_e32 v12, s9, v16 +; GCN-HSA-NEXT: v_and_b32_e32 v14, s14, v17 +; GCN-HSA-NEXT: v_and_b32_e32 v12, s14, v16 ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, s6 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15] ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v19 @@ -3798,8 +3796,8 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 ; GCN-HSA-NEXT: 
v_lshrrev_b32_e32 v9, 16, v18 -; GCN-HSA-NEXT: v_and_b32_e32 v10, s9, v19 -; GCN-HSA-NEXT: v_and_b32_e32 v8, s9, v18 +; GCN-HSA-NEXT: v_and_b32_e32 v10, s14, v19 +; GCN-HSA-NEXT: v_and_b32_e32 v8, s14, v18 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; GCN-HSA-NEXT: s_add_u32 s0, s0, 48 @@ -4403,21 +4401,18 @@ ; GCN-HSA-LABEL: global_sextload_v64i16_to_v64i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-HSA-NEXT: s_movk_i32 s9, 0x70 -; GCN-HSA-NEXT: s_movk_i32 s10, 0x60 -; GCN-HSA-NEXT: s_movk_i32 s8, 0x50 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_add_u32 s4, s2, s9 +; GCN-HSA-NEXT: s_add_u32 s4, s2, 0x70 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 -; GCN-HSA-NEXT: s_add_u32 s4, s2, s10 +; GCN-HSA-NEXT: s_add_u32 s4, s2, 0x60 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s5 ; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1] ; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[2:3] -; GCN-HSA-NEXT: s_add_u32 s4, s2, s8 +; GCN-HSA-NEXT: s_add_u32 s4, s2, 0x50 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 @@ -4479,50 +4474,50 @@ ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[12:15] ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xa0 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 16, v11 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v12, 16, v10 ; GCN-HSA-NEXT: v_bfe_i32 v13, v11, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v11, v10, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[11:14] +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xb0 +; 
GCN-HSA-NEXT: v_mov_b32_e32 v24, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(8) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 16, v4 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 16, v5 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 16, v4 ; GCN-HSA-NEXT: v_bfe_i32 v10, v5, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v8, v4, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xa0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 16, v7 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 16, v6 -; GCN-HSA-NEXT: v_bfe_i32 v14, v7, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v12, v6, 0, 16 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[8:11] -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 16, v1 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s5 +; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[8:11] +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s4 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 16, v7 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 16, v6 +; GCN-HSA-NEXT: v_bfe_i32 v9, v7, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v7, v6, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[7:10] ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 16, v0 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 16, v1 ; GCN-HSA-NEXT: v_bfe_i32 v6, v1, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v4, v0, 0, 16 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 16, v3 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 16, v2 ; GCN-HSA-NEXT: v_bfe_i32 v10, v3, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v8, v2, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xb0 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[12:15] -; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[4:7] -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; 
GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[4:7] +; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[8:11] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; GCN-HSA-NEXT: s_waitcnt vmcnt(11) ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v17 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v16 @@ -4532,7 +4527,7 @@ ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, s10 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v19 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v18 ; GCN-HSA-NEXT: v_bfe_i32 v2, v19, 0, 16 @@ -4541,7 +4536,7 @@ ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, s9 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 @@ -4554,7 +4549,7 @@ ; GCN-HSA-NEXT: v_bfe_i32 v10, v23, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v8, v22, 0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, s8 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 ; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; GCN-HSA-NEXT: s_waitcnt vmcnt(12) ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 16, v29 @@ -6592,8 +6587,7 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s4 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x50 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, 0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v5 ; GCN-HSA-NEXT: v_and_b32_e32 v9, s6, v5 @@ -6601,49 +6595,52 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v14, 
s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s4 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x70 -; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v1 ; GCN-HSA-NEXT: v_and_b32_e32 v9, s6, v1 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 +; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[13:14], v[9:12] ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s5 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s4 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 32 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v3 ; GCN-HSA-NEXT: v_and_b32_e32 v9, s6, v3 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s3 +; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[13:14], v[9:12] -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s4 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v6 +; GCN-HSA-NEXT: v_and_b32_e32 v9, s6, v6 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s3 +; GCN-HSA-NEXT: flat_store_dwordx4 v[13:14], v[9:12] +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s2 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v7 ; GCN-HSA-NEXT: v_and_b32_e32 v7, s6, v7 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, 0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v6 -; GCN-HSA-NEXT: v_and_b32_e32 v11, s6, v6 -; GCN-HSA-NEXT: flat_store_dwordx4 v[15:16], v[7:10] -; GCN-HSA-NEXT: flat_store_dwordx4 v[17:18], v[11:14] -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s1 +; GCN-HSA-NEXT: flat_store_dwordx4 v[5:6], v[7:10] +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s1 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 0x60 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, v8 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, v8 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, v8 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, v14 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, v8 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; 
GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v2 -; GCN-HSA-NEXT: v_and_b32_e32 v11, s6, v2 -; GCN-HSA-NEXT: v_and_b32_e32 v6, s6, v0 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, 0 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; GCN-HSA-NEXT: v_and_b32_e32 v8, s6, v2 +; GCN-HSA-NEXT: v_and_b32_e32 v12, s6, v0 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v2, 16, v4 ; GCN-HSA-NEXT: v_and_b32_e32 v0, s6, v4 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, v14 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[15:16], v[0:3] -; GCN-HSA-NEXT: flat_store_dwordx4 v[17:18], v[6:9] -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[11:14] +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[5:6], v[0:3] +; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15] +; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[8:11] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_zextload_v16i16_to_v16i64: @@ -7509,70 +7506,74 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v22, s8 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[2:5] -; GCN-HSA-NEXT: v_mov_b32_e32 v7, 0 +; GCN-HSA-NEXT: v_and_b32_e32 v0, s16, v21 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v20 +; GCN-HSA-NEXT: v_and_b32_e32 v2, s16, v20 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s4 +; GCN-HSA-NEXT: flat_store_dwordx4 v[19:20], v[2:5] +; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xe0 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v18 ; GCN-HSA-NEXT: v_and_b32_e32 v2, s16, v18 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s1 -; GCN-HSA-NEXT: s_add_u32 s6, s0, 0xe0 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, v7 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, 0 ; 
GCN-HSA-NEXT: v_mov_b32_e32 v17, s0 -; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[17:18], v[2:5] -; GCN-HSA-NEXT: v_and_b32_e32 v0, s16, v21 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s3 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v16 ; GCN-HSA-NEXT: v_and_b32_e32 v2, s16, v16 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s6 -; GCN-HSA-NEXT: s_add_u32 s6, s0, 0xc0 -; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s4 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xc0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, 0 +; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[15:16], v[2:5] -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s2 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v14 ; GCN-HSA-NEXT: v_and_b32_e32 v2, s16, v14 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s6 -; GCN-HSA-NEXT: s_add_u32 s6, s0, 0xa0 -; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s4 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xa0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, 0 +; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[13:14], v[2:5] -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v12 ; GCN-HSA-NEXT: v_and_b32_e32 v2, s16, v12 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s6 -; GCN-HSA-NEXT: s_add_u32 s6, s0, 0x80 -; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s4 ; GCN-HSA-NEXT: flat_store_dwordx4 v[11:12], v[2:5] -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v10 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s16, v10 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 
v9, s6 -; GCN-HSA-NEXT: flat_store_dwordx4 v[9:10], v[2:5] -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v6 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v2, 16, v21 -; GCN-HSA-NEXT: flat_store_dwordx4 v[15:16], v[0:3] -; GCN-HSA-NEXT: v_and_b32_e32 v9, s16, v6 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s4 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v6, 16, v20 -; GCN-HSA-NEXT: v_and_b32_e32 v4, s16, v20 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, v1 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s5 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[4:7] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v2, 16, v21 +; GCN-HSA-NEXT: flat_store_dwordx4 v[17:18], v[0:3] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v15, 16, v10 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 -; GCN-HSA-NEXT: s_add_u32 s0, s0, 64 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 16, v8 -; GCN-HSA-NEXT: v_and_b32_e32 v12, s16, v8 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, v1 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, v7 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_and_b32_e32 v13, s16, v10 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, v1 +; GCN-HSA-NEXT: s_add_u32 s0, s0, 64 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; GCN-HSA-NEXT: v_and_b32_e32 v5, s16, v6 +; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[13:16] ; GCN-HSA-NEXT: v_mov_b32_e32 v10, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, v1 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[12:15] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v8 +; GCN-HSA-NEXT: v_and_b32_e32 v9, s16, v8 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, v7 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[9:12] +; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[9:12] +; GCN-HSA-NEXT: 
flat_store_dwordx4 v[0:1], v[5:8] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_zextload_v32i16_to_v32i64: diff --git a/llvm/test/CodeGen/AMDGPU/madak.ll b/llvm/test/CodeGen/AMDGPU/madak.ll --- a/llvm/test/CodeGen/AMDGPU/madak.ll +++ b/llvm/test/CodeGen/AMDGPU/madak.ll @@ -40,7 +40,6 @@ ; GCN-LABEL: {{^}}madak_2_use_f32: ; GFX9: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000 -; GFX10: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000 ; GFX6-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}} ; GFX6-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; GFX6-DAG: buffer_load_dword [[VC:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 @@ -50,10 +49,10 @@ ; GFX6-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000 ; GFX8-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000 ; GFX6_8_9-DAG: v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000 -; GFX10-MAD-DAG:v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000 -; FMA-DAG: v_fmaak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000 +; GFX10-MAD-DAG:v_mad_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000 +; FMA-DAG: v_fma_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000 ; MAD-DAG: v_mac_f32_e32 [[VK]], [[VA]], [[VC]] -; FMA-DAG: v_fmac_f32_e32 [[VK]], [[VA]], [[VC]] +; FMA-DAG: v_fma_f32 {{v[0-9]+}}, [[VA]], [[VC]], 0x41200000 ; GCN: s_endpgm define amdgpu_kernel void @madak_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #0 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone diff --git a/llvm/test/CodeGen/AMDGPU/max.ll b/llvm/test/CodeGen/AMDGPU/max.ll --- a/llvm/test/CodeGen/AMDGPU/max.ll +++ b/llvm/test/CodeGen/AMDGPU/max.ll @@ -228,7 +228,9 @@ ; FUNC-LABEL: {{^}}simplify_demanded_bits_test_umax_ugt_i16: ; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0x13 ; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0x1c -; SI: s_max_u32 [[MAX:s[0-9]+]], [[A]], 
[[B]] +; SI-DAG: s_and_b32 [[A16:s[0-9]+]], [[A]], 0xffff +; SI-DAG: s_and_b32 [[B16:s[0-9]+]], [[B]], 0xffff +; SI: s_max_u32 [[MAX:s[0-9]+]], [[A16]], [[B16]] ; SI: v_mov_b32_e32 [[VMAX:v[0-9]+]], [[MAX]] ; SI: buffer_store_dword [[VMAX]] diff --git a/llvm/test/CodeGen/AMDGPU/mul.ll b/llvm/test/CodeGen/AMDGPU/mul.ll --- a/llvm/test/CodeGen/AMDGPU/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/mul.ll @@ -73,7 +73,7 @@ ; FUNC-LABEL: {{^}}mul64_sext_c: ; EG-DAG: MULLO_INT ; EG-DAG: MULHI_INT -; SI-DAG: s_mul_i32 +; SI-DAG: s_mulk_i32 ; SI-DAG: v_mul_hi_i32 ; VI: v_mad_i64_i32 define amdgpu_kernel void @mul64_sext_c(i64 addrspace(1)* %out, i32 %in) { diff --git a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll --- a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll +++ b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll @@ -12,11 +12,10 @@ ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0xffffff ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s4, s4, s2 -; SI-NEXT: s_and_b32 s2, s5, s2 -; SI-NEXT: s_mul_i32 s4, s4, s2 +; SI-NEXT: s_and_b32 s2, s4, 0xffffff +; SI-NEXT: s_and_b32 s4, s5, 0xffffff +; SI-NEXT: s_mul_i32 s4, s2, s4 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -26,12 +25,11 @@ ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c -; VI-NEXT: s_mov_b32 s2, 0xffffff ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s0, s0, s2 -; VI-NEXT: s_and_b32 s1, s1, s2 +; VI-NEXT: s_and_b32 s0, s0, 0xffffff +; VI-NEXT: s_and_b32 s1, s1, 0xffffff ; VI-NEXT: s_mul_i32 s0, s0, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -41,13 +39,12 @@ ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[4:5], 
s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GFX9-NEXT: s_mov_b32 s0, 0xffffff ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s1, s2, s0 -; GFX9-NEXT: s_and_b32 s0, s3, s0 -; GFX9-NEXT: s_mul_i32 s0, s1, s0 +; GFX9-NEXT: s_and_b32 s0, s2, 0xffffff +; GFX9-NEXT: s_and_b32 s1, s3, 0xffffff +; GFX9-NEXT: s_mul_i32 s0, s0, s1 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm @@ -417,13 +414,12 @@ ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GFX9-NEXT: s_mov_b32 s0, 0xffffff ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s1, s2, s0 -; GFX9-NEXT: s_and_b32 s0, s3, s0 -; GFX9-NEXT: s_mul_hi_u32 s0, s1, s0 +; GFX9-NEXT: s_and_b32 s0, s2, 0xffffff +; GFX9-NEXT: s_and_b32 s1, s3, 0xffffff +; GFX9-NEXT: s_mul_hi_u32 s0, s0, s1 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm @@ -479,12 +475,11 @@ ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s4, 0xffffff ; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: s_and_b32 s5, s6, s4 +; GFX9-NEXT: s_and_b32 s4, s6, 0xffffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s4, s7, s4 -; GFX9-NEXT: s_mul_hi_u32 s4, s5, s4 +; GFX9-NEXT: s_and_b32 s5, s7, 0xffffff +; GFX9-NEXT: s_mul_hi_u32 s4, s4, s5 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm @@ -507,12 +502,11 @@ ; SI-NEXT: s_load_dword s7, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s8, 0xffffff ; SI-NEXT: s_mov_b32 s0, s4 ; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_and_b32 s4, s6, s8 +; SI-NEXT: s_and_b32 s4, s6, 0xffffff ; SI-NEXT: s_waitcnt 
lgkmcnt(0) -; SI-NEXT: s_and_b32 s5, s7, s8 +; SI-NEXT: s_and_b32 s5, s7, 0xffffff ; SI-NEXT: v_mov_b32_e32 v0, s7 ; SI-NEXT: s_mul_i32 s4, s4, s5 ; SI-NEXT: v_mul_hi_u32_u24_e32 v1, s6, v0 @@ -545,14 +539,13 @@ ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s4, 0xffffff ; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: s_and_b32 s5, s6, s4 +; GFX9-NEXT: s_and_b32 s4, s6, 0xffffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s4, s7, s4 -; GFX9-NEXT: s_mul_hi_u32 s6, s5, s4 -; GFX9-NEXT: s_mul_i32 s5, s5, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: s_and_b32 s5, s7, 0xffffff +; GFX9-NEXT: s_mul_hi_u32 s6, s4, s5 +; GFX9-NEXT: s_mul_i32 s4, s4, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm @@ -652,12 +645,11 @@ ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0xffff ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s4, s4, s2 -; SI-NEXT: s_and_b32 s2, s5, s2 -; SI-NEXT: s_mul_i32 s4, s4, s2 -; SI-NEXT: s_lshr_b32 s4, s4, 16 +; SI-NEXT: s_and_b32 s2, s4, 0xffff +; SI-NEXT: s_and_b32 s4, s5, 0xffff +; SI-NEXT: s_mul_i32 s2, s2, s4 +; SI-NEXT: s_lshr_b32 s4, s2, 16 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 @@ -667,12 +659,11 @@ ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c -; VI-NEXT: s_mov_b32 s2, 0xffff ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s0, s0, s2 -; VI-NEXT: s_and_b32 s1, s1, s2 +; VI-NEXT: s_and_b32 s0, s0, 0xffff +; VI-NEXT: s_and_b32 s1, s1, 0xffff ; VI-NEXT: s_mul_i32 s0, s0, s1 ; VI-NEXT: s_lshr_b32 s0, s0, 16 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -683,12 
+674,11 @@ ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c -; GFX9-NEXT: s_mov_b32 s0, 0xffff ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s1, s4, s0 -; GFX9-NEXT: s_and_b32 s0, s5, s0 -; GFX9-NEXT: s_mul_i32 s0, s1, s0 +; GFX9-NEXT: s_and_b32 s0, s4, 0xffff +; GFX9-NEXT: s_and_b32 s1, s5, 0xffff +; GFX9-NEXT: s_mul_i32 s0, s0, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_store_short_d16_hi v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm @@ -710,15 +700,14 @@ ; SI-NEXT: s_load_dword s0, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s1, 0xffffff ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s3, s2, s1 -; SI-NEXT: s_and_b32 s1, s0, s1 +; SI-NEXT: s_and_b32 s1, s2, 0xffffff +; SI-NEXT: s_and_b32 s3, s0, 0xffffff ; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: v_mul_hi_u32_u24_e32 v0, s2, v0 -; SI-NEXT: s_mul_i32 s3, s3, s1 +; SI-NEXT: s_mul_i32 s1, s1, s3 ; SI-NEXT: v_and_b32_e32 v1, 1, v0 -; SI-NEXT: v_mov_b32_e32 v0, s3 +; SI-NEXT: v_mov_b32_e32 v0, s1 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -742,14 +731,13 @@ ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX9-NEXT: s_load_dword s3, s[0:1], 0x34 -; GFX9-NEXT: s_mov_b32 s0, 0xffffff ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s1, s2, s0 -; GFX9-NEXT: s_and_b32 s0, s3, s0 -; GFX9-NEXT: s_mul_i32 s2, s1, s0 -; GFX9-NEXT: s_mul_hi_u32 s0, s1, s0 +; GFX9-NEXT: s_and_b32 s0, s2, 0xffffff +; GFX9-NEXT: s_and_b32 s1, s3, 0xffffff +; GFX9-NEXT: s_mul_i32 s2, s0, s1 +; GFX9-NEXT: s_mul_hi_u32 s0, s0, s1 ; GFX9-NEXT: s_and_b32 s0, s0, 1 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 @@ -800,13 +788,12 @@ ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX9-NEXT: 
s_load_dword s2, s[0:1], 0x2c ; GFX9-NEXT: s_load_dword s3, s[0:1], 0x34 -; GFX9-NEXT: s_mov_b32 s0, 0xffffff ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s1, s2, s0 -; GFX9-NEXT: s_and_b32 s0, s3, s0 -; GFX9-NEXT: s_mul_hi_u32 s0, s1, s0 +; GFX9-NEXT: s_and_b32 s0, s2, 0xffffff +; GFX9-NEXT: s_and_b32 s1, s3, 0xffffff +; GFX9-NEXT: s_mul_hi_u32 s0, s0, s1 ; GFX9-NEXT: s_and_b32 s0, s0, 1 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 diff --git a/llvm/test/CodeGen/AMDGPU/or.ll b/llvm/test/CodeGen/AMDGPU/or.ll --- a/llvm/test/CodeGen/AMDGPU/or.ll +++ b/llvm/test/CodeGen/AMDGPU/or.ll @@ -80,8 +80,8 @@ ; SI-DAG: s_movk_i32 s[[K_LO:[0-9]+]], 0x3039 ; SI: s_or_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}} -; SI: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, s[[K_LO]] -; SI: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, s[[K_HI]] +; SI: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x3039 +; SI: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0xf237b define amdgpu_kernel void @scalar_or_literal_multi_use_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a, [8 x i32], i64 %b) { %or = or i64 %a, 4261135838621753 store i64 %or, i64 addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll --- a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll +++ b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll @@ -565,8 +565,7 @@ } ; GCN-LABEL: {{^}}fneg_v2f32_scalar: -; GCN: s_brev_b32 [[SIGN:s[0-9]+]], 1 -; GCN-COUNT-2: s_xor_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[SIGN]] +; GCN-COUNT-2: s_xor_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000 define amdgpu_kernel void @fneg_v2f32_scalar(<2 x float> addrspace(1)* %a, <2 x float> %x) { %fneg = fsub <2 x float> , %x store <2 x float> %fneg, <2 x float> addrspace(1)* %a, align 8 diff --git a/llvm/test/CodeGen/AMDGPU/s_addk_i32.ll b/llvm/test/CodeGen/AMDGPU/s_addk_i32.ll --- a/llvm/test/CodeGen/AMDGPU/s_addk_i32.ll 
+++ b/llvm/test/CodeGen/AMDGPU/s_addk_i32.ll @@ -16,11 +16,9 @@ ret void } -; FIXME: This should be folded with any number of uses. ; SI-LABEL: {{^}}s_addk_i32_k0_x2: -; SI: s_movk_i32 [[K:s[0-9]+]], 0x41 -; SI-DAG: s_add_i32 {{s[0-9]+}}, {{s[0-9]+}}, [[K]] -; SI-DAG: s_add_i32 {{s[0-9]+}}, {{s[0-9]+}}, [[K]] +; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x41 +; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x41 ; SI: s_endpgm define amdgpu_kernel void @s_addk_i32_k0_x2(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %a, i32 %b) { %add0 = add i32 %a, 65 diff --git a/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll b/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll --- a/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll +++ b/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll @@ -485,10 +485,9 @@ } ; GCN-LABEL: {{^}}phi_imm_in_sgprs -; GCN: s_movk_i32 [[A:s[0-9]+]], 0x400 ; GCN: s_movk_i32 [[B:s[0-9]+]], 0x400 ; GCN: [[LOOP_LABEL:.L[0-9a-zA-Z_]+]]: -; GCN: s_xor_b32 [[B]], [[B]], [[A]] +; GCN: s_xor_b32 [[B]], [[B]], 0x400 ; GCN: s_cbranch_scc{{[01]}} [[LOOP_LABEL]] define amdgpu_kernel void @phi_imm_in_sgprs(i32 addrspace(3)* %out, i32 %cond) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/scratch-buffer.ll b/llvm/test/CodeGen/AMDGPU/scratch-buffer.ll --- a/llvm/test/CodeGen/AMDGPU/scratch-buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/scratch-buffer.ll @@ -48,9 +48,8 @@ ; GCN-LABEL: {{^}}legal_offset_fi_offset: ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offen{{$}} -; This constant isn't folded, because it has multiple uses. 
; GCN-DAG: v_mov_b32_e32 [[K8000:v[0-9]+]], 0x8004 -; GCN-DAG: v_add_{{[iu]}}32_e32 [[OFFSET:v[0-9]+]], vcc, [[K8000]] +; GCN-DAG: v_add_{{[iu]}}32_e32 [[OFFSET:v[0-9]+]], vcc, 0x8004 ; GCN: buffer_store_dword v{{[0-9]+}}, [[OFFSET]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offen{{$}} define amdgpu_kernel void @legal_offset_fi_offset(i32 addrspace(1)* %out, i32 %cond, i32 addrspace(1)* %offsets, i32 %if_offset, i32 %else_offset) { diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll --- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll @@ -21,79 +21,78 @@ ; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_ashr_i32 s12, s3, 31 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; GCN-NEXT: v_mul_f32_e32 v2, 0x2f800000, v0 -; GCN-NEXT: v_trunc_f32_e32 v2, v2 -; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v2 -; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; GCN-NEXT: v_trunc_f32_e32 v1, v1 +; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 +; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: s_add_u32 s2, s2, s12 ; GCN-NEXT: s_mov_b32 s13, s12 -; GCN-NEXT: v_mul_lo_u32 v3, s4, v2 -; GCN-NEXT: v_mul_hi_u32 v4, s4, v0 -; GCN-NEXT: v_mul_lo_u32 v6, s5, v0 -; GCN-NEXT: v_mul_lo_u32 v5, s4, v0 +; GCN-NEXT: v_mul_lo_u32 v2, s4, v1 +; GCN-NEXT: v_mul_hi_u32 v3, s4, v0 +; GCN-NEXT: v_mul_lo_u32 v5, s5, v0 +; GCN-NEXT: v_mul_lo_u32 v4, s4, v0 ; GCN-NEXT: s_addc_u32 s3, s3, s12 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; GCN-NEXT: v_mul_hi_u32 v3, v0, v4 +; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v7, v0, v2 +; GCN-NEXT: v_mul_lo_u32 v6, v1, v4 +; GCN-NEXT: v_mul_hi_u32 v4, v1, v4 +; GCN-NEXT: v_mul_hi_u32 v8, v1, v2 +; 
GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GCN-NEXT: v_mul_hi_u32 v4, v0, v5 -; GCN-NEXT: v_mul_lo_u32 v6, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v8, v0, v3 -; GCN-NEXT: v_mul_lo_u32 v7, v2, v5 -; GCN-NEXT: v_mul_hi_u32 v5, v2, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v8, vcc -; GCN-NEXT: v_mul_hi_u32 v8, v2, v3 -; GCN-NEXT: v_mul_lo_u32 v3, v2, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v6, v5, vcc -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v8, v1, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc -; GCN-NEXT: v_mul_lo_u32 v3, s4, v2 -; GCN-NEXT: v_mul_hi_u32 v4, s4, v0 -; GCN-NEXT: v_mul_lo_u32 v5, s5, v0 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc +; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GCN-NEXT: v_mul_lo_u32 v2, s4, v1 +; GCN-NEXT: v_mul_hi_u32 v3, s4, v0 +; GCN-NEXT: v_mul_lo_u32 v4, s5, v0 ; GCN-NEXT: s_xor_b64 s[2:3], s[2:3], s[12:13] ; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_mul_lo_u32 v4, s4, v0 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GCN-NEXT: v_mul_lo_u32 v7, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v8, v0, v4 -; GCN-NEXT: v_mul_hi_u32 v9, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v6, v2, v4 -; GCN-NEXT: v_mul_lo_u32 v4, v2, v4 -; GCN-NEXT: v_mul_hi_u32 v5, v2, v3 -; GCN-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v9, vcc -; GCN-NEXT: v_mul_lo_u32 v3, v2, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v8, v6, vcc -; GCN-NEXT: 
v_addc_u32_e32 v5, vcc, v5, v1, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc -; GCN-NEXT: v_mul_lo_u32 v3, s2, v2 -; GCN-NEXT: v_mul_hi_u32 v4, s2, v0 -; GCN-NEXT: v_mul_hi_u32 v5, s2, v2 -; GCN-NEXT: v_mul_hi_u32 v6, s3, v2 -; GCN-NEXT: v_mul_lo_u32 v2, s3, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc -; GCN-NEXT: v_mul_lo_u32 v5, s3, v0 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_mul_lo_u32 v3, s4, v0 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GCN-NEXT: v_mul_lo_u32 v6, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v7, v0, v3 +; GCN-NEXT: v_mul_hi_u32 v8, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v5, v1, v3 +; GCN-NEXT: v_mul_lo_u32 v3, v1, v3 +; GCN-NEXT: v_mul_hi_u32 v4, v1, v2 +; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc +; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v6, v3 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc +; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GCN-NEXT: v_mul_lo_u32 v2, s2, v1 +; GCN-NEXT: v_mul_hi_u32 v3, s2, v0 +; GCN-NEXT: v_mul_hi_u32 v4, s2, v1 +; GCN-NEXT: v_mul_hi_u32 v5, s3, v1 +; GCN-NEXT: v_mul_lo_u32 v1, s3, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GCN-NEXT: v_mul_lo_u32 v4, s3, v0 ; GCN-NEXT: v_mul_hi_u32 v0, s3, v0 ; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GCN-NEXT: v_addc_u32_e32 v0, vcc, v4, v0, vcc -; GCN-NEXT: v_addc_u32_e32 v1, vcc, v6, v1, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GCN-NEXT: 
v_addc_u32_e32 v0, vcc, v3, v0, vcc +; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc ; GCN-NEXT: v_mul_lo_u32 v2, s10, v1 ; GCN-NEXT: v_mul_hi_u32 v3, s10, v0 ; GCN-NEXT: v_mul_lo_u32 v4, s11, v0 @@ -188,30 +187,29 @@ ; GCN-IR-NEXT: s_cbranch_vccz .LBB0_4 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: s_lshr_b64 s[18:19], s[10:11], s18 -; GCN-IR-NEXT: s_add_u32 s20, s6, -1 -; GCN-IR-NEXT: s_addc_u32 s21, s7, -1 +; GCN-IR-NEXT: s_add_u32 s17, s6, -1 +; GCN-IR-NEXT: s_addc_u32 s20, s7, -1 ; GCN-IR-NEXT: s_not_b64 s[8:9], s[14:15] ; GCN-IR-NEXT: s_add_u32 s10, s8, s16 -; GCN-IR-NEXT: s_mov_b32 s17, s15 -; GCN-IR-NEXT: s_addc_u32 s11, s9, s15 +; GCN-IR-NEXT: s_addc_u32 s11, s9, 0 ; GCN-IR-NEXT: s_mov_b64 s[14:15], 0 ; GCN-IR-NEXT: s_mov_b32 s9, 0 ; GCN-IR-NEXT: .LBB0_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: s_lshl_b64 s[16:17], s[18:19], 1 +; GCN-IR-NEXT: s_lshl_b64 s[18:19], s[18:19], 1 ; GCN-IR-NEXT: s_lshr_b32 s8, s13, 31 ; GCN-IR-NEXT: s_lshl_b64 s[12:13], s[12:13], 1 -; GCN-IR-NEXT: s_or_b64 s[16:17], s[16:17], s[8:9] +; GCN-IR-NEXT: s_or_b64 s[18:19], s[18:19], s[8:9] ; GCN-IR-NEXT: s_or_b64 s[12:13], s[14:15], s[12:13] -; GCN-IR-NEXT: s_sub_u32 s8, s20, s16 -; GCN-IR-NEXT: s_subb_u32 s8, s21, s17 +; GCN-IR-NEXT: s_sub_u32 s8, s17, s18 +; GCN-IR-NEXT: s_subb_u32 s8, s20, s19 ; GCN-IR-NEXT: s_ashr_i32 s14, s8, 31 ; GCN-IR-NEXT: s_mov_b32 s15, s14 ; GCN-IR-NEXT: s_and_b32 s8, s14, 1 -; GCN-IR-NEXT: s_and_b64 s[18:19], s[14:15], s[6:7] -; GCN-IR-NEXT: s_sub_u32 s18, s16, s18 +; GCN-IR-NEXT: s_and_b64 s[22:23], s[14:15], s[6:7] +; GCN-IR-NEXT: s_sub_u32 s18, s18, s22 ; GCN-IR-NEXT: v_mov_b32_e32 v0, s10 -; GCN-IR-NEXT: s_subb_u32 s19, s17, s19 +; GCN-IR-NEXT: s_subb_u32 s19, s19, s23 ; GCN-IR-NEXT: v_mov_b32_e32 v1, s11 ; GCN-IR-NEXT: s_add_u32 s10, s10, 1 ; GCN-IR-NEXT: s_addc_u32 s11, s11, 0 @@ -261,7 +259,6 @@ ; 
GCN-NEXT: v_subb_u32_e32 v8, vcc, 0, v3, vcc ; GCN-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 ; GCN-NEXT: v_rcp_f32_e32 v5, v5 -; GCN-NEXT: v_mov_b32_e32 v14, 0 ; GCN-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 ; GCN-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5 ; GCN-NEXT: v_trunc_f32_e32 v6, v6 @@ -277,7 +274,7 @@ ; GCN-NEXT: v_mul_lo_u32 v11, v5, v9 ; GCN-NEXT: v_mul_hi_u32 v12, v5, v10 ; GCN-NEXT: v_mul_hi_u32 v13, v5, v9 -; GCN-NEXT: v_mul_hi_u32 v15, v6, v9 +; GCN-NEXT: v_mul_hi_u32 v14, v6, v9 ; GCN-NEXT: v_mul_lo_u32 v9, v6, v9 ; GCN-NEXT: v_add_i32_e32 v11, vcc, v12, v11 ; GCN-NEXT: v_addc_u32_e32 v12, vcc, 0, v13, vcc @@ -285,7 +282,7 @@ ; GCN-NEXT: v_mul_hi_u32 v10, v6, v10 ; GCN-NEXT: v_add_i32_e32 v11, vcc, v11, v13 ; GCN-NEXT: v_addc_u32_e32 v10, vcc, v12, v10, vcc -; GCN-NEXT: v_addc_u32_e32 v11, vcc, v15, v14, vcc +; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v14, vcc ; GCN-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v11, vcc ; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v9 @@ -307,7 +304,7 @@ ; GCN-NEXT: v_mul_lo_u32 v8, v6, v8 ; GCN-NEXT: v_add_i32_e32 v7, vcc, v11, v7 ; GCN-NEXT: v_addc_u32_e32 v7, vcc, v12, v10, vcc -; GCN-NEXT: v_addc_u32_e32 v9, vcc, v9, v14, vcc +; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc ; GCN-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v9, vcc ; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v7 @@ -328,7 +325,7 @@ ; GCN-NEXT: v_mul_lo_u32 v6, v1, v6 ; GCN-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; GCN-NEXT: v_addc_u32_e32 v5, vcc, v9, v5, vcc -; GCN-NEXT: v_addc_u32_e32 v8, vcc, v11, v14, vcc +; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v11, vcc ; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v8, vcc ; GCN-NEXT: v_mul_lo_u32 v8, v2, v6 @@ -402,12 +399,10 @@ ; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[4:5], 63, v[7:8] ; GCN-IR-NEXT: s_or_b64 s[6:7], s[6:7], vcc ; GCN-IR-NEXT: s_xor_b64 s[8:9], s[6:7], -1 -; GCN-IR-NEXT: v_mov_b32_e32 v17, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v6, v4 ; 
GCN-IR-NEXT: v_mov_b32_e32 v1, v5 ; GCN-IR-NEXT: v_cndmask_b32_e64 v10, v12, 0, s[6:7] ; GCN-IR-NEXT: s_and_b64 s[4:5], s[8:9], s[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v16, v17 ; GCN-IR-NEXT: v_cndmask_b32_e64 v9, v11, 0, s[6:7] ; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GCN-IR-NEXT: s_cbranch_execz .LBB1_6 @@ -428,10 +423,10 @@ ; GCN-IR-NEXT: v_addc_u32_e32 v19, vcc, -1, v3, vcc ; GCN-IR-NEXT: v_not_b32_e32 v0, v0 ; GCN-IR-NEXT: v_lshr_b64 v[14:15], v[11:12], v14 -; GCN-IR-NEXT: v_not_b32_e32 v9, v17 +; GCN-IR-NEXT: v_not_b32_e32 v9, 0 ; GCN-IR-NEXT: v_add_i32_e32 v11, vcc, v0, v13 -; GCN-IR-NEXT: v_addc_u32_e32 v12, vcc, v9, v16, vcc ; GCN-IR-NEXT: v_mov_b32_e32 v16, 0 +; GCN-IR-NEXT: v_addc_u32_e32 v12, vcc, 0, v9, vcc ; GCN-IR-NEXT: v_mov_b32_e32 v17, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 ; GCN-IR-NEXT: .LBB1_3: ; %udiv-do-while @@ -1046,30 +1041,29 @@ ; GCN-IR-NEXT: s_cbranch_vccz .LBB9_4 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: s_lshr_b64 s[18:19], s[10:11], s18 -; GCN-IR-NEXT: s_add_u32 s20, s6, -1 -; GCN-IR-NEXT: s_addc_u32 s21, s7, -1 +; GCN-IR-NEXT: s_add_u32 s17, s6, -1 +; GCN-IR-NEXT: s_addc_u32 s20, s7, -1 ; GCN-IR-NEXT: s_not_b64 s[8:9], s[14:15] ; GCN-IR-NEXT: s_add_u32 s10, s8, s16 -; GCN-IR-NEXT: s_mov_b32 s17, s15 -; GCN-IR-NEXT: s_addc_u32 s11, s9, s15 +; GCN-IR-NEXT: s_addc_u32 s11, s9, 0 ; GCN-IR-NEXT: s_mov_b64 s[14:15], 0 ; GCN-IR-NEXT: s_mov_b32 s9, 0 ; GCN-IR-NEXT: .LBB9_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: s_lshl_b64 s[16:17], s[18:19], 1 +; GCN-IR-NEXT: s_lshl_b64 s[18:19], s[18:19], 1 ; GCN-IR-NEXT: s_lshr_b32 s8, s13, 31 ; GCN-IR-NEXT: s_lshl_b64 s[12:13], s[12:13], 1 -; GCN-IR-NEXT: s_or_b64 s[16:17], s[16:17], s[8:9] +; GCN-IR-NEXT: s_or_b64 s[18:19], s[18:19], s[8:9] ; GCN-IR-NEXT: s_or_b64 s[12:13], s[14:15], s[12:13] -; GCN-IR-NEXT: s_sub_u32 s8, s20, s16 -; GCN-IR-NEXT: s_subb_u32 s8, s21, s17 +; GCN-IR-NEXT: s_sub_u32 s8, s17, s18 +; GCN-IR-NEXT: s_subb_u32 
s8, s20, s19 ; GCN-IR-NEXT: s_ashr_i32 s14, s8, 31 ; GCN-IR-NEXT: s_mov_b32 s15, s14 ; GCN-IR-NEXT: s_and_b32 s8, s14, 1 -; GCN-IR-NEXT: s_and_b64 s[18:19], s[14:15], s[6:7] -; GCN-IR-NEXT: s_sub_u32 s18, s16, s18 +; GCN-IR-NEXT: s_and_b64 s[22:23], s[14:15], s[6:7] +; GCN-IR-NEXT: s_sub_u32 s18, s18, s22 ; GCN-IR-NEXT: v_mov_b32_e32 v0, s10 -; GCN-IR-NEXT: s_subb_u32 s19, s17, s19 +; GCN-IR-NEXT: s_subb_u32 s19, s19, s23 ; GCN-IR-NEXT: v_mov_b32_e32 v1, s11 ; GCN-IR-NEXT: s_add_u32 s10, s10, 1 ; GCN-IR-NEXT: s_addc_u32 s11, s11, 0 @@ -1125,58 +1119,57 @@ ; GCN-NEXT: s_subb_u32 s5, 0, s3 ; GCN-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; GCN-NEXT: v_mul_f32_e32 v2, 0x2f800000, v0 -; GCN-NEXT: v_trunc_f32_e32 v2, v2 -; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v2 -; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; GCN-NEXT: v_trunc_f32_e32 v1, v1 +; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 +; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_lo_u32 v3, s4, v2 -; GCN-NEXT: v_mul_hi_u32 v4, s4, v0 -; GCN-NEXT: v_mul_lo_u32 v6, s5, v0 -; GCN-NEXT: v_mul_lo_u32 v5, s4, v0 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GCN-NEXT: v_mul_hi_u32 v4, v0, v5 -; GCN-NEXT: v_mul_lo_u32 v6, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v8, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v7, v2, v5 -; GCN-NEXT: v_mul_lo_u32 v5, v2, v5 -; GCN-NEXT: v_mul_hi_u32 v9, v2, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v8, vcc -; GCN-NEXT: v_mul_lo_u32 v3, v2, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v6, v7, vcc -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v9, v1, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3 -; GCN-NEXT: 
v_addc_u32_e32 v2, vcc, v2, v4, vcc -; GCN-NEXT: v_mul_lo_u32 v3, s4, v2 -; GCN-NEXT: v_mul_hi_u32 v4, s4, v0 +; GCN-NEXT: v_mul_lo_u32 v2, s4, v1 +; GCN-NEXT: v_mul_hi_u32 v3, s4, v0 ; GCN-NEXT: v_mul_lo_u32 v5, s5, v0 -; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; GCN-NEXT: v_mul_lo_u32 v4, s4, v0 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GCN-NEXT: v_mul_lo_u32 v7, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v8, v0, v4 -; GCN-NEXT: v_mul_hi_u32 v9, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v6, v2, v4 -; GCN-NEXT: v_mul_lo_u32 v4, v2, v4 -; GCN-NEXT: v_mul_hi_u32 v5, v2, v3 -; GCN-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v9, vcc -; GCN-NEXT: v_mul_lo_u32 v3, v2, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v8, v6, vcc -; GCN-NEXT: v_addc_u32_e32 v1, vcc, v5, v1, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; GCN-NEXT: v_mul_hi_u32 v3, v0, v4 +; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v7, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v6, v1, v4 +; GCN-NEXT: v_mul_lo_u32 v4, v1, v4 +; GCN-NEXT: v_mul_hi_u32 v8, v1, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v6, vcc +; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GCN-NEXT: v_mul_lo_u32 v2, s4, v1 +; GCN-NEXT: v_mul_hi_u32 v3, s4, v0 +; GCN-NEXT: v_mul_lo_u32 v4, s5, v0 +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; 
GCN-NEXT: v_mul_lo_u32 v3, s4, v0 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GCN-NEXT: v_mul_lo_u32 v6, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v7, v0, v3 +; GCN-NEXT: v_mul_hi_u32 v8, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v5, v1, v3 +; GCN-NEXT: v_mul_lo_u32 v3, v1, v3 +; GCN-NEXT: v_mul_hi_u32 v4, v1, v2 +; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc +; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v6, v3 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc +; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc ; GCN-NEXT: v_mul_lo_u32 v2, v1, 24 ; GCN-NEXT: v_mul_hi_u32 v0, v0, 24 ; GCN-NEXT: v_mul_hi_u32 v1, v1, 24 @@ -1328,7 +1321,6 @@ ; GCN-NEXT: v_subb_u32_e32 v6, vcc, 0, v1, vcc ; GCN-NEXT: v_mac_f32_e32 v3, 0x4f800000, v4 ; GCN-NEXT: v_rcp_f32_e32 v3, v3 -; GCN-NEXT: v_mov_b32_e32 v12, 0 ; GCN-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 ; GCN-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 ; GCN-NEXT: v_trunc_f32_e32 v4, v4 @@ -1344,7 +1336,7 @@ ; GCN-NEXT: v_mul_lo_u32 v9, v3, v7 ; GCN-NEXT: v_mul_hi_u32 v10, v3, v8 ; GCN-NEXT: v_mul_hi_u32 v11, v3, v7 -; GCN-NEXT: v_mul_hi_u32 v13, v4, v7 +; GCN-NEXT: v_mul_hi_u32 v12, v4, v7 ; GCN-NEXT: v_mul_lo_u32 v7, v4, v7 ; GCN-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v11, vcc @@ -1352,7 +1344,7 @@ ; GCN-NEXT: v_mul_hi_u32 v8, v4, v8 ; GCN-NEXT: v_add_i32_e32 v9, vcc, v9, v11 ; GCN-NEXT: v_addc_u32_e32 v8, vcc, v10, v8, vcc -; GCN-NEXT: v_addc_u32_e32 v9, vcc, v13, v12, vcc +; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v12, vcc ; GCN-NEXT: v_add_i32_e32 v7, vcc, v8, v7 ; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v9, vcc ; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v7 @@ -1374,7 +1366,7 @@ ; GCN-NEXT: v_mul_lo_u32 v6, v4, v6 ; GCN-NEXT: v_add_i32_e32 v5, vcc, v9, v5 ; GCN-NEXT: 
v_addc_u32_e32 v5, vcc, v10, v8, vcc -; GCN-NEXT: v_addc_u32_e32 v7, vcc, v7, v12, vcc +; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc ; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v7, vcc ; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 @@ -1435,25 +1427,24 @@ ; GCN-IR-NEXT: v_ffbh_u32_e32 v5, v1 ; GCN-IR-NEXT: v_min_u32_e32 v8, v4, v5 ; GCN-IR-NEXT: s_movk_i32 s6, 0xffc5 -; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, s6, v8 -; GCN-IR-NEXT: v_addc_u32_e64 v5, s[6:7], 0, -1, vcc +; GCN-IR-NEXT: v_add_i32_e32 v5, vcc, s6, v8 +; GCN-IR-NEXT: v_addc_u32_e64 v6, s[6:7], 0, -1, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[5:6] +; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[6:7], 63, v[5:6] ; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[4:5] -; GCN-IR-NEXT: v_cndmask_b32_e64 v6, 24, 0, s[4:5] +; GCN-IR-NEXT: v_cndmask_b32_e64 v7, 24, 0, s[4:5] ; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1 ; GCN-IR-NEXT: v_mov_b32_e32 v3, v2 -; GCN-IR-NEXT: v_mov_b32_e32 v7, v9 -; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], vcc +; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 +; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GCN-IR-NEXT: s_cbranch_execz .LBB11_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, 1, v4 -; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, 0, v5, vcc -; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[10:11], v[4:5] -; GCN-IR-NEXT: v_sub_i32_e64 v4, s[4:5], 63, v4 +; GCN-IR-NEXT: v_add_i32_e32 v9, vcc, 1, v5 +; GCN-IR-NEXT: v_addc_u32_e32 v10, vcc, 0, v6, vcc +; GCN-IR-NEXT: v_sub_i32_e64 v4, s[4:5], 63, v5 +; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[9:10], v[5:6] ; GCN-IR-NEXT: v_lshl_b64 v[4:5], 24, v4 ; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 @@ -1464,10 +1455,10 @@ ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; 
GCN-IR-NEXT: v_add_i32_e32 v14, vcc, -1, v0 ; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, -1, v1, vcc -; GCN-IR-NEXT: v_lshr_b64 v[10:11], 24, v10 +; GCN-IR-NEXT: v_lshr_b64 v[10:11], 24, v9 ; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, 58, v8 ; GCN-IR-NEXT: v_mov_b32_e32 v12, 0 -; GCN-IR-NEXT: v_subb_u32_e32 v9, vcc, 0, v9, vcc +; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], 0, 0, vcc ; GCN-IR-NEXT: v_mov_b32_e32 v13, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 ; GCN-IR-NEXT: .LBB11_3: ; %udiv-do-while @@ -1501,12 +1492,12 @@ ; GCN-IR-NEXT: .LBB11_5: ; %Flow3 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[4:5], 1 -; GCN-IR-NEXT: v_or_b32_e32 v7, v7, v1 -; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v0 +; GCN-IR-NEXT: v_or_b32_e32 v4, v7, v1 +; GCN-IR-NEXT: v_or_b32_e32 v7, v6, v0 ; GCN-IR-NEXT: .LBB11_6: ; %Flow4 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] -; GCN-IR-NEXT: v_xor_b32_e32 v0, v6, v2 -; GCN-IR-NEXT: v_xor_b32_e32 v1, v7, v3 +; GCN-IR-NEXT: v_xor_b32_e32 v0, v7, v2 +; GCN-IR-NEXT: v_xor_b32_e32 v1, v4, v3 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc ; GCN-IR-NEXT: s_setpc_b64 s[30:31] @@ -1529,7 +1520,6 @@ ; GCN-NEXT: v_subb_u32_e32 v6, vcc, 0, v1, vcc ; GCN-NEXT: v_mac_f32_e32 v3, 0x4f800000, v4 ; GCN-NEXT: v_rcp_f32_e32 v3, v3 -; GCN-NEXT: v_mov_b32_e32 v12, 0 ; GCN-NEXT: s_mov_b32 s4, 0x8000 ; GCN-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 ; GCN-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 @@ -1546,7 +1536,7 @@ ; GCN-NEXT: v_mul_lo_u32 v9, v3, v7 ; GCN-NEXT: v_mul_hi_u32 v10, v3, v8 ; GCN-NEXT: v_mul_hi_u32 v11, v3, v7 -; GCN-NEXT: v_mul_hi_u32 v13, v4, v7 +; GCN-NEXT: v_mul_hi_u32 v12, v4, v7 ; GCN-NEXT: v_mul_lo_u32 v7, v4, v7 ; GCN-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v11, vcc @@ -1554,7 +1544,7 @@ ; GCN-NEXT: v_mul_hi_u32 v8, v4, v8 ; GCN-NEXT: v_add_i32_e32 v9, vcc, v9, v11 ; GCN-NEXT: v_addc_u32_e32 v8, vcc, v10, v8, vcc -; GCN-NEXT: v_addc_u32_e32 v9, vcc, v13, v12, 
vcc +; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v12, vcc ; GCN-NEXT: v_add_i32_e32 v7, vcc, v8, v7 ; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v9, vcc ; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v7 @@ -1576,7 +1566,7 @@ ; GCN-NEXT: v_mul_lo_u32 v6, v4, v6 ; GCN-NEXT: v_add_i32_e32 v5, vcc, v9, v5 ; GCN-NEXT: v_addc_u32_e32 v5, vcc, v10, v8, vcc -; GCN-NEXT: v_addc_u32_e32 v7, vcc, v7, v12, vcc +; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc ; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v7, vcc ; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 @@ -1637,27 +1627,26 @@ ; GCN-IR-NEXT: v_ffbh_u32_e32 v5, v1 ; GCN-IR-NEXT: v_min_u32_e32 v8, v4, v5 ; GCN-IR-NEXT: s_movk_i32 s6, 0xffd0 -; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, s6, v8 -; GCN-IR-NEXT: v_addc_u32_e64 v5, s[6:7], 0, -1, vcc +; GCN-IR-NEXT: v_add_i32_e32 v5, vcc, s6, v8 +; GCN-IR-NEXT: v_addc_u32_e64 v6, s[6:7], 0, -1, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[4:5] -; GCN-IR-NEXT: s_mov_b64 s[8:9], 0x8000 -; GCN-IR-NEXT: v_mov_b32_e32 v6, s8 +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[5:6] +; GCN-IR-NEXT: v_mov_b32_e32 v7, 0x8000 ; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 -; GCN-IR-NEXT: v_cndmask_b32_e64 v6, v6, 0, s[4:5] +; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[5:6] +; GCN-IR-NEXT: v_cndmask_b32_e64 v7, v7, 0, s[4:5] ; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1 ; GCN-IR-NEXT: v_mov_b32_e32 v3, v2 -; GCN-IR-NEXT: v_mov_b32_e32 v7, v9 +; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 +; GCN-IR-NEXT: s_mov_b64 s[8:9], 0x8000 ; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], vcc ; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GCN-IR-NEXT: s_cbranch_execz .LBB12_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, 1, v4 -; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, 0, v5, vcc -; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[10:11], v[4:5] -; GCN-IR-NEXT: v_sub_i32_e64 v4, 
s[4:5], 63, v4 +; GCN-IR-NEXT: v_add_i32_e32 v9, vcc, 1, v5 +; GCN-IR-NEXT: v_addc_u32_e32 v10, vcc, 0, v6, vcc +; GCN-IR-NEXT: v_sub_i32_e64 v4, s[4:5], 63, v5 +; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[9:10], v[5:6] ; GCN-IR-NEXT: v_lshl_b64 v[4:5], s[8:9], v4 ; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 @@ -1669,10 +1658,10 @@ ; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, -1, v0 ; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000 ; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, -1, v1, vcc -; GCN-IR-NEXT: v_lshr_b64 v[10:11], s[4:5], v10 +; GCN-IR-NEXT: v_lshr_b64 v[10:11], s[4:5], v9 ; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, 47, v8 ; GCN-IR-NEXT: v_mov_b32_e32 v12, 0 -; GCN-IR-NEXT: v_subb_u32_e32 v9, vcc, 0, v9, vcc +; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], 0, 0, vcc ; GCN-IR-NEXT: v_mov_b32_e32 v13, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 ; GCN-IR-NEXT: .LBB12_3: ; %udiv-do-while @@ -1706,12 +1695,12 @@ ; GCN-IR-NEXT: .LBB12_5: ; %Flow3 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[4:5], 1 -; GCN-IR-NEXT: v_or_b32_e32 v7, v7, v1 -; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v0 +; GCN-IR-NEXT: v_or_b32_e32 v4, v7, v1 +; GCN-IR-NEXT: v_or_b32_e32 v7, v6, v0 ; GCN-IR-NEXT: .LBB12_6: ; %Flow4 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] -; GCN-IR-NEXT: v_xor_b32_e32 v0, v6, v2 -; GCN-IR-NEXT: v_xor_b32_e32 v1, v7, v3 +; GCN-IR-NEXT: v_xor_b32_e32 v0, v7, v2 +; GCN-IR-NEXT: v_xor_b32_e32 v1, v4, v3 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc ; GCN-IR-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll @@ -516,13 +516,15 @@ ; NOSDWA-NOT: v_or_b32_sdwa ; VI-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9_10-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, 
s{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX89-DAG: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}} ; ; GFX10-DAG: v_lshrrev_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; ; VI-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9_10-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX89-DAG: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}} ; ; GFX10-DAG: v_lshrrev_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD diff --git a/llvm/test/CodeGen/AMDGPU/setcc-opt.ll b/llvm/test/CodeGen/AMDGPU/setcc-opt.ll --- a/llvm/test/CodeGen/AMDGPU/setcc-opt.ll +++ b/llvm/test/CodeGen/AMDGPU/setcc-opt.ll @@ -156,12 +156,12 @@ ; FUNC-LABEL: {{^}}cmp_zext_k_i8max: ; SI: s_load_dword [[VALUE:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb ; VI: s_load_dword [[VALUE:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c -; GCN: s_movk_i32 [[K255:s[0-9]+]], 0xff -; SI-DAG: s_and_b32 [[B:s[0-9]+]], [[VALUE]], [[K255]] -; SI: s_cmp_lg_u32 [[B]], [[K255]] +; SI-DAG: s_and_b32 [[B:s[0-9]+]], [[VALUE]], 0xff +; SI: s_cmpk_lg_i32 [[B]], 0xff ; SI: s_cselect_b64 [[CC:[^,]+]], -1, 0 -; VI: v_mov_b32_e32 [[VK255:v[0-9]+]], [[K255]] +; VI: v_mov_b32_e32 [[VK255:v[0-9]+]], 
0xff +; VI: s_movk_i32 [[K255:s[0-9]+]], 0xff ; VI: v_and_b32_e32 [[B:v[0-9]+]], [[VALUE]], [[VK255]] ; VI: v_cmp_ne_u16_e32 vcc, [[K255]], [[B]] @@ -208,9 +208,8 @@ ; FUNC-LABEL: {{^}}cmp_sext_k_neg1_i8_arg: ; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb ; VI: s_load_dword [[VAL:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c -; GCN: s_movk_i32 [[K:s[0-9]+]], 0xff -; GCN-DAG: s_and_b32 [[B:s[0-9]+]], [[VAL]], [[K]] -; GCN: s_cmp_lg_u32 [[B]], [[K]]{{$}} +; GCN-DAG: s_and_b32 [[B:s[0-9]+]], [[VAL]], 0xff +; GCN: s_cmpk_lg_i32 [[B]], 0xff{{$}} ; GCN: s_cselect_b64 [[CC:[^,]+]], -1, 0 ; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CC]] ; GCN: buffer_store_byte [[RESULT]] diff --git a/llvm/test/CodeGen/AMDGPU/shift-i128.ll b/llvm/test/CodeGen/AMDGPU/shift-i128.ll --- a/llvm/test/CodeGen/AMDGPU/shift-i128.ll +++ b/llvm/test/CodeGen/AMDGPU/shift-i128.ll @@ -149,9 +149,9 @@ ; GCN-NEXT: s_mov_b64 s[4:5], 0x41 ; GCN-NEXT: v_lshr_b64 v[1:2], s[4:5], v0 ; GCN-NEXT: v_cmp_gt_u32_e32 vcc, 64, v0 -; GCN-NEXT: v_mov_b32_e32 v3, s4 ; GCN-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0 ; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v3, 0x41 ; GCN-NEXT: s_and_b64 vcc, s[4:5], vcc ; GCN-NEXT: v_cndmask_b32_e64 v0, v3, v1, s[4:5] ; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc diff --git a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll @@ -23,18 +23,17 @@ ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; VI-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-NEXT: s_load_dword s0, s[0:1], 0x30 -; VI-NEXT: s_mov_b32 s1, 0xffff ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s3, s2, s1 +; VI-NEXT: s_and_b32 s1, s2, 0xffff ; VI-NEXT: s_lshr_b32 s2, s2, 16 -; VI-NEXT: s_lshr_b32 s8, s0, 16 -; VI-NEXT: s_lshl_b32 s2, s2, s8 -; VI-NEXT: s_lshl_b32 s0, s3, s0 -; VI-NEXT: s_lshl_b32 s2, s2, 16 -; 
VI-NEXT: s_and_b32 s0, s0, s1 -; VI-NEXT: s_or_b32 s0, s0, s2 +; VI-NEXT: s_lshr_b32 s3, s0, 16 +; VI-NEXT: s_lshl_b32 s2, s2, s3 +; VI-NEXT: s_lshl_b32 s0, s1, s0 +; VI-NEXT: s_lshl_b32 s1, s2, 16 +; VI-NEXT: s_and_b32 s0, s0, 0xffff +; VI-NEXT: s_or_b32 s0, s0, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll --- a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll +++ b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll @@ -1822,18 +1822,17 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: v_mov_b32_e32 v4, 0xffffc400 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: s_movk_i32 s2, 0xc400 -; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u16_e32 v2, s2, v3 +; VI-NEXT: v_add_u16_e32 v2, 0xc400, v3 ; VI-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v2, v2, v3 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -1895,18 +1894,17 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: v_mov_b32_e32 v4, 0x4400 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: s_movk_i32 s2, 0x4400 -; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) 
-; VI-NEXT: v_add_u16_e32 v2, s2, v3 +; VI-NEXT: v_add_u16_e32 v2, 0x4400, v3 ; VI-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v2, v2, v3 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -1968,18 +1966,17 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: v_mov_b32_e32 v4, 0x4000 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: s_movk_i32 s2, 0x4000 -; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u16_e32 v2, s2, v3 +; VI-NEXT: v_add_u16_e32 v2, 0x4000, v3 ; VI-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v2, v2, v3 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -2041,18 +2038,17 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: v_mov_b32_e32 v4, 0xffffc000 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: s_movk_i32 s2, 0xc000 -; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u16_e32 v2, s2, v3 +; VI-NEXT: v_add_u16_e32 v2, 0xc000, v3 ; VI-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v2, v2, v3 ; VI-NEXT: flat_store_dword v[0:1], v2 diff --git a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll 
b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll --- a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll +++ b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll @@ -31,14 +31,13 @@ ; CHECK-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr10 ; CHECK-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr8 ; CHECK-NEXT: undef %71.sub0_sub1:sgpr_128 = S_LOAD_DWORDX2_IMM %56, 232, 0 :: (load (s64) from %ir.40, addrspace 4) - ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 ; CHECK-NEXT: [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY4]], 4, implicit-def dead $scc ; CHECK-NEXT: [[S_LSHL_B32_1:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY3]], 4, implicit-def dead $scc ; CHECK-NEXT: [[S_LSHL_B32_2:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY2]], 4, implicit-def dead $scc ; CHECK-NEXT: [[S_ASHR_I32_:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_]], 31, implicit-def dead $scc ; CHECK-NEXT: [[S_ASHR_I32_1:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_1]], 31, implicit-def dead $scc - ; CHECK-NEXT: %71.sub1:sgpr_128 = S_AND_B32 %71.sub1, [[S_MOV_B32_]], implicit-def dead $scc ; CHECK-NEXT: [[S_ASHR_I32_2:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_2]], 31, implicit-def dead $scc + ; CHECK-NEXT: %71.sub1:sgpr_128 = S_AND_B32 %71.sub1, 65535, implicit-def dead $scc ; CHECK-NEXT: undef %130.sub0:sreg_64 = S_ADD_U32 [[COPY5]], [[S_LSHL_B32_2]], implicit-def $scc ; CHECK-NEXT: %130.sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %130, 16, 0 :: (load (s128) from %ir.84, addrspace 4) @@ -195,7 +194,7 @@ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM20:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %335, 0, 0 :: (load (s128) from %ir.230, addrspace 4) ; CHECK-NEXT: [[COPY14:%[0-9]+]]:sgpr_128 = COPY %71 ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM21:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %343, 0, 0 :: (load (s128) from %ir.236, addrspace 4) - ; CHECK-NEXT: 
[[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_LOAD_DWORDX2_IMM]].sub1, [[S_MOV_B32_]], implicit-def dead $scc + ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_LOAD_DWORDX2_IMM]].sub1, 65535, implicit-def dead $scc ; CHECK-NEXT: [[COPY14]].sub0:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]].sub0 ; CHECK-NEXT: [[COPY14]].sub1:sgpr_128 = COPY [[S_AND_B32_]] ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM5:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY14]], 0, 0 :: (dereferenceable invariant load (s32)) @@ -211,7 +210,7 @@ ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN20:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM22]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from custom "BufferResource", align 1, addrspace 4) ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM %468, 0, 0 :: (load (s64) from %ir.320, addrspace 4) ; CHECK-NEXT: [[COPY15:%[0-9]+]]:sgpr_128 = COPY %71 - ; CHECK-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_LOAD_DWORDX2_IMM1]].sub1, [[S_MOV_B32_]], implicit-def dead $scc + ; CHECK-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_LOAD_DWORDX2_IMM1]].sub1, 65535, implicit-def dead $scc ; CHECK-NEXT: [[COPY15]].sub0:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM1]].sub0 ; CHECK-NEXT: [[COPY15]].sub1:sgpr_128 = COPY [[S_AND_B32_1]] ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY15]], 0, 0 :: (dereferenceable invariant load (s32)) @@ -231,7 +230,7 @@ ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN22:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM24]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from custom "BufferResource", align 1, addrspace 4) ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM24]] ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM23]] - ; CHECK-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_LOAD_DWORD_IMM1]], [[S_MOV_B32_]], implicit-def dead $scc + 
; CHECK-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_LOAD_DWORD_IMM1]], 65535, implicit-def dead $scc ; CHECK-NEXT: [[COPY16:%[0-9]+]]:sgpr_128 = COPY %71 ; CHECK-NEXT: [[COPY16]].sub1:sgpr_128 = COPY [[S_AND_B32_2]] ; CHECK-NEXT: [[COPY16]].sub0:sgpr_128 = COPY [[S_LOAD_DWORD_IMM2]] diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll --- a/llvm/test/CodeGen/AMDGPU/srem64.ll +++ b/llvm/test/CodeGen/AMDGPU/srem64.ll @@ -17,72 +17,71 @@ ; GCN-NEXT: s_mov_b32 s4, s8 ; GCN-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_mov_b32 s5, s9 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; GCN-NEXT: v_mul_f32_e32 v2, 0x2f800000, v0 -; GCN-NEXT: v_trunc_f32_e32 v2, v2 -; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v2 -; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; GCN-NEXT: v_trunc_f32_e32 v1, v1 +; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 +; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_lo_u32 v3, s0, v2 -; GCN-NEXT: v_mul_hi_u32 v4, s0, v0 -; GCN-NEXT: v_mul_lo_u32 v6, s1, v0 -; GCN-NEXT: v_mul_lo_u32 v5, s0, v0 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GCN-NEXT: v_mul_hi_u32 v4, v0, v5 -; GCN-NEXT: v_mul_lo_u32 v6, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v8, v0, v3 -; GCN-NEXT: v_mul_lo_u32 v7, v2, v5 -; GCN-NEXT: v_mul_hi_u32 v5, v2, v5 -; GCN-NEXT: v_mul_hi_u32 v9, v2, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v8, vcc -; GCN-NEXT: v_mul_lo_u32 v3, v2, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v6, v5, vcc -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v9, v1, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc -; GCN-NEXT: v_mul_lo_u32 
v3, s0, v2 -; GCN-NEXT: v_mul_hi_u32 v4, s0, v0 +; GCN-NEXT: v_mul_lo_u32 v2, s0, v1 +; GCN-NEXT: v_mul_hi_u32 v3, s0, v0 ; GCN-NEXT: v_mul_lo_u32 v5, s1, v0 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; GCN-NEXT: v_mul_lo_u32 v4, s0, v0 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GCN-NEXT: v_mul_lo_u32 v7, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v8, v0, v4 -; GCN-NEXT: v_mul_hi_u32 v9, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v6, v2, v4 -; GCN-NEXT: v_mul_lo_u32 v4, v2, v4 -; GCN-NEXT: v_mul_hi_u32 v5, v2, v3 -; GCN-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v9, vcc -; GCN-NEXT: v_mul_lo_u32 v3, v2, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v8, v6, vcc -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v5, v1, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc -; GCN-NEXT: v_mul_lo_u32 v3, s10, v2 -; GCN-NEXT: v_mul_hi_u32 v4, s10, v0 -; GCN-NEXT: v_mul_hi_u32 v5, s10, v2 -; GCN-NEXT: v_mul_hi_u32 v6, s11, v2 -; GCN-NEXT: v_mul_lo_u32 v2, s11, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc -; GCN-NEXT: v_mul_lo_u32 v5, s11, v0 -; GCN-NEXT: v_mul_hi_u32 v0, s11, v0 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; GCN-NEXT: v_mul_hi_u32 v3, v0, v4 +; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v7, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v6, v1, v4 +; GCN-NEXT: v_mul_lo_u32 v4, v1, v4 +; GCN-NEXT: v_mul_hi_u32 v8, v1, v2 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GCN-NEXT: v_addc_u32_e32 v0, vcc, v4, v0, vcc -; GCN-NEXT: v_addc_u32_e32 v1, vcc, v6, v1, vcc +; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v6, vcc +; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc +; GCN-NEXT: 
v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GCN-NEXT: v_mul_lo_u32 v2, s0, v1 +; GCN-NEXT: v_mul_hi_u32 v3, s0, v0 +; GCN-NEXT: v_mul_lo_u32 v4, s1, v0 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_mul_lo_u32 v3, s0, v0 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GCN-NEXT: v_mul_lo_u32 v6, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v7, v0, v3 +; GCN-NEXT: v_mul_hi_u32 v8, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v5, v1, v3 +; GCN-NEXT: v_mul_lo_u32 v3, v1, v3 +; GCN-NEXT: v_mul_hi_u32 v4, v1, v2 +; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc +; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v6, v3 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc +; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GCN-NEXT: v_mul_lo_u32 v2, s10, v1 +; GCN-NEXT: v_mul_hi_u32 v3, s10, v0 +; GCN-NEXT: v_mul_hi_u32 v4, s10, v1 +; GCN-NEXT: v_mul_hi_u32 v5, s11, v1 +; GCN-NEXT: v_mul_lo_u32 v1, s11, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GCN-NEXT: v_mul_lo_u32 v4, s11, v0 +; GCN-NEXT: v_mul_hi_u32 v0, s11, v0 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc +; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc ; GCN-NEXT: v_mul_lo_u32 v1, s12, v1 ; GCN-NEXT: v_mul_hi_u32 v2, s12, v0 ; GCN-NEXT: v_mul_lo_u32 v3, s13, v0 @@ -131,13 +130,13 @@ ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[0:1], 0 ; GCN-IR-NEXT: s_flbit_i32_b32 s12, s0 ; GCN-IR-NEXT: s_or_b64 s[14:15], s[8:9], s[10:11] -; 
GCN-IR-NEXT: s_flbit_i32_b32 s10, s6 +; GCN-IR-NEXT: s_flbit_i32_b32 s9, s6 ; GCN-IR-NEXT: s_add_i32 s12, s12, 32 ; GCN-IR-NEXT: s_flbit_i32_b32 s8, s1 -; GCN-IR-NEXT: s_add_i32 s10, s10, 32 -; GCN-IR-NEXT: s_flbit_i32_b32 s11, s7 +; GCN-IR-NEXT: s_add_i32 s9, s9, 32 +; GCN-IR-NEXT: s_flbit_i32_b32 s10, s7 ; GCN-IR-NEXT: s_min_u32 s8, s12, s8 -; GCN-IR-NEXT: s_min_u32 s12, s10, s11 +; GCN-IR-NEXT: s_min_u32 s12, s9, s10 ; GCN-IR-NEXT: s_sub_u32 s10, s8, s12 ; GCN-IR-NEXT: s_subb_u32 s11, 0, 0 ; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[16:17], s[10:11], 63 @@ -163,9 +162,8 @@ ; GCN-IR-NEXT: s_add_u32 s16, s0, -1 ; GCN-IR-NEXT: s_addc_u32 s17, s1, -1 ; GCN-IR-NEXT: s_not_b64 s[2:3], s[8:9] -; GCN-IR-NEXT: s_mov_b32 s13, s9 ; GCN-IR-NEXT: s_add_u32 s8, s2, s12 -; GCN-IR-NEXT: s_addc_u32 s9, s3, s9 +; GCN-IR-NEXT: s_addc_u32 s9, s3, 0 ; GCN-IR-NEXT: s_mov_b64 s[12:13], 0 ; GCN-IR-NEXT: s_mov_b32 s3, 0 ; GCN-IR-NEXT: .LBB0_3: ; %udiv-do-while @@ -238,7 +236,6 @@ ; GCN-NEXT: v_subb_u32_e32 v7, vcc, 0, v3, vcc ; GCN-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 ; GCN-NEXT: v_rcp_f32_e32 v4, v4 -; GCN-NEXT: v_mov_b32_e32 v13, 0 ; GCN-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; GCN-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 ; GCN-NEXT: v_trunc_f32_e32 v5, v5 @@ -254,7 +251,7 @@ ; GCN-NEXT: v_mul_lo_u32 v10, v4, v8 ; GCN-NEXT: v_mul_hi_u32 v11, v4, v9 ; GCN-NEXT: v_mul_hi_u32 v12, v4, v8 -; GCN-NEXT: v_mul_hi_u32 v14, v5, v8 +; GCN-NEXT: v_mul_hi_u32 v13, v5, v8 ; GCN-NEXT: v_mul_lo_u32 v8, v5, v8 ; GCN-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v12, vcc @@ -262,7 +259,7 @@ ; GCN-NEXT: v_mul_hi_u32 v9, v5, v9 ; GCN-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; GCN-NEXT: v_addc_u32_e32 v9, vcc, v11, v9, vcc -; GCN-NEXT: v_addc_u32_e32 v10, vcc, v14, v13, vcc +; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v13, vcc ; GCN-NEXT: v_add_i32_e32 v8, vcc, v9, v8 ; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc ; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v8 @@ -284,7 +281,7 @@ ; GCN-NEXT: 
v_mul_lo_u32 v7, v5, v7 ; GCN-NEXT: v_add_i32_e32 v6, vcc, v10, v6 ; GCN-NEXT: v_addc_u32_e32 v6, vcc, v11, v9, vcc -; GCN-NEXT: v_addc_u32_e32 v8, vcc, v8, v13, vcc +; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc ; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v7 ; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc ; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v6 @@ -305,7 +302,7 @@ ; GCN-NEXT: v_mul_lo_u32 v5, v1, v5 ; GCN-NEXT: v_add_i32_e32 v7, vcc, v7, v9 ; GCN-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc -; GCN-NEXT: v_addc_u32_e32 v7, vcc, v10, v13, vcc +; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v10, vcc ; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc ; GCN-NEXT: v_mul_lo_u32 v5, v2, v5 @@ -358,103 +355,101 @@ ; GCN-IR-NEXT: v_xor_b32_e32 v2, v2, v6 ; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc ; GCN-IR-NEXT: v_xor_b32_e32 v3, v3, v6 -; GCN-IR-NEXT: v_sub_i32_e32 v5, vcc, v2, v6 -; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, v3, v6, vcc -; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[5:6] +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, v2, v6 +; GCN-IR-NEXT: v_subb_u32_e32 v3, vcc, v3, v6, vcc +; GCN-IR-NEXT: v_ffbh_u32_e32 v6, v2 +; GCN-IR-NEXT: v_add_i32_e64 v6, s[6:7], 32, v6 +; GCN-IR-NEXT: v_ffbh_u32_e32 v7, v3 +; GCN-IR-NEXT: v_min_u32_e32 v10, v6, v7 +; GCN-IR-NEXT: v_ffbh_u32_e32 v6, v0 +; GCN-IR-NEXT: v_add_i32_e64 v6, s[6:7], 32, v6 +; GCN-IR-NEXT: v_ffbh_u32_e32 v7, v1 +; GCN-IR-NEXT: v_min_u32_e32 v11, v6, v7 +; GCN-IR-NEXT: v_sub_i32_e64 v7, s[6:7], v10, v11 +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] -; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v5 -; GCN-IR-NEXT: s_or_b64 s[6:7], vcc, s[4:5] -; GCN-IR-NEXT: v_add_i32_e32 v3, vcc, 32, v3 -; GCN-IR-NEXT: v_ffbh_u32_e32 v7, v6 -; GCN-IR-NEXT: v_min_u32_e32 v3, v3, v7 -; GCN-IR-NEXT: v_ffbh_u32_e32 v7, v0 -; GCN-IR-NEXT: v_add_i32_e32 v7, vcc, 32, v7 -; GCN-IR-NEXT: v_ffbh_u32_e32 v8, v1 -; GCN-IR-NEXT: v_min_u32_e32 v12, v7, v8 -; GCN-IR-NEXT: v_sub_i32_e32 
v7, vcc, v3, v12 -; GCN-IR-NEXT: v_subb_u32_e64 v8, s[4:5], 0, 0, vcc -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[7:8] -; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[4:5], 63, v[7:8] -; GCN-IR-NEXT: s_or_b64 s[6:7], s[6:7], vcc -; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 -; GCN-IR-NEXT: s_xor_b64 s[8:9], s[6:7], -1 -; GCN-IR-NEXT: v_mov_b32_e32 v2, v4 -; GCN-IR-NEXT: v_mov_b32_e32 v13, v11 -; GCN-IR-NEXT: v_cndmask_b32_e64 v10, v1, 0, s[6:7] -; GCN-IR-NEXT: s_and_b64 s[4:5], s[8:9], s[4:5] -; GCN-IR-NEXT: v_cndmask_b32_e64 v9, v0, 0, s[6:7] +; GCN-IR-NEXT: v_subb_u32_e64 v8, s[6:7], 0, 0, s[6:7] +; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[6:7], 63, v[7:8] +; GCN-IR-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] +; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[7:8] +; GCN-IR-NEXT: s_xor_b64 s[6:7], s[4:5], -1 +; GCN-IR-NEXT: v_mov_b32_e32 v5, v4 +; GCN-IR-NEXT: v_cndmask_b32_e64 v9, v1, 0, s[4:5] +; GCN-IR-NEXT: v_cndmask_b32_e64 v6, v0, 0, s[4:5] +; GCN-IR-NEXT: s_and_b64 s[4:5], s[6:7], vcc ; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GCN-IR-NEXT: s_cbranch_execz .LBB1_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, 1, v7 -; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, 0, v8, vcc -; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[14:15], v[7:8] -; GCN-IR-NEXT: v_sub_i32_e64 v7, s[4:5], 63, v7 -; GCN-IR-NEXT: v_lshl_b64 v[7:8], v[0:1], v7 +; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, 1, v7 +; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, 0, v8, vcc +; GCN-IR-NEXT: v_sub_i32_e64 v6, s[4:5], 63, v7 +; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[12:13], v[7:8] +; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[0:1], v6 +; GCN-IR-NEXT: v_mov_b32_e32 v8, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 -; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; GCN-IR-NEXT: s_cbranch_execz .LBB1_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: v_add_i32_e32 v18, vcc, -1, v5 -; 
GCN-IR-NEXT: v_addc_u32_e32 v19, vcc, -1, v6, vcc -; GCN-IR-NEXT: v_not_b32_e32 v3, v3 -; GCN-IR-NEXT: v_lshr_b64 v[14:15], v[0:1], v14 -; GCN-IR-NEXT: v_not_b32_e32 v9, v11 -; GCN-IR-NEXT: v_add_i32_e32 v11, vcc, v3, v12 -; GCN-IR-NEXT: v_mov_b32_e32 v16, 0 -; GCN-IR-NEXT: v_addc_u32_e32 v12, vcc, v9, v13, vcc -; GCN-IR-NEXT: v_mov_b32_e32 v17, 0 -; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 +; GCN-IR-NEXT: v_add_i32_e32 v16, vcc, -1, v2 +; GCN-IR-NEXT: v_addc_u32_e32 v17, vcc, -1, v3, vcc +; GCN-IR-NEXT: v_not_b32_e32 v8, v10 +; GCN-IR-NEXT: v_lshr_b64 v[12:13], v[0:1], v12 +; GCN-IR-NEXT: v_not_b32_e32 v9, 0 +; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, v8, v11 +; GCN-IR-NEXT: v_mov_b32_e32 v14, 0 +; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, 0, v9, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v15, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 ; GCN-IR-NEXT: .LBB1_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: v_lshl_b64 v[14:15], v[14:15], 1 -; GCN-IR-NEXT: v_lshrrev_b32_e32 v3, 31, v8 -; GCN-IR-NEXT: v_or_b32_e32 v3, v14, v3 -; GCN-IR-NEXT: v_lshl_b64 v[7:8], v[7:8], 1 -; GCN-IR-NEXT: v_sub_i32_e32 v9, vcc, v18, v3 -; GCN-IR-NEXT: v_subb_u32_e32 v9, vcc, v19, v15, vcc -; GCN-IR-NEXT: v_or_b32_e32 v7, v16, v7 -; GCN-IR-NEXT: v_add_i32_e32 v16, vcc, 1, v11 -; GCN-IR-NEXT: v_or_b32_e32 v8, v17, v8 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v13, 31, v9 -; GCN-IR-NEXT: v_addc_u32_e32 v17, vcc, 0, v12, vcc -; GCN-IR-NEXT: v_and_b32_e32 v9, 1, v13 -; GCN-IR-NEXT: v_and_b32_e32 v20, v13, v6 -; GCN-IR-NEXT: v_and_b32_e32 v13, v13, v5 -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[16:17], v[11:12] -; GCN-IR-NEXT: v_mov_b32_e32 v11, v16 -; GCN-IR-NEXT: v_sub_i32_e64 v14, s[4:5], v3, v13 -; GCN-IR-NEXT: v_mov_b32_e32 v12, v17 -; GCN-IR-NEXT: v_mov_b32_e32 v17, v10 -; GCN-IR-NEXT: v_subb_u32_e64 v15, s[4:5], v15, v20, s[4:5] +; GCN-IR-NEXT: v_lshl_b64 v[12:13], v[12:13], 1 +; GCN-IR-NEXT: v_lshrrev_b32_e32 v8, 31, v7 +; GCN-IR-NEXT: v_or_b32_e32 v12, v12, v8 +; GCN-IR-NEXT: v_lshl_b64 
v[6:7], v[6:7], 1 +; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, v16, v12 +; GCN-IR-NEXT: v_subb_u32_e32 v8, vcc, v17, v13, vcc +; GCN-IR-NEXT: v_or_b32_e32 v6, v14, v6 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v14, 31, v8 +; GCN-IR-NEXT: v_and_b32_e32 v8, 1, v14 +; GCN-IR-NEXT: v_and_b32_e32 v18, v14, v3 +; GCN-IR-NEXT: v_and_b32_e32 v19, v14, v2 +; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, 1, v10 +; GCN-IR-NEXT: v_or_b32_e32 v7, v15, v7 +; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, 0, v11, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[14:15], v[10:11] +; GCN-IR-NEXT: v_mov_b32_e32 v10, v14 +; GCN-IR-NEXT: v_sub_i32_e64 v12, s[4:5], v12, v19 +; GCN-IR-NEXT: v_mov_b32_e32 v11, v15 +; GCN-IR-NEXT: v_mov_b32_e32 v15, v9 +; GCN-IR-NEXT: v_subb_u32_e64 v13, s[4:5], v13, v18, s[4:5] ; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; GCN-IR-NEXT: v_mov_b32_e32 v16, v9 +; GCN-IR-NEXT: v_mov_b32_e32 v14, v8 ; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] ; GCN-IR-NEXT: s_cbranch_execnz .LBB1_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow ; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] ; GCN-IR-NEXT: .LBB1_5: ; %Flow3 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] -; GCN-IR-NEXT: v_lshl_b64 v[7:8], v[7:8], 1 -; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v8 +; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 ; GCN-IR-NEXT: v_or_b32_e32 v9, v9, v7 +; GCN-IR-NEXT: v_or_b32_e32 v6, v8, v6 ; GCN-IR-NEXT: .LBB1_6: ; %Flow4 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] -; GCN-IR-NEXT: v_mul_lo_u32 v3, v5, v10 -; GCN-IR-NEXT: v_mul_hi_u32 v7, v5, v9 -; GCN-IR-NEXT: v_mul_lo_u32 v6, v6, v9 -; GCN-IR-NEXT: v_mul_lo_u32 v5, v5, v9 +; GCN-IR-NEXT: v_mul_lo_u32 v7, v2, v9 +; GCN-IR-NEXT: v_mul_hi_u32 v8, v2, v6 +; GCN-IR-NEXT: v_mul_lo_u32 v3, v3, v6 +; GCN-IR-NEXT: v_mul_lo_u32 v2, v2, v6 +; GCN-IR-NEXT: v_add_i32_e32 v7, vcc, v8, v7 ; GCN-IR-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; GCN-IR-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; GCN-IR-NEXT: v_subb_u32_e32 v1, 
vcc, v1, v3, vcc ; GCN-IR-NEXT: v_xor_b32_e32 v0, v0, v4 -; GCN-IR-NEXT: v_xor_b32_e32 v1, v1, v2 +; GCN-IR-NEXT: v_xor_b32_e32 v1, v1, v5 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 -; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc +; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc ; GCN-IR-NEXT: s_setpc_b64 s[30:31] %result = srem i64 %x, %y ret i64 %result @@ -868,29 +863,28 @@ define amdgpu_kernel void @s_test_srem33_64(i64 addrspace(1)* %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_srem33_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; GCN-NEXT: v_mov_b32_e32 v6, 0 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_mov_b32 s11, 0xf000 +; GCN-NEXT: s_mov_b32 s10, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i64 s[2:3], s[10:11], 31 -; GCN-NEXT: s_ashr_i64 s[4:5], s[0:1], 31 +; GCN-NEXT: s_ashr_i64 s[2:3], s[6:7], 31 +; GCN-NEXT: s_ashr_i64 s[8:9], s[0:1], 31 ; GCN-NEXT: s_ashr_i32 s0, s1, 31 -; GCN-NEXT: s_add_u32 s4, s4, s0 +; GCN-NEXT: s_add_u32 s8, s8, s0 ; GCN-NEXT: s_mov_b32 s1, s0 -; GCN-NEXT: s_addc_u32 s5, s5, s0 -; GCN-NEXT: s_xor_b64 s[12:13], s[4:5], s[0:1] +; GCN-NEXT: s_addc_u32 s9, s9, s0 +; GCN-NEXT: s_xor_b64 s[12:13], s[8:9], s[0:1] ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s12 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s13 ; GCN-NEXT: s_sub_u32 s0, 0, s12 ; GCN-NEXT: s_subb_u32 s1, 0, s13 -; GCN-NEXT: s_ashr_i32 s10, s11, 31 +; GCN-NEXT: s_ashr_i32 s6, s7, 31 ; GCN-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-NEXT: s_mov_b32 s11, s10 -; GCN-NEXT: s_mov_b32 s4, s8 -; GCN-NEXT: s_mov_b32 s5, s9 +; GCN-NEXT: s_mov_b32 s7, s6 +; GCN-NEXT: s_mov_b32 s8, s4 +; GCN-NEXT: s_mov_b32 s9, s5 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 @@ -905,16 +899,16 @@ ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; 
GCN-NEXT: v_mul_hi_u32 v3, v0, v4 ; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 -; GCN-NEXT: v_mul_hi_u32 v7, v0, v2 -; GCN-NEXT: v_mul_hi_u32 v8, v1, v2 +; GCN-NEXT: v_mul_hi_u32 v6, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v7, v1, v2 ; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc -; GCN-NEXT: v_mul_lo_u32 v7, v1, v4 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc +; GCN-NEXT: v_mul_lo_u32 v6, v1, v4 ; GCN-NEXT: v_mul_hi_u32 v4, v1, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v7 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v8, v6, vcc +; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc ; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 @@ -925,25 +919,25 @@ ; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GCN-NEXT: v_mul_lo_u32 v3, s0, v0 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 -; GCN-NEXT: v_mul_lo_u32 v7, v0, v2 -; GCN-NEXT: v_mul_hi_u32 v8, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v9, v0, v2 +; GCN-NEXT: v_mul_lo_u32 v6, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v7, v0, v3 +; GCN-NEXT: v_mul_hi_u32 v8, v0, v2 ; GCN-NEXT: v_mul_hi_u32 v5, v1, v3 ; GCN-NEXT: v_mul_lo_u32 v3, v1, v3 ; GCN-NEXT: v_mul_hi_u32 v4, v1, v2 -; GCN-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v9, vcc +; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc ; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, v8, v5, vcc -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GCN-NEXT: v_add_i32_e32 v3, vcc, v6, v3 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc +; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc ; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; GCN-NEXT: s_add_u32 s0, s2, s10 +; GCN-NEXT: s_add_u32 s0, s2, s6 ; 
GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-NEXT: s_addc_u32 s1, s3, s10 +; GCN-NEXT: s_addc_u32 s1, s3, s6 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GCN-NEXT: s_xor_b64 s[14:15], s[0:1], s[10:11] +; GCN-NEXT: s_xor_b64 s[14:15], s[0:1], s[6:7] ; GCN-NEXT: v_mul_lo_u32 v2, s14, v1 ; GCN-NEXT: v_mul_hi_u32 v3, s14, v0 ; GCN-NEXT: v_mul_hi_u32 v4, s14, v1 @@ -955,7 +949,7 @@ ; GCN-NEXT: v_mul_hi_u32 v0, s15, v0 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc -; GCN-NEXT: v_addc_u32_e32 v2, vcc, v5, v6, vcc +; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc ; GCN-NEXT: v_mul_lo_u32 v1, s12, v1 @@ -993,12 +987,12 @@ ; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GCN-NEXT: v_cndmask_b32_e64 v2, v4, v3, s[0:1] ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GCN-NEXT: v_xor_b32_e32 v0, s10, v0 -; GCN-NEXT: v_xor_b32_e32 v1, s10, v1 -; GCN-NEXT: v_mov_b32_e32 v2, s10 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s10, v0 +; GCN-NEXT: v_xor_b32_e32 v0, s6, v0 +; GCN-NEXT: v_xor_b32_e32 v1, s6, v1 +; GCN-NEXT: v_mov_b32_e32 v2, s6 +; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s6, v0 ; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_srem33_64: @@ -1023,13 +1017,13 @@ ; GCN-IR-NEXT: s_mov_b64 s[6:7], 0 ; GCN-IR-NEXT: s_or_b64 s[16:17], s[10:11], s[12:13] ; GCN-IR-NEXT: s_flbit_i32_b32 s10, s8 -; GCN-IR-NEXT: s_flbit_i32_b32 s12, s2 ; GCN-IR-NEXT: s_add_i32 s10, s10, 32 ; GCN-IR-NEXT: s_flbit_i32_b32 s11, s9 -; GCN-IR-NEXT: s_add_i32 s12, s12, 32 -; GCN-IR-NEXT: s_flbit_i32_b32 s13, s3 ; GCN-IR-NEXT: s_min_u32 s10, s10, s11 -; GCN-IR-NEXT: s_min_u32 s14, s12, s13 +; GCN-IR-NEXT: s_flbit_i32_b32 s11, s2 +; GCN-IR-NEXT: s_add_i32 s11, s11, 32 +; GCN-IR-NEXT: s_flbit_i32_b32 s12, s3 +; GCN-IR-NEXT: 
s_min_u32 s14, s11, s12 ; GCN-IR-NEXT: s_sub_u32 s12, s10, s14 ; GCN-IR-NEXT: s_subb_u32 s13, 0, 0 ; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[18:19], s[12:13], 63 @@ -1055,9 +1049,8 @@ ; GCN-IR-NEXT: s_add_u32 s18, s8, -1 ; GCN-IR-NEXT: s_addc_u32 s19, s9, -1 ; GCN-IR-NEXT: s_not_b64 s[6:7], s[10:11] -; GCN-IR-NEXT: s_mov_b32 s15, s11 ; GCN-IR-NEXT: s_add_u32 s10, s6, s14 -; GCN-IR-NEXT: s_addc_u32 s11, s7, s11 +; GCN-IR-NEXT: s_addc_u32 s11, s7, 0 ; GCN-IR-NEXT: s_mov_b64 s[14:15], 0 ; GCN-IR-NEXT: s_mov_b32 s7, 0 ; GCN-IR-NEXT: .LBB8_3: ; %udiv-do-while @@ -1185,13 +1178,13 @@ ; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 ; GCN-IR-NEXT: s_or_b64 s[16:17], s[10:11], s[12:13] ; GCN-IR-NEXT: s_flbit_i32_b32 s10, s6 -; GCN-IR-NEXT: s_flbit_i32_b32 s12, s2 ; GCN-IR-NEXT: s_add_i32 s10, s10, 32 ; GCN-IR-NEXT: s_flbit_i32_b32 s11, s7 -; GCN-IR-NEXT: s_add_i32 s12, s12, 32 -; GCN-IR-NEXT: s_flbit_i32_b32 s13, s3 ; GCN-IR-NEXT: s_min_u32 s10, s10, s11 -; GCN-IR-NEXT: s_min_u32 s14, s12, s13 +; GCN-IR-NEXT: s_flbit_i32_b32 s11, s2 +; GCN-IR-NEXT: s_add_i32 s11, s11, 32 +; GCN-IR-NEXT: s_flbit_i32_b32 s12, s3 +; GCN-IR-NEXT: s_min_u32 s14, s11, s12 ; GCN-IR-NEXT: s_sub_u32 s12, s10, s14 ; GCN-IR-NEXT: s_subb_u32 s13, 0, 0 ; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[18:19], s[12:13], 63 @@ -1217,9 +1210,8 @@ ; GCN-IR-NEXT: s_add_u32 s18, s6, -1 ; GCN-IR-NEXT: s_addc_u32 s19, s7, -1 ; GCN-IR-NEXT: s_not_b64 s[8:9], s[10:11] -; GCN-IR-NEXT: s_mov_b32 s15, s11 ; GCN-IR-NEXT: s_add_u32 s10, s8, s14 -; GCN-IR-NEXT: s_addc_u32 s11, s9, s11 +; GCN-IR-NEXT: s_addc_u32 s11, s9, 0 ; GCN-IR-NEXT: s_mov_b64 s[14:15], 0 ; GCN-IR-NEXT: s_mov_b32 s9, 0 ; GCN-IR-NEXT: .LBB9_3: ; %udiv-do-while @@ -1302,58 +1294,57 @@ ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; GCN-NEXT: v_mul_f32_e32 v2, 0x2f800000, v0 -; GCN-NEXT: v_trunc_f32_e32 v2, v2 -; GCN-NEXT: 
v_mac_f32_e32 v0, 0xcf800000, v2 -; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; GCN-NEXT: v_trunc_f32_e32 v1, v1 +; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 +; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_lo_u32 v3, s2, v2 -; GCN-NEXT: v_mul_hi_u32 v4, s2, v0 -; GCN-NEXT: v_mul_lo_u32 v6, s3, v0 -; GCN-NEXT: v_mul_lo_u32 v5, s2, v0 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GCN-NEXT: v_mul_hi_u32 v4, v0, v5 -; GCN-NEXT: v_mul_lo_u32 v6, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v8, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v7, v2, v5 -; GCN-NEXT: v_mul_lo_u32 v5, v2, v5 -; GCN-NEXT: v_mul_hi_u32 v9, v2, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v8, vcc -; GCN-NEXT: v_mul_lo_u32 v3, v2, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v6, v7, vcc -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v9, v1, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc -; GCN-NEXT: v_mul_lo_u32 v3, s2, v2 -; GCN-NEXT: v_mul_hi_u32 v4, s2, v0 +; GCN-NEXT: v_mul_lo_u32 v2, s2, v1 +; GCN-NEXT: v_mul_hi_u32 v3, s2, v0 ; GCN-NEXT: v_mul_lo_u32 v5, s3, v0 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; GCN-NEXT: v_mul_lo_u32 v4, s2, v0 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GCN-NEXT: v_mul_lo_u32 v7, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v8, v0, v4 -; GCN-NEXT: v_mul_hi_u32 v9, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v6, v2, v4 -; GCN-NEXT: v_mul_lo_u32 v4, v2, v4 -; GCN-NEXT: v_mul_hi_u32 v5, v2, v3 -; GCN-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v9, vcc -; GCN-NEXT: v_mul_lo_u32 v3, v2, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v8, v6, vcc -; GCN-NEXT: v_addc_u32_e32 v1, vcc, v5, v1, vcc -; GCN-NEXT: 
v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; GCN-NEXT: v_mul_hi_u32 v3, v0, v4 +; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v7, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v6, v1, v4 +; GCN-NEXT: v_mul_lo_u32 v4, v1, v4 +; GCN-NEXT: v_mul_hi_u32 v8, v1, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v6, vcc +; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GCN-NEXT: v_mul_lo_u32 v2, s2, v1 +; GCN-NEXT: v_mul_hi_u32 v3, s2, v0 +; GCN-NEXT: v_mul_lo_u32 v4, s3, v0 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_mul_lo_u32 v3, s2, v0 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GCN-NEXT: v_mul_lo_u32 v6, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v7, v0, v3 +; GCN-NEXT: v_mul_hi_u32 v8, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v5, v1, v3 +; GCN-NEXT: v_mul_lo_u32 v3, v1, v3 +; GCN-NEXT: v_mul_hi_u32 v4, v1, v2 +; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc +; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v6, v3 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc +; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc ; GCN-NEXT: v_mul_lo_u32 v2, v1, 24 ; GCN-NEXT: v_mul_hi_u32 v0, v0, 24 ; GCN-NEXT: v_mul_hi_u32 v1, v1, 24 @@ -1501,7 +1492,6 @@ ; GCN-NEXT: v_subb_u32_e32 v5, 
vcc, 0, v1, vcc ; GCN-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 ; GCN-NEXT: v_rcp_f32_e32 v2, v2 -; GCN-NEXT: v_mov_b32_e32 v11, 0 ; GCN-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; GCN-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; GCN-NEXT: v_trunc_f32_e32 v3, v3 @@ -1517,7 +1507,7 @@ ; GCN-NEXT: v_mul_lo_u32 v8, v2, v6 ; GCN-NEXT: v_mul_hi_u32 v9, v2, v7 ; GCN-NEXT: v_mul_hi_u32 v10, v2, v6 -; GCN-NEXT: v_mul_hi_u32 v12, v3, v6 +; GCN-NEXT: v_mul_hi_u32 v11, v3, v6 ; GCN-NEXT: v_mul_lo_u32 v6, v3, v6 ; GCN-NEXT: v_add_i32_e32 v8, vcc, v9, v8 ; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc @@ -1525,7 +1515,7 @@ ; GCN-NEXT: v_mul_hi_u32 v7, v3, v7 ; GCN-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; GCN-NEXT: v_addc_u32_e32 v7, vcc, v9, v7, vcc -; GCN-NEXT: v_addc_u32_e32 v8, vcc, v12, v11, vcc +; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v11, vcc ; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6 ; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v6 @@ -1547,7 +1537,7 @@ ; GCN-NEXT: v_mul_lo_u32 v5, v3, v5 ; GCN-NEXT: v_add_i32_e32 v4, vcc, v8, v4 ; GCN-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc -; GCN-NEXT: v_addc_u32_e32 v6, vcc, v6, v11, vcc +; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc ; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 @@ -1603,24 +1593,23 @@ ; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1 ; GCN-IR-NEXT: v_min_u32_e32 v6, v2, v3 ; GCN-IR-NEXT: s_movk_i32 s6, 0xffc5 -; GCN-IR-NEXT: v_add_i32_e32 v3, vcc, s6, v6 -; GCN-IR-NEXT: v_addc_u32_e64 v4, s[6:7], 0, -1, vcc +; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, s6, v6 +; GCN-IR-NEXT: v_addc_u32_e64 v5, s[6:7], 0, -1, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[3:4] -; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[4:5] +; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[6:7], 63, v[4:5] ; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 
63, v[3:4] -; GCN-IR-NEXT: v_cndmask_b32_e64 v2, 24, 0, s[4:5] +; GCN-IR-NEXT: v_cndmask_b32_e64 v3, 24, 0, s[4:5] ; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1 -; GCN-IR-NEXT: v_mov_b32_e32 v5, v7 -; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], vcc +; GCN-IR-NEXT: v_mov_b32_e32 v2, 0 +; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GCN-IR-NEXT: s_cbranch_execz .LBB11_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v3 -; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v4, vcc -; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v3 -; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[8:9], v[3:4] +; GCN-IR-NEXT: v_add_i32_e32 v7, vcc, 1, v4 +; GCN-IR-NEXT: v_addc_u32_e32 v8, vcc, 0, v5, vcc +; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v4 +; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[7:8], v[4:5] ; GCN-IR-NEXT: v_lshl_b64 v[2:3], 24, v2 ; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 @@ -1631,10 +1620,10 @@ ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, -1, v0 ; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, -1, v1, vcc -; GCN-IR-NEXT: v_lshr_b64 v[8:9], 24, v8 +; GCN-IR-NEXT: v_lshr_b64 v[8:9], 24, v7 ; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 58, v6 ; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 -; GCN-IR-NEXT: v_subb_u32_e32 v7, vcc, 0, v7, vcc +; GCN-IR-NEXT: v_subb_u32_e64 v7, s[4:5], 0, 0, vcc ; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 ; GCN-IR-NEXT: .LBB11_3: ; %udiv-do-while @@ -1667,17 +1656,17 @@ ; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] ; GCN-IR-NEXT: .LBB11_5: ; %Flow3 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] -; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; GCN-IR-NEXT: v_or_b32_e32 v5, v5, v3 -; GCN-IR-NEXT: v_or_b32_e32 v2, v4, v2 +; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[2:3], 1 +; GCN-IR-NEXT: v_or_b32_e32 v2, v5, v7 +; GCN-IR-NEXT: v_or_b32_e32 v3, v4, v6 ; GCN-IR-NEXT: .LBB11_6: ; %Flow4 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] -; GCN-IR-NEXT: 
v_mul_lo_u32 v3, v0, v5 -; GCN-IR-NEXT: v_mul_hi_u32 v4, v0, v2 -; GCN-IR-NEXT: v_mul_lo_u32 v1, v1, v2 -; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, v2 -; GCN-IR-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; GCN-IR-NEXT: v_mul_lo_u32 v2, v0, v2 +; GCN-IR-NEXT: v_mul_hi_u32 v4, v0, v3 +; GCN-IR-NEXT: v_mul_lo_u32 v1, v1, v3 +; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, v3 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, 24, v0 ; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, 0, v1, vcc ; GCN-IR-NEXT: s_setpc_b64 s[30:31] @@ -1700,7 +1689,6 @@ ; GCN-NEXT: v_subb_u32_e32 v5, vcc, 0, v1, vcc ; GCN-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 ; GCN-NEXT: v_rcp_f32_e32 v2, v2 -; GCN-NEXT: v_mov_b32_e32 v11, 0 ; GCN-NEXT: s_mov_b32 s4, 0x8000 ; GCN-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; GCN-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 @@ -1717,7 +1705,7 @@ ; GCN-NEXT: v_mul_lo_u32 v8, v2, v6 ; GCN-NEXT: v_mul_hi_u32 v9, v2, v7 ; GCN-NEXT: v_mul_hi_u32 v10, v2, v6 -; GCN-NEXT: v_mul_hi_u32 v12, v3, v6 +; GCN-NEXT: v_mul_hi_u32 v11, v3, v6 ; GCN-NEXT: v_mul_lo_u32 v6, v3, v6 ; GCN-NEXT: v_add_i32_e32 v8, vcc, v9, v8 ; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc @@ -1725,7 +1713,7 @@ ; GCN-NEXT: v_mul_hi_u32 v7, v3, v7 ; GCN-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; GCN-NEXT: v_addc_u32_e32 v7, vcc, v9, v7, vcc -; GCN-NEXT: v_addc_u32_e32 v8, vcc, v12, v11, vcc +; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v11, vcc ; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6 ; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v6 @@ -1747,7 +1735,7 @@ ; GCN-NEXT: v_mul_lo_u32 v5, v3, v5 ; GCN-NEXT: v_add_i32_e32 v4, vcc, v8, v4 ; GCN-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc -; GCN-NEXT: v_addc_u32_e32 v6, vcc, v6, v11, vcc +; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc ; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc ; GCN-NEXT: v_add_i32_e32 
v2, vcc, v2, v4 @@ -1803,26 +1791,25 @@ ; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1 ; GCN-IR-NEXT: v_min_u32_e32 v6, v2, v3 ; GCN-IR-NEXT: s_movk_i32 s6, 0xffd0 -; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, s6, v6 -; GCN-IR-NEXT: v_addc_u32_e64 v3, s[6:7], 0, -1, vcc +; GCN-IR-NEXT: v_add_i32_e32 v3, vcc, s6, v6 +; GCN-IR-NEXT: v_addc_u32_e64 v4, s[6:7], 0, -1, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[2:3] -; GCN-IR-NEXT: s_mov_b64 s[8:9], 0x8000 -; GCN-IR-NEXT: v_mov_b32_e32 v4, s8 +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[3:4] +; GCN-IR-NEXT: v_mov_b32_e32 v5, 0x8000 ; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[2:3] -; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 -; GCN-IR-NEXT: v_cndmask_b32_e64 v4, v4, 0, s[4:5] +; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[3:4] +; GCN-IR-NEXT: v_cndmask_b32_e64 v5, v5, 0, s[4:5] ; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1 -; GCN-IR-NEXT: v_mov_b32_e32 v5, v7 +; GCN-IR-NEXT: v_mov_b32_e32 v2, 0 +; GCN-IR-NEXT: s_mov_b64 s[8:9], 0x8000 ; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], vcc ; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GCN-IR-NEXT: s_cbranch_execz .LBB12_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v2 -; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v3, vcc -; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[8:9], v[2:3] -; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2 +; GCN-IR-NEXT: v_add_i32_e32 v7, vcc, 1, v3 +; GCN-IR-NEXT: v_addc_u32_e32 v8, vcc, 0, v4, vcc +; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v3 +; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[7:8], v[3:4] ; GCN-IR-NEXT: v_lshl_b64 v[2:3], s[8:9], v2 ; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 @@ -1834,10 +1821,10 @@ ; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, -1, v0 ; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000 ; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, -1, v1, vcc -; GCN-IR-NEXT: v_lshr_b64 v[8:9], s[4:5], v8 +; GCN-IR-NEXT: v_lshr_b64 v[8:9], s[4:5], v7 ; 
GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 47, v6 ; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 -; GCN-IR-NEXT: v_subb_u32_e32 v7, vcc, 0, v7, vcc +; GCN-IR-NEXT: v_subb_u32_e64 v7, s[4:5], 0, 0, vcc ; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 ; GCN-IR-NEXT: .LBB12_3: ; %udiv-do-while @@ -1870,15 +1857,15 @@ ; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] ; GCN-IR-NEXT: .LBB12_5: ; %Flow3 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] -; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; GCN-IR-NEXT: v_or_b32_e32 v5, v5, v3 -; GCN-IR-NEXT: v_or_b32_e32 v4, v4, v2 +; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[2:3], 1 +; GCN-IR-NEXT: v_or_b32_e32 v2, v5, v7 +; GCN-IR-NEXT: v_or_b32_e32 v5, v4, v6 ; GCN-IR-NEXT: .LBB12_6: ; %Flow4 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] -; GCN-IR-NEXT: v_mul_lo_u32 v2, v0, v5 -; GCN-IR-NEXT: v_mul_hi_u32 v3, v0, v4 -; GCN-IR-NEXT: v_mul_lo_u32 v1, v1, v4 -; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, v4 +; GCN-IR-NEXT: v_mul_lo_u32 v2, v0, v2 +; GCN-IR-NEXT: v_mul_hi_u32 v3, v0, v5 +; GCN-IR-NEXT: v_mul_lo_u32 v1, v1, v5 +; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, v5 ; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, 0x8000, v0 diff --git a/llvm/test/CodeGen/AMDGPU/strict_fadd.f16.ll b/llvm/test/CodeGen/AMDGPU/strict_fadd.f16.ll --- a/llvm/test/CodeGen/AMDGPU/strict_fadd.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/strict_fadd.f16.ll @@ -187,12 +187,11 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_add_f16_e32 v4, v0, v2 -; GFX10-NEXT: v_mov_b32_e32 v5, 0xffff -; GFX10-NEXT: v_add_f16_e32 v6, v1, v3 +; GFX10-NEXT: v_add_f16_e32 v5, v1, v3 ; GFX10-NEXT: v_add_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX10-NEXT: v_add_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_and_b32_e32 v2, v5, v4 -; GFX10-NEXT: v_and_b32_e32 v3, v5, 
v6 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v4 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v5 ; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v2 ; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll b/llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll --- a/llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll @@ -134,14 +134,13 @@ ; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v0 ; GFX10-NEXT: v_fmac_f16_e32 v4, v0, v2 -; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX10-NEXT: v_fmac_f16_e32 v5, v1, v3 ; GFX10-NEXT: v_fmac_f16_e32 v6, v8, v7 ; GFX10-NEXT: v_fmac_f16_e32 v9, v11, v10 -; GFX10-NEXT: v_and_b32_e32 v1, v0, v4 -; GFX10-NEXT: v_and_b32_e32 v2, v0, v5 -; GFX10-NEXT: v_lshl_or_b32 v0, v9, 16, v1 -; GFX10-NEXT: v_lshl_or_b32 v1, v6, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; GFX10-NEXT: v_lshl_or_b32 v0, v9, 16, v0 +; GFX10-NEXT: v_lshl_or_b32 v1, v6, 16, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> %x, <4 x half> %y, <4 x half> %z, metadata !"round.tonearest", metadata !"fpexcept.strict") ret <4 x half> %val diff --git a/llvm/test/CodeGen/AMDGPU/strict_fmul.f16.ll b/llvm/test/CodeGen/AMDGPU/strict_fmul.f16.ll --- a/llvm/test/CodeGen/AMDGPU/strict_fmul.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/strict_fmul.f16.ll @@ -187,12 +187,11 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_mul_f16_e32 v4, v0, v2 -; GFX10-NEXT: v_mov_b32_e32 v5, 0xffff -; GFX10-NEXT: v_mul_f16_e32 v6, v1, v3 +; GFX10-NEXT: v_mul_f16_e32 v5, v1, v3 ; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: 
v_and_b32_e32 v2, v5, v4 -; GFX10-NEXT: v_and_b32_e32 v3, v5, v6 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v4 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v5 ; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v2 ; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll b/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll --- a/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll @@ -207,12 +207,11 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_sub_f16_e32 v4, v0, v2 -; GFX10-NEXT: v_mov_b32_e32 v5, 0xffff -; GFX10-NEXT: v_sub_f16_e32 v6, v1, v3 +; GFX10-NEXT: v_sub_f16_e32 v5, v1, v3 ; GFX10-NEXT: v_sub_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX10-NEXT: v_sub_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_and_b32_e32 v2, v5, v4 -; GFX10-NEXT: v_and_b32_e32 v3, v5, v6 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v4 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v5 ; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v2 ; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/uaddsat.ll --- a/llvm/test/CodeGen/AMDGPU/uaddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/uaddsat.ll @@ -33,11 +33,10 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_movk_i32 s4, 0xff -; GFX10-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX10-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX10-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX10-NEXT: v_add_nc_u16 v0, v0, v1 -; GFX10-NEXT: v_min_u16 v0, v0, s4 +; GFX10-NEXT: v_min_u16 v0, 0xff, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call i8 @llvm.uadd.sat.i8(i8 %lhs, i8 %rhs) ret i8 %result diff --git 
a/llvm/test/CodeGen/AMDGPU/udiv.ll b/llvm/test/CodeGen/AMDGPU/udiv.ll --- a/llvm/test/CodeGen/AMDGPU/udiv.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv.ll @@ -500,7 +500,6 @@ ; GFX1030: ; %bb.0: ; GFX1030-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX1030-NEXT: v_mov_b32_e32 v4, 0 -; GFX1030-NEXT: s_mov_b32 s0, 0x4f7ffffe ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7] ; GFX1030-NEXT: s_waitcnt vmcnt(0) @@ -510,8 +509,8 @@ ; GFX1030-NEXT: v_sub_nc_u32_e32 v8, 0, v3 ; GFX1030-NEXT: v_rcp_iflag_f32_e32 v5, v5 ; GFX1030-NEXT: v_rcp_iflag_f32_e32 v6, v6 -; GFX1030-NEXT: v_mul_f32_e32 v5, s0, v5 -; GFX1030-NEXT: v_mul_f32_e32 v6, s0, v6 +; GFX1030-NEXT: v_mul_f32_e32 v5, 0x4f7ffffe, v5 +; GFX1030-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 ; GFX1030-NEXT: v_cvt_u32_f32_e32 v5, v5 ; GFX1030-NEXT: v_cvt_u32_f32_e32 v6, v6 ; GFX1030-NEXT: v_mul_lo_u32 v7, v7, v5 @@ -891,7 +890,6 @@ ; GFX1030: ; %bb.0: ; GFX1030-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX1030-NEXT: v_mov_b32_e32 v8, 0 -; GFX1030-NEXT: s_mov_b32 s0, 0x4f7ffffe ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: s_clause 0x1 ; GFX1030-NEXT: global_load_dwordx4 v[0:3], v8, s[6:7] offset:16 @@ -909,10 +907,10 @@ ; GFX1030-NEXT: v_sub_nc_u32_e32 v14, 0, v1 ; GFX1030-NEXT: v_sub_nc_u32_e32 v15, 0, v2 ; GFX1030-NEXT: v_sub_nc_u32_e32 v16, 0, v3 -; GFX1030-NEXT: v_mul_f32_e32 v9, s0, v9 -; GFX1030-NEXT: v_mul_f32_e32 v10, s0, v10 -; GFX1030-NEXT: v_mul_f32_e32 v11, s0, v11 -; GFX1030-NEXT: v_mul_f32_e32 v12, s0, v12 +; GFX1030-NEXT: v_mul_f32_e32 v9, 0x4f7ffffe, v9 +; GFX1030-NEXT: v_mul_f32_e32 v10, 0x4f7ffffe, v10 +; GFX1030-NEXT: v_mul_f32_e32 v11, 0x4f7ffffe, v11 +; GFX1030-NEXT: v_mul_f32_e32 v12, 0x4f7ffffe, v12 ; GFX1030-NEXT: v_cvt_u32_f32_e32 v9, v9 ; GFX1030-NEXT: v_cvt_u32_f32_e32 v10, v10 ; GFX1030-NEXT: v_cvt_u32_f32_e32 v11, v11 @@ -2142,16 +2140,15 @@ ; GFX1030-NEXT: v_mov_b32_e32 v4, 0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: global_load_dwordx4 
v[0:3], v4, s[0:1] -; GFX1030-NEXT: s_mov_b32 s0, 0x1389c755 ; GFX1030-NEXT: s_waitcnt vmcnt(0) ; GFX1030-NEXT: v_lshrrev_b32_e32 v0, 2, v0 ; GFX1030-NEXT: v_lshrrev_b32_e32 v1, 2, v1 ; GFX1030-NEXT: v_lshrrev_b32_e32 v2, 2, v2 ; GFX1030-NEXT: v_lshrrev_b32_e32 v3, 2, v3 -; GFX1030-NEXT: v_mul_hi_u32 v0, v0, s0 -; GFX1030-NEXT: v_mul_hi_u32 v1, v1, s0 -; GFX1030-NEXT: v_mul_hi_u32 v2, v2, s0 -; GFX1030-NEXT: v_mul_hi_u32 v3, v3, s0 +; GFX1030-NEXT: v_mul_hi_u32 v0, 0x1389c755, v0 +; GFX1030-NEXT: v_mul_hi_u32 v1, 0x1389c755, v1 +; GFX1030-NEXT: v_mul_hi_u32 v2, 0x1389c755, v2 +; GFX1030-NEXT: v_mul_hi_u32 v3, 0x1389c755, v3 ; GFX1030-NEXT: v_lshrrev_b32_e32 v0, 10, v0 ; GFX1030-NEXT: v_lshrrev_b32_e32 v1, 10, v1 ; GFX1030-NEXT: v_lshrrev_b32_e32 v2, 10, v2 @@ -2487,7 +2484,6 @@ ; SI-NEXT: v_madak_f32 v2, 0, v2, 0x47c35000 ; SI-NEXT: v_rcp_f32_e32 v2, v2 ; SI-NEXT: s_mov_b32 s4, 0xfffe7960 -; SI-NEXT: v_mov_b32_e32 v9, 0 ; SI-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; SI-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; SI-NEXT: v_trunc_f32_e32 v3, v3 @@ -2499,18 +2495,18 @@ ; SI-NEXT: v_mul_lo_u32 v6, v2, s4 ; SI-NEXT: v_subrev_i32_e32 v4, vcc, v2, v4 ; SI-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; SI-NEXT: v_mul_lo_u32 v5, v2, v4 -; SI-NEXT: v_mul_hi_u32 v7, v2, v6 +; SI-NEXT: v_mul_hi_u32 v5, v2, v6 +; SI-NEXT: v_mul_lo_u32 v7, v2, v4 ; SI-NEXT: v_mul_hi_u32 v8, v2, v4 -; SI-NEXT: v_mul_hi_u32 v10, v3, v4 +; SI-NEXT: v_mul_hi_u32 v9, v3, v4 ; SI-NEXT: v_mul_lo_u32 v4, v3, v4 -; SI-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc ; SI-NEXT: v_mul_lo_u32 v8, v3, v6 ; SI-NEXT: v_mul_hi_u32 v6, v3, v6 ; SI-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; SI-NEXT: v_addc_u32_e32 v5, vcc, v7, v6, vcc -; SI-NEXT: v_addc_u32_e32 v6, vcc, v10, v9, vcc +; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v9, vcc ; SI-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc ; SI-NEXT: v_add_i32_e32 v2, vcc, v2, v4 
@@ -2524,7 +2520,7 @@ ; SI-NEXT: v_mul_lo_u32 v5, v2, v4 ; SI-NEXT: v_mul_hi_u32 v7, v2, v6 ; SI-NEXT: v_mul_hi_u32 v8, v2, v4 -; SI-NEXT: v_mul_hi_u32 v10, v3, v4 +; SI-NEXT: v_mul_hi_u32 v9, v3, v4 ; SI-NEXT: v_mul_lo_u32 v4, v3, v4 ; SI-NEXT: v_add_i32_e32 v5, vcc, v7, v5 ; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc @@ -2532,7 +2528,7 @@ ; SI-NEXT: v_mul_hi_u32 v6, v3, v6 ; SI-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; SI-NEXT: v_addc_u32_e32 v5, vcc, v7, v6, vcc -; SI-NEXT: v_addc_u32_e32 v6, vcc, v10, v9, vcc +; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v9, vcc ; SI-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc ; SI-NEXT: v_add_i32_e32 v2, vcc, v2, v4 @@ -2548,18 +2544,18 @@ ; SI-NEXT: v_mul_hi_u32 v2, v1, v2 ; SI-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; SI-NEXT: v_addc_u32_e32 v2, vcc, v5, v2, vcc -; SI-NEXT: v_addc_u32_e32 v4, vcc, v7, v9, vcc +; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc ; SI-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; SI-NEXT: v_mul_lo_u32 v4, v3, s4 ; SI-NEXT: v_mul_hi_u32 v5, v2, s4 ; SI-NEXT: v_mul_lo_u32 v6, v2, s4 +; SI-NEXT: s_mov_b32 s4, 0x1869f ; SI-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; SI-NEXT: v_sub_i32_e32 v0, vcc, v0, v6 ; SI-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc -; SI-NEXT: v_subrev_i32_e32 v4, vcc, s4, v0 +; SI-NEXT: v_subrev_i32_e32 v4, vcc, 0x186a0, v0 ; SI-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v1, vcc -; SI-NEXT: s_mov_b32 s4, 0x1869f ; SI-NEXT: v_cmp_lt_u32_e32 vcc, s4, v4 ; SI-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 @@ -2587,7 +2583,6 @@ ; VI-NEXT: v_madak_f32 v2, 0, v2, 0x47c35000 ; VI-NEXT: v_rcp_f32_e32 v2, v2 ; VI-NEXT: s_mov_b32 s6, 0xfffe7960 -; VI-NEXT: v_mov_b32_e32 v9, 0 ; VI-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; VI-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; VI-NEXT: v_trunc_f32_e32 v3, v3 @@ -2598,33 +2593,32 @@ ; VI-NEXT: v_mul_lo_u32 v4, v7, s6 ; VI-NEXT: v_subrev_u32_e32 v3, vcc, v6, v3 ; VI-NEXT: 
v_add_u32_e32 v5, vcc, v4, v3 -; VI-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v5, 0 ; VI-NEXT: v_mul_hi_u32 v8, v6, v2 +; VI-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v5, 0 ; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v3 ; VI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v2, 0 -; VI-NEXT: v_addc_u32_e32 v10, vcc, 0, v4, vcc +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v4, vcc ; VI-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v5, 0 ; VI-NEXT: v_add_u32_e32 v2, vcc, v8, v2 -; VI-NEXT: v_addc_u32_e32 v2, vcc, v10, v3, vcc -; VI-NEXT: v_addc_u32_e32 v3, vcc, v5, v9, vcc +; VI-NEXT: v_addc_u32_e32 v2, vcc, v9, v3, vcc +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v2 ; VI-NEXT: v_addc_u32_e32 v7, vcc, v7, v3, vcc ; VI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, s6, 0 ; VI-NEXT: v_mul_lo_u32 v4, v7, s6 -; VI-NEXT: s_mov_b32 s6, 0x186a0 ; VI-NEXT: v_subrev_u32_e32 v3, vcc, v6, v3 ; VI-NEXT: v_add_u32_e32 v5, vcc, v3, v4 ; VI-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v5, 0 ; VI-NEXT: v_mul_hi_u32 v8, v6, v2 ; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v3 ; VI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v2, 0 -; VI-NEXT: v_addc_u32_e32 v10, vcc, 0, v4, vcc +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v4, vcc ; VI-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v5, 0 ; VI-NEXT: v_add_u32_e32 v2, vcc, v8, v2 -; VI-NEXT: v_addc_u32_e32 v2, vcc, v10, v3, vcc -; VI-NEXT: v_addc_u32_e32 v3, vcc, v5, v9, vcc +; VI-NEXT: v_addc_u32_e32 v2, vcc, v9, v3, vcc +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: v_add_u32_e32 v4, vcc, v6, v2 @@ -2637,16 +2631,17 @@ ; VI-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v5, 0 ; VI-NEXT: v_add_u32_e32 v2, vcc, v6, v2 ; VI-NEXT: v_addc_u32_e32 v2, vcc, v7, v3, vcc -; VI-NEXT: v_addc_u32_e32 v3, vcc, v5, v9, vcc +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc ; VI-NEXT: v_add_u32_e32 v4, vcc, 
v2, v4 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v3, vcc -; VI-NEXT: v_mul_lo_u32 v6, v5, s6 -; VI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, s6, 0 +; VI-NEXT: s_mov_b32 s4, 0x186a0 +; VI-NEXT: v_mul_lo_u32 v6, v5, s4 +; VI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, s4, 0 ; VI-NEXT: s_mov_b32 s4, 0x1869f ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v6 ; VI-NEXT: v_sub_u32_e32 v0, vcc, v0, v2 ; VI-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc -; VI-NEXT: v_subrev_u32_e32 v2, vcc, s6, v0 +; VI-NEXT: v_subrev_u32_e32 v2, vcc, 0x186a0, v0 ; VI-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v1, vcc ; VI-NEXT: v_cmp_lt_u32_e32 vcc, s4, v2 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc @@ -2675,7 +2670,6 @@ ; GCN-NEXT: v_madak_f32 v2, 0, v2, 0x47c35000 ; GCN-NEXT: v_rcp_f32_e32 v2, v2 ; GCN-NEXT: s_mov_b32 s6, 0xfffe7960 -; GCN-NEXT: v_mov_b32_e32 v9, 0 ; GCN-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; GCN-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; GCN-NEXT: v_trunc_f32_e32 v3, v3 @@ -2686,33 +2680,32 @@ ; GCN-NEXT: v_mul_lo_u32 v4, v7, s6 ; GCN-NEXT: v_subrev_u32_e32 v3, vcc, v6, v3 ; GCN-NEXT: v_add_u32_e32 v5, vcc, v4, v3 -; GCN-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v5, 0 ; GCN-NEXT: v_mul_hi_u32 v8, v6, v2 +; GCN-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v5, 0 ; GCN-NEXT: v_add_u32_e32 v8, vcc, v8, v3 ; GCN-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v2, 0 -; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v4, vcc +; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v4, vcc ; GCN-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v5, 0 ; GCN-NEXT: v_add_u32_e32 v2, vcc, v8, v2 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, v10, v3, vcc -; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v9, vcc +; GCN-NEXT: v_addc_u32_e32 v2, vcc, v9, v3, vcc +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc ; GCN-NEXT: v_add_u32_e32 v2, vcc, v2, v4 ; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GCN-NEXT: v_add_u32_e32 v6, vcc, v6, v2 ; GCN-NEXT: v_addc_u32_e32 v7, vcc, v7, v3, vcc ; GCN-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, s6, 0 ; GCN-NEXT: v_mul_lo_u32 v4, v7, s6 -; GCN-NEXT: 
s_mov_b32 s6, 0x186a0 ; GCN-NEXT: v_subrev_u32_e32 v3, vcc, v6, v3 ; GCN-NEXT: v_add_u32_e32 v5, vcc, v3, v4 ; GCN-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v5, 0 ; GCN-NEXT: v_mul_hi_u32 v8, v6, v2 ; GCN-NEXT: v_add_u32_e32 v8, vcc, v8, v3 ; GCN-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v2, 0 -; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v4, vcc +; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v4, vcc ; GCN-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v5, 0 ; GCN-NEXT: v_add_u32_e32 v2, vcc, v8, v2 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, v10, v3, vcc -; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v9, vcc +; GCN-NEXT: v_addc_u32_e32 v2, vcc, v9, v3, vcc +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc ; GCN-NEXT: v_add_u32_e32 v2, vcc, v2, v4 ; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GCN-NEXT: v_add_u32_e32 v4, vcc, v6, v2 @@ -2725,16 +2718,17 @@ ; GCN-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v5, 0 ; GCN-NEXT: v_add_u32_e32 v2, vcc, v6, v2 ; GCN-NEXT: v_addc_u32_e32 v2, vcc, v7, v3, vcc -; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v9, vcc +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc ; GCN-NEXT: v_add_u32_e32 v4, vcc, v2, v4 ; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v3, vcc -; GCN-NEXT: v_mul_lo_u32 v6, v5, s6 -; GCN-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, s6, 0 +; GCN-NEXT: s_mov_b32 s4, 0x186a0 +; GCN-NEXT: v_mul_lo_u32 v6, v5, s4 +; GCN-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, s4, 0 ; GCN-NEXT: s_mov_b32 s4, 0x1869f ; GCN-NEXT: v_add_u32_e32 v3, vcc, v3, v6 ; GCN-NEXT: v_sub_u32_e32 v0, vcc, v0, v2 ; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc -; GCN-NEXT: v_subrev_u32_e32 v2, vcc, s6, v0 +; GCN-NEXT: v_subrev_u32_e32 v2, vcc, 0x186a0, v0 ; GCN-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v1, vcc ; GCN-NEXT: v_cmp_lt_u32_e32 vcc, s4, v2 ; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc @@ -2761,15 +2755,14 @@ ; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1030-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1030-NEXT: s_mov_b32 s4, 0x346d900 -; GFX1030-NEXT: s_mov_b32 s5, 0xfffe7960 ; GFX1030-NEXT: 
s_add_u32 s4, 0x4237, s4 -; GFX1030-NEXT: s_addc_u32 s6, 0, 0 +; GFX1030-NEXT: s_addc_u32 s5, 0, 0 ; GFX1030-NEXT: v_add_co_u32 v2, s4, 0xa9000000, s4 ; GFX1030-NEXT: s_cmpk_lg_u32 s4, 0x0 -; GFX1030-NEXT: s_addc_u32 s4, s6, 0xa7c5 -; GFX1030-NEXT: v_mul_hi_u32 v3, v2, s5 -; GFX1030-NEXT: v_mul_lo_u32 v4, v2, s5 -; GFX1030-NEXT: s_mul_i32 s5, s4, s5 +; GFX1030-NEXT: s_addc_u32 s4, s5, 0xa7c5 +; GFX1030-NEXT: v_mul_hi_u32 v3, 0xfffe7960, v2 +; GFX1030-NEXT: v_mul_lo_u32 v4, 0xfffe7960, v2 +; GFX1030-NEXT: s_mul_i32 s5, s4, 0xfffe7960 ; GFX1030-NEXT: v_sub_nc_u32_e32 v3, v3, v2 ; GFX1030-NEXT: v_mul_hi_u32 v5, v2, v4 ; GFX1030-NEXT: v_mul_hi_u32 v8, s4, v4 @@ -2792,7 +2785,6 @@ ; GFX1030-NEXT: v_mad_u64_u32 v[4:5], s4, v1, v5, 0 ; GFX1030-NEXT: v_mad_u64_u32 v[2:3], s4, v0, v6, 0 ; GFX1030-NEXT: v_mad_u64_u32 v[6:7], s4, v1, v6, 0 -; GFX1030-NEXT: s_mov_b32 s4, 0x186a0 ; GFX1030-NEXT: v_add_co_u32 v2, vcc_lo, v8, v2 ; GFX1030-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo ; GFX1030-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 @@ -2800,20 +2792,19 @@ ; GFX1030-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v7, vcc_lo ; GFX1030-NEXT: v_add_co_u32 v4, vcc_lo, v2, v6 ; GFX1030-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v3, vcc_lo -; GFX1030-NEXT: v_mad_u64_u32 v[2:3], s5, v4, s4, 0 -; GFX1030-NEXT: v_mul_lo_u32 v6, v5, s4 +; GFX1030-NEXT: v_mad_u64_u32 v[2:3], s4, 0x186a0, v4, 0 +; GFX1030-NEXT: v_mul_lo_u32 v6, 0x186a0, v5 ; GFX1030-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v2 ; GFX1030-NEXT: v_add_nc_u32_e32 v3, v3, v6 ; GFX1030-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo -; GFX1030-NEXT: v_sub_co_u32 v2, vcc_lo, v0, s4 -; GFX1030-NEXT: s_mov_b32 s4, 0x1869f +; GFX1030-NEXT: v_subrev_co_u32 v2, vcc_lo, 0x186a0, v0 ; GFX1030-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo -; GFX1030-NEXT: v_cmp_lt_u32_e32 vcc_lo, s4, v2 +; GFX1030-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x1869f, v2 +; GFX1030-NEXT: v_cmp_eq_u32_e64 s4, 0, v1 ; GFX1030-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc_lo ; 
GFX1030-NEXT: v_add_co_u32 v6, vcc_lo, v4, 2 ; GFX1030-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v5, vcc_lo -; GFX1030-NEXT: v_cmp_lt_u32_e32 vcc_lo, s4, v0 -; GFX1030-NEXT: v_cmp_eq_u32_e64 s4, 0, v1 +; GFX1030-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x1869f, v0 ; GFX1030-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo ; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1030-NEXT: v_cndmask_b32_e64 v0, -1, v0, s4 diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll --- a/llvm/test/CodeGen/AMDGPU/udiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll @@ -16,73 +16,72 @@ ; GCN-NEXT: s_subb_u32 s5, 0, s9 ; GCN-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; GCN-NEXT: v_mul_f32_e32 v2, 0x2f800000, v0 -; GCN-NEXT: v_trunc_f32_e32 v2, v2 -; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v2 -; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; GCN-NEXT: v_trunc_f32_e32 v1, v1 +; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 +; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_lo_u32 v3, s4, v2 -; GCN-NEXT: v_mul_hi_u32 v4, s4, v0 -; GCN-NEXT: v_mul_lo_u32 v6, s5, v0 -; GCN-NEXT: v_mul_lo_u32 v5, s4, v0 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GCN-NEXT: v_mul_hi_u32 v4, v0, v5 -; GCN-NEXT: v_mul_lo_u32 v6, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v8, v0, v3 -; GCN-NEXT: v_mul_lo_u32 v7, v2, v5 -; GCN-NEXT: v_mul_hi_u32 v5, v2, v5 -; GCN-NEXT: v_mul_hi_u32 v9, v2, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v8, vcc -; GCN-NEXT: v_mul_lo_u32 v3, v2, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v6, v5, vcc -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v9, v1, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, 
v0, v3 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc -; GCN-NEXT: v_mul_lo_u32 v3, s4, v2 -; GCN-NEXT: v_mul_hi_u32 v4, s4, v0 +; GCN-NEXT: v_mul_lo_u32 v2, s4, v1 +; GCN-NEXT: v_mul_hi_u32 v3, s4, v0 ; GCN-NEXT: v_mul_lo_u32 v5, s5, v0 -; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; GCN-NEXT: v_mul_lo_u32 v4, s4, v0 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GCN-NEXT: v_mul_lo_u32 v7, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v8, v0, v4 -; GCN-NEXT: v_mul_hi_u32 v9, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v6, v2, v4 -; GCN-NEXT: v_mul_lo_u32 v4, v2, v4 -; GCN-NEXT: v_mul_hi_u32 v5, v2, v3 -; GCN-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v9, vcc -; GCN-NEXT: v_mul_lo_u32 v3, v2, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v8, v6, vcc -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v5, v1, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc -; GCN-NEXT: v_mul_lo_u32 v3, s2, v2 -; GCN-NEXT: v_mul_hi_u32 v4, s2, v0 -; GCN-NEXT: v_mul_hi_u32 v5, s2, v2 -; GCN-NEXT: v_mul_hi_u32 v6, s3, v2 -; GCN-NEXT: v_mul_lo_u32 v2, s3, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc -; GCN-NEXT: v_mul_lo_u32 v5, s3, v0 -; GCN-NEXT: v_mul_hi_u32 v0, s3, v0 -; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; GCN-NEXT: v_mul_hi_u32 v3, v0, v4 +; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v7, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v6, v1, v4 +; GCN-NEXT: v_mul_lo_u32 v4, v1, v4 +; GCN-NEXT: v_mul_hi_u32 v8, v1, v2 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GCN-NEXT: v_addc_u32_e32 v0, vcc, v4, v0, vcc -; GCN-NEXT: v_addc_u32_e32 v1, vcc, v6, v1, vcc +; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 +; GCN-NEXT: 
v_add_i32_e32 v3, vcc, v3, v4 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v6, vcc +; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GCN-NEXT: v_mul_lo_u32 v2, s4, v1 +; GCN-NEXT: v_mul_hi_u32 v3, s4, v0 +; GCN-NEXT: v_mul_lo_u32 v4, s5, v0 +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_mul_lo_u32 v3, s4, v0 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GCN-NEXT: v_mul_lo_u32 v6, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v7, v0, v3 +; GCN-NEXT: v_mul_hi_u32 v8, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v5, v1, v3 +; GCN-NEXT: v_mul_lo_u32 v3, v1, v3 +; GCN-NEXT: v_mul_hi_u32 v4, v1, v2 +; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc +; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v6, v3 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc +; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GCN-NEXT: v_mul_lo_u32 v2, s2, v1 +; GCN-NEXT: v_mul_hi_u32 v3, s2, v0 +; GCN-NEXT: v_mul_hi_u32 v4, s2, v1 +; GCN-NEXT: v_mul_hi_u32 v5, s3, v1 +; GCN-NEXT: v_mul_lo_u32 v1, s3, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GCN-NEXT: v_mul_lo_u32 v4, s3, v0 +; GCN-NEXT: v_mul_hi_u32 v0, s3, v0 +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc +; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc ; GCN-NEXT: v_mul_lo_u32 v2, s8, v1 ; GCN-NEXT: v_mul_hi_u32 v3, s8, v0 ; GCN-NEXT: 
v_mul_lo_u32 v4, s9, v0 @@ -161,30 +160,29 @@ ; GCN-IR-NEXT: s_cbranch_vccz .LBB0_4 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: s_lshr_b64 s[14:15], s[6:7], s14 -; GCN-IR-NEXT: s_add_u32 s16, s0, -1 -; GCN-IR-NEXT: s_addc_u32 s17, s1, -1 +; GCN-IR-NEXT: s_add_u32 s13, s0, -1 +; GCN-IR-NEXT: s_addc_u32 s16, s1, -1 ; GCN-IR-NEXT: s_not_b64 s[2:3], s[10:11] ; GCN-IR-NEXT: s_add_u32 s6, s2, s12 -; GCN-IR-NEXT: s_mov_b32 s13, s11 -; GCN-IR-NEXT: s_addc_u32 s7, s3, s11 +; GCN-IR-NEXT: s_addc_u32 s7, s3, 0 ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: s_mov_b32 s3, 0 ; GCN-IR-NEXT: .LBB0_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: s_lshl_b64 s[12:13], s[14:15], 1 +; GCN-IR-NEXT: s_lshl_b64 s[14:15], s[14:15], 1 ; GCN-IR-NEXT: s_lshr_b32 s2, s9, 31 ; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[8:9], 1 -; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[2:3] +; GCN-IR-NEXT: s_or_b64 s[14:15], s[14:15], s[2:3] ; GCN-IR-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9] -; GCN-IR-NEXT: s_sub_u32 s2, s16, s12 -; GCN-IR-NEXT: s_subb_u32 s2, s17, s13 +; GCN-IR-NEXT: s_sub_u32 s2, s13, s14 +; GCN-IR-NEXT: s_subb_u32 s2, s16, s15 ; GCN-IR-NEXT: s_ashr_i32 s10, s2, 31 ; GCN-IR-NEXT: s_mov_b32 s11, s10 ; GCN-IR-NEXT: s_and_b32 s2, s10, 1 -; GCN-IR-NEXT: s_and_b64 s[14:15], s[10:11], s[0:1] -; GCN-IR-NEXT: s_sub_u32 s14, s12, s14 +; GCN-IR-NEXT: s_and_b64 s[18:19], s[10:11], s[0:1] +; GCN-IR-NEXT: s_sub_u32 s14, s14, s18 ; GCN-IR-NEXT: v_mov_b32_e32 v0, s6 -; GCN-IR-NEXT: s_subb_u32 s15, s13, s15 +; GCN-IR-NEXT: s_subb_u32 s15, s15, s19 ; GCN-IR-NEXT: v_mov_b32_e32 v1, s7 ; GCN-IR-NEXT: s_add_u32 s6, s6, 1 ; GCN-IR-NEXT: s_addc_u32 s7, s7, 0 @@ -223,7 +221,6 @@ ; GCN-NEXT: v_subb_u32_e32 v7, vcc, 0, v3, vcc ; GCN-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 ; GCN-NEXT: v_rcp_f32_e32 v4, v4 -; GCN-NEXT: v_mov_b32_e32 v13, 0 ; GCN-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; GCN-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 ; GCN-NEXT: v_trunc_f32_e32 v5, v5 @@ 
-239,7 +236,7 @@ ; GCN-NEXT: v_mul_lo_u32 v10, v4, v8 ; GCN-NEXT: v_mul_hi_u32 v11, v4, v9 ; GCN-NEXT: v_mul_hi_u32 v12, v4, v8 -; GCN-NEXT: v_mul_hi_u32 v14, v5, v8 +; GCN-NEXT: v_mul_hi_u32 v13, v5, v8 ; GCN-NEXT: v_mul_lo_u32 v8, v5, v8 ; GCN-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v12, vcc @@ -247,7 +244,7 @@ ; GCN-NEXT: v_mul_hi_u32 v9, v5, v9 ; GCN-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; GCN-NEXT: v_addc_u32_e32 v9, vcc, v11, v9, vcc -; GCN-NEXT: v_addc_u32_e32 v10, vcc, v14, v13, vcc +; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v13, vcc ; GCN-NEXT: v_add_i32_e32 v8, vcc, v9, v8 ; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc ; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v8 @@ -269,7 +266,7 @@ ; GCN-NEXT: v_mul_lo_u32 v7, v5, v7 ; GCN-NEXT: v_add_i32_e32 v6, vcc, v10, v6 ; GCN-NEXT: v_addc_u32_e32 v6, vcc, v11, v9, vcc -; GCN-NEXT: v_addc_u32_e32 v8, vcc, v8, v13, vcc +; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc ; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v7 ; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc ; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v6 @@ -285,7 +282,7 @@ ; GCN-NEXT: v_mul_hi_u32 v4, v1, v4 ; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; GCN-NEXT: v_addc_u32_e32 v4, vcc, v7, v4, vcc -; GCN-NEXT: v_addc_u32_e32 v6, vcc, v9, v13, vcc +; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v9, vcc ; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc ; GCN-NEXT: v_mul_lo_u32 v6, v2, v5 @@ -327,35 +324,33 @@ ; GCN-IR-LABEL: v_test_udiv_i64: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases ; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] ; GCN-IR-NEXT: v_ffbh_u32_e32 v4, v2 -; GCN-IR-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 32, v4 +; GCN-IR-NEXT: v_add_i32_e64 v4, s[6:7], 32, v4 ; GCN-IR-NEXT: v_ffbh_u32_e32 v5, v3 ; GCN-IR-NEXT: v_min_u32_e32 v8, v4, v5 ; GCN-IR-NEXT: v_ffbh_u32_e32 
v4, v0 -; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 32, v4 +; GCN-IR-NEXT: v_add_i32_e64 v4, s[6:7], 32, v4 ; GCN-IR-NEXT: v_ffbh_u32_e32 v5, v1 -; GCN-IR-NEXT: v_min_u32_e32 v10, v4, v5 -; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v8, v10 -; GCN-IR-NEXT: v_subb_u32_e64 v7, s[6:7], 0, 0, vcc -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[6:7] -; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 -; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc +; GCN-IR-NEXT: v_min_u32_e32 v9, v4, v5 +; GCN-IR-NEXT: v_sub_i32_e64 v6, s[6:7], v8, v9 +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] +; GCN-IR-NEXT: v_subb_u32_e64 v7, s[6:7], 0, 0, s[6:7] +; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[6:7], 63, v[6:7] +; GCN-IR-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] ; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[6:7] ; GCN-IR-NEXT: s_xor_b64 s[6:7], s[4:5], -1 -; GCN-IR-NEXT: v_mov_b32_e32 v11, v9 ; GCN-IR-NEXT: v_cndmask_b32_e64 v4, v1, 0, s[4:5] ; GCN-IR-NEXT: v_cndmask_b32_e64 v5, v0, 0, s[4:5] ; GCN-IR-NEXT: s_and_b64 s[4:5], s[6:7], vcc ; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GCN-IR-NEXT: s_cbranch_execz .LBB1_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, 1, v6 -; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, 0, v7, vcc +; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, 1, v6 +; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, 0, v7, vcc ; GCN-IR-NEXT: v_sub_i32_e64 v4, s[4:5], 63, v6 -; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[12:13], v[6:7] +; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[10:11], v[6:7] ; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[0:1], v4 ; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 @@ -364,38 +359,38 @@ ; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; GCN-IR-NEXT: s_cbranch_execz .LBB1_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, -1, v2 -; GCN-IR-NEXT: v_lshr_b64 v[12:13], v[0:1], v12 -; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, -1, v3, vcc +; GCN-IR-NEXT: v_add_i32_e32 
v12, vcc, -1, v2 +; GCN-IR-NEXT: v_lshr_b64 v[10:11], v[0:1], v10 +; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, -1, v3, vcc ; GCN-IR-NEXT: v_not_b32_e32 v0, v8 -; GCN-IR-NEXT: v_not_b32_e32 v1, v9 -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v10 +; GCN-IR-NEXT: v_not_b32_e32 v1, 0 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v9 ; GCN-IR-NEXT: v_mov_b32_e32 v8, 0 -; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, v1, v11, vcc +; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 ; GCN-IR-NEXT: .LBB1_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[12:13], 1 +; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 ; GCN-IR-NEXT: v_lshrrev_b32_e32 v6, 31, v5 ; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v6 ; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 -; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v14, v10 -; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, v15, v11, vcc +; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v12, v10 +; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, v13, v11, vcc ; GCN-IR-NEXT: v_or_b32_e32 v4, v8, v4 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v8, 31, v6 ; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v8 -; GCN-IR-NEXT: v_and_b32_e32 v13, v8, v3 -; GCN-IR-NEXT: v_and_b32_e32 v12, v8, v2 +; GCN-IR-NEXT: v_and_b32_e32 v14, v8, v3 +; GCN-IR-NEXT: v_and_b32_e32 v15, v8, v2 ; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v0 ; GCN-IR-NEXT: v_or_b32_e32 v5, v9, v5 ; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[0:1] ; GCN-IR-NEXT: v_mov_b32_e32 v0, v8 -; GCN-IR-NEXT: v_sub_i32_e64 v12, s[4:5], v10, v12 +; GCN-IR-NEXT: v_sub_i32_e64 v10, s[4:5], v10, v15 ; GCN-IR-NEXT: v_mov_b32_e32 v1, v9 ; GCN-IR-NEXT: v_mov_b32_e32 v9, v7 -; GCN-IR-NEXT: v_subb_u32_e64 v13, s[4:5], v11, v13, s[4:5] +; GCN-IR-NEXT: v_subb_u32_e64 v11, s[4:5], v11, v14, s[4:5] ; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] ; GCN-IR-NEXT: v_mov_b32_e32 v8, v6 ; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] @@ 
-687,52 +682,49 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dword s2, s[0:1], 0xd ; GCN-NEXT: s_load_dword s3, s[0:1], 0xe -; GCN-NEXT: s_mov_b32 s7, 0xff000000 -; GCN-NEXT: s_mov_b32 s6, 0xffff -; GCN-NEXT: v_cvt_f32_ubyte3_e32 v2, s6 +; GCN-NEXT: v_cvt_f32_ubyte3_e32 v2, 0xffff +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN-NEXT: s_load_dword s6, s[0:1], 0xb +; GCN-NEXT: s_load_dword s0, s[0:1], 0xc +; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_and_b32 s2, s2, s7 -; GCN-NEXT: s_and_b32 s3, s3, s6 +; GCN-NEXT: s_and_b32 s2, s2, 0xff000000 +; GCN-NEXT: s_and_b32 s3, s3, 0xffff ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_alignbit_b32 v0, s3, v0, 24 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, v0 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s8, s[0:1], 0xb -; GCN-NEXT: s_load_dword s0, s[0:1], 0xc -; GCN-NEXT: v_mov_b32_e32 v8, 0 +; GCN-NEXT: s_and_b32 s8, s0, 0xffff +; GCN-NEXT: s_and_b32 s9, s6, 0xff000000 +; GCN-NEXT: s_lshr_b64 s[0:1], s[2:3], 24 ; GCN-NEXT: v_mac_f32_e32 v1, 0x4f800000, v2 ; GCN-NEXT: v_rcp_f32_e32 v1, v1 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_and_b32 s6, s0, s6 -; GCN-NEXT: s_and_b32 s8, s8, s7 -; GCN-NEXT: s_lshr_b64 s[0:1], s[2:3], 24 +; GCN-NEXT: s_sub_u32 s0, 0, s0 +; GCN-NEXT: s_subb_u32 s1, 0, s1 +; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 ; GCN-NEXT: v_mul_f32_e32 v2, 0x2f800000, v1 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_mac_f32_e32 v1, 0xcf800000, v2 -; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GCN-NEXT: s_sub_u32 s0, 0, s0 -; GCN-NEXT: s_subb_u32 s1, 0, s1 -; GCN-NEXT: v_mul_hi_u32 v3, s0, v1 -; GCN-NEXT: v_mul_lo_u32 v4, s0, v2 +; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GCN-NEXT: v_mul_lo_u32 v3, s0, v2 +; GCN-NEXT: v_mul_hi_u32 v4, s0, v1 ; GCN-NEXT: v_mul_lo_u32 v5, s1, v1 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; GCN-NEXT: v_mul_lo_u32 v4, s0, v1 +; 
GCN-NEXT: v_mul_lo_u32 v6, s0, v1 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GCN-NEXT: v_mul_lo_u32 v5, v1, v3 -; GCN-NEXT: v_mul_hi_u32 v6, v1, v4 +; GCN-NEXT: v_mul_lo_u32 v4, v1, v3 +; GCN-NEXT: v_mul_hi_u32 v5, v1, v6 ; GCN-NEXT: v_mul_hi_u32 v7, v1, v3 -; GCN-NEXT: v_mul_hi_u32 v9, v2, v3 +; GCN-NEXT: v_mul_hi_u32 v8, v2, v3 ; GCN-NEXT: v_mul_lo_u32 v3, v2, v3 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v7, vcc -; GCN-NEXT: v_mul_lo_u32 v7, v2, v4 -; GCN-NEXT: v_mul_hi_u32 v4, v2, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v6, v4, vcc -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v9, v8, vcc +; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; GCN-NEXT: v_mul_lo_u32 v7, v2, v6 +; GCN-NEXT: v_mul_hi_u32 v6, v2, v6 +; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; GCN-NEXT: v_addc_u32_e32 v4, vcc, v5, v6, vcc +; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v8, vcc ; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc ; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 @@ -744,32 +736,32 @@ ; GCN-NEXT: v_mul_lo_u32 v4, s0, v1 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v5, v3 ; GCN-NEXT: v_mul_lo_u32 v7, v1, v3 -; GCN-NEXT: v_mul_hi_u32 v9, v1, v4 -; GCN-NEXT: v_mul_hi_u32 v10, v1, v3 +; GCN-NEXT: v_mul_hi_u32 v8, v1, v4 +; GCN-NEXT: v_mul_hi_u32 v9, v1, v3 ; GCN-NEXT: v_mul_hi_u32 v6, v2, v4 ; GCN-NEXT: v_mul_lo_u32 v4, v2, v4 ; GCN-NEXT: v_mul_hi_u32 v5, v2, v3 -; GCN-NEXT: v_add_i32_e32 v7, vcc, v9, v7 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc +; GCN-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v9, vcc ; GCN-NEXT: v_mul_lo_u32 v3, v2, v3 ; GCN-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v9, v6, vcc -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v5, v8, vcc +; GCN-NEXT: v_addc_u32_e32 v4, vcc, v8, v6, vcc +; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, 
vcc ; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc ; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; GCN-NEXT: v_mov_b32_e32 v3, s8 +; GCN-NEXT: v_mov_b32_e32 v3, s9 ; GCN-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc -; GCN-NEXT: v_alignbit_b32 v3, s6, v3, 24 +; GCN-NEXT: v_alignbit_b32 v3, s8, v3, 24 ; GCN-NEXT: v_mul_lo_u32 v4, v3, v2 ; GCN-NEXT: v_mul_hi_u32 v1, v3, v1 ; GCN-NEXT: v_mul_hi_u32 v2, v3, v2 -; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: v_mov_b32_e32 v5, 0 ; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v4 ; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc ; GCN-NEXT: v_add_i32_e32 v1, vcc, 0, v1 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc -; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v8, vcc +; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc ; GCN-NEXT: v_add_i32_e32 v1, vcc, 0, v1 ; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc ; GCN-NEXT: v_mul_lo_u32 v6, v0, v2 @@ -809,13 +801,11 @@ ; GCN-IR-NEXT: s_load_dword s3, s[0:1], 0xc ; GCN-IR-NEXT: s_load_dword s6, s[0:1], 0xd ; GCN-IR-NEXT: s_load_dword s7, s[0:1], 0xe -; GCN-IR-NEXT: s_mov_b32 s8, 0xffff -; GCN-IR-NEXT: s_mov_b32 s9, 0xff000000 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_and_b32 s1, s3, s8 -; GCN-IR-NEXT: s_and_b32 s0, s2, s9 -; GCN-IR-NEXT: s_and_b32 s3, s7, s8 -; GCN-IR-NEXT: s_and_b32 s2, s6, s9 +; GCN-IR-NEXT: s_and_b32 s0, s2, 0xff000000 +; GCN-IR-NEXT: s_and_b32 s1, s3, 0xffff +; GCN-IR-NEXT: s_and_b32 s2, s6, 0xff000000 +; GCN-IR-NEXT: s_and_b32 s3, s7, 0xffff ; GCN-IR-NEXT: s_lshr_b64 s[6:7], s[0:1], 24 ; GCN-IR-NEXT: s_lshr_b64 s[2:3], s[2:3], 24 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[2:3], 0 @@ -852,30 +842,29 @@ ; GCN-IR-NEXT: s_cbranch_vccz .LBB7_4 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: s_lshr_b64 s[14:15], s[6:7], s14 -; GCN-IR-NEXT: s_add_u32 s16, s2, -1 -; GCN-IR-NEXT: s_addc_u32 s17, s3, -1 +; GCN-IR-NEXT: s_add_u32 s13, s2, -1 +; GCN-IR-NEXT: s_addc_u32 s16, s3, -1 ; GCN-IR-NEXT: s_not_b64 s[0:1], s[10:11] ; GCN-IR-NEXT: 
s_add_u32 s6, s0, s12 -; GCN-IR-NEXT: s_mov_b32 s13, s11 -; GCN-IR-NEXT: s_addc_u32 s7, s1, s11 +; GCN-IR-NEXT: s_addc_u32 s7, s1, 0 ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: s_mov_b32 s1, 0 ; GCN-IR-NEXT: .LBB7_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: s_lshl_b64 s[12:13], s[14:15], 1 +; GCN-IR-NEXT: s_lshl_b64 s[14:15], s[14:15], 1 ; GCN-IR-NEXT: s_lshr_b32 s0, s9, 31 ; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[8:9], 1 -; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[0:1] +; GCN-IR-NEXT: s_or_b64 s[14:15], s[14:15], s[0:1] ; GCN-IR-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9] -; GCN-IR-NEXT: s_sub_u32 s0, s16, s12 -; GCN-IR-NEXT: s_subb_u32 s0, s17, s13 +; GCN-IR-NEXT: s_sub_u32 s0, s13, s14 +; GCN-IR-NEXT: s_subb_u32 s0, s16, s15 ; GCN-IR-NEXT: s_ashr_i32 s10, s0, 31 ; GCN-IR-NEXT: s_mov_b32 s11, s10 ; GCN-IR-NEXT: s_and_b32 s0, s10, 1 -; GCN-IR-NEXT: s_and_b64 s[14:15], s[10:11], s[2:3] -; GCN-IR-NEXT: s_sub_u32 s14, s12, s14 +; GCN-IR-NEXT: s_and_b64 s[18:19], s[10:11], s[2:3] +; GCN-IR-NEXT: s_sub_u32 s14, s14, s18 ; GCN-IR-NEXT: v_mov_b32_e32 v0, s6 -; GCN-IR-NEXT: s_subb_u32 s15, s13, s15 +; GCN-IR-NEXT: s_subb_u32 s15, s15, s19 ; GCN-IR-NEXT: v_mov_b32_e32 v1, s7 ; GCN-IR-NEXT: s_add_u32 s6, s6, 1 ; GCN-IR-NEXT: s_addc_u32 s7, s7, 0 @@ -920,58 +909,57 @@ ; GCN-NEXT: s_subb_u32 s5, 0, s3 ; GCN-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; GCN-NEXT: v_mul_f32_e32 v2, 0x2f800000, v0 -; GCN-NEXT: v_trunc_f32_e32 v2, v2 -; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v2 -; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; GCN-NEXT: v_trunc_f32_e32 v1, v1 +; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 +; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_lo_u32 v3, s4, v2 -; GCN-NEXT: v_mul_hi_u32 v4, s4, v0 -; GCN-NEXT: v_mul_lo_u32 v6, s5, v0 -; 
GCN-NEXT: v_mul_lo_u32 v5, s4, v0 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GCN-NEXT: v_mul_hi_u32 v4, v0, v5 -; GCN-NEXT: v_mul_lo_u32 v6, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v8, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v7, v2, v5 -; GCN-NEXT: v_mul_lo_u32 v5, v2, v5 -; GCN-NEXT: v_mul_hi_u32 v9, v2, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v8, vcc -; GCN-NEXT: v_mul_lo_u32 v3, v2, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v6, v7, vcc -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v9, v1, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc -; GCN-NEXT: v_mul_lo_u32 v3, s4, v2 -; GCN-NEXT: v_mul_hi_u32 v4, s4, v0 +; GCN-NEXT: v_mul_lo_u32 v2, s4, v1 +; GCN-NEXT: v_mul_hi_u32 v3, s4, v0 ; GCN-NEXT: v_mul_lo_u32 v5, s5, v0 -; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; GCN-NEXT: v_mul_lo_u32 v4, s4, v0 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GCN-NEXT: v_mul_lo_u32 v7, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v8, v0, v4 -; GCN-NEXT: v_mul_hi_u32 v9, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v6, v2, v4 -; GCN-NEXT: v_mul_lo_u32 v4, v2, v4 -; GCN-NEXT: v_mul_hi_u32 v5, v2, v3 -; GCN-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v9, vcc -; GCN-NEXT: v_mul_lo_u32 v3, v2, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v8, v6, vcc -; GCN-NEXT: v_addc_u32_e32 v1, vcc, v5, v1, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; GCN-NEXT: v_mul_hi_u32 v3, v0, v4 +; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v7, v0, 
v2 +; GCN-NEXT: v_mul_hi_u32 v6, v1, v4 +; GCN-NEXT: v_mul_lo_u32 v4, v1, v4 +; GCN-NEXT: v_mul_hi_u32 v8, v1, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v6, vcc +; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GCN-NEXT: v_mul_lo_u32 v2, s4, v1 +; GCN-NEXT: v_mul_hi_u32 v3, s4, v0 +; GCN-NEXT: v_mul_lo_u32 v4, s5, v0 +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_mul_lo_u32 v3, s4, v0 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GCN-NEXT: v_mul_lo_u32 v6, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v7, v0, v3 +; GCN-NEXT: v_mul_hi_u32 v8, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v5, v1, v3 +; GCN-NEXT: v_mul_lo_u32 v3, v1, v3 +; GCN-NEXT: v_mul_hi_u32 v4, v1, v2 +; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc +; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v6, v3 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc +; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc ; GCN-NEXT: v_mul_lo_u32 v2, v1, 24 ; GCN-NEXT: v_mul_hi_u32 v0, v0, 24 ; GCN-NEXT: v_mul_hi_u32 v1, v1, 24 @@ -1108,7 +1096,6 @@ ; GCN-NEXT: v_subb_u32_e32 v5, vcc, 0, v1, vcc ; GCN-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 ; GCN-NEXT: v_rcp_f32_e32 v2, v2 -; GCN-NEXT: v_mov_b32_e32 v11, 0 ; GCN-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; GCN-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; GCN-NEXT: v_trunc_f32_e32 v3, v3 @@ -1124,7 +1111,7 @@ ; GCN-NEXT: v_mul_lo_u32 v7, v2, v6 ; GCN-NEXT: v_mul_hi_u32 
v8, v2, v9 ; GCN-NEXT: v_mul_hi_u32 v10, v2, v6 -; GCN-NEXT: v_mul_hi_u32 v12, v3, v6 +; GCN-NEXT: v_mul_hi_u32 v11, v3, v6 ; GCN-NEXT: v_mul_lo_u32 v6, v3, v6 ; GCN-NEXT: v_add_i32_e32 v7, vcc, v8, v7 ; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v10, vcc @@ -1132,7 +1119,7 @@ ; GCN-NEXT: v_mul_hi_u32 v9, v3, v9 ; GCN-NEXT: v_add_i32_e32 v7, vcc, v7, v10 ; GCN-NEXT: v_addc_u32_e32 v7, vcc, v8, v9, vcc -; GCN-NEXT: v_addc_u32_e32 v8, vcc, v12, v11, vcc +; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v11, vcc ; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6 ; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v6 @@ -1154,7 +1141,7 @@ ; GCN-NEXT: v_mul_lo_u32 v5, v3, v5 ; GCN-NEXT: v_add_i32_e32 v4, vcc, v8, v4 ; GCN-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc -; GCN-NEXT: v_addc_u32_e32 v6, vcc, v6, v11, vcc +; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc ; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 @@ -1205,22 +1192,21 @@ ; GCN-IR-NEXT: v_addc_u32_e64 v5, s[6:7], 0, -1, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[4:5] -; GCN-IR-NEXT: s_mov_b64 s[8:9], 0x8000 -; GCN-IR-NEXT: v_mov_b32_e32 v2, s8 +; GCN-IR-NEXT: v_mov_b32_e32 v3, 0x8000 ; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc ; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 -; GCN-IR-NEXT: v_cndmask_b32_e64 v2, v2, 0, s[4:5] +; GCN-IR-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[4:5] ; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1 -; GCN-IR-NEXT: v_mov_b32_e32 v3, v7 +; GCN-IR-NEXT: v_mov_b32_e32 v2, 0 +; GCN-IR-NEXT: s_mov_b64 s[8:9], 0x8000 ; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], vcc ; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GCN-IR-NEXT: s_cbranch_execz .LBB9_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v4 -; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v5, vcc +; GCN-IR-NEXT: v_add_i32_e32 v7, vcc, 1, 
v4 +; GCN-IR-NEXT: v_addc_u32_e32 v8, vcc, 0, v5, vcc ; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v4 -; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[8:9], v[4:5] +; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[7:8], v[4:5] ; GCN-IR-NEXT: v_lshl_b64 v[2:3], s[8:9], v2 ; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 @@ -1232,10 +1218,10 @@ ; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, -1, v0 ; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000 ; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, -1, v1, vcc -; GCN-IR-NEXT: v_lshr_b64 v[8:9], s[4:5], v8 +; GCN-IR-NEXT: v_lshr_b64 v[8:9], s[4:5], v7 ; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 47, v6 ; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 -; GCN-IR-NEXT: v_subb_u32_e32 v7, vcc, 0, v7, vcc +; GCN-IR-NEXT: v_subb_u32_e64 v7, s[4:5], 0, 0, vcc ; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 ; GCN-IR-NEXT: .LBB9_3: ; %udiv-do-while @@ -1269,12 +1255,12 @@ ; GCN-IR-NEXT: .LBB9_5: ; %Flow3 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[2:3], 1 -; GCN-IR-NEXT: v_or_b32_e32 v3, v5, v1 -; GCN-IR-NEXT: v_or_b32_e32 v2, v4, v0 +; GCN-IR-NEXT: v_or_b32_e32 v2, v5, v1 +; GCN-IR-NEXT: v_or_b32_e32 v3, v4, v0 ; GCN-IR-NEXT: .LBB9_6: ; %Flow4 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] -; GCN-IR-NEXT: v_mov_b32_e32 v0, v2 -; GCN-IR-NEXT: v_mov_b32_e32 v1, v3 +; GCN-IR-NEXT: v_mov_b32_e32 v0, v3 +; GCN-IR-NEXT: v_mov_b32_e32 v1, v2 ; GCN-IR-NEXT: s_setpc_b64 s[30:31] %result = udiv i64 32768, %x ret i64 %result @@ -1375,8 +1361,8 @@ ; GCN-NEXT: v_madak_f32 v0, 0, v0, 0x41c00000 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: s_movk_i32 s4, 0xffe8 -; GCN-NEXT: v_mov_b32_e32 v7, 0 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 @@ -1385,25 +1371,24 @@ ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: 
s_mov_b32 s7, 0xf000 -; GCN-NEXT: v_mul_hi_u32 v2, v0, s4 -; GCN-NEXT: v_mul_lo_u32 v3, v1, s4 -; GCN-NEXT: v_mul_lo_u32 v4, v0, s4 ; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: v_mul_hi_u32 v2, v0, s4 +; GCN-NEXT: v_mul_lo_u32 v4, v1, s4 +; GCN-NEXT: v_mul_lo_u32 v3, v0, s4 ; GCN-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_mul_lo_u32 v3, v0, v2 -; GCN-NEXT: v_mul_hi_u32 v5, v0, v4 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GCN-NEXT: v_mul_hi_u32 v5, v0, v3 +; GCN-NEXT: v_mul_lo_u32 v4, v0, v2 ; GCN-NEXT: v_mul_hi_u32 v6, v0, v2 -; GCN-NEXT: v_mul_hi_u32 v8, v1, v2 +; GCN-NEXT: v_mul_hi_u32 v7, v1, v2 ; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc -; GCN-NEXT: v_mul_lo_u32 v6, v1, v4 -; GCN-NEXT: v_mul_hi_u32 v4, v1, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v8, v7, vcc +; GCN-NEXT: v_mul_lo_u32 v6, v1, v3 +; GCN-NEXT: v_mul_hi_u32 v3, v1, v3 +; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v6 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc +; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc ; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 @@ -1417,7 +1402,7 @@ ; GCN-NEXT: v_mul_lo_u32 v3, v0, v2 ; GCN-NEXT: v_mul_hi_u32 v5, v0, v4 ; GCN-NEXT: v_mul_hi_u32 v6, v0, v2 -; GCN-NEXT: v_mul_hi_u32 v8, v1, v2 +; GCN-NEXT: v_mul_hi_u32 v7, v1, v2 ; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v5, v3 ; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc @@ -1425,7 +1410,7 @@ ; GCN-NEXT: v_mul_hi_u32 v4, v1, v4 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v8, v7, vcc +; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc ; GCN-NEXT: v_add_i32_e32 v2, vcc, 
v3, v2 ; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 @@ -1441,7 +1426,7 @@ ; GCN-NEXT: v_mul_hi_u32 v0, s3, v0 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc -; GCN-NEXT: v_addc_u32_e32 v2, vcc, v5, v7, vcc +; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc ; GCN-NEXT: v_mul_lo_u32 v4, v1, 24 @@ -1560,7 +1545,6 @@ ; GCN-NEXT: v_madak_f32 v2, 0, v2, 0x41c00000 ; GCN-NEXT: v_rcp_f32_e32 v2, v2 ; GCN-NEXT: s_movk_i32 s4, 0xffe8 -; GCN-NEXT: v_mov_b32_e32 v9, 0 ; GCN-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; GCN-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; GCN-NEXT: v_trunc_f32_e32 v3, v3 @@ -1572,18 +1556,18 @@ ; GCN-NEXT: v_mul_lo_u32 v6, v2, s4 ; GCN-NEXT: v_subrev_i32_e32 v4, vcc, v2, v4 ; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GCN-NEXT: v_mul_lo_u32 v5, v2, v4 -; GCN-NEXT: v_mul_hi_u32 v7, v2, v6 +; GCN-NEXT: v_mul_hi_u32 v5, v2, v6 +; GCN-NEXT: v_mul_lo_u32 v7, v2, v4 ; GCN-NEXT: v_mul_hi_u32 v8, v2, v4 -; GCN-NEXT: v_mul_hi_u32 v10, v3, v4 +; GCN-NEXT: v_mul_hi_u32 v9, v3, v4 ; GCN-NEXT: v_mul_lo_u32 v4, v3, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc ; GCN-NEXT: v_mul_lo_u32 v8, v3, v6 ; GCN-NEXT: v_mul_hi_u32 v6, v3, v6 ; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; GCN-NEXT: v_addc_u32_e32 v5, vcc, v7, v6, vcc -; GCN-NEXT: v_addc_u32_e32 v6, vcc, v10, v9, vcc +; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v9, vcc ; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 @@ -1596,7 +1580,7 @@ ; GCN-NEXT: v_mul_lo_u32 v5, v2, v4 ; GCN-NEXT: v_mul_hi_u32 v7, v2, v6 ; GCN-NEXT: v_mul_hi_u32 v8, v2, v4 -; GCN-NEXT: v_mul_hi_u32 v10, v3, v4 +; GCN-NEXT: v_mul_hi_u32 v9, v3, v4 ; GCN-NEXT: v_mul_lo_u32 v4, v3, v4 ; GCN-NEXT: v_add_i32_e32 v5, 
vcc, v7, v5 ; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc @@ -1604,7 +1588,7 @@ ; GCN-NEXT: v_mul_hi_u32 v6, v3, v6 ; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; GCN-NEXT: v_addc_u32_e32 v5, vcc, v7, v6, vcc -; GCN-NEXT: v_addc_u32_e32 v6, vcc, v10, v9, vcc +; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v9, vcc ; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 @@ -1620,7 +1604,7 @@ ; GCN-NEXT: v_mul_hi_u32 v2, v1, v2 ; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; GCN-NEXT: v_addc_u32_e32 v2, vcc, v5, v2, vcc -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v7, v9, vcc +; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GCN-NEXT: v_mul_lo_u32 v4, v3, 24 diff --git a/llvm/test/CodeGen/AMDGPU/udivrem24.ll b/llvm/test/CodeGen/AMDGPU/udivrem24.ll --- a/llvm/test/CodeGen/AMDGPU/udivrem24.ll +++ b/llvm/test/CodeGen/AMDGPU/udivrem24.ll @@ -346,9 +346,8 @@ } ; FUNC-LABEL: {{^}}test_udiv24_u16_u23_i32: -; SI-DAG: v_rcp_iflag_f32 -; SI-DAG: s_mov_b32 [[MASK:s[0-9]+]], 0x7fffff{{$}} -; SI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], +; SI: v_rcp_iflag_f32 +; SI: v_and_b32_e32 v{{[0-9]+}}, 0x7fffff, ; EG: RECIP_IEEE define amdgpu_kernel void @test_udiv24_u16_u23_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { @@ -365,9 +364,8 @@ } ; FUNC-LABEL: {{^}}test_udiv24_u23_u16_i32: -; SI-DAG: v_rcp_iflag_f32 -; SI-DAG: s_mov_b32 [[MASK:s[0-9]+]], 0x7fffff{{$}} -; SI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], +; SI: v_rcp_iflag_f32 +; SI: v_and_b32_e32 v{{[0-9]+}}, 0x7fffff, ; EG: RECIP_IEEE define amdgpu_kernel void @test_udiv24_u23_u16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll --- a/llvm/test/CodeGen/AMDGPU/urem64.ll +++ b/llvm/test/CodeGen/AMDGPU/urem64.ll @@ -17,72 +17,71 @@ ; GCN-NEXT: s_mov_b32 s4, s8 ; GCN-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; 
GCN-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_mov_b32 s5, s9 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; GCN-NEXT: v_mul_f32_e32 v2, 0x2f800000, v0 -; GCN-NEXT: v_trunc_f32_e32 v2, v2 -; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v2 -; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; GCN-NEXT: v_trunc_f32_e32 v1, v1 +; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 +; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_lo_u32 v3, s0, v2 -; GCN-NEXT: v_mul_hi_u32 v4, s0, v0 -; GCN-NEXT: v_mul_lo_u32 v6, s1, v0 -; GCN-NEXT: v_mul_lo_u32 v5, s0, v0 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GCN-NEXT: v_mul_hi_u32 v4, v0, v5 -; GCN-NEXT: v_mul_lo_u32 v6, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v8, v0, v3 -; GCN-NEXT: v_mul_lo_u32 v7, v2, v5 -; GCN-NEXT: v_mul_hi_u32 v5, v2, v5 -; GCN-NEXT: v_mul_hi_u32 v9, v2, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v8, vcc -; GCN-NEXT: v_mul_lo_u32 v3, v2, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v6, v5, vcc -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v9, v1, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc -; GCN-NEXT: v_mul_lo_u32 v3, s0, v2 -; GCN-NEXT: v_mul_hi_u32 v4, s0, v0 +; GCN-NEXT: v_mul_lo_u32 v2, s0, v1 +; GCN-NEXT: v_mul_hi_u32 v3, s0, v0 ; GCN-NEXT: v_mul_lo_u32 v5, s1, v0 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; GCN-NEXT: v_mul_lo_u32 v4, s0, v0 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GCN-NEXT: v_mul_lo_u32 v7, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v8, v0, v4 -; GCN-NEXT: v_mul_hi_u32 v9, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v6, v2, v4 -; GCN-NEXT: v_mul_lo_u32 v4, v2, v4 -; GCN-NEXT: v_mul_hi_u32 v5, v2, v3 -; GCN-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GCN-NEXT: 
v_addc_u32_e32 v8, vcc, 0, v9, vcc -; GCN-NEXT: v_mul_lo_u32 v3, v2, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v8, v6, vcc -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v5, v1, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc -; GCN-NEXT: v_mul_lo_u32 v3, s10, v2 -; GCN-NEXT: v_mul_hi_u32 v4, s10, v0 -; GCN-NEXT: v_mul_hi_u32 v5, s10, v2 -; GCN-NEXT: v_mul_hi_u32 v6, s11, v2 -; GCN-NEXT: v_mul_lo_u32 v2, s11, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc -; GCN-NEXT: v_mul_lo_u32 v5, s11, v0 -; GCN-NEXT: v_mul_hi_u32 v0, s11, v0 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; GCN-NEXT: v_mul_hi_u32 v3, v0, v4 +; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v7, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v6, v1, v4 +; GCN-NEXT: v_mul_lo_u32 v4, v1, v4 +; GCN-NEXT: v_mul_hi_u32 v8, v1, v2 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GCN-NEXT: v_addc_u32_e32 v0, vcc, v4, v0, vcc -; GCN-NEXT: v_addc_u32_e32 v1, vcc, v6, v1, vcc +; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v6, vcc +; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GCN-NEXT: v_mul_lo_u32 v2, s0, v1 +; GCN-NEXT: v_mul_hi_u32 v3, s0, v0 +; GCN-NEXT: v_mul_lo_u32 v4, s1, v0 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_mul_lo_u32 v3, s0, v0 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GCN-NEXT: v_mul_lo_u32 v6, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v7, v0, v3 +; GCN-NEXT: v_mul_hi_u32 v8, v0, 
v2 +; GCN-NEXT: v_mul_hi_u32 v5, v1, v3 +; GCN-NEXT: v_mul_lo_u32 v3, v1, v3 +; GCN-NEXT: v_mul_hi_u32 v4, v1, v2 +; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc +; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v6, v3 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc +; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GCN-NEXT: v_mul_lo_u32 v2, s10, v1 +; GCN-NEXT: v_mul_hi_u32 v3, s10, v0 +; GCN-NEXT: v_mul_hi_u32 v4, s10, v1 +; GCN-NEXT: v_mul_hi_u32 v5, s11, v1 +; GCN-NEXT: v_mul_lo_u32 v1, s11, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GCN-NEXT: v_mul_lo_u32 v4, s11, v0 +; GCN-NEXT: v_mul_hi_u32 v0, s11, v0 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc +; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc ; GCN-NEXT: v_mul_lo_u32 v1, s12, v1 ; GCN-NEXT: v_mul_hi_u32 v2, s12, v0 ; GCN-NEXT: v_mul_lo_u32 v3, s13, v0 @@ -131,13 +130,13 @@ ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[0:1], 0 ; GCN-IR-NEXT: s_flbit_i32_b32 s12, s0 ; GCN-IR-NEXT: s_or_b64 s[14:15], s[8:9], s[10:11] -; GCN-IR-NEXT: s_flbit_i32_b32 s10, s6 +; GCN-IR-NEXT: s_flbit_i32_b32 s9, s6 ; GCN-IR-NEXT: s_add_i32 s12, s12, 32 ; GCN-IR-NEXT: s_flbit_i32_b32 s8, s1 -; GCN-IR-NEXT: s_add_i32 s10, s10, 32 -; GCN-IR-NEXT: s_flbit_i32_b32 s11, s7 +; GCN-IR-NEXT: s_add_i32 s9, s9, 32 +; GCN-IR-NEXT: s_flbit_i32_b32 s10, s7 ; GCN-IR-NEXT: s_min_u32 s8, s12, s8 -; GCN-IR-NEXT: s_min_u32 s12, s10, s11 +; GCN-IR-NEXT: s_min_u32 s12, s9, s10 ; GCN-IR-NEXT: s_sub_u32 s10, s8, s12 ; GCN-IR-NEXT: s_subb_u32 s11, 0, 0 ; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[16:17], s[10:11], 63 @@ -163,9 +162,8 
@@ ; GCN-IR-NEXT: s_add_u32 s16, s0, -1 ; GCN-IR-NEXT: s_addc_u32 s17, s1, -1 ; GCN-IR-NEXT: s_not_b64 s[2:3], s[8:9] -; GCN-IR-NEXT: s_mov_b32 s13, s9 ; GCN-IR-NEXT: s_add_u32 s8, s2, s12 -; GCN-IR-NEXT: s_addc_u32 s9, s3, s9 +; GCN-IR-NEXT: s_addc_u32 s9, s3, 0 ; GCN-IR-NEXT: s_mov_b64 s[12:13], 0 ; GCN-IR-NEXT: s_mov_b32 s3, 0 ; GCN-IR-NEXT: .LBB0_3: ; %udiv-do-while @@ -233,7 +231,6 @@ ; GCN-NEXT: v_subb_u32_e32 v7, vcc, 0, v3, vcc ; GCN-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 ; GCN-NEXT: v_rcp_f32_e32 v4, v4 -; GCN-NEXT: v_mov_b32_e32 v13, 0 ; GCN-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; GCN-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 ; GCN-NEXT: v_trunc_f32_e32 v5, v5 @@ -249,7 +246,7 @@ ; GCN-NEXT: v_mul_lo_u32 v10, v4, v8 ; GCN-NEXT: v_mul_hi_u32 v11, v4, v9 ; GCN-NEXT: v_mul_hi_u32 v12, v4, v8 -; GCN-NEXT: v_mul_hi_u32 v14, v5, v8 +; GCN-NEXT: v_mul_hi_u32 v13, v5, v8 ; GCN-NEXT: v_mul_lo_u32 v8, v5, v8 ; GCN-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v12, vcc @@ -257,7 +254,7 @@ ; GCN-NEXT: v_mul_hi_u32 v9, v5, v9 ; GCN-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; GCN-NEXT: v_addc_u32_e32 v9, vcc, v11, v9, vcc -; GCN-NEXT: v_addc_u32_e32 v10, vcc, v14, v13, vcc +; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v13, vcc ; GCN-NEXT: v_add_i32_e32 v8, vcc, v9, v8 ; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc ; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v8 @@ -279,7 +276,7 @@ ; GCN-NEXT: v_mul_lo_u32 v7, v5, v7 ; GCN-NEXT: v_add_i32_e32 v6, vcc, v10, v6 ; GCN-NEXT: v_addc_u32_e32 v6, vcc, v11, v9, vcc -; GCN-NEXT: v_addc_u32_e32 v8, vcc, v8, v13, vcc +; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc ; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v7 ; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc ; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v6 @@ -295,7 +292,7 @@ ; GCN-NEXT: v_mul_hi_u32 v4, v1, v4 ; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; GCN-NEXT: v_addc_u32_e32 v4, vcc, v7, v4, vcc -; GCN-NEXT: v_addc_u32_e32 v6, vcc, v9, v13, vcc +; GCN-NEXT: v_addc_u32_e32 
v6, vcc, 0, v9, vcc ; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc ; GCN-NEXT: v_mul_lo_u32 v5, v2, v5 @@ -336,35 +333,33 @@ ; GCN-IR-LABEL: v_test_urem_i64: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases ; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] ; GCN-IR-NEXT: v_ffbh_u32_e32 v4, v2 -; GCN-IR-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 32, v4 +; GCN-IR-NEXT: v_add_i32_e64 v4, s[6:7], 32, v4 ; GCN-IR-NEXT: v_ffbh_u32_e32 v5, v3 ; GCN-IR-NEXT: v_min_u32_e32 v8, v4, v5 ; GCN-IR-NEXT: v_ffbh_u32_e32 v4, v0 -; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 32, v4 +; GCN-IR-NEXT: v_add_i32_e64 v4, s[6:7], 32, v4 ; GCN-IR-NEXT: v_ffbh_u32_e32 v5, v1 -; GCN-IR-NEXT: v_min_u32_e32 v10, v4, v5 -; GCN-IR-NEXT: v_sub_i32_e32 v5, vcc, v8, v10 -; GCN-IR-NEXT: v_subb_u32_e64 v6, s[6:7], 0, 0, vcc -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[5:6] -; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 -; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc +; GCN-IR-NEXT: v_min_u32_e32 v9, v4, v5 +; GCN-IR-NEXT: v_sub_i32_e64 v5, s[6:7], v8, v9 +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] +; GCN-IR-NEXT: v_subb_u32_e64 v6, s[6:7], 0, 0, s[6:7] +; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[6:7], 63, v[5:6] +; GCN-IR-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] ; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[5:6] ; GCN-IR-NEXT: s_xor_b64 s[6:7], s[4:5], -1 -; GCN-IR-NEXT: v_mov_b32_e32 v11, v9 ; GCN-IR-NEXT: v_cndmask_b32_e64 v7, v1, 0, s[4:5] ; GCN-IR-NEXT: v_cndmask_b32_e64 v4, v0, 0, s[4:5] ; GCN-IR-NEXT: s_and_b64 s[4:5], s[6:7], vcc ; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GCN-IR-NEXT: s_cbranch_execz .LBB1_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, 1, v5 -; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, 0, v6, vcc +; GCN-IR-NEXT: 
v_add_i32_e32 v10, vcc, 1, v5 +; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, 0, v6, vcc ; GCN-IR-NEXT: v_sub_i32_e64 v4, s[4:5], 63, v5 -; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[12:13], v[5:6] +; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[10:11], v[5:6] ; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[0:1], v4 ; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 @@ -376,37 +371,37 @@ ; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, -1, v2 ; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, -1, v3, vcc ; GCN-IR-NEXT: v_not_b32_e32 v6, v8 -; GCN-IR-NEXT: v_not_b32_e32 v7, v9 -; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, v6, v10 -; GCN-IR-NEXT: v_lshr_b64 v[12:13], v[0:1], v12 -; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, v7, v11, vcc -; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 -; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 +; GCN-IR-NEXT: v_lshr_b64 v[10:11], v[0:1], v10 +; GCN-IR-NEXT: v_not_b32_e32 v7, 0 +; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, v6, v9 +; GCN-IR-NEXT: v_mov_b32_e32 v12, 0 +; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v7, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v13, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 ; GCN-IR-NEXT: .LBB1_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: v_lshl_b64 v[12:13], v[12:13], 1 +; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 ; GCN-IR-NEXT: v_lshrrev_b32_e32 v6, 31, v5 -; GCN-IR-NEXT: v_or_b32_e32 v12, v12, v6 +; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v6 ; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 -; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v14, v12 -; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, v15, v13, vcc -; GCN-IR-NEXT: v_or_b32_e32 v4, v10, v4 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v6 -; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v10 -; GCN-IR-NEXT: v_and_b32_e32 v16, v10, v3 -; GCN-IR-NEXT: v_and_b32_e32 v17, v10, v2 -; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, 1, v8 -; GCN-IR-NEXT: v_or_b32_e32 v5, v11, v5 -; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, 0, v9, vcc -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[10:11], v[8:9] -; GCN-IR-NEXT: v_mov_b32_e32 v8, v10 -; GCN-IR-NEXT: 
v_sub_i32_e64 v12, s[4:5], v12, v17 -; GCN-IR-NEXT: v_mov_b32_e32 v9, v11 -; GCN-IR-NEXT: v_mov_b32_e32 v11, v7 -; GCN-IR-NEXT: v_subb_u32_e64 v13, s[4:5], v13, v16, s[4:5] +; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v14, v10 +; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, v15, v11, vcc +; GCN-IR-NEXT: v_or_b32_e32 v4, v12, v4 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v6 +; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v12 +; GCN-IR-NEXT: v_and_b32_e32 v16, v12, v3 +; GCN-IR-NEXT: v_and_b32_e32 v17, v12, v2 +; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, 1, v8 +; GCN-IR-NEXT: v_or_b32_e32 v5, v13, v5 +; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, 0, v9, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[12:13], v[8:9] +; GCN-IR-NEXT: v_mov_b32_e32 v8, v12 +; GCN-IR-NEXT: v_sub_i32_e64 v10, s[4:5], v10, v17 +; GCN-IR-NEXT: v_mov_b32_e32 v9, v13 +; GCN-IR-NEXT: v_mov_b32_e32 v13, v7 +; GCN-IR-NEXT: v_subb_u32_e64 v11, s[4:5], v11, v16, s[4:5] ; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; GCN-IR-NEXT: v_mov_b32_e32 v10, v6 +; GCN-IR-NEXT: v_mov_b32_e32 v12, v6 ; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] ; GCN-IR-NEXT: s_cbranch_execnz .LBB1_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow @@ -745,58 +740,57 @@ ; GCN-NEXT: s_mov_b32 s8, s4 ; GCN-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_mov_b32 s9, s5 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; GCN-NEXT: v_mul_f32_e32 v2, 0x2f800000, v0 -; GCN-NEXT: v_trunc_f32_e32 v2, v2 -; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v2 -; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; GCN-NEXT: v_trunc_f32_e32 v1, v1 +; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 +; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_lo_u32 v3, s0, v2 -; GCN-NEXT: v_mul_hi_u32 v4, s0, v0 -; GCN-NEXT: v_mul_lo_u32 v6, s1, v0 -; GCN-NEXT: v_mul_lo_u32 v5, s0, v0 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; 
GCN-NEXT: v_mul_hi_u32 v4, v0, v5 -; GCN-NEXT: v_mul_lo_u32 v6, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v8, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v7, v2, v5 -; GCN-NEXT: v_mul_lo_u32 v5, v2, v5 -; GCN-NEXT: v_mul_hi_u32 v9, v2, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v8, vcc -; GCN-NEXT: v_mul_lo_u32 v3, v2, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v6, v7, vcc -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v9, v1, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc -; GCN-NEXT: v_mul_lo_u32 v3, s0, v2 -; GCN-NEXT: v_mul_hi_u32 v4, s0, v0 +; GCN-NEXT: v_mul_lo_u32 v2, s0, v1 +; GCN-NEXT: v_mul_hi_u32 v3, s0, v0 ; GCN-NEXT: v_mul_lo_u32 v5, s1, v0 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; GCN-NEXT: v_mul_lo_u32 v4, s0, v0 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GCN-NEXT: v_mul_lo_u32 v7, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v8, v0, v4 -; GCN-NEXT: v_mul_hi_u32 v9, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v6, v2, v4 -; GCN-NEXT: v_mul_lo_u32 v4, v2, v4 -; GCN-NEXT: v_mul_hi_u32 v5, v2, v3 -; GCN-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v9, vcc -; GCN-NEXT: v_mul_lo_u32 v3, v2, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v8, v6, vcc -; GCN-NEXT: v_addc_u32_e32 v1, vcc, v5, v1, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; GCN-NEXT: v_mul_hi_u32 v3, v0, v4 +; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v7, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v6, v1, v4 +; GCN-NEXT: v_mul_lo_u32 v4, v1, v4 +; GCN-NEXT: v_mul_hi_u32 v8, v1, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, 
v5 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v6, vcc +; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GCN-NEXT: v_mul_lo_u32 v2, s0, v1 +; GCN-NEXT: v_mul_hi_u32 v3, s0, v0 +; GCN-NEXT: v_mul_lo_u32 v4, s1, v0 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_mul_lo_u32 v3, s0, v0 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GCN-NEXT: v_mul_lo_u32 v6, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v7, v0, v3 +; GCN-NEXT: v_mul_hi_u32 v8, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v5, v1, v3 +; GCN-NEXT: v_mul_lo_u32 v3, v1, v3 +; GCN-NEXT: v_mul_hi_u32 v4, v1, v2 +; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc +; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v6, v3 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc +; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc ; GCN-NEXT: v_mul_lo_u32 v2, v1, 24 ; GCN-NEXT: v_mul_hi_u32 v0, v0, 24 ; GCN-NEXT: v_mul_hi_u32 v1, v1, 24 @@ -933,8 +927,8 @@ ; GCN-NEXT: v_madak_f32 v0, 0, v0, 0x41c00000 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: s_movk_i32 s4, 0xffe8 -; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 @@ -943,63 +937,62 @@ ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: v_mul_hi_u32 v3, v0, s4 -; 
GCN-NEXT: v_mul_lo_u32 v5, v1, s4 -; GCN-NEXT: v_mul_lo_u32 v4, v0, s4 ; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: v_subrev_i32_e32 v3, vcc, v0, v3 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GCN-NEXT: v_mul_hi_u32 v6, v0, v4 -; GCN-NEXT: v_mul_lo_u32 v5, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v7, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v8, v1, v3 -; GCN-NEXT: v_mul_lo_u32 v3, v1, v3 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v7, vcc -; GCN-NEXT: v_mul_lo_u32 v7, v1, v4 -; GCN-NEXT: v_mul_hi_u32 v4, v1, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v6, v4, vcc -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v8, v2, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc -; GCN-NEXT: v_mul_hi_u32 v3, v0, s4 +; GCN-NEXT: v_mul_hi_u32 v2, v0, s4 ; GCN-NEXT: v_mul_lo_u32 v4, v1, s4 -; GCN-NEXT: v_mul_lo_u32 v5, v0, s4 +; GCN-NEXT: v_mul_lo_u32 v3, v0, s4 +; GCN-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GCN-NEXT: v_mul_hi_u32 v5, v0, v3 +; GCN-NEXT: v_mul_lo_u32 v4, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v6, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v7, v1, v2 +; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 +; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc +; GCN-NEXT: v_mul_lo_u32 v6, v1, v3 +; GCN-NEXT: v_mul_hi_u32 v3, v1, v3 +; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v6 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc +; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GCN-NEXT: v_mul_hi_u32 v2, v0, s4 +; GCN-NEXT: v_mul_lo_u32 v3, v1, s4 +; GCN-NEXT: v_mul_lo_u32 v4, v0, s4 ; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: v_subrev_i32_e32 v3, vcc, v0, v3 -; 
GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; GCN-NEXT: v_mul_lo_u32 v4, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v6, v0, v5 -; GCN-NEXT: v_mul_hi_u32 v7, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v8, v1, v3 -; GCN-NEXT: v_mul_lo_u32 v3, v1, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v7, vcc -; GCN-NEXT: v_mul_lo_u32 v7, v1, v5 -; GCN-NEXT: v_mul_hi_u32 v5, v1, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v6, v5, vcc -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v8, v2, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc -; GCN-NEXT: v_mul_lo_u32 v3, s2, v1 -; GCN-NEXT: v_mul_hi_u32 v4, s2, v0 -; GCN-NEXT: v_mul_hi_u32 v5, s2, v1 -; GCN-NEXT: v_mul_hi_u32 v6, s3, v1 +; GCN-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GCN-NEXT: v_mul_lo_u32 v3, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v5, v0, v4 +; GCN-NEXT: v_mul_hi_u32 v6, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v7, v1, v2 +; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc +; GCN-NEXT: v_mul_lo_u32 v6, v1, v4 +; GCN-NEXT: v_mul_hi_u32 v4, v1, v4 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc +; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GCN-NEXT: v_mul_lo_u32 v2, s2, v1 +; GCN-NEXT: v_mul_hi_u32 v3, s2, v0 +; GCN-NEXT: v_mul_hi_u32 v4, s2, v1 +; GCN-NEXT: v_mul_hi_u32 v5, s3, v1 ; GCN-NEXT: v_mul_lo_u32 v1, s3, v1 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc -; GCN-NEXT: v_mul_lo_u32 v5, s3, v0 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: 
v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GCN-NEXT: v_mul_lo_u32 v4, s3, v0 ; GCN-NEXT: v_mul_hi_u32 v0, s3, v0 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GCN-NEXT: v_addc_u32_e32 v0, vcc, v4, v0, vcc -; GCN-NEXT: v_addc_u32_e32 v2, vcc, v6, v2, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc +; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc ; GCN-NEXT: v_mul_lo_u32 v1, v1, 24 @@ -1133,7 +1126,6 @@ ; GCN-NEXT: v_subb_u32_e32 v5, vcc, 0, v1, vcc ; GCN-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 ; GCN-NEXT: v_rcp_f32_e32 v2, v2 -; GCN-NEXT: v_mov_b32_e32 v11, 0 ; GCN-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; GCN-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; GCN-NEXT: v_trunc_f32_e32 v3, v3 @@ -1149,7 +1141,7 @@ ; GCN-NEXT: v_mul_lo_u32 v7, v2, v6 ; GCN-NEXT: v_mul_hi_u32 v8, v2, v9 ; GCN-NEXT: v_mul_hi_u32 v10, v2, v6 -; GCN-NEXT: v_mul_hi_u32 v12, v3, v6 +; GCN-NEXT: v_mul_hi_u32 v11, v3, v6 ; GCN-NEXT: v_mul_lo_u32 v6, v3, v6 ; GCN-NEXT: v_add_i32_e32 v7, vcc, v8, v7 ; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v10, vcc @@ -1157,7 +1149,7 @@ ; GCN-NEXT: v_mul_hi_u32 v9, v3, v9 ; GCN-NEXT: v_add_i32_e32 v7, vcc, v7, v10 ; GCN-NEXT: v_addc_u32_e32 v7, vcc, v8, v9, vcc -; GCN-NEXT: v_addc_u32_e32 v8, vcc, v12, v11, vcc +; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v11, vcc ; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6 ; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v6 @@ -1179,7 +1171,7 @@ ; GCN-NEXT: v_mul_lo_u32 v5, v3, v5 ; GCN-NEXT: v_add_i32_e32 v4, vcc, v8, v4 ; GCN-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc -; GCN-NEXT: v_addc_u32_e32 v6, vcc, v6, v11, vcc +; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc ; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 @@ -1225,26 +1217,25 @@ ; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 32, v2 ; GCN-IR-NEXT: 
v_ffbh_u32_e32 v3, v1 ; GCN-IR-NEXT: v_min_u32_e32 v6, v2, v3 -; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 0xffffffd0, v6 -; GCN-IR-NEXT: v_addc_u32_e64 v3, s[6:7], 0, -1, vcc +; GCN-IR-NEXT: v_add_i32_e32 v3, vcc, 0xffffffd0, v6 +; GCN-IR-NEXT: v_addc_u32_e64 v4, s[6:7], 0, -1, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[2:3] -; GCN-IR-NEXT: s_mov_b64 s[8:9], 0x8000 -; GCN-IR-NEXT: v_mov_b32_e32 v4, s8 +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[3:4] +; GCN-IR-NEXT: v_mov_b32_e32 v5, 0x8000 ; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[2:3] -; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 -; GCN-IR-NEXT: v_cndmask_b32_e64 v4, v4, 0, s[4:5] +; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[3:4] +; GCN-IR-NEXT: v_cndmask_b32_e64 v5, v5, 0, s[4:5] ; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1 -; GCN-IR-NEXT: v_mov_b32_e32 v5, v7 +; GCN-IR-NEXT: v_mov_b32_e32 v2, 0 +; GCN-IR-NEXT: s_mov_b64 s[8:9], 0x8000 ; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], vcc ; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GCN-IR-NEXT: s_cbranch_execz .LBB8_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v2 -; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v3, vcc -; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[8:9], v[2:3] -; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2 +; GCN-IR-NEXT: v_add_i32_e32 v7, vcc, 1, v3 +; GCN-IR-NEXT: v_addc_u32_e32 v8, vcc, 0, v4, vcc +; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v3 +; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[7:8], v[3:4] ; GCN-IR-NEXT: v_lshl_b64 v[2:3], s[8:9], v2 ; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 @@ -1256,10 +1247,10 @@ ; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, -1, v0 ; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000 ; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, -1, v1, vcc -; GCN-IR-NEXT: v_lshr_b64 v[8:9], s[4:5], v8 +; GCN-IR-NEXT: v_lshr_b64 v[8:9], s[4:5], v7 ; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 47, v6 ; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 
-; GCN-IR-NEXT: v_subb_u32_e32 v7, vcc, 0, v7, vcc +; GCN-IR-NEXT: v_subb_u32_e64 v7, s[4:5], 0, 0, vcc ; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 ; GCN-IR-NEXT: .LBB8_3: ; %udiv-do-while @@ -1292,15 +1283,15 @@ ; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] ; GCN-IR-NEXT: .LBB8_5: ; %Flow3 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] -; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; GCN-IR-NEXT: v_or_b32_e32 v5, v5, v3 -; GCN-IR-NEXT: v_or_b32_e32 v4, v4, v2 +; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[2:3], 1 +; GCN-IR-NEXT: v_or_b32_e32 v2, v5, v7 +; GCN-IR-NEXT: v_or_b32_e32 v5, v4, v6 ; GCN-IR-NEXT: .LBB8_6: ; %Flow4 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] -; GCN-IR-NEXT: v_mul_lo_u32 v2, v0, v5 -; GCN-IR-NEXT: v_mul_hi_u32 v3, v0, v4 -; GCN-IR-NEXT: v_mul_lo_u32 v1, v1, v4 -; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, v4 +; GCN-IR-NEXT: v_mul_lo_u32 v2, v0, v2 +; GCN-IR-NEXT: v_mul_hi_u32 v3, v0, v5 +; GCN-IR-NEXT: v_mul_lo_u32 v1, v1, v5 +; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, v5 ; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, 0x8000, v0 diff --git a/llvm/test/CodeGen/AMDGPU/usubsat.ll b/llvm/test/CodeGen/AMDGPU/usubsat.ll --- a/llvm/test/CodeGen/AMDGPU/usubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/usubsat.ll @@ -31,9 +31,8 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_movk_i32 s4, 0xff -; GFX10-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX10-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX10-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX10-NEXT: v_sub_nc_u16 v0, v0, v1 clamp ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call i8 @llvm.usub.sat.i8(i8 %lhs, i8 %rhs) diff --git a/llvm/test/CodeGen/AMDGPU/v_pack.ll b/llvm/test/CodeGen/AMDGPU/v_pack.ll --- a/llvm/test/CodeGen/AMDGPU/v_pack.ll +++ b/llvm/test/CodeGen/AMDGPU/v_pack.ll @@ -173,13 +173,12 @@ ; GISEL-NEXT: s_waitcnt 
vmcnt(0) ; GISEL-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_waitcnt_depctr 0xffe3 -; GISEL-NEXT: s_movk_i32 s0, 0x7fff -; GISEL-NEXT: v_add_f16_e32 v0, 2.0, v1 -; GISEL-NEXT: v_add_f16_e32 v1, 2.0, v2 -; GISEL-NEXT: v_and_b32_e32 v0, s0, v0 -; GISEL-NEXT: v_and_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GISEL-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GISEL-NEXT: v_mov_b32_e32 v0, 0x7fff +; GISEL-NEXT: v_add_f16_e32 v1, 2.0, v1 +; GISEL-NEXT: v_add_f16_e32 v2, 2.0, v2 +; GISEL-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; GISEL-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GISEL-NEXT: v_and_or_b32 v0, 0xffff, v1, v0 ; GISEL-NEXT: ;;#ASMSTART ; GISEL-NEXT: ; use v0 ; GISEL-NEXT: ;;#ASMEND @@ -232,7 +231,7 @@ ; GISEL-NEXT: s_mov_b32 s0, 0x8000 ; GISEL-NEXT: v_add_f16_e32 v0, 2.0, v1 ; GISEL-NEXT: v_add_f16_e32 v1, 2.0, v2 -; GISEL-NEXT: v_add_f16_e64 v0, s0, -v0 +; GISEL-NEXT: v_add_f16_e64 v0, 0x8000, -v0 ; GISEL-NEXT: v_add_f16_sdwa v1, s0, -v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GISEL-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 ; GISEL-NEXT: ;;#ASMSTART diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll --- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll +++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll @@ -968,13 +968,12 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_load_dwordx2 v[1:2], v[0:1], off -; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v4, v0, v1 -; GFX10-NEXT: v_and_b32_e32 v3, v0, v2 -; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v4 -; GFX10-NEXT: v_lshl_or_b32 v0, v2, 16, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX10-NEXT: 
v_and_b32_e32 v3, 0xffff, v1 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v0 +; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v3 +; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -1385,7 +1384,7 @@ ; GFX10-NEXT: ; kill: killed $vgpr0 killed $vgpr1 ; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX10-NEXT: ; kill: killed $vgpr2 killed $vgpr3 -; GFX10-NEXT: v_and_b32_e32 v1, v0, v4 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v4 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_and_b32_sdwa v2, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-NEXT: v_lshl_or_b32 v0, v5, 16, v1 diff --git a/llvm/test/CodeGen/AMDGPU/xor.ll b/llvm/test/CodeGen/AMDGPU/xor.ll --- a/llvm/test/CodeGen/AMDGPU/xor.ll +++ b/llvm/test/CodeGen/AMDGPU/xor.ll @@ -190,8 +190,8 @@ ; SI-DAG: s_movk_i32 s[[K_LO:[0-9]+]], 0x3039 ; SI: s_xor_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}} -; SI: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, s[[K_LO]] -; SI: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, s[[K_HI]] +; SI: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x3039 +; SI: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0xf237b define amdgpu_kernel void @scalar_xor_literal_multi_use_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a, i64 %b) { %or = xor i64 %a, 4261135838621753 store i64 %or, i64 addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/zero_extend.ll b/llvm/test/CodeGen/AMDGPU/zero_extend.ll --- a/llvm/test/CodeGen/AMDGPU/zero_extend.ll +++ b/llvm/test/CodeGen/AMDGPU/zero_extend.ll @@ -51,9 +51,8 @@ ; GCN: s_load_dword [[A:s[0-9]+]] ; GCN: s_load_dword [[B:s[0-9]+]] -; GCN: s_mov_b32 [[MASK:s[0-9]+]], 0xffff{{$}} -; GCN-DAG: s_and_b32 [[MASK_A:s[0-9]+]], [[A]], [[MASK]] -; GCN-DAG: s_and_b32 [[MASK_B:s[0-9]+]], [[B]], [[MASK]] +; GCN-DAG: s_and_b32 [[MASK_A:s[0-9]+]], [[A]], 0xffff{{$}} +; GCN-DAG: s_and_b32 [[MASK_B:s[0-9]+]], [[B]], 
0xffff{{$}} ; GCN: s_cmp_eq_u32 [[MASK_A]], [[B]] ; GCN: s_cselect_b64 [[CC:s\[[0-9:]+\]]], -1, 0 ; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CC]]